feat: Add YouTube transcript extraction component and frontend integration (#4502)
* add new youtube transcripts component
* [autofix.ci] apply automated fixes
* ✨ (youtube-transcripts.spec.ts): add integration test for youtube transcripts component in the frontend to ensure user can interact with it successfully
* [autofix.ci] apply automated fixes
---------
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
parent
0e101ef1e7
commit
84dd03198d
8 changed files with 9946 additions and 0 deletions
|
|
@ -19,6 +19,7 @@ from .tavily_search import TavilySearchToolComponent
|
|||
from .wikipedia_api import WikipediaAPIComponent
|
||||
from .wolfram_alpha_api import WolframAlphaAPIComponent
|
||||
from .yahoo_finance import YfinanceToolComponent
|
||||
from .youtube_transcripts import YouTubeTranscriptsComponent
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore", LangChainDeprecationWarning)
|
||||
|
|
@ -45,4 +46,5 @@ __all__ = [
|
|||
"WikipediaAPIComponent",
|
||||
"WolframAlphaAPIComponent",
|
||||
"YfinanceToolComponent",
|
||||
"YouTubeTranscriptsComponent",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -0,0 +1,173 @@
|
|||
from langchain.tools import StructuredTool
|
||||
from langchain_community.document_loaders import YoutubeLoader
|
||||
from langchain_community.document_loaders.youtube import TranscriptFormat
|
||||
from langchain_core.tools import ToolException
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from langflow.base.langchain_utilities.model import LCToolComponent
|
||||
from langflow.field_typing import Tool
|
||||
from langflow.inputs import DropdownInput, IntInput, MultilineInput
|
||||
from langflow.schema import Data
|
||||
from langflow.template import Output
|
||||
|
||||
|
||||
class YoutubeApiSchema(BaseModel):
    """Argument schema for the YouTube transcripts tool.

    Each field mirrors one keyword argument accepted by
    ``YouTubeTranscriptsComponent.youtube_transcripts``.
    """

    # Required: the video to fetch transcripts for.
    url: str = Field(
        ...,
        description="The YouTube URL to get transcripts from.",
    )
    # Output shape of the transcript; defaults to a single text blob.
    transcript_format: TranscriptFormat = Field(
        default=TranscriptFormat.TEXT,
        description=(
            "The format of the transcripts. Either 'text' for a single "
            "text output or 'chunks' for timestamped chunks."
        ),
    )
    # Chunk length in seconds; only meaningful for the 'chunks' format.
    chunk_size_seconds: int = Field(
        default=120,
        description=(
            "The size of each transcript chunk in seconds. Only "
            "applicable when 'Transcript Format' is set to 'chunks'."
        ),
    )
    # Preferred transcript languages, highest priority first.
    language: str = Field(
        default="",
        description=(
            "A comma-separated list of language codes in descending "
            "priority. Leave empty for default."
        ),
    )
    # Optional target language for translation.
    translation: str = Field(
        default="",
        description=(
            "Translate the transcripts to the specified language. "
            "Leave empty for no translation."
        ),
    )
|
||||
|
||||
|
||||
class YouTubeTranscriptsComponent(LCToolComponent):
    """A component that extracts spoken content from YouTube videos as transcripts.

    Two outputs are exposed:

    * ``transcripts`` -- built by :meth:`build_youtube_transcripts` from the
      component's own input values.
    * ``transcripts_tool`` -- a LangChain ``StructuredTool`` built by
      :meth:`build_youtube_tool` that calls :meth:`youtube_transcripts` with
      arguments validated against ``YoutubeApiSchema``.
    """

    display_name: str = "YouTube Transcripts"
    description: str = "Extracts spoken content from YouTube videos as transcripts."
    icon: str = "YouTube"

    inputs = [
        MultilineInput(
            name="url",
            display_name="Video URL",
            info="Enter the YouTube video URL to get transcripts from.",
        ),
        DropdownInput(
            name="transcript_format",
            display_name="Transcript Format",
            options=["text", "chunks"],
            value="text",
            info=(
                "The format of the transcripts. Either 'text' for a single output "
                "or 'chunks' for timestamped chunks."
            ),
        ),
        IntInput(
            name="chunk_size_seconds",
            display_name="Chunk Size (seconds)",
            value=60,
            advanced=True,
            info=(
                "The size of each transcript chunk in seconds. Only applicable when "
                "'Transcript Format' is set to 'chunks'."
            ),
        ),
        MultilineInput(
            name="language",
            display_name="Language",
            info=(
                "A comma-separated list of language codes in descending priority. "
                "Leave empty for default."
            ),
        ),
        DropdownInput(
            name="translation",
            display_name="Translation Language",
            advanced=True,
            options=["", "en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "hi", "ar", "id"],
            info=(
                "Translate the transcripts to the specified language. "
                "Leave empty for no translation."
            ),
        ),
    ]

    outputs = [
        Output(name="transcripts", display_name="Data", method="build_youtube_transcripts"),
        Output(name="transcripts_tool", display_name="Tool", method="build_youtube_tool"),
    ]

    @staticmethod
    def _parse_languages(language: str) -> list[str]:
        """Turn a comma-separated language string into a list of language codes.

        Whitespace around each code is stripped and empty entries are dropped,
        so ``"en, de"`` becomes ``["en", "de"]`` rather than ``["en", " de"]``.
        Falls back to ``["en"]`` when no usable code remains.
        """
        if not language:
            return ["en"]
        codes = [code.strip() for code in language.split(",")]
        return [code for code in codes if code] or ["en"]

    def _load_documents(
        self,
        url: str,
        *,
        as_text: bool,
        chunk_size_seconds: int,
        language: str,
        translation: str,
    ) -> list:
        """Build a ``YoutubeLoader`` for ``url`` and return the documents it loads.

        Any format other than text is requested from the loader as chunks.
        Exceptions raised by the loader propagate to the caller.
        """
        loader = YoutubeLoader.from_youtube_url(
            url,
            transcript_format=TranscriptFormat.TEXT if as_text else TranscriptFormat.CHUNKS,
            chunk_size_seconds=chunk_size_seconds,
            language=self._parse_languages(language),
            # An empty selection means "no translation" for the loader.
            translation=translation or None,
        )
        return loader.load()

    def build_youtube_transcripts(self) -> Data | list[Data]:
        """Method to build transcripts from the provided YouTube URL.

        Reads ``url``, ``transcript_format``, ``chunk_size_seconds``,
        ``language`` and ``translation`` from the component.

        Returns:
            Data | list[Data]: For the ``"text"`` format, a single Data with a
                ``"transcripts"`` key. Otherwise a list of Data, one per chunk,
                each with ``"content"`` and ``"metadata"`` keys. On any failure
                (including an empty result in text mode) a single Data with an
                ``"error"`` key is returned instead of raising.
        """
        try:
            as_text = self.transcript_format == "text"
            documents = self._load_documents(
                self.url,
                as_text=as_text,
                chunk_size_seconds=self.chunk_size_seconds,
                language=self.language,
                translation=self.translation,
            )

            if as_text:
                if not documents:
                    # Report the real cause instead of letting documents[0]
                    # raise an opaque "list index out of range".
                    return Data(
                        data={"error": "Failed to get YouTube transcripts: no transcript was returned for this video."}
                    )
                # Extract only the page_content from the Document
                # NOTE(review): this output uses the key "transcripts" while
                # youtube_transcripts() uses "transcript"; kept as-is because
                # renaming either would break existing consumers.
                return Data(data={"transcripts": documents[0].page_content})
            # For chunks, extract page_content and metadata separately
            return [Data(data={"content": doc.page_content, "metadata": doc.metadata}) for doc in documents]

        except Exception as exc:  # noqa: BLE001
            # The component output reports failures as data rather than raising.
            return Data(data={"error": f"Failed to get YouTube transcripts: {exc!s}"})

    def youtube_transcripts(
        self,
        url: str = "",
        transcript_format: TranscriptFormat = TranscriptFormat.TEXT,
        chunk_size_seconds: int = 120,
        language: str = "",
        translation: str = "",
    ) -> Data | list[Data]:
        """Helper method to handle transcripts outside of component calls.

        This is the function wrapped by the tool from :meth:`build_youtube_tool`.

        Args:
            url: The YouTube URL to get transcripts from.
            transcript_format: Format of transcripts ('text' or 'chunks'). A plain
                string is converted to ``TranscriptFormat``.
            chunk_size_seconds: Size of each transcript chunk in seconds.
            language: Comma-separated list of language codes.
            translation: Target language for translation.

        Returns:
            Data | list[Data]: For the text format with at least one document, a
                single Data with a ``"transcript"`` key. Otherwise a list of Data
                (possibly empty) with ``"content"`` and ``"metadata"`` keys.

        Raises:
            ToolException: If the format is invalid or loading fails.
        """
        try:
            if isinstance(transcript_format, str):
                transcript_format = TranscriptFormat(transcript_format)
            as_text = transcript_format == TranscriptFormat.TEXT

            documents = self._load_documents(
                url,
                as_text=as_text,
                chunk_size_seconds=chunk_size_seconds,
                language=language,
                translation=translation,
            )

            if as_text and documents:
                return Data(data={"transcript": documents[0].page_content})
            return [Data(data={"content": doc.page_content, "metadata": doc.metadata}) for doc in documents]
        except Exception as exc:
            msg = f"Failed to get YouTube transcripts: {exc!s}"
            raise ToolException(msg) from exc

    def build_youtube_tool(self) -> Tool:
        """Method to build the transcripts tool.

        Returns:
            Tool: A structured tool that uses the transcripts method.

        Raises:
            RuntimeError: If tool creation fails.
        """
        try:
            return StructuredTool.from_function(
                name="youtube_transcripts",
                description="Get transcripts from YouTube videos.",
                func=self.youtube_transcripts,
                args_schema=YoutubeApiSchema,
            )

        except Exception as exc:
            msg = f"Failed to build the YouTube transcripts tool: {exc!s}"
            raise RuntimeError(msg) from exc
|
||||
9
src/frontend/src/icons/Youtube/index.tsx
Normal file
9
src/frontend/src/icons/Youtube/index.tsx
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
import React, { forwardRef } from "react";
|
||||
import YouTubeIcon from "./youtube";
|
||||
|
||||
/**
 * Ref-forwarding wrapper around the YouTube icon component.
 * All received props are passed through to the underlying icon unchanged,
 * and the forwarded ref is attached to it.
 */
export const YouTubeSvgIcon = forwardRef<
  SVGSVGElement,
  React.PropsWithChildren<{}>
>((iconProps, svgRef) => <YouTubeIcon ref={svgRef} {...iconProps} />);
|
||||
5952
src/frontend/src/icons/Youtube/youtube.jsx
Normal file
5952
src/frontend/src/icons/Youtube/youtube.jsx
Normal file
File diff suppressed because it is too large
Load diff
3732
src/frontend/src/icons/Youtube/youtube.svg
Normal file
3732
src/frontend/src/icons/Youtube/youtube.svg
Normal file
File diff suppressed because it is too large
Load diff
|
After Width: | Height: | Size: 149 KiB |
|
|
@ -6,6 +6,7 @@ import { MilvusIcon } from "@/icons/Milvus";
|
|||
import Perplexity from "@/icons/Perplexity/Perplexity";
|
||||
import { TavilyIcon } from "@/icons/Tavily";
|
||||
import { UnstructuredIcon } from "@/icons/Unstructured";
|
||||
import YouTubeIcon from "@/icons/Youtube/youtube";
|
||||
import { ZepMemoryIcon } from "@/icons/ZepMemory";
|
||||
import { AthenaIcon } from "@/icons/athena/index";
|
||||
import { freezeAllIcon } from "@/icons/freezeAll";
|
||||
|
|
@ -658,6 +659,7 @@ export const nodeIconsLucide: iconsType = {
|
|||
GithubIcon,
|
||||
FaGithub,
|
||||
FaApple,
|
||||
YouTube: YouTubeIcon,
|
||||
Milvus: MilvusIcon,
|
||||
ExaSearch: ExaIcon,
|
||||
ZepMemory: ZepMemoryIcon,
|
||||
|
|
|
|||
|
|
@ -40,6 +40,8 @@ test("CodeAreaModalComponent", async ({ page }) => {
|
|||
await page.getByTestId("sidebar-legacy-switch").isVisible({ timeout: 5000 });
|
||||
await page.getByTestId("sidebar-legacy-switch").click();
|
||||
|
||||
await page.waitForTimeout(1000);
|
||||
|
||||
await page
|
||||
.getByTestId("prototypesPython Function")
|
||||
.dragTo(page.locator('//*[@id="react-flow-id"]'));
|
||||
|
|
|
|||
|
|
@ -0,0 +1,74 @@
|
|||
import { expect, test } from "@playwright/test";
|
||||
|
||||
// End-to-end check: drop the YouTube Transcripts component on a blank flow,
// run it against a known video, and verify the transcript text in the output.
test("user should be able to use youtube transcripts component", async ({
  page,
}) => {
  const flowCanvas = '//*[@id="react-flow-id"]';
  const outdatedMarker = "icon-AlertTriangle";

  await page.goto("/");
  await page.waitForSelector('[data-testid="mainpage_title"]', {
    timeout: 30000,
  });
  await page.waitForSelector('[id="new-project-btn"]', { timeout: 30000 });

  // Find out whether the "new flow" modal is already open; treat any failure
  // while counting as "not open".
  let modalCount = 0;
  try {
    modalCount = await page.getByTestId("modal-title").count();
  } catch (_error) {
    modalCount = 0;
  }

  // Keep clicking "New Flow" until the modal shows up.
  while (modalCount === 0) {
    await page.getByText("New Flow", { exact: true }).click();
    await page.waitForTimeout(3000);
    modalCount = await page.getByTestId("modal-title").count();
  }

  await page.getByTestId("blank-flow").click();

  // Search the sidebar for the component and drag it onto the canvas.
  const sidebarSearch = page.getByTestId("sidebar-search-input");
  await sidebarSearch.click();
  await sidebarSearch.fill("youtube");

  await page.waitForTimeout(1000);

  await page
    .locator('//*[@id="toolsYouTube Transcripts"]')
    .dragTo(page.locator(flowCanvas));
  await page.mouse.up();
  await page.mouse.down();
  await page.getByTestId("fit_view").click();

  // Click every "outdated component" warning until none remain.
  while ((await page.getByTestId(outdatedMarker).count()) > 0) {
    await page.getByTestId(outdatedMarker).first().click();
    await page.waitForTimeout(1000);
  }

  // Configure the component and run it.
  await page
    .getByTestId("textarea_str_url")
    .fill("https://www.youtube.com/watch?v=VqhCQZaH4Vs");

  await page.getByTestId("textarea_str_language").fill("en");

  await page.getByTestId("button_run_youtube transcripts").click();

  await page.waitForSelector("text=built successfully", { timeout: 30000 });

  // Open the output inspector and read the first cell's content.
  await page.getByTestId("output-inspection-data").first().click();

  await page.waitForTimeout(1000);

  await page.getByRole("gridcell").first().click();

  const transcriptText = await page.getByPlaceholder("Empty").inputValue();
  const loweredTranscript = transcriptText.toLowerCase();

  expect(transcriptText.length).toBeGreaterThan(10);
  expect(loweredTranscript).toContain("i see trees of green");
  expect(loweredTranscript).toContain(
    "and i think to myself what a wonderful world",
  );
});
|
||||
Loading…
Add table
Add a link
Reference in a new issue