feat: Add YouTube transcript extraction component and frontend integration (#4502)

* add new youtube transcripts component

* [autofix.ci] apply automated fixes

* (youtube-transcripts.spec.ts): add a frontend integration test for the YouTube Transcripts component to ensure users can interact with it successfully

* [autofix.ci] apply automated fixes

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
Cristhian Zanforlin Lousa 2024-11-12 10:13:30 -03:00 committed by GitHub
commit 84dd03198d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 9946 additions and 0 deletions

View file

@ -19,6 +19,7 @@ from .tavily_search import TavilySearchToolComponent
from .wikipedia_api import WikipediaAPIComponent
from .wolfram_alpha_api import WolframAlphaAPIComponent
from .yahoo_finance import YfinanceToolComponent
from .youtube_transcripts import YouTubeTranscriptsComponent
with warnings.catch_warnings():
warnings.simplefilter("ignore", LangChainDeprecationWarning)
@ -45,4 +46,5 @@ __all__ = [
"WikipediaAPIComponent",
"WolframAlphaAPIComponent",
"YfinanceToolComponent",
"YouTubeTranscriptsComponent",
]

View file

@ -0,0 +1,173 @@
from langchain.tools import StructuredTool
from langchain_community.document_loaders import YoutubeLoader
from langchain_community.document_loaders.youtube import TranscriptFormat
from langchain_core.tools import ToolException
from pydantic import BaseModel, Field
from langflow.base.langchain_utilities.model import LCToolComponent
from langflow.field_typing import Tool
from langflow.inputs import DropdownInput, IntInput, MultilineInput
from langflow.schema import Data
from langflow.template import Output
class YoutubeApiSchema(BaseModel):
    """Schema to define the input structure for the tool."""

    # Field order is kept as-is: it is the order in which the arguments appear in the generated schema.

    # Required: the only argument without a default.
    url: str = Field(
        ...,
        description="The YouTube URL to get transcripts from.",
    )
    # Defaults to one plain-text transcript rather than timestamped chunks.
    transcript_format: TranscriptFormat = Field(
        default=TranscriptFormat.TEXT,
        description=(
            "The format of the transcripts. Either 'text' for a single text output "
            "or 'chunks' for timestamped chunks."
        ),
    )
    chunk_size_seconds: int = Field(
        default=120,
        description=(
            "The size of each transcript chunk in seconds. "
            "Only applicable when 'Transcript Format' is set to 'chunks'."
        ),
    )
    # An empty string means "no preference given".
    language: str = Field(
        default="",
        description="A comma-separated list of language codes in descending priority. Leave empty for default.",
    )
    # An empty string means "do not translate".
    translation: str = Field(
        default="",
        description="Translate the transcripts to the specified language. Leave empty for no translation.",
    )
class YouTubeTranscriptsComponent(LCToolComponent):
    """A component that extracts spoken content from YouTube videos as transcripts.

    Two outputs are exposed:

    * ``transcripts`` -- runs the extraction with the component's own inputs and
      returns the result as ``Data``.
    * ``transcripts_tool`` -- wraps the extraction in a LangChain ``StructuredTool``
      whose arguments are described by ``YoutubeApiSchema``.
    """

    display_name: str = "YouTube Transcripts"
    description: str = "Extracts spoken content from YouTube videos as transcripts."
    icon: str = "YouTube"

    inputs = [
        MultilineInput(
            name="url",
            display_name="Video URL",
            info="Enter the YouTube video URL to get transcripts from.",
        ),
        DropdownInput(
            name="transcript_format",
            display_name="Transcript Format",
            options=["text", "chunks"],
            value="text",
            info="The format of the transcripts. Either 'text' for a single output "
            "or 'chunks' for timestamped chunks.",
        ),
        # NOTE(review): this UI default (60) differs from the tool default (120) used by
        # `youtube_transcripts` and `YoutubeApiSchema` -- confirm whether that is intended.
        IntInput(
            name="chunk_size_seconds",
            display_name="Chunk Size (seconds)",
            value=60,
            advanced=True,
            info="The size of each transcript chunk in seconds. Only applicable when "
            "'Transcript Format' is set to 'chunks'.",
        ),
        MultilineInput(
            name="language",
            display_name="Language",
            info="A comma-separated list of language codes in descending priority. Leave empty for default.",
        ),
        DropdownInput(
            name="translation",
            display_name="Translation Language",
            advanced=True,
            options=["", "en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "hi", "ar", "id"],
            info="Translate the transcripts to the specified language. Leave empty for no translation.",
        ),
    ]

    outputs = [
        Output(name="transcripts", display_name="Data", method="build_youtube_transcripts"),
        Output(name="transcripts_tool", display_name="Tool", method="build_youtube_tool"),
    ]

    @staticmethod
    def _parse_language_codes(language: str | None) -> list[str]:
        """Turn a comma-separated string into a clean list of language codes.

        Whitespace around each code is removed and empty entries are dropped, so
        ``"en, es,"`` becomes ``["en", "es"]``. Falls back to ``["en"]`` when
        nothing usable is left.
        """
        codes = [code.strip() for code in (language or "").split(",")]
        return [code for code in codes if code] or ["en"]

    @staticmethod
    def _chunks_to_data(documents) -> list[Data]:
        """Convert loaded documents into one ``Data`` per document (content plus metadata)."""
        return [Data(data={"content": doc.page_content, "metadata": doc.metadata}) for doc in documents]

    def _load_transcripts(
        self,
        url: str,
        *,
        as_text: bool,
        chunk_size_seconds: int,
        language: str | None,
        translation: str | None,
    ) -> list:
        """Build a ``YoutubeLoader`` for ``url`` and return the documents it loads.

        Shared by the component output and the tool so both normalise their
        arguments the same way. Any exception from the loader propagates to the
        caller, which decides how to report it.
        """
        loader = YoutubeLoader.from_youtube_url(
            # Multiline inputs easily pick up stray whitespace or a trailing newline.
            (url or "").strip(),
            # Anything that is not plain text is loaded as timestamped chunks.
            transcript_format=TranscriptFormat.TEXT if as_text else TranscriptFormat.CHUNKS,
            chunk_size_seconds=chunk_size_seconds,
            language=self._parse_language_codes(language),
            # An empty string means "no translation".
            translation=translation or None,
        )
        return loader.load()

    def build_youtube_transcripts(self) -> Data | list[Data]:
        """Method to build transcripts from the provided YouTube URL.

        Reads the component inputs ``url``, ``transcript_format``,
        ``chunk_size_seconds``, ``language`` and ``translation``.

        Returns:
            Data | list[Data]: For the ``"text"`` format, a single ``Data`` with the
                transcript under the ``"transcripts"`` key. Otherwise a list of
                ``Data`` objects, each with ``"content"`` and ``"metadata"``. On any
                failure, a single ``Data`` with an ``"error"`` message instead of
                raising.
        """
        try:
            as_text = self.transcript_format == "text"
            transcripts = self._load_transcripts(
                self.url,
                as_text=as_text,
                chunk_size_seconds=self.chunk_size_seconds,
                language=self.language,
                translation=self.translation,
            )
            if not as_text:
                return self._chunks_to_data(transcripts)
            if not transcripts:
                # Report the real problem instead of an opaque "list index out of range".
                return Data(
                    data={"error": "Failed to get YouTube transcripts: no transcript was returned for this video."}
                )
            # Extract only the page_content from the Document
            return Data(data={"transcripts": transcripts[0].page_content})
        except Exception as exc:  # noqa: BLE001
            # Deliberately best-effort: the failure is reported as data rather than raised.
            return Data(data={"error": f"Failed to get YouTube transcripts: {exc!s}"})

    def youtube_transcripts(
        self,
        url: str = "",
        transcript_format: TranscriptFormat = TranscriptFormat.TEXT,
        chunk_size_seconds: int = 120,
        language: str = "",
        translation: str = "",
    ) -> Data | list[Data]:
        """Helper method to handle transcripts outside of component calls.

        This is the function wrapped by the tool built in ``build_youtube_tool``.

        Args:
            url: The YouTube URL to get transcripts from.
            transcript_format: Format of transcripts ('text' or 'chunks'). A plain
                string is converted to ``TranscriptFormat`` first.
            chunk_size_seconds: Size of each transcript chunk in seconds.
            language: Comma-separated list of language codes.
            translation: Target language for translation.

        Returns:
            Data | list[Data]: For the text format with at least one loaded
                document, a single ``Data`` with the transcript under the
                ``"transcript"`` key. Otherwise a (possibly empty) list of ``Data``
                objects, each with ``"content"`` and ``"metadata"``.

        Raises:
            ToolException: If the format string is invalid or the transcripts
                cannot be loaded.
        """
        try:
            if isinstance(transcript_format, str):
                transcript_format = TranscriptFormat(transcript_format)
            as_text = transcript_format == TranscriptFormat.TEXT
            transcripts = self._load_transcripts(
                url,
                as_text=as_text,
                chunk_size_seconds=chunk_size_seconds,
                language=language,
                translation=translation,
            )
            if as_text and transcripts:
                return Data(data={"transcript": transcripts[0].page_content})
            return self._chunks_to_data(transcripts)
        except Exception as exc:
            msg = f"Failed to get YouTube transcripts: {exc!s}"
            raise ToolException(msg) from exc

    def build_youtube_tool(self) -> Tool:
        """Method to build the transcripts tool.

        Returns:
            Tool: A structured tool that uses the transcripts method.

        Raises:
            RuntimeError: If tool creation fails.
        """
        try:
            return StructuredTool.from_function(
                name="youtube_transcripts",
                description="Get transcripts from YouTube videos.",
                func=self.youtube_transcripts,
                args_schema=YoutubeApiSchema,
            )
        except Exception as exc:
            msg = f"Failed to build the YouTube transcripts tool: {exc!s}"
            raise RuntimeError(msg) from exc

View file

@ -0,0 +1,9 @@
import React, { forwardRef } from "react";
import YouTubeIcon from "./youtube";
/**
 * Ref-forwarding wrapper around the raw YouTube SVG component.
 * The received ref and all props are passed straight through to `YouTubeIcon`.
 */
export const YouTubeSvgIcon = forwardRef<
  SVGSVGElement,
  React.PropsWithChildren<{}>
>((props, ref) => <YouTubeIcon ref={ref} {...props} />);

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

After

Width:  |  Height:  |  Size: 149 KiB

View file

@ -6,6 +6,7 @@ import { MilvusIcon } from "@/icons/Milvus";
import Perplexity from "@/icons/Perplexity/Perplexity";
import { TavilyIcon } from "@/icons/Tavily";
import { UnstructuredIcon } from "@/icons/Unstructured";
import YouTubeIcon from "@/icons/Youtube/youtube";
import { ZepMemoryIcon } from "@/icons/ZepMemory";
import { AthenaIcon } from "@/icons/athena/index";
import { freezeAllIcon } from "@/icons/freezeAll";
@ -658,6 +659,7 @@ export const nodeIconsLucide: iconsType = {
GithubIcon,
FaGithub,
FaApple,
YouTube: YouTubeIcon,
Milvus: MilvusIcon,
ExaSearch: ExaIcon,
ZepMemory: ZepMemoryIcon,

View file

@ -40,6 +40,8 @@ test("CodeAreaModalComponent", async ({ page }) => {
await page.getByTestId("sidebar-legacy-switch").isVisible({ timeout: 5000 });
await page.getByTestId("sidebar-legacy-switch").click();
await page.waitForTimeout(1000);
await page
.getByTestId("prototypesPython Function")
.dragTo(page.locator('//*[@id="react-flow-id"]'));

View file

@ -0,0 +1,74 @@
import { expect, test } from "@playwright/test";
// End-to-end check: add the "YouTube Transcripts" component to a blank flow,
// run it against a fixed video URL and assert on the text shown in the output
// inspection view.
test("user should be able to use youtube transcripts component", async ({
page,
}) => {
await page.goto("/");
// Wait (up to 30 s each) for the main page title and the new-project button.
await page.waitForSelector('[data-testid="mainpage_title"]', {
timeout: 30000,
});
await page.waitForSelector('[id="new-project-btn"]', {
timeout: 30000,
});
// Count elements with the "modal-title" test id; any error while counting is
// treated as "no modal".
let modalCount = 0;
try {
const modalTitleElement = await page?.getByTestId("modal-title");
if (modalTitleElement) {
modalCount = await modalTitleElement.count();
}
} catch (error) {
modalCount = 0;
}
// Keep clicking "New Flow" (with a 3 s pause) until a modal title is present.
while (modalCount === 0) {
await page.getByText("New Flow", { exact: true }).click();
await page.waitForTimeout(3000);
modalCount = await page.getByTestId("modal-title")?.count();
}
await page.getByTestId("blank-flow").click();
// Search the sidebar for "youtube" and drag the matching entry onto the canvas.
await page.getByTestId("sidebar-search-input").click();
await page.getByTestId("sidebar-search-input").fill("youtube");
await page.waitForTimeout(1000);
await page
.locator('//*[@id="toolsYouTube Transcripts"]')
.dragTo(page.locator('//*[@id="react-flow-id"]'));
// NOTE(review): up() followed by down() ends with the mouse button pressed —
// confirm this order is intentional.
await page.mouse.up();
await page.mouse.down();
await page.getByTestId("fit_view").click();
// Click every "icon-AlertTriangle" element until none remain; the variable
// name suggests these flag outdated components.
let outdatedComponents = await page.getByTestId("icon-AlertTriangle").count();
while (outdatedComponents > 0) {
await page.getByTestId("icon-AlertTriangle").first().click();
await page.waitForTimeout(1000);
outdatedComponents = await page.getByTestId("icon-AlertTriangle").count();
}
// Fill in the video URL and the language field.
await page
.getByTestId("textarea_str_url")
.fill("https://www.youtube.com/watch?v=VqhCQZaH4Vs");
await page.getByTestId("textarea_str_language").fill("en");
// Run the component and wait up to 30 s for the text "built successfully".
await page.getByTestId("button_run_youtube transcripts").click();
await page.waitForSelector("text=built successfully", { timeout: 30000 });
// Open the output inspection, click the first grid cell and read the value of
// the input whose placeholder is "Empty".
await page.getByTestId("output-inspection-data").first().click();
await page.waitForTimeout(1000);
await page.getByRole("gridcell").first().click();
const value = await page.getByPlaceholder("Empty").inputValue();
// The value must be non-trivial and contain the two expected phrases
// (presumably lines from the video's transcript), compared case-insensitively.
expect(value.length).toBeGreaterThan(10);
expect(value.toLowerCase()).toContain("i see trees of green");
expect(value.toLowerCase()).toContain(
"and i think to myself what a wonderful world",
);
});