feat: Add YouTube transcript extraction component and frontend integration (#4502)

* add new youtube transcripts component
* [autofix.ci] apply automated fixes
* ✨ (youtube-transcripts.spec.ts): add integration test for youtube transcripts component in the frontend to ensure user can interact with it successfully
* [autofix.ci] apply automated fixes

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
2024-11-12 10:13:30 -03:00 · 2024-11-12 10:13:30 -03:00 · 84dd03198d
commit 84dd03198d
parent 0e101ef1e7
8 changed files with 9946 additions and 0 deletions
--- a/src/backend/base/langflow/components/tools/__init__.py
+++ b/src/backend/base/langflow/components/tools/__init__.py
@ -19,6 +19,7 @@ from .tavily_search import TavilySearchToolComponent
 from .wikipedia_api import WikipediaAPIComponent
 from .wolfram_alpha_api import WolframAlphaAPIComponent
 from .yahoo_finance import YfinanceToolComponent
+from .youtube_transcripts import YouTubeTranscriptsComponent

 with warnings.catch_warnings():
    warnings.simplefilter("ignore", LangChainDeprecationWarning)
@ -45,4 +46,5 @@ __all__ = [
    "WikipediaAPIComponent",
    "WolframAlphaAPIComponent",
    "YfinanceToolComponent",
+    "YouTubeTranscriptsComponent",
 ]
--- a/src/backend/base/langflow/components/tools/youtube_transcripts.py
+++ b/src/backend/base/langflow/components/tools/youtube_transcripts.py
@ -0,0 +1,173 @@
+from langchain.tools import StructuredTool
+from langchain_community.document_loaders import YoutubeLoader
+from langchain_community.document_loaders.youtube import TranscriptFormat
+from langchain_core.tools import ToolException
+from pydantic import BaseModel, Field
+
+from langflow.base.langchain_utilities.model import LCToolComponent
+from langflow.field_typing import Tool
+from langflow.inputs import DropdownInput, IntInput, MultilineInput
+from langflow.schema import Data
+from langflow.template import Output
+
+
+class YoutubeApiSchema(BaseModel):
+    """Schema to define the input structure for the tool."""
+
+    url: str = Field(..., description="The YouTube URL to get transcripts from.")
+    transcript_format: TranscriptFormat = Field(
+        TranscriptFormat.TEXT,
+        description="The format of the transcripts. Either 'text' for a single "
+        "text output or 'chunks' for timestamped chunks.",
+    )
+    chunk_size_seconds: int = Field(
+        120,
+        description="The size of each transcript chunk in seconds. Only "
+        "applicable when 'Transcript Format' is set to 'chunks'.",
+    )
+    language: str = Field(
+        "",
+        description="A comma-separated list of language codes in descending " "priority. Leave empty for default.",
+    )
+    translation: str = Field(
+        "", description="Translate the transcripts to the specified language. " "Leave empty for no translation."
+    )
+
+
+class YouTubeTranscriptsComponent(LCToolComponent):
+    """A component that extracts spoken content from YouTube videos as transcripts."""
+
+    display_name: str = "YouTube Transcripts"
+    description: str = "Extracts spoken content from YouTube videos as transcripts."
+    icon: str = "YouTube"
+
+    inputs = [
+        MultilineInput(
+            name="url", display_name="Video URL", info="Enter the YouTube video URL to get transcripts from."
+        ),
+        DropdownInput(
+            name="transcript_format",
+            display_name="Transcript Format",
+            options=["text", "chunks"],
+            value="text",
+            info="The format of the transcripts. Either 'text' for a single output "
+            "or 'chunks' for timestamped chunks.",
+        ),
+        IntInput(
+            name="chunk_size_seconds",
+            display_name="Chunk Size (seconds)",
+            value=60,
+            advanced=True,
+            info="The size of each transcript chunk in seconds. Only applicable when "
+            "'Transcript Format' is set to 'chunks'.",
+        ),
+        MultilineInput(
+            name="language",
+            display_name="Language",
+            info="A comma-separated list of language codes in descending priority. " "Leave empty for default.",
+        ),
+        DropdownInput(
+            name="translation",
+            display_name="Translation Language",
+            advanced=True,
+            options=["", "en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "hi", "ar", "id"],
+            info="Translate the transcripts to the specified language. " "Leave empty for no translation.",
+        ),
+    ]
+
+    outputs = [
+        Output(name="transcripts", display_name="Data", method="build_youtube_transcripts"),
+        Output(name="transcripts_tool", display_name="Tool", method="build_youtube_tool"),
+    ]
+
+    def build_youtube_transcripts(self) -> Data | list[Data]:
+        """Method to build transcripts from the provided YouTube URL.
+
+        Returns:
+            Data | list[Data]: The transcripts of the video, either as a single
+            Data object or a list of Data objects.
+        """
+        try:
+            loader = YoutubeLoader.from_youtube_url(
+                self.url,
+                transcript_format=TranscriptFormat.TEXT
+                if self.transcript_format == "text"
+                else TranscriptFormat.CHUNKS,
+                chunk_size_seconds=self.chunk_size_seconds,
+                language=self.language.split(",") if self.language else ["en"],
+                translation=self.translation if self.translation else None,
+            )
+
+            transcripts = loader.load()
+
+            if self.transcript_format == "text":
+                # Extract only the page_content from the Document
+                return Data(data={"transcripts": transcripts[0].page_content})
+            # For chunks, extract page_content and metadata separately
+            return [Data(data={"content": doc.page_content, "metadata": doc.metadata}) for doc in transcripts]
+
+        except Exception as exc:  # noqa: BLE001
+            # Using a specific error type for the return value
+            return Data(data={"error": f"Failed to get YouTube transcripts: {exc!s}"})
+
+    def youtube_transcripts(
+        self,
+        url: str = "",
+        transcript_format: TranscriptFormat = TranscriptFormat.TEXT,
+        chunk_size_seconds: int = 120,
+        language: str = "",
+        translation: str = "",
+    ) -> Data | list[Data]:
+        """Helper method to handle transcripts outside of component calls.
+
+        Args:
+            url: The YouTube URL to get transcripts from.
+            transcript_format: Format of transcripts ('text' or 'chunks').
+            chunk_size_seconds: Size of each transcript chunk in seconds.
+            language: Comma-separated list of language codes.
+            translation: Target language for translation.
+
+        Returns:
+            Data | list[Data]: Video transcripts as single Data or list of Data.
+        """
+        try:
+            if isinstance(transcript_format, str):
+                transcript_format = TranscriptFormat(transcript_format)
+            loader = YoutubeLoader.from_youtube_url(
+                url,
+                transcript_format=TranscriptFormat.TEXT
+                if transcript_format == TranscriptFormat.TEXT
+                else TranscriptFormat.CHUNKS,
+                chunk_size_seconds=chunk_size_seconds,
+                language=language.split(",") if language else ["en"],
+                translation=translation if translation else None,
+            )
+
+            transcripts = loader.load()
+            if transcript_format == TranscriptFormat.TEXT and len(transcripts) > 0:
+                return Data(data={"transcript": transcripts[0].page_content})
+            return [Data(data={"content": doc.page_content, "metadata": doc.metadata}) for doc in transcripts]
+        except Exception as exc:
+            msg = f"Failed to get YouTube transcripts: {exc!s}"
+            raise ToolException(msg) from exc
+
+    def build_youtube_tool(self) -> Tool:
+        """Method to build the transcripts tool.
+
+        Returns:
+            Tool: A structured tool that uses the transcripts method.
+
+        Raises:
+            RuntimeError: If tool creation fails.
+        """
+        try:
+            return StructuredTool.from_function(
+                name="youtube_transcripts",
+                description="Get transcripts from YouTube videos.",
+                func=self.youtube_transcripts,
+                args_schema=YoutubeApiSchema,
+            )
+
+        except Exception as exc:
+            msg = f"Failed to build the YouTube transcripts tool: {exc!s}"
+            raise RuntimeError(msg) from exc
--- a/src/frontend/src/icons/Youtube/index.tsx
+++ b/src/frontend/src/icons/Youtube/index.tsx
@ -0,0 +1,9 @@
+import React, { forwardRef } from "react";
+import YouTubeIcon from "./youtube";
+
+/**
+ * YouTube icon component that forwards the received ref and all props
+ * to the underlying YouTube SVG component.
+ */
+export const YouTubeSvgIcon = forwardRef<
+  SVGSVGElement,
+  React.PropsWithChildren<{}>
+>((props, ref) => <YouTubeIcon ref={ref} {...props} />);
--- a/src/frontend/src/icons/Youtube/youtube.jsx
+++ b/src/frontend/src/icons/Youtube/youtube.jsx
--- a/src/frontend/src/icons/Youtube/youtube.svg
+++ b/src/frontend/src/icons/Youtube/youtube.svg
--- a/src/frontend/src/utils/styleUtils.ts
+++ b/src/frontend/src/utils/styleUtils.ts
@ -6,6 +6,7 @@ import { MilvusIcon } from "@/icons/Milvus";
 import Perplexity from "@/icons/Perplexity/Perplexity";
 import { TavilyIcon } from "@/icons/Tavily";
 import { UnstructuredIcon } from "@/icons/Unstructured";
+import YouTubeIcon from "@/icons/Youtube/youtube";
 import { ZepMemoryIcon } from "@/icons/ZepMemory";
 import { AthenaIcon } from "@/icons/athena/index";
 import { freezeAllIcon } from "@/icons/freezeAll";
@ -658,6 +659,7 @@ export const nodeIconsLucide: iconsType = {
  GithubIcon,
  FaGithub,
  FaApple,
+  YouTube: YouTubeIcon,
  Milvus: MilvusIcon,
  ExaSearch: ExaIcon,
  ZepMemory: ZepMemoryIcon,
--- a/src/frontend/tests/core/unit/codeAreaModalComponent.spec.ts
+++ b/src/frontend/tests/core/unit/codeAreaModalComponent.spec.ts
@ -40,6 +40,8 @@ test("CodeAreaModalComponent", async ({ page }) => {
  await page.getByTestId("sidebar-legacy-switch").isVisible({ timeout: 5000 });
  await page.getByTestId("sidebar-legacy-switch").click();

+  await page.waitForTimeout(1000);
+
  await page
    .getByTestId("prototypesPython Function")
    .dragTo(page.locator('//*[@id="react-flow-id"]'));
--- a/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts
+++ b/src/frontend/tests/extended/integrations/youtube-transcripts.spec.ts
@ -0,0 +1,74 @@
+import { expect, test } from "@playwright/test";
+
+// End-to-end check: add the YouTube Transcripts component to a blank flow,
+// run it against a fixed video and verify the transcript text in the output.
+test("user should be able to use youtube transcripts component", async ({
+  page,
+}) => {
+  const LOAD_TIMEOUT_MS = 30000;
+  const SHORT_PAUSE_MS = 1000;
+
+  // Wait until the main page has rendered.
+  await page.goto("/");
+  await page.waitForSelector('[data-testid="mainpage_title"]', {
+    timeout: LOAD_TIMEOUT_MS,
+  });
+
+  await page.waitForSelector('[id="new-project-btn"]', {
+    timeout: LOAD_TIMEOUT_MS,
+  });
+
+  // Count already-open modals; any failure is treated as "none open".
+  let openModals = 0;
+  try {
+    const modalTitle = await page?.getByTestId("modal-title");
+    if (modalTitle) {
+      openModals = await modalTitle.count();
+    }
+  } catch (error) {
+    openModals = 0;
+  }
+
+  // Keep clicking "New Flow" until the modal shows up.
+  while (openModals === 0) {
+    await page.getByText("New Flow", { exact: true }).click();
+    await page.waitForTimeout(3000);
+    openModals = await page.getByTestId("modal-title")?.count();
+  }
+
+  // Start from a blank flow and search the sidebar for the component.
+  await page.getByTestId("blank-flow").click();
+  const sidebarSearch = page.getByTestId("sidebar-search-input");
+  await sidebarSearch.click();
+  await sidebarSearch.fill("youtube");
+
+  await page.waitForTimeout(SHORT_PAUSE_MS);
+
+  // Drag the component onto the canvas and fit it into view.
+  const canvas = page.locator('//*[@id="react-flow-id"]');
+  await page.locator('//*[@id="toolsYouTube Transcripts"]').dragTo(canvas);
+  await page.mouse.up();
+  await page.mouse.down();
+  await page.getByTestId("fit_view").click();
+
+  // Click every alert-triangle icon until none remain.
+  const alertIcons = page.getByTestId("icon-AlertTriangle");
+  let outdatedComponents = await alertIcons.count();
+
+  while (outdatedComponents > 0) {
+    await alertIcons.first().click();
+    await page.waitForTimeout(SHORT_PAUSE_MS);
+    outdatedComponents = await alertIcons.count();
+  }
+
+  // Configure the component and run it.
+  await page
+    .getByTestId("textarea_str_url")
+    .fill("https://www.youtube.com/watch?v=VqhCQZaH4Vs");
+
+  await page.getByTestId("textarea_str_language").fill("en");
+
+  await page.getByTestId("button_run_youtube transcripts").click();
+
+  await page.waitForSelector("text=built successfully", {
+    timeout: LOAD_TIMEOUT_MS,
+  });
+
+  // Open the output inspection and read the first cell's value.
+  await page.getByTestId("output-inspection-data").first().click();
+
+  await page.waitForTimeout(SHORT_PAUSE_MS);
+
+  await page.getByRole("gridcell").first().click();
+
+  const transcript = await page.getByPlaceholder("Empty").inputValue();
+  const transcriptLower = transcript.toLowerCase();
+  expect(transcript.length).toBeGreaterThan(10);
+  expect(transcriptLower).toContain("i see trees of green");
+  expect(transcriptLower).toContain(
+    "and i think to myself what a wonderful world",
+  );
+});