Refactor YouTube transcripts component for proper tool mode

2024-12-01 01:16:21 -03:00 · 2024-12-01 01:16:21 -03:00 · 2a9396d68e
commit 2a9396d68e
parent d0c93992d8
1 changed files with 38 additions and 74 deletions
--- a/src/backend/base/langflow/components/tools/youtube_transcripts.py
+++ b/src/backend/base/langflow/components/tools/youtube_transcripts.py
@ -1,49 +1,25 @@
-from langchain.tools import StructuredTool
+
 from langchain_community.document_loaders import YoutubeLoader
 from langchain_community.document_loaders.youtube import TranscriptFormat
-from langchain_core.tools import ToolException
-from pydantic import BaseModel, Field

-from langflow.base.langchain_utilities.model import LCToolComponent
-from langflow.field_typing import Tool
+from langflow.custom import Component
 from langflow.inputs import DropdownInput, IntInput, MultilineInput
 from langflow.schema import Data
 from langflow.template import Output


-class YoutubeApiSchema(BaseModel):
-    """Schema to define the input structure for the tool."""
-
-    url: str = Field(..., description="The YouTube URL to get transcripts from.")
-    transcript_format: TranscriptFormat = Field(
-        TranscriptFormat.TEXT,
-        description="The format of the transcripts. Either 'text' for a single "
-        "text output or 'chunks' for timestamped chunks.",
-    )
-    chunk_size_seconds: int = Field(
-        120,
-        description="The size of each transcript chunk in seconds. Only "
-        "applicable when 'Transcript Format' is set to 'chunks'.",
-    )
-    language: str = Field(
-        "",
-        description="A comma-separated list of language codes in descending " "priority. Leave empty for default.",
-    )
-    translation: str = Field(
-        "", description="Translate the transcripts to the specified language. " "Leave empty for no translation."
-    )
-
-
-class YouTubeTranscriptsComponent(LCToolComponent):
+class YouTubeTranscriptsComponent(Component):
    """A component that extracts spoken content from YouTube videos as transcripts."""

    display_name: str = "YouTube Transcripts"
    description: str = "Extracts spoken content from YouTube videos as transcripts."
    icon: str = "YouTube"
+    name="YouTubeTranscripts"

    inputs = [
        MultilineInput(
-            name="url", display_name="Video URL", info="Enter the YouTube video URL to get transcripts from."
+            name="url", display_name="Video URL", info="Enter the YouTube video URL to get transcripts from.",
+            tool_mode=True,
        ),
        DropdownInput(
            name="transcript_format",
@ -52,6 +28,7 @@ class YouTubeTranscriptsComponent(LCToolComponent):
            value="text",
            info="The format of the transcripts. Either 'text' for a single output "
            "or 'chunks' for timestamped chunks.",
+            advanced=True,
        ),
        IntInput(
            name="chunk_size_seconds",
@ -61,10 +38,23 @@ class YouTubeTranscriptsComponent(LCToolComponent):
            info="The size of each transcript chunk in seconds. Only applicable when "
            "'Transcript Format' is set to 'chunks'.",
        ),
-        MultilineInput(
+        DropdownInput(
            name="language",
            display_name="Language",
-            info="A comma-separated list of language codes in descending priority. " "Leave empty for default.",
+            options = [
+            "af", "ak", "sq", "am", "ar", "hy", "as", "ay", "az", "bn", "eu", "be", "bho",
+            "bs", "bg", "my", "ca", "ceb", "zh", "zh-HK", "zh-CN", "zh-SG", "zh-TW",
+            "zh-Hans", "zh-Hant", "hak-TW", "nan-TW", "co", "hr", "cs", "da", "dv", "nl",
+            "en", "en-US", "eo", "et", "ee", "fil", "fi", "fr", "gl", "lg", "ka", "de",
+            "el", "gn", "gu", "ht", "ha", "haw", "iw", "hi", "hmn", "hu", "is", "ig", "id",
+            "ga", "it", "ja", "jv", "kn", "kk", "km", "rw", "ko", "kri", "ku", "ky", "lo",
+            "la", "lv", "ln", "lt", "lb", "mk", "mg", "ms", "ml", "mt", "mi", "mr", "mn",
+            "ne", "nso", "no", "ny", "or", "om", "ps", "fa", "pl", "pt", "pa", "qu", "ro",
+            "ru", "sm", "sa", "gd", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es",
+            "su", "sw", "sv", "tg", "ta", "tt", "te", "th", "ti", "ts", "tr", "tk", "uk",
+            "ur", "ug", "uz", "vi", "cy", "fy", "xh", "yi", "yo", "zu"],
+            value="en",
+            info="Specify to make sure the transcripts are retrieved in your desired language. Defaults to English: 'en'",
        ),
        DropdownInput(
            name="translation",
@ -77,7 +67,6 @@ class YouTubeTranscriptsComponent(LCToolComponent):

    outputs = [
        Output(name="transcripts", display_name="Data", method="build_youtube_transcripts"),
-        Output(name="transcripts_tool", display_name="Tool", method="build_youtube_tool"),
    ]

    def build_youtube_transcripts(self) -> Data | list[Data]:
@ -94,7 +83,7 @@ class YouTubeTranscriptsComponent(LCToolComponent):
                if self.transcript_format == "text"
                else TranscriptFormat.CHUNKS,
                chunk_size_seconds=self.chunk_size_seconds,
-                language=self.language.split(",") if self.language else ["en"],
+                language=[self.language],
                translation=self.translation or None,
            )

@ -130,44 +119,19 @@ class YouTubeTranscriptsComponent(LCToolComponent):
        Returns:
            Data | list[Data]: Video transcripts as single Data or list of Data.
        """
-        try:
-            if isinstance(transcript_format, str):
-                transcript_format = TranscriptFormat(transcript_format)
-            loader = YoutubeLoader.from_youtube_url(
-                url,
-                transcript_format=TranscriptFormat.TEXT
-                if transcript_format == TranscriptFormat.TEXT
-                else TranscriptFormat.CHUNKS,
-                chunk_size_seconds=chunk_size_seconds,
-                language=language.split(",") if language else ["en"],
-                translation=translation or None,
-            )
+        if isinstance(transcript_format, str):
+            transcript_format = TranscriptFormat(transcript_format)
+        loader = YoutubeLoader.from_youtube_url(
+            url,
+            transcript_format=TranscriptFormat.TEXT
+            if transcript_format == TranscriptFormat.TEXT
+            else TranscriptFormat.CHUNKS,
+            chunk_size_seconds=chunk_size_seconds,
+            language=language.split(",") if language else ["en"],
+            translation=translation or None,
+        )

-            transcripts = loader.load()
-            if transcript_format == TranscriptFormat.TEXT and len(transcripts) > 0:
-                return Data(data={"transcript": transcripts[0].page_content})
-            return [Data(data={"content": doc.page_content, "metadata": doc.metadata}) for doc in transcripts]
-        except Exception as exc:
-            msg = f"Failed to get YouTube transcripts: {exc!s}"
-            raise ToolException(msg) from exc
-
-    def build_youtube_tool(self) -> Tool:
-        """Method to build the transcripts tool.
-
-        Returns:
-            Tool: A structured tool that uses the transcripts method.
-
-        Raises:
-            RuntimeError: If tool creation fails.
-        """
-        try:
-            return StructuredTool.from_function(
-                name="youtube_transcripts",
-                description="Get transcripts from YouTube videos.",
-                func=self.youtube_transcripts,
-                args_schema=YoutubeApiSchema,
-            )
-
-        except Exception as exc:
-            msg = f"Failed to build the YouTube transcripts tool: {exc!s}"
-            raise RuntimeError(msg) from exc
+        transcripts = loader.load()
+        if transcript_format == TranscriptFormat.TEXT and len(transcripts) > 0:
+            return Data(data={"transcript": transcripts[0].page_content})
+        return [Data(data={"content": doc.page_content, "metadata": doc.metadata}) for doc in transcripts]