Refactor youtube transcripts for proper tool mode

This commit is contained in:
Rodrigo Nader 2024-12-01 01:16:21 -03:00 committed by GitHub
commit 2a9396d68e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -1,49 +1,25 @@
from langchain.tools import StructuredTool
from langchain_community.document_loaders import YoutubeLoader
from langchain_community.document_loaders.youtube import TranscriptFormat
from langchain_core.tools import ToolException
from pydantic import BaseModel, Field
from langflow.base.langchain_utilities.model import LCToolComponent
from langflow.field_typing import Tool
from langflow.custom import Component
from langflow.inputs import DropdownInput, IntInput, MultilineInput
from langflow.schema import Data
from langflow.template import Output
class YoutubeApiSchema(BaseModel):
    """Argument schema for the YouTube transcripts tool.

    Only ``url`` is required; every other field carries a default.
    """

    # Required: the video to fetch transcripts for.
    url: str = Field(
        default=...,
        description="The YouTube URL to get transcripts from.",
    )
    # Output shape; defaults to a single text transcript.
    transcript_format: TranscriptFormat = Field(
        default=TranscriptFormat.TEXT,
        description=(
            "The format of the transcripts. Either 'text' for a single text output or 'chunks' for timestamped chunks."
        ),
    )
    # Chunk length in seconds; defaults to 120.
    chunk_size_seconds: int = Field(
        default=120,
        description=(
            "The size of each transcript chunk in seconds. Only applicable when 'Transcript Format' is set to 'chunks'."
        ),
    )
    # Preferred transcript languages; empty string by default.
    language: str = Field(
        default="",
        description="A comma-separated list of language codes in descending priority. Leave empty for default.",
    )
    # Target language for translation; empty string by default.
    translation: str = Field(
        default="",
        description="Translate the transcripts to the specified language. Leave empty for no translation.",
    )
class YouTubeTranscriptsComponent(LCToolComponent):
class YouTubeTranscriptsComponent(Component):
"""A component that extracts spoken content from YouTube videos as transcripts."""
display_name: str = "YouTube Transcripts"
description: str = "Extracts spoken content from YouTube videos as transcripts."
icon: str = "YouTube"
name="YouTubeTranscripts"
inputs = [
MultilineInput(
name="url", display_name="Video URL", info="Enter the YouTube video URL to get transcripts from."
name="url", display_name="Video URL", info="Enter the YouTube video URL to get transcripts from.",
tool_mode=True,
),
DropdownInput(
name="transcript_format",
@ -52,6 +28,7 @@ class YouTubeTranscriptsComponent(LCToolComponent):
value="text",
info="The format of the transcripts. Either 'text' for a single output "
"or 'chunks' for timestamped chunks.",
advanced=True,
),
IntInput(
name="chunk_size_seconds",
@ -61,10 +38,23 @@ class YouTubeTranscriptsComponent(LCToolComponent):
info="The size of each transcript chunk in seconds. Only applicable when "
"'Transcript Format' is set to 'chunks'.",
),
MultilineInput(
DropdownInput(
name="language",
display_name="Language",
info="A comma-separated list of language codes in descending priority. " "Leave empty for default.",
options = [
"af", "ak", "sq", "am", "ar", "hy", "as", "ay", "az", "bn", "eu", "be", "bho",
"bs", "bg", "my", "ca", "ceb", "zh", "zh-HK", "zh-CN", "zh-SG", "zh-TW",
"zh-Hans", "zh-Hant", "hak-TW", "nan-TW", "co", "hr", "cs", "da", "dv", "nl",
"en", "en-US", "eo", "et", "ee", "fil", "fi", "fr", "gl", "lg", "ka", "de",
"el", "gn", "gu", "ht", "ha", "haw", "iw", "hi", "hmn", "hu", "is", "ig", "id",
"ga", "it", "ja", "jv", "kn", "kk", "km", "rw", "ko", "kri", "ku", "ky", "lo",
"la", "lv", "ln", "lt", "lb", "mk", "mg", "ms", "ml", "mt", "mi", "mr", "mn",
"ne", "nso", "no", "ny", "or", "om", "ps", "fa", "pl", "pt", "pa", "qu", "ro",
"ru", "sm", "sa", "gd", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es",
"su", "sw", "sv", "tg", "ta", "tt", "te", "th", "ti", "ts", "tr", "tk", "uk",
"ur", "ug", "uz", "vi", "cy", "fy", "xh", "yi", "yo", "zu"],
value="en",
info="Specify to make sure the transcripts are retrieved in your desired language. Defaults to English: 'en'",
),
DropdownInput(
name="translation",
@ -77,7 +67,6 @@ class YouTubeTranscriptsComponent(LCToolComponent):
outputs = [
Output(name="transcripts", display_name="Data", method="build_youtube_transcripts"),
Output(name="transcripts_tool", display_name="Tool", method="build_youtube_tool"),
]
def build_youtube_transcripts(self) -> Data | list[Data]:
@ -94,7 +83,7 @@ class YouTubeTranscriptsComponent(LCToolComponent):
if self.transcript_format == "text"
else TranscriptFormat.CHUNKS,
chunk_size_seconds=self.chunk_size_seconds,
language=self.language.split(",") if self.language else ["en"],
language=[self.language],
translation=self.translation or None,
)
@ -130,44 +119,19 @@ class YouTubeTranscriptsComponent(LCToolComponent):
Returns:
Data | list[Data]: Video transcripts as single Data or list of Data.
"""
try:
if isinstance(transcript_format, str):
transcript_format = TranscriptFormat(transcript_format)
loader = YoutubeLoader.from_youtube_url(
url,
transcript_format=TranscriptFormat.TEXT
if transcript_format == TranscriptFormat.TEXT
else TranscriptFormat.CHUNKS,
chunk_size_seconds=chunk_size_seconds,
language=language.split(",") if language else ["en"],
translation=translation or None,
)
if isinstance(transcript_format, str):
transcript_format = TranscriptFormat(transcript_format)
loader = YoutubeLoader.from_youtube_url(
url,
transcript_format=TranscriptFormat.TEXT
if transcript_format == TranscriptFormat.TEXT
else TranscriptFormat.CHUNKS,
chunk_size_seconds=chunk_size_seconds,
language=language.split(",") if language else ["en"],
translation=translation or None,
)
transcripts = loader.load()
if transcript_format == TranscriptFormat.TEXT and len(transcripts) > 0:
return Data(data={"transcript": transcripts[0].page_content})
return [Data(data={"content": doc.page_content, "metadata": doc.metadata}) for doc in transcripts]
except Exception as exc:
msg = f"Failed to get YouTube transcripts: {exc!s}"
raise ToolException(msg) from exc
def build_youtube_tool(self) -> Tool:
    """Wrap ``self.youtube_transcripts`` in a structured tool.

    The tool is named ``youtube_transcripts`` and takes its argument schema
    from ``YoutubeApiSchema``.

    Returns:
        Tool: The structured tool built around the transcripts method.

    Raises:
        RuntimeError: If building the tool raises any exception; the original
            exception is chained as the cause.
    """
    try:
        # The attribute lookup stays inside the ``try`` so that a failure to
        # resolve the bound method is reported as a build failure as well.
        transcripts_tool = StructuredTool.from_function(
            func=self.youtube_transcripts,
            name="youtube_transcripts",
            description="Get transcripts from YouTube videos.",
            args_schema=YoutubeApiSchema,
        )
    except Exception as error:
        failure = f"Failed to build the YouTube transcripts tool: {error!s}"
        raise RuntimeError(failure) from error
    else:
        return transcripts_tool
transcripts = loader.load()
if transcript_format == TranscriptFormat.TEXT and len(transcripts) > 0:
return Data(data={"transcript": transcripts[0].page_content})
return [Data(data={"content": doc.page_content, "metadata": doc.metadata}) for doc in transcripts]