Refactor YouTube transcripts for proper tool mode
This commit is contained in:
parent
d0c93992d8
commit
2a9396d68e
1 changed file with 38 additions and 74 deletions
|
|
@ -1,49 +1,25 @@
|
|||
from langchain.tools import StructuredTool
|
||||
|
||||
from langchain_community.document_loaders import YoutubeLoader
|
||||
from langchain_community.document_loaders.youtube import TranscriptFormat
|
||||
from langchain_core.tools import ToolException
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from langflow.base.langchain_utilities.model import LCToolComponent
|
||||
from langflow.field_typing import Tool
|
||||
from langflow.custom import Component
|
||||
from langflow.inputs import DropdownInput, IntInput, MultilineInput
|
||||
from langflow.schema import Data
|
||||
from langflow.template import Output
|
||||
|
||||
|
||||
class YoutubeApiSchema(BaseModel):
|
||||
"""Schema to define the input structure for the tool."""
|
||||
|
||||
url: str = Field(..., description="The YouTube URL to get transcripts from.")
|
||||
transcript_format: TranscriptFormat = Field(
|
||||
TranscriptFormat.TEXT,
|
||||
description="The format of the transcripts. Either 'text' for a single "
|
||||
"text output or 'chunks' for timestamped chunks.",
|
||||
)
|
||||
chunk_size_seconds: int = Field(
|
||||
120,
|
||||
description="The size of each transcript chunk in seconds. Only "
|
||||
"applicable when 'Transcript Format' is set to 'chunks'.",
|
||||
)
|
||||
language: str = Field(
|
||||
"",
|
||||
description="A comma-separated list of language codes in descending " "priority. Leave empty for default.",
|
||||
)
|
||||
translation: str = Field(
|
||||
"", description="Translate the transcripts to the specified language. " "Leave empty for no translation."
|
||||
)
|
||||
|
||||
|
||||
class YouTubeTranscriptsComponent(LCToolComponent):
|
||||
class YouTubeTranscriptsComponent(Component):
|
||||
"""A component that extracts spoken content from YouTube videos as transcripts."""
|
||||
|
||||
display_name: str = "YouTube Transcripts"
|
||||
description: str = "Extracts spoken content from YouTube videos as transcripts."
|
||||
icon: str = "YouTube"
|
||||
name="YouTubeTranscripts"
|
||||
|
||||
inputs = [
|
||||
MultilineInput(
|
||||
name="url", display_name="Video URL", info="Enter the YouTube video URL to get transcripts from."
|
||||
name="url", display_name="Video URL", info="Enter the YouTube video URL to get transcripts from.",
|
||||
tool_mode=True,
|
||||
),
|
||||
DropdownInput(
|
||||
name="transcript_format",
|
||||
|
|
@ -52,6 +28,7 @@ class YouTubeTranscriptsComponent(LCToolComponent):
|
|||
value="text",
|
||||
info="The format of the transcripts. Either 'text' for a single output "
|
||||
"or 'chunks' for timestamped chunks.",
|
||||
advanced=True,
|
||||
),
|
||||
IntInput(
|
||||
name="chunk_size_seconds",
|
||||
|
|
@ -61,10 +38,23 @@ class YouTubeTranscriptsComponent(LCToolComponent):
|
|||
info="The size of each transcript chunk in seconds. Only applicable when "
|
||||
"'Transcript Format' is set to 'chunks'.",
|
||||
),
|
||||
MultilineInput(
|
||||
DropdownInput(
|
||||
name="language",
|
||||
display_name="Language",
|
||||
info="A comma-separated list of language codes in descending priority. " "Leave empty for default.",
|
||||
options = [
|
||||
"af", "ak", "sq", "am", "ar", "hy", "as", "ay", "az", "bn", "eu", "be", "bho",
|
||||
"bs", "bg", "my", "ca", "ceb", "zh", "zh-HK", "zh-CN", "zh-SG", "zh-TW",
|
||||
"zh-Hans", "zh-Hant", "hak-TW", "nan-TW", "co", "hr", "cs", "da", "dv", "nl",
|
||||
"en", "en-US", "eo", "et", "ee", "fil", "fi", "fr", "gl", "lg", "ka", "de",
|
||||
"el", "gn", "gu", "ht", "ha", "haw", "iw", "hi", "hmn", "hu", "is", "ig", "id",
|
||||
"ga", "it", "ja", "jv", "kn", "kk", "km", "rw", "ko", "kri", "ku", "ky", "lo",
|
||||
"la", "lv", "ln", "lt", "lb", "mk", "mg", "ms", "ml", "mt", "mi", "mr", "mn",
|
||||
"ne", "nso", "no", "ny", "or", "om", "ps", "fa", "pl", "pt", "pa", "qu", "ro",
|
||||
"ru", "sm", "sa", "gd", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es",
|
||||
"su", "sw", "sv", "tg", "ta", "tt", "te", "th", "ti", "ts", "tr", "tk", "uk",
|
||||
"ur", "ug", "uz", "vi", "cy", "fy", "xh", "yi", "yo", "zu"],
|
||||
value="en",
|
||||
info="Specify to make sure the transcripts are retrieved in your desired language. Defaults to English: 'en'",
|
||||
),
|
||||
DropdownInput(
|
||||
name="translation",
|
||||
|
|
@ -77,7 +67,6 @@ class YouTubeTranscriptsComponent(LCToolComponent):
|
|||
|
||||
outputs = [
|
||||
Output(name="transcripts", display_name="Data", method="build_youtube_transcripts"),
|
||||
Output(name="transcripts_tool", display_name="Tool", method="build_youtube_tool"),
|
||||
]
|
||||
|
||||
def build_youtube_transcripts(self) -> Data | list[Data]:
|
||||
|
|
@ -94,7 +83,7 @@ class YouTubeTranscriptsComponent(LCToolComponent):
|
|||
if self.transcript_format == "text"
|
||||
else TranscriptFormat.CHUNKS,
|
||||
chunk_size_seconds=self.chunk_size_seconds,
|
||||
language=self.language.split(",") if self.language else ["en"],
|
||||
language=[self.language],
|
||||
translation=self.translation or None,
|
||||
)
|
||||
|
||||
|
|
@ -130,44 +119,19 @@ class YouTubeTranscriptsComponent(LCToolComponent):
|
|||
Returns:
|
||||
Data | list[Data]: Video transcripts as single Data or list of Data.
|
||||
"""
|
||||
try:
|
||||
if isinstance(transcript_format, str):
|
||||
transcript_format = TranscriptFormat(transcript_format)
|
||||
loader = YoutubeLoader.from_youtube_url(
|
||||
url,
|
||||
transcript_format=TranscriptFormat.TEXT
|
||||
if transcript_format == TranscriptFormat.TEXT
|
||||
else TranscriptFormat.CHUNKS,
|
||||
chunk_size_seconds=chunk_size_seconds,
|
||||
language=language.split(",") if language else ["en"],
|
||||
translation=translation or None,
|
||||
)
|
||||
if isinstance(transcript_format, str):
|
||||
transcript_format = TranscriptFormat(transcript_format)
|
||||
loader = YoutubeLoader.from_youtube_url(
|
||||
url,
|
||||
transcript_format=TranscriptFormat.TEXT
|
||||
if transcript_format == TranscriptFormat.TEXT
|
||||
else TranscriptFormat.CHUNKS,
|
||||
chunk_size_seconds=chunk_size_seconds,
|
||||
language=language.split(",") if language else ["en"],
|
||||
translation=translation or None,
|
||||
)
|
||||
|
||||
transcripts = loader.load()
|
||||
if transcript_format == TranscriptFormat.TEXT and len(transcripts) > 0:
|
||||
return Data(data={"transcript": transcripts[0].page_content})
|
||||
return [Data(data={"content": doc.page_content, "metadata": doc.metadata}) for doc in transcripts]
|
||||
except Exception as exc:
|
||||
msg = f"Failed to get YouTube transcripts: {exc!s}"
|
||||
raise ToolException(msg) from exc
|
||||
|
||||
def build_youtube_tool(self) -> Tool:
|
||||
"""Method to build the transcripts tool.
|
||||
|
||||
Returns:
|
||||
Tool: A structured tool that uses the transcripts method.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If tool creation fails.
|
||||
"""
|
||||
try:
|
||||
return StructuredTool.from_function(
|
||||
name="youtube_transcripts",
|
||||
description="Get transcripts from YouTube videos.",
|
||||
func=self.youtube_transcripts,
|
||||
args_schema=YoutubeApiSchema,
|
||||
)
|
||||
|
||||
except Exception as exc:
|
||||
msg = f"Failed to build the YouTube transcripts tool: {exc!s}"
|
||||
raise RuntimeError(msg) from exc
|
||||
transcripts = loader.load()
|
||||
if transcript_format == TranscriptFormat.TEXT and len(transcripts) > 0:
|
||||
return Data(data={"transcript": transcripts[0].page_content})
|
||||
return [Data(data={"content": doc.page_content, "metadata": doc.metadata}) for doc in transcripts]
|
||||
Loading…
Add table
Add a link
Reference in a new issue