feat: Restructure Youtube Transcripts component (#5118)
* feat: Restructure Youtube Transcripts component
  - Changed the return type of the `build_youtube_transcripts` method from `Data` to `Message`.
  - Updated the output display name from "Data" to "Transcription" to reflect the new return type.
  - Modified the return statements in `build_youtube_transcripts` to return `Message` objects instead of `Data`.
  Co-authored-by: Vinícios Batista da Silva <vinicios.batsi@gmail.com>
* test(youtube-transcripts): enhance test stability by updating component ID and improving interaction checks
  - Modified the component ID in the test script to match the changes in the Youtube Transcripts component.
  Co-authored-by: Vinícios Batista da Silva <vinicios.batsi@gmail.com>
* fix: Fixed lint on lines 180-181
  Co-authored-by: Vinícios Batista da Silva <vinicios.batsi@gmail.com>
---------
Co-authored-by: Eric Hare <ericrhare@gmail.com>
This commit is contained in:
parent
32ab4452f9
commit
3f9dab912c
2 changed files with 30 additions and 12 deletions
|
|
@ -3,7 +3,7 @@ from langchain_community.document_loaders.youtube import TranscriptFormat
|
|||
|
||||
from langflow.custom import Component
|
||||
from langflow.inputs import DropdownInput, IntInput, MultilineInput
|
||||
from langflow.schema import Data
|
||||
from langflow.schema import Message
|
||||
from langflow.template import Output
|
||||
|
||||
|
||||
|
|
@ -191,24 +191,31 @@ class YouTubeTranscriptsComponent(Component):
|
|||
]
|
||||
|
||||
outputs = [
|
||||
Output(name="transcripts", display_name="Data", method="build_youtube_transcripts"),
|
||||
Output(name="transcripts", display_name="Transcription", method="build_youtube_transcripts"),
|
||||
]
|
||||
|
||||
def build_youtube_transcripts(self) -> Data | list[Data]:
|
||||
"""Method to build transcripts from the provided YouTube URL.
|
||||
def build_youtube_transcripts(self) -> Message:
|
||||
"""Method to extract transcripts from a YouTube video URL.
|
||||
|
||||
Returns:
|
||||
Data | list[Data]: The transcripts of the video, either as a single
|
||||
Data object or a list of Data objects.
|
||||
Message: The transcripts of the video as a text string. If 'transcript_format'
|
||||
is 'text', the transcripts are returned as a single continuous string. If
|
||||
'transcript_format' is 'chunks', the transcripts are returned as a string
|
||||
with timestamped segments.
|
||||
|
||||
Raises:
|
||||
Exception: Returns an error message if transcript retrieval fails.
|
||||
"""
|
||||
try:
|
||||
# Attempt to load transcripts in the specified language, fallback to any available language
|
||||
languages = [self.language] if self.language else None
|
||||
loader = YoutubeLoader.from_youtube_url(
|
||||
self.url,
|
||||
transcript_format=TranscriptFormat.TEXT
|
||||
if self.transcript_format == "text"
|
||||
else TranscriptFormat.CHUNKS,
|
||||
chunk_size_seconds=self.chunk_size_seconds,
|
||||
language=[self.language],
|
||||
language=languages,
|
||||
translation=self.translation or None,
|
||||
)
|
||||
|
||||
|
|
@ -216,10 +223,21 @@ class YouTubeTranscriptsComponent(Component):
|
|||
|
||||
if self.transcript_format == "text":
|
||||
# Extract only the page_content from the Document
|
||||
return Data(data={"transcripts": transcripts[0].page_content})
|
||||
# For chunks, extract page_content and metadata separately
|
||||
return [Data(data={"content": doc.page_content, "metadata": doc.metadata}) for doc in transcripts]
|
||||
result = transcripts[0].page_content
|
||||
return Message(text=result)
|
||||
|
||||
# For chunks, format the output with timestamps
|
||||
formatted_chunks = []
|
||||
for doc in transcripts:
|
||||
start_seconds = int(doc.metadata["start_seconds"])
|
||||
start_minutes = start_seconds // 60
|
||||
start_seconds %= 60
|
||||
timestamp = f"{start_minutes:02d}:{start_seconds:02d}"
|
||||
formatted_chunks.append(f"{timestamp} {doc.page_content}")
|
||||
result = "\n".join(formatted_chunks)
|
||||
return Message(text=result)
|
||||
|
||||
except Exception as exc: # noqa: BLE001
|
||||
# Using a specific error type for the return value
|
||||
return Data(data={"error": f"Failed to get YouTube transcripts: {exc!s}"})
|
||||
error_msg = f"Failed to get YouTube transcripts: {exc!s}"
|
||||
return Message(text=error_msg)
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ test(
|
|||
|
||||
await page.waitForSelector("text=built successfully", { timeout: 30000 });
|
||||
|
||||
await page.getByTestId("output-inspection-data").first().click();
|
||||
await page.getByTestId("output-inspection-transcription").first().click();
|
||||
|
||||
await page.waitForSelector("text=Component Output", { timeout: 30000 });
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue