feat: enhance YouTubeTranscripts component with Data output support (#6113)
* 📝 (youtube_transcripts.py): update description of YouTubeTranscriptsComponent to be more concise and accurate ✨ (youtube_transcripts.py): add new output option 'data_output' to provide transcript along with the source video URL 🔧 (youtube_transcripts.py): add method 'get_data_output' to handle the new 'data_output' output option and return a Data object with transcript, video URL, and error message * [autofix.ci] apply automated fixes * 📝 (youtube_transcripts.py): improve documentation for get_data_output method to provide a clear description of the returned data object and its contents 🐛 (youtube_transcripts.py): handle specific exceptions from the youtube_transcript_api library to provide more informative error messages and improve error handling in the get_data_output method * [autofix.ci] apply automated fixes * 🐛 (youtube_transcripts.py): handle case where no transcripts are found by updating the error message and returning a default data object 🔧 (youtube_transcripts.py): refactor get_data_output method to use a default data object and combine all transcript parts into a single continuous text * [autofix.ci] apply automated fixes * ✨ (test_youtube_transcript_component.py): Add unit tests for YouTubeTranscriptsComponent to test various functionalities such as component initialization, output generation, error handling, and setting translation languages. 
* [autofix.ci] apply automated fixes * ✅ (test_youtube_transcript_component.py): update file_names_mapping fixture to return a non-empty list to properly test different versions of file names mapping in the YouTube transcripts component * [autofix.ci] apply automated fixes * 📝 (test_youtube_transcript_component.py): Add docstrings and improve variable names for better readability and maintainability 🔧 (test_youtube_transcript_component.py): Refactor error handling in test methods to use descriptive error messages and improve code readability * [autofix.ci] apply automated fixes --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
parent
17f1ecf997
commit
d98d37778d
4 changed files with 200 additions and 2 deletions
|
|
@ -5,7 +5,7 @@ from langchain_community.document_loaders.youtube import TranscriptFormat
|
|||
|
||||
from langflow.custom import Component
|
||||
from langflow.inputs import DropdownInput, IntInput, MultilineInput
|
||||
from langflow.schema import DataFrame, Message
|
||||
from langflow.schema import Data, DataFrame, Message
|
||||
from langflow.template import Output
|
||||
|
||||
|
||||
|
|
@ -13,7 +13,7 @@ class YouTubeTranscriptsComponent(Component):
|
|||
"""A component that extracts spoken content from YouTube videos as transcripts."""
|
||||
|
||||
display_name: str = "YouTube Transcripts"
|
||||
description: str = "Extracts spoken content from YouTube videos with both DataFrame and text output options."
|
||||
description: str = "Extracts spoken content from YouTube videos with multiple output options."
|
||||
icon: str = "YouTube"
|
||||
name = "YouTubeTranscripts"
|
||||
|
||||
|
|
@ -43,6 +43,7 @@ class YouTubeTranscriptsComponent(Component):
|
|||
outputs = [
|
||||
Output(name="dataframe", display_name="Chunks", method="get_dataframe_output"),
|
||||
Output(name="message", display_name="Transcript", method="get_message_output"),
|
||||
Output(name="data_output", display_name="Transcript + Source", method="get_data_output"),
|
||||
]
|
||||
|
||||
def _load_transcripts(self, *, as_chunks: bool = True):
|
||||
|
|
@ -68,6 +69,7 @@ class YouTubeTranscriptsComponent(Component):
|
|||
start_seconds %= 60
|
||||
timestamp = f"{start_minutes:02d}:{start_seconds:02d}"
|
||||
data.append({"timestamp": timestamp, "text": doc.page_content})
|
||||
|
||||
return DataFrame(pd.DataFrame(data))
|
||||
|
||||
except (youtube_transcript_api.TranscriptsDisabled, youtube_transcript_api.NoTranscriptFound) as exc:
|
||||
|
|
@ -83,3 +85,32 @@ class YouTubeTranscriptsComponent(Component):
|
|||
except (youtube_transcript_api.TranscriptsDisabled, youtube_transcript_api.NoTranscriptFound) as exc:
|
||||
error_msg = f"Failed to get YouTube transcripts: {exc!s}"
|
||||
return Message(text=error_msg)
|
||||
|
||||
def get_data_output(self) -> Data:
    """Return the whole transcript together with its source URL as one Data object.

    On success the payload holds:
    - 'transcript': the page content of every loaded part, joined with single spaces
    - 'video_url': the component's ``url`` value

    When loading yields nothing, or one of the handled transcript API
    exceptions is raised, the payload instead holds an empty 'transcript',
    the 'video_url', and an 'error' string describing what went wrong.
    The 'error' key is only present in these failure payloads.
    """
    failure_payload = {"transcript": "", "video_url": self.url, "error": None}

    try:
        documents = self._load_transcripts(as_chunks=False)
        if documents:
            # Stitch the individual parts into one continuous text.
            pieces = [document.page_content for document in documents]
            return Data(data={"transcript": " ".join(pieces), "video_url": self.url})
        failure_payload["error"] = "No transcripts found."
    except (
        youtube_transcript_api.TranscriptsDisabled,
        youtube_transcript_api.NoTranscriptFound,
        youtube_transcript_api.CouldNotRetrieveTranscript,
    ) as exc:
        failure_payload["error"] = str(exc)

    # Both failure paths (empty result, handled exception) share this exit.
    return Data(data=failure_payload)
|
||||
|
|
|
|||
0
src/backend/tests/unit/components/bundles/__init__.py
Normal file
0
src/backend/tests/unit/components/bundles/__init__.py
Normal file
|
|
@ -0,0 +1,167 @@
|
|||
from unittest.mock import Mock, patch
|
||||
|
||||
import pytest
|
||||
from langflow.components.youtube.youtube_transcripts import YouTubeTranscriptsComponent
|
||||
from langflow.schema import Data, DataFrame, Message
|
||||
from youtube_transcript_api import NoTranscriptFound, TranscriptsDisabled
|
||||
|
||||
from tests.base import ComponentTestBaseWithoutClient
|
||||
|
||||
|
||||
class TestYouTubeTranscriptsComponent(ComponentTestBaseWithoutClient):
    """Unit tests for the DataFrame, Message and Data outputs of YouTubeTranscriptsComponent."""

    @staticmethod
    def _make_configured_component(component_class, attributes):
        """Instantiate the component and apply ``attributes`` to it."""
        instance = component_class()
        instance.set_attributes(attributes)
        return instance

    @pytest.fixture
    def component_class(self):
        """Provide the component class under test."""
        return YouTubeTranscriptsComponent

    @pytest.fixture
    def default_kwargs(self):
        """Provide the attribute values applied to the component in most tests."""
        return {
            "url": "https://www.youtube.com/watch?v=test123",
            "chunk_size_seconds": 60,
            "translation": "",
        }

    @pytest.fixture
    def file_names_mapping(self):
        """Provide the file names mapping for different versions."""
        return []

    @pytest.fixture
    def mock_transcript_data(self):
        """Provide two fake transcript documents starting at 0 and 60 seconds."""
        first_doc = Mock(page_content="First part of the transcript", metadata={"start_seconds": 0})
        second_doc = Mock(page_content="Second part of the transcript", metadata={"start_seconds": 60})
        return [first_doc, second_doc]

    def test_basic_setup(self, component_class, default_kwargs):
        """Attributes passed to set_attributes are readable on the component."""
        component = self._make_configured_component(component_class, default_kwargs)
        for attribute in ("url", "chunk_size_seconds", "translation"):
            assert getattr(component, attribute) == default_kwargs[attribute]

    @patch("langflow.components.youtube.youtube_transcripts.YoutubeLoader")
    def test_get_dataframe_output_success(self, mock_loader, component_class, default_kwargs, mock_transcript_data):
        """The DataFrame output has one timestamped row per transcript chunk."""
        mock_loader.from_youtube_url.return_value.load.return_value = mock_transcript_data

        frame = self._make_configured_component(component_class, default_kwargs).get_dataframe_output()

        assert isinstance(frame, DataFrame)
        assert len(frame) == 2
        assert list(frame.columns) == ["timestamp", "text"]
        first_row, second_row = frame.iloc[0], frame.iloc[1]
        assert first_row["timestamp"] == "00:00"
        assert second_row["timestamp"] == "01:00"
        assert first_row["text"] == "First part of the transcript"

    @patch("langflow.components.youtube.youtube_transcripts.YoutubeLoader")
    def test_get_message_output_success(self, mock_loader, component_class, default_kwargs, mock_transcript_data):
        """The Message output carries the text of the first transcript document."""
        mock_loader.from_youtube_url.return_value.load.return_value = mock_transcript_data

        message = self._make_configured_component(component_class, default_kwargs).get_message_output()

        assert isinstance(message, Message)
        assert message.text == "First part of the transcript"

    @patch("langflow.components.youtube.youtube_transcripts.YoutubeLoader")
    def test_get_data_output_success(self, mock_loader, component_class, default_kwargs, mock_transcript_data):
        """The Data output joins all parts and reports the source URL without an error key."""
        mock_loader.from_youtube_url.return_value.load.return_value = mock_transcript_data

        payload = self._make_configured_component(component_class, default_kwargs).get_data_output().data_holder = None
|
||||
Loading…
Add table
Add a link
Reference in a new issue