From d98d37778db5b022df045b305b55c7c21147d103 Mon Sep 17 00:00:00 2001
From: Cristhian Zanforlin Lousa
Date: Wed, 12 Feb 2025 15:25:22 -0300
Subject: [PATCH] feat: enhance YouTubeTranscripts component with Data output support (#6113)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 📝 (youtube_transcripts.py): update description of YouTubeTranscriptsComponent to be more concise and accurate
✨ (youtube_transcripts.py): add new output option 'data_output' to provide transcript along with the source video URL
🔧 (youtube_transcripts.py): add method 'get_data_output' to handle the new 'data_output' output option and return a Data object with transcript, video URL, and error message

* [autofix.ci] apply automated fixes

* 📝 (youtube_transcripts.py): improve documentation for get_data_output method to provide a clear description of the returned data object and its contents
🐛 (youtube_transcripts.py): handle specific exceptions from the youtube_transcript_api library to provide more informative error messages and improve error handling in the get_data_output method

* [autofix.ci] apply automated fixes

* 🐛 (youtube_transcripts.py): handle case where no transcripts are found by updating the error message and returning a default data object
🔧 (youtube_transcripts.py): refactor get_data_output method to use a default data object and combine all transcript parts into a single continuous text

* [autofix.ci] apply automated fixes

* ✨ (test_youtube_transcript_component.py): Add unit tests for YouTubeTranscriptsComponent to test various functionalities such as component initialization, output generation, error handling, and setting translation languages.
* [autofix.ci] apply automated fixes * ✅ (test_youtube_transcript_component.py): update file_names_mapping fixture to return a non-empty list to properly test different versions of file names mapping in the YouTube transcripts component * [autofix.ci] apply automated fixes * 📝 (test_youtube_transcript_component.py): Add docstrings and improve variable names for better readability and maintainability 🔧 (test_youtube_transcript_component.py): Refactor error handling in test methods to use descriptive error messages and improve code readability * [autofix.ci] apply automated fixes --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> --- .../components/youtube/youtube_transcripts.py | 35 +++- .../tests/unit/components/bundles/__init__.py | 0 .../components/bundles/youtube/__init__.py | 0 .../test_youtube_transcript_component.py | 167 ++++++++++++++++++ 4 files changed, 200 insertions(+), 2 deletions(-) create mode 100644 src/backend/tests/unit/components/bundles/__init__.py create mode 100644 src/backend/tests/unit/components/bundles/youtube/__init__.py create mode 100644 src/backend/tests/unit/components/bundles/youtube/test_youtube_transcript_component.py diff --git a/src/backend/base/langflow/components/youtube/youtube_transcripts.py b/src/backend/base/langflow/components/youtube/youtube_transcripts.py index 73eeb012a..28ed2ed76 100644 --- a/src/backend/base/langflow/components/youtube/youtube_transcripts.py +++ b/src/backend/base/langflow/components/youtube/youtube_transcripts.py @@ -5,7 +5,7 @@ from langchain_community.document_loaders.youtube import TranscriptFormat from langflow.custom import Component from langflow.inputs import DropdownInput, IntInput, MultilineInput -from langflow.schema import DataFrame, Message +from langflow.schema import Data, DataFrame, Message from langflow.template import Output @@ -13,7 +13,7 @@ class YouTubeTranscriptsComponent(Component): """A component that extracts spoken content from 
YouTube videos as transcripts.""" display_name: str = "YouTube Transcripts" - description: str = "Extracts spoken content from YouTube videos with both DataFrame and text output options." + description: str = "Extracts spoken content from YouTube videos with multiple output options." icon: str = "YouTube" name = "YouTubeTranscripts" @@ -43,6 +43,7 @@ class YouTubeTranscriptsComponent(Component): outputs = [ Output(name="dataframe", display_name="Chunks", method="get_dataframe_output"), Output(name="message", display_name="Transcript", method="get_message_output"), + Output(name="data_output", display_name="Transcript + Source", method="get_data_output"), ] def _load_transcripts(self, *, as_chunks: bool = True): @@ -68,6 +69,7 @@ class YouTubeTranscriptsComponent(Component): start_seconds %= 60 timestamp = f"{start_minutes:02d}:{start_seconds:02d}" data.append({"timestamp": timestamp, "text": doc.page_content}) + return DataFrame(pd.DataFrame(data)) except (youtube_transcript_api.TranscriptsDisabled, youtube_transcript_api.NoTranscriptFound) as exc: @@ -83,3 +85,32 @@ class YouTubeTranscriptsComponent(Component): except (youtube_transcript_api.TranscriptsDisabled, youtube_transcript_api.NoTranscriptFound) as exc: error_msg = f"Failed to get YouTube transcripts: {exc!s}" return Message(text=error_msg) + + def get_data_output(self) -> Data: + """Creates a structured data object with transcript and metadata. + + Returns a Data object containing transcript text, video URL, and any error + messages that occurred during processing. The object includes: + - 'transcript': continuous text from the entire video (concatenated if multiple parts) + - 'video_url': the input YouTube URL + - 'error': error message if an exception occurs + """ + default_data = {"transcript": "", "video_url": self.url, "error": None} + + try: + transcripts = self._load_transcripts(as_chunks=False) + if not transcripts: + default_data["error"] = "No transcripts found." 
+ return Data(data=default_data) + + # Combine all transcript parts + full_transcript = " ".join(doc.page_content for doc in transcripts) + return Data(data={"transcript": full_transcript, "video_url": self.url}) + + except ( + youtube_transcript_api.TranscriptsDisabled, + youtube_transcript_api.NoTranscriptFound, + youtube_transcript_api.CouldNotRetrieveTranscript, + ) as exc: + default_data["error"] = str(exc) + return Data(data=default_data) diff --git a/src/backend/tests/unit/components/bundles/__init__.py b/src/backend/tests/unit/components/bundles/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/backend/tests/unit/components/bundles/youtube/__init__.py b/src/backend/tests/unit/components/bundles/youtube/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/backend/tests/unit/components/bundles/youtube/test_youtube_transcript_component.py b/src/backend/tests/unit/components/bundles/youtube/test_youtube_transcript_component.py new file mode 100644 index 000000000..fae298591 --- /dev/null +++ b/src/backend/tests/unit/components/bundles/youtube/test_youtube_transcript_component.py @@ -0,0 +1,167 @@ +from unittest.mock import Mock, patch + +import pytest +from langflow.components.youtube.youtube_transcripts import YouTubeTranscriptsComponent +from langflow.schema import Data, DataFrame, Message +from youtube_transcript_api import NoTranscriptFound, TranscriptsDisabled + +from tests.base import ComponentTestBaseWithoutClient + + +class TestYouTubeTranscriptsComponent(ComponentTestBaseWithoutClient): + @pytest.fixture + def component_class(self): + """Return the component class to test.""" + return YouTubeTranscriptsComponent + + @pytest.fixture + def default_kwargs(self): + """Return the default kwargs for the component.""" + return { + "url": "https://www.youtube.com/watch?v=test123", + "chunk_size_seconds": 60, + "translation": "", + } + + @pytest.fixture + def file_names_mapping(self): + """Return the file names 
mapping for different versions.""" + return [] + + @pytest.fixture + def mock_transcript_data(self): + """Return mock transcript data for testing.""" + return [ + Mock(page_content="First part of the transcript", metadata={"start_seconds": 0}), + Mock(page_content="Second part of the transcript", metadata={"start_seconds": 60}), + ] + + def test_basic_setup(self, component_class, default_kwargs): + """Test basic component initialization.""" + component = component_class() + component.set_attributes(default_kwargs) + assert component.url == default_kwargs["url"] + assert component.chunk_size_seconds == default_kwargs["chunk_size_seconds"] + assert component.translation == default_kwargs["translation"] + + @patch("langflow.components.youtube.youtube_transcripts.YoutubeLoader") + def test_get_dataframe_output_success(self, mock_loader, component_class, default_kwargs, mock_transcript_data): + """Test successful DataFrame output generation.""" + mock_loader.from_youtube_url.return_value.load.return_value = mock_transcript_data + + component = component_class() + component.set_attributes(default_kwargs) + result = component.get_dataframe_output() + + assert isinstance(result, DataFrame) + result_df = result # More descriptive variable name + assert len(result_df) == 2 + assert list(result_df.columns) == ["timestamp", "text"] + assert result_df.iloc[0]["timestamp"] == "00:00" + assert result_df.iloc[1]["timestamp"] == "01:00" + assert result_df.iloc[0]["text"] == "First part of the transcript" + + @patch("langflow.components.youtube.youtube_transcripts.YoutubeLoader") + def test_get_message_output_success(self, mock_loader, component_class, default_kwargs, mock_transcript_data): + """Test successful Message output generation.""" + mock_loader.from_youtube_url.return_value.load.return_value = mock_transcript_data + + component = component_class() + component.set_attributes(default_kwargs) + result = component.get_message_output() + + assert isinstance(result, Message) + 
assert result.text == "First part of the transcript" + + @patch("langflow.components.youtube.youtube_transcripts.YoutubeLoader") + def test_get_data_output_success(self, mock_loader, component_class, default_kwargs, mock_transcript_data): + """Test successful Data output generation.""" + mock_loader.from_youtube_url.return_value.load.return_value = mock_transcript_data + + component = component_class() + component.set_attributes(default_kwargs) + result = component.get_data_output() + + assert isinstance(result, Data) + assert result.data["video_url"] == default_kwargs["url"] + assert result.data["transcript"] == "First part of the transcript Second part of the transcript" + assert "error" not in result.data + + @patch("langflow.components.youtube.youtube_transcripts.YoutubeLoader") + def test_transcript_disabled_error(self, mock_loader, component_class, default_kwargs): + """Test handling of TranscriptsDisabled error.""" + error_message = "Transcripts are disabled for this video" + + # Mock the load method to raise TranscriptsDisabled + def raise_error(*_): # Use underscore to indicate unused arguments + raise TranscriptsDisabled(error_message) + + mock_loader.from_youtube_url.return_value.load.side_effect = raise_error + + component = component_class() + component.set_attributes(default_kwargs) + + # Test DataFrame output + df_result = component.get_dataframe_output() + assert isinstance(df_result, DataFrame) + assert len(df_result) == 1 # One row for error message + assert "error" in df_result.columns + assert "Failed to get YouTube transcripts" in df_result["error"][0] + + # Test Message output + msg_result = component.get_message_output() + assert isinstance(msg_result, Message) + assert "Failed to get YouTube transcripts" in msg_result.text + + # Test Data output + data_result = component.get_data_output() + assert isinstance(data_result, Data) + assert "error" in data_result.data + assert data_result.data["transcript"] == "" + + 
@patch("langflow.components.youtube.youtube_transcripts.YoutubeLoader") + def test_no_transcript_found_error(self, mock_loader, component_class, default_kwargs): + """Test handling of NoTranscriptFound error.""" + video_id = "test123" + requested_langs = ["en"] + transcript_data = {"en": {"translationLanguages": []}} + + # Mock the load method to raise NoTranscriptFound + def raise_error(*_): # Use underscore to indicate unused arguments + raise NoTranscriptFound(video_id, requested_langs, transcript_data) + + mock_loader.from_youtube_url.return_value.load.side_effect = raise_error + + component = component_class() + component.set_attributes(default_kwargs) + + data_result = component.get_data_output() + assert isinstance(data_result, Data) + assert "error" in data_result.data + assert data_result.data["transcript"] == "" + + def test_translation_setting(self, component_class): + """Test setting different translation languages.""" + component = component_class() + test_cases = ["en", "es", "fr", ""] + + for lang in test_cases: + component.set_attributes({"url": "https://youtube.com/watch?v=test", "translation": lang}) + assert component.translation == lang + + @patch("langflow.components.youtube.youtube_transcripts.YoutubeLoader") + def test_empty_transcript_handling(self, mock_loader, component_class, default_kwargs): + """Test handling of empty transcript response.""" + mock_loader.from_youtube_url.return_value.load.return_value = [] + + component = component_class() + component.set_attributes(default_kwargs) + + # Test Data output with empty transcript + data_result = component.get_data_output() + assert data_result.data["error"] == "No transcripts found." + assert data_result.data["transcript"] == "" + + # Test DataFrame output with empty transcript + df_result = component.get_dataframe_output() + assert len(df_result) == 0