fix: Split text should only have dataframe output (#8362)

* fix: Split text should only have dataframe output
* [autofix.ci] apply automated fixes
* Update templates
* test fix
* Update Vector Store RAG.json
* Update starter projects
* fix tests
* add shards

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Mike Fortman <michael.fortman@datastax.com>
Co-authored-by: cristhianzl <cristhian.lousa@gmail.com>
2025-06-10 12:21:15 -07:00 · 2025-06-10 12:21:15 -07:00 · e5e54ea606
commit e5e54ea606
parent 79c37c8f50
6 changed files with 268 additions and 291 deletions
--- a/.github/workflows/typescript_test.yml
+++ b/.github/workflows/typescript_test.yml
@@ -220,12 +220,12 @@ jobs:

          echo "Total tests to run: $TEST_COUNT"

-          # Calculate optimal shard count - 1 shard per 5 tests, min 1, max 10
+          # Calculate optimal shard count - 1 shard per 5 tests, min 1, max 40
          SHARD_COUNT=$(( (TEST_COUNT + 4) / 5 ))
          if [ $SHARD_COUNT -lt 1 ]; then
            SHARD_COUNT=1
-          elif [ $SHARD_COUNT -gt 10 ]; then
-            SHARD_COUNT=10
+          elif [ $SHARD_COUNT -gt 40 ]; then
+            SHARD_COUNT=40
          fi

          # Create the matrix combinations string
--- a/src/backend/base/langflow/components/processing/split_text.py
+++ b/src/backend/base/langflow/components/processing/split_text.py
@@ -64,8 +64,7 @@ class SplitTextComponent(Component):
    ]

    outputs = [
-        Output(display_name="Chunks", name="chunks", method="split_text"),
-        Output(display_name="DataFrame", name="dataframe", method="as_dataframe"),
+        Output(display_name="Chunks", name="dataframe", method="split_text"),
    ]

    def _docs_to_data(self, docs) -> list[Data]:
@@ -133,8 +132,5 @@ class SplitTextComponent(Component):
            msg = f"Error splitting text: {e}"
            raise TypeError(msg) from e

-    def split_text(self) -> list[Data]:
-        return self._docs_to_data(self.split_text_base())
-
-    def as_dataframe(self) -> DataFrame:
-        return DataFrame(self.split_text())
+    def split_text(self) -> DataFrame:
+        return DataFrame(self._docs_to_data(self.split_text_base()))
--- a/src/backend/base/langflow/initial_setup/starter_projects/Vector Store RAG.json
+++ b/src/backend/base/langflow/initial_setup/starter_projects/Vector Store RAG.json
--- a/src/backend/tests/unit/components/processing/test_split_text_component.py
+++ b/src/backend/tests/unit/components/processing/test_split_text_component.py
@@ -38,7 +38,7 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
    def test_split_text_basic(self):
        """Test basic text splitting functionality."""
        component = SplitTextComponent()
-        test_text = "This is a test.\nIt has multiple lines.\nEach line should be a chunk."
+        test_text = "First chunk\nSecond chunk\nThird chunk"
        component.set_attributes(
            {
                "data_inputs": [Data(text=test_text)],
@@ -52,12 +52,18 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
            }
        )

-        results = component.split_text()
-        assert len(results) == 3, f"Expected 3 chunks, got {len(results)}"
-        assert "This is a test" in results[0].text, f"Expected 'This is a test', got '{results[0].text}'"
-        assert "It has multiple lines" in results[1].text, f"Expected 'It has multiple lines', got '{results[1].text}'"
-        assert "Each line should be a chunk" in results[2].text, (
-            f"Expected 'Each line should be a chunk', got '{results[2].text}'"
+        data_frame = component.split_text()
+        assert isinstance(data_frame, DataFrame), "Expected DataFrame instance"
+        assert len(data_frame) == 3, f"Expected DataFrame with 3 rows, got {len(data_frame)}"
+        assert list(data_frame.columns) == ["text"], f"Expected columns ['text'], got {list(data_frame.columns)}"
+        assert "First chunk" in data_frame.iloc[0]["text"], (
+            f"Expected 'First chunk', got '{data_frame.iloc[0]['text']}'"
+        )
+        assert "Second chunk" in data_frame.iloc[1]["text"], (
+            f"Expected 'Second chunk', got '{data_frame.iloc[1]['text']}'"
+        )
+        assert "Third chunk" in data_frame.iloc[2]["text"], (
+            f"Expected 'Third chunk', got '{data_frame.iloc[2]['text']}'"
        )

    def test_split_text_with_overlap(self):
@@ -76,79 +82,7 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
            }
        )

-        results = component.split_text()
-        assert len(results) > 1, f"Expected more than 1 chunk, got {len(results)}"
-        # Check that chunks contain the expected text
-        assert "First chunk" in results[0].text, f"Expected 'First chunk' in '{results[0].text}'"
-        assert "Second chunk" in results[1].text, f"Expected 'Second chunk' in '{results[1].text}'"
-        assert "Third chunk" in results[2].text, f"Expected 'Third chunk' in '{results[2].text}'"
-
-    def test_split_text_custom_separator(self):
-        """Test text splitting with a custom separator."""
-        component = SplitTextComponent()
-        test_text = "First part|Second part|Third part"
-        component.set_attributes(
-            {
-                "data_inputs": [Data(text=test_text)],
-                "chunk_overlap": 0,
-                "chunk_size": 10,
-                "separator": "|",
-                "session_id": "test_session",
-                "sender": "test_sender",
-                "sender_name": "test_sender_name",
-            }
-        )
-
-        results = component.split_text()
-        assert len(results) == 3, f"Expected 3 chunks, got {len(results)}"
-        assert "First part" in results[0].text, f"Expected 'First part', got '{results[0].text}'"
-        assert "Second part" in results[1].text, f"Expected 'Second part', got '{results[1].text}'"
-        assert "Third part" in results[2].text, f"Expected 'Third part', got '{results[2].text}'"
-
-    def test_split_text_with_metadata(self):
-        """Test text splitting while preserving metadata."""
-        component = SplitTextComponent()
-        test_metadata = {"source": "test.txt", "author": "test"}
-        test_text = "Chunk 1\nChunk 2"
-        component.set_attributes(
-            {
-                "data_inputs": [Data(text=test_text, data=test_metadata)],
-                "chunk_overlap": 0,
-                "chunk_size": 7,
-                "separator": "\n",
-                "session_id": "test_session",
-                "sender": "test_sender",
-                "sender_name": "test_sender_name",
-            }
-        )
-
-        results = component.split_text()
-        assert len(results) == 2, f"Expected 2 chunks, got {len(results)}"
-        for result in results:
-            assert result.data["source"] == test_metadata["source"], (
-                f"Expected source '{test_metadata['source']}', got '{result.data.get('source')}'"
-            )
-            assert result.data["author"] == test_metadata["author"], (
-                f"Expected author '{test_metadata['author']}', got '{result.data.get('author')}'"
-            )
-
-    def test_split_text_as_dataframe(self):
-        """Test converting split text results to DataFrame."""
-        component = SplitTextComponent()
-        test_text = "First chunk\nSecond chunk\nThird chunk"
-        component.set_attributes(
-            {
-                "data_inputs": [Data(text=test_text)],
-                "chunk_overlap": 0,
-                "chunk_size": 11,
-                "separator": "\n",
-                "session_id": "test_session",
-                "sender": "test_sender",
-                "sender_name": "test_sender_name",
-            }
-        )
-
-        data_frame = component.as_dataframe()
+        data_frame = component.split_text()
        assert isinstance(data_frame, DataFrame), "Expected DataFrame instance"
        assert len(data_frame) == 3, f"Expected DataFrame with 3 rows, got {len(data_frame)}"
        assert list(data_frame.columns) == ["text"], f"Expected columns ['text'], got {list(data_frame.columns)}"
@@ -162,6 +96,71 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
            f"Expected 'Third chunk', got '{data_frame.iloc[2]['text']}'"
        )

+    def test_split_text_custom_separator(self):
+        """Test text splitting with a custom separator."""
+        component = SplitTextComponent()
+        test_text = "First chunk.|Second chunk.|Third chunk."
+        component.set_attributes(
+            {
+                "data_inputs": [Data(text=test_text)],
+                "chunk_overlap": 0,
+                "chunk_size": 10,
+                "separator": "|",
+                "session_id": "test_session",
+                "sender": "test_sender",
+                "sender_name": "test_sender_name",
+            }
+        )
+
+        data_frame = component.split_text()
+        assert isinstance(data_frame, DataFrame), "Expected DataFrame instance"
+        assert len(data_frame) == 3, f"Expected DataFrame with 3 rows, got {len(data_frame)}"
+        assert list(data_frame.columns) == ["text"], f"Expected columns ['text'], got {list(data_frame.columns)}"
+        assert "First chunk" in data_frame.iloc[0]["text"], (
+            f"Expected 'First chunk', got '{data_frame.iloc[0]['text']}'"
+        )
+        assert "Second chunk" in data_frame.iloc[1]["text"], (
+            f"Expected 'Second chunk', got '{data_frame.iloc[1]['text']}'"
+        )
+        assert "Third chunk" in data_frame.iloc[2]["text"], (
+            f"Expected 'Third chunk', got '{data_frame.iloc[2]['text']}'"
+        )
+
+    def test_split_text_with_metadata(self):
+        """Test text splitting while preserving metadata."""
+        component = SplitTextComponent()
+        test_metadata = {"source": "test.txt", "author": "test"}
+        test_text = "First chunk\nSecond chunk"
+        component.set_attributes(
+            {
+                "data_inputs": [Data(text=test_text, data=test_metadata)],
+                "chunk_overlap": 0,
+                "chunk_size": 7,
+                "separator": "\n",
+                "session_id": "test_session",
+                "sender": "test_sender",
+                "sender_name": "test_sender_name",
+            }
+        )
+
+        data_frame = component.split_text()
+        assert isinstance(data_frame, DataFrame), "Expected DataFrame instance"
+        assert len(data_frame) == 2, f"Expected DataFrame with 2 rows, got {len(data_frame)}"
+        assert "First chunk" in data_frame.iloc[0]["text"], (
+            f"Expected 'First chunk', got '{data_frame.iloc[0]['text']}'"
+        )
+        assert "Second chunk" in data_frame.iloc[1]["text"], (
+            f"Expected 'Second chunk', got '{data_frame.iloc[1]['text']}'"
+        )
+        # Loop over each row to check metadata
+        for _, row in data_frame.iterrows():
+            assert row["source"] == test_metadata["source"], (
+                f"Expected source '{test_metadata['source']}', got '{row['source']}'"
+            )
+            assert row["author"] == test_metadata["author"], (
+                f"Expected author '{test_metadata['author']}', got '{row['author']}'"
+            )
+
    def test_split_text_empty_input(self):
        """Test handling of empty input text."""
        component = SplitTextComponent()
@@ -198,7 +197,7 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):

        results = component.split_text()
        assert len(results) == 1, f"Expected 1 chunk, got {len(results)}"
-        assert results[0].text == test_text, f"Expected '{test_text}', got '{results[0].text}'"
+        assert results["text"][0] == test_text, f"Expected '{test_text}', got '{results['text'][0]}'"

    def test_split_text_multiple_inputs(self):
        """Test splitting multiple input texts."""
@@ -218,10 +217,10 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):

        results = component.split_text()
        assert len(results) == 4, f"Expected 4 chunks (2 from each text), got {len(results)}"
-        assert "First text" in results[0].text, f"Expected 'First text', got '{results[0].text}'"
-        assert "Second line" in results[1].text, f"Expected 'Second line', got '{results[1].text}'"
-        assert "Another text" in results[2].text, f"Expected 'Another text', got '{results[2].text}'"
-        assert "Another line" in results[3].text, f"Expected 'Another line', got '{results[3].text}'"
+        assert "First text" in results["text"][0], f"Expected 'First text', got '{results['text'][0]}'"
+        assert "Second line" in results["text"][1], f"Expected 'Second line', got '{results['text'][1]}'"
+        assert "Another text" in results["text"][2], f"Expected 'Another text', got '{results['text'][2]}'"
+        assert "Another line" in results["text"][3], f"Expected 'Another line', got '{results['text'][3]}'"

    def test_split_text_with_dataframe_input(self):
        """Test splitting text with DataFrame input."""
@@ -242,10 +241,10 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):

        results = component.split_text()
        assert len(results) == 4, f"Expected 4 chunks (2 from each text), got {len(results)}"
-        assert "First text" in results[0].text, f"Expected 'First text', got '{results[0].text}'"
-        assert "Second line" in results[1].text, f"Expected 'Second line', got '{results[1].text}'"
-        assert "Another text" in results[2].text, f"Expected 'Another text', got '{results[2].text}'"
-        assert "Another line" in results[3].text, f"Expected 'Another line', got '{results[3].text}'"
+        assert "First text" in results["text"][0], f"Expected 'First text', got '{results['text'][0]}'"
+        assert "Second line" in results["text"][1], f"Expected 'Second line', got '{results['text'][1]}'"
+        assert "Another text" in results["text"][2], f"Expected 'Another text', got '{results['text'][2]}'"
+        assert "Another line" in results["text"][3], f"Expected 'Another line', got '{results['text'][3]}'"

    def test_with_url_loader(self):
        """Test splitting text with URL loader."""
@@ -267,5 +266,5 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
        )

        results = component.split_text()
-        assert isinstance(results, list), "Expected list instance"
+        assert isinstance(results, DataFrame), "Expected DataFrame instance"
        assert len(results) > 2, f"Expected DataFrame with more than 2 rows, got {len(results)}"
--- a/src/frontend/tests/core/features/filterSidebar.spec.ts
+++ b/src/frontend/tests/core/features/filterSidebar.spec.ts
@@ -130,7 +130,7 @@ test(

    await expect(page.getByTestId("logicSub Flow [Deprecated]")).toBeVisible();

-    await expect(page.getByTestId("processingSplit Text")).toBeVisible();
+    await expect(page.getByTestId("processingData Operations")).toBeVisible();

    await page.getByTestId("icon-X").first().click();

--- a/src/frontend/tests/core/features/freeze.spec.ts
+++ b/src/frontend/tests/core/features/freeze.spec.ts
@@ -58,13 +58,13 @@ test(
    //fourth component

    await page.getByTestId("sidebar-search-input").click();
-    await page.getByTestId("sidebar-search-input").fill("data to message");
-    await page.waitForSelector('[data-testid="processingData to Message"]', {
+    await page.getByTestId("sidebar-search-input").fill("Parser");
+    await page.waitForSelector('[data-testid="processingParser"]', {
      timeout: 1000,
    });

    await page
-      .getByTestId("processingData to Message")
+      .getByTestId("processingParser")
      .dragTo(page.locator('//*[@id="react-flow-id"]'), {
        targetPosition: { x: 50, y: 300 },
      });
@@ -133,11 +133,13 @@ test(
      .getByTestId("handle-splittext-shownode-chunks-right")
      .nth(0)
      .click();
-    await page.getByTestId("handle-parsedata-shownode-data-left").click();
+    await page
+      .getByTestId("handle-parsercomponent-shownode-data or dataframe-left")
+      .click();

    //connection 4
    await page
-      .getByTestId("handle-parsedata-shownode-message-right")
+      .getByTestId("handle-parsercomponent-shownode-parsed text-right")
      .nth(0)
      .click();
    await page.getByTestId("handle-chatoutput-shownode-inputs-left").click();