fix: Split text should only have dataframe output (#8362)

* fix: Split text should only have dataframe output

* [autofix.ci] apply automated fixes

* Update templates

* test fix

* Update Vector Store RAG.json

* Update starter projects

* fix tests

* add shards (raise the CI test shard cap from 10 to 40)

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Mike Fortman <michael.fortman@datastax.com>
Co-authored-by: cristhianzl <cristhian.lousa@gmail.com>
This commit is contained in:
Eric Hare 2025-06-10 12:21:15 -07:00 committed by GitHub
commit e5e54ea606
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 268 additions and 291 deletions

View file

@ -220,12 +220,12 @@ jobs:
echo "Total tests to run: $TEST_COUNT"
# Calculate optimal shard count - 1 shard per 5 tests, min 1, max 10
# Calculate optimal shard count - 1 shard per 5 tests, min 1, max 40
SHARD_COUNT=$(( (TEST_COUNT + 4) / 5 ))
if [ $SHARD_COUNT -lt 1 ]; then
SHARD_COUNT=1
elif [ $SHARD_COUNT -gt 10 ]; then
SHARD_COUNT=10
elif [ $SHARD_COUNT -gt 40 ]; then
SHARD_COUNT=40
fi
# Create the matrix combinations string

View file

@ -64,8 +64,7 @@ class SplitTextComponent(Component):
]
outputs = [
Output(display_name="Chunks", name="chunks", method="split_text"),
Output(display_name="DataFrame", name="dataframe", method="as_dataframe"),
Output(display_name="Chunks", name="dataframe", method="split_text"),
]
def _docs_to_data(self, docs) -> list[Data]:
@ -133,8 +132,5 @@ class SplitTextComponent(Component):
msg = f"Error splitting text: {e}"
raise TypeError(msg) from e
def split_text(self) -> list[Data]:
return self._docs_to_data(self.split_text_base())
def as_dataframe(self) -> DataFrame:
return DataFrame(self.split_text())
def split_text(self) -> DataFrame:
return DataFrame(self._docs_to_data(self.split_text_base()))

File diff suppressed because one or more lines are too long

View file

@ -38,7 +38,7 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
def test_split_text_basic(self):
"""Test basic text splitting functionality."""
component = SplitTextComponent()
test_text = "This is a test.\nIt has multiple lines.\nEach line should be a chunk."
test_text = "First chunk\nSecond chunk\nThird chunk"
component.set_attributes(
{
"data_inputs": [Data(text=test_text)],
@ -52,12 +52,18 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
}
)
results = component.split_text()
assert len(results) == 3, f"Expected 3 chunks, got {len(results)}"
assert "This is a test" in results[0].text, f"Expected 'This is a test', got '{results[0].text}'"
assert "It has multiple lines" in results[1].text, f"Expected 'It has multiple lines', got '{results[1].text}'"
assert "Each line should be a chunk" in results[2].text, (
f"Expected 'Each line should be a chunk', got '{results[2].text}'"
data_frame = component.split_text()
assert isinstance(data_frame, DataFrame), "Expected DataFrame instance"
assert len(data_frame) == 3, f"Expected DataFrame with 3 rows, got {len(data_frame)}"
assert list(data_frame.columns) == ["text"], f"Expected columns ['text'], got {list(data_frame.columns)}"
assert "First chunk" in data_frame.iloc[0]["text"], (
f"Expected 'First chunk', got '{data_frame.iloc[0]['text']}'"
)
assert "Second chunk" in data_frame.iloc[1]["text"], (
f"Expected 'Second chunk', got '{data_frame.iloc[1]['text']}'"
)
assert "Third chunk" in data_frame.iloc[2]["text"], (
f"Expected 'Third chunk', got '{data_frame.iloc[2]['text']}'"
)
def test_split_text_with_overlap(self):
@ -76,79 +82,7 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
}
)
results = component.split_text()
assert len(results) > 1, f"Expected more than 1 chunk, got {len(results)}"
# Check that chunks contain the expected text
assert "First chunk" in results[0].text, f"Expected 'First chunk' in '{results[0].text}'"
assert "Second chunk" in results[1].text, f"Expected 'Second chunk' in '{results[1].text}'"
assert "Third chunk" in results[2].text, f"Expected 'Third chunk' in '{results[2].text}'"
def test_split_text_custom_separator(self):
"""Test text splitting with a custom separator."""
component = SplitTextComponent()
test_text = "First part|Second part|Third part"
component.set_attributes(
{
"data_inputs": [Data(text=test_text)],
"chunk_overlap": 0,
"chunk_size": 10,
"separator": "|",
"session_id": "test_session",
"sender": "test_sender",
"sender_name": "test_sender_name",
}
)
results = component.split_text()
assert len(results) == 3, f"Expected 3 chunks, got {len(results)}"
assert "First part" in results[0].text, f"Expected 'First part', got '{results[0].text}'"
assert "Second part" in results[1].text, f"Expected 'Second part', got '{results[1].text}'"
assert "Third part" in results[2].text, f"Expected 'Third part', got '{results[2].text}'"
def test_split_text_with_metadata(self):
"""Test text splitting while preserving metadata."""
component = SplitTextComponent()
test_metadata = {"source": "test.txt", "author": "test"}
test_text = "Chunk 1\nChunk 2"
component.set_attributes(
{
"data_inputs": [Data(text=test_text, data=test_metadata)],
"chunk_overlap": 0,
"chunk_size": 7,
"separator": "\n",
"session_id": "test_session",
"sender": "test_sender",
"sender_name": "test_sender_name",
}
)
results = component.split_text()
assert len(results) == 2, f"Expected 2 chunks, got {len(results)}"
for result in results:
assert result.data["source"] == test_metadata["source"], (
f"Expected source '{test_metadata['source']}', got '{result.data.get('source')}'"
)
assert result.data["author"] == test_metadata["author"], (
f"Expected author '{test_metadata['author']}', got '{result.data.get('author')}'"
)
def test_split_text_as_dataframe(self):
"""Test converting split text results to DataFrame."""
component = SplitTextComponent()
test_text = "First chunk\nSecond chunk\nThird chunk"
component.set_attributes(
{
"data_inputs": [Data(text=test_text)],
"chunk_overlap": 0,
"chunk_size": 11,
"separator": "\n",
"session_id": "test_session",
"sender": "test_sender",
"sender_name": "test_sender_name",
}
)
data_frame = component.as_dataframe()
data_frame = component.split_text()
assert isinstance(data_frame, DataFrame), "Expected DataFrame instance"
assert len(data_frame) == 3, f"Expected DataFrame with 3 rows, got {len(data_frame)}"
assert list(data_frame.columns) == ["text"], f"Expected columns ['text'], got {list(data_frame.columns)}"
@ -162,6 +96,71 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
f"Expected 'Third chunk', got '{data_frame.iloc[2]['text']}'"
)
def test_split_text_custom_separator(self):
"""Test text splitting with a custom separator."""
component = SplitTextComponent()
test_text = "First chunk.|Second chunk.|Third chunk."
component.set_attributes(
{
"data_inputs": [Data(text=test_text)],
"chunk_overlap": 0,
"chunk_size": 10,
"separator": "|",
"session_id": "test_session",
"sender": "test_sender",
"sender_name": "test_sender_name",
}
)
data_frame = component.split_text()
assert isinstance(data_frame, DataFrame), "Expected DataFrame instance"
assert len(data_frame) == 3, f"Expected DataFrame with 3 rows, got {len(data_frame)}"
assert list(data_frame.columns) == ["text"], f"Expected columns ['text'], got {list(data_frame.columns)}"
assert "First chunk" in data_frame.iloc[0]["text"], (
f"Expected 'First chunk', got '{data_frame.iloc[0]['text']}'"
)
assert "Second chunk" in data_frame.iloc[1]["text"], (
f"Expected 'Second chunk', got '{data_frame.iloc[1]['text']}'"
)
assert "Third chunk" in data_frame.iloc[2]["text"], (
f"Expected 'Third chunk', got '{data_frame.iloc[2]['text']}'"
)
def test_split_text_with_metadata(self):
"""Test text splitting while preserving metadata."""
component = SplitTextComponent()
test_metadata = {"source": "test.txt", "author": "test"}
test_text = "First chunk\nSecond chunk"
component.set_attributes(
{
"data_inputs": [Data(text=test_text, data=test_metadata)],
"chunk_overlap": 0,
"chunk_size": 7,
"separator": "\n",
"session_id": "test_session",
"sender": "test_sender",
"sender_name": "test_sender_name",
}
)
data_frame = component.split_text()
assert isinstance(data_frame, DataFrame), "Expected DataFrame instance"
assert len(data_frame) == 2, f"Expected DataFrame with 2 rows, got {len(data_frame)}"
assert "First chunk" in data_frame.iloc[0]["text"], (
f"Expected 'First chunk', got '{data_frame.iloc[0]['text']}'"
)
assert "Second chunk" in data_frame.iloc[1]["text"], (
f"Expected 'Second chunk', got '{data_frame.iloc[1]['text']}'"
)
# Loop over each row to check metadata
for _, row in data_frame.iterrows():
assert row["source"] == test_metadata["source"], (
f"Expected source '{test_metadata['source']}', got '{row['source']}'"
)
assert row["author"] == test_metadata["author"], (
f"Expected author '{test_metadata['author']}', got '{row['author']}'"
)
def test_split_text_empty_input(self):
"""Test handling of empty input text."""
component = SplitTextComponent()
@ -198,7 +197,7 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
results = component.split_text()
assert len(results) == 1, f"Expected 1 chunk, got {len(results)}"
assert results[0].text == test_text, f"Expected '{test_text}', got '{results[0].text}'"
assert results["text"][0] == test_text, f"Expected '{test_text}', got '{results['text'][0]}'"
def test_split_text_multiple_inputs(self):
"""Test splitting multiple input texts."""
@ -218,10 +217,10 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
results = component.split_text()
assert len(results) == 4, f"Expected 4 chunks (2 from each text), got {len(results)}"
assert "First text" in results[0].text, f"Expected 'First text', got '{results[0].text}'"
assert "Second line" in results[1].text, f"Expected 'Second line', got '{results[1].text}'"
assert "Another text" in results[2].text, f"Expected 'Another text', got '{results[2].text}'"
assert "Another line" in results[3].text, f"Expected 'Another line', got '{results[3].text}'"
assert "First text" in results["text"][0], f"Expected 'First text', got '{results['text'][0]}'"
assert "Second line" in results["text"][1], f"Expected 'Second line', got '{results['text'][1]}'"
assert "Another text" in results["text"][2], f"Expected 'Another text', got '{results['text'][2]}'"
assert "Another line" in results["text"][3], f"Expected 'Another line', got '{results['text'][3]}'"
def test_split_text_with_dataframe_input(self):
"""Test splitting text with DataFrame input."""
@ -242,10 +241,10 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
results = component.split_text()
assert len(results) == 4, f"Expected 4 chunks (2 from each text), got {len(results)}"
assert "First text" in results[0].text, f"Expected 'First text', got '{results[0].text}'"
assert "Second line" in results[1].text, f"Expected 'Second line', got '{results[1].text}'"
assert "Another text" in results[2].text, f"Expected 'Another text', got '{results[2].text}'"
assert "Another line" in results[3].text, f"Expected 'Another line', got '{results[3].text}'"
assert "First text" in results["text"][0], f"Expected 'First text', got '{results['text'][0]}'"
assert "Second line" in results["text"][1], f"Expected 'Second line', got '{results['text'][1]}'"
assert "Another text" in results["text"][2], f"Expected 'Another text', got '{results['text'][2]}'"
assert "Another line" in results["text"][3], f"Expected 'Another line', got '{results['text'][3]}'"
def test_with_url_loader(self):
"""Test splitting text with URL loader."""
@ -267,5 +266,5 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
)
results = component.split_text()
assert isinstance(results, list), "Expected list instance"
assert isinstance(results, DataFrame), "Expected DataFrame instance"
assert len(results) > 2, f"Expected DataFrame with more than 2 rows, got {len(results)}"

View file

@ -130,7 +130,7 @@ test(
await expect(page.getByTestId("logicSub Flow [Deprecated]")).toBeVisible();
await expect(page.getByTestId("processingSplit Text")).toBeVisible();
await expect(page.getByTestId("processingData Operations")).toBeVisible();
await page.getByTestId("icon-X").first().click();

View file

@ -58,13 +58,13 @@ test(
//fourth component
await page.getByTestId("sidebar-search-input").click();
await page.getByTestId("sidebar-search-input").fill("data to message");
await page.waitForSelector('[data-testid="processingData to Message"]', {
await page.getByTestId("sidebar-search-input").fill("Parser");
await page.waitForSelector('[data-testid="processingParser"]', {
timeout: 1000,
});
await page
.getByTestId("processingData to Message")
.getByTestId("processingParser")
.dragTo(page.locator('//*[@id="react-flow-id"]'), {
targetPosition: { x: 50, y: 300 },
});
@ -133,11 +133,13 @@ test(
.getByTestId("handle-splittext-shownode-chunks-right")
.nth(0)
.click();
await page.getByTestId("handle-parsedata-shownode-data-left").click();
await page
.getByTestId("handle-parsercomponent-shownode-data or dataframe-left")
.click();
//connection 4
await page
.getByTestId("handle-parsedata-shownode-message-right")
.getByTestId("handle-parsercomponent-shownode-parsed text-right")
.nth(0)
.click();
await page.getByTestId("handle-chatoutput-shownode-inputs-left").click();