fix: Split text should only have dataframe output (#8362)
* fix: Split text should only have dataframe output
* [autofix.ci] apply automated fixes
* Update templates
* test fix
* Update Vector Store RAG.json
* Update starter projects
* fix tests
* add shards

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Mike Fortman <michael.fortman@datastax.com>
Co-authored-by: cristhianzl <cristhian.lousa@gmail.com>
This commit is contained in:
parent
79c37c8f50
commit
e5e54ea606
6 changed files with 268 additions and 291 deletions
6
.github/workflows/typescript_test.yml
vendored
6
.github/workflows/typescript_test.yml
vendored
|
|
@ -220,12 +220,12 @@ jobs:
|
|||
|
||||
echo "Total tests to run: $TEST_COUNT"
|
||||
|
||||
# Calculate optimal shard count - 1 shard per 5 tests, min 1, max 10
|
||||
# Calculate optimal shard count - 1 shard per 5 tests, min 1, max 40
|
||||
SHARD_COUNT=$(( (TEST_COUNT + 4) / 5 ))
|
||||
if [ $SHARD_COUNT -lt 1 ]; then
|
||||
SHARD_COUNT=1
|
||||
elif [ $SHARD_COUNT -gt 10 ]; then
|
||||
SHARD_COUNT=10
|
||||
elif [ $SHARD_COUNT -gt 40 ]; then
|
||||
SHARD_COUNT=40
|
||||
fi
|
||||
|
||||
# Create the matrix combinations string
|
||||
|
|
|
|||
|
|
@ -64,8 +64,7 @@ class SplitTextComponent(Component):
|
|||
]
|
||||
|
||||
outputs = [
|
||||
Output(display_name="Chunks", name="chunks", method="split_text"),
|
||||
Output(display_name="DataFrame", name="dataframe", method="as_dataframe"),
|
||||
Output(display_name="Chunks", name="dataframe", method="split_text"),
|
||||
]
|
||||
|
||||
def _docs_to_data(self, docs) -> list[Data]:
|
||||
|
|
@ -133,8 +132,5 @@ class SplitTextComponent(Component):
|
|||
msg = f"Error splitting text: {e}"
|
||||
raise TypeError(msg) from e
|
||||
|
||||
def split_text(self) -> list[Data]:
|
||||
return self._docs_to_data(self.split_text_base())
|
||||
|
||||
def as_dataframe(self) -> DataFrame:
|
||||
return DataFrame(self.split_text())
|
||||
def split_text(self) -> DataFrame:
|
||||
return DataFrame(self._docs_to_data(self.split_text_base()))
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
|
|
@ -38,7 +38,7 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
|
|||
def test_split_text_basic(self):
|
||||
"""Test basic text splitting functionality."""
|
||||
component = SplitTextComponent()
|
||||
test_text = "This is a test.\nIt has multiple lines.\nEach line should be a chunk."
|
||||
test_text = "First chunk\nSecond chunk\nThird chunk"
|
||||
component.set_attributes(
|
||||
{
|
||||
"data_inputs": [Data(text=test_text)],
|
||||
|
|
@ -52,12 +52,18 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
|
|||
}
|
||||
)
|
||||
|
||||
results = component.split_text()
|
||||
assert len(results) == 3, f"Expected 3 chunks, got {len(results)}"
|
||||
assert "This is a test" in results[0].text, f"Expected 'This is a test', got '{results[0].text}'"
|
||||
assert "It has multiple lines" in results[1].text, f"Expected 'It has multiple lines', got '{results[1].text}'"
|
||||
assert "Each line should be a chunk" in results[2].text, (
|
||||
f"Expected 'Each line should be a chunk', got '{results[2].text}'"
|
||||
data_frame = component.split_text()
|
||||
assert isinstance(data_frame, DataFrame), "Expected DataFrame instance"
|
||||
assert len(data_frame) == 3, f"Expected DataFrame with 3 rows, got {len(data_frame)}"
|
||||
assert list(data_frame.columns) == ["text"], f"Expected columns ['text'], got {list(data_frame.columns)}"
|
||||
assert "First chunk" in data_frame.iloc[0]["text"], (
|
||||
f"Expected 'First chunk', got '{data_frame.iloc[0]['text']}'"
|
||||
)
|
||||
assert "Second chunk" in data_frame.iloc[1]["text"], (
|
||||
f"Expected 'Second chunk', got '{data_frame.iloc[1]['text']}'"
|
||||
)
|
||||
assert "Third chunk" in data_frame.iloc[2]["text"], (
|
||||
f"Expected 'Third chunk', got '{data_frame.iloc[2]['text']}'"
|
||||
)
|
||||
|
||||
def test_split_text_with_overlap(self):
|
||||
|
|
@ -76,79 +82,7 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
|
|||
}
|
||||
)
|
||||
|
||||
results = component.split_text()
|
||||
assert len(results) > 1, f"Expected more than 1 chunk, got {len(results)}"
|
||||
# Check that chunks contain the expected text
|
||||
assert "First chunk" in results[0].text, f"Expected 'First chunk' in '{results[0].text}'"
|
||||
assert "Second chunk" in results[1].text, f"Expected 'Second chunk' in '{results[1].text}'"
|
||||
assert "Third chunk" in results[2].text, f"Expected 'Third chunk' in '{results[2].text}'"
|
||||
|
||||
def test_split_text_custom_separator(self):
|
||||
"""Test text splitting with a custom separator."""
|
||||
component = SplitTextComponent()
|
||||
test_text = "First part|Second part|Third part"
|
||||
component.set_attributes(
|
||||
{
|
||||
"data_inputs": [Data(text=test_text)],
|
||||
"chunk_overlap": 0,
|
||||
"chunk_size": 10,
|
||||
"separator": "|",
|
||||
"session_id": "test_session",
|
||||
"sender": "test_sender",
|
||||
"sender_name": "test_sender_name",
|
||||
}
|
||||
)
|
||||
|
||||
results = component.split_text()
|
||||
assert len(results) == 3, f"Expected 3 chunks, got {len(results)}"
|
||||
assert "First part" in results[0].text, f"Expected 'First part', got '{results[0].text}'"
|
||||
assert "Second part" in results[1].text, f"Expected 'Second part', got '{results[1].text}'"
|
||||
assert "Third part" in results[2].text, f"Expected 'Third part', got '{results[2].text}'"
|
||||
|
||||
def test_split_text_with_metadata(self):
|
||||
"""Test text splitting while preserving metadata."""
|
||||
component = SplitTextComponent()
|
||||
test_metadata = {"source": "test.txt", "author": "test"}
|
||||
test_text = "Chunk 1\nChunk 2"
|
||||
component.set_attributes(
|
||||
{
|
||||
"data_inputs": [Data(text=test_text, data=test_metadata)],
|
||||
"chunk_overlap": 0,
|
||||
"chunk_size": 7,
|
||||
"separator": "\n",
|
||||
"session_id": "test_session",
|
||||
"sender": "test_sender",
|
||||
"sender_name": "test_sender_name",
|
||||
}
|
||||
)
|
||||
|
||||
results = component.split_text()
|
||||
assert len(results) == 2, f"Expected 2 chunks, got {len(results)}"
|
||||
for result in results:
|
||||
assert result.data["source"] == test_metadata["source"], (
|
||||
f"Expected source '{test_metadata['source']}', got '{result.data.get('source')}'"
|
||||
)
|
||||
assert result.data["author"] == test_metadata["author"], (
|
||||
f"Expected author '{test_metadata['author']}', got '{result.data.get('author')}'"
|
||||
)
|
||||
|
||||
def test_split_text_as_dataframe(self):
|
||||
"""Test converting split text results to DataFrame."""
|
||||
component = SplitTextComponent()
|
||||
test_text = "First chunk\nSecond chunk\nThird chunk"
|
||||
component.set_attributes(
|
||||
{
|
||||
"data_inputs": [Data(text=test_text)],
|
||||
"chunk_overlap": 0,
|
||||
"chunk_size": 11,
|
||||
"separator": "\n",
|
||||
"session_id": "test_session",
|
||||
"sender": "test_sender",
|
||||
"sender_name": "test_sender_name",
|
||||
}
|
||||
)
|
||||
|
||||
data_frame = component.as_dataframe()
|
||||
data_frame = component.split_text()
|
||||
assert isinstance(data_frame, DataFrame), "Expected DataFrame instance"
|
||||
assert len(data_frame) == 3, f"Expected DataFrame with 3 rows, got {len(data_frame)}"
|
||||
assert list(data_frame.columns) == ["text"], f"Expected columns ['text'], got {list(data_frame.columns)}"
|
||||
|
|
@ -162,6 +96,71 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
|
|||
f"Expected 'Third chunk', got '{data_frame.iloc[2]['text']}'"
|
||||
)
|
||||
|
||||
def test_split_text_custom_separator(self):
|
||||
"""Test text splitting with a custom separator."""
|
||||
component = SplitTextComponent()
|
||||
test_text = "First chunk.|Second chunk.|Third chunk."
|
||||
component.set_attributes(
|
||||
{
|
||||
"data_inputs": [Data(text=test_text)],
|
||||
"chunk_overlap": 0,
|
||||
"chunk_size": 10,
|
||||
"separator": "|",
|
||||
"session_id": "test_session",
|
||||
"sender": "test_sender",
|
||||
"sender_name": "test_sender_name",
|
||||
}
|
||||
)
|
||||
|
||||
data_frame = component.split_text()
|
||||
assert isinstance(data_frame, DataFrame), "Expected DataFrame instance"
|
||||
assert len(data_frame) == 3, f"Expected DataFrame with 3 rows, got {len(data_frame)}"
|
||||
assert list(data_frame.columns) == ["text"], f"Expected columns ['text'], got {list(data_frame.columns)}"
|
||||
assert "First chunk" in data_frame.iloc[0]["text"], (
|
||||
f"Expected 'First chunk', got '{data_frame.iloc[0]['text']}'"
|
||||
)
|
||||
assert "Second chunk" in data_frame.iloc[1]["text"], (
|
||||
f"Expected 'Second chunk', got '{data_frame.iloc[1]['text']}'"
|
||||
)
|
||||
assert "Third chunk" in data_frame.iloc[2]["text"], (
|
||||
f"Expected 'Third chunk', got '{data_frame.iloc[2]['text']}'"
|
||||
)
|
||||
|
||||
def test_split_text_with_metadata(self):
|
||||
"""Test text splitting while preserving metadata."""
|
||||
component = SplitTextComponent()
|
||||
test_metadata = {"source": "test.txt", "author": "test"}
|
||||
test_text = "First chunk\nSecond chunk"
|
||||
component.set_attributes(
|
||||
{
|
||||
"data_inputs": [Data(text=test_text, data=test_metadata)],
|
||||
"chunk_overlap": 0,
|
||||
"chunk_size": 7,
|
||||
"separator": "\n",
|
||||
"session_id": "test_session",
|
||||
"sender": "test_sender",
|
||||
"sender_name": "test_sender_name",
|
||||
}
|
||||
)
|
||||
|
||||
data_frame = component.split_text()
|
||||
assert isinstance(data_frame, DataFrame), "Expected DataFrame instance"
|
||||
assert len(data_frame) == 2, f"Expected DataFrame with 2 rows, got {len(data_frame)}"
|
||||
assert "First chunk" in data_frame.iloc[0]["text"], (
|
||||
f"Expected 'First chunk', got '{data_frame.iloc[0]['text']}'"
|
||||
)
|
||||
assert "Second chunk" in data_frame.iloc[1]["text"], (
|
||||
f"Expected 'Second chunk', got '{data_frame.iloc[1]['text']}'"
|
||||
)
|
||||
# Loop over each row to check metadata
|
||||
for _, row in data_frame.iterrows():
|
||||
assert row["source"] == test_metadata["source"], (
|
||||
f"Expected source '{test_metadata['source']}', got '{row['source']}'"
|
||||
)
|
||||
assert row["author"] == test_metadata["author"], (
|
||||
f"Expected author '{test_metadata['author']}', got '{row['author']}'"
|
||||
)
|
||||
|
||||
def test_split_text_empty_input(self):
|
||||
"""Test handling of empty input text."""
|
||||
component = SplitTextComponent()
|
||||
|
|
@ -198,7 +197,7 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
|
|||
|
||||
results = component.split_text()
|
||||
assert len(results) == 1, f"Expected 1 chunk, got {len(results)}"
|
||||
assert results[0].text == test_text, f"Expected '{test_text}', got '{results[0].text}'"
|
||||
assert results["text"][0] == test_text, f"Expected '{test_text}', got '{results['text'][0]}'"
|
||||
|
||||
def test_split_text_multiple_inputs(self):
|
||||
"""Test splitting multiple input texts."""
|
||||
|
|
@ -218,10 +217,10 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
|
|||
|
||||
results = component.split_text()
|
||||
assert len(results) == 4, f"Expected 4 chunks (2 from each text), got {len(results)}"
|
||||
assert "First text" in results[0].text, f"Expected 'First text', got '{results[0].text}'"
|
||||
assert "Second line" in results[1].text, f"Expected 'Second line', got '{results[1].text}'"
|
||||
assert "Another text" in results[2].text, f"Expected 'Another text', got '{results[2].text}'"
|
||||
assert "Another line" in results[3].text, f"Expected 'Another line', got '{results[3].text}'"
|
||||
assert "First text" in results["text"][0], f"Expected 'First text', got '{results['text'][0]}'"
|
||||
assert "Second line" in results["text"][1], f"Expected 'Second line', got '{results['text'][1]}'"
|
||||
assert "Another text" in results["text"][2], f"Expected 'Another text', got '{results['text'][2]}'"
|
||||
assert "Another line" in results["text"][3], f"Expected 'Another line', got '{results['text'][3]}'"
|
||||
|
||||
def test_split_text_with_dataframe_input(self):
|
||||
"""Test splitting text with DataFrame input."""
|
||||
|
|
@ -242,10 +241,10 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
|
|||
|
||||
results = component.split_text()
|
||||
assert len(results) == 4, f"Expected 4 chunks (2 from each text), got {len(results)}"
|
||||
assert "First text" in results[0].text, f"Expected 'First text', got '{results[0].text}'"
|
||||
assert "Second line" in results[1].text, f"Expected 'Second line', got '{results[1].text}'"
|
||||
assert "Another text" in results[2].text, f"Expected 'Another text', got '{results[2].text}'"
|
||||
assert "Another line" in results[3].text, f"Expected 'Another line', got '{results[3].text}'"
|
||||
assert "First text" in results["text"][0], f"Expected 'First text', got '{results['text'][0]}'"
|
||||
assert "Second line" in results["text"][1], f"Expected 'Second line', got '{results['text'][1]}'"
|
||||
assert "Another text" in results["text"][2], f"Expected 'Another text', got '{results['text'][2]}'"
|
||||
assert "Another line" in results["text"][3], f"Expected 'Another line', got '{results['text'][3]}'"
|
||||
|
||||
def test_with_url_loader(self):
|
||||
"""Test splitting text with URL loader."""
|
||||
|
|
@ -267,5 +266,5 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
|
|||
)
|
||||
|
||||
results = component.split_text()
|
||||
assert isinstance(results, list), "Expected list instance"
|
||||
assert isinstance(results, DataFrame), "Expected DataFrame instance"
|
||||
assert len(results) > 2, f"Expected DataFrame with more than 2 rows, got {len(results)}"
|
||||
|
|
|
|||
|
|
@ -130,7 +130,7 @@ test(
|
|||
|
||||
await expect(page.getByTestId("logicSub Flow [Deprecated]")).toBeVisible();
|
||||
|
||||
await expect(page.getByTestId("processingSplit Text")).toBeVisible();
|
||||
await expect(page.getByTestId("processingData Operations")).toBeVisible();
|
||||
|
||||
await page.getByTestId("icon-X").first().click();
|
||||
|
||||
|
|
|
|||
|
|
@ -58,13 +58,13 @@ test(
|
|||
//fourth component
|
||||
|
||||
await page.getByTestId("sidebar-search-input").click();
|
||||
await page.getByTestId("sidebar-search-input").fill("data to message");
|
||||
await page.waitForSelector('[data-testid="processingData to Message"]', {
|
||||
await page.getByTestId("sidebar-search-input").fill("Parser");
|
||||
await page.waitForSelector('[data-testid="processingParser"]', {
|
||||
timeout: 1000,
|
||||
});
|
||||
|
||||
await page
|
||||
.getByTestId("processingData to Message")
|
||||
.getByTestId("processingParser")
|
||||
.dragTo(page.locator('//*[@id="react-flow-id"]'), {
|
||||
targetPosition: { x: 50, y: 300 },
|
||||
});
|
||||
|
|
@ -133,11 +133,13 @@ test(
|
|||
.getByTestId("handle-splittext-shownode-chunks-right")
|
||||
.nth(0)
|
||||
.click();
|
||||
await page.getByTestId("handle-parsedata-shownode-data-left").click();
|
||||
await page
|
||||
.getByTestId("handle-parsercomponent-shownode-data or dataframe-left")
|
||||
.click();
|
||||
|
||||
//connection 4
|
||||
await page
|
||||
.getByTestId("handle-parsedata-shownode-message-right")
|
||||
.getByTestId("handle-parsercomponent-shownode-parsed text-right")
|
||||
.nth(0)
|
||||
.click();
|
||||
await page.getByTestId("handle-chatoutput-shownode-inputs-left").click();
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue