feat: add support for accepting a DataFrame as input to Split Text, and add relevant tests (#6302)
* update to support dataframe * [autofix.ci] apply automated fixes * [autofix.ci] apply automated fixes (attempt 2/3) * Update split_text.py * [autofix.ci] apply automated fixes * [autofix.ci] apply automated fixes * update names * Update src/backend/base/langflow/schema/dataframe.py Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org> * [autofix.ci] apply automated fixes * update to template * update review changes * Update Vector Store RAG.json * fix lint errors * fix tests * 📝 (freeze.spec.ts): update test description to match the actual element being tested for better clarity and accuracy * ✨ (stop-button-playground.spec.ts): improve test reliability by specifying target position for drag action to prevent flakiness * ✅ (logs.spec.ts): increase timeout from 1000ms to 3000ms for better test reliability ✅ (stop-building.spec.ts): update test selector from "handle-splittext-shownode-data inputs-left" to "handle-splittext-shownode-input documents-left" for accurate testing ✅ (starter-projects.spec.ts): add a 1000ms timeout before asserting visibility of an element for better test stability --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org> Co-authored-by: cristhianzl <cristhian.lousa@gmail.com>
This commit is contained in:
parent
b43bf8f783
commit
e8529eaecb
11 changed files with 1184 additions and 924 deletions
|
|
@ -1,4 +1,5 @@
|
|||
import pytest
|
||||
from langflow.components.data import URLComponent
|
||||
from langflow.components.processing import SplitTextComponent
|
||||
from langflow.schema import Data, DataFrame
|
||||
|
||||
|
|
@ -44,6 +45,7 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
|
|||
"chunk_overlap": 0,
|
||||
"chunk_size": 15,
|
||||
"separator": "\n",
|
||||
"text_key": "text",
|
||||
"session_id": "test_session",
|
||||
"sender": "test_sender",
|
||||
"sender_name": "test_sender_name",
|
||||
|
|
@ -220,3 +222,53 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
|
|||
assert "Second line" in results[1].text, f"Expected 'Second line', got '{results[1].text}'"
|
||||
assert "Another text" in results[2].text, f"Expected 'Another text', got '{results[2].text}'"
|
||||
assert "Another line" in results[3].text, f"Expected 'Another line', got '{results[3].text}'"
|
||||
|
||||
def test_split_text_with_dataframe_input(self):
    """Split a two-row DataFrame on newlines and verify every chunk.

    Each input text holds two newline-separated lines, so four chunks are
    expected, in the same order as the input lines.
    """
    splitter = SplitTextComponent()
    source_texts = ["First text\nSecond line", "Another text\nAnother line"]
    rows = [Data(text=text) for text in source_texts]
    settings = {
        "data_inputs": DataFrame(rows),
        "chunk_overlap": 0,
        "chunk_size": 10,
        "separator": "\n",
        "session_id": "test_session",
        "sender": "test_sender",
        "sender_name": "test_sender_name",
    }
    splitter.set_attributes(settings)

    results = splitter.split_text()

    assert len(results) == 4, f"Expected 4 chunks (2 from each text), got {len(results)}"
    # The length check above guarantees zip pairs every expected fragment
    # with exactly one chunk.
    expected_fragments = ("First text", "Second line", "Another text", "Another line")
    for fragment, chunk in zip(expected_fragments, results):
        assert fragment in chunk.text, f"Expected '{fragment}', got '{chunk.text}'"
|
||||
|
||||
def test_with_url_loader(self):
    """Test splitting text that was loaded through ``URLComponent``.

    The loader output is checked first (a ``DataFrame`` with one row per
    URL), then both output forms of the splitter are checked: the
    ``DataFrame`` from ``as_dataframe()`` and the list from ``split_text()``.

    NOTE(review): URLComponent presumably fetches these Wikipedia pages over
    the network, which would make this test depend on connectivity and on
    remote content -- confirm, and consider marking or mocking accordingly.
    """
    component = SplitTextComponent()
    urls = ["https://en.wikipedia.org/wiki/London", "https://en.wikipedia.org/wiki/Paris"]
    data_frame = URLComponent(urls=urls, format="Text").as_dataframe()
    assert isinstance(data_frame, DataFrame), "Expected DataFrame instance"
    assert len(data_frame) == 2, f"Expected DataFrame with 2 rows, got {len(data_frame)}"
    component.set_attributes(
        {
            "data_inputs": data_frame,
            "chunk_overlap": 0,
            "chunk_size": 10,
            "separator": "\n",
            "session_id": "test_session",
            "sender": "test_sender",
            "sender_name": "test_sender_name",
        }
    )

    # DataFrame output: splitting two pages must yield more rows than pages.
    split_frame = component.as_dataframe()
    assert isinstance(split_frame, DataFrame), "Expected DataFrame instance"
    assert len(split_frame) > 2, f"Expected DataFrame with more than 2 rows, got {len(split_frame)}"

    # List output: the failure message names the type actually being checked.
    chunks = component.split_text()
    assert isinstance(chunks, list), "Expected list instance"
    assert len(chunks) > 2, f"Expected list with more than 2 items, got {len(chunks)}"
|
||||
|
|
|
|||
66
src/backend/tests/unit/schema/test_schema_dataframe.py
Normal file
66
src/backend/tests/unit/schema/test_schema_dataframe.py
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
import pandas as pd
|
||||
import pytest
|
||||
from langchain_core.documents import Document
|
||||
from langflow.schema.data import Data
|
||||
from langflow.schema.dataframe import DataFrame
|
||||
|
||||
|
||||
@pytest.fixture
def sample_dataframe():
    """Provide a two-row pandas DataFrame with ``name`` and ``text`` columns."""
    columns = {
        "name": ["John", "Jane"],
        "text": ["name is John", "name is Jane"],
    }
    return pd.DataFrame(columns)
|
||||
|
||||
|
||||
class TestDataFrameSchema:
    """Tests for the ``DataFrame`` class from ``langflow.schema.dataframe``."""

    def test_to_data_list(self, sample_dataframe):
        """``to_data_list`` returns one ``Data`` per row with the row's values."""
        records = DataFrame(sample_dataframe).to_data_list()

        assert isinstance(records, list)
        assert all(isinstance(record, Data) for record in records)
        assert len(records) == len(sample_dataframe)

        first = records[0]
        assert first.data["name"] == "John"
        assert first.data["text"] == "name is John"

    def test_add_row(self, sample_dataframe):
        """``add_row`` accepts a dict or a ``Data`` and returns a frame one row longer."""
        base = DataFrame(sample_dataframe)

        # Row supplied as a plain dict.
        with_dict = base.add_row({"name": "Bob", "text": "name is Bob"})
        assert len(with_dict) == len(sample_dataframe) + 1
        last_row = with_dict.iloc[-1]
        assert last_row["name"] == "Bob"
        assert last_row["text"] == "name is Bob"

        # Row supplied as a Data object, added to the same base frame again;
        # the expected length is still original + 1.
        alice = Data(data={"name": "Alice", "text": "name is Alice"})
        with_data = base.add_row(alice)
        assert len(with_data) == len(sample_dataframe) + 1
        last_row = with_data.iloc[-1]
        assert last_row["name"] == "Alice"
        assert last_row["text"] == "name is Alice"

    def test_add_rows(self, sample_dataframe):
        """``add_rows`` appends a mixed list of dicts and ``Data`` objects in order."""
        base = DataFrame(sample_dataframe)
        extra_rows = [
            {"name": "Bob", "text": "name is Bob"},
            Data(data={"name": "Alice", "text": "name is Alice"}),
        ]

        grown = base.add_rows(extra_rows)

        assert len(grown) == len(sample_dataframe) + 2
        tail = grown.iloc[-2:]
        assert tail["name"].tolist() == ["Bob", "Alice"]
        assert tail["text"].tolist() == ["name is Bob", "name is Alice"]

    def test_to_lc_documents(self, sample_dataframe):
        """``to_lc_documents`` maps ``text`` to page content and the rest to metadata."""
        docs = DataFrame(sample_dataframe).to_lc_documents()

        assert isinstance(docs, list)
        assert all(isinstance(doc, Document) for doc in docs)
        assert len(docs) == 2

        head = docs[0]
        assert head.page_content == "name is John"
        assert head.metadata == {"name": "John"}

    def test_bool_operator(self):
        """An empty frame is falsy under ``bool()``; a populated one is truthy."""
        without_rows = DataFrame()
        assert not bool(without_rows)

        with_rows = DataFrame({"name": ["John"], "text": ["name is John"]})
        assert bool(with_rows)
|
||||
Loading…
Add table
Add a link
Reference in a new issue