feat: add support for accepting DataFrame as input to split text, and add relevant tests (#6302)

* update to support dataframe

* [autofix.ci] apply automated fixes

* [autofix.ci] apply automated fixes (attempt 2/3)

* Update split_text.py

* [autofix.ci] apply automated fixes

* [autofix.ci] apply automated fixes

* update names

* Update src/backend/base/langflow/schema/dataframe.py

Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org>

* [autofix.ci] apply automated fixes

* update to template

* update review changes

* Update Vector Store RAG.json

* fix lint errors

* fix tests

* 📝 (freeze.spec.ts): update test description to match the actual element being tested for better clarity and accuracy

* (stop-button-playground.spec.ts): improve test reliability by specifying target position for drag action to prevent flakiness

* (logs.spec.ts): increase timeout from 1000ms to 3000ms for better test reliability
 (stop-building.spec.ts): update test selector from "handle-splittext-shownode-data inputs-left" to "handle-splittext-shownode-input documents-left" for accurate testing
 (starter-projects.spec.ts): add a 1000ms timeout before asserting visibility of an element for better test stability

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org>
Co-authored-by: cristhianzl <cristhian.lousa@gmail.com>
This commit is contained in:
Edwin Jose 2025-02-19 15:40:56 -05:00 committed by GitHub
commit e8529eaecb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 1184 additions and 924 deletions

View file

@ -15,10 +15,9 @@ class SplitTextComponent(Component):
inputs = [ inputs = [
HandleInput( HandleInput(
name="data_inputs", name="data_inputs",
display_name="Data Inputs", display_name="Input Documents",
info="The data to split.", info="The data to split.",
input_types=["Data"], input_types=["Data", "DataFrame"],
is_list=True,
required=True, required=True,
), ),
IntInput( IntInput(
@ -39,6 +38,13 @@ class SplitTextComponent(Component):
info="The character to split on. Defaults to newline.", info="The character to split on. Defaults to newline.",
value="\n", value="\n",
), ),
MessageTextInput(
name="text_key",
display_name="Text Key",
info="The key to use for the text column.",
value="text",
advanced=True,
),
] ]
outputs = [ outputs = [
@ -46,23 +52,57 @@ class SplitTextComponent(Component):
Output(display_name="DataFrame", name="dataframe", method="as_dataframe"), Output(display_name="DataFrame", name="dataframe", method="as_dataframe"),
] ]
def _docs_to_data(self, docs): def _docs_to_data(self, docs) -> list[Data]:
return [Data(text=doc.page_content, data=doc.metadata) for doc in docs] return [Data(text=doc.page_content, data=doc.metadata) for doc in docs]
def split_text(self) -> list[Data]: def _docs_to_dataframe(self, docs):
data_dicts = [{self.text_key: doc.page_content, **doc.metadata} for doc in docs]
return DataFrame(data_dicts)
def split_text_base(self):
separator = unescape_string(self.separator) separator = unescape_string(self.separator)
if isinstance(self.data_inputs, DataFrame):
if not len(self.data_inputs):
msg = "DataFrame is empty"
raise TypeError(msg)
documents = [_input.to_lc_document() for _input in self.data_inputs if isinstance(_input, Data)] self.data_inputs.text_key = self.text_key
try:
documents = self.data_inputs.to_lc_documents()
except Exception as e:
msg = f"Error converting DataFrame to documents: {e}"
raise TypeError(msg) from e
else:
if not self.data_inputs:
msg = "No data inputs provided"
raise TypeError(msg)
splitter = CharacterTextSplitter( documents = []
chunk_overlap=self.chunk_overlap, if isinstance(self.data_inputs, Data):
chunk_size=self.chunk_size, self.data_inputs.text_key = self.text_key
separator=separator, documents = [self.data_inputs.to_lc_document()]
) else:
docs = splitter.split_documents(documents) try:
data = self._docs_to_data(docs) documents = [input_.to_lc_document() for input_ in self.data_inputs if isinstance(input_, Data)]
self.status = data if not documents:
return data msg = f"No valid Data inputs found in {type(self.data_inputs)}"
raise TypeError(msg)
except AttributeError as e:
msg = f"Invalid input type in collection: {e}"
raise TypeError(msg) from e
try:
splitter = CharacterTextSplitter(
chunk_overlap=self.chunk_overlap,
chunk_size=self.chunk_size,
separator=separator,
)
return splitter.split_documents(documents)
except Exception as e:
msg = f"Error splitting text: {e}"
raise TypeError(msg) from e
def split_text(self) -> list[Data]:
return self._docs_to_data(self.split_text_base())
def as_dataframe(self) -> DataFrame: def as_dataframe(self) -> DataFrame:
return DataFrame(self.split_text()) return self._docs_to_dataframe(self.split_text_base())

File diff suppressed because one or more lines are too long

View file

@ -1,6 +1,7 @@
from typing import cast from typing import cast
import pandas as pd import pandas as pd
from langchain_core.documents import Document
from pandas import DataFrame as pandas_DataFrame from pandas import DataFrame as pandas_DataFrame
from langflow.schema.data import Data from langflow.schema.data import Data
@ -32,9 +33,21 @@ class DataFrame(pandas_DataFrame):
>>> dataset = DataFrame({"name": ["John", "Jane"], "age": [30, 25]}) >>> dataset = DataFrame({"name": ["John", "Jane"], "age": [30, 25]})
""" """
def __init__(self, data: list[dict] | list[Data] | pd.DataFrame | None = None, **kwargs): def __init__(
self,
data: list[dict] | list[Data] | pd.DataFrame | None = None,
text_key: str = "text",
default_value: str = "",
**kwargs,
):
# Initialize pandas DataFrame first without data
super().__init__(**kwargs) # Removed data parameter
# Store attributes as private members to avoid conflicts with pandas
self._text_key = text_key
self._default_value = default_value
if data is None: if data is None:
super().__init__(**kwargs)
return return
if isinstance(data, list): if isinstance(data, list):
@ -43,15 +56,36 @@ class DataFrame(pandas_DataFrame):
elif not all(isinstance(x, dict) for x in data): elif not all(isinstance(x, dict) for x in data):
msg = "List items must be either all Data objects or all dictionaries" msg = "List items must be either all Data objects or all dictionaries"
raise ValueError(msg) raise ValueError(msg)
kwargs["data"] = data self._update(data, **kwargs)
elif isinstance(data, dict | pd.DataFrame): elif isinstance(data, dict | pd.DataFrame): # Fixed type check syntax
kwargs["data"] = data self._update(data, **kwargs)
super().__init__(**kwargs) def _update(self, data, **kwargs):
"""Helper method to update DataFrame with new data."""
new_df = pd.DataFrame(data, **kwargs)
self._update_inplace(new_df)
# Update property accessors
@property
def text_key(self) -> str:
return self._text_key
@text_key.setter
def text_key(self, value: str) -> None:
self._text_key = value
@property
def default_value(self) -> str:
return self._default_value
@default_value.setter
def default_value(self, value: str) -> None:
self._default_value = value
def to_data_list(self) -> list[Data]: def to_data_list(self) -> list[Data]:
"""Converts the DataFrame back to a list of Data objects.""" """Converts the DataFrame back to a list of Data objects."""
list_of_dicts = self.to_dict(orient="records") list_of_dicts = self.to_dict(orient="records")
# suggested change: [Data(**row) for row in list_of_dicts]
return [Data(data=row) for row in list_of_dicts] return [Data(data=row) for row in list_of_dicts]
def add_row(self, data: dict | Data) -> "DataFrame": def add_row(self, data: dict | Data) -> "DataFrame":
@ -103,3 +137,31 @@ class DataFrame(pandas_DataFrame):
Returns True if the DataFrame has at least one row, False otherwise. Returns True if the DataFrame has at least one row, False otherwise.
""" """
return not self.empty return not self.empty
def to_lc_documents(self) -> list[Document]:
"""Converts the DataFrame to a list of Documents.
Returns:
list[Document]: The converted list of Documents.
"""
list_of_dicts = self.to_dict(orient="records")
documents = []
for row in list_of_dicts:
data_copy = row.copy()
text = data_copy.pop(self._text_key, self._default_value)
if isinstance(text, str):
documents.append(Document(page_content=text, metadata=data_copy))
else:
documents.append(Document(page_content=str(text), metadata=data_copy))
return documents
def _docs_to_dataframe(self, docs):
"""Converts a list of Documents to a DataFrame.
Args:
docs: List of Document objects
Returns:
DataFrame: A new DataFrame with the converted Documents
"""
return DataFrame(docs)

View file

@ -1,4 +1,5 @@
import pytest import pytest
from langflow.components.data import URLComponent
from langflow.components.processing import SplitTextComponent from langflow.components.processing import SplitTextComponent
from langflow.schema import Data, DataFrame from langflow.schema import Data, DataFrame
@ -44,6 +45,7 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
"chunk_overlap": 0, "chunk_overlap": 0,
"chunk_size": 15, "chunk_size": 15,
"separator": "\n", "separator": "\n",
"text_key": "text",
"session_id": "test_session", "session_id": "test_session",
"sender": "test_sender", "sender": "test_sender",
"sender_name": "test_sender_name", "sender_name": "test_sender_name",
@ -220,3 +222,53 @@ class TestSplitTextComponent(ComponentTestBaseWithoutClient):
assert "Second line" in results[1].text, f"Expected 'Second line', got '{results[1].text}'" assert "Second line" in results[1].text, f"Expected 'Second line', got '{results[1].text}'"
assert "Another text" in results[2].text, f"Expected 'Another text', got '{results[2].text}'" assert "Another text" in results[2].text, f"Expected 'Another text', got '{results[2].text}'"
assert "Another line" in results[3].text, f"Expected 'Another line', got '{results[3].text}'" assert "Another line" in results[3].text, f"Expected 'Another line', got '{results[3].text}'"
def test_split_text_with_dataframe_input(self):
"""Test splitting text with DataFrame input."""
component = SplitTextComponent()
test_texts = ["First text\nSecond line", "Another text\nAnother line"]
data_frame = DataFrame([Data(text=text) for text in test_texts])
component.set_attributes(
{
"data_inputs": data_frame,
"chunk_overlap": 0,
"chunk_size": 10,
"separator": "\n",
"session_id": "test_session",
"sender": "test_sender",
"sender_name": "test_sender_name",
}
)
results = component.split_text()
assert len(results) == 4, f"Expected 4 chunks (2 from each text), got {len(results)}"
assert "First text" in results[0].text, f"Expected 'First text', got '{results[0].text}'"
assert "Second line" in results[1].text, f"Expected 'Second line', got '{results[1].text}'"
assert "Another text" in results[2].text, f"Expected 'Another text', got '{results[2].text}'"
assert "Another line" in results[3].text, f"Expected 'Another line', got '{results[3].text}'"
def test_with_url_loader(self):
"""Test splitting text with URL loader."""
component = SplitTextComponent()
url = ["https://en.wikipedia.org/wiki/London", "https://en.wikipedia.org/wiki/Paris"]
data_frame = URLComponent(urls=url, format="Text").as_dataframe()
assert isinstance(data_frame, DataFrame), "Expected DataFrame instance"
assert len(data_frame) == 2, f"Expected DataFrame with 2 rows, got {len(data_frame)}"
component.set_attributes(
{
"data_inputs": data_frame,
"chunk_overlap": 0,
"chunk_size": 10,
"separator": "\n",
"session_id": "test_session",
"sender": "test_sender",
"sender_name": "test_sender_name",
}
)
results = component.as_dataframe()
assert isinstance(results, DataFrame), "Expected DataFrame instance"
assert len(results) > 2, f"Expected DataFrame with more than 2 rows, got {len(results)}"
results = component.split_text()
assert isinstance(results, list), "Expected list instance"
assert len(results) > 2, f"Expected DataFrame with more than 2 rows, got {len(results)}"

View file

@ -0,0 +1,66 @@
import pandas as pd
import pytest
from langchain_core.documents import Document
from langflow.schema.data import Data
from langflow.schema.dataframe import DataFrame
@pytest.fixture
def sample_dataframe():
"""Create a sample DataFrame for testing."""
return pd.DataFrame({"name": ["John", "Jane"], "text": ["name is John", "name is Jane"]})
class TestDataFrameSchema:
def test_to_data_list(self, sample_dataframe):
"""Test conversion of DataFrame to list of Data objects."""
data_frame = DataFrame(sample_dataframe)
data_list = data_frame.to_data_list()
assert isinstance(data_list, list)
assert all(isinstance(item, Data) for item in data_list)
assert len(data_list) == len(sample_dataframe)
assert data_list[0].data["name"] == "John"
assert data_list[0].data["text"] == "name is John"
def test_add_row(self, sample_dataframe):
"""Test adding a single row to DataFrame."""
data_frame = DataFrame(sample_dataframe)
# Test adding dict
new_df = data_frame.add_row({"name": "Bob", "text": "name is Bob"})
assert len(new_df) == len(sample_dataframe) + 1
assert new_df.iloc[-1]["name"] == "Bob"
assert new_df.iloc[-1]["text"] == "name is Bob"
# Test adding Data object
data_obj = Data(data={"name": "Alice", "text": "name is Alice"})
new_df = data_frame.add_row(data_obj)
assert len(new_df) == len(sample_dataframe) + 1
assert new_df.iloc[-1]["name"] == "Alice"
assert new_df.iloc[-1]["text"] == "name is Alice"
def test_add_rows(self, sample_dataframe):
"""Test adding multiple rows to DataFrame."""
data_frame = DataFrame(sample_dataframe)
new_rows = [{"name": "Bob", "text": "name is Bob"}, Data(data={"name": "Alice", "text": "name is Alice"})]
new_df = data_frame.add_rows(new_rows)
assert len(new_df) == len(sample_dataframe) + 2
assert new_df.iloc[-2:]["name"].tolist() == ["Bob", "Alice"]
assert new_df.iloc[-2:]["text"].tolist() == ["name is Bob", "name is Alice"]
def test_to_lc_documents(self, sample_dataframe):
"""Test conversion to LangChain documents."""
data_frame = DataFrame(sample_dataframe)
documents = data_frame.to_lc_documents()
assert isinstance(documents, list)
assert all(isinstance(doc, Document) for doc in documents)
assert len(documents) == 2
assert documents[0].page_content == "name is John"
assert documents[0].metadata == {"name": "John"}
def test_bool_operator(self):
"""Test boolean operator behavior."""
empty_df = DataFrame()
assert not bool(empty_df)
non_empty_df = DataFrame({"name": ["John"], "text": ["name is John"]})
assert bool(non_empty_df)

View file

@ -836,6 +836,7 @@
}, },
"node_modules/@clack/prompts/node_modules/is-unicode-supported": { "node_modules/@clack/prompts/node_modules/is-unicode-supported": {
"version": "1.3.0", "version": "1.3.0",
"extraneous": true,
"inBundle": true, "inBundle": true,
"license": "MIT", "license": "MIT",
"engines": { "engines": {

View file

@ -119,7 +119,7 @@ test(
await urlOutput.hover(); await urlOutput.hover();
await page.mouse.down(); await page.mouse.down();
const splitTextInputData = await page.getByTestId( const splitTextInputData = await page.getByTestId(
"handle-splittext-shownode-data inputs-left", "handle-splittext-shownode-input documents-left",
); );
await splitTextInputData.hover(); await splitTextInputData.hover();
await page.mouse.up(); await page.mouse.up();

View file

@ -22,7 +22,7 @@ test(
await page.getByTestId("side_nav_options_all-templates").click(); await page.getByTestId("side_nav_options_all-templates").click();
await page.getByRole("heading", { name: "Basic Prompting" }).click(); await page.getByRole("heading", { name: "Basic Prompting" }).click();
await expect(page.getByTestId(/.*rf__node.*/).first()).toBeVisible({ await expect(page.getByTestId(/.*rf__node.*/).first()).toBeVisible({
timeout: 1000, timeout: 3000,
}); });
let outdatedComponents = await page let outdatedComponents = await page
.getByTestId("icon-AlertTriangle") .getByTestId("icon-AlertTriangle")

View file

@ -82,7 +82,7 @@ test(
await urlOutput.hover(); await urlOutput.hover();
await page.mouse.down(); await page.mouse.down();
const splitTextInputData = await page.getByTestId( const splitTextInputData = await page.getByTestId(
"handle-splittext-shownode-data inputs-left", "handle-splittext-shownode-input documents-left",
); );
await splitTextInputData.hover(); await splitTextInputData.hover();
await page.mouse.up(); await page.mouse.up();

View file

@ -17,6 +17,8 @@ test(
await page.getByTestId("search-input-template").fill("Document"); await page.getByTestId("search-input-template").fill("Document");
await page.waitForTimeout(1000);
expect( expect(
page.getByTestId("template_basic-prompting-(hello,-world)"), page.getByTestId("template_basic-prompting-(hello,-world)"),
).toBeVisible({ visible: false, timeout: 3000 }); ).toBeVisible({ visible: false, timeout: 3000 });

View file

@ -23,9 +23,6 @@ test(
await page.getByTestId("sidebar-custom-component-button").click(); await page.getByTestId("sidebar-custom-component-button").click();
await page.getByTitle("fit view").click(); await page.getByTitle("fit view").click();
await page.getByTitle("zoom out").click();
await page.getByTitle("zoom out").click();
await page.getByTitle("zoom out").click();
await page.getByTestId("sidebar-search-input").click(); await page.getByTestId("sidebar-search-input").click();
await page.getByTestId("sidebar-search-input").fill("chat output"); await page.getByTestId("sidebar-search-input").fill("chat output");
@ -36,7 +33,9 @@ test(
await page await page
.getByTestId("outputsChat Output") .getByTestId("outputsChat Output")
.dragTo(page.locator('//*[@id="react-flow-id"]')); .dragTo(page.locator('//*[@id="react-flow-id"]'), {
targetPosition: { x: 400, y: 400 },
});
await adjustScreenView(page); await adjustScreenView(page);