feat: add support for accepting a DataFrame as input to Split Text, and add relevant tests (#6302)
* update to support dataframe * [autofix.ci] apply automated fixes * [autofix.ci] apply automated fixes (attempt 2/3) * Update split_text.py * [autofix.ci] apply automated fixes * [autofix.ci] apply automated fixes * update names * Update src/backend/base/langflow/schema/dataframe.py Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org> * [autofix.ci] apply automated fixes * update to template * update review changes * Update Vector Store RAG.json * fix lint errors * fix tests * 📝 (freeze.spec.ts): update test description to match the actual element being tested for better clarity and accuracy * ✨ (stop-button-playground.spec.ts): improve test reliability by specifying target position for drag action to prevent flakiness * ✅ (logs.spec.ts): increase timeout from 1000ms to 3000ms for better test reliability ✅ (stop-building.spec.ts): update test selector from "handle-splittext-shownode-data inputs-left" to "handle-splittext-shownode-input documents-left" for accurate testing ✅ (starter-projects.spec.ts): add a 1000ms timeout before asserting visibility of an element for better test stability --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org> Co-authored-by: cristhianzl <cristhian.lousa@gmail.com>
This commit is contained in:
parent
b43bf8f783
commit
e8529eaecb
11 changed files with 1184 additions and 924 deletions
|
|
@ -15,10 +15,9 @@ class SplitTextComponent(Component):
|
|||
inputs = [
|
||||
HandleInput(
|
||||
name="data_inputs",
|
||||
display_name="Data Inputs",
|
||||
display_name="Input Documents",
|
||||
info="The data to split.",
|
||||
input_types=["Data"],
|
||||
is_list=True,
|
||||
input_types=["Data", "DataFrame"],
|
||||
required=True,
|
||||
),
|
||||
IntInput(
|
||||
|
|
@ -39,6 +38,13 @@ class SplitTextComponent(Component):
|
|||
info="The character to split on. Defaults to newline.",
|
||||
value="\n",
|
||||
),
|
||||
MessageTextInput(
|
||||
name="text_key",
|
||||
display_name="Text Key",
|
||||
info="The key to use for the text column.",
|
||||
value="text",
|
||||
advanced=True,
|
||||
),
|
||||
]
|
||||
|
||||
outputs = [
|
||||
|
|
@ -46,23 +52,57 @@ class SplitTextComponent(Component):
|
|||
Output(display_name="DataFrame", name="dataframe", method="as_dataframe"),
|
||||
]
|
||||
|
||||
def _docs_to_data(self, docs):
|
||||
def _docs_to_data(self, docs) -> list[Data]:
    """Wrap each split document in a ``Data`` object.

    Args:
        docs: Iterable of documents exposing ``page_content`` and ``metadata``.

    Returns:
        list[Data]: One ``Data`` per document, carrying the document text
        as ``text`` and the document metadata as ``data``.
    """
    return [
        Data(text=document.page_content, data=document.metadata)
        for document in docs
    ]
|
||||
|
||||
def split_text(self) -> list[Data]:
|
||||
def _docs_to_dataframe(self, docs):
    """Build a ``DataFrame`` with one row per split document.

    Each row stores the document text under the column named by
    ``self.text_key`` and adds one column per metadata entry. Metadata is
    merged after the text, so a metadata entry that shares the text key's
    name overrides the document text.

    Args:
        docs: Iterable of documents exposing ``page_content`` and ``metadata``.

    Returns:
        DataFrame: The rows assembled from ``docs``.
    """
    rows = []
    for document in docs:
        row = {self.text_key: document.page_content}
        row.update(document.metadata)
        rows.append(row)
    return DataFrame(rows)
|
||||
|
||||
def split_text_base(self):
|
||||
separator = unescape_string(self.separator)
|
||||
if isinstance(self.data_inputs, DataFrame):
|
||||
if not len(self.data_inputs):
|
||||
msg = "DataFrame is empty"
|
||||
raise TypeError(msg)
|
||||
|
||||
documents = [_input.to_lc_document() for _input in self.data_inputs if isinstance(_input, Data)]
|
||||
self.data_inputs.text_key = self.text_key
|
||||
try:
|
||||
documents = self.data_inputs.to_lc_documents()
|
||||
except Exception as e:
|
||||
msg = f"Error converting DataFrame to documents: {e}"
|
||||
raise TypeError(msg) from e
|
||||
else:
|
||||
if not self.data_inputs:
|
||||
msg = "No data inputs provided"
|
||||
raise TypeError(msg)
|
||||
|
||||
splitter = CharacterTextSplitter(
|
||||
chunk_overlap=self.chunk_overlap,
|
||||
chunk_size=self.chunk_size,
|
||||
separator=separator,
|
||||
)
|
||||
docs = splitter.split_documents(documents)
|
||||
data = self._docs_to_data(docs)
|
||||
self.status = data
|
||||
return data
|
||||
documents = []
|
||||
if isinstance(self.data_inputs, Data):
|
||||
self.data_inputs.text_key = self.text_key
|
||||
documents = [self.data_inputs.to_lc_document()]
|
||||
else:
|
||||
try:
|
||||
documents = [input_.to_lc_document() for input_ in self.data_inputs if isinstance(input_, Data)]
|
||||
if not documents:
|
||||
msg = f"No valid Data inputs found in {type(self.data_inputs)}"
|
||||
raise TypeError(msg)
|
||||
except AttributeError as e:
|
||||
msg = f"Invalid input type in collection: {e}"
|
||||
raise TypeError(msg) from e
|
||||
try:
|
||||
splitter = CharacterTextSplitter(
|
||||
chunk_overlap=self.chunk_overlap,
|
||||
chunk_size=self.chunk_size,
|
||||
separator=separator,
|
||||
)
|
||||
return splitter.split_documents(documents)
|
||||
except Exception as e:
|
||||
msg = f"Error splitting text: {e}"
|
||||
raise TypeError(msg) from e
|
||||
|
||||
def split_text(self) -> list[Data]:
    """Split the component input and return the chunks as ``Data`` objects.

    Delegates the splitting to ``split_text_base`` and converts the
    resulting documents with ``_docs_to_data``.
    """
    chunks = self.split_text_base()
    return self._docs_to_data(chunks)
|
||||
|
||||
def as_dataframe(self) -> DataFrame:
|
||||
return DataFrame(self.split_text())
|
||||
return self._docs_to_dataframe(self.split_text_base())
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
|
|
@ -1,6 +1,7 @@
|
|||
from typing import cast
|
||||
|
||||
import pandas as pd
|
||||
from langchain_core.documents import Document
|
||||
from pandas import DataFrame as pandas_DataFrame
|
||||
|
||||
from langflow.schema.data import Data
|
||||
|
|
@ -32,9 +33,21 @@ class DataFrame(pandas_DataFrame):
|
|||
>>> dataset = DataFrame({"name": ["John", "Jane"], "age": [30, 25]})
|
||||
"""
|
||||
|
||||
def __init__(self, data: list[dict] | list[Data] | pd.DataFrame | None = None, **kwargs):
|
||||
def __init__(
|
||||
self,
|
||||
data: list[dict] | list[Data] | pd.DataFrame | None = None,
|
||||
text_key: str = "text",
|
||||
default_value: str = "",
|
||||
**kwargs,
|
||||
):
|
||||
# Initialize pandas DataFrame first without data
|
||||
super().__init__(**kwargs) # Removed data parameter
|
||||
|
||||
# Store attributes as private members to avoid conflicts with pandas
|
||||
self._text_key = text_key
|
||||
self._default_value = default_value
|
||||
|
||||
if data is None:
|
||||
super().__init__(**kwargs)
|
||||
return
|
||||
|
||||
if isinstance(data, list):
|
||||
|
|
@ -43,15 +56,36 @@ class DataFrame(pandas_DataFrame):
|
|||
elif not all(isinstance(x, dict) for x in data):
|
||||
msg = "List items must be either all Data objects or all dictionaries"
|
||||
raise ValueError(msg)
|
||||
kwargs["data"] = data
|
||||
elif isinstance(data, dict | pd.DataFrame):
|
||||
kwargs["data"] = data
|
||||
self._update(data, **kwargs)
|
||||
elif isinstance(data, dict | pd.DataFrame): # Fixed type check syntax
|
||||
self._update(data, **kwargs)
|
||||
|
||||
super().__init__(**kwargs)
|
||||
def _update(self, data, **kwargs):
    """Helper method to update DataFrame with new data.

    Builds a plain pandas DataFrame from ``data`` (forwarding ``kwargs`` to
    the pandas constructor) and hands it to ``_update_inplace`` so that this
    instance takes over its contents.
    """
    self._update_inplace(pd.DataFrame(data, **kwargs))
|
||||
|
||||
# Update property accessors
|
||||
@property
def text_key(self) -> str:
    """Name of the column whose value is used as document text."""
    return self._text_key


@text_key.setter
def text_key(self, value: str) -> None:
    """Change the column that is read as document text."""
    self._text_key = value
|
||||
|
||||
@property
def default_value(self) -> str:
    """Fallback text used when a row has no entry for the text key."""
    return self._default_value


@default_value.setter
def default_value(self, value: str) -> None:
    """Change the fallback text."""
    self._default_value = value
|
||||
|
||||
def to_data_list(self) -> list[Data]:
    """Convert every row of the DataFrame into a ``Data`` object.

    Returns:
        list[Data]: One ``Data`` per row, wrapping that row's
        column-name -> value mapping as its ``data``.
    """
    records = self.to_dict(orient="records")
    # suggested change: [Data(**record) for record in records]
    return [Data(data=record) for record in records]
|
||||
|
||||
def add_row(self, data: dict | Data) -> "DataFrame":
|
||||
|
|
@ -103,3 +137,31 @@ class DataFrame(pandas_DataFrame):
|
|||
Returns True if the DataFrame has at least one row, False otherwise.
|
||||
"""
|
||||
return not self.empty
|
||||
|
||||
def to_lc_documents(self) -> list[Document]:
    """Converts the DataFrame to a list of Documents.

    Each row becomes one Document: the value stored under the text key is
    used as ``page_content`` and every remaining column goes into
    ``metadata``.  When the text key is absent, or the cell holds a missing
    value (None / NaN / NA), the configured default value is used instead.

    Returns:
        list[Document]: The converted list of Documents.
    """
    documents = []
    # to_dict(orient="records") builds a fresh dict per row, so popping the
    # text key here does not touch the frame itself.
    for row in self.to_dict(orient="records"):
        text = row.pop(self._text_key, self._default_value)
        # A missing cell would otherwise be stringified to "None" or "nan".
        # The scalar check keeps list/array cells away from pd.isna, which
        # returns an array for them.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            text = self._default_value
        # str() leaves strings unchanged and coerces any other cell type,
        # which covers both branches of the previous isinstance check.
        documents.append(Document(page_content=str(text), metadata=row))
    return documents
|
||||
|
||||
def _docs_to_dataframe(self, docs):
    """Converts a list of Documents to a DataFrame.

    Each Document becomes one row: its ``page_content`` is stored under this
    frame's text key and every metadata entry becomes its own column.  The
    new frame inherits this frame's text key and default value.

    Args:
        docs: List of Document objects

    Returns:
        DataFrame: A new DataFrame with the converted Documents
    """
    # The constructor rejects list items that are neither Data objects nor
    # dicts, so Documents must be flattened into plain row dicts first.
    rows = [{self._text_key: doc.page_content, **doc.metadata} for doc in docs]
    return DataFrame(
        rows,
        text_key=self._text_key,
        default_value=self._default_value,
    )
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue