feat: add support for accepting a DataFrame as input to Split Text, and add relevant tests (#6302)

* update to support dataframe * [autofix.ci] apply automated fixes * [autofix.ci] apply automated fixes (attempt 2/3) * Update split_text.py * [autofix.ci] apply automated fixes * [autofix.ci] apply automated fixes * update names * Update src/backend/base/langflow/schema/dataframe.py Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org> * [autofix.ci] apply automated fixes * update to template * update review changes * Update Vector Store RAG.json * fix lint errors * fix tests * 📝 (freeze.spec.ts): update test description to match the actual element being tested for better clarity and accuracy * ✨ (stop-button-playground.spec.ts): improve test reliability by specifying target position for drag action to prevent flakiness * ✅ (logs.spec.ts): increase timeout from 1000ms to 3000ms for better test reliability ✅ (stop-building.spec.ts): update test selector from "handle-splittext-shownode-data inputs-left" to "handle-splittext-shownode-input documents-left" for accurate testing ✅ (starter-projects.spec.ts): add a 1000ms timeout before asserting visibility of an element for better test stability --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org> Co-authored-by: cristhianzl <cristhian.lousa@gmail.com>
2025-02-19 15:40:56 -05:00 · 2025-02-19 15:40:56 -05:00 · e8529eaecb
commit e8529eaecb
parent b43bf8f783
11 changed files with 1184 additions and 924 deletions
--- a/src/backend/base/langflow/components/processing/split_text.py
+++ b/src/backend/base/langflow/components/processing/split_text.py
@ -15,10 +15,9 @@ class SplitTextComponent(Component):
    inputs = [
        HandleInput(
            name="data_inputs",
-            display_name="Data Inputs",
+            display_name="Input Documents",
            info="The data to split.",
-            input_types=["Data"],
-            is_list=True,
+            input_types=["Data", "DataFrame"],
            required=True,
        ),
        IntInput(
@ -39,6 +38,13 @@ class SplitTextComponent(Component):
            info="The character to split on. Defaults to newline.",
            value="\n",
        ),
+        MessageTextInput(
+            name="text_key",
+            display_name="Text Key",
+            info="The key to use for the text column.",
+            value="text",
+            advanced=True,
+        ),
    ]

    outputs = [
@ -46,23 +52,57 @@ class SplitTextComponent(Component):
        Output(display_name="DataFrame", name="dataframe", method="as_dataframe"),
    ]

-    def _docs_to_data(self, docs):
+    def _docs_to_data(self, docs) -> list[Data]:
+        """Wrap each document in a Data object (page_content as text, metadata as data)."""
        return [Data(text=doc.page_content, data=doc.metadata) for doc in docs]

-    def split_text(self) -> list[Data]:
+    def _docs_to_dataframe(self, docs):
+        """Build a DataFrame with one row per document.
+
+        Each row stores the document's page content under ``self.text_key`` and
+        spreads the document's metadata entries into the same row.
+        """
+        rows = []
+        for document in docs:
+            row = {self.text_key: document.page_content}
+            row.update(document.metadata)
+            rows.append(row)
+        return DataFrame(rows)
+
+    def split_text_base(self):
+        """Convert ``self.data_inputs`` to documents and split them into chunks.
+
+        ``self.data_inputs`` may be a DataFrame, a single Data object, or an
+        iterable containing Data objects.
+
+        Returns:
+            The result of ``CharacterTextSplitter.split_documents`` for the input.
+
+        Raises:
+            TypeError: If the input is empty, contains no Data items, cannot be
+                converted to documents, or cannot be split.
+        """
        separator = unescape_string(self.separator)
+        if isinstance(self.data_inputs, DataFrame):
+            if not len(self.data_inputs):
+                msg = "DataFrame is empty"
+                raise TypeError(msg)

-        documents = [_input.to_lc_document() for _input in self.data_inputs if isinstance(_input, Data)]
+            # Tell the DataFrame which column holds the text before converting its rows.
+            self.data_inputs.text_key = self.text_key
+            try:
+                documents = self.data_inputs.to_lc_documents()
+            except Exception as e:
+                msg = f"Error converting DataFrame to documents: {e}"
+                raise TypeError(msg) from e
+        else:
+            if not self.data_inputs:
+                msg = "No data inputs provided"
+                raise TypeError(msg)

-        splitter = CharacterTextSplitter(
-            chunk_overlap=self.chunk_overlap,
-            chunk_size=self.chunk_size,
-            separator=separator,
-        )
-        docs = splitter.split_documents(documents)
-        data = self._docs_to_data(docs)
-        self.status = data
-        return data
+            documents = []
+            if isinstance(self.data_inputs, Data):
+                self.data_inputs.text_key = self.text_key
+                documents = [self.data_inputs.to_lc_document()]
+            else:
+                try:
+                    # Items that are not Data instances are skipped by the filter below.
+                    # NOTE(review): unlike the single-Data branch, text_key is not applied
+                    # to the items of a collection - confirm this is intended.
+                    documents = [input_.to_lc_document() for input_ in self.data_inputs if isinstance(input_, Data)]
+                    if not documents:
+                        msg = f"No valid Data inputs found in {type(self.data_inputs)}"
+                        raise TypeError(msg)
+                except AttributeError as e:
+                    msg = f"Invalid input type in collection: {e}"
+                    raise TypeError(msg) from e
+        try:
+            splitter = CharacterTextSplitter(
+                chunk_overlap=self.chunk_overlap,
+                chunk_size=self.chunk_size,
+                separator=separator,
+            )
+            return splitter.split_documents(documents)
+        except Exception as e:
+            # Any failure while building the splitter or splitting is re-raised as TypeError.
+            msg = f"Error splitting text: {e}"
+            raise TypeError(msg) from e
+
+    def split_text(self) -> list[Data]:
+        """Split the input and return the resulting chunks as Data objects."""
+        chunks = self.split_text_base()
+        return self._docs_to_data(chunks)

    def as_dataframe(self) -> DataFrame:
+        """Split the input and return the resulting chunks as a DataFrame."""
-        return DataFrame(self.split_text())
+        return self._docs_to_dataframe(self.split_text_base())
--- a/src/backend/base/langflow/initial_setup/starter_projects/Vector
+++ b/src/backend/base/langflow/initial_setup/starter_projects/Vector
--- a/src/backend/base/langflow/schema/dataframe.py
+++ b/src/backend/base/langflow/schema/dataframe.py
@ -1,6 +1,7 @@
 from typing import cast

 import pandas as pd
+from langchain_core.documents import Document
 from pandas import DataFrame as pandas_DataFrame

 from langflow.schema.data import Data
@ -32,9 +33,21 @@ class DataFrame(pandas_DataFrame):
        >>> dataset = DataFrame({"name": ["John", "Jane"], "age": [30, 25]})
    """

-    def __init__(self, data: list[dict] | list[Data] | pd.DataFrame | None = None, **kwargs):
+    def __init__(
+        self,
+        data: list[dict] | list[Data] | pd.DataFrame | None = None,
+        text_key: str = "text",
+        default_value: str = "",
+        **kwargs,
+    ):
+        # Initialize pandas DataFrame first without data
+        super().__init__(**kwargs)  # Removed data parameter
+
+        # Store attributes as private members to avoid conflicts with pandas
+        self._text_key = text_key
+        self._default_value = default_value
+
        if data is None:
-            super().__init__(**kwargs)
            return

        if isinstance(data, list):
@ -43,15 +56,36 @@ class DataFrame(pandas_DataFrame):
            elif not all(isinstance(x, dict) for x in data):
                msg = "List items must be either all Data objects or all dictionaries"
                raise ValueError(msg)
-            kwargs["data"] = data
-        elif isinstance(data, dict | pd.DataFrame):
-            kwargs["data"] = data
+            self._update(data, **kwargs)
+        elif isinstance(data, dict | pd.DataFrame):  # Fixed type check syntax
+            self._update(data, **kwargs)

-        super().__init__(**kwargs)
+    def _update(self, data, **kwargs):
+        """Build a plain pandas DataFrame from ``data`` and load it into this frame.
+
+        Extra keyword arguments are forwarded to the ``pd.DataFrame`` constructor;
+        the result is handed to ``_update_inplace``.
+        """
+        self._update_inplace(pd.DataFrame(data, **kwargs))
+
+    # Accessors for the privately stored text-key and default-value settings
+    @property
+    def text_key(self) -> str:
+        """Name of the column that ``to_lc_documents`` uses as the document text."""
+        return self._text_key
+
+    @text_key.setter
+    def text_key(self, value: str) -> None:
+        # Stored on a private attribute; the constructor does the same.
+        self._text_key = value
+
+    @property
+    def default_value(self) -> str:
+        """Fallback text that ``to_lc_documents`` uses when a row has no text-key entry."""
+        return self._default_value
+
+    @default_value.setter
+    def default_value(self, value: str) -> None:
+        # Stored on a private attribute; the constructor does the same.
+        self._default_value = value

    def to_data_list(self) -> list[Data]:
        """Converts the DataFrame back to a list of Data objects."""
        list_of_dicts = self.to_dict(orient="records")
+        # NOTE(review): suggested alternative, not applied: [Data(**row) for row in list_of_dicts]
        return [Data(data=row) for row in list_of_dicts]

    def add_row(self, data: dict | Data) -> "DataFrame":
@ -103,3 +137,31 @@ class DataFrame(pandas_DataFrame):
        Returns True if the DataFrame has at least one row, False otherwise.
        """
        return not self.empty
+
+    def to_lc_documents(self) -> list[Document]:
+        """Converts every row of the DataFrame to a Document.
+
+        The value stored under the configured text key becomes the page content.
+        If the row has no such key the default value is used, and a value that
+        is not already a string is passed through ``str``. The remaining
+        entries of the row become the document metadata.
+
+        Returns:
+            list[Document]: One Document per row.
+        """
+        documents = []
+        for record in self.to_dict(orient="records"):
+            metadata = dict(record)
+            content = metadata.pop(self._text_key, self._default_value)
+            if not isinstance(content, str):
+                content = str(content)
+            documents.append(Document(page_content=content, metadata=metadata))
+        return documents
+
+    def _docs_to_dataframe(self, docs):
+        """Converts a list of Documents to a DataFrame.
+
+        The constructor rejects lists whose items are neither Data objects nor
+        dictionaries, so each Document is first flattened into a row
+        dictionary: its page content is stored under this frame's text key and
+        its metadata entries become the remaining columns.
+
+        Args:
+            docs: List of Document objects
+
+        Returns:
+            DataFrame: A new DataFrame with one row per Document, carrying over
+                this frame's text key and default value
+        """
+        rows = [{self._text_key: doc.page_content, **doc.metadata} for doc in docs]
+        return DataFrame(rows, text_key=self._text_key, default_value=self._default_value)