Fix: Text split issues related to separator (#6993)

* fixes text split issues related to separator

* [autofix.ci] apply automated fixes

* [autofix.ci] apply automated fixes (attempt 2/3)

* format error fix

* Update Vector Store RAG.json

* [autofix.ci] apply automated fixes

* 📝 (freeze.spec.ts): update test description to match the actual element being tested for better clarity and maintainability

*  (stop-building.spec.ts): update test description to improve clarity and maintainability
 (stop-button-playground.spec.ts): add wait time before filling search input to ensure proper loading and interaction with the element

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Edwin Jose <edwin.jose@datastax.com>
Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org>
Co-authored-by: cristhianzl <cristhian.lousa@gmail.com>
Co-authored-by: Ítalo Johnny <italojohnnydosanjos@gmail.com>
This commit is contained in:
Rodrigo Nader 2025-03-13 14:51:34 -03:00 committed by GitHub
commit c1a972c533
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 75 additions and 17 deletions

View file

@ -1,7 +1,7 @@
from langchain_text_splitters import CharacterTextSplitter
from langflow.custom import Component
from langflow.io import HandleInput, IntInput, MessageTextInput, Output
from langflow.io import DropdownInput, HandleInput, IntInput, MessageTextInput, Output
from langflow.schema import Data, DataFrame
from langflow.utils.util import unescape_string
@ -15,8 +15,8 @@ class SplitTextComponent(Component):
inputs = [
HandleInput(
name="data_inputs",
display_name="Input Documents",
info="The data to split.",
display_name="Data or DataFrame",
info="The data with texts to split in chunks.",
input_types=["Data", "DataFrame"],
required=True,
),
@ -29,13 +29,20 @@ class SplitTextComponent(Component):
IntInput(
name="chunk_size",
display_name="Chunk Size",
info="The maximum number of characters in each chunk.",
info=(
"The maximum length of each chunk. Text is first split by separator, "
"then chunks are merged up to this size. "
"Individual splits larger than this won't be further divided."
),
value=1000,
),
MessageTextInput(
name="separator",
display_name="Separator",
info="The character to split on. Defaults to newline.",
info=(
"The character to split on. Use \\n for newline. "
"Examples: \\n\\n for paragraphs, \\n for lines, . for sentences"
),
value="\n",
),
MessageTextInput(
@ -45,6 +52,14 @@ class SplitTextComponent(Component):
value="text",
advanced=True,
),
DropdownInput(
name="keep_separator",
display_name="Keep Separator",
info="Whether to keep the separator in the output chunks and where to place it.",
options=["False", "True", "Start", "End"],
value="False",
advanced=True,
),
]
outputs = [
@ -55,12 +70,18 @@ class SplitTextComponent(Component):
def _docs_to_data(self, docs) -> list[Data]:
return [Data(text=doc.page_content, data=doc.metadata) for doc in docs]
def _docs_to_dataframe(self, docs):
data_dicts = [{self.text_key: doc.page_content, **doc.metadata} for doc in docs]
return DataFrame(data_dicts)
def _fix_separator(self, separator: str) -> str:
"""Fix common separator issues and convert to proper format."""
if separator == "/n":
return "\n"
if separator == "/t":
return "\t"
return separator
def split_text_base(self):
separator = unescape_string(self.separator)
separator = self._fix_separator(self.separator)
separator = unescape_string(separator)
if isinstance(self.data_inputs, DataFrame):
if not len(self.data_inputs):
msg = "DataFrame is empty"
@ -91,10 +112,20 @@ class SplitTextComponent(Component):
msg = f"Invalid input type in collection: {e}"
raise TypeError(msg) from e
try:
# Convert string 'False'/'True' to boolean
keep_sep = self.keep_separator
if isinstance(keep_sep, str):
if keep_sep.lower() == "false":
keep_sep = False
elif keep_sep.lower() == "true":
keep_sep = True
# 'start' and 'end' are kept as strings
splitter = CharacterTextSplitter(
chunk_overlap=self.chunk_overlap,
chunk_size=self.chunk_size,
separator=separator,
keep_separator=keep_sep,
)
return splitter.split_documents(documents)
except Exception as e:
@ -105,4 +136,4 @@ class SplitTextComponent(Component):
return self._docs_to_data(self.split_text_base())
def as_dataframe(self) -> DataFrame:
return self._docs_to_dataframe(self.split_text_base())
return DataFrame(self.split_text())

File diff suppressed because one or more lines are too long

View file

@ -119,7 +119,7 @@ test(
await urlOutput.hover();
await page.mouse.down();
const splitTextInputData = await page.getByTestId(
"handle-splittext-shownode-input documents-left",
"handle-splittext-shownode-data or dataframe-left",
);
await splitTextInputData.hover();
await page.mouse.up();

View file

@ -82,7 +82,7 @@ test(
await urlOutput.hover();
await page.mouse.down();
const splitTextInputData = await page.getByTestId(
"handle-splittext-shownode-input documents-left",
"handle-splittext-shownode-data or dataframe-left",
);
await splitTextInputData.hover();
await page.mouse.up();

View file

@ -25,7 +25,9 @@ test(
await page.getByTitle("fit view").click();
await page.getByTestId("sidebar-search-input").click();
await page.waitForTimeout(500);
await page.getByTestId("sidebar-search-input").fill("chat output");
await page.waitForTimeout(500);
await page.waitForSelector('[data-testid="outputsChat Output"]', {
timeout: 3000,