Fix: Text split issues related to separator (#6993)
* fixes text split issues related to separator * [autofix.ci] apply automated fixes * [autofix.ci] apply automated fixes (attempt 2/3) * format error fix * Update Vector Store RAG.json * [autofix.ci] apply automated fixes * 📝 (freeze.spec.ts): update test description to match the actual element being tested for better clarity and maintainability * ✅ (stop-building.spec.ts): update test description to improve clarity and maintainability ✅ (stop-button-playground.spec.ts): add wait time before filling search input to ensure proper loading and interaction with the element --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Edwin Jose <edwin.jose@datastax.com> Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org> Co-authored-by: cristhianzl <cristhian.lousa@gmail.com> Co-authored-by: Ítalo Johnny <italojohnnydosanjos@gmail.com>
This commit is contained in:
parent
d59f420812
commit
c1a972c533
5 changed files with 75 additions and 17 deletions
|
|
@ -1,7 +1,7 @@
|
|||
from langchain_text_splitters import CharacterTextSplitter
|
||||
|
||||
from langflow.custom import Component
|
||||
from langflow.io import HandleInput, IntInput, MessageTextInput, Output
|
||||
from langflow.io import DropdownInput, HandleInput, IntInput, MessageTextInput, Output
|
||||
from langflow.schema import Data, DataFrame
|
||||
from langflow.utils.util import unescape_string
|
||||
|
||||
|
|
@ -15,8 +15,8 @@ class SplitTextComponent(Component):
|
|||
inputs = [
|
||||
HandleInput(
|
||||
name="data_inputs",
|
||||
display_name="Input Documents",
|
||||
info="The data to split.",
|
||||
display_name="Data or DataFrame",
|
||||
info="The data with texts to split in chunks.",
|
||||
input_types=["Data", "DataFrame"],
|
||||
required=True,
|
||||
),
|
||||
|
|
@ -29,13 +29,20 @@ class SplitTextComponent(Component):
|
|||
IntInput(
|
||||
name="chunk_size",
|
||||
display_name="Chunk Size",
|
||||
info="The maximum number of characters in each chunk.",
|
||||
info=(
|
||||
"The maximum length of each chunk. Text is first split by separator, "
|
||||
"then chunks are merged up to this size. "
|
||||
"Individual splits larger than this won't be further divided."
|
||||
),
|
||||
value=1000,
|
||||
),
|
||||
MessageTextInput(
|
||||
name="separator",
|
||||
display_name="Separator",
|
||||
info="The character to split on. Defaults to newline.",
|
||||
info=(
|
||||
"The character to split on. Use \\n for newline. "
|
||||
"Examples: \\n\\n for paragraphs, \\n for lines, . for sentences"
|
||||
),
|
||||
value="\n",
|
||||
),
|
||||
MessageTextInput(
|
||||
|
|
@ -45,6 +52,14 @@ class SplitTextComponent(Component):
|
|||
value="text",
|
||||
advanced=True,
|
||||
),
|
||||
DropdownInput(
|
||||
name="keep_separator",
|
||||
display_name="Keep Separator",
|
||||
info="Whether to keep the separator in the output chunks and where to place it.",
|
||||
options=["False", "True", "Start", "End"],
|
||||
value="False",
|
||||
advanced=True,
|
||||
),
|
||||
]
|
||||
|
||||
outputs = [
|
||||
|
|
@ -55,12 +70,18 @@ class SplitTextComponent(Component):
|
|||
def _docs_to_data(self, docs) -> list[Data]:
    """Wrap each split document in a ``Data`` object.

    Args:
        docs: Iterable of objects exposing ``page_content`` and ``metadata``.

    Returns:
        One ``Data`` per document, in input order, with the document's
        ``page_content`` passed as ``text`` and its ``metadata`` passed
        as ``data``.
    """
    converted = []
    for document in docs:
        converted.append(Data(text=document.page_content, data=document.metadata))
    return converted
|
||||
|
||||
def _docs_to_dataframe(self, docs):
    """Build a ``DataFrame`` with one row per split document.

    Each row maps ``self.text_key`` to the document's ``page_content`` and
    then spreads the document's ``metadata`` entries into the same dict, so
    a metadata key equal to ``self.text_key`` overrides the text entry.

    Args:
        docs: Iterable of objects exposing ``page_content`` and ``metadata``.

    Returns:
        A ``DataFrame`` constructed from the list of row dicts.
    """
    rows = []
    for document in docs:
        # Text first, metadata second: later keys win on collision.
        row = {self.text_key: document.page_content, **document.metadata}
        rows.append(row)
    return DataFrame(rows)
|
||||
def _fix_separator(self, separator: str) -> str:
|
||||
"""Fix common separator issues and convert to proper format."""
|
||||
if separator == "/n":
|
||||
return "\n"
|
||||
if separator == "/t":
|
||||
return "\t"
|
||||
return separator
|
||||
|
||||
def split_text_base(self):
|
||||
separator = unescape_string(self.separator)
|
||||
separator = self._fix_separator(self.separator)
|
||||
separator = unescape_string(separator)
|
||||
|
||||
if isinstance(self.data_inputs, DataFrame):
|
||||
if not len(self.data_inputs):
|
||||
msg = "DataFrame is empty"
|
||||
|
|
@ -91,10 +112,20 @@ class SplitTextComponent(Component):
|
|||
msg = f"Invalid input type in collection: {e}"
|
||||
raise TypeError(msg) from e
|
||||
try:
|
||||
# Convert string 'False'/'True' to boolean
|
||||
keep_sep = self.keep_separator
|
||||
if isinstance(keep_sep, str):
|
||||
if keep_sep.lower() == "false":
|
||||
keep_sep = False
|
||||
elif keep_sep.lower() == "true":
|
||||
keep_sep = True
|
||||
# 'start' and 'end' are kept as strings
|
||||
|
||||
splitter = CharacterTextSplitter(
|
||||
chunk_overlap=self.chunk_overlap,
|
||||
chunk_size=self.chunk_size,
|
||||
separator=separator,
|
||||
keep_separator=keep_sep,
|
||||
)
|
||||
return splitter.split_documents(documents)
|
||||
except Exception as e:
|
||||
|
|
@ -105,4 +136,4 @@ class SplitTextComponent(Component):
|
|||
return self._docs_to_data(self.split_text_base())
|
||||
|
||||
def as_dataframe(self) -> DataFrame:
|
||||
return self._docs_to_dataframe(self.split_text_base())
|
||||
return DataFrame(self.split_text())
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
|
|
@ -119,7 +119,7 @@ test(
|
|||
await urlOutput.hover();
|
||||
await page.mouse.down();
|
||||
const splitTextInputData = await page.getByTestId(
|
||||
"handle-splittext-shownode-input documents-left",
|
||||
"handle-splittext-shownode-data or dataframe-left",
|
||||
);
|
||||
await splitTextInputData.hover();
|
||||
await page.mouse.up();
|
||||
|
|
|
|||
|
|
@ -82,7 +82,7 @@ test(
|
|||
await urlOutput.hover();
|
||||
await page.mouse.down();
|
||||
const splitTextInputData = await page.getByTestId(
|
||||
"handle-splittext-shownode-input documents-left",
|
||||
"handle-splittext-shownode-data or dataframe-left",
|
||||
);
|
||||
await splitTextInputData.hover();
|
||||
await page.mouse.up();
|
||||
|
|
|
|||
|
|
@ -25,7 +25,9 @@ test(
|
|||
await page.getByTitle("fit view").click();
|
||||
|
||||
await page.getByTestId("sidebar-search-input").click();
|
||||
await page.waitForTimeout(500);
|
||||
await page.getByTestId("sidebar-search-input").fill("chat output");
|
||||
await page.waitForTimeout(500);
|
||||
|
||||
await page.waitForSelector('[data-testid="outputsChat Output"]', {
|
||||
timeout: 3000,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue