from langchain.docstore.document import Document
from langchain_experimental.text_splitter import SemanticChunker

from langflow.base.textsplitters.model import LCTextSplitterComponent
from langflow.io import (
    DropdownInput,
    FloatInput,
    HandleInput,
    IntInput,
    MessageTextInput,
    Output,
)
from langflow.schema import Data


class SemanticTextSplitterComponent(LCTextSplitterComponent):
    """Split text into semantically meaningful chunks using semantic similarity."""

    display_name: str = "Semantic Text Splitter"
    name: str = "SemanticTextSplitter"
    description: str = "Split text into semantically meaningful chunks using semantic similarity."
    documentation = "https://python.langchain.com/docs/how_to/semantic-chunker/"
    beta = True  # this component is beta because it is imported from langchain_experimental

    inputs = [
        HandleInput(
            name="data_inputs",
            display_name="Data Inputs",
            info="List of Data objects containing text and metadata to split.",
            input_types=["Data"],
            is_list=True,
        ),
        HandleInput(
            name="embeddings",
            display_name="Embeddings",
            info="Embeddings model to use for semantic similarity. Required.",
            input_types=["Embeddings"],
            is_list=False,
        ),
        DropdownInput(
            name="breakpoint_threshold_type",
            display_name="Breakpoint Threshold Type",
            info=(
                "Method to determine breakpoints. Options: 'percentile', "
                "'standard_deviation', 'interquartile'. Defaults to 'percentile'."
            ),
            value="percentile",
            options=["percentile", "standard_deviation", "interquartile"],
        ),
        FloatInput(
            name="breakpoint_threshold_amount",
            display_name="Breakpoint Threshold Amount",
            info="Numerical amount for the breakpoint threshold.",
            value=0.5,
        ),
        IntInput(
            name="number_of_chunks",
            display_name="Number of Chunks",
            info="Number of chunks to split the text into.",
            value=5,
        ),
        MessageTextInput(
            name="sentence_split_regex",
            display_name="Sentence Split Regex",
            info="Regular expression to split sentences. Optional.",
            value="",
            advanced=True,
        ),
        IntInput(
            name="buffer_size",
            display_name="Buffer Size",
            info="Size of the buffer.",
            value=0,
            advanced=True,
        ),
    ]

    outputs = [
        Output(display_name="Chunks", name="chunks", method="split_text"),
    ]

    def _docs_to_data(self, docs: list[Document]) -> list[Data]:
        """Convert a list of Document objects to Data objects."""
        return [Data(text=doc.page_content, data=doc.metadata) for doc in docs]

    def split_text(self) -> list[Data]:
        """Split the input data into semantically meaningful chunks.

        Returns:
            list[Data]: One Data object per semantic chunk, carrying the chunk
                text and the originating document's metadata.

        Raises:
            ValueError: If no embeddings model is provided, or data_inputs is
                empty or yields no documents.
            TypeError: If any item in data_inputs is not a Data object.
            RuntimeError: If the underlying SemanticChunker fails while
                splitting the documents.
        """
        # Validate inputs *outside* the try block so deliberate validation
        # errors keep their own exception types and messages instead of being
        # re-wrapped as a generic "error during semantic splitting" RuntimeError.
        embeddings = getattr(self, "embeddings", None)
        if embeddings is None:
            error_msg = "An embeddings model is required for SemanticTextSplitter."
            raise ValueError(error_msg)

        if not self.data_inputs:
            error_msg = "Data inputs cannot be empty."
            raise ValueError(error_msg)

        documents = []
        for _input in self.data_inputs:
            if not isinstance(_input, Data):
                error_msg = f"Invalid data input type: {_input}"
                raise TypeError(error_msg)
            documents.append(_input.to_lc_document())

        if not documents:
            error_msg = "No valid Data objects found in data_inputs."
            raise ValueError(error_msg)

        texts = [doc.page_content for doc in documents]
        metadatas = [doc.metadata for doc in documents]

        splitter_params = {
            "embeddings": embeddings,
            "breakpoint_threshold_type": self.breakpoint_threshold_type or "percentile",
            "breakpoint_threshold_amount": self.breakpoint_threshold_amount,
            "number_of_chunks": self.number_of_chunks,
            "buffer_size": self.buffer_size,
        }

        # Only forward the regex when the user supplied one, so SemanticChunker's
        # own default sentence splitting stays in effect otherwise.
        if self.sentence_split_regex:
            splitter_params["sentence_split_regex"] = self.sentence_split_regex

        # Keep the try minimal: only the actual chunking can legitimately fail
        # in ways we want surfaced as a splitting RuntimeError.
        try:
            splitter = SemanticChunker(**splitter_params)
            docs = splitter.create_documents(texts, metadatas=metadatas)
        except Exception as e:
            error_msg = f"An error occurred during semantic splitting: {e}"
            raise RuntimeError(error_msg) from e

        data = self._docs_to_data(docs)
        self.status = data
        return data