feat: Adding Semantic Text Splitter Component (Text Splitters) (#4254)
* feat: Adding Semantic Text Splitter Component (Text Splitters) * update: adding beta flag to component since it is imported from langchain_experimental * update: adding beta flag to component since it is imported from langchain_experimental formatted --------- Co-authored-by: Edwin Jose <edwin.jose@datastax.com>
This commit is contained in:
parent
fa2854131a
commit
357c5878a5
2 changed files with 137 additions and 0 deletions
|
|
@ -1,5 +1,6 @@
|
|||
from .character import CharacterTextSplitterComponent
|
||||
from .language_recursive import LanguageRecursiveTextSplitterComponent
|
||||
from .language_semantic import SemanticTextSplitterComponent
|
||||
from .natural_language import NaturalLanguageTextSplitterComponent
|
||||
from .recursive_character import RecursiveCharacterTextSplitterComponent
|
||||
|
||||
|
|
@ -8,4 +9,5 @@ __all__ = [
|
|||
"LanguageRecursiveTextSplitterComponent",
|
||||
"NaturalLanguageTextSplitterComponent",
|
||||
"RecursiveCharacterTextSplitterComponent",
|
||||
"SemanticTextSplitterComponent",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -0,0 +1,135 @@
|
|||
from langchain.docstore.document import Document
|
||||
from langchain_experimental.text_splitter import SemanticChunker
|
||||
|
||||
from langflow.base.textsplitters.model import LCTextSplitterComponent
|
||||
from langflow.io import (
|
||||
DropdownInput,
|
||||
FloatInput,
|
||||
HandleInput,
|
||||
IntInput,
|
||||
MessageTextInput,
|
||||
Output,
|
||||
)
|
||||
from langflow.schema import Data
|
||||
|
||||
|
||||
class SemanticTextSplitterComponent(LCTextSplitterComponent):
    """Langflow component that chunks incoming Data by semantic similarity.

    Thin wrapper around langchain_experimental's ``SemanticChunker``:
    sentences are embedded and split points are placed where the distance
    between neighbouring sentence embeddings crosses the configured
    breakpoint threshold.
    """

    display_name: str = "Semantic Text Splitter"
    name: str = "SemanticTextSplitter"
    description: str = "Split text into semantically meaningful chunks using semantic similarity."
    documentation = "https://python.langchain.com/docs/how_to/semantic-chunker/"
    # Flagged beta because SemanticChunker ships in langchain_experimental.
    beta = True

    inputs = [
        HandleInput(
            name="data_inputs",
            display_name="Data Inputs",
            info="List of Data objects containing text and metadata to split.",
            input_types=["Data"],
            is_list=True,
        ),
        HandleInput(
            name="embeddings",
            display_name="Embeddings",
            info="Embeddings model to use for semantic similarity. Required.",
            input_types=["Embeddings"],
            is_list=False,
        ),
        DropdownInput(
            name="breakpoint_threshold_type",
            display_name="Breakpoint Threshold Type",
            info=(
                "Method to determine breakpoints. Options: 'percentile', "
                "'standard_deviation', 'interquartile'. Defaults to 'percentile'."
            ),
            options=["percentile", "standard_deviation", "interquartile"],
            value="percentile",
        ),
        FloatInput(
            name="breakpoint_threshold_amount",
            display_name="Breakpoint Threshold Amount",
            info="Numerical amount for the breakpoint threshold.",
            value=0.5,
        ),
        IntInput(
            name="number_of_chunks",
            display_name="Number of Chunks",
            info="Number of chunks to split the text into.",
            value=5,
        ),
        MessageTextInput(
            name="sentence_split_regex",
            display_name="Sentence Split Regex",
            info="Regular expression to split sentences. Optional.",
            value="",
            advanced=True,
        ),
        IntInput(
            name="buffer_size",
            display_name="Buffer Size",
            info="Size of the buffer.",
            value=0,
            advanced=True,
        ),
    ]

    outputs = [
        Output(display_name="Chunks", name="chunks", method="split_text"),
    ]

    def _docs_to_data(self, docs: list[Document]) -> list[Data]:
        """Map LangChain Documents back onto Langflow Data objects."""
        return [Data(text=chunk.page_content, data=chunk.metadata) for chunk in docs]

    @staticmethod
    def _to_lc_document(item) -> Document:
        """Convert one raw input element to a Document; reject non-Data values."""
        if not isinstance(item, Data):
            error_msg = f"Invalid data input type: {item}"
            raise TypeError(error_msg)
        return item.to_lc_document()

    def split_text(self) -> list[Data]:
        """Run the semantic chunker over all inputs and return the resulting chunks.

        Validates that an embeddings model and at least one Data input were
        provided, builds a ``SemanticChunker`` from the component settings,
        and converts the produced Documents back to Data.

        Raises:
            RuntimeError: wrapping any validation or chunking failure.
        """
        try:
            embeddings = getattr(self, "embeddings", None)
            if embeddings is None:
                error_msg = "An embeddings model is required for SemanticTextSplitter."
                raise ValueError(error_msg)

            if not self.data_inputs:
                error_msg = "Data inputs cannot be empty."
                raise ValueError(error_msg)

            lc_documents = [self._to_lc_document(item) for item in self.data_inputs]

            if not lc_documents:
                error_msg = "No valid Data objects found in data_inputs."
                raise ValueError(error_msg)

            chunker_kwargs = {
                "embeddings": embeddings,
                "breakpoint_threshold_type": self.breakpoint_threshold_type or "percentile",
                "breakpoint_threshold_amount": self.breakpoint_threshold_amount,
                "number_of_chunks": self.number_of_chunks,
                "buffer_size": self.buffer_size,
            }
            # Only forward the regex when the user supplied one, so the
            # chunker keeps its library default otherwise.
            if self.sentence_split_regex:
                chunker_kwargs["sentence_split_regex"] = self.sentence_split_regex

            chunks = SemanticChunker(**chunker_kwargs).create_documents(
                [doc.page_content for doc in lc_documents],
                metadatas=[doc.metadata for doc in lc_documents],
            )

            results = self._docs_to_data(chunks)
            self.status = results
        except Exception as e:
            error_msg = f"An error occurred during semantic splitting: {e}"
            raise RuntimeError(error_msg) from e
        else:
            return results
|
||||
Loading…
Add table
Add a link
Reference in a new issue