📦 chore(LanguageRecursiveTextSplitter.py): add LanguageRecursiveTextSplitter component to split text into chunks based on language
📝 docs(LanguageRecursiveTextSplitter.py): add documentation link for LanguageRecursiveTextSplitter component 🔧 refactor(LanguageRecursiveTextSplitter.py): refactor build_config method to use options from Language enum for separator_type 🔧 refactor(LanguageRecursiveTextSplitter.py): refactor build method to split text into chunks based on specified length and overlap 🔧 refactor(LanguageRecursiveTextSplitter.py): refactor build method to handle chunk_size and chunk_overlap as integers 🔧 refactor(LanguageRecursiveTextSplitter.py): refactor build method to use RecursiveCharacterTextSplitter from langchain.text_splitter 🔧 refactor(LanguageRecursiveTextSplitter.py): refactor build method to split documents using RecursiveCharacterTextSplitter and return the chunks
This commit is contained in:
parent
86730437a5
commit
0649b61fca
1 changed files with 85 additions and 0 deletions
|
|
@ -0,0 +1,85 @@
|
|||
from typing import Optional
|
||||
from langflow import CustomComponent
|
||||
from langchain.text_splitter import Language
|
||||
from langchain.schema import Document
|
||||
from langflow.utils.util import build_loader_repr_from_documents
|
||||
|
||||
|
||||
class LanguageRecursiveTextSplitterComponent(CustomComponent):
|
||||
display_name: str = "Language Recursive Text Splitter"
|
||||
description: str = "Split text into chunks of a specified length based on language."
|
||||
documentation: str = "https://docs.langflow.org/components/text-splitters#recursivecharactertextsplitter"
|
||||
|
||||
def build_config(self):
|
||||
options = [x.value for x in Language]
|
||||
return {
|
||||
"documents": {
|
||||
"display_name": "Documents",
|
||||
"info": "The documents to split.",
|
||||
},
|
||||
"separator_type": {
|
||||
"display_name": "Separator Type",
|
||||
"info": "The type of separator to use.",
|
||||
"field_type": "str",
|
||||
"options": options,
|
||||
"value": "Python",
|
||||
},
|
||||
"separators": {
|
||||
"display_name": "Separators",
|
||||
"info": "The characters to split on.",
|
||||
"is_list": True,
|
||||
},
|
||||
"chunk_size": {
|
||||
"display_name": "Chunk Size",
|
||||
"info": "The maximum length of each chunk.",
|
||||
"field_type": "int",
|
||||
"value": 1000,
|
||||
},
|
||||
"chunk_overlap": {
|
||||
"display_name": "Chunk Overlap",
|
||||
"info": "The amount of overlap between chunks.",
|
||||
"field_type": "int",
|
||||
"value": 200,
|
||||
},
|
||||
"code": {"show": False},
|
||||
}
|
||||
|
||||
def build(
|
||||
self,
|
||||
documents: Document,
|
||||
chunk_size: Optional[int] = 1000,
|
||||
chunk_overlap: Optional[int] = 200,
|
||||
separator_type: Optional[str] = "Python",
|
||||
) -> Document:
|
||||
"""
|
||||
Split text into chunks of a specified length.
|
||||
|
||||
Args:
|
||||
separators (list[str]): The characters to split on.
|
||||
chunk_size (int): The maximum length of each chunk.
|
||||
chunk_overlap (int): The amount of overlap between chunks.
|
||||
length_function (function): The function to use to calculate the length of the text.
|
||||
|
||||
Returns:
|
||||
list[str]: The chunks of text.
|
||||
"""
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
|
||||
# Make sure chunk_size and chunk_overlap are ints
|
||||
try:
|
||||
chunk_size = int(chunk_size)
|
||||
chunk_overlap = int(chunk_overlap)
|
||||
except Exception as e:
|
||||
raise ValueError(
|
||||
"chunk_size and chunk_overlap must be integers."
|
||||
" Received chunk_size={chunk_size} and chunk_overlap={chunk_overlap}."
|
||||
) from e
|
||||
splitter = RecursiveCharacterTextSplitter.from_language(
|
||||
language=Language(separator_type),
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap,
|
||||
)
|
||||
|
||||
docs = splitter.split_documents(documents)
|
||||
self.repr_value = build_loader_repr_from_documents(docs)
|
||||
return docs
|
||||
Loading…
Add table
Add a link
Reference in a new issue