From 95ff2a7f6bf2a2127b5171c1cdda7c0dcafdc449 Mon Sep 17 00:00:00 2001 From: Gabriel Luiz Freitas Almeida Date: Thu, 24 Aug 2023 22:22:09 -0300 Subject: [PATCH 1/9] =?UTF-8?q?=F0=9F=94=A7=20chore(util.py):=20add=20miss?= =?UTF-8?q?ing=20import=20for=20'Document'=20from=20langchain.schema=20?= =?UTF-8?q?=F0=9F=94=A7=20chore(util.py):=20add=20missing=20type=20hint=20?= =?UTF-8?q?for=20'documents'=20parameter=20in=20build=5Floader=5Frepr=5Ffr?= =?UTF-8?q?om=5Fdocuments=20function=20=F0=9F=94=A7=20chore(util.py):=20ad?= =?UTF-8?q?d=20logic=20to=20calculate=20average=20document=20length=20and?= =?UTF-8?q?=20display=20summary=20information=20in=20build=5Floader=5Frepr?= =?UTF-8?q?=5Ffrom=5Fdocuments=20function?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/backend/langflow/utils/util.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/backend/langflow/utils/util.py b/src/backend/langflow/utils/util.py index f68c9dbe2..4760ebc2d 100644 --- a/src/backend/langflow/utils/util.py +++ b/src/backend/langflow/utils/util.py @@ -2,7 +2,7 @@ import re import inspect import importlib from functools import wraps -from typing import Optional, Dict, Any, Union +from typing import List, Optional, Dict, Any, Union from docstring_parser import parse # type: ignore @@ -10,6 +10,7 @@ from langflow.template.frontend_node.constants import FORCE_SHOW_FIELDS from langflow.utils import constants from langflow.utils.logger import logger from multiprocess import cpu_count # type: ignore +from langchain.schema import Document def build_template_from_function( @@ -462,3 +463,12 @@ def get_number_of_workers(workers=None): workers = (cpu_count() * 2) + 1 logger.debug(f"Number of workers: {workers}") return workers + + +def build_loader_repr_from_documents(documents: List[Document]) -> str: + if documents: + avg_length = sum(len(doc.page_content) for doc in documents) / len(documents) + return f"""{len(documents)} documents + \nAvg. Document Length (characters): {int(avg_length)} + Documents: {documents[:3]}...""" + return "0 documents" From 40ab6b1e877790961e99ccdebce1fc1af7879bae Mon Sep 17 00:00:00 2001 From: Gabriel Luiz Freitas Almeida Date: Thu, 24 Aug 2023 22:22:31 -0300 Subject: [PATCH 2/9] =?UTF-8?q?=F0=9F=94=A5=20refactor(config.yaml):=20rem?= =?UTF-8?q?ove=20RecursiveCharacterTextSplitter=20from=20the=20config=20fi?= =?UTF-8?q?le=20as=20it=20is=20no=20longer=20used=20=F0=9F=93=9A=20docs(co?= =?UTF-8?q?nfig.yaml):=20update=20documentation=20link=20for=20CharacterTe?= =?UTF-8?q?xtSplitter=20in=20the=20config=20file?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/backend/langflow/config.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/backend/langflow/config.yaml b/src/backend/langflow/config.yaml index 63e8cdf99..da43403f2 100644 --- a/src/backend/langflow/config.yaml +++ b/src/backend/langflow/config.yaml @@ -169,8 +169,6 @@ prompts: textsplitters: CharacterTextSplitter: documentation: "https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/character_text_splitter" - RecursiveCharacterTextSplitter: - documentation: "https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/recursive_text_splitter" toolkits: OpenAPIToolkit: documentation: "" From 383c9dc5ff63c2f82b64695f1ba2df6ae20c577f Mon Sep 17 00:00:00 2001 From: Gabriel Luiz Freitas Almeida Date: Thu, 24 Aug 2023 22:23:27 -0300 Subject: [PATCH 3/9] =?UTF-8?q?=F0=9F=93=9D=20docs(text-splitters.mdx):=20?= =?UTF-8?q?improve=20formatting=20and=20add=20missing=20information=20abou?= =?UTF-8?q?t=20`LanguageRecursiveTextSplitter`=20and=20its=20parameters=20?= =?UTF-8?q?=F0=9F=90=9B=20fix(text-splitters.mdx):=20fix=20typo=20in=20the?= =?UTF-8?q?=20description=20of=20`separators`=20parameter=20in=20`Recursiv?= =?UTF-8?q?eCharacterTextSplitter`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/docs/components/text-splitters.mdx | 32 ++++++++++++++++++------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/docs/docs/components/text-splitters.mdx b/docs/docs/components/text-splitters.mdx index 6c91cc1a0..c6efe4553 100644 --- a/docs/docs/components/text-splitters.mdx +++ b/docs/docs/components/text-splitters.mdx @@ -1,11 +1,13 @@ -import Admonition from '@theme/Admonition'; +import Admonition from "@theme/Admonition"; # Text Splitters -

- We appreciate your understanding as we polish our documentation – it may contain some rough edges. Share your feedback or report issues to help us improve! 🛠️📝 -

+

+ We appreciate your understanding as we polish our documentation – it may + contain some rough edges. Share your feedback or report issues to help us + improve! 🛠️📝 +

A text splitter is a tool that divides a document or text into smaller chunks or segments. It is used to break down large texts into more manageable pieces for analysis or processing. @@ -22,13 +24,13 @@ The `CharacterTextSplitter` is used to split a long text into smaller chunks bas - **chunk_overlap:** Determines the number of characters that overlap between consecutive chunks when splitting text. It specifies how much of the previous chunk should be included in the next chunk. - For example, if the `chunk_overlap` is set to 20 and the `chunk_size` is set to 100, the splitter will create chunks of 100 characters each, but the last 20 characters of each chunk will overlap with the first 20 characters of the next chunk. This allows for a smoother transition between chunks and ensures that no information is lost – defaults to `200`. + For example, if the `chunk_overlap` is set to 20 and the `chunk_size` is set to 100, the splitter will create chunks of 100 characters each, but the last 20 characters of each chunk will overlap with the first 20 characters of the next chunk. This allows for a smoother transition between chunks and ensures that no information is lost – defaults to `200`. - **chunk_size:** Determines the maximum number of characters in each chunk when splitting a text. It specifies the size or length of each chunk. - For example, if the chunk_size is set to 100, the splitter will create chunks of 100 characters each. If the text is longer than 100 characters, it will be divided into multiple chunks of equal size, except for the last chunk, which may be smaller if there are remaining characters –defaults to `1000`. + For example, if the chunk_size is set to 100, the splitter will create chunks of 100 characters each. If the text is longer than 100 characters, it will be divided into multiple chunks of equal size, except for the last chunk, which may be smaller if there are remaining characters –defaults to `1000`. -- **separator:** Specifies the character that will be used to split the text into chunks – defaults to `.` +- **separator:** Specifies the character that will be used to split the text into chunks – defaults to `.` --- @@ -44,6 +46,18 @@ The `RecursiveCharacterTextSplitter` splits the text by trying to keep paragra - **chunk_size:** Determines the maximum number of characters in each chunk when splitting a text. It specifies the size or length of each chunk. -- **separator_type:** The parameter allows the user to split the code with multiple language support. It supports various languages such as Text, Ruby, Python, Solidity, Java, and more. Defaults to `Text`. +- **separators:** The `separators` in RecursiveCharacterTextSplitter are the characters used to split the text into chunks. The text splitter tries to create chunks based on splitting on the first character in the list of `separators`. If any chunks are too large, it moves on to the next character in the list and continues splitting. Defaults to ["\n\n", "\n", " ", ""]. -- **separators:** The `separators` in RecursiveCharacterTextSplitter are the characters used to split the text into chunks. The text splitter tries to create chunks based on splitting on the first character in the list of `separators`. If any chunks are too large, it moves on to the next character in the list and continues splitting. Defaults to `.` \ No newline at end of file +### LanguageRecursiveTextSplitter + +The `LanguageRecursiveTextSplitter` is a text splitter that splits the text into smaller chunks based on the (programming) language of the text. + +**Params** + +- **Documents:** Input documents to split. + +- **chunk_overlap:** Determines the number of characters that overlap between consecutive chunks when splitting text. It specifies how much of the previous chunk should be included in the next chunk. + +- **chunk_size:** Determines the maximum number of characters in each chunk when splitting a text. It specifies the size or length of each chunk. + +- **separator_type:** The parameter allows the user to split the code with multiple language support. It supports various languages such as Ruby, Python, Solidity, Java, and more. Defaults to `Python`. From 86730437a580853ac0496c3d083c4e8fa860224f Mon Sep 17 00:00:00 2001 From: Gabriel Luiz Freitas Almeida Date: Thu, 24 Aug 2023 22:23:45 -0300 Subject: [PATCH 4/9] =?UTF-8?q?=E2=9C=A8=20feat(RecursiveCharacterTextSpli?= =?UTF-8?q?tter.py):=20add=20RecursiveCharacterTextSplitter=20component=20?= =?UTF-8?q?to=20split=20text=20into=20chunks=20of=20a=20specified=20length?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../RecursiveCharacterTextSplitter.py | 83 +++++++++++++++++++ .../components/textsplitters/__init__.py | 0 2 files changed, 83 insertions(+) create mode 100644 src/backend/langflow/components/textsplitters/RecursiveCharacterTextSplitter.py create mode 100644 src/backend/langflow/components/textsplitters/__init__.py diff --git a/src/backend/langflow/components/textsplitters/RecursiveCharacterTextSplitter.py b/src/backend/langflow/components/textsplitters/RecursiveCharacterTextSplitter.py new file mode 100644 index 000000000..3b1f70815 --- /dev/null +++ b/src/backend/langflow/components/textsplitters/RecursiveCharacterTextSplitter.py @@ -0,0 +1,83 @@ +from typing import Optional +from langflow import CustomComponent +from langchain.schema import Document + + +class RecursiveCharacterTextSplitterComponent(CustomComponent): + display_name: str = "Recursive Character Text Splitter" + description: str = "Split text into chunks of a specified length." + documentation: str = "https://docs.langflow.org/components/text-splitters#recursivecharactertextsplitter" + + def build_config(self): + return { + "documents": { + "display_name": "Documents", + "info": "The documents to split.", + }, + "separators": { + "display_name": "Separators", + "info": 'The characters to split on.\nIf left empty defaults to ["\\n\\n", "\\n", " ", ""].', + "is_list": True, + }, + "chunk_size": { + "display_name": "Chunk Size", + "info": "The maximum length of each chunk.", + "field_type": "int", + "value": 1000, + }, + "chunk_overlap": { + "display_name": "Chunk Overlap", + "info": "The amount of overlap between chunks.", + "field_type": "int", + "value": 200, + }, + "code": {"show": False}, + } + + def build( + self, + documents: Document, + separators: Optional[str] = None, + chunk_size: Optional[int] = 1000, + chunk_overlap: Optional[int] = 200, + ) -> Document: + """ + Split text into chunks of a specified length. + + Args: + separators (list[str]): The characters to split on. + chunk_size (int): The maximum length of each chunk. + chunk_overlap (int): The amount of overlap between chunks. + length_function (function): The function to use to calculate the length of the text. + + Returns: + list[str]: The chunks of text. + """ + from langchain.text_splitter import RecursiveCharacterTextSplitter + + if separators == "": + separators = None + elif separators: + # check if the separators list has escaped characters + # if there are escaped characters, unescape them + separators = [x.encode().decode("unicode-escape") for x in separators] + + # Make sure chunk_size and chunk_overlap are ints + try: + chunk_size = int(chunk_size) + chunk_overlap = int(chunk_overlap) + except Exception as e: + raise ValueError( + "chunk_size and chunk_overlap must be integers." + " Received chunk_size={chunk_size} and chunk_overlap={chunk_overlap}." + ) from e + splitter = RecursiveCharacterTextSplitter( + separators=separators, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + ) + + docs = splitter.split_documents(documents) + # self.repr_value = build_loader_repr_from_documents(docs) + self.repr_value = separators + return docs diff --git a/src/backend/langflow/components/textsplitters/__init__.py b/src/backend/langflow/components/textsplitters/__init__.py new file mode 100644 index 000000000..e69de29bb From 0649b61fca0c2a47f3efc84e88797f7e990a3093 Mon Sep 17 00:00:00 2001 From: Gabriel Luiz Freitas Almeida Date: Thu, 24 Aug 2023 22:24:07 -0300 Subject: [PATCH 5/9] =?UTF-8?q?=F0=9F=93=A6=20chore(LanguageRecursiveTextS?= =?UTF-8?q?plitter.py):=20add=20LanguageRecursiveTextSplitter=20component?= =?UTF-8?q?=20to=20split=20text=20into=20chunks=20based=20on=20language=20?= =?UTF-8?q?=F0=9F=93=9D=20docs(LanguageRecursiveTextSplitter.py):=20add=20?= =?UTF-8?q?documentation=20link=20for=20LanguageRecursiveTextSplitter=20co?= =?UTF-8?q?mponent=20=F0=9F=94=A7=20refactor(LanguageRecursiveTextSplitter?= =?UTF-8?q?.py):=20refactor=20build=5Fconfig=20method=20to=20use=20options?= =?UTF-8?q?=20from=20Language=20enum=20for=20separator=5Ftype=20?= =?UTF-8?q?=F0=9F=94=A7=20refactor(LanguageRecursiveTextSplitter.py):=20re?= =?UTF-8?q?factor=20build=20method=20to=20split=20text=20into=20chunks=20b?= =?UTF-8?q?ased=20on=20specified=20length=20and=20overlap=20=F0=9F=94=A7?= =?UTF-8?q?=20refactor(LanguageRecursiveTextSplitter.py):=20refactor=20bui?= =?UTF-8?q?ld=20method=20to=20handle=20chunk=5Fsize=20and=20chunk=5Foverla?= =?UTF-8?q?p=20as=20integers=20=F0=9F=94=A7=20refactor(LanguageRecursiveTe?= =?UTF-8?q?xtSplitter.py):=20refactor=20build=20method=20to=20use=20Recurs?= =?UTF-8?q?iveCharacterTextSplitter=20from=20langchain.text=5Fsplitter=20?= =?UTF-8?q?=F0=9F=94=A7=20refactor(LanguageRecursiveTextSplitter.py):=20re?= =?UTF-8?q?factor=20build=20method=20to=20split=20documents=20using=20Recu?= =?UTF-8?q?rsiveCharacterTextSplitter=20and=20return=20the=20chunks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../LanguageRecursiveTextSplitter.py | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 src/backend/langflow/components/textsplitters/LanguageRecursiveTextSplitter.py diff --git a/src/backend/langflow/components/textsplitters/LanguageRecursiveTextSplitter.py b/src/backend/langflow/components/textsplitters/LanguageRecursiveTextSplitter.py new file mode 100644 index 000000000..6b4373971 --- /dev/null +++ b/src/backend/langflow/components/textsplitters/LanguageRecursiveTextSplitter.py @@ -0,0 +1,85 @@ +from typing import Optional +from langflow import CustomComponent +from langchain.text_splitter import Language +from langchain.schema import Document +from langflow.utils.util import build_loader_repr_from_documents + + +class LanguageRecursiveTextSplitterComponent(CustomComponent): + display_name: str = "Language Recursive Text Splitter" + description: str = "Split text into chunks of a specified length based on language." + documentation: str = "https://docs.langflow.org/components/text-splitters#recursivecharactertextsplitter" + + def build_config(self): + options = [x.value for x in Language] + return { + "documents": { + "display_name": "Documents", + "info": "The documents to split.", + }, + "separator_type": { + "display_name": "Separator Type", + "info": "The type of separator to use.", + "field_type": "str", + "options": options, + "value": "Python", + }, + "separators": { + "display_name": "Separators", + "info": "The characters to split on.", + "is_list": True, + }, + "chunk_size": { + "display_name": "Chunk Size", + "info": "The maximum length of each chunk.", + "field_type": "int", + "value": 1000, + }, + "chunk_overlap": { + "display_name": "Chunk Overlap", + "info": "The amount of overlap between chunks.", + "field_type": "int", + "value": 200, + }, + "code": {"show": False}, + } + + def build( + self, + documents: Document, + chunk_size: Optional[int] = 1000, + chunk_overlap: Optional[int] = 200, + separator_type: Optional[str] = "Python", + ) -> Document: + """ + Split text into chunks of a specified length. + + Args: + separators (list[str]): The characters to split on. + chunk_size (int): The maximum length of each chunk. + chunk_overlap (int): The amount of overlap between chunks. + length_function (function): The function to use to calculate the length of the text. + + Returns: + list[str]: The chunks of text. + """ + from langchain.text_splitter import RecursiveCharacterTextSplitter + + # Make sure chunk_size and chunk_overlap are ints + try: + chunk_size = int(chunk_size) + chunk_overlap = int(chunk_overlap) + except Exception as e: + raise ValueError( + "chunk_size and chunk_overlap must be integers." + " Received chunk_size={chunk_size} and chunk_overlap={chunk_overlap}." + ) from e + splitter = RecursiveCharacterTextSplitter.from_language( + language=Language(separator_type), + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + ) + + docs = splitter.split_documents(documents) + self.repr_value = build_loader_repr_from_documents(docs) + return docs From b326a7e246c8d6b3d489efd4d998ef8fb5f5b5a6 Mon Sep 17 00:00:00 2001 From: Gabriel Luiz Freitas Almeida Date: Thu, 24 Aug 2023 22:25:16 -0300 Subject: [PATCH 6/9] =?UTF-8?q?=F0=9F=93=A6=20chore(pyproject.toml):=20upd?= =?UTF-8?q?ate=20package=20version=20from=200.4.15=20to=200.4.16=20?= =?UTF-8?q?=F0=9F=94=A7=20chore(pyproject.toml):=20add=20pillow=20package?= =?UTF-8?q?=20as=20a=20dependency=20for=20the=20project?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- poetry.lock | 4 ++-- pyproject.toml | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7bf5b1342..b64dd4d02 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4195,7 +4195,7 @@ files = [ name = "pillow" version = "10.0.0" description = "Python Imaging Library (Fork)" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "Pillow-10.0.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:1f62406a884ae75fb2f818694469519fb685cc7eaff05d3451a9ebe55c646891"}, @@ -7467,4 +7467,4 @@ local = ["ctransformers", "llama-cpp-python", "sentence-transformers"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.11" -content-hash = "1effd07e35ba89cc3971f027218032e24e7816d93bccb7fd6470cc56acc04418" +content-hash = "36e1f79f4e6d2e55b652d10e43ccde639714ffff2965fa52b466bd854259ebf6" diff --git a/pyproject.toml b/pyproject.toml index 9c112f07c..24bb548ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langflow" -version = "0.4.15" +version = "0.4.16" description = "A Python package with a built-in web application" authors = ["Logspace "] maintainers = [ @@ -79,6 +79,7 @@ psycopg-binary = "^3.1.9" fastavro = "^1.8.0" langchain-experimental = "^0.0.8" metaphor-python = "^0.1.11" +pillow = "^10.0.0" [tool.poetry.group.dev.dependencies] black = "^23.1.0" From 86927d10d393a0f679828b9bf5fc7319a5cbe954 Mon Sep 17 00:00:00 2001 From: Gabriel Luiz Freitas Almeida Date: Thu, 24 Aug 2023 22:37:30 -0300 Subject: [PATCH 7/9] =?UTF-8?q?=F0=9F=94=A7=20fix(types.py):=20add=20suppo?= =?UTF-8?q?rt=20for=20extracting=20inner=20type=20from=20list=20field=20ty?= =?UTF-8?q?pes=20in=20process=5Ftype=20function?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/backend/langflow/interface/types.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/backend/langflow/interface/types.py b/src/backend/langflow/interface/types.py index 824b0af50..6f4b6a8cd 100644 --- a/src/backend/langflow/interface/types.py +++ b/src/backend/langflow/interface/types.py @@ -5,6 +5,7 @@ from langflow.api.utils import merge_nested_dicts_with_renaming from langflow.interface.agents.base import agent_creator from langflow.interface.chains.base import chain_creator from langflow.interface.custom.constants import CUSTOM_COMPONENT_SUPPORTED_TYPES +from langflow.interface.custom.utils import extract_inner_type from langflow.interface.document_loaders.base import documentloader_creator from langflow.interface.embeddings.base import embedding_creator from langflow.interface.importing.utils import get_function_custom @@ -84,6 +85,8 @@ def build_langchain_types_dict(): # sourcery skip: dict-assign-update-to-union def process_type(field_type: str): + if field_type.startswith("list") or field_type.startswith("List"): + return extract_inner_type(field_type) return "prompt" if field_type == "Prompt" else field_type From c9f49690801e85b60081f258967864084ca751ef Mon Sep 17 00:00:00 2001 From: Gabriel Luiz Freitas Almeida Date: Thu, 24 Aug 2023 22:49:52 -0300 Subject: [PATCH 8/9] =?UTF-8?q?=F0=9F=90=9B=20fix(types.py):=20fix=20regex?= =?UTF-8?q?=20pattern=20in=20extract=5Ftype=5Ffrom=5Foptional=20function?= =?UTF-8?q?=20to=20correctly=20extract=20type=20from=20optional=20field=5F?= =?UTF-8?q?type=20=F0=9F=90=9B=20fix(types.py):=20fix=20logic=20in=20add?= =?UTF-8?q?=5Fnew=5Fcustom=5Ffield=20function=20to=20correctly=20set=20is?= =?UTF-8?q?=5Flist=20flag=20when=20field=5Ftype=20contains=20"list"=20keyw?= =?UTF-8?q?ord=20=E2=9C=A8=20feat(types.py):=20add=20field=5Fcontains=5Fli?= =?UTF-8?q?st=20variable=20to=20check=20if=20field=5Ftype=20contains=20"li?= =?UTF-8?q?st"=20keyword=20to=20improve=20semantics=20in=20add=5Fnew=5Fcus?= =?UTF-8?q?tom=5Ffield=20function?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/backend/langflow/interface/types.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/backend/langflow/interface/types.py b/src/backend/langflow/interface/types.py index 6f4b6a8cd..e558a3e0c 100644 --- a/src/backend/langflow/interface/types.py +++ b/src/backend/langflow/interface/types.py @@ -103,6 +103,7 @@ def add_new_custom_field( # if it is, update the value display_name = field_config.pop("display_name", field_name) field_type = field_config.pop("field_type", field_type) + field_contains_list = "list" in field_type.lower() field_type = process_type(field_type) field_value = field_config.pop("value", field_value) field_advanced = field_config.pop("advanced", False) @@ -113,7 +114,9 @@ def add_new_custom_field( # If options is a list, then it's a dropdown # If options is None, then it's a list of strings is_list = isinstance(field_config.get("options"), list) - field_config["is_list"] = is_list or field_config.get("is_list", False) + field_config["is_list"] = ( + is_list or field_config.get("is_list", False) or field_contains_list + ) if "name" in field_config: warnings.warn( @@ -175,7 +178,7 @@ def extract_type_from_optional(field_type): Returns: str: The extracted type, or an empty string if no type was found. """ - match = re.search(r"\[(.*?)\]", field_type) + match = re.search(r"\[(.*?)\]$", field_type) return match[1] if match else None From a28df1f08fe5d84f2687d2b6a9baec6ab755fd32 Mon Sep 17 00:00:00 2001 From: Gabriel Luiz Freitas Almeida Date: Thu, 24 Aug 2023 22:52:16 -0300 Subject: [PATCH 9/9] =?UTF-8?q?=F0=9F=93=9D=20docs(LanguageRecursiveTextSp?= =?UTF-8?q?litter.py):=20update=20documentation=20link=20to=20point=20to?= =?UTF-8?q?=20the=20correct=20URL=20=F0=9F=94=A8=20refactor(LanguageRecurs?= =?UTF-8?q?iveTextSplitter.py):=20change=20type=20hints=20for=20the=20'doc?= =?UTF-8?q?uments'=20parameter=20from=20Document=20to=20list[Document]=20a?= =?UTF-8?q?nd=20change=20return=20type=20from=20Document=20to=20list[Docum?= =?UTF-8?q?ent]=20to=20improve=20clarity=20and=20consistency=20?= =?UTF-8?q?=F0=9F=94=A8=20refactor(LanguageRecursiveTextSplitter.py):=20ch?= =?UTF-8?q?ange=20type=20hints=20for=20the=20'chunk=5Foverlap'=20parameter?= =?UTF-8?q?=20from=20Optional[int]=20to=20Optional[str]=20and=20handle=20c?= =?UTF-8?q?onversion=20to=20int=20to=20improve=20flexibility=20and=20error?= =?UTF-8?q?=20handling=20=F0=9F=94=A8=20refactor(LanguageRecursiveTextSpli?= =?UTF-8?q?tter.py):=20change=20type=20hints=20for=20the=20'chunk=5Fsize'?= =?UTF-8?q?=20parameter=20from=20Optional[int]=20to=20Optional[str]=20and?= =?UTF-8?q?=20handle=20conversion=20to=20int=20to=20improve=20flexibility?= =?UTF-8?q?=20and=20error=20handling=20=F0=9F=94=A8=20refactor(RecursiveCh?= =?UTF-8?q?aracterTextSplitter.py):=20change=20type=20hints=20for=20the=20?= =?UTF-8?q?'documents'=20parameter=20from=20Document=20to=20list[Document]?= =?UTF-8?q?=20and=20change=20return=20type=20from=20Document=20to=20list[D?= =?UTF-8?q?ocument]=20to=20improve=20clarity=20and=20consistency=20?= =?UTF-8?q?=F0=9F=94=A8=20refactor(RecursiveCharacterTextSplitter.py):=20c?= =?UTF-8?q?hange=20type=20hints=20for=20the=20'separators'=20parameter=20f?= =?UTF-8?q?rom=20Optional[str]=20to=20Optional[list[str]]=20and=20handle?= =?UTF-8?q?=20conversion=20to=20list=20of=20escaped=20characters=20to=20im?= =?UTF-8?q?prove=20flexibility=20and=20error=20handling=20=F0=9F=94=A8=20r?= =?UTF-8?q?efactor(RecursiveCharacterTextSplitter.py):=20change=20type=20h?= =?UTF-8?q?ints=20for=20the=20'chunk=5Foverlap'=20parameter=20from=20Optio?= =?UTF-8?q?nal[int]=20to=20Optional[str]=20and=20handle=20conversion=20to?= =?UTF-8?q?=20int=20to=20improve=20flexibility=20and=20error=20handling=20?= =?UTF-8?q?=F0=9F=94=A8=20refactor(RecursiveCharacterTextSplitter.py):=20c?= =?UTF-8?q?hange=20type=20hints=20for=20the=20'chunk=5Fsize'=20parameter?= =?UTF-8?q?=20from=20Optional[int]=20to=20Optional[str]=20and=20handle=20c?= =?UTF-8?q?onversion=20to=20int=20to=20improve=20flexibility=20and=20error?= =?UTF-8?q?=20handling?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../LanguageRecursiveTextSplitter.py | 15 ++++++--------- .../RecursiveCharacterTextSplitter.py | 14 +++++--------- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/src/backend/langflow/components/textsplitters/LanguageRecursiveTextSplitter.py b/src/backend/langflow/components/textsplitters/LanguageRecursiveTextSplitter.py index 6b4373971..da7c0dd73 100644 --- a/src/backend/langflow/components/textsplitters/LanguageRecursiveTextSplitter.py +++ b/src/backend/langflow/components/textsplitters/LanguageRecursiveTextSplitter.py @@ -8,7 +8,7 @@ from langflow.utils.util import build_loader_repr_from_documents class LanguageRecursiveTextSplitterComponent(CustomComponent): display_name: str = "Language Recursive Text Splitter" description: str = "Split text into chunks of a specified length based on language." - documentation: str = "https://docs.langflow.org/components/text-splitters#recursivecharactertextsplitter" + documentation: str = "https://docs.langflow.org/components/text-splitters#languagerecursivetextsplitter" def build_config(self): options = [x.value for x in Language] @@ -46,11 +46,11 @@ class LanguageRecursiveTextSplitterComponent(CustomComponent): def build( self, - documents: Document, + documents: list[Document], chunk_size: Optional[int] = 1000, chunk_overlap: Optional[int] = 200, separator_type: Optional[str] = "Python", - ) -> Document: + ) -> list[Document]: """ Split text into chunks of a specified length. @@ -66,14 +66,11 @@ class LanguageRecursiveTextSplitterComponent(CustomComponent): from langchain.text_splitter import RecursiveCharacterTextSplitter # Make sure chunk_size and chunk_overlap are ints - try: + if isinstance(chunk_size, str): chunk_size = int(chunk_size) + if isinstance(chunk_overlap, str): chunk_overlap = int(chunk_overlap) - except Exception as e: - raise ValueError( - "chunk_size and chunk_overlap must be integers." - " Received chunk_size={chunk_size} and chunk_overlap={chunk_overlap}." - ) from e + splitter = RecursiveCharacterTextSplitter.from_language( language=Language(separator_type), chunk_size=chunk_size, diff --git a/src/backend/langflow/components/textsplitters/RecursiveCharacterTextSplitter.py b/src/backend/langflow/components/textsplitters/RecursiveCharacterTextSplitter.py index 3b1f70815..58b061f2f 100644 --- a/src/backend/langflow/components/textsplitters/RecursiveCharacterTextSplitter.py +++ b/src/backend/langflow/components/textsplitters/RecursiveCharacterTextSplitter.py @@ -36,11 +36,11 @@ class RecursiveCharacterTextSplitterComponent(CustomComponent): def build( self, - documents: Document, - separators: Optional[str] = None, + documents: list[Document], + separators: Optional[list[str]] = None, chunk_size: Optional[int] = 1000, chunk_overlap: Optional[int] = 200, - ) -> Document: + ) -> list[Document]: """ Split text into chunks of a specified length. @@ -63,14 +63,10 @@ class RecursiveCharacterTextSplitterComponent(CustomComponent): separators = [x.encode().decode("unicode-escape") for x in separators] # Make sure chunk_size and chunk_overlap are ints - try: + if isinstance(chunk_size, str): chunk_size = int(chunk_size) + if isinstance(chunk_overlap, str): chunk_overlap = int(chunk_overlap) - except Exception as e: - raise ValueError( - "chunk_size and chunk_overlap must be integers." - " Received chunk_size={chunk_size} and chunk_overlap={chunk_overlap}." - ) from e splitter = RecursiveCharacterTextSplitter( separators=separators, chunk_size=chunk_size,