From 50d8313cd77fcd014c228e4886d9988ab92854e5 Mon Sep 17 00:00:00 2001
From: Gabriel Luiz Freitas Almeida
Date: Thu, 21 Mar 2024 11:59:44 -0300
Subject: [PATCH] Refactor file loading and add support for .docx files

- Add "docx" to TEXT_FILE_TYPES and a read_docx_file() helper that joins
  the text of the document's paragraphs with blank lines.
- Dispatch paths ending in ".docx" to read_docx_file() in
  parse_text_file_to_record().
- Give the "path" field of FileComponent field_type "file", limited to
  TEXT_FILE_TYPES, and list the supported types in its info text.
- Split FileComponent.build() into load_file() (one path -> one Record,
  falling back to an empty Record) and build() (list of paths -> list of
  Records). build() is annotated List[Record] to match what it returns.

---
 src/backend/langflow/base/data/utils.py      | 23 ++++++--------
 src/backend/langflow/components/data/File.py | 32 ++++++++++++++------
 2 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/src/backend/langflow/base/data/utils.py b/src/backend/langflow/base/data/utils.py
index 450f6e04b..c3fda5e34 100644
--- a/src/backend/langflow/base/data/utils.py
+++ b/src/backend/langflow/base/data/utils.py
@@ -10,19 +10,7 @@ from langflow.schema.schema import Record
 
 # Types of files that can be read simply by file.read()
 # and have 100% to be completely readable
-TEXT_FILE_TYPES = [
-    "txt",
-    "md",
-    "mdx",
-    "csv",
-    "json",
-    "yaml",
-    "yml",
-    "xml",
-    "html",
-    "htm",
-    "pdf",
-]
+TEXT_FILE_TYPES = ["txt", "md", "mdx", "csv", "json", "yaml", "yml", "xml", "html", "htm", "pdf", "docx"]
 
 
 def is_hidden(path: Path) -> bool:
@@ -84,6 +72,13 @@ def read_text_file(file_path: str) -> str:
         return f.read()
 
 
+def read_docx_file(file_path: str) -> str:
+    from docx import Document  # type: ignore
+
+    doc = Document(file_path)
+    return "\n\n".join([p.text for p in doc.paragraphs])
+
+
 def parse_pdf_to_text(file_path: str) -> str:
     from pypdf import PdfReader  # type: ignore
 
@@ -96,6 +91,8 @@ def parse_text_file_to_record(file_path: str, silent_errors: bool) -> Optional[R
     try:
         if file_path.endswith(".pdf"):
             text = parse_pdf_to_text(file_path)
+        elif file_path.endswith(".docx"):
+            text = read_docx_file(file_path)
         else:
             text = read_text_file(file_path)
         # if file is json, yaml, or xml, we can parse it
diff --git a/src/backend/langflow/components/data/File.py b/src/backend/langflow/components/data/File.py
index 4c4e42de4..34da1a7fb 100644
--- a/src/backend/langflow/components/data/File.py
+++ b/src/backend/langflow/components/data/File.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List
 
 from langflow import CustomComponent
 from langflow.base.data.utils import TEXT_FILE_TYPES, parse_text_file_to_record
@@ -6,12 +6,17 @@ from langflow.schema import Record
 
 
 class FileComponent(CustomComponent):
-    display_name = "File"
-    description = "Load a file."
+    display_name = "Files"
+    description = "Read Text Files"
 
     def build_config(self) -> Dict[str, Any]:
         return {
-            "path": {"display_name": "Path"},
+            "path": {
+                "display_name": "Path",
+                "field_type": "file",
+                "file_types": TEXT_FILE_TYPES,
+                "info": f"Supported file types: {', '.join(TEXT_FILE_TYPES)}",
+            },
             "silent_errors": {
                 "display_name": "Silent Errors",
                 "advanced": True,
@@ -19,13 +24,20 @@ class FileComponent(CustomComponent):
             },
         }
 
-    def build(
-        self,
-        path: str,
-        silent_errors: bool = False,
-    ) -> Optional[Record]:
+    def load_file(self, path: str, silent_errors: bool = False) -> Record:
         resolved_path = self.resolve_path(path)
         extension = resolved_path.split(".")[-1]
         if extension not in TEXT_FILE_TYPES:
             raise ValueError(f"Unsupported file type: {extension}")
-        return parse_text_file_to_record(resolved_path, silent_errors)
+        record = parse_text_file_to_record(resolved_path, silent_errors)
+        self.status = record if record else "No data"
+        return record or Record()
+
+    def build(
+        self,
+        paths: List[str],
+        silent_errors: bool = False,
+    ) -> List[Record]:
+        records = [self.load_file(path, silent_errors) for path in paths]
+        self.status = records
+        return records