Refactor file loading and add support for .docx files

This commit is contained in:
Gabriel Luiz Freitas Almeida 2024-03-21 11:59:44 -03:00
commit 50d8313cd7
2 changed files with 32 additions and 23 deletions

View file

@ -10,19 +10,7 @@ from langflow.schema.schema import Record
# File types that can be loaded as text. Most are read directly with
# file.read() and are expected to be fully readable; pdf and docx are
# handled by dedicated parsers.
# Extensions (without the leading dot) accepted by the file loaders.
# Defined exactly once: a second assignment would silently replace this one.
TEXT_FILE_TYPES = [
    "txt",
    "md",
    "mdx",
    "csv",
    "json",
    "yaml",
    "yml",
    "xml",
    "html",
    "htm",
    "pdf",
    "docx",
]
def is_hidden(path: Path) -> bool:
@ -84,6 +72,13 @@ def read_text_file(file_path: str) -> str:
return f.read()
def read_docx_file(file_path: str) -> str:
    """Return the paragraph text of a .docx file as a single string.

    Args:
        file_path: Path of the .docx document to open.

    Returns:
        The text of every entry in the document's ``paragraphs``,
        joined with a blank line (two newlines) between them.
    """
    # Imported at call time, so the docx package is only needed when a
    # .docx file is actually read.
    from docx import Document  # type: ignore

    document = Document(file_path)
    return "\n\n".join(paragraph.text for paragraph in document.paragraphs)
def parse_pdf_to_text(file_path: str) -> str:
from pypdf import PdfReader # type: ignore
@ -96,6 +91,8 @@ def parse_text_file_to_record(file_path: str, silent_errors: bool) -> Optional[R
try:
if file_path.endswith(".pdf"):
text = parse_pdf_to_text(file_path)
elif file_path.endswith(".docx"):
text = read_docx_file(file_path)
else:
text = read_text_file(file_path)
# if file is json, yaml, or xml, we can parse it

View file

@ -1,4 +1,4 @@
from typing import Any, Dict, Optional
from typing import Any, Dict, List
from langflow import CustomComponent
from langflow.base.data.utils import TEXT_FILE_TYPES, parse_text_file_to_record
@ -6,12 +6,17 @@ from langflow.schema import Record
class FileComponent(CustomComponent):
display_name = "File"
description = "Load a file."
display_name = "Files"
description = "Read Text Files"
def build_config(self) -> Dict[str, Any]:
return {
"path": {"display_name": "Path"},
"path": {
"display_name": "Path",
"field_type": "file",
"file_types": TEXT_FILE_TYPES,
"info": f"Supported file types: {', '.join(TEXT_FILE_TYPES)}",
},
"silent_errors": {
"display_name": "Silent Errors",
"advanced": True,
@ -19,13 +24,20 @@ class FileComponent(CustomComponent):
},
}
def build(
self,
path: str,
silent_errors: bool = False,
) -> Optional[Record]:
def load_file(self, path: str, silent_errors: bool = False) -> Record:
    """Load a single file into a Record.

    Args:
        path: Location of the file; passed through ``self.resolve_path``.
        silent_errors: Forwarded unchanged to ``parse_text_file_to_record``.

    Returns:
        The parsed Record, or an empty ``Record()`` when the parser
        returns nothing.

    Raises:
        ValueError: If the file extension is not in ``TEXT_FILE_TYPES``.
    """
    resolved_path = self.resolve_path(path)
    # Text after the last dot; the comparison below is case-sensitive.
    extension = resolved_path.split(".")[-1]
    if extension not in TEXT_FILE_TYPES:
        raise ValueError(f"Unsupported file type: {extension}")
    record = parse_text_file_to_record(resolved_path, silent_errors)
    # Report the outcome on the component even when nothing was parsed.
    self.status = record if record else "No data"
    # Always hand back a Record so the declared return type holds.
    return record or Record()
def build(
    self,
    paths: List[str],
    silent_errors: bool = False,
) -> List[Record]:
    """Load every path and return the resulting records.

    Args:
        paths: Files to load; each one is handed to ``self.load_file``.
        silent_errors: Forwarded unchanged to ``self.load_file``.

    Returns:
        One Record per entry in ``paths``, in the same order.
    """
    records = [self.load_file(path, silent_errors) for path in paths]
    # load_file sets status per file; replace it with the full result list.
    self.status = records
    return records