Refactor file loading and add support for .docx files
This commit is contained in:
parent
6bb6a9f5b9
commit
50d8313cd7
2 changed files with 32 additions and 23 deletions
|
|
@ -10,19 +10,7 @@ from langflow.schema.schema import Record
|
|||
|
||||
# File extensions supported by the text-file loader.
# Most of these are plain text and are read directly with file.read();
# "pdf" and "docx" are not, and are routed to parse_pdf_to_text and
# read_docx_file respectively before being treated as text.
TEXT_FILE_TYPES = [
    "txt",
    "md",
    "mdx",
    "csv",
    "json",
    "yaml",
    "yml",
    "xml",
    "html",
    "htm",
    "pdf",
    "docx",
]
|
||||
|
||||
|
||||
def is_hidden(path: Path) -> bool:
|
||||
|
|
@ -84,6 +72,13 @@ def read_text_file(file_path: str) -> str:
|
|||
return f.read()
|
||||
|
||||
|
||||
def read_docx_file(file_path: str) -> str:
    """Return the paragraph text of a .docx document.

    The document at ``file_path`` is opened with python-docx and the text of
    each entry in ``doc.paragraphs`` is joined with a blank line in between.
    """
    # Local import: python-docx is loaded only when this function runs.
    from docx import Document  # type: ignore

    document = Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n\n".join(paragraph_texts)
|
||||
|
||||
|
||||
def parse_pdf_to_text(file_path: str) -> str:
|
||||
from pypdf import PdfReader # type: ignore
|
||||
|
||||
|
|
@ -96,6 +91,8 @@ def parse_text_file_to_record(file_path: str, silent_errors: bool) -> Optional[R
|
|||
try:
|
||||
if file_path.endswith(".pdf"):
|
||||
text = parse_pdf_to_text(file_path)
|
||||
elif file_path.endswith(".docx"):
|
||||
text = read_docx_file(file_path)
|
||||
else:
|
||||
text = read_text_file(file_path)
|
||||
# if file is json, yaml, or xml, we can parse it
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Any, Dict, Optional
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from langflow import CustomComponent
|
||||
from langflow.base.data.utils import TEXT_FILE_TYPES, parse_text_file_to_record
|
||||
|
|
@ -6,12 +6,17 @@ from langflow.schema import Record
|
|||
|
||||
|
||||
class FileComponent(CustomComponent):
|
||||
display_name = "File"
|
||||
description = "Load a file."
|
||||
display_name = "Files"
|
||||
description = "Read Text Files"
|
||||
|
||||
def build_config(self) -> Dict[str, Any]:
|
||||
return {
|
||||
"path": {"display_name": "Path"},
|
||||
"path": {
|
||||
"display_name": "Path",
|
||||
"field_type": "file",
|
||||
"file_types": TEXT_FILE_TYPES,
|
||||
"info": f"Supported file types: {', '.join(TEXT_FILE_TYPES)}",
|
||||
},
|
||||
"silent_errors": {
|
||||
"display_name": "Silent Errors",
|
||||
"advanced": True,
|
||||
|
|
@ -19,13 +24,20 @@ class FileComponent(CustomComponent):
|
|||
},
|
||||
}
|
||||
|
||||
def build(
|
||||
self,
|
||||
path: str,
|
||||
silent_errors: bool = False,
|
||||
) -> Optional[Record]:
|
||||
def load_file(self, path: str, silent_errors: bool = False) -> Record:
    """Load a single file and return its contents as a Record.

    Args:
        path: Path of the file to load; resolved via ``self.resolve_path``.
        silent_errors: Forwarded unchanged to ``parse_text_file_to_record``.

    Returns:
        The parsed Record, or an empty ``Record()`` when the parser
        produces nothing.

    Raises:
        ValueError: If the file extension is not listed in TEXT_FILE_TYPES.
    """
    resolved_path = self.resolve_path(path)
    # The extension is whatever follows the last dot of the resolved path.
    extension = resolved_path.split(".")[-1]
    if extension not in TEXT_FILE_TYPES:
        raise ValueError(f"Unsupported file type: {extension}")

    # Parse exactly once. A leftover early ``return`` of the raw parser
    # result used to sit here, which skipped the status update below and
    # could hand callers None despite the ``Record`` return annotation.
    record = parse_text_file_to_record(resolved_path, silent_errors)
    self.status = record if record else "No data"
    return record or Record()
|
||||
|
||||
def build(
    self,
    paths: List[str],
    silent_errors: bool = False,
) -> List[Record]:
    """Load every file in ``paths`` and return one Record per file.

    Args:
        paths: Paths of the files to load, processed in the given order.
        silent_errors: Passed through to ``load_file`` for each path.

    Returns:
        A list with the Record produced by ``load_file`` for each path
        (empty when ``paths`` is empty).

    NOTE(review): build_config declares this field under the key "path",
    while the parameter here is named ``paths`` — confirm the config key
    still binds to this argument.
    """
    records = [self.load_file(path, silent_errors) for path in paths]
    # ``load_file`` sets ``self.status`` per file; overwrite it with the
    # full list so the status reflects the whole batch.
    self.status = records
    return records
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue