diff --git a/src/backend/langflow/base/data/utils.py b/src/backend/langflow/base/data/utils.py
index b2c11a270..eaf266d97 100644
--- a/src/backend/langflow/base/data/utils.py
+++ b/src/backend/langflow/base/data/utils.py
@@ -21,6 +21,7 @@ TEXT_FILE_TYPES = [
     "xml",
     "html",
     "htm",
+    "pdf",
 ]
 
 
@@ -54,7 +55,9 @@ def retrieve_file_paths(
 
     glob = "**/*" if recursive else "*"
     paths = walk_level(path_obj, depth) if depth else path_obj.glob(glob)
-    file_paths = [Text(p) for p in paths if p.is_file() and match_types(p) and is_not_hidden(p)]
+    file_paths = [
+        Text(p) for p in paths if p.is_file() and match_types(p) and is_not_hidden(p)
+    ]
 
     return file_paths
 
@@ -83,9 +86,20 @@ def read_text_file(file_path: str) -> str:
         return f.read()
 
 
+def parse_pdf_to_text(file_path: str) -> str:
+    from pypdf import PdfReader  # type: ignore
+
+    with open(file_path, "rb") as f:
+        reader = PdfReader(f)
+        return "\n\n".join([page.extract_text() for page in reader.pages])
+
+
 def parse_text_file_to_record(file_path: str, silent_errors: bool) -> Optional[Record]:
     try:
-        text = read_text_file(file_path)
+        if file_path.endswith(".pdf"):
+            text = parse_pdf_to_text(file_path)
+        else:
+            text = read_text_file(file_path)
         # if file is json, yaml, or xml, we can parse it
         if file_path.endswith(".json"):
             text = json.loads(text)
@@ -111,7 +125,10 @@ def get_elements(
     if use_multithreading:
         records = parallel_load_records(file_paths, silent_errors, max_concurrency)
     else:
-        records = [partition_file_to_record(file_path, silent_errors) for file_path in file_paths]
+        records = [
+            partition_file_to_record(file_path, silent_errors)
+            for file_path in file_paths
+        ]
     records = list(filter(None, records))
     return records
 