Add support for parsing PDF files

2024-03-08 18:05:42 -03:00 · 2024-03-08 18:05:42 -03:00 · 2455031e48
commit 2455031e48
parent 522bd304e8
1 changed file with 20 additions and 3 deletions
--- a/src/backend/langflow/base/data/utils.py
+++ b/src/backend/langflow/base/data/utils.py
@@ -21,6 +21,7 @@ TEXT_FILE_TYPES = [
    "xml",
    "html",
    "htm",
+    "pdf",
 ]


@@ -54,7 +55,9 @@ def retrieve_file_paths(

    glob = "**/*" if recursive else "*"
    paths = walk_level(path_obj, depth) if depth else path_obj.glob(glob)
-    file_paths = [Text(p) for p in paths if p.is_file() and match_types(p) and is_not_hidden(p)]
+    file_paths = [
+        Text(p) for p in paths if p.is_file() and match_types(p) and is_not_hidden(p)
+    ]

    return file_paths

@@ -83,9 +86,20 @@ def read_text_file(file_path: str) -> str:
        return f.read()


+def parse_pdf_to_text(file_path: str) -> str:
+    from pypdf import PdfReader  # type: ignore
+
+    with open(file_path, "rb") as f:
+        reader = PdfReader(f)
+        return "\n\n".join([page.extract_text() for page in reader.pages])
+
+
 def parse_text_file_to_record(file_path: str, silent_errors: bool) -> Optional[Record]:
    try:
-        text = read_text_file(file_path)
+        if file_path.endswith(".pdf"):
+            text = parse_pdf_to_text(file_path)
+        else:
+            text = read_text_file(file_path)
        # if file is json, yaml, or xml, we can parse it
        if file_path.endswith(".json"):
            text = json.loads(text)
@@ -111,7 +125,10 @@ def get_elements(
    if use_multithreading:
        records = parallel_load_records(file_paths, silent_errors, max_concurrency)
    else:
-        records = [partition_file_to_record(file_path, silent_errors) for file_path in file_paths]
+        records = [
+            partition_file_to_record(file_path, silent_errors)
+            for file_path in file_paths
+        ]
    records = list(filter(None, records))
    return records