Add support for parsing PDF files

This commit is contained in:
Gabriel Luiz Freitas Almeida 2024-03-08 18:05:42 -03:00
commit 2455031e48

View file

@ -21,6 +21,7 @@ TEXT_FILE_TYPES = [
"xml",
"html",
"htm",
"pdf",
]
@ -54,7 +55,9 @@ def retrieve_file_paths(
glob = "**/*" if recursive else "*"
paths = walk_level(path_obj, depth) if depth else path_obj.glob(glob)
file_paths = [Text(p) for p in paths if p.is_file() and match_types(p) and is_not_hidden(p)]
file_paths = [
Text(p) for p in paths if p.is_file() and match_types(p) and is_not_hidden(p)
]
return file_paths
@ -83,9 +86,20 @@ def read_text_file(file_path: str) -> str:
return f.read()
def parse_pdf_to_text(file_path: str) -> str:
    """Return the text of every page of a PDF as a single string.

    The file at ``file_path`` is opened in binary mode and handed to
    ``pypdf.PdfReader``. Each page's extracted text is collected in page
    order and the pieces are joined with two newline characters between
    consecutive pages.
    """
    # Function-scope import: pypdf is loaded only when this function runs.
    from pypdf import PdfReader  # type: ignore

    page_texts = []
    with open(file_path, "rb") as pdf_file:
        # Extract while the file handle is still open.
        for page in PdfReader(pdf_file).pages:
            page_texts.append(page.extract_text())
    return "\n\n".join(page_texts)
def parse_text_file_to_record(file_path: str, silent_errors: bool) -> Optional[Record]:
try:
text = read_text_file(file_path)
if file_path.endswith(".pdf"):
text = parse_pdf_to_text(file_path)
else:
text = read_text_file(file_path)
# if file is json, yaml, or xml, we can parse it
if file_path.endswith(".json"):
text = json.loads(text)
@ -111,7 +125,10 @@ def get_elements(
if use_multithreading:
records = parallel_load_records(file_paths, silent_errors, max_concurrency)
else:
records = [partition_file_to_record(file_path, silent_errors) for file_path in file_paths]
records = [
partition_file_to_record(file_path, silent_errors)
for file_path in file_paths
]
records = list(filter(None, records))
return records