Add support for parsing PDF files
This commit is contained in:
parent
522bd304e8
commit
2455031e48
1 changed file with 20 additions and 3 deletions
|
|
@@ -21,6 +21,7 @@ TEXT_FILE_TYPES = [
|
|||
"xml",
|
||||
"html",
|
||||
"htm",
|
||||
"pdf",
|
||||
]
|
||||
|
||||
|
||||
|
|
@@ -54,7 +55,9 @@ def retrieve_file_paths(
|
|||
|
||||
glob = "**/*" if recursive else "*"
|
||||
paths = walk_level(path_obj, depth) if depth else path_obj.glob(glob)
|
||||
file_paths = [Text(p) for p in paths if p.is_file() and match_types(p) and is_not_hidden(p)]
|
||||
file_paths = [
|
||||
Text(p) for p in paths if p.is_file() and match_types(p) and is_not_hidden(p)
|
||||
]
|
||||
|
||||
return file_paths
|
||||
|
||||
|
|
@@ -83,9 +86,20 @@ def read_text_file(file_path: str) -> str:
|
|||
return f.read()
|
||||
|
||||
|
||||
def parse_pdf_to_text(file_path: str) -> str:
    """Return the text content of the PDF located at *file_path*.

    The file is opened in binary mode and handed to pypdf's ``PdfReader``.
    The text extracted from each page is collected in page order and the
    pieces are joined with a blank line (two newlines) between them.
    """
    # pypdf is imported inside the function rather than at module level,
    # so it is only loaded when a PDF is actually parsed.
    from pypdf import PdfReader  # type: ignore

    page_texts = []
    with open(file_path, "rb") as pdf_stream:
        # Extraction happens while the stream is still open, because the
        # reader pulls page data from the underlying file object.
        for pdf_page in PdfReader(pdf_stream).pages:
            page_texts.append(pdf_page.extract_text())
    return "\n\n".join(page_texts)
|
||||
|
||||
|
||||
def parse_text_file_to_record(file_path: str, silent_errors: bool) -> Optional[Record]:
|
||||
try:
|
||||
text = read_text_file(file_path)
|
||||
if file_path.endswith(".pdf"):
|
||||
text = parse_pdf_to_text(file_path)
|
||||
else:
|
||||
text = read_text_file(file_path)
|
||||
# if file is json, yaml, or xml, we can parse it
|
||||
if file_path.endswith(".json"):
|
||||
text = json.loads(text)
|
||||
|
|
@@ -111,7 +125,10 @@ def get_elements(
|
|||
if use_multithreading:
|
||||
records = parallel_load_records(file_paths, silent_errors, max_concurrency)
|
||||
else:
|
||||
records = [partition_file_to_record(file_path, silent_errors) for file_path in file_paths]
|
||||
records = [
|
||||
partition_file_to_record(file_path, silent_errors)
|
||||
for file_path in file_paths
|
||||
]
|
||||
records = list(filter(None, records))
|
||||
return records
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue