Fix encoding load file
This commit is contained in:
parent
7d943b8755
commit
a7c1af8c4f
1 changed files with 14 additions and 0 deletions
|
|
@ -3,6 +3,7 @@ import xml.etree.ElementTree as ET
|
|||
from concurrent import futures
|
||||
from pathlib import Path
|
||||
from typing import Callable, List, Optional, Text
|
||||
import unicodedata
|
||||
import chardet
|
||||
import yaml
|
||||
|
||||
|
|
@ -39,6 +40,10 @@ IMG_FILE_TYPES = [
|
|||
]
|
||||
|
||||
|
||||
def normalize_text(text):
|
||||
return unicodedata.normalize('NFKD', text)
|
||||
|
||||
|
||||
def is_hidden(path: Path) -> bool:
|
||||
return path.name.startswith(".")
|
||||
|
||||
|
|
@ -101,6 +106,9 @@ def read_text_file(file_path: str) -> str:
|
|||
result = chardet.detect(raw_data)
|
||||
encoding = result["encoding"]
|
||||
|
||||
if encoding in ['Windows-1254', 'MacRoman']:
|
||||
encoding = "utf-8"
|
||||
|
||||
with open(file_path, "r", encoding=encoding) as f:
|
||||
return f.read()
|
||||
|
||||
|
|
@ -128,9 +136,15 @@ def parse_text_file_to_record(file_path: str, silent_errors: bool) -> Optional[R
|
|||
text = read_docx_file(file_path)
|
||||
else:
|
||||
text = read_text_file(file_path)
|
||||
|
||||
# if file is json, yaml, or xml, we can parse it
|
||||
if file_path.endswith(".json"):
|
||||
text = json.loads(text)
|
||||
if isinstance(text, dict):
|
||||
text = {k: normalize_text(v) if isinstance(v, str) else v for k, v in text.items()}
|
||||
elif isinstance(text, list):
|
||||
text = [normalize_text(item) if isinstance(item, str) else item for item in text]
|
||||
|
||||
elif file_path.endswith(".yaml") or file_path.endswith(".yml"):
|
||||
text = yaml.safe_load(text)
|
||||
elif file_path.endswith(".xml"):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue