Skip to content

Commit

Permalink
Detect metadata file encoding.
Browse files Browse the repository at this point in the history
  • Loading branch information
ladrians committed Sep 26, 2024
1 parent ec52f14 commit 7a022d6
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 6 deletions.
2 changes: 1 addition & 1 deletion amazon_s3/s3reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,7 @@ def augment_metadata(
initial_metadata['url'] = doc_url
else:
# Update URL
source_url = f"{self.source_base_url}?{self.source_doc_id}={id}"
source_url = f"{self.source_base_url}?{self.source_doc_id}={id}&CONTDISP=INLINE"
initial_metadata['url'] = source_url

description = f"{name} | {date_string_description} | {activity}"
Expand Down
13 changes: 12 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ docx2txt = "^0.8"
azure-identity = "^1.17.1"
azure-keyvault-secrets = "^4.8.0"
nltk = "3.8.1"
chardet = "^5.2.0"

[tool.poetry.group.dev.dependencies]
pylint = "^3.1.0"
Expand Down
16 changes: 12 additions & 4 deletions saia_ingest/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import logging
import yaml
import requests
import chardet

def get_yaml_config(yaml_file):
# Load the configuration from the YAML file
Expand Down Expand Up @@ -47,13 +48,20 @@ def get_metadata_file(file_path, file_name, metadata_extension = '.json') -> di
ret = load_json_file(metadata_file)
return ret

def detect_encoding(file_path):
    """Detect the character encoding of a file using chardet.

    Reads the whole file as bytes and runs chardet's detector over it.

    :param file_path: path to the file to inspect.
    :return: the detected encoding name (e.g. 'utf-8', 'ISO-8859-1').
        Falls back to 'utf-8' when chardet cannot determine an encoding
        (e.g. for an empty file, where chardet reports None).
    """
    with open(file_path, 'rb') as file:
        raw_data = file.read()
    result = chardet.detect(raw_data)
    # chardet returns {'encoding': None, ...} when detection fails;
    # default to UTF-8 instead of propagating None to open().
    return result['encoding'] or 'utf-8'

def load_json_file(file_path) -> dict:
    """Load a JSON file, auto-detecting its character encoding.

    The file's encoding is detected first (via detect_encoding) so that
    non-UTF-8 metadata files can still be parsed.

    :param file_path: path to the JSON file.
    :return: the parsed JSON content as a dict, or None if the file
        could not be read or parsed (the error is logged, not raised).
    """
    ret = None
    encoding = detect_encoding(file_path)
    try:
        with open(file_path, 'r', encoding=encoding) as json_file:
            ret = json.load(json_file)
    except Exception as e:
        # Best-effort loader: callers treat None as "no metadata",
        # so log the failure instead of propagating it.
        logging.getLogger().error(f"Error reading json: {e}")
    return ret

def search_failed_files(directory, failed_status):
Expand All @@ -69,7 +77,7 @@ def search_failed_files(directory, failed_status):
data['file_path'] = file_path
file_list.append(data)
except json.JSONDecodeError:
print(f"Error decoding JSON in file: {file_path}")
logging.getLogger().error(f"Error decoding JSON in failed file: {file_path}")
return file_list

def find_value_by_key(metadata_list, key):
Expand All @@ -96,7 +104,7 @@ def search_fields_values(directory, fields_to_exclude = []):
if not isinstance(data['fields'][key], list):
dict_of_sets[key].append(data['fields'][key])
except json.JSONDecodeError:
print(f"Error decoding JSON in file: {file_path}")
logging.getLogger().error(f"Error decoding JSON in file: {file_path}")
for key in dict_of_sets:
dict_of_sets[key] = list(set(dict_of_sets[key]))
return dict_of_sets
Expand Down

0 comments on commit 7a022d6

Please sign in to comment.