Skip to content

Commit

Permalink
import-fileextensions-of-bitstreamformatregistry (#162)
Browse files (browse the repository at this point in the history)
* file check ruff

* fileextensions comparison
  • Loading branch information
Paurikova2 authored Sep 16, 2024
1 parent dca65c2 commit b8d493f
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 5 deletions.
29 changes: 25 additions & 4 deletions src/pump/_bitstreamformatregistry.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
from collections import defaultdict
from ._utils import read_json, time_method, serialize, deserialize, progress_bar, log_before_import, log_after_import

_logger = logging.getLogger("pump.bitstreamformatregistry")
Expand All @@ -14,12 +15,17 @@ class bitstreamformatregistry:
"compare": ["mimetype", "short_description", "support_level"],
}],
["fileextension", {
"compare": ["extension"],
"sql": {
"5": "select bitstreamformatregistry.short_description, fileextension.extension from fileextension inner join bitstreamformatregistry ON fileextension.bitstream_format_id=bitstreamformatregistry.bitstream_format_id",
"7": "select bitstreamformatregistry.short_description, fileextension.extension from fileextension inner join bitstreamformatregistry ON fileextension.bitstream_format_id=bitstreamformatregistry.bitstream_format_id",
"compare": None,
}
}],
]

def __init__(self, bfr_file_str: str):
def __init__(self, bfr_file_str: str, fe_file_str: str):
self._reg = read_json(bfr_file_str)
self._fe = read_json(fe_file_str)
self._imported = {
"reg": 0,
"existed": 0,
Expand Down Expand Up @@ -65,12 +71,18 @@ def import_to(self, dspace):
log_before_import(log_key, expected)

existing_bfr2id = {}
existing_bfr2ext = defaultdict(list)
bfr_js = dspace.fetch_bitstreamregistry()
if bfr_js is not None:
for bf in bfr_js:
existing_bfr2id[bf['shortDescription']] = bf['id']
if bf['description'] == 'Unknown data format':
self._unknown_format_id = bf['id']
existing_bfr2ext[bf['id']] = bf['extensions']

old_bfr2ext = defaultdict(list)
for fe in self._fe:
old_bfr2ext[fe['bitstream_format_id']].append(fe['extension'])

map = {
0: 'UNKNOWN',
Expand All @@ -88,18 +100,25 @@ def import_to(self, dspace):

bf_id = bf['bitstream_format_id']
ext_id = existing_bfr2id.get(bf['short_description'], None)

if ext_id is not None:
self._imported["existed"] += 1
_logger.debug(
f'Bitstreamformatregistry [{bf["short_description"]}] already exists!')
# check file extensions
old_ext = old_bfr2ext[bf_id]
new_ext = existing_bfr2ext[ext_id]
if set(old_ext) != set(new_ext):
_logger.warning(
f'Fileextensions for bitstreamformatregistry [{bf["short_description"]}] do not match! '
f'Old extensions: {[str(f) for f in old_ext]} New extensions: {[str(f) for f in new_ext]}')
else:
data = {
'mimetype': bf['mimetype'],
'description': bf['description'],
'shortDescription': bf['short_description'],
'supportLevel': level_str,
'internal': bf['internal']
'internal': bf['internal'],
'extensions': old_bfr2ext[bf_id]
}
try:
resp = dspace.put_bitstreamregistry(data)
Expand All @@ -124,6 +143,7 @@ def serialize(self, file_str: str):
"imported": self._imported,
"unknown_format_id": self._unknown_format_id,
"id2mimetype": self._id2mimetype,
"fe": self._fe
}
serialize(file_str, data)

Expand All @@ -134,3 +154,4 @@ def deserialize(self, file_str: str):
self._imported = data["imported"]
self._unknown_format_id = data["unknown_format_id"]
self._id2mimetype = data["id2mimetype"]
self._fe = data["fe"]
2 changes: 1 addition & 1 deletion src/pump/_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def _f(name): return os.path.join(env["input"]["datadir"], name)
)

self.bitstreamformatregistry = bitstreamformatregistry(
_f("bitstreamformatregistry.json")
_f("bitstreamformatregistry.json"), _f("fileextension.json")
)

self.licenses = licenses(
Expand Down

0 comments on commit b8d493f

Please sign in to comment.