Skip to content

Commit

Permalink
conflicts resolved
Browse files Browse the repository at this point in the history
  • Loading branch information
Paurikova2 committed Oct 28, 2024
2 parents af0fb2a + a53d3c2 commit 29f4b2b
Showing 1 changed file with 57 additions and 15 deletions.
72 changes: 57 additions & 15 deletions tools/add_metadata/add_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import project_settings # noqa
from dspace_rest_client.models import Item
from utils import init_logging, update_settings # noqa
logging.getLogger("dspace.client").setLevel(logging.WARNING)

_logger = logging.getLogger()

Expand Down Expand Up @@ -56,13 +57,13 @@ def parse(self) -> bool:
if len(self._d) < 1:
return False

formats = ['%Y/%m/%d', '%d/%m/%Y', '%Y.%m.%d', '%d.%m.%Y', '%d. %m. %Y', '%Y',
formats = ['%Y/%m/%d', '%d/%m/%Y', '%Y.%m.%d', '%d.%m.%Y', '%Y',
'%Y-%m', '%m-%Y', '%Y/%m', '%m/%Y', '%Y.%m', '%m.%Y']
for fmt in formats:
try:
datetime_obj = datetime.strptime(self._d, fmt)
# Normalize date to 'YYYY-MM-DD'
if fmt in ['%Y-%m', '%Y/%m', '%Y.%m', '%m-%Y', '%m/%Y', '%m.%Y']:
if fmt in ['%Y-%m', '%Y/%m', '%Y.%m']:
self._d = datetime_obj.strftime('%Y-%m-01')
elif fmt == '%Y':
self._d = datetime_obj.strftime('%Y-01-01')
Expand Down Expand Up @@ -160,6 +161,8 @@ def add_new_metadata(self, item) -> bool:
if date_meta is None:
continue
id_str = f"Item [{uuid}]: [{from_mtd}]"
if len(date_meta) != 1:
_logger.warning(f"{id_str}: more than one value {date_meta}")

# If there is more than one value, get only the first one
date_val = date(date_meta[0]["value"])
Expand Down Expand Up @@ -222,6 +225,35 @@ def update(self, item: dict) -> bool:
return self.add_new_metadata(item)


class additional_stats:
    """Collect side statistics about the items passed to `update`.

    Two things are tracked:
      * how many items share the same first `dc.title` value, and
      * which items carry more than one value in metadata fields that are
        expected to hold at most one value.
    """

    def __init__(self, single_value_keys=('dc.date.issued',)):
        """Create empty counters.

        Args:
            single_value_keys: metadata field name(s) expected to hold at
                most one value per item. A single string is accepted as a
                shorthand for a one-element collection. Defaults to checking
                only `dc.date.issued`.
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(single_value_keys, str):
            single_value_keys = (single_value_keys,)
        # Snapshot into a tuple so later mutation by the caller (or a one-shot
        # generator) cannot change which fields are checked.
        self._single_value_keys = tuple(single_value_keys)
        # first `dc.title` value -> number of items seen with that title
        self._titles = defaultdict(int)
        # metadata field name -> uuids of items with more than one value
        self._doubles = defaultdict(list)

    def update(self, item: dict):
        """Record statistics for one item.

        Args:
            item: dict with a 'uuid' entry and a 'metadata' dict mapping
                field names to lists of value dicts (each title entry is
                read through its 'value' key).

        Raises:
            KeyError: if `item` has no 'uuid' or no 'metadata' entry.
        """
        uuid = item['uuid']
        metadata = item['metadata']

        # Only the first title is counted, even when several are present.
        dc_titles = metadata.get('dc.title', [])
        if dc_titles:
            self._titles[dc_titles[0]['value']] += 1

        for key in self._single_value_keys:
            if len(metadata.get(key, [])) > 1:
                self._doubles[key].append(uuid)

    def print_info(self, show_limit=100):
        """Log the collected statistics through the module logger.

        Args:
            show_limit: maximum number of duplicated titles to list; the
                summary line always reports the full counts.
        """
        duplicates = {k: v for k, v in self._titles.items() if v > 1}
        _logger.info(
            f"Duplicates {len(duplicates)} ({sum(duplicates.values())}) (showing first {show_limit}):")
        for i, (k, v) in enumerate(duplicates.items()):
            if i >= show_limit:
                break
            _logger.info(f"Title [{k}] : {v}")

        if self._doubles:
            _logger.info("Multiple values when expecting at most 1:")
            for k, v in self._doubles.items():
                _logger.info(f"{k}: {v}")


if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Add metadata for DSpace items")
parser.add_argument("--to_mtd_field",
Expand All @@ -234,6 +266,7 @@ def update(self, item: dict) -> bool:
parser.add_argument("--password", type=str, default=env["backend"]["password"])
parser.add_argument("--dry-run", action='store_true', default=False)
args = parser.parse_args()
_logger.info(f"Arguments: {args}")

start = time.time()

Expand All @@ -242,14 +275,17 @@ def update(self, item: dict) -> bool:

upd = updater(dspace_be, args.from_mtd_field, args.to_mtd_field, dry_run=args.dry_run)

titles = defaultdict(int)
stats = additional_stats()

# Process items
len_all_items = 0
len_used_items = 0
for items in dspace_be.iter_items():
len_all_items += len(items)
items = [item for item in items if not item['withdrawn'] and item['inArchive']]
len_used_items += len(items)
for item in items:
dc_titles = item['metadata'].get('dc.title', [])
if len(dc_titles) > 0:
titles[dc_titles[0]['value']] += 1
stats.update(item)
upd.update(item)

_logger.info(40 * "=")
Expand All @@ -268,16 +304,22 @@ def update(self, item: dict) -> bool:
_logger.info(f"Date invalid [{len(msgs)}]:\n\t{msgs}")

_logger.info(40 * "=")
show_limit = 100
duplicates = {k: v for k, v in titles.items() if v > 1}
_logger.info("Duplicates {len(duplicates)} (showing first {show_limit}:")
for i, (k, v) in enumerate(duplicates.items()):
if i >= show_limit:
break
_logger.info(f"Title [{k}] : {v}")
stats.print_info()

_logger.info(40 * "=")
_logger.info("Statistics:")
_logger.info("Update statistics:")
for k, v in upd.info.items():
_logger.info(f"{k:25s}: {len(v):6d}")
_logger.info(f"Total time: {time.time() - start:.2f} s")
took = time.time() - start

_logger.info(40 * "=")
_logger.info("Counts:")
_logger.info(f"Total items: {len_all_items}")
_logger.info(f"Used items: {len_used_items}")
# sets are not counted
_logger.info(
f"Sum of updates: {sum(len(x) for x in upd.info.values() if isinstance(x, list))}")

_logger.info(40 * "=")
_logger.info(
f"Total time: {took:.2f} s [{time.strftime('%H:%M:%S', time.gmtime(took))}]")

0 comments on commit 29f4b2b

Please sign in to comment.