From 1e46528c6500dfb4f2836f0a0b36a2a7449ad951 Mon Sep 17 00:00:00 2001 From: jm Date: Thu, 24 Oct 2024 20:26:00 +0200 Subject: [PATCH 1/2] additional info, logging pollution reduced --- libs/dspace-rest-python | 2 +- tools/add_metadata/add_metadata.py | 59 +++++++++++++++++++++++------- 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/libs/dspace-rest-python b/libs/dspace-rest-python index 36b2f0c..5724696 160000 --- a/libs/dspace-rest-python +++ b/libs/dspace-rest-python @@ -1 +1 @@ -Subproject commit 36b2f0cd3ab1b492ad700df740dca4d3848c0e76 +Subproject commit 57246965e89c707ae9f3f4821650793c627ecd27 diff --git a/tools/add_metadata/add_metadata.py b/tools/add_metadata/add_metadata.py index 38d8877..a151a78 100644 --- a/tools/add_metadata/add_metadata.py +++ b/tools/add_metadata/add_metadata.py @@ -16,6 +16,7 @@ import project_settings # noqa from dspace_rest_client.models import Item from utils import init_logging, update_settings # noqa +logging.getLogger("dspace.client").setLevel(logging.WARNING) _logger = logging.getLogger() @@ -159,6 +160,8 @@ def add_new_metadata(self, item) -> bool: if date_meta is None: continue id_str = f"Item [{uuid}]: [{from_mtd}]" + if len(date_meta) != 1: + _logger.warning(f"{id_str}: more than one value {date_meta}") # If there is more than one value, get only the first one date_val = date(date_meta[0]["value"]) @@ -199,6 +202,26 @@ def update(self, item: dict) -> bool: return self.add_new_metadata(item) +class additional_stats: + + def __init__(self): + self._titles = defaultdict(int) + + def update(self, item: dict): + dc_titles = item['metadata'].get('dc.title', []) + if len(dc_titles) > 0: + self._titles[dc_titles[0]['value']] += 1 + + def print_info(self, show_limit=100): + duplicates = {k: v for k, v in self._titles.items() if v > 1} + _logger.info( + f"Duplicates {len(duplicates)} ({sum(duplicates.values())}) (showing first {show_limit}):") + for i, (k, v) in enumerate(duplicates.items()): + if i >= show_limit: + break + _logger.info(f"Title [{k}] : {v}") + + if __name__ == '__main__': parser = argparse.ArgumentParser(description="Add metadata for DSpace items") parser.add_argument("--to_mtd_field", @@ -211,6 +234,7 @@ def update(self, item: dict) -> bool: parser.add_argument("--password", type=str, default=env["backend"]["password"]) parser.add_argument("--dry-run", action='store_true', default=False) args = parser.parse_args() + _logger.info(f"Arguments: {args}") start = time.time() @@ -219,14 +243,17 @@ def update(self, item: dict) -> bool: upd = updater(dspace_be, args.from_mtd_field, args.to_mtd_field, dry_run=args.dry_run) - titles = defaultdict(int) + stats = additional_stats() + # Process items + len_all_items = 0 + len_used_items = 0 for items in dspace_be.iter_items(): + len_all_items += len(items) items = [item for item in items if not item['withdrawn'] and item['inArchive']] + len_used_items += len(items) for item in items: - dc_titles = item['metadata'].get('dc.title', []) - if len(dc_titles) > 0: - titles[dc_titles[0]['value']] += 1 + stats.update(item) upd.update(item) _logger.info(40 * "=") @@ -245,16 +272,22 @@ def update(self, item: dict) -> bool: _logger.info(f"Date invalid [{len(msgs)}]:\n\t{msgs}") _logger.info(40 * "=") - show_limit = 100 - duplicates = {k: v for k, v in titles.items() if v > 1} - _logger.info("Duplicates {len(duplicates)} (showing first {show_limit}:") - for i, (k, v) in enumerate(duplicates.items()): - if i >= show_limit: - break - _logger.info(f"Title [{k}] : {v}") + stats.print_info() _logger.info(40 * "=") - _logger.info("Statistics:") + _logger.info("Update statistics:") for k, v in upd.info.items(): _logger.info(f"{k:25s}: {len(v):6d}") - _logger.info(f"Total time: {time.time() - start:.2f} s") + took = time.time() - start + + _logger.info(40 * "=") + _logger.info("Counts:") + _logger.info(f"Total items: {len_all_items}") + _logger.info(f"Used items: {len_used_items}") + # sets are not counted + _logger.info( + f"Sum of updates: {sum(len(x) for x in upd.info.values() if isinstance(x, list))}") + + _logger.info(40 * "=") + _logger.info( + f"Total time: {took:.2f} s [{time.strftime('%H:%M:%S', time.gmtime(took))}]") From a53d3c24992e116a6aab4581472b44b88c869fa6 Mon Sep 17 00:00:00 2001 From: jm Date: Mon, 28 Oct 2024 09:02:10 +0100 Subject: [PATCH 2/2] show doubles --- tools/add_metadata/add_metadata.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/add_metadata/add_metadata.py b/tools/add_metadata/add_metadata.py index a151a78..6c391e8 100644 --- a/tools/add_metadata/add_metadata.py +++ b/tools/add_metadata/add_metadata.py @@ -197,6 +197,8 @@ def update(self, item: dict) -> bool: # Check if the target metadata field exists and is not empty date_meta = item_mtd.get(self._to_mtd_field, None) if date_meta is not None: + if len(date_meta) != 1: + _logger.critical(f"{uuid}: more than one value {date_meta}") return self.update_existing_metadata(item, date_meta[0]["value"]) else: return self.add_new_metadata(item) @@ -206,11 +208,16 @@ class additional_stats: def __init__(self): self._titles = defaultdict(int) + self._doubles = defaultdict(list) def update(self, item: dict): + uuid = item['uuid'] dc_titles = item['metadata'].get('dc.title', []) if len(dc_titles) > 0: self._titles[dc_titles[0]['value']] += 1 + key = 'dc.date.issued' + if len(item['metadata'].get(key, [])) > 1: + self._doubles[key].append(uuid) def print_info(self, show_limit=100): duplicates = {k: v for k, v in self._titles.items() if v > 1} @@ -220,6 +227,10 @@ def print_info(self, show_limit=100): if i >= show_limit: break _logger.info(f"Title [{k}] : {v}") + if len(self._doubles) > 0: + _logger.info("Multiple values when expecting at most 1:") + for k, v in self._doubles.items(): + _logger.info(f"{k}: {v}") if __name__ == '__main__':