diff --git a/libs/dspace-rest-python b/libs/dspace-rest-python
index 36b2f0c..5724696 160000
--- a/libs/dspace-rest-python
+++ b/libs/dspace-rest-python
@@ -1 +1 @@
-Subproject commit 36b2f0cd3ab1b492ad700df740dca4d3848c0e76
+Subproject commit 57246965e89c707ae9f3f4821650793c627ecd27
diff --git a/tools/add_metadata/add_metadata.py b/tools/add_metadata/add_metadata.py
index 38d8877..a151a78 100644
--- a/tools/add_metadata/add_metadata.py
+++ b/tools/add_metadata/add_metadata.py
@@ -16,6 +16,7 @@
 import project_settings  # noqa
 from dspace_rest_client.models import Item
 from utils import init_logging, update_settings  # noqa
 
+logging.getLogger("dspace.client").setLevel(logging.WARNING)
 _logger = logging.getLogger()
 
@@ -159,6 +160,8 @@ def add_new_metadata(self, item) -> bool:
             if date_meta is None:
                 continue
             id_str = f"Item [{uuid}]: [{from_mtd}]"
+            if len(date_meta) != 1:
+                _logger.warning(f"{id_str}: more than one value {date_meta}")
             # If there is more than one value, get only the first one
             date_val = date(date_meta[0]["value"])
 
@@ -199,6 +202,26 @@ def update(self, item: dict) -> bool:
         return self.add_new_metadata(item)
 
 
+class additional_stats:
+
+    def __init__(self):
+        self._titles = defaultdict(int)
+
+    def update(self, item: dict):
+        dc_titles = item['metadata'].get('dc.title', [])
+        if len(dc_titles) > 0:
+            self._titles[dc_titles[0]['value']] += 1
+
+    def print_info(self, show_limit=100):
+        duplicates = {k: v for k, v in self._titles.items() if v > 1}
+        _logger.info(
+            f"Duplicates {len(duplicates)} ({sum(duplicates.values())}) (showing first {show_limit}):")
+        for i, (k, v) in enumerate(duplicates.items()):
+            if i >= show_limit:
+                break
+            _logger.info(f"Title [{k}] : {v}")
+
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Add metadata for DSpace items")
     parser.add_argument("--to_mtd_field",
@@ -211,6 +234,7 @@ def update(self, item: dict) -> bool:
     parser.add_argument("--password", type=str, default=env["backend"]["password"])
     parser.add_argument("--dry-run", action='store_true', default=False)
     args = parser.parse_args()
+    _logger.info(f"Arguments: {args}")
 
     start = time.time()
 
@@ -219,14 +243,17 @@ def update(self, item: dict) -> bool:
     upd = updater(dspace_be, args.from_mtd_field,
                   args.to_mtd_field, dry_run=args.dry_run)
-    titles = defaultdict(int)
+    stats = additional_stats()
 
+    # Process items
+    len_all_items = 0
+    len_used_items = 0
     for items in dspace_be.iter_items():
+        len_all_items += len(items)
         items = [item for item in items if not item['withdrawn'] and item['inArchive']]
+        len_used_items += len(items)
         for item in items:
-            dc_titles = item['metadata'].get('dc.title', [])
-            if len(dc_titles) > 0:
-                titles[dc_titles[0]['value']] += 1
+            stats.update(item)
             upd.update(item)
 
     _logger.info(40 * "=")
 
@@ -245,16 +272,22 @@ def update(self, item: dict) -> bool:
        _logger.info(f"Date invalid [{len(msgs)}]:\n\t{msgs}")
 
     _logger.info(40 * "=")
-    show_limit = 100
-    duplicates = {k: v for k, v in titles.items() if v > 1}
-    _logger.info("Duplicates {len(duplicates)} (showing first {show_limit}:")
-    for i, (k, v) in enumerate(duplicates.items()):
-        if i >= show_limit:
-            break
-        _logger.info(f"Title [{k}] : {v}")
+    stats.print_info()
 
     _logger.info(40 * "=")
-    _logger.info("Statistics:")
+    _logger.info("Update statistics:")
     for k, v in upd.info.items():
         _logger.info(f"{k:25s}: {len(v):6d}")
-    _logger.info(f"Total time: {time.time() - start:.2f} s")
+    took = time.time() - start
+
+    _logger.info(40 * "=")
+    _logger.info("Counts:")
+    _logger.info(f"Total items: {len_all_items}")
+    _logger.info(f"Used items: {len_used_items}")
+    # sets are not counted
+    _logger.info(
+        f"Sum of updates: {sum(len(x) for x in upd.info.values() if isinstance(x, list))}")
+
+    _logger.info(40 * "=")
+    _logger.info(
+        f"Total time: {took:.2f} s [{time.strftime('%H:%M:%S', time.gmtime(took))}]")