From 511fe6a320d9f20f6d2eb0f3e2df27c7c95cc18a Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Thu, 17 Oct 2024 13:42:36 +0200 Subject: [PATCH 01/16] checkstyle violations --- src/dspace/_rest.py | 20 +++++ .../create_dc_date_issued.py | 85 +++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 tools/dc_date_issued_searching/create_dc_date_issued.py diff --git a/src/dspace/_rest.py b/src/dspace/_rest.py index c275e35..dcb930d 100644 --- a/src/dspace/_rest.py +++ b/src/dspace/_rest.py @@ -333,6 +333,26 @@ def fetch_item(self, uuid: str): _logger.debug(f"Importing [] using [{url}]") return self._fetch(url, self.get, None) + def fetch_items(self): + url = 'core/items' + _logger.debug(f"Fatch [] using [{url}]") + page = 0 + items = [] + has_more = True + while has_more: + r = self._fetch(url, self.get, "_embedded", + params={"page": page, "size": 100}) + if not r: + return items + key = "items" + if key in r: + r = r[key] + items.extend(r) + else: + _logger.warning(f"Key [{key}] does not exist in response: {r}") + page += 1 + return items + def put_ws_item(self, param: dict, data: dict): url = 'clarin/import/workspaceitem' _logger.debug(f"Importing [{data}] using [{url}]") diff --git a/tools/dc_date_issued_searching/create_dc_date_issued.py b/tools/dc_date_issued_searching/create_dc_date_issued.py new file mode 100644 index 0000000..0520ea1 --- /dev/null +++ b/tools/dc_date_issued_searching/create_dc_date_issued.py @@ -0,0 +1,85 @@ +import argparse +import logging +import os +import sys + +_this_dir = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, os.path.join(_this_dir, "../../src")) +sys.path.insert(0, os.path.join(_this_dir, "../../src/pump")) + +import dspace # noqa +import settings # noqa +import project_settings # noqa +from dspace_rest_client.models import Item +from utils import init_logging, update_settings # noqa + +_logger = logging.getLogger() + +# env settings, update with project_settings +env = update_settings(settings.env, project_settings.settings) +init_logging(_logger, env["log_file"]) + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description="Add metadata for DSpace items") + parser.add_argument("--to_mtd_field", + type=str, required=True, help="Metadata field that we want created.") + parser.add_argument( + "--from_mtd_field", + type=str, + nargs='+', # Accept one or more values + required=True, + help="Metadata field(s) than value(s) can be used." + ) + + args = parser.parse_args() + dspace_be = dspace.rest( + env["backend"]["endpoint"], + env["backend"]["user"], + env["backend"]["password"], + env["backend"]["authentication"] + ) + + all_items = dspace_be.fetch_items() + items = [] + # Check which items do not contain dc.date.issued + for item in all_items: + # Check if item is withdrawn or is not in archive + if item['withdrawn'] or not item['inArchive']: + continue + mtd = item['metadata'] + if args.to_mtd_field not in mtd: + items.append(item) + + # Create missing mtd + from_mtd_field = args.from_mtd_field + created = [] + no_created = [] + error_items = [] + for item in items: + mtd = item["metadata"] + found = False + for from_mtd in from_mtd_field: + if from_mtd in mtd: + if len(mtd[from_mtd]) == 0: + _logger.info( + f"No values for metadata [{from_mtd}] of item [{item['uuid']}]") + break + found = True + _logger.info( + f"Metadata [{args.to_mtd_field}] replaced by [{from_mtd}] for item [{item['uuid']}]") + val = mtd[from_mtd][0]["value"] + r = dspace_be.client.add_metadata(Item(item), args.to_mtd_field, val) + if r is not None: + created.append(item["uuid"]) + break + else: + logging.warning( + f"Error during creating metadata [{args.to_mtd_field}] for item [{item['uuid']}]") + error_items.append(item["uuid"]) + if not found: + no_created.append(item["id"]) + _logger.info(f"Metadata [{args.to_mtd_field}] added to items: {created}") + _logger.warning(f"Metadata [{args.to_mtd_field}] do not added to items: {no_created}") + _logger.warning( + f"Error during added metadata [{args.to_mtd_field}] to items: {error_items}") From 243c824576c07245b97ebefbb9386ac6b4a8f6fd Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Thu, 17 Oct 2024 13:46:45 +0200 Subject: [PATCH 02/16] checkstyle violations --- src/dspace/_rest.py | 18 ++- .../create_dc_date_issued.py | 110 ++++++++++-------- 2 files changed, 69 insertions(+), 59 deletions(-) diff --git a/src/dspace/_rest.py b/src/dspace/_rest.py index dcb930d..938f187 100644 --- a/src/dspace/_rest.py +++ b/src/dspace/_rest.py @@ -338,19 +338,17 @@ def fetch_items(self): _logger.debug(f"Fatch [] using [{url}]") page = 0 items = [] - has_more = True - while has_more: + while True: r = self._fetch(url, self.get, "_embedded", params={"page": page, "size": 100}) - if not r: - return items - key = "items" - if key in r: - r = r[key] - items.extend(r) + if r is None: + break + items_data = r.get("items", []) + if items_data: + items.extend(items_data) else: - _logger.warning(f"Key [{key}] does not exist in response: {r}") - page += 1 + _logger.warning(f"Key [items] does not exist in response: {r}") + page += r return items def put_ws_item(self, param: dict, data: dict): diff --git a/tools/dc_date_issued_searching/create_dc_date_issued.py b/tools/dc_date_issued_searching/create_dc_date_issued.py index 0520ea1..98a3d9b 100644 --- a/tools/dc_date_issued_searching/create_dc_date_issued.py +++ b/tools/dc_date_issued_searching/create_dc_date_issued.py @@ -19,67 +19,79 @@ env = update_settings(settings.env, project_settings.settings) init_logging(_logger, env["log_file"]) -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description="Add metadata for DSpace items") + +def parse_arguments(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description="Add metadata for DSpace items") parser.add_argument("--to_mtd_field", - type=str, required=True, help="Metadata field that we want created.") - parser.add_argument( - "--from_mtd_field", - type=str, - nargs='+', # Accept one or more values - required=True, - help="Metadata field(s) than value(s) can be used." - ) + type=str, required=True, help="Metadata field to be created.") + parser.add_argument("--from_mtd_field", + type=str, nargs='+', required=True, + help="Metadata field(s) from which value(s) can be used.") + return parser.parse_args() - args = parser.parse_args() - dspace_be = dspace.rest( - env["backend"]["endpoint"], - env["backend"]["user"], - env["backend"]["password"], - env["backend"]["authentication"] - ) +def fetch_items(dspace_be): + """Fetch items from DSpace backend, filtering out withdrawn or non-archived items.""" all_items = dspace_be.fetch_items() - items = [] - # Check which items do not contain dc.date.issued - for item in all_items: - # Check if item is withdrawn or is not in archive - if item['withdrawn'] or not item['inArchive']: - continue - mtd = item['metadata'] - if args.to_mtd_field not in mtd: - items.append(item) - - # Create missing mtd - from_mtd_field = args.from_mtd_field - created = [] - no_created = [] - error_items = [] + return [ + item for item in all_items + if not item['withdrawn'] and item['inArchive'] and args.to_mtd_field not in item['metadata'] + ] + + +def create_missing_metadata(dspace_be, items, from_mtd_fields, to_mtd_field): + """Create missing metadata for items based on provided fields.""" + created, not_created, error_items = [], [], [] + for item in items: mtd = item["metadata"] found = False - for from_mtd in from_mtd_field: - if from_mtd in mtd: - if len(mtd[from_mtd]) == 0: - _logger.info( - f"No values for metadata [{from_mtd}] of item [{item['uuid']}]") - break + + for from_mtd in from_mtd_fields: + if from_mtd in mtd and mtd[from_mtd]: found = True - _logger.info( - f"Metadata [{args.to_mtd_field}] replaced by [{from_mtd}] for item [{item['uuid']}]") val = mtd[from_mtd][0]["value"] - r = dspace_be.client.add_metadata(Item(item), args.to_mtd_field, val) - if r is not None: + _logger.info( + f"Metadata [{to_mtd_field}] replaced by [{from_mtd}] for item [{item['uuid']}]") + + # Add the new metadata + if dspace_be.client.add_metadata(Item(item), to_mtd_field, val): created.append(item["uuid"]) - break else: - logging.warning( - f"Error during creating metadata [{args.to_mtd_field}] for item [{item['uuid']}]") + _logger.warning( + f"Error creating metadata [{to_mtd_field}] for item [{item['uuid']}]") error_items.append(item["uuid"]) + + break # Stop searching once we find a valid field + if not found: - no_created.append(item["id"]) + not_created.append(item["id"]) + + return created, not_created, error_items + + +if __name__ == '__main__': + args = parse_arguments() + + # Initialize DSpace backend + dspace_be = dspace.rest( + env["backend"]["endpoint"], + env["backend"]["user"], + env["backend"]["password"], + env["backend"]["authentication"] + ) + + # Fetch and filter items + items_to_update = fetch_items(dspace_be) + + # Create missing metadata + created, not_created, error_items = create_missing_metadata( + dspace_be, items_to_update, args.from_mtd_field, args.to_mtd_field + ) + + # Log results _logger.info(f"Metadata [{args.to_mtd_field}] added to items: {created}") - _logger.warning(f"Metadata [{args.to_mtd_field}] do not added to items: {no_created}") + _logger.warning(f"Metadata [{args.to_mtd_field}] not added to items: {not_created}") _logger.warning( - f"Error during added metadata [{args.to_mtd_field}] to items: {error_items}") + f"Error adding metadata [{args.to_mtd_field}] to items: {error_items}") From e8e5e24c7d93a3a6b5f60a8a2a35914d92e7b910 Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Thu, 17 Oct 2024 13:55:34 +0200 Subject: [PATCH 03/16] checkstyle violations --- tools/add_metadata/README.md | 6 ++++++ .../add_metadata.py} | 0 2 files changed, 6 insertions(+) create mode 100644 tools/add_metadata/README.md rename tools/{dc_date_issued_searching/create_dc_date_issued.py => add_metadata/add_metadata.py} (100%) diff --git a/tools/add_metadata/README.md b/tools/add_metadata/README.md new file mode 100644 index 0000000..f489d70 --- /dev/null +++ b/tools/add_metadata/README.md @@ -0,0 +1,6 @@ +# add+metadata.py + +This script adds new metadata to items that are missing it. The values for this new metadata are taken from the existing input metadata field. +``` +python add_metadata.py --to_mtd_field dc.date.issued --from_mtd_field dc.date.submitted dc.date.committed dc.date.defense +``` diff --git a/tools/dc_date_issued_searching/create_dc_date_issued.py b/tools/add_metadata/add_metadata.py similarity index 100% rename from tools/dc_date_issued_searching/create_dc_date_issued.py rename to tools/add_metadata/add_metadata.py From b9ec2564acce44eb7def737916e1ad5a1d6573e2 Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Thu, 17 Oct 2024 14:00:34 +0200 Subject: [PATCH 04/16] fix errors --- src/dspace/_rest.py | 7 ++++--- tools/add_metadata/README.md | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/dspace/_rest.py b/src/dspace/_rest.py index 938f187..24b85a2 100644 --- a/src/dspace/_rest.py +++ b/src/dspace/_rest.py @@ -343,12 +343,13 @@ def fetch_items(self): params={"page": page, "size": 100}) if r is None: break - items_data = r.get("items", []) + key = "items" + items_data = r.get(key, []) if items_data: items.extend(items_data) else: - _logger.warning(f"Key [items] does not exist in response: {r}") - page += r + _logger.warning(f"Key [{key}] does not exist in response: {r}") + page += 1 return items def put_ws_item(self, param: dict, data: dict): diff --git a/tools/add_metadata/README.md b/tools/add_metadata/README.md index f489d70..8d983b8 100644 --- a/tools/add_metadata/README.md +++ b/tools/add_metadata/README.md @@ -1,4 +1,4 @@ -# add+metadata.py +# add_metadata.py This script adds new metadata to items that are missing it. The values for this new metadata are taken from the existing input metadata field. ``` From d3366b887afc93806c9d1f39d50975d1c0fcb3f2 Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Fri, 18 Oct 2024 10:31:29 +0200 Subject: [PATCH 05/16] added validation control and correct date conversion --- tools/add_metadata/add_metadata.py | 131 +++++++++++++++++++++-------- 1 file changed, 98 insertions(+), 33 deletions(-) diff --git a/tools/add_metadata/add_metadata.py b/tools/add_metadata/add_metadata.py index 98a3d9b..6458fb5 100644 --- a/tools/add_metadata/add_metadata.py +++ b/tools/add_metadata/add_metadata.py @@ -2,6 +2,7 @@ import logging import os import sys +from datetime import datetime _this_dir = os.path.dirname(os.path.abspath(__file__)) sys.path.insert(0, os.path.join(_this_dir, "../../src")) @@ -34,41 +35,97 @@ def parse_arguments(): def fetch_items(dspace_be): """Fetch items from DSpace backend, filtering out withdrawn or non-archived items.""" all_items = dspace_be.fetch_items() + _logger.info(f"Number of fetched items: {len(all_items)}") return [ item for item in all_items - if not item['withdrawn'] and item['inArchive'] and args.to_mtd_field not in item['metadata'] + if not item['withdrawn'] and item['inArchive'] ] -def create_missing_metadata(dspace_be, items, from_mtd_fields, to_mtd_field): +def is_valid_date(date: str): + """Check if the given string is a valid date.""" + try: + datetime.strptime(date, '%Y-%m-%d') + return True + except ValueError as e: + _logger.warning(f"[{date}] is not valid date. Error: {e}") + return False + + +def convert_to_date(value: str): + """Convert the value to a date format. Normalize date to 'YYYY-MM-DD' format, filling missing parts with '01'.""" + formats = ['%Y/%m/%d', '%d/%m/%Y', '%Y.%m.%d', '%d.%m.%Y', '%Y', + '%Y-%m', '%m-%Y', '%Y/%m', '%m/%Y', '%Y.%m', '%m.%Y'] + found = False + for fmt in formats: + try: + datetime_obj = datetime.strptime(value, fmt) + # Normalize date to 'YYYY-MM-DD' + if fmt in ['%Y-%m', '%Y/%m', '%Y.%m']: + return datetime_obj.strftime('%Y-%m-01') + elif fmt == '%Y': + return datetime_obj.strftime('%Y-01-01') + return datetime_obj.strftime('%Y-%m-%d') + except ValueError: + continue + _logger.error(f"Error converting [{value}] to date.") + return None + + +def process_metadata(dspace_be, items, from_mtd_fields, to_mtd_field): """Create missing metadata for items based on provided fields.""" - created, not_created, error_items = [], [], [] + created, updated, not_created, error_items, ok_items = [], [], [], [], [] for item in items: - mtd = item["metadata"] - found = False - - for from_mtd in from_mtd_fields: - if from_mtd in mtd and mtd[from_mtd]: - found = True - val = mtd[from_mtd][0]["value"] - _logger.info( - f"Metadata [{to_mtd_field}] replaced by [{from_mtd}] for item [{item['uuid']}]") - - # Add the new metadata - if dspace_be.client.add_metadata(Item(item), to_mtd_field, val): - created.append(item["uuid"]) - else: - _logger.warning( - f"Error creating metadata [{to_mtd_field}] for item [{item['uuid']}]") - error_items.append(item["uuid"]) - - break # Stop searching once we find a valid field - - if not found: - not_created.append(item["id"]) - - return created, not_created, error_items + uuid = item['uuid'] + item_mtd = item["metadata"] + + if to_mtd_field in item_mtd and item_mtd[to_mtd_field]: + val = item_mtd[to_mtd_field][0]["value"] + if is_valid_date(val): + ok_items.append(uuid) + continue + _logger.info(f"Item [{uuid}] has an invalid date in [{to_mtd_field}]: {val}") + new_mtd = convert_to_date(val) + if new_mtd is None: + _logger.warning(f"Cannot convert [{to_mtd_field}] " + f"to valid date for item [{uuid}]: {val}") + error_items.append(uuid) + continue + item_mtd[to_mtd_field][0]["value"] = new_mtd + item["metadata"] = item_mtd + if dspace_be.client.update_item(Item(item)): + updated.append(uuid) + else: + _logger.error( + f"Error updating [{to_mtd_field}] for item [{uuid}]") + error_items.append(uuid) + else: + found = False + for from_mtd in from_mtd_fields: + if from_mtd in item_mtd and item_mtd[from_mtd]: + val = item_mtd[from_mtd][0]["value"] + if not is_valid_date(val): + val = convert_to_date(val) + if val is None: + _logger.warning(f"Cannot convert [{from_mtd}] " + f"to valid date for item [{uuid}]: {val}") + continue + found = True + _logger.info( + f"Metadata [{to_mtd_field}] created from [{from_mtd}] for item [{uuid}]") + if dspace_be.client.add_metadata(Item(item), to_mtd_field, val): + created.append(uuid) + else: + _logger.warning( + f"Error creating metadata [{to_mtd_field}] for item [{uuid}]") + error_items.append(uuid) + break + + if not found: + not_created.append(uuid) + + return created, updated, not_created, error_items, ok_items if __name__ == '__main__': @@ -85,13 +142,21 @@ def create_missing_metadata(dspace_be, items, from_mtd_fields, to_mtd_field): # Fetch and filter items items_to_update = fetch_items(dspace_be) - # Create missing metadata - created, not_created, error_items = create_missing_metadata( + # Process items + created, updated, not_created, error_items, ok_items = process_metadata( dspace_be, items_to_update, args.from_mtd_field, args.to_mtd_field ) # Log results - _logger.info(f"Metadata [{args.to_mtd_field}] added to items: {created}") - _logger.warning(f"Metadata [{args.to_mtd_field}] not added to items: {not_created}") - _logger.warning( - f"Error adding metadata [{args.to_mtd_field}] to items: {error_items}") + _logger.info(f"Items with correct [{args.to_mtd_field}]: {ok_items}") + _logger.info(f"Items with created [{args.to_mtd_field}]: {created}") + _logger.warning(f"Items where [{args.to_mtd_field}] was not created: {not_created}") + _logger.warning(f"Items with errors during processing: {error_items}") + + _logger.info(f"Number of items to update: {len(items_to_update)}") + _logger.info(f"Number of items with correct [{args.to_mtd_field}]: {len(ok_items)}") + _logger.info(f"Number of items with updated [{args.to_mtd_field}]: {len(updated)}") + _logger.info(f"Number of items with created [{args.to_mtd_field}]: {len(created)}") + _logger.info( + f"Number of items where [{args.to_mtd_field}] was not created: {len(not_created)}") + _logger.info(f"Number of items with errors during processing: {len(error_items)}") From 5b71824a53176ff3cbf7f588d5ec2be5865bad7b Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Fri, 18 Oct 2024 10:34:43 +0200 Subject: [PATCH 06/16] added dc.date to from_mtd_field --- tools/add_metadata/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/add_metadata/README.md b/tools/add_metadata/README.md index 8d983b8..dc7ea68 100644 --- a/tools/add_metadata/README.md +++ b/tools/add_metadata/README.md @@ -2,5 +2,5 @@ This script adds new metadata to items that are missing it. The values for this new metadata are taken from the existing input metadata field. ``` -python add_metadata.py --to_mtd_field dc.date.issued --from_mtd_field dc.date.submitted dc.date.committed dc.date.defense +python add_metadata.py --to_mtd_field dc.date.issued --from_mtd_field dc.date.submitted dc.date.committed dc.date.defense dc.date ``` From 6ffac8f5124d9bdd76ad69b503793bd0313f1ac9 Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Fri, 18 Oct 2024 12:48:47 +0200 Subject: [PATCH 07/16] reauthentification --- tools/add_metadata/add_metadata.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tools/add_metadata/add_metadata.py b/tools/add_metadata/add_metadata.py index 6458fb5..d10a617 100644 --- a/tools/add_metadata/add_metadata.py +++ b/tools/add_metadata/add_metadata.py @@ -68,7 +68,17 @@ def convert_to_date(value: str): return datetime_obj.strftime('%Y-%m-%d') except ValueError: continue - _logger.error(f"Error converting [{value}] to date.") + _logger.warning(f"Error converting [{value}] to date.") + return None + + +def update_item(item: Item): + if dspace_be.client.update_item(item): + return item + # Try to authenticate + _logger.info("Reauthorization during item updating") + if dspace_be.client.authenticate(retry=True): + return dspace_be.client.update_item(item) return None @@ -88,13 +98,13 @@ def process_metadata(dspace_be, items, from_mtd_fields, to_mtd_field): _logger.info(f"Item [{uuid}] has an invalid date in [{to_mtd_field}]: {val}") new_mtd = convert_to_date(val) if new_mtd is None: - _logger.warning(f"Cannot convert [{to_mtd_field}] " - f"to valid date for item [{uuid}]: {val}") + _logger.error(f"Cannot convert [{to_mtd_field}] " + f"to valid date for item [{uuid}]: {val}") error_items.append(uuid) continue item_mtd[to_mtd_field][0]["value"] = new_mtd item["metadata"] = item_mtd - if dspace_be.client.update_item(Item(item)): + if update_item(Item(item)): updated.append(uuid) else: _logger.error( From a635c46f00ad37eaf34127979036522aefb0cc22 Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Fri, 18 Oct 2024 12:56:43 +0200 Subject: [PATCH 08/16] remove retry=False authentification --- tools/add_metadata/add_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/add_metadata/add_metadata.py b/tools/add_metadata/add_metadata.py index d10a617..dc4698e 100644 --- a/tools/add_metadata/add_metadata.py +++ b/tools/add_metadata/add_metadata.py @@ -77,7 +77,7 @@ def update_item(item: Item): return item # Try to authenticate _logger.info("Reauthorization during item updating") - if dspace_be.client.authenticate(retry=True): + if dspace_be.client.authenticate(): return dspace_be.client.update_item(item) return None From b73216bf62341fb8156a441b8c3980c43973b72d Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Mon, 21 Oct 2024 07:36:15 +0200 Subject: [PATCH 09/16] added comments --- tools/add_metadata/add_metadata.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/tools/add_metadata/add_metadata.py b/tools/add_metadata/add_metadata.py index dc4698e..47e0cbe 100644 --- a/tools/add_metadata/add_metadata.py +++ b/tools/add_metadata/add_metadata.py @@ -4,6 +4,7 @@ import sys from datetime import datetime +# Set up directories for imports _this_dir = os.path.dirname(os.path.abspath(__file__)) sys.path.insert(0, os.path.join(_this_dir, "../../src")) sys.path.insert(0, os.path.join(_this_dir, "../../src/pump")) @@ -56,7 +57,6 @@ def convert_to_date(value: str): """Convert the value to a date format. Normalize date to 'YYYY-MM-DD' format, filling missing parts with '01'.""" formats = ['%Y/%m/%d', '%d/%m/%Y', '%Y.%m.%d', '%d.%m.%Y', '%Y', '%Y-%m', '%m-%Y', '%Y/%m', '%m/%Y', '%Y.%m', '%m.%Y'] - found = False for fmt in formats: try: datetime_obj = datetime.strptime(value, fmt) @@ -67,6 +67,7 @@ def convert_to_date(value: str): return datetime_obj.strftime('%Y-01-01') return datetime_obj.strftime('%Y-%m-%d') except ValueError: + # The test format does not match the input date format continue _logger.warning(f"Error converting [{value}] to date.") return None @@ -90,11 +91,17 @@ def process_metadata(dspace_be, items, from_mtd_fields, to_mtd_field): uuid = item['uuid'] item_mtd = item["metadata"] + # Check if the target metadata field exists and is not empty if to_mtd_field in item_mtd and item_mtd[to_mtd_field]: + # If there is more than one value, get only the first one val = item_mtd[to_mtd_field][0]["value"] + + # Check if the date is in the correct format if is_valid_date(val): ok_items.append(uuid) continue + + # Convert date to correct format if necessary _logger.info(f"Item [{uuid}] has an invalid date in [{to_mtd_field}]: {val}") new_mtd = convert_to_date(val) if new_mtd is None: @@ -102,8 +109,12 @@ def process_metadata(dspace_be, items, from_mtd_fields, to_mtd_field): f"to valid date for item [{uuid}]: {val}") error_items.append(uuid) continue + + # Update the item metadata with the converted date item_mtd[to_mtd_field][0]["value"] = new_mtd item["metadata"] = item_mtd + + # Update the item in the database if update_item(Item(item)): updated.append(uuid) else: @@ -112,18 +123,26 @@ def process_metadata(dspace_be, items, from_mtd_fields, to_mtd_field): error_items.append(uuid) else: found = False + # Check other metadata fields to create the target metadata field if not present for from_mtd in from_mtd_fields: + # Check if the target metadata field exists and is not empty if from_mtd in item_mtd and item_mtd[from_mtd]: + # If there is more than one value, get only the first one val = item_mtd[from_mtd][0]["value"] + + # Convert date if necessary if not is_valid_date(val): val = convert_to_date(val) if val is None: _logger.warning(f"Cannot convert [{from_mtd}] " f"to valid date for item [{uuid}]: {val}") continue + found = True _logger.info( f"Metadata [{to_mtd_field}] created from [{from_mtd}] for item [{uuid}]") + + # Update the item in the database if dspace_be.client.add_metadata(Item(item), to_mtd_field, val): created.append(uuid) else: @@ -132,6 +151,7 @@ def process_metadata(dspace_be, items, from_mtd_fields, to_mtd_field): error_items.append(uuid) break + # If no valid metadata field was found, add the item to the not_created list if not found: not_created.append(uuid) @@ -159,6 +179,7 @@ def process_metadata(dspace_be, items, from_mtd_fields, to_mtd_field): # Log results _logger.info(f"Items with correct [{args.to_mtd_field}]: {ok_items}") + _logger.info(f"Items with updated [{args.to_mtd_field}]: {updated}") _logger.info(f"Items with created [{args.to_mtd_field}]: {created}") _logger.warning(f"Items where [{args.to_mtd_field}] was not created: {not_created}") _logger.warning(f"Items with errors during processing: {error_items}") From 686be936a7fe9ad84fb6bc2f2d828dddd8bbfaf0 Mon Sep 17 00:00:00 2001 From: jm Date: Thu, 24 Oct 2024 09:32:40 +0200 Subject: [PATCH 10/16] refactored --- src/dspace/_rest.py | 35 ++- src/utils.py | 16 +- tools/add_metadata/README.md | 4 + tools/add_metadata/add_metadata.py | 345 +++++++++++++++++------------ 4 files changed, 253 insertions(+), 147 deletions(-) diff --git a/src/dspace/_rest.py b/src/dspace/_rest.py index 24b85a2..634afa9 100644 --- a/src/dspace/_rest.py +++ b/src/dspace/_rest.py @@ -333,14 +333,14 @@ def fetch_item(self, uuid: str): _logger.debug(f"Importing [] using [{url}]") return self._fetch(url, self.get, None) - def fetch_items(self): + def fetch_items(self, page_size: int = 100, limit=None): url = 'core/items' - _logger.debug(f"Fatch [] using [{url}]") + _logger.debug(f"Fetch [] using [{url}]") page = 0 items = [] while True: r = self._fetch(url, self.get, "_embedded", - params={"page": page, "size": 100}) + params={"page": page, "size": page_size}) if r is None: break key = "items" @@ -350,8 +350,37 @@ def fetch_items(self): else: _logger.warning(f"Key [{key}] does not exist in response: {r}") page += 1 + + if limit is not None and len(items) > limit: + return items[:limit] return items + def iter_items(self, page_size: int = 100, limit: int = -1): + from tqdm import tqdm + + url = 'core/items' + _logger.debug(f"Fetch iter [] using [{url}]") + page = 0 + len_items = 0 + with tqdm(desc="Fetching items", unit=" items") as pbar: + while True: + r = self._fetch(url, self.get, "_embedded", + params={"page": page, "size": page_size}) + if r is None: + break + key = "items" + items_data = r.get(key, []) + if items_data: + len_items += len(items_data) + yield items_data + else: + _logger.warning(f"Key [{key}] does not exist in response: {r}") + page += 1 + pbar.update(len(items_data)) + + if len_items > limit > 0: + return + def put_ws_item(self, param: dict, data: dict): url = 'clarin/import/workspaceitem' _logger.debug(f"Importing [{data}] using [{url}]") diff --git a/src/utils.py b/src/utils.py index 067591e..d45833a 100644 --- a/src/utils.py +++ b/src/utils.py @@ -15,15 +15,21 @@ def init_logging( os.makedirs(base_log_dir, exist_ok=True) formatter = logging.Formatter(format) - file_handler = logging.FileHandler(log_file) + file_handler = logging.FileHandler(log_file, encoding="utf-8") file_handler.setFormatter(formatter) file_handler.setLevel(file_level) logger.addHandler(file_handler) - console_handler = logging.StreamHandler() - console_handler.setLevel(console_level) - console_handler.setFormatter(formatter) - logger.addHandler(console_handler) + found_stream = None + for h in logger.handlers: + if isinstance(h, logging.StreamHandler): + found_stream = h + break + if found_stream is None: + console_handler = logging.StreamHandler() + console_handler.setLevel(console_level) + console_handler.setFormatter(formatter) + logger.addHandler(console_handler) logger.setLevel(logging.INFO) diff --git a/tools/add_metadata/README.md b/tools/add_metadata/README.md index dc7ea68..ea87740 100644 --- a/tools/add_metadata/README.md +++ b/tools/add_metadata/README.md @@ -4,3 +4,7 @@ This script adds new metadata to items that are missing it. The values for this ``` python add_metadata.py --to_mtd_field dc.date.issued --from_mtd_field dc.date.submitted dc.date.committed dc.date.defense dc.date ``` +Dry run: +``` +python add_metadata.py --dry-run --endpoint="http://dev-5.pc:86/server/api/" --to_mtd_field dc.date.issued --from_mtd_field dc.date.submitted dc.date.committed dc.date.defense dc.date +``` diff --git a/tools/add_metadata/add_metadata.py b/tools/add_metadata/add_metadata.py index 47e0cbe..38d8877 100644 --- a/tools/add_metadata/add_metadata.py +++ b/tools/add_metadata/add_metadata.py @@ -1,8 +1,10 @@ import argparse import logging +import time import os import sys from datetime import datetime +from collections import defaultdict # Set up directories for imports _this_dir = os.path.dirname(os.path.abspath(__file__)) @@ -22,172 +24,237 @@ init_logging(_logger, env["log_file"]) -def parse_arguments(): - """Parse command line arguments.""" - parser = argparse.ArgumentParser(description="Add metadata for DSpace items") - parser.add_argument("--to_mtd_field", - type=str, required=True, help="Metadata field to be created.") - parser.add_argument("--from_mtd_field", - type=str, nargs='+', required=True, - help="Metadata field(s) from which value(s) can be used.") - return parser.parse_args() +class date: + invalid = defaultdict(int) + invalid_but_converted = defaultdict(int) + def __init__(self, d: str): + self._input = d + self._d = d -def fetch_items(dspace_be): - """Fetch items from DSpace backend, filtering out withdrawn or non-archived items.""" - all_items = dspace_be.fetch_items() - _logger.info(f"Number of fetched items: {len(all_items)}") - return [ - item for item in all_items - if not item['withdrawn'] and item['inArchive'] - ] + @property + def input(self) -> str: + return self._input + @property + def value(self) -> str: + return self._d -def is_valid_date(date: str): - """Check if the given string is a valid date.""" - try: - datetime.strptime(date, '%Y-%m-%d') - return True - except ValueError as e: - _logger.warning(f"[{date}] is not valid date. Error: {e}") + def is_valid(self): + """Check if the given string is a valid date.""" + try: + datetime.strptime(self._d, '%Y-%m-%d') + return True + except ValueError as e: + date.invalid[self._d] += 1 + if date.invalid[self._d] == 1: + _logger.warning(f"[{self._d}] is not valid date. Error: {e}") + return False + + def parse(self) -> bool: + """Convert the value to a date format. Normalize date to 'YYYY-MM-DD' format, filling missing parts with '01'.""" + if len(self._d) < 1: + return False + + formats = ['%Y/%m/%d', '%d/%m/%Y', '%Y.%m.%d', '%d.%m.%Y', '%Y', + '%Y-%m', '%m-%Y', '%Y/%m', '%m/%Y', '%Y.%m', '%m.%Y'] + for fmt in formats: + try: + datetime_obj = datetime.strptime(self._d, fmt) + # Normalize date to 'YYYY-MM-DD' + if fmt in ['%Y-%m', '%Y/%m', '%Y.%m']: + self._d = datetime_obj.strftime('%Y-%m-01') + elif fmt == '%Y': + self._d = datetime_obj.strftime('%Y-01-01') + else: + self._d = datetime_obj.strftime('%Y-%m-%d') + return True + except ValueError: + # The test format does not match the input date format + continue + _logger.warning(f"Error converting [{self._d}] to date.") return False -def convert_to_date(value: str): - """Convert the value to a date format. Normalize date to 'YYYY-MM-DD' format, filling missing parts with '01'.""" - formats = ['%Y/%m/%d', '%d/%m/%Y', '%Y.%m.%d', '%d.%m.%Y', '%Y', - '%Y-%m', '%m-%Y', '%Y/%m', '%m/%Y', '%Y.%m', '%m.%Y'] - for fmt in formats: - try: - datetime_obj = datetime.strptime(value, fmt) - # Normalize date to 'YYYY-MM-DD' - if fmt in ['%Y-%m', '%Y/%m', '%Y.%m']: - return datetime_obj.strftime('%Y-%m-01') - elif fmt == '%Y': - return datetime_obj.strftime('%Y-01-01') - return datetime_obj.strftime('%Y-%m-%d') - except ValueError: - # The test format does not match the input date format - continue - _logger.warning(f"Error converting [{value}] to date.") - return None - - -def update_item(item: Item): +def update_item(item_d: dict): + item = Item(item_d) if dspace_be.client.update_item(item): - return item + return True # Try to authenticate _logger.info("Reauthorization during item updating") if dspace_be.client.authenticate(): - return dspace_be.client.update_item(item) - return None - + dso = dspace_be.client.update_item(item) + return dso is not None + return False + + +class updater: + + def __init__(self, dspace_be, from_mtd_fields: list, to_mtd_field: list, dry_run: bool = False): + self._dspace_be = dspace_be + self._from_mtd_fields = from_mtd_fields + self._to_mtd_field = to_mtd_field + self._dry_run = dry_run + self._info = { + "valid": [], + "invalid_date": [], + "invalid_date_all": set(), + "updated": [], + "error_updating": [], + "error_creating": [], + "created": [], + "not_created": [], + } + + @property + def cannot_parse(self): + return self._info["invalid_date_all"] + + @property + def info(self): + return self._info + + def update_existing_metadata(self, item: dict, date_str: str): + uuid = item['uuid'] + item_mtd = item["metadata"] -def process_metadata(dspace_be, items, from_mtd_fields, to_mtd_field): - """Create missing metadata for items based on provided fields.""" - created, updated, not_created, error_items, ok_items = [], [], [], [], [] + id_str = f"Item [{uuid}]: [{self._to_mtd_field}]" + # If there is more than one value, get only the first one + date_val = date(date_str) + if date_val.is_valid(): + self._info["valid"].append((uuid, date_val.input)) + return True + + parsed_ok = date_val.parse() + if parsed_ok is False: + _logger.error(f"{id_str}: cannot convert [{date_val.input}] to date") + self._info["invalid_date"].append((uuid, date_val.input)) + return False + + # Convert date to correct format if necessary + date.invalid_but_converted[date_val.input] += 1 + if date.invalid_but_converted[date_val.input] == 1: + _logger.info(f"{id_str}: invalid date [{date_val.input}] converted") + + # Update the item metadata with the converted date + item_mtd[self._to_mtd_field][0]["value"] = date_val.value + item["metadata"] = item_mtd + + # Update the item in the database + updated_ok = self._dry_run or update_item(item) + if not updated_ok: + _logger.error(f"{id_str}: error updating item") + self._info["error_updating"].append((uuid, date_val.input)) + return False + + self._info["updated"].append((uuid, date_val.input)) + return True - for item in items: + def add_new_metadata(self, item) -> bool: uuid = item['uuid'] item_mtd = item["metadata"] - # Check if the target metadata field exists and is not empty - if to_mtd_field in item_mtd and item_mtd[to_mtd_field]: - # If there is more than one value, get only the first one - val = item_mtd[to_mtd_field][0]["value"] - - # Check if the date is in the correct format - if is_valid_date(val): - ok_items.append(uuid) + for from_mtd in self._from_mtd_fields: + date_meta = item_mtd.get(from_mtd, None) + if date_meta is None: continue + id_str = f"Item [{uuid}]: [{from_mtd}]" - # Convert date to correct format if necessary - _logger.info(f"Item [{uuid}] has an invalid date in [{to_mtd_field}]: {val}") - new_mtd = convert_to_date(val) - if new_mtd is None: - _logger.error(f"Cannot convert [{to_mtd_field}] " - f"to valid date for item [{uuid}]: {val}") - error_items.append(uuid) - continue + # If there is more than one value, get only the first one + date_val = date(date_meta[0]["value"]) + # Convert date if necessary + if not date_val.is_valid(): + if not date_val.parse(): + self._info["invalid_date_all"].add(date_val.input) + continue - # Update the item metadata with the converted date - item_mtd[to_mtd_field][0]["value"] = new_mtd - item["metadata"] = item_mtd + _logger.debug(f"{id_str}: created...") # Update the item in the database - if update_item(Item(item)): - updated.append(uuid) - else: - _logger.error( - f"Error updating [{to_mtd_field}] for item [{uuid}]") - error_items.append(uuid) + + added = (self._dry_run or + self._dspace_be.client.add_metadata(Item(item), self._to_mtd_field, date_val.value)) + + if not added: + _logger.warning(f"{id_str}: Error creating metadata") + self._info["error_creating"].append((uuid, date_val.input)) + return False + + self._info["created"].append((uuid, date_val.input)) + return True + + self._info["not_created"].append((uuid, None)) + return False + + def update(self, item: dict) -> bool: + """Create missing metadata for items based on provided fields.""" + item_mtd = item["metadata"] + uuid = item['uuid'] + + # Check if the target metadata field exists and is not empty + date_meta = item_mtd.get(self._to_mtd_field, None) + if date_meta is not None: + return self.update_existing_metadata(item, date_meta[0]["value"]) else: - found = False - # Check other metadata fields to create the target metadata field if not present - for from_mtd in from_mtd_fields: - # Check if the target metadata field exists and is not empty - if from_mtd in item_mtd and item_mtd[from_mtd]: - # If there is more than one value, get only the first one - val = item_mtd[from_mtd][0]["value"] - - # Convert date if necessary - if not is_valid_date(val): - val = convert_to_date(val) - if val is None: - _logger.warning(f"Cannot convert [{from_mtd}] " - f"to valid date for item [{uuid}]: {val}") - continue - - found = True - _logger.info( - f"Metadata [{to_mtd_field}] created from [{from_mtd}] for item [{uuid}]") - - # Update the item in the database - if dspace_be.client.add_metadata(Item(item), to_mtd_field, val): - created.append(uuid) - else: - _logger.warning( - f"Error creating metadata [{to_mtd_field}] for item [{uuid}]") - error_items.append(uuid) - break - - # If no valid metadata field was found, add the item to the not_created list - if not found: - not_created.append(uuid) - - return created, updated, not_created, error_items, ok_items + return self.add_new_metadata(item) if __name__ == '__main__': - args = parse_arguments() + parser = argparse.ArgumentParser(description="Add metadata for DSpace items") + parser.add_argument("--to_mtd_field", + type=str, required=True, help="Metadata field to be created.") + parser.add_argument("--from_mtd_field", + type=str, nargs='+', required=True, + help="Metadata field(s) from which value(s) can be used.") + parser.add_argument("--endpoint", type=str, default=env["backend"]["endpoint"]) + parser.add_argument("--user", type=str, default=env["backend"]["user"]) + parser.add_argument("--password", type=str, default=env["backend"]["password"]) + parser.add_argument("--dry-run", action='store_true', default=False) + args = parser.parse_args() + + start = time.time() # Initialize DSpace backend - dspace_be = dspace.rest( - env["backend"]["endpoint"], - env["backend"]["user"], - env["backend"]["password"], - env["backend"]["authentication"] - ) + dspace_be = dspace.rest(args.endpoint, args.user, args.password, True) - # Fetch and filter items - items_to_update = fetch_items(dspace_be) + upd = updater(dspace_be, args.from_mtd_field, args.to_mtd_field, dry_run=args.dry_run) + titles = defaultdict(int) # Process items - created, updated, not_created, error_items, ok_items = process_metadata( - dspace_be, items_to_update, args.from_mtd_field, args.to_mtd_field - ) - - # Log results - _logger.info(f"Items with correct [{args.to_mtd_field}]: {ok_items}") - _logger.info(f"Items with updated [{args.to_mtd_field}]: {updated}") - _logger.info(f"Items with created [{args.to_mtd_field}]: {created}") - _logger.warning(f"Items where [{args.to_mtd_field}] was not created: {not_created}") - _logger.warning(f"Items with errors during processing: {error_items}") - - _logger.info(f"Number of items to update: {len(items_to_update)}") - _logger.info(f"Number of items with correct [{args.to_mtd_field}]: {len(ok_items)}") - _logger.info(f"Number of items with updated [{args.to_mtd_field}]: {len(updated)}") - _logger.info(f"Number of items with created [{args.to_mtd_field}]: {len(created)}") - _logger.info( - f"Number of items where [{args.to_mtd_field}] was not created: {len(not_created)}") - _logger.info(f"Number of items with errors during processing: {len(error_items)}") + for items in dspace_be.iter_items(): + items = [item for item in items if not item['withdrawn'] and item['inArchive']] + for item in items: + dc_titles = item['metadata'].get('dc.title', []) + if len(dc_titles) > 0: + titles[dc_titles[0]['value']] += 1 + upd.update(item) + + _logger.info(40 * "=") + _logger.info("Item info:") + limit = 50 + for k, v in upd.info.items(): + _logger.info(f"{k:20s}:{len(v):6d}: first {limit} items .. {list(v)[:limit]}...") + + _logger.info(40 * "=") + _logger.info("Date info") + msgs = "\n\t".join(upd.cannot_parse) + _logger.info(f"Cannot parse [{len(msgs)}]:\n\t{msgs}") + inv_arr = [(v, f"[{k:15s}]: {v:4d}") for k, v in date.invalid.items()] + inv_arr.sort(key=lambda x: x[0], reverse=True) + msgs = "\n\t".join([x[1] for x in inv_arr]) + _logger.info(f"Date invalid [{len(msgs)}]:\n\t{msgs}") + + _logger.info(40 * "=") + show_limit = 100 + duplicates = {k: v for k, v in titles.items() if v > 1} + _logger.info("Duplicates {len(duplicates)} (showing first {show_limit}:") + for i, (k, v) in enumerate(duplicates.items()): + if i >= show_limit: + break + _logger.info(f"Title [{k}] : {v}") + + _logger.info(40 * "=") + _logger.info("Statistics:") + for k, v in upd.info.items(): + _logger.info(f"{k:25s}: {len(v):6d}") + _logger.info(f"Total time: {time.time() - start:.2f} s") From 1e46528c6500dfb4f2836f0a0b36a2a7449ad951 Mon Sep 17 00:00:00 2001 From: jm Date: Thu, 24 Oct 2024 20:26:00 +0200 Subject: [PATCH 11/16] additional info, logging pollution reduced --- libs/dspace-rest-python | 2 +- tools/add_metadata/add_metadata.py | 59 +++++++++++++++++++++++------- 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/libs/dspace-rest-python b/libs/dspace-rest-python index 36b2f0c..5724696 160000 --- a/libs/dspace-rest-python +++ b/libs/dspace-rest-python @@ -1 +1 @@ -Subproject commit 36b2f0cd3ab1b492ad700df740dca4d3848c0e76 +Subproject commit 57246965e89c707ae9f3f4821650793c627ecd27 diff --git a/tools/add_metadata/add_metadata.py b/tools/add_metadata/add_metadata.py index 38d8877..a151a78 100644 --- a/tools/add_metadata/add_metadata.py +++ b/tools/add_metadata/add_metadata.py @@ -16,6 +16,7 @@ import project_settings # noqa from dspace_rest_client.models import Item from utils import init_logging, update_settings # noqa +logging.getLogger("dspace.client").setLevel(logging.WARNING) _logger = logging.getLogger() @@ -159,6 +160,8 @@ def add_new_metadata(self, item) -> bool: if date_meta is None: continue id_str = f"Item [{uuid}]: [{from_mtd}]" + if len(date_meta) != 1: + _logger.warning(f"{id_str}: more than one value {date_meta}") # If there is more than one value, get only the first one date_val = date(date_meta[0]["value"]) @@ -199,6 +202,26 @@ def update(self, item: dict) -> bool: return self.add_new_metadata(item) +class additional_stats: + + def __init__(self): + self._titles = defaultdict(int) + + def update(self, item: dict): + dc_titles = item['metadata'].get('dc.title', []) + if len(dc_titles) > 0: + self._titles[dc_titles[0]['value']] += 1 + + def print_info(self, show_limit=100): + duplicates = {k: v for k, v in self._titles.items() if v > 1} + _logger.info( + f"Duplicates {len(duplicates)} ({sum(duplicates.values())}) (showing first {show_limit}):") + for i, (k, v) in enumerate(duplicates.items()): + if i >= show_limit: + break + _logger.info(f"Title [{k}] : {v}") + + if __name__ == '__main__': parser = argparse.ArgumentParser(description="Add metadata for DSpace items") parser.add_argument("--to_mtd_field", @@ -211,6 +234,7 @@ def update(self, item: dict) -> bool: parser.add_argument("--password", type=str, default=env["backend"]["password"]) parser.add_argument("--dry-run", action='store_true', default=False) args = parser.parse_args() + _logger.info(f"Arguments: {args}") start = time.time() @@ -219,14 +243,17 @@ def update(self, item: dict) -> bool: upd = updater(dspace_be, args.from_mtd_field, args.to_mtd_field, dry_run=args.dry_run) - titles = defaultdict(int) + stats = additional_stats() + # Process items + len_all_items = 0 + len_used_items = 0 for items in dspace_be.iter_items(): + len_all_items += len(items) items = [item for item in items if not item['withdrawn'] and item['inArchive']] + len_used_items += len(items) for item in items: - dc_titles = item['metadata'].get('dc.title', []) - if len(dc_titles) > 0: - titles[dc_titles[0]['value']] += 1 + stats.update(item) upd.update(item) _logger.info(40 * "=") @@ -245,16 +272,22 @@ def update(self, item: dict) -> bool: _logger.info(f"Date invalid [{len(msgs)}]:\n\t{msgs}") _logger.info(40 * "=") - show_limit = 100 - duplicates = {k: v for k, v in titles.items() if v > 1} - _logger.info("Duplicates {len(duplicates)} (showing first {show_limit}:") - for i, (k, v) in enumerate(duplicates.items()): - if i >= show_limit: - break - _logger.info(f"Title [{k}] : {v}") + stats.print_info() _logger.info(40 * "=") - _logger.info("Statistics:") + _logger.info("Update statistics:") for k, v in upd.info.items(): _logger.info(f"{k:25s}: {len(v):6d}") - _logger.info(f"Total time: {time.time() - start:.2f} s") + took = time.time() - start + + _logger.info(40 * "=") + _logger.info("Counts:") + _logger.info(f"Total items: {len_all_items}") + _logger.info(f"Used items: {len_used_items}") + # sets are not counted + _logger.info( + f"Sum of updates: {sum(len(x) for x in upd.info.values() if isinstance(x, list))}") + + _logger.info(40 * "=") + _logger.info( + f"Total time: {took:.2f} s [{time.strftime('%H:%M:%S', time.gmtime(took))}]") From a53d3c24992e116a6aab4581472b44b88c869fa6 Mon Sep 17 00:00:00 2001 From: jm Date: Mon, 28 Oct 2024 09:02:10 +0100 Subject: [PATCH 12/16] show doubles --- tools/add_metadata/add_metadata.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/add_metadata/add_metadata.py b/tools/add_metadata/add_metadata.py index a151a78..6c391e8 100644 --- a/tools/add_metadata/add_metadata.py +++ b/tools/add_metadata/add_metadata.py @@ -197,6 +197,8 @@ def update(self, item: dict) -> bool: # Check if the target metadata field exists and is not empty date_meta = item_mtd.get(self._to_mtd_field, None) if date_meta is not None: + if len(date_meta) != 1: + _logger.critical(f"{uuid}: more than one value {date_meta}") return self.update_existing_metadata(item, date_meta[0]["value"]) else: return self.add_new_metadata(item) @@ -206,11 +208,16 @@ class additional_stats: def __init__(self): self._titles = defaultdict(int) + self._doubles = defaultdict(list) def update(self, item: dict): + uuid = item['uuid'] dc_titles = item['metadata'].get('dc.title', []) if len(dc_titles) > 0: self._titles[dc_titles[0]['value']] += 1 + key = 'dc.date.issued' + if len(item['metadata'].get(key, [])) > 1: + self._doubles[key].append(uuid) def print_info(self, show_limit=100): duplicates = {k: v for k, v in self._titles.items() if v > 1} @@ -220,6 +227,10 @@ def print_info(self, show_limit=100): if i >= show_limit: break _logger.info(f"Title [{k}] : {v}") + if len(self._doubles) > 0: + _logger.info("Multiple values when expecting at most 1:") + for k, v in self._doubles.items(): + _logger.info(f"{k}: {v}") if __name__ == '__main__': From af0fb2a754ef0e558a6acfe937076f933830bdd0 Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Mon, 28 Oct 2024 12:10:49 +0100 Subject: [PATCH 13/16] remove duplicities mtd --- libs/dspace-rest-python | 2 +- tools/add_metadata/add_metadata.py | 29 ++++++++++++++++++++++++++--- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/libs/dspace-rest-python b/libs/dspace-rest-python index 36b2f0c..b844198 160000 --- a/libs/dspace-rest-python +++ b/libs/dspace-rest-python @@ -1 +1 @@ -Subproject commit 36b2f0cd3ab1b492ad700df740dca4d3848c0e76 +Subproject commit b84419845c9fa55a29484509a3a14c2c0cb86982 diff --git a/tools/add_metadata/add_metadata.py b/tools/add_metadata/add_metadata.py index 38d8877..4febf9e 100644 --- a/tools/add_metadata/add_metadata.py +++ b/tools/add_metadata/add_metadata.py @@ -56,13 +56,13 @@ def parse(self) -> bool: if len(self._d) < 1: return False - formats = ['%Y/%m/%d', '%d/%m/%Y', '%Y.%m.%d', '%d.%m.%Y', '%Y', + formats = ['%Y/%m/%d', '%d/%m/%Y', '%Y.%m.%d', '%d.%m.%Y', '%d. %m. %Y', '%Y', '%Y-%m', '%m-%Y', '%Y/%m', '%m/%Y', '%Y.%m', '%m.%Y'] for fmt in formats: try: datetime_obj = datetime.strptime(self._d, fmt) # Normalize date to 'YYYY-MM-DD' - if fmt in ['%Y-%m', '%Y/%m', '%Y.%m']: + if fmt in ['%Y-%m', '%Y/%m', '%Y.%m', '%m-%Y', '%m/%Y', '%m.%Y']: self._d = datetime_obj.strftime('%Y-%m-01') elif fmt == '%Y': self._d = datetime_obj.strftime('%Y-01-01') @@ -97,6 +97,7 @@ def __init__(self, dspace_be, from_mtd_fields: list, to_mtd_field: list, dry_run self._dry_run = dry_run self._info = { "valid": [], + "multiple": [], "invalid_date": [], "invalid_date_all": set(), "updated": [], @@ -194,7 +195,29 @@ def update(self, item: dict) -> bool: # Check if the target metadata field exists and is not empty date_meta = item_mtd.get(self._to_mtd_field, None) if date_meta is not None: - return self.update_existing_metadata(item, date_meta[0]["value"]) + val = date_meta[0]["value"] + # Check if items have multiple values for to_mtd_value + if len(date_meta) > 1: + _logger.warning( + f'Item [{uuid}] has multiple values for {self._to_mtd_field}!') + self._info["multiple"].append(uuid) + if not self._dry_run: + val = '' + for i in range(len(date_meta)): + if len(val) == 0: + date_val = date(date_meta[i]["value"]) + if date_val.is_valid() or date_val.parse(): + val = date_val.value + continue + if val == '' and i == len(date_meta) - 1: + val = date_meta[i]["value"] + continue + dspace_be.client.remove_metadata( + Item(item), self._to_mtd_field, i) + + # Reload item and metadata + item = dspace_be._fetch(f'core/items/{uuid}', dspace_be.get, None) + return self.update_existing_metadata(item, val) else: return self.add_new_metadata(item) From 71ede942e0e7edd81fa2d10a3420c11f76c606ba Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Mon, 28 Oct 2024 12:19:36 +0100 Subject: [PATCH 14/16] add date formats --- tools/add_metadata/add_metadata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/add_metadata/add_metadata.py b/tools/add_metadata/add_metadata.py index 1f559c5..65d55c8 100644 --- a/tools/add_metadata/add_metadata.py +++ b/tools/add_metadata/add_metadata.py @@ -58,12 +58,12 @@ def parse(self) -> bool: return False formats = ['%Y/%m/%d', '%d/%m/%Y', '%Y.%m.%d', '%d.%m.%Y', '%Y', - '%Y-%m', '%m-%Y', '%Y/%m', '%m/%Y', '%Y.%m', '%m.%Y'] + '%Y-%m', '%m-%Y', '%Y/%m', '%m/%Y', '%Y.%m', '%m.%Y', '%d. %m. %Y'] for fmt in formats: try: datetime_obj = datetime.strptime(self._d, fmt) # Normalize date to 'YYYY-MM-DD' - if fmt in ['%Y-%m', '%Y/%m', '%Y.%m']: + if fmt in ['%Y-%m', '%Y/%m', '%Y.%m', '%m-%Y', "%m/%Y", "%m.%Y"]: self._d = datetime_obj.strftime('%Y-%m-01') elif fmt == '%Y': self._d = datetime_obj.strftime('%Y-01-01') From f517cf7b997768b2ba10443d43e0881a30cba33c Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Mon, 28 Oct 2024 23:08:59 +0100 Subject: [PATCH 15/16] removed metadata --- tools/add_metadata/add_metadata.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/tools/add_metadata/add_metadata.py b/tools/add_metadata/add_metadata.py index 6c391e8..c16f73e 100644 --- a/tools/add_metadata/add_metadata.py +++ b/tools/add_metadata/add_metadata.py @@ -58,12 +58,12 @@ def parse(self) -> bool: return False formats = ['%Y/%m/%d', '%d/%m/%Y', '%Y.%m.%d', '%d.%m.%Y', '%Y', - '%Y-%m', '%m-%Y', '%Y/%m', '%m/%Y', '%Y.%m', '%m.%Y'] + '%Y-%m', '%m-%Y', '%Y/%m', '%m/%Y', '%Y.%m', '%m.%Y', '%d. %m. %Y'] for fmt in formats: try: datetime_obj = datetime.strptime(self._d, fmt) # Normalize date to 'YYYY-MM-DD' - if fmt in ['%Y-%m', '%Y/%m', '%Y.%m']: + if fmt in ['%Y-%m', '%Y/%m', '%Y.%m', '%m-%Y', "%m/%Y", "%m.%Y"]: self._d = datetime_obj.strftime('%Y-%m-01') elif fmt == '%Y': self._d = datetime_obj.strftime('%Y-01-01') @@ -98,6 +98,7 @@ def __init__(self, dspace_be, from_mtd_fields: list, to_mtd_field: list, dry_run self._dry_run = dry_run self._info = { "valid": [], + "multiple": [], "invalid_date": [], "invalid_date_all": set(), "updated": [], @@ -197,9 +198,26 @@ def update(self, item: dict) -> bool: # Check if the target metadata field exists and is not empty date_meta = item_mtd.get(self._to_mtd_field, None) if date_meta is not None: + val = date_meta[0]["value"] if len(date_meta) != 1: _logger.critical(f"{uuid}: more than one value {date_meta}") - return self.update_existing_metadata(item, date_meta[0]["value"]) + self._info["multiple"].append(uuid) + if not self._dry_run: + val = '' + for i in range(len(date_meta)): + if len(val) == 0: + date_val = date(date_meta[i]["value"]) + if date_val.is_valid() or date_val.parse(): + val = date_val.value + continue + if val == '' and i == len(date_meta) - 1: + val = date_meta[i]["value"] + continue + dspace_be.client.remove_metadata( + Item(item), self._to_mtd_field, i) + # Reload item and metadata + item = dspace_be._fetch(f'core/items/{uuid}', dspace_be.get, None) + return self.update_existing_metadata(item, val) else: return self.add_new_metadata(item) From 68b13b07b9affa32a809df64b309bb0b85c75027 Mon Sep 17 00:00:00 2001 From: jm Date: Tue, 29 Oct 2024 14:00:44 +0100 Subject: [PATCH 16/16] option to fetch 1 item --- libs/dspace-rest-python | 2 +- src/dspace/_rest.py | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/libs/dspace-rest-python b/libs/dspace-rest-python index b844198..dc698dd 160000 --- a/libs/dspace-rest-python +++ b/libs/dspace-rest-python @@ -1 +1 @@ -Subproject commit b84419845c9fa55a29484509a3a14c2c0cb86982 +Subproject commit dc698dd95e400a8e88de1ae9b6112cb4d698fb4c diff --git a/src/dspace/_rest.py b/src/dspace/_rest.py index 634afa9..6379c9c 100644 --- a/src/dspace/_rest.py +++ b/src/dspace/_rest.py @@ -355,26 +355,37 @@ def fetch_items(self, page_size: int = 100, limit=None): return items[:limit] return items - def iter_items(self, page_size: int = 100, limit: int = -1): + def iter_items(self, page_size: int = 100, limit: int = -1, uuid: str = None): from tqdm import tqdm url = 'core/items' _logger.debug(f"Fetch iter [] using [{url}]") page = 0 len_items = 0 + item_key = "items" + fetch_key = "_embedded" + + if uuid is not None: + fetch_key = None + url = f"{url}/{uuid}" + with tqdm(desc="Fetching items", unit=" items") as pbar: while True: - r = self._fetch(url, self.get, "_embedded", + r = self._fetch(url, self.get, fetch_key, params={"page": page, "size": page_size}) if r is None: break - key = "items" - items_data = r.get(key, []) + # only one + if uuid is not None: + yield [r] + return + + items_data = r.get(item_key, []) if items_data: len_items += len(items_data) yield items_data else: - _logger.warning(f"Key [{key}] does not exist in response: {r}") + _logger.warning(f"Key [{item_key}] does not exist in response: {r}") page += 1 pbar.update(len(items_data))