From 1b359a0eb47e84fccc96c578b876eac7c196c8e9 Mon Sep 17 00:00:00 2001 From: milanmajchrak Date: Wed, 20 Sep 2023 16:35:26 +0200 Subject: [PATCH 1/6] It called mappedCollections Vanilla endpoint --- const.py | 14 +- data_pump/item.py | 234 ++++++++++++++++------------- data_pump/metadata.py | 16 +- data_pump/utils.py | 13 ++ main.data_pump.py | 184 +++++++++++------------ support/dspace_interface/client.py | 13 +- 6 files changed, 259 insertions(+), 215 deletions(-) diff --git a/const.py b/const.py index 7a163b8..9d5bf6e 100644 --- a/const.py +++ b/const.py @@ -1,7 +1,7 @@ import enum import logging -user = "test@test.edu" +user = "m@m.edu" password = "dspace" # password = "admin" # user = "m@edu.com" @@ -9,12 +9,12 @@ # http or https use_ssl = False -# host = "localhost" -host = "dev-5.pc" -# fe_port = ":4000" -fe_port = None -# be_port = ":8080" -be_port = None +host = "localhost" +# host = "dev-5.pc" +fe_port = ":4000" +# fe_port = None +be_port = ":8080" +# be_port = None be_location = "/server/" # config logging diff --git a/data_pump/item.py b/data_pump/item.py index 68de29b..fd29b61 100644 --- a/data_pump/item.py +++ b/data_pump/item.py @@ -1,7 +1,7 @@ import logging from data_pump.utils import read_json, convert_response_to_json, do_api_post, \ - save_dict_as_json + save_dict_as_json, do_api_post_custom from support.dspace_proxy import rest_proxy from const import API_URL @@ -24,9 +24,11 @@ def import_item(metadata_class, saved_workspace_json_name = "workspaceitem_dict.json" workflowitem_json_name = 'workflowitem.json' saved_workflow_json_name = "workflow_dict.json" + collection2table_name = "collection2item.json" item_url = 'clarin/import/item' saved_item_json_name = "item_dict.json" workflowitem_url = 'clarin/import/workflowitem' + item2collection_url = 'core/items/{item_uuid}/mappedCollections' imported_workspaceitem = 0 imported_workflowitem = 0 imported_item = 0 @@ -34,110 +36,138 @@ def import_item(metadata_class, # create dict from items by item id item_json_list = read_json(item_json_name) items_dict = {} - if not item_json_list: - logging.info("Item JSON is empty.") - return - for item in item_json_list: - items_dict[item['item_id']] = item - statistics_dict['item'] = (len(item_json_list), 0) + # if not item_json_list: + # logging.info("Item JSON is empty.") + # return + # for item in item_json_list: + # items_dict[item['item_id']] = item + # statistics_dict['item'] = (len(item_json_list), 0) + # + # # create item and workspaceitem + # workspaceitem_json_list = read_json(workspaceitem_json_name) + # if workspaceitem_json_list is not None: + # for workspaceitem in workspaceitem_json_list: + # item = items_dict[workspaceitem['item_id']] + # import_workspaceitem(item, workspaceitem['collection_id'], + # workspaceitem['multiple_titles'], + # workspaceitem['published_before'], + # workspaceitem['multiple_files'], + # workspaceitem['stage_reached'], + # workspaceitem['page_reached'], + # metadata_class, + # handle_class, + # workspaceitem_id_dict, + # item_id_dict, + # collection_id_dict, + # eperson_id_dict) + # imported_workspaceitem += 1 + # del items_dict[workspaceitem['item_id']] + # + # statistics_dict['workspaceitem'] = (len(workspaceitem_json_list), + # imported_workspaceitem) + # imported_item += imported_workspaceitem + # # save workspaceitem dict as json + # if save_dict: + # save_dict_as_json(saved_workspace_json_name, workspaceitem_id_dict) + # logging.info("Workspaceitem was successfully imported!") + # else: + # logging.info("Workspaceitem JSON is empty.") + # # create workflowitem + # # workflowitem is created from workspaceitem + # # -1, because the workflowitem doesn't contain this attribute + # workflowitem_json_list = read_json(workflowitem_json_name) + # if workflowitem_json_list is not None: + # for workflowitem in workflowitem_json_list: + # item = items_dict[workflowitem['item_id']] + # import_workspaceitem(item, workflowitem['collection_id'], + # workflowitem['multiple_titles'], + # workflowitem['published_before'], + # workflowitem['multiple_files'], + # -1, + # -1, + # metadata_class, + # handle_class, + # workspaceitem_id_dict, + # item_id_dict, + # collection_id_dict, + # eperson_id_dict) + # # create workflowitem from created workspaceitem + # params = {'id': str(workspaceitem_id_dict[workflowitem['item_id']])} + # try: + # response = do_api_post(workflowitem_url, params, None) + # workflowitem_id_dict[workflowitem['workflow_id']] = \ + # response.headers['workflowitem_id'] + # imported_workflowitem += 1 + # except Exception as e: + # logging.error('POST request ' + workflowitem_url + ' for id: ' + + # str(workflowitem['item_id']) + ' failed. Exception: ' + + # str(e)) + # del items_dict[workflowitem['item_id']] + # + # # save workflow dict as json + # if save_dict: + # save_dict_as_json(saved_workflow_json_name, workflowitem_id_dict) + # statistics_val = (len(workflowitem_json_list), imported_workflowitem) + # statistics_dict['workflowitem'] = statistics_val + # imported_item += imported_workflowitem + # logging.info("Cwf_workflowitem was successfully imported!") + # else: + # logging.info("Workflowitem JSON is empty.") + # + # # create other items + # for item in items_dict.values(): + # item_json_p = { + # 'discoverable': item['discoverable'], + # 'inArchive': item['in_archive'], + # 'lastModified': item['last_modified'], + # 'withdrawn': item['withdrawn'] + # } + # metadatvalue_item_dict = metadata_class.get_metadata_value(2, item['item_id']) + # if metadatvalue_item_dict: + # item_json_p['metadata'] = metadatvalue_item_dict + # handle_item = handle_class.get_handle(2, item['item_id']) + # if handle_item is not None: + # item_json_p['handle'] = handle_item + # params = { + # 'owningCollection': collection_id_dict[item['owning_collection']], + # 'epersonUUID': eperson_id_dict[item['submitter_id']] + # } + # try: + # response = do_api_post(item_url, params, item_json_p) + # response_json = convert_response_to_json(response) + # item_id_dict[item['item_id']] = response_json['id'] + # imported_item += 1 + # except Exception as e: + # logging.error('POST request ' + item_url + ' for id: ' + + # str(item['item_id']) + ' failed. Exception: ' + str(e)) - # create item and workspaceitem - workspaceitem_json_list = read_json(workspaceitem_json_name) - if workspaceitem_json_list is not None: - for workspaceitem in workspaceitem_json_list: - item = items_dict[workspaceitem['item_id']] - import_workspaceitem(item, workspaceitem['collection_id'], - workspaceitem['multiple_titles'], - workspaceitem['published_before'], - workspaceitem['multiple_files'], - workspaceitem['stage_reached'], - workspaceitem['page_reached'], - metadata_class, - handle_class, - workspaceitem_id_dict, - item_id_dict, - collection_id_dict, - eperson_id_dict) - imported_workspaceitem += 1 - del items_dict[workspaceitem['item_id']] + # Import collection2item table - only items which are mapped in more collections + # Add another collection into Item only if another collection is not owning_collection + collection2table_json_list = read_json(collection2table_name) + coll_2_item_dict = {} + items_with_more_colls = {} + # Find items which are mapped in more collections and store them into dictionary in this way + # {'item_uuid': [collection_uuid_1, collection_uuid_2]} + for collection2table in collection2table_json_list: + # Every item should have mapped only one collection - the owning collection except the items which + # are mapped into more collections + if collection2table['item_id'] in coll_2_item_dict: + item_uuid = item_id_dict[collection2table['item_id']] + collection_uuid = collection_id_dict[collection2table['collection_id']] + # Add item UUID into list + items_with_more_colls[item_uuid] = collection_uuid + continue + coll_2_item_dict[collection2table['item_id']] = collection_id_dict[collection2table['collection_id']] - statistics_dict['workspaceitem'] = (len(workspaceitem_json_list), - imported_workspaceitem) - imported_item += imported_workspaceitem - # save workspaceitem dict as json - if save_dict: - save_dict_as_json(saved_workspace_json_name, workspaceitem_id_dict) - logging.info("Workspaceitem was successfully imported!") - else: - logging.info("Workspaceitem JSON is empty.") - # create workflowitem - # workflowitem is created from workspaceitem - # -1, because the workflowitem doesn't contain this attribute - workflowitem_json_list = read_json(workflowitem_json_name) - if workflowitem_json_list is not None: - for workflowitem in workflowitem_json_list: - item = items_dict[workflowitem['item_id']] - import_workspaceitem(item, workflowitem['collection_id'], - workflowitem['multiple_titles'], - workflowitem['published_before'], - workflowitem['multiple_files'], - -1, - -1, - metadata_class, - handle_class, - workspaceitem_id_dict, - item_id_dict, - collection_id_dict, - eperson_id_dict) - # create workflowitem from created workspaceitem - params = {'id': str(workspaceitem_id_dict[workflowitem['item_id']])} - try: - response = do_api_post(workflowitem_url, params, None) - workflowitem_id_dict[workflowitem['workflow_id']] = \ - response.headers['workflowitem_id'] - imported_workflowitem += 1 - except Exception as e: - logging.error('POST request ' + workflowitem_url + ' for id: ' + - str(workflowitem['item_id']) + ' failed. Exception: ' + - str(e)) - del items_dict[workflowitem['item_id']] - # save workflow dict as json - if save_dict: - save_dict_as_json(saved_workflow_json_name, workflowitem_id_dict) - statistics_val = (len(workflowitem_json_list), imported_workflowitem) - statistics_dict['workflowitem'] = statistics_val - imported_item += imported_workflowitem - logging.info("Cwf_workflowitem was successfully imported!") - else: - logging.info("Workflowitem JSON is empty.") - - # create other items - for item in items_dict.values(): - item_json_p = { - 'discoverable': item['discoverable'], - 'inArchive': item['in_archive'], - 'lastModified': item['last_modified'], - 'withdrawn': item['withdrawn'] - } - metadatvalue_item_dict = metadata_class.get_metadata_value(2, item['item_id']) - if metadatvalue_item_dict: - item_json_p['metadata'] = metadatvalue_item_dict - handle_item = handle_class.get_handle(2, item['item_id']) - if handle_item is not None: - item_json_p['handle'] = handle_item - params = { - 'owningCollection': collection_id_dict[item['owning_collection']], - 'epersonUUID': eperson_id_dict[item['submitter_id']] - } - try: - response = do_api_post(item_url, params, item_json_p) - response_json = convert_response_to_json(response) - item_id_dict[item['item_id']] = response_json['id'] - imported_item += 1 - except Exception as e: - logging.error('POST request ' + item_url + ' for id: ' + - str(item['item_id']) + ' failed. Exception: ' + str(e)) + # Call Vanilla REST endpoint which add relation between Item and Collection into the collection2item table + for item_with_more_coll_id in items_with_more_colls.keys(): + # Prepare request URL - replace `{item_uuid}` with current `item_with_more_coll_id` + request_url = item2collection_url.replace('{item_uuid}', item_with_more_coll_id) + response = do_api_post_custom(request_url, {}, '', 'text/uri-list') + # Prepare request body which should looks like this: + # `"https://localhost:8080/spring-rest/api/core/collections/{collection_uuid_1}" + \n + # "https://localhost:8080/spring-rest/api/core/collections/{collection_uuid_2}" # save item dict as json if save_dict: diff --git a/data_pump/metadata.py b/data_pump/metadata.py index d9d75b9..7ef9043 100644 --- a/data_pump/metadata.py +++ b/data_pump/metadata.py @@ -15,16 +15,16 @@ def __init__(self, statistics_dict, insert_dict): self.metadatavalue_dict = {} self.metadataschema_id_dict = {} self.metadatafield_id_dict = {} - if insert_dict: - self.metadataschema_id_dict = \ - insert_data_into_dicts("metadataschemaregistry.json") - self.metadatafield_id_dict = \ - insert_data_into_dicts("metadatafieldregistry.json") + # if insert_dict: + # self.metadataschema_id_dict = \ + # insert_data_into_dicts("metadataschemaregistry.json") + # self.metadatafield_id_dict = \ + # insert_data_into_dicts("metadatafieldregistry.json") # import all metadata - self.read_metadata() - self.import_metadataschemaregistry(statistics_dict) - self.import_metadatafieldregistry(statistics_dict) + # self.read_metadata() + # self.import_metadataschemaregistry(statistics_dict) + # self.import_metadatafieldregistry(statistics_dict) def read_metadata(self): metadatavalue_json_name = 'metadatavalue.json' diff --git a/data_pump/utils.py b/data_pump/utils.py index b16ad08..a352319 100644 --- a/data_pump/utils.py +++ b/data_pump/utils.py @@ -43,6 +43,19 @@ def do_api_post(url, params: dict, json_p): return response +def do_api_post_custom(url, params: dict, list, content_type): + """ + Insert data into database by api. + @param url: url for api post + @param params: parameters for api post + @param json_p: posted data + @return: response from api post + """ + url = API_URL + url + response = rest_proxy.d.api_post(url, params, list, False, content_type) + return response + + def do_api_get_one(url, object_id): """ Get data with id from table. diff --git a/main.data_pump.py b/main.data_pump.py index 183e6c3..b78469c 100644 --- a/main.data_pump.py +++ b/main.data_pump.py @@ -44,37 +44,37 @@ def insert_data_into_dicts(eperson_json_name, user_registraion_json_name, bitstream_json_name, insert_data): if not insert_data: return - var.eperson_id_dict = create_dict_from_json(eperson_json_name) - var.user_registration_id_dict = create_dict_from_json(user_registraion_json_name) - var.group_id_dict = create_dict_from_json(group_json_name) - var.community_id_dict = create_dict_from_json(community_json_name) + # var.eperson_id_dict = create_dict_from_json(eperson_json_name) + # var.user_registration_id_dict = create_dict_from_json(user_registraion_json_name) + # var.group_id_dict = create_dict_from_json(group_json_name) + # var.community_id_dict = create_dict_from_json(community_json_name) var.collection_id_dict = create_dict_from_json(collection_json_name) var.item_id_dict = create_dict_from_json(item_json_name) - var.workflowitem_id_dict = create_dict_from_json(workspace_json_name) - var.workflowitem_id_dict = create_dict_from_json(workflow_json_name) - var.bitstreamformat_id_dict = create_dict_from_json(bitstreamformat_json_name) - var.bundle_id_dict = create_dict_from_json(bundle_json_name) - var.bitstream_id_dict = create_dict_from_json(bitstream_json_name) + # var.workflowitem_id_dict = create_dict_from_json(workspace_json_name) + # var.workflowitem_id_dict = create_dict_from_json(workflow_json_name) + # var.bitstreamformat_id_dict = create_dict_from_json(bitstreamformat_json_name) + # var.bundle_id_dict = create_dict_from_json(bundle_json_name) + # var.bitstream_id_dict = create_dict_from_json(bitstream_json_name) if __name__ == "__main__": parser = argparse.ArgumentParser(description='Upload values into dictionaries') - parser.add_argument('--insert_dict_bool', + parser.add_argument('--load_dict_bool', help='bool value if we load values into dict', - required=False, type=bool, default=False) + required=False, type=bool, default=True) parser.add_argument('--save_dict_bool', help='bool value if we save dict values into jsons', required=False, type=bool, default=False) args = parser.parse_args() - - # Is the email server really off? - email_s_off = input("Please make sure your email server is turned off. " - "Otherwise unbearable amount of emails will be sent. " - "Is your EMAIL SERVER really OFF? (Y/N)") - email_s_off = email_s_off.lower() - # terminate the program - if email_s_off not in ("y", "yes"): - sys.exit() + # + # # Is the email server really off? + # email_s_off = input("Please make sure your email server is turned off. " + # "Otherwise unbearable amount of emails will be sent. " + # "Is your EMAIL SERVER really OFF? (Y/N)") + # email_s_off = email_s_off.lower() + # # terminate the program + # if email_s_off not in ("y", "yes"): + # sys.exit() insert_data_into_dicts("eperson_dict.json", "user_registration_dict.json", @@ -87,46 +87,46 @@ def insert_data_into_dicts(eperson_json_name, user_registraion_json_name, "bitstreamformatregistry_dict.json", "bundle_dict.json", "bitstream_dict.json", - args.insert_dict_bool) + args.load_dict_bool) handle_class = Handle() - metadata_class = Metadata(var.statistics_dict, args.save_dict_bool) - - _logger.info("Data migration started!") - import_community(metadata_class, - handle_class, - var.group_id_dict, - var.community_id_dict, - var.community2logo_dict, - var.statistics_dict, - args.save_dict_bool) - import_collection(metadata_class, - handle_class, - var.group_id_dict, - var.community_id_dict, - var.collection_id_dict, - var.collection2logo_dict, - var.statistics_dict, - args.save_dict_bool) - import_registrationdata(var.statistics_dict) - import_epersongroup(metadata_class, - var.group_id_dict, - var.statistics_dict, - args.save_dict_bool) - import_group2group(var.group_id_dict, var.statistics_dict) - import_eperson(metadata_class, - var.eperson_id_dict, - var.email2epersonId_dict, - var.statistics_dict, - args.save_dict_bool) - import_user_registration(var.email2epersonId_dict, - var.eperson_id_dict, - var.user_registration_id_dict, - var.statistics_dict, - args.save_dict_bool) - import_group2eperson(var.eperson_id_dict, - var.group_id_dict, - var.statistics_dict) - import_license(var.eperson_id_dict, var.statistics_dict, args.save_dict_bool) + metadata_class = Metadata(var.statistics_dict, args.load_dict_bool) + # + # _logger.info("Data migration started!") + # import_community(metadata_class, + # handle_class, + # var.group_id_dict, + # var.community_id_dict, + # var.community2logo_dict, + # var.statistics_dict, + # args.save_dict_bool) + # import_collection(metadata_class, + # handle_class, + # var.group_id_dict, + # var.community_id_dict, + # var.collection_id_dict, + # var.collection2logo_dict, + # var.statistics_dict, + # args.save_dict_bool) + # import_registrationdata(var.statistics_dict) + # import_epersongroup(metadata_class, + # var.group_id_dict, + # var.statistics_dict, + # args.save_dict_bool) + # import_group2group(var.group_id_dict, var.statistics_dict) + # import_eperson(metadata_class, + # var.eperson_id_dict, + # var.email2epersonId_dict, + # var.statistics_dict, + # args.save_dict_bool) + # import_user_registration(var.email2epersonId_dict, + # var.eperson_id_dict, + # var.user_registration_id_dict, + # var.statistics_dict, + # args.save_dict_bool) + # import_group2eperson(var.eperson_id_dict, + # var.group_id_dict, + # var.statistics_dict) + # import_license(var.eperson_id_dict, var.statistics_dict, args.save_dict_bool) import_item(metadata_class, handle_class, var.workflowitem_id_dict, @@ -135,39 +135,39 @@ def insert_data_into_dicts(eperson_json_name, user_registraion_json_name, var.eperson_id_dict, var.statistics_dict, args.save_dict_bool) - import_tasklistitem(var.workflowitem_id_dict, - var.eperson_id_dict, - var.statistics_dict) - var.unknown_format_id_val = import_bitstreamformatregistry( - var.bitstreamformat_id_dict, - var.unknown_format_id_val, - var.statistics_dict, - args.save_dict_bool) - import_bundle(metadata_class, - var.item_id_dict, - var.bundle_id_dict, - var.primaryBitstream_dict, - var.statistics_dict, - args.save_dict_bool) - import_bitstream(metadata_class, - var.bitstreamformat_id_dict, - var.primaryBitstream_dict, - var.bitstream2bundle_dict, - var.bundle_id_dict, - var.community2logo_dict, - var.collection2logo_dict, - var.bitstream_id_dict, - var.community_id_dict, - var.collection_id_dict, - var.unknown_format_id_val, - var.statistics_dict, - args.save_dict_bool) - import_user_metadata(var.bitstream_id_dict, - var.user_registration_id_dict, - var.statistics_dict) - - # migrate sequences - migrate_sequences() + # import_tasklistitem(var.workflowitem_id_dict, + # var.eperson_id_dict, + # var.statistics_dict) + # var.unknown_format_id_val = import_bitstreamformatregistry( + # var.bitstreamformat_id_dict, + # var.unknown_format_id_val, + # var.statistics_dict, + # args.save_dict_bool) + # import_bundle(metadata_class, + # var.item_id_dict, + # var.bundle_id_dict, + # var.primaryBitstream_dict, + # var.statistics_dict, + # args.save_dict_bool) + # import_bitstream(metadata_class, + # var.bitstreamformat_id_dict, + # var.primaryBitstream_dict, + # var.bitstream2bundle_dict, + # var.bundle_id_dict, + # var.community2logo_dict, + # var.collection2logo_dict, + # var.bitstream_id_dict, + # var.community_id_dict, + # var.collection_id_dict, + # var.unknown_format_id_val, + # var.statistics_dict, + # args.save_dict_bool) + # import_user_metadata(var.bitstream_id_dict, + # var.user_registration_id_dict, + # var.statistics_dict) + # + # # migrate sequences + # migrate_sequences() at_the_end_of_import(handle_class, var.statistics_dict) _logger.info("Data migration is completed!") diff --git a/support/dspace_interface/client.py b/support/dspace_interface/client.py index af29c8c..a398bc3 100644 --- a/support/dspace_interface/client.py +++ b/support/dspace_interface/client.py @@ -167,20 +167,21 @@ def api_get(self, url, params=None, data=None): self.session.cookies.update({'X-XSRF-Token': t}) return r - def api_post(self, url, params, json_p, retry=False): + def api_post(self, url, params, data, retry=False, content_type='application/json'): """ Perform a POST request. Refresh XSRF token if necessary. POSTs are typically used to create objects. + @param content_type: @param url: DSpace REST API URL @param params: Any parameters to include (eg ?parent=abbc-....) - @param json_p: Data in json-ready form (dict) to send as + @param data: Data in json-ready form (dict) to send as POST body (eg. item.as_dict()) @param retry: Has this method already been retried? Used if we need to refresh XSRF. @return: Response from API """ - h = {'Content-type': 'application/json'} - r = self.session.post(url, json=json_p, params=params, headers=h) + h = {'Content-type': content_type} + r = self.session.post(url, json=data, params=params, headers=h) if 'DSPACE-XSRF-TOKEN' in r.headers: t = r.headers['DSPACE-XSRF-TOKEN'] logging.debug('API Post: Updating token to ' + t) @@ -201,7 +202,7 @@ def api_post(self, url, params, json_p, retry=False): 'API Post: Already retried... something must be wrong') else: logging.info("API Post: Retrying request with updated CSRF token") - return self.api_post(url, params=params, json_p=json_p, retry=True) + return self.api_post(url, params=params, data=data, retry=True) elif r.status_code == 401: r_json = r.json() if 'message' in r_json and 'Authentication is required' in r_json[ @@ -220,7 +221,7 @@ def api_post(self, url, params, json_p, retry=False): retry_value = False if self.exception401Counter > 3: retry_value = True - return self.api_post(url, params=params, json_p=json_p, + return self.api_post(url, params=params, data=data, retry=retry_value) check_response(r, "api post") From eb382b1e79877f3f65c44054d61065b8a428b61c Mon Sep 17 00:00:00 2001 From: milanmajchrak Date: Thu, 21 Sep 2023 16:10:20 +0200 Subject: [PATCH 2/6] The Item's collections are imported using custom clarin endpoint --- data_pump/item.py | 237 ++++++++++++++++++++++++---------------------- main.data_pump.py | 178 +++++++++++++++++----------------- 2 files changed, 211 insertions(+), 204 deletions(-) diff --git a/data_pump/item.py b/data_pump/item.py index fd29b61..4fdd679 100644 --- a/data_pump/item.py +++ b/data_pump/item.py @@ -28,7 +28,7 @@ def import_item(metadata_class, item_url = 'clarin/import/item' saved_item_json_name = "item_dict.json" workflowitem_url = 'clarin/import/workflowitem' - item2collection_url = 'core/items/{item_uuid}/mappedCollections' + item2collection_url = 'clarin/import/item/{item_uuid}/mappedCollections' imported_workspaceitem = 0 imported_workflowitem = 0 imported_item = 0 @@ -36,110 +36,110 @@ def import_item(metadata_class, # create dict from items by item id item_json_list = read_json(item_json_name) items_dict = {} - # if not item_json_list: - # logging.info("Item JSON is empty.") - # return - # for item in item_json_list: - # items_dict[item['item_id']] = item - # statistics_dict['item'] = (len(item_json_list), 0) - # - # # create item and workspaceitem - # workspaceitem_json_list = read_json(workspaceitem_json_name) - # if workspaceitem_json_list is not None: - # for workspaceitem in workspaceitem_json_list: - # item = items_dict[workspaceitem['item_id']] - # import_workspaceitem(item, workspaceitem['collection_id'], - # workspaceitem['multiple_titles'], - # workspaceitem['published_before'], - # workspaceitem['multiple_files'], - # workspaceitem['stage_reached'], - # workspaceitem['page_reached'], - # metadata_class, - # handle_class, - # workspaceitem_id_dict, - # item_id_dict, - # collection_id_dict, - # eperson_id_dict) - # imported_workspaceitem += 1 - # del items_dict[workspaceitem['item_id']] - # - # statistics_dict['workspaceitem'] = (len(workspaceitem_json_list), - # imported_workspaceitem) - # imported_item += imported_workspaceitem - # # save workspaceitem dict as json - # if save_dict: - # save_dict_as_json(saved_workspace_json_name, workspaceitem_id_dict) - # logging.info("Workspaceitem was successfully imported!") - # else: - # logging.info("Workspaceitem JSON is empty.") - # # create workflowitem - # # workflowitem is created from workspaceitem - # # -1, because the workflowitem doesn't contain this attribute - # workflowitem_json_list = read_json(workflowitem_json_name) - # if workflowitem_json_list is not None: - # for workflowitem in workflowitem_json_list: - # item = items_dict[workflowitem['item_id']] - # import_workspaceitem(item, workflowitem['collection_id'], - # workflowitem['multiple_titles'], - # workflowitem['published_before'], - # workflowitem['multiple_files'], - # -1, - # -1, - # metadata_class, - # handle_class, - # workspaceitem_id_dict, - # item_id_dict, - # collection_id_dict, - # eperson_id_dict) - # # create workflowitem from created workspaceitem - # params = {'id': str(workspaceitem_id_dict[workflowitem['item_id']])} - # try: - # response = do_api_post(workflowitem_url, params, None) - # workflowitem_id_dict[workflowitem['workflow_id']] = \ - # response.headers['workflowitem_id'] - # imported_workflowitem += 1 - # except Exception as e: - # logging.error('POST request ' + workflowitem_url + ' for id: ' + - # str(workflowitem['item_id']) + ' failed. Exception: ' + - # str(e)) - # del items_dict[workflowitem['item_id']] - # - # # save workflow dict as json - # if save_dict: - # save_dict_as_json(saved_workflow_json_name, workflowitem_id_dict) - # statistics_val = (len(workflowitem_json_list), imported_workflowitem) - # statistics_dict['workflowitem'] = statistics_val - # imported_item += imported_workflowitem - # logging.info("Cwf_workflowitem was successfully imported!") - # else: - # logging.info("Workflowitem JSON is empty.") - # - # # create other items - # for item in items_dict.values(): - # item_json_p = { - # 'discoverable': item['discoverable'], - # 'inArchive': item['in_archive'], - # 'lastModified': item['last_modified'], - # 'withdrawn': item['withdrawn'] - # } - # metadatvalue_item_dict = metadata_class.get_metadata_value(2, item['item_id']) - # if metadatvalue_item_dict: - # item_json_p['metadata'] = metadatvalue_item_dict - # handle_item = handle_class.get_handle(2, item['item_id']) - # if handle_item is not None: - # item_json_p['handle'] = handle_item - # params = { - # 'owningCollection': collection_id_dict[item['owning_collection']], - # 'epersonUUID': eperson_id_dict[item['submitter_id']] - # } - # try: - # response = do_api_post(item_url, params, item_json_p) - # response_json = convert_response_to_json(response) - # item_id_dict[item['item_id']] = response_json['id'] - # imported_item += 1 - # except Exception as e: - # logging.error('POST request ' + item_url + ' for id: ' + - # str(item['item_id']) + ' failed. Exception: ' + str(e)) + if not item_json_list: + logging.info("Item JSON is empty.") + return + for item in item_json_list: + items_dict[item['item_id']] = item + statistics_dict['item'] = (len(item_json_list), 0) + + # create item and workspaceitem + workspaceitem_json_list = read_json(workspaceitem_json_name) + if workspaceitem_json_list is not None: + for workspaceitem in workspaceitem_json_list: + item = items_dict[workspaceitem['item_id']] + import_workspaceitem(item, workspaceitem['collection_id'], + workspaceitem['multiple_titles'], + workspaceitem['published_before'], + workspaceitem['multiple_files'], + workspaceitem['stage_reached'], + workspaceitem['page_reached'], + metadata_class, + handle_class, + workspaceitem_id_dict, + item_id_dict, + collection_id_dict, + eperson_id_dict) + imported_workspaceitem += 1 + del items_dict[workspaceitem['item_id']] + + statistics_dict['workspaceitem'] = (len(workspaceitem_json_list), + imported_workspaceitem) + imported_item += imported_workspaceitem + # save workspaceitem dict as json + if save_dict: + save_dict_as_json(saved_workspace_json_name, workspaceitem_id_dict) + logging.info("Workspaceitem was successfully imported!") + else: + logging.info("Workspaceitem JSON is empty.") + # create workflowitem + # workflowitem is created from workspaceitem + # -1, because the workflowitem doesn't contain this attribute + workflowitem_json_list = read_json(workflowitem_json_name) + if workflowitem_json_list is not None: + for workflowitem in workflowitem_json_list: + item = items_dict[workflowitem['item_id']] + import_workspaceitem(item, workflowitem['collection_id'], + workflowitem['multiple_titles'], + workflowitem['published_before'], + workflowitem['multiple_files'], + -1, + -1, + metadata_class, + handle_class, + workspaceitem_id_dict, + item_id_dict, + collection_id_dict, + eperson_id_dict) + # create workflowitem from created workspaceitem + params = {'id': str(workspaceitem_id_dict[workflowitem['item_id']])} + try: + response = do_api_post(workflowitem_url, params, None) + workflowitem_id_dict[workflowitem['workflow_id']] = \ + response.headers['workflowitem_id'] + imported_workflowitem += 1 + except Exception as e: + logging.error('POST request ' + workflowitem_url + ' for id: ' + + str(workflowitem['item_id']) + ' failed. Exception: ' + + str(e)) + del items_dict[workflowitem['item_id']] + + # save workflow dict as json + if save_dict: + save_dict_as_json(saved_workflow_json_name, workflowitem_id_dict) + statistics_val = (len(workflowitem_json_list), imported_workflowitem) + statistics_dict['workflowitem'] = statistics_val + imported_item += imported_workflowitem + logging.info("Cwf_workflowitem was successfully imported!") + else: + logging.info("Workflowitem JSON is empty.") + + # create other items + for item in items_dict.values(): + item_json_p = { + 'discoverable': item['discoverable'], + 'inArchive': item['in_archive'], + 'lastModified': item['last_modified'], + 'withdrawn': item['withdrawn'] + } + metadatvalue_item_dict = metadata_class.get_metadata_value(2, item['item_id']) + if metadatvalue_item_dict: + item_json_p['metadata'] = metadatvalue_item_dict + handle_item = handle_class.get_handle(2, item['item_id']) + if handle_item is not None: + item_json_p['handle'] = handle_item + params = { + 'owningCollection': collection_id_dict[item['owning_collection']], + 'epersonUUID': eperson_id_dict[item['submitter_id']] + } + try: + response = do_api_post(item_url, params, item_json_p) + response_json = convert_response_to_json(response) + item_id_dict[item['item_id']] = response_json['id'] + imported_item += 1 + except Exception as e: + logging.error('POST request ' + item_url + ' for id: ' + + str(item['item_id']) + ' failed. Exception: ' + str(e)) # Import collection2item table - only items which are mapped in more collections # Add another collection into Item only if another collection is not owning_collection @@ -151,23 +151,30 @@ def import_item(metadata_class, for collection2table in collection2table_json_list: # Every item should have mapped only one collection - the owning collection except the items which # are mapped into more collections - if collection2table['item_id'] in coll_2_item_dict: - item_uuid = item_id_dict[collection2table['item_id']] - collection_uuid = collection_id_dict[collection2table['collection_id']] - # Add item UUID into list + item_uuid = item_id_dict[collection2table['item_id']] + collection_uuid = collection_id_dict[collection2table['collection_id']] + if item_uuid in coll_2_item_dict: + # Add another collection into dict to get all collections for current Item + coll_2_item_dict[item_uuid].append(collection_id_dict[collection2table['collection_id']]) + # Add item UUID and collection UUID into list in this way {`item_uuid`: `collection_uuid`} items_with_more_colls[item_uuid] = collection_uuid continue - coll_2_item_dict[collection2table['item_id']] = collection_id_dict[collection2table['collection_id']] - + coll_2_item_dict[item_uuid] = [collection_uuid] # Call Vanilla REST endpoint which add relation between Item and Collection into the collection2item table - for item_with_more_coll_id in items_with_more_colls.keys(): - # Prepare request URL - replace `{item_uuid}` with current `item_with_more_coll_id` - request_url = item2collection_url.replace('{item_uuid}', item_with_more_coll_id) - response = do_api_post_custom(request_url, {}, '', 'text/uri-list') + for item_with_more_coll_uuid in items_with_more_colls.keys(): + # Prepare request URL - replace `{item_uuid}` with current `item_with_more_coll_uuid` + request_url = item2collection_url.replace('{item_uuid}', item_with_more_coll_uuid) + # Prepare request body which should looks like this: # `"https://localhost:8080/spring-rest/api/core/collections/{collection_uuid_1}" + \n # "https://localhost:8080/spring-rest/api/core/collections/{collection_uuid_2}" + request_body = [] + collection_url = 'core/collections/' + for collection_uuid in coll_2_item_dict[item_with_more_coll_uuid]: + request_body.append(API_URL + collection_url + collection_uuid) + + do_api_post(request_url, {}, request_body) # save item dict as json if save_dict: diff --git a/main.data_pump.py b/main.data_pump.py index b78469c..576f410 100644 --- a/main.data_pump.py +++ b/main.data_pump.py @@ -44,37 +44,37 @@ def insert_data_into_dicts(eperson_json_name, user_registraion_json_name, bitstream_json_name, insert_data): if not insert_data: return - # var.eperson_id_dict = create_dict_from_json(eperson_json_name) - # var.user_registration_id_dict = create_dict_from_json(user_registraion_json_name) - # var.group_id_dict = create_dict_from_json(group_json_name) - # var.community_id_dict = create_dict_from_json(community_json_name) + var.eperson_id_dict = create_dict_from_json(eperson_json_name) + var.user_registration_id_dict = create_dict_from_json(user_registraion_json_name) + var.group_id_dict = create_dict_from_json(group_json_name) + var.community_id_dict = create_dict_from_json(community_json_name) var.collection_id_dict = create_dict_from_json(collection_json_name) var.item_id_dict = create_dict_from_json(item_json_name) - # var.workflowitem_id_dict = create_dict_from_json(workspace_json_name) - # var.workflowitem_id_dict = create_dict_from_json(workflow_json_name) - # var.bitstreamformat_id_dict = create_dict_from_json(bitstreamformat_json_name) - # var.bundle_id_dict = create_dict_from_json(bundle_json_name) - # var.bitstream_id_dict = create_dict_from_json(bitstream_json_name) + var.workflowitem_id_dict = create_dict_from_json(workspace_json_name) + var.workflowitem_id_dict = create_dict_from_json(workflow_json_name) + var.bitstreamformat_id_dict = create_dict_from_json(bitstreamformat_json_name) + var.bundle_id_dict = create_dict_from_json(bundle_json_name) + var.bitstream_id_dict = create_dict_from_json(bitstream_json_name) if __name__ == "__main__": parser = argparse.ArgumentParser(description='Upload values into dictionaries') parser.add_argument('--load_dict_bool', help='bool value if we load values into dict', - required=False, type=bool, default=True) + required=False, type=bool, default=False) parser.add_argument('--save_dict_bool', help='bool value if we save dict values into jsons', required=False, type=bool, default=False) args = parser.parse_args() - # - # # Is the email server really off? - # email_s_off = input("Please make sure your email server is turned off. " - # "Otherwise unbearable amount of emails will be sent. " - # "Is your EMAIL SERVER really OFF? (Y/N)") - # email_s_off = email_s_off.lower() - # # terminate the program - # if email_s_off not in ("y", "yes"): - # sys.exit() + + # Is the email server really off? + email_s_off = input("Please make sure your email server is turned off. " + "Otherwise unbearable amount of emails will be sent. " + "Is your EMAIL SERVER really OFF? (Y/N)") + email_s_off = email_s_off.lower() + # terminate the program + if email_s_off not in ("y", "yes"): + sys.exit() insert_data_into_dicts("eperson_dict.json", "user_registration_dict.json", @@ -90,43 +90,43 @@ def insert_data_into_dicts(eperson_json_name, user_registraion_json_name, args.load_dict_bool) handle_class = Handle() metadata_class = Metadata(var.statistics_dict, args.load_dict_bool) - # - # _logger.info("Data migration started!") - # import_community(metadata_class, - # handle_class, - # var.group_id_dict, - # var.community_id_dict, - # var.community2logo_dict, - # var.statistics_dict, - # args.save_dict_bool) - # import_collection(metadata_class, - # handle_class, - # var.group_id_dict, - # var.community_id_dict, - # var.collection_id_dict, - # var.collection2logo_dict, - # var.statistics_dict, - # args.save_dict_bool) - # import_registrationdata(var.statistics_dict) - # import_epersongroup(metadata_class, - # var.group_id_dict, - # var.statistics_dict, - # args.save_dict_bool) - # import_group2group(var.group_id_dict, var.statistics_dict) - # import_eperson(metadata_class, - # var.eperson_id_dict, - # var.email2epersonId_dict, - # var.statistics_dict, - # args.save_dict_bool) - # import_user_registration(var.email2epersonId_dict, - # var.eperson_id_dict, - # var.user_registration_id_dict, - # var.statistics_dict, - # args.save_dict_bool) - # import_group2eperson(var.eperson_id_dict, - # var.group_id_dict, - # var.statistics_dict) - # import_license(var.eperson_id_dict, var.statistics_dict, args.save_dict_bool) + + _logger.info("Data migration started!") + import_community(metadata_class, + handle_class, + var.group_id_dict, + var.community_id_dict, + var.community2logo_dict, + var.statistics_dict, + args.save_dict_bool) + import_collection(metadata_class, + handle_class, + var.group_id_dict, + var.community_id_dict, + var.collection_id_dict, + var.collection2logo_dict, + var.statistics_dict, + args.save_dict_bool) + import_registrationdata(var.statistics_dict) + import_epersongroup(metadata_class, + var.group_id_dict, + var.statistics_dict, + args.save_dict_bool) + import_group2group(var.group_id_dict, var.statistics_dict) + import_eperson(metadata_class, + var.eperson_id_dict, + var.email2epersonId_dict, + var.statistics_dict, + args.save_dict_bool) + import_user_registration(var.email2epersonId_dict, + var.eperson_id_dict, + var.user_registration_id_dict, + var.statistics_dict, + args.save_dict_bool) + import_group2eperson(var.eperson_id_dict, + var.group_id_dict, + var.statistics_dict) + import_license(var.eperson_id_dict, var.statistics_dict, args.save_dict_bool) import_item(metadata_class, handle_class, var.workflowitem_id_dict, @@ -135,39 +135,39 @@ def insert_data_into_dicts(eperson_json_name, user_registraion_json_name, var.eperson_id_dict, var.statistics_dict, args.save_dict_bool) - # import_tasklistitem(var.workflowitem_id_dict, - # var.eperson_id_dict, - # var.statistics_dict) - # var.unknown_format_id_val = import_bitstreamformatregistry( - # var.bitstreamformat_id_dict, - # var.unknown_format_id_val, - # var.statistics_dict, - # args.save_dict_bool) - # import_bundle(metadata_class, - # var.item_id_dict, - # var.bundle_id_dict, - # var.primaryBitstream_dict, - # var.statistics_dict, - # args.save_dict_bool) - # import_bitstream(metadata_class, - # var.bitstreamformat_id_dict, - # var.primaryBitstream_dict, - # var.bitstream2bundle_dict, - # var.bundle_id_dict, - # var.community2logo_dict, - # var.collection2logo_dict, - # var.bitstream_id_dict, - # var.community_id_dict, - # var.collection_id_dict, - # var.unknown_format_id_val, - # var.statistics_dict, - # args.save_dict_bool) - # import_user_metadata(var.bitstream_id_dict, - # var.user_registration_id_dict, - # var.statistics_dict) - # - # # migrate sequences - # migrate_sequences() + import_tasklistitem(var.workflowitem_id_dict, + var.eperson_id_dict, + var.statistics_dict) + var.unknown_format_id_val = import_bitstreamformatregistry( + var.bitstreamformat_id_dict, + var.unknown_format_id_val, + var.statistics_dict, + args.save_dict_bool) + import_bundle(metadata_class, + var.item_id_dict, + var.bundle_id_dict, + var.primaryBitstream_dict, + var.statistics_dict, + args.save_dict_bool) + import_bitstream(metadata_class, + var.bitstreamformat_id_dict, + var.primaryBitstream_dict, + var.bitstream2bundle_dict, + var.bundle_id_dict, + var.community2logo_dict, + var.collection2logo_dict, + var.bitstream_id_dict, + var.community_id_dict, + var.collection_id_dict, + var.unknown_format_id_val, + var.statistics_dict, + args.save_dict_bool) + import_user_metadata(var.bitstream_id_dict, + var.user_registration_id_dict, + var.statistics_dict) + + # migrate sequences + migrate_sequences() at_the_end_of_import(handle_class, var.statistics_dict) _logger.info("Data migration is completed!") From 161f2d19b52311b8060ae3613e00dff8a091bb70 Mon Sep 17 00:00:00 2001 From: milanmajchrak Date: Thu, 21 Sep 2023 16:11:34 +0200 Subject: [PATCH 3/6] The Item's collections are imported using custom clarin endpoint --- data_pump/metadata.py | 16 ++++++++-------- data_pump/utils.py | 11 ----------- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/data_pump/metadata.py b/data_pump/metadata.py index 7ef9043..d9d75b9 100644 --- a/data_pump/metadata.py +++ b/data_pump/metadata.py @@ -15,16 +15,16 @@ def __init__(self, statistics_dict, insert_dict): self.metadatavalue_dict = {} self.metadataschema_id_dict = {} self.metadatafield_id_dict = {} - # if insert_dict: - # self.metadataschema_id_dict = \ - # insert_data_into_dicts("metadataschemaregistry.json") - # self.metadatafield_id_dict = \ - # insert_data_into_dicts("metadatafieldregistry.json") + if insert_dict: + self.metadataschema_id_dict = \ + insert_data_into_dicts("metadataschemaregistry.json") + self.metadatafield_id_dict = \ + insert_data_into_dicts("metadatafieldregistry.json") # import all metadata - # self.read_metadata() - # self.import_metadataschemaregistry(statistics_dict) - # self.import_metadatafieldregistry(statistics_dict) + self.read_metadata() + self.import_metadataschemaregistry(statistics_dict) + self.import_metadatafieldregistry(statistics_dict) def read_metadata(self): metadatavalue_json_name = 'metadatavalue.json' diff --git a/data_pump/utils.py b/data_pump/utils.py index a352319..af98ae1 100644 --- a/data_pump/utils.py +++ b/data_pump/utils.py @@ -43,17 +43,6 @@ def do_api_post(url, params: dict, json_p): return response -def do_api_post_custom(url, params: dict, list, content_type): - """ - Insert data into database by api. - @param url: url for api post - @param params: parameters for api post - @param json_p: posted data - @return: response from api post - """ - url = API_URL + url - response = rest_proxy.d.api_post(url, params, list, False, content_type) - return response def do_api_get_one(url, object_id): From 6093d77b2ad0922cadeb5b1ec2dc37674770b272 Mon Sep 17 00:00:00 2001 From: milanmajchrak Date: Thu, 21 Sep 2023 16:12:06 +0200 Subject: [PATCH 4/6] refactoring --- data_pump/utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/data_pump/utils.py b/data_pump/utils.py index af98ae1..b16ad08 100644 --- a/data_pump/utils.py +++ b/data_pump/utils.py @@ -43,8 +43,6 @@ def do_api_post(url, params: dict, json_p): return response - - def do_api_get_one(url, object_id): """ Get data with id from table. From fd5de12d10aaedb4789061ea31bf23f8f3a94d32 Mon Sep 17 00:00:00 2001 From: milanmajchrak Date: Thu, 21 Sep 2023 16:24:38 +0200 Subject: [PATCH 5/6] Revert unwanted changes --- const.py | 14 +++++++------- data_pump/item.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/const.py b/const.py index 9d5bf6e..7a163b8 100644 --- a/const.py +++ b/const.py @@ -1,7 +1,7 @@ import enum import logging -user = "m@m.edu" +user = "test@test.edu" password = "dspace" # password = "admin" # user = "m@edu.com" @@ -9,12 +9,12 @@ # http or https use_ssl = False -host = "localhost" -# host = "dev-5.pc" -fe_port = ":4000" -# fe_port = None -be_port = ":8080" -# be_port = None +# host = "localhost" +host = "dev-5.pc" +# fe_port = ":4000" +fe_port = None +# be_port = ":8080" +be_port = None be_location = "/server/" # config logging diff --git a/data_pump/item.py b/data_pump/item.py index 4fdd679..1a1259f 100644 --- a/data_pump/item.py +++ b/data_pump/item.py @@ -1,7 +1,7 @@ import logging from data_pump.utils import read_json, convert_response_to_json, do_api_post, \ - save_dict_as_json, do_api_post_custom + save_dict_as_json from support.dspace_proxy import rest_proxy from const import API_URL From 3e2c447a4e33811007c18e3cb6b5cb2e71724983 Mon Sep 17 00:00:00 2001 From: milanmajchrak Date: Tue, 26 Sep 2023 16:14:56 +0200 Subject: [PATCH 6/6] Refactoring - added a comment --- support/dspace_interface/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/support/dspace_interface/client.py b/support/dspace_interface/client.py index a398bc3..27c4463 100644 --- a/support/dspace_interface/client.py +++ b/support/dspace_interface/client.py @@ -171,7 +171,7 @@ def api_post(self, url, params, data, retry=False, content_type='application/jso """ Perform a POST request. Refresh XSRF token if necessary. POSTs are typically used to create objects. - @param content_type: + @param content_type: Type of the content, it is `JSON` by default @param url: DSpace REST API URL @param params: Any parameters to include (eg ?parent=abbc-....) @param data: Data in json-ready form (dict) to send as