diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..9ef3dc6 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,5 @@ +*.sh eol=lf +*.py eol=lf +*.md eol=lf +apt-requirements.txt eol=lf +*.bat eol=crlf \ No newline at end of file diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index cd99ce1..58b2c7c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,33 +1,28 @@ -name: Test dspace on dev-5 +name: build-and-test on: workflow_dispatch: schedule: - # * is a special character in YAML so you have to quote this string - cron: '0 0 * * *' push: - branches: [ "main" ] jobs: test: - runs-on: dspace-bbt + runs-on: ubuntu-latest - # Steps represent a sequence of tasks that will be executed as part of the job steps: - uses: actions/checkout@v3 - name: install requirements run: pip install -r requirements.txt - - name: test - run: python3 -m unittest -v 2> output.txt - - - name: report result - run: echo $? > result.txt + - name: smoketest + run: | + cd ./src + python repo_import.py --help - # multi line commands for future reference - - name: Run a multi-line script + - name: test run: | - echo first line - echo second line + cd ./tests + python -m unittest discover ./ -v diff --git a/.gitignore b/.gitignore index 8aa10d2..0185194 100644 --- a/.gitignore +++ b/.gitignore @@ -139,3 +139,7 @@ clarin-dspace-dump-8.8.23 # data folders data/ temp-files/ + +__logs +input +*.bak \ No newline at end of file diff --git a/README.dev.md b/README.dev.md new file mode 100644 index 0000000..129b770 --- /dev/null +++ b/README.dev.md @@ -0,0 +1,46 @@ +# How to write new tests +Check the `test.example` package. Everything necessary should be there. + +Test data are in the `test/data` folder. +If your test data contains special characters like čřšáý and so on, it is recommended +to make a `.stripped` variation of the file. +E.g. `my_format.json` and `my_format.stripped.json` for loading data +and `my_format.test.xml` and `my_format.test.stripped.xml` for testing. + +If not on dev-5 (e.g. when run on localhost), the `.stripped` version of the files will be loaded. +The reason is that dspace has trouble with special characters when it runs on Windows. + + +## Settings +See const.py for the constants used in testing. +To set up logging, navigate to support.logs.py and modify the method set_up_logging. + +## Run + +To run the tests, use the command +`python -m unittest` + +A recommended variation is +`python -m unittest -v 2> output.txt` +which writes the results to output.txt + +Before running for the first time, the requirements must be installed with the following command +`pip install -r requirements.txt` + +It is also possible to run the tests in PyCharm with a configuration like this: + +![image](https://user-images.githubusercontent.com/88670521/186934112-d0f828fd-a809-4ed8-bbfd-4457b734d8fd.png) + + +# How to re-initialize dspace 7 database + +Recreate your local CLARIN-DSpace7.* database. **NOTE: all data will be deleted** + +- Install the database again following the official tutorial steps: https://wiki.lyrasis.org/display/DSDOC7x/Installing+DSpace#InstallingDSpace-PostgreSQL11.x,12.xor13.x(withpgcryptoinstalled) +- Or try to run these commands in `/bin`: +> - `createdb --username=postgres --owner=dspace --encoding=UNICODE dspace` // create database +> - `psql --username=postgres dspace -c "CREATE EXTENSION pgcrypto;"` // Add pgcrypto extension +> > If it throws a warning that the `-c` parameter was ignored, just run the `CREATE EXTENSION pgcrypto;` command in the database console.
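+> > For example, a minimal sketch of that fallback (assuming the default `postgres` superuser and the `dspace` database created in the step above):
+> > - `psql --username=postgres dspace` // open an interactive prompt connected to the dspace database
+> > - `CREATE EXTENSION pgcrypto;` // then run the command manually at that prompt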
+> > CREATE EXTENSION pgcrypto; ![image](https://user-images.githubusercontent.com/90026355/228528044-f6ad178c-f525-4b15-b6cc-03d8d94c8ccc.png) diff --git a/README.md b/README.md index e37e86f..460ca7e 100644 --- a/README.md +++ b/README.md @@ -13,143 +13,67 @@ there exists automatic function that sends email, what we don't want because we use this endpoint for importing existing data. ### Prerequisites: -- Installed CLARIN-DSpace7.*. with running database, solr, tomcat +1. Install CLARIN-DSpace7.* (postgres, solr, dspace backend) -### Steps: -1. Clone python-api: https://github.com/dataquest-dev/dspace-python-api (branch `main`) and dpace://https://github.com/dataquest-dev/DSpace (branch `dtq-dev`) +2. Clone python-api: https://github.com/dataquest-dev/dspace-python-api (branch `main`) and https://github.com/dataquest-dev/DSpace (branch `dtq-dev`) -*** -2. Get database dump (old CLARIN-DSpace) and unzip it into the `/bin` (or wherever you want) - -*** -3. Create CLARIN-DSpace5.* databases (dspace, utilities) from dump. -> // clarin-dspace database -> - `createdb --username=postgres --owner=dspace --encoding=UNICODE clarin-dspace` // create a clarin database with owner - -> // Running on second try: -> - `psql -U postgres clarin-dspace < ` +3. Get the database dump (old CLARIN-DSpace) and unzip it into the `input/dump` directory of the `dspace-python-api` project. -> // clarin-utilities database -> - `createdb --username=postgres --owner=dspace --encoding=UNICODE clarin-utilities` // create the utilities database with owner - -> // Running on second try: -> - `psql -U postgres clarin-utilities < ` - -*** -4. Recreate your local CLARIN-DSpace7.* database **NOTE: all data will be deleted** -- Install again the database following the official tutorial steps: https://wiki.lyrasis.org/display/DSDOC7x/Installing+DSpace#InstallingDSpace-PostgreSQL11.x,12.xor13.x(withpgcryptoinstalled) -- Or try to run these commands in the /bin: -> - `createdb --username=postgres --owner=dspace --encoding=UNICODE dspace` // create database -> - `psql --username=postgres dspace -c "CREATE EXTENSION pgcrypto;"` // Add pgcrypto extension -> > If it throws warning that `-c` parameter was ignored, just write a `CREATE EXTENSION pgcrypto;` command in the database cmd. -> > CREATE EXTENSION pgcrypto; -![image](https://user-images.githubusercontent.com/90026355/228528044-f6ad178c-f525-4b15-b6cc-03d8d94c8ccc.png) - - -> // Now the clarin database for DSpace7 should be created -> - Run the database by the command: `pg_ctl start -D "\data\"` +4. Create CLARIN-DSpace5.* databases (dspace, utilities) from the dump. +Run `scripts/start.local.dspace.db.bat` or use `scripts/init.dspacedb5.sh` directly with your database. *** -5. (Your DSpace project must be installed) Go to the `dspace/bin` and run the command `dspace database migrate force` // force because of local types +5. Go to `dspace/bin` in the dspace7 installation and run the command `dspace database migrate force` (force because of local types). **NOTE:** `dspace database migrate force` creates default database data that may not be in the database dump, so after migration some tables may have more data than the database dump. Data from the database dump that already exists in the database is not migrated. -*** 6. Create an admin by running the command `dspace create-administrator` in the `dspace/bin` *** -7. 
Prepare `dspace-python-api` project for migration -**IMPORTANT:** If `data` folder doesn't exist in the project, create it - -Update `const.py` -- `user = ""` -- `password = ""` - -- `# http or https` -- `use_ssl = False` -- `host = "" e.g., localhost` -- `# host = "dev-5.pc"` -- `fe_port = ""` -- `# fe_port = ":4000"` -- `be_port = ""` -- `# be_port = ":8080"` -- `be_location = "/server/"` -#### Database const - for copying sequences -- `CLARIN_DSPACE_NAME = "clarin-dspace"` -- `CLARIN_DSPACE_HOST = "localhost"` -- `CLARIN_DSPACE_USER = ""` -- `CLARIN_DSPACE_PASSWORD = ""` -- `CLARIN_UTILITIES_NAME = "clarin-utilities"` -- `CLARIN_UTILITIES_HOST = "localhost"` -- `CLARIN_UTILITIES_USER = ""` -- `CLARIN_UTILITIES_PASSWORD = ""` -- `CLARIN_DSPACE_7_NAME = "dspace"` -- `CLARIN_DSPACE_7_HOST = "localhost"` -- `CLARIN_DSPACE_7_PORT = 5432` -- `CLARIN_DSPACE_7_USER = ""` -- `CLARIN_DSPACE_7_PASSWORD = ""` -#### const - for importing licenses -- `OLD_LICENSE_DEFINITION_STRING = ` -- `NEW_LICENSE_DEFINITION_STRING = ` - -**NOTE:** Be sure, that `authorization = True`, because some of the used endpoints won't work +7. Create JSON files from the database tables. +**NOTE: You must do it for both databases `clarin-dspace` and `clarin-utilities`** (JSON files are stored in the `data` folder) +- Go to `dspace-python-api` and run +``` +pip install -r requirements.txt +(optional on ubuntu like systems) apt install libpq-dev +python db_to_json.py --database=clarin-dspace +python db_to_json.py --database=clarin-utilities +``` *** -8. Create JSON files from the database tables. **NOTE: You must do it for both databases `clarin-dspace` and `clarin-utilities`** (JSON files are stored in the `data` folder) -- Go to `dspace-python-api` in the cmd -- Run `pip install -r requirements.txt` -- Run `python create_jsons.py --database --host --user postgres --password ` e.g., `python create_jsons.py --database clarin-dspace --host localhost --user postgres --password pass` (arguments for database connection - database, host, user, password) for the BOTH databases // NOTE there must exist data folder in the project structure +8. Prepare `dspace-python-api` project for migration -*** -9. Make sure, your backend configuration (`dspace.cfg`) includes all handle prefixes from generated handle json in property `handle.additional.prefixes`, -e.g.,`handle.additional.prefixes = 11858, 11234, 11372, 11346, 20.500.12801, 20.500.12800` +- copy the files used during migration into `input/` directory: +``` +> ls -R ./input +input: +data dump icon + +input/data: +bitstream.json fileextension.json piwik_report.json +bitstreamformatregistry.json ... + +input/dump: +clarin-dspace-8.8.23.sql clarin-utilities-8.8.23.sql + +input/icon: +aca.png by.png gplv2.png mit.png ... +``` *** -10. Copy `assetstore` from dspace5 to dspace7 (for bitstream import). `assetstore` is in the folder where you have installed DSpace `dspace/assetstore`. +9. update `project_settings.py` *** -11. Create `icon/` folder if it doesn't exist in project and copy all the icons that are used into it. +10. Make sure, your backend configuration (`dspace.cfg`) includes all handle prefixes from generated handle json in property `handle.additional.prefixes`, +e.g.,`handle.additional.prefixes = 11858, 11234, 11372, 11346, 20.500.12801, 20.500.12800` + +11. Copy `assetstore` from dspace5 to dspace7 (for bitstream import). `assetstore` is in the folder where you have installed DSpace `dspace/assetstore`. *** -12. 
Import data from the json files (python-api/data/*) into dspace database (CLARIN-DSpace7.*) +12. Import data from the json files (python-api/input/*) into the dspace database (CLARIN-DSpace7.*) - **NOTE:** database must be up to date (`dspace database migrate force` must be called in the `dspace/bin`) - **NOTE:** dspace server must be running -- From the `dspace-python-api` run command `python main.data_pump.py` +- Run the command `cd ./src && python repo_import.py` -*** ## !!!Migration notes:!!! - The values of table attributes that describe the last modification time of dspace object (for example attribute `last_modified` in table `Item`) have a value that represents the time when that object was migrated and not the value from migrated database dump. - If you don't have valid and complete data, not all data will be imported. - -# How to write new tests -Check test.example package. Everything necessary should be there. - -Test data are in `test/data` folder. -If your test data contains special characters like čřšáý and so on, it is recommended -to make `.stripped` variation of the file. -E.g. `my_format.json` and `my_format.stripped.json` for loading data -and `my_format.test.xml` and `my_format.test.stripped.xml` for testing. - -If not on dev-5 (e.g. when run on localhost), `.stripped` version of files will be loaded. -The reason for this is, that when dspace runs on windows, it has trouble with special characters. - - -## Settings -See const.py for constants used at testing. - -To set up logs, navigate to support.logs.py and modify method set_up_logging. - -## Run - -In order to run tests, use command -`python -m unittest` - -Recommended variation is -`python -m unittest -v 2> output.txt` -which leaves result in output.txt - -Before running for the first time, requirements must be installed with following command -`pip install -r requirements.txt` - -It is possible to run in Pycharm with configuration like so: - -![image](https://user-images.githubusercontent.com/88670521/186934112-d0f828fd-a809-4ed8-bbfd-4457b734d8fd.png) diff --git a/icon/aca.png b/assets/icon/aca.png similarity index 100% rename from icon/aca.png rename to assets/icon/aca.png diff --git a/icon/bsd.png b/assets/icon/bsd.png similarity index 100% rename from icon/bsd.png rename to assets/icon/bsd.png diff --git a/icon/by.png b/assets/icon/by.png similarity index 100% rename from icon/by.png rename to assets/icon/by.png diff --git a/icon/cc.png b/assets/icon/cc.png similarity index 100% rename from icon/cc.png rename to assets/icon/cc.png diff --git a/icon/gplv2.png b/assets/icon/gplv2.png similarity index 100% rename from icon/gplv2.png rename to assets/icon/gplv2.png diff --git a/icon/gplv3.png b/assets/icon/gplv3.png similarity index 100% rename from icon/gplv3.png rename to assets/icon/gplv3.png diff --git a/icon/mit.png b/assets/icon/mit.png similarity index 100% rename from icon/mit.png rename to assets/icon/mit.png diff --git a/icon/nc-eu.png b/assets/icon/nc-eu.png similarity index 100% rename from icon/nc-eu.png rename to assets/icon/nc-eu.png diff --git a/icon/nc-jp.png b/assets/icon/nc-jp.png similarity index 100% rename from icon/nc-jp.png rename to assets/icon/nc-jp.png diff --git a/icon/nc.png b/assets/icon/nc.png similarity index 100% rename from icon/nc.png rename to assets/icon/nc.png diff --git a/icon/nd.png b/assets/icon/nd.png similarity index 100% rename from icon/nd.png rename to assets/icon/nd.png diff --git a/icon/osi.png b/assets/icon/osi.png similarity index 100% rename from icon/osi.png rename to 
assets/icon/osi.png diff --git a/icon/pd.png b/assets/icon/pd.png similarity index 100% rename from icon/pd.png rename to assets/icon/pd.png diff --git a/icon/remix.png b/assets/icon/remix.png similarity index 100% rename from icon/remix.png rename to assets/icon/remix.png diff --git a/icon/sa.png b/assets/icon/sa.png similarity index 100% rename from icon/sa.png rename to assets/icon/sa.png diff --git a/icon/sampling.plus.png b/assets/icon/sampling.plus.png similarity index 100% rename from icon/sampling.plus.png rename to assets/icon/sampling.plus.png diff --git a/icon/sampling.png b/assets/icon/sampling.png similarity index 100% rename from icon/sampling.png rename to assets/icon/sampling.png diff --git a/icon/share.png b/assets/icon/share.png similarity index 100% rename from icon/share.png rename to assets/icon/share.png diff --git a/icon/zero.png b/assets/icon/zero.png similarity index 100% rename from icon/zero.png rename to assets/icon/zero.png diff --git a/const.py b/const.py deleted file mode 100644 index fefc90c..0000000 --- a/const.py +++ /dev/null @@ -1,91 +0,0 @@ -import enum -import logging - -user = "test@test.edu" -password = "dspace" -authentication = True - -# http or https -use_ssl = False -# host = "localhost" -host = "dev-5.pc" -# fe_port = ":4000" -fe_port = None -# be_port = ":8080" -be_port = None -be_location = "/server/" - -# config logging -logging.basicConfig(filename='logs.log', encoding='utf-8', level=logging.INFO) - -on_dev_5 = host == "dev-5.pc" - -# there should be no need to modify this part, unless adding new tests. -# mainly concatenates and parses settings above -protocol = "https://" if use_ssl else "http://" -url = protocol + host -FE_url = url + (fe_port if fe_port else "") -BE_url = url + (be_port if be_port else "") + be_location -OAI_url = BE_url + "oai/" -OAI_req = OAI_url + "request?verb=ListRecords&metadataPrefix=oai_dc&set=" -OAI_openaire_dc = OAI_url + "openaire_data?verb=ListRecords&" \ - "metadataPrefix=oai_dc&set=" -OAI_openaire_datacite = OAI_url + "openaire_data?verb=ListRecords&" \ - "metadataPrefix=oai_datacite&set=" -OAI_olac = OAI_url + "request?verb=ListRecords&metadataPrefix=olac&set=" -OAI_cmdi = OAI_url + "request?verb=ListRecords&metadataPrefix=cmdi&set=" -API_URL = BE_url + "api/" -IMPORT_DATA_PATH = "data/license_import/" -COM = "BB-TEST-COM" -com_UUID = None -COL = "BB-TEST-COL" -col_UUID = None -ITM_prefix = "BB-TEST-ITM-" -EMBEDDED = "_embedded" - -# Database const - for copying sequences -# CLARIN-DSpace 5 databases -# CLARIN_DSPACE_* -CLARIN_DSPACE_NAME = "clarin-dspace" -CLARIN_DSPACE_HOST = "localhost" -CLARIN_DSPACE_USER = "" -CLARIN_DSPACE_PASSWORD = "" - -# CLARIN_UTILITIES_* -CLARIN_UTILITIES_NAME = "clarin-utilities" -CLARIN_UTILITIES_HOST = "localhost" -CLARIN_UTILITIES_USER = "" -CLARIN_UTILITIES_PASSWORD = "" - -# CLARIN-DSpace 7 database -CLARIN_DSPACE_7_NAME = "dspace" -CLARIN_DSPACE_7_HOST = "localhost" -CLARIN_DSPACE_7_PORT = 5432 -CLARIN_DSPACE_7_USER = "" -CLARIN_DSPACE_7_PASSWORD = "" - -# IMPORTING LICENSES -# String which should be replaced -OLD_LICENSE_DEFINITION_STRING = 'https://lindat.mff.cuni.cz/repository/xmlui/page/' -# String which will be replaced instead of OLD_LICENSE_DEFINITION_STRING -NEW_LICENSE_DEFINITION_STRING = FE_url + '/static/' - - -class ItemType(enum.Enum): - ITEM = 1 - COMMUNITY = 2 - COLLECTION = 3 - - -# Handle prefix for item version migration -HANDLE_PREFIX = "http://hdl.handle.net/" - -# constants for resource type ID, taken from DSpace (BE) codebase -BITSTREAM = 0 -BUNDLE 
= 1 -ITEM = 2 -COLLECTION = 3 -COMMUNITY = 4 -SITE = 5 -GROUP = 6 -EPERSON = 7 diff --git a/data_checker/main.resource_policy_check.py b/data_checker/main.resource_policy_check.py deleted file mode 100644 index 211b056..0000000 --- a/data_checker/main.resource_policy_check.py +++ /dev/null @@ -1,172 +0,0 @@ -import os -import logging -import requests -import psycopg2 - -import const - -from data_pump.utils import create_dict_from_json, read_json, \ - convert_response_to_json, do_api_get_one -from support.dspace_proxy import rest_proxy - -logging.basicConfig(level=logging.INFO) -_logger = logging.getLogger("resource_checker") - - -def convert_old_ids_to_new(old_object_ids, map_dict): - """ - Create list of IDs of dspace 7 from IDs of dspace5 based on their mapping. - @param old_object_ids: list of IDs from dspace5 - @param map_dict: dict of IDs mapping - @return: list of IDs of dspace7 - """ - new_ids = [map_dict[old_id] for old_id in old_object_ids] - return new_ids - - -def get_data_from_database(): - """ - Get data from dspace5 based on SELECT. - We want all IDs of items, which are READ able for Anonymous - and are not workspace or workflow. - @return: list od item ids from dspace5 - """ - resource_ids_list = [] - # create database connection - conn = psycopg2.connect(database=const.CLARIN_DSPACE_NAME, - host=const.CLARIN_DSPACE_HOST, - user=const.CLARIN_DSPACE_USER, - password=const.CLARIN_DSPACE_PASSWORD) - logging.info("Connection to database " + const.CLARIN_DSPACE_NAME + - " was successful!") - # create select - # we want all resource_ids for items - # where the action is READ - # which are not workspaces or workflows - # item exists in item table - # owning group is Anonymous - cursor = conn.cursor() - cursor.execute( - "SELECT distinct resource_id FROM public.resourcepolicy " + - "WHERE resource_type_id = '2' " + - "AND action_id IN (0, 9, 10) " + - "AND NOT EXISTS (SELECT 'x' FROM public.workspaceitem WHERE " + - "public.resourcepolicy.resource_id = public.workspaceitem.item_id)" - "AND NOT EXISTS (SELECT 'x' FROM public.workflowitem WHERE " + - "public.resourcepolicy.resource_id = public.workflowitem.item_id) " + - "AND EXISTS (SELECT 'x' FROM public.item WHERE " + - "public.resourcepolicy.resource_id = public.item.item_id) " + - "AND epersongroup_id = '0'") - # list of tuples - result_t = cursor.fetchall() - cursor.close() - conn.close() - # create list from select result - for resource_id_t in result_t: - resource_id = resource_id_t[0] - resource_ids_list.append(resource_id) - return resource_ids_list - -if __name__ == "__main__": - _logger.info('Resource policies checker of anonymous view of items') - item_dict_json = "item_dict.json" - handle_json = "handle.json" - - statistics = {} - # keys for statistics - DSPACE5_STR = 'Count of visible items in Dspace5' - DSPACE7_STR = 'Count of visible items in Dspace7' - VISIBLE_STR = 'Count of visible items from Dspace5 in Dspace7' - NOTFOUND_VISIBLE_STR = ("Count of visible items from Dspace5 didn't found in Dspace7 " - "but they are visible there too") - NOTFOUND_STR = "Count of visible items from Dspace5 didn't found in Dspace7" - INHERITED_STR = ("Count of visible items from Dspace7 didn't found " - "in Dspace5 but they are visible there too") - - # get a dictionary mapping dspace5 IDs to dspace7 IDs for items - item_dict = create_dict_from_json(item_dict_json) - # get IDs of item from dspace5 base od select - old_item_list = get_data_from_database() - statistics[DSPACE5_STR] = len(old_item_list) - # get IDs for dspace7 from IDs 
from dspace5 based on map - new_item_list = convert_old_ids_to_new(old_item_list, item_dict) - - # list od item IDs from dspace7 which can READ Anonymous - item_ids_list=[] - # get total pages for search - # max page size for this request is 100 - response = rest_proxy.get('discover/search/objects?sort=score,' - 'DESC&size=100&page=0&configuration=default' - '&dsoType=ITEM&embed=thumbnail&embed=item%2Fthumbnail') - response_json = convert_response_to_json(response) - totalPages = objects = response_json['_embedded']['searchResult']['page']['totalPages'] - # get result from each page - # we don't get items which are withdrawn or discoverable - for page in range(totalPages): - response = rest_proxy.get('discover/search/objects?sort=score,DESC&size=100&page=' + - str(page) + - '&configuration=default&' - 'dsoType=ITEM&embed=thumbnail&embed=item%2Fthumbnail') - response_json = convert_response_to_json(response) - objects = response_json['_embedded']['searchResult']['_embedded']['objects'] - # add each object to result list - for item in objects: - item_ids_list.append(item['_embedded']['indexableObject']['id']) - statistics[DSPACE7_STR] = len(item_ids_list) - - # compare expected items in dspace5 and got items from dspace7 - # log items, which we cannot find - item_url = 'core/items' - notfound = 0 - notfound_but_visible = 0 - found = 0 - for id_ in new_item_list: - if id_ in item_ids_list: - item_ids_list.remove(id_) - found += 1 - else: - # check if we really don't have access to item in Dspace7 - response = do_api_get_one(item_url, id_) - if response.ok: - notfound_but_visible += 1 - else: - _logger.error(f"Item with id: {id_} is not visible in DSpace7, " - f"but it is visible in DSpace5! " - f"Import of resource policies was incorrect!") - notfound += 1 - statistics[VISIBLE_STR] = found - statistics[NOTFOUND_VISIBLE_STR] = notfound_but_visible - statistics[NOTFOUND_STR] = notfound - - #now in new_item_list are items whose resource_policy - # was not found in dspace5 - # it could be because in dspace7 is using inheritance for resource policies - # check if you have access for these items in dspace5 - # based on their handles or there was import error - item_lindat_url = 'https://lindat.mff.cuni.cz/repository/xmlui/handle/' - # load handle_json - handle_json = read_json(handle_json) - # create dict - handle_dict = {} - # handle has to be defined for item and item has to exist - handle_dict = {item_dict[handle['resource_id']]: handle['handle'] - for handle in handle_json if - handle['resource_type_id'] == 2 and - handle['resource_id'] in item_dict} - # do request to dspace5 for remaining items - found = 0 - notfound = 0 - for id_ in item_ids_list: - response = requests.get(item_lindat_url + handle_dict[id_]) - if response.ok: - found += 1 - else: - _logger.error(f"Item with id {id_} is visible in Dspace7 " - f"but not in Dspace5! This is a data breach!") - raise Exception(f"Item with id {id_} is visible in Dspace7 but " - f"not in Dspace5! 
This is a data breach!") - statistics[INHERITED_STR] = found - - # write statistics to logs - for key, value in statistics.items(): - _logger.info(f"{key}: {value}") \ No newline at end of file diff --git a/data_pump/bitstream.py b/data_pump/bitstream.py deleted file mode 100644 index 88e0562..0000000 --- a/data_pump/bitstream.py +++ /dev/null @@ -1,182 +0,0 @@ -import logging - -from data_pump.utils import read_json, convert_response_to_json, do_api_post, \ - save_dict_as_json -from migration_const import BITSTREAM_DICT - -def import_bitstream(metadata_class, - bitstreamformat_id_dict, - primary_bitstream_dict, - bitstream2bundle_dict, - bundle_id_dict, - community2logo_dict, - collection2logo_dict, - bitstream_id_dict, - community_id_dict, - collection_id_dict, - unknown_format_id_val, - statistics_dict, - save_dict): - """ - Import data into database. - Mapped tables: bitstream, bundle2bitstream, metadata, most_recent_checksum - and checksum_result - """ - bitstream_json_name = 'bitstream.json' - bundle2bitstream_json_name = 'bundle2bitstream.json' - bitstream_url = 'clarin/import/core/bitstream' - imported = 0 - - # load bundle2bitstream - bundle2bitstream_json_list = read_json(bundle2bitstream_json_name) - if bundle2bitstream_json_list: - for bundle2bitstream in bundle2bitstream_json_list: - bitstream2bundle_dict[bundle2bitstream['bitstream_id']] = \ - bundle2bitstream['bundle_id'] - - # load and import bitstreams - bitstream_json_list = read_json(bitstream_json_name) - if not bitstream_json_list: - logging.info("Bitstream JSON is empty.") - return - counter = 0 - for bitstream in bitstream_json_list: - # do bitstream checksum - # do this after every 500 imported bitstreams, - # because the server may be out of memory - if counter % 500 == 0: - do_most_recent_checksum() - counter = 0 - counter += 1 - bitstream_json_p = {} - metadata_bitstream_dict = \ - metadata_class.get_metadata_value(0, bitstream['bitstream_id']) - if metadata_bitstream_dict is not None: - bitstream_json_p['metadata'] = metadata_bitstream_dict - bitstream_json_p['sizeBytes'] = bitstream['size_bytes'] - bitstream_json_p['checkSum'] = { - 'checkSumAlgorithm': bitstream['checksum_algorithm'], - 'value': bitstream['checksum'] - } - if not bitstream['bitstream_format_id']: - logging.info( - f'Bitstream {bitstream["bitstream_id"]} ' - f'does not have a bitstream_format_id. 
' - f'Using {unknown_format_id_val} instead.') - bitstream['bitstream_format_id'] = unknown_format_id_val - params = {'internal_id': bitstream['internal_id'], - 'storeNumber': bitstream['store_number'], - 'bitstreamFormat': bitstreamformat_id_dict[ - bitstream['bitstream_format_id']], - 'deleted': bitstream['deleted'], - 'sequenceId': bitstream['sequence_id'], - 'bundle_id': None, - 'primaryBundle_id': None} - - # if bitstream has bundle, set bundle_id from None to id - if bitstream['bitstream_id'] in bitstream2bundle_dict: - params['bundle_id'] = \ - bundle_id_dict[bitstream2bundle_dict[bitstream['bitstream_id']]] - - # if bitstream is primary bitstream of some bundle, - # set primaryBundle_id from None to id - if bitstream['bitstream_id'] in primary_bitstream_dict: - params['primaryBundle_id'] = \ - bundle_id_dict[primary_bitstream_dict[bitstream['bitstream_id']]] - try: - logging.info('Going to process Bitstream with internal_id: ' + - str(bitstream['internal_id'])) - response = do_api_post(bitstream_url, params, bitstream_json_p) - bitstream_id_dict[bitstream['bitstream_id']] = \ - convert_response_to_json(response)['id'] - imported += 1 - except Exception as e: - logging.error( - 'POST request ' + bitstream_url + ' for id: ' + - str(bitstream['bitstream_id']) + ' failed. Exception: ' + - str(e)) - - # do bitstream checksum for the last imported bitstreams - # these bitstreams can be less than 500, so it is not calculated in a loop - do_most_recent_checksum() - - # write bitstream dict as json - if save_dict: - save_dict_as_json(BITSTREAM_DICT, bitstream_id_dict) - statistics_val = (len(bitstream_json_list), imported) - statistics_dict['bitstream'] = statistics_val - # add logos (bitstreams) to collections and communities - add_logo_to_community(community2logo_dict, bitstream_id_dict, community_id_dict) - add_logo_to_collection(collection2logo_dict, bitstream_id_dict, collection_id_dict) - - logging.info( - "Bitstream, bundle2bitstream, most_recent_checksum " - "and checksum_result were successfully imported!") - - -def do_most_recent_checksum(): - """ - Fill the tables most_recent_checksum and checksum_result based - on imported bitstreams that haven't already their checksum - calculated. - """ - checksum_url = 'clarin/import/core/bitstream/checksum' - try: - response = do_api_post(checksum_url, {}, None) - if not response.ok: - raise Exception(response) - except Exception as e: - logging.error('POST request ' + - checksum_url + ' failed. Exception: ' + str(e)) - - -def add_logo_to_community(community2logo_dict, bitstream_id_dict, community_id_dict): - """ - Add bitstream to community as community logo. - Logo has to exist in database. - """ - logo_comm_url = 'clarin/import/logo/community' - if not community2logo_dict: - logging.info("There are no logos for communities.") - return - for key, value in community2logo_dict.items(): - if key not in community_id_dict or value not in bitstream_id_dict: - continue - params = { - 'community_id': community_id_dict[key], - 'bitstream_id': bitstream_id_dict[value] - } - try: - response = do_api_post(logo_comm_url, params, None) - if not response.ok: - raise Exception(response) - except Exception as e: - logging.error('POST request ' + logo_comm_url + ' for community: ' + - str(key) + ' failed. Exception: ' + str(e)) - logging.info( - "Logos for communities were successfully added!") - - -def add_logo_to_collection(collection2logo_dict, bitstream_id_dict, collection_id_dict): - """ - Add bitstream to collection as collection logo. 
- Logo has to exist in database. - """ - logo_coll_url = 'clarin/import/logo/collection' - if not collection2logo_dict: - logging.info("There are no logos for collections.") - return - for key, value in collection2logo_dict.items(): - if key not in collection_id_dict or value not in bitstream_id_dict: - continue - params = {'collection_id': collection_id_dict[key], - 'bitstream_id': bitstream_id_dict[value]} - try: - response = do_api_post(logo_coll_url, params, None) - if not response.ok: - raise Exception(response) - except Exception as e: - logging.error('POST request ' + logo_coll_url + ' for collection: ' + - str(key) + ' failed. Exception: ' + str(e)) - logging.info( - "Logos for collections were successfully added!") diff --git a/data_pump/bitstreamformatregistry.py b/data_pump/bitstreamformatregistry.py deleted file mode 100644 index 314f414..0000000 --- a/data_pump/bitstreamformatregistry.py +++ /dev/null @@ -1,83 +0,0 @@ -import logging - -from data_pump.utils import read_json, convert_response_to_json, do_api_get_all, \ - do_api_post, save_dict_as_json -from migration_const import BITSTREAM_FORMAT_DICT - -def import_bitstreamformatregistry(bitstreamformat_id_dict, - unknown_format_id_val, - statistics_dict, - save_dict): - """ - Import data into database. - Mapped tables: bitstreamformatregistry - """ - bitsteamformat_json_name = 'bitstreamformatregistry.json' - bitstreamformat_url = 'core/bitstreamformats' - imported = 0 - # read all existing data from bitstreamformatregistry - shortDesc2Id_dict = {} - try: - response = do_api_get_all(bitstreamformat_url) - bitstreamformat_json = \ - convert_response_to_json(response)['_embedded']['bitstreamformats'] - if bitstreamformat_json is not None: - for bitstreamformat in bitstreamformat_json: - shortDesc2Id_dict[bitstreamformat['shortDescription']] = \ - bitstreamformat['id'] - if bitstreamformat['description'] == 'Unknown data format': - unknown_format_id_val = bitstreamformat['id'] - - bitstreamformat_json_list = read_json(bitsteamformat_json_name) - if not bitstreamformat_json_list: - logging.info("Bitstreamformatregistry JSON is empty.") - return - - for bitstreamformat in bitstreamformat_json_list: - level = bitstreamformat['support_level'] - if level == 0: - level_str = "UNKNOWN" - elif level == 1: - level_str = "KNOWN" - elif level == 2: - level_str = "SUPPORTED" - else: - logging.error('Unsupported bitstream format registry id: ' + str(level)) - continue - - bitstreamformat_json_p = { - 'mimetype': bitstreamformat['mimetype'], - 'description': bitstreamformat['description'], - 'shortDescription': bitstreamformat['short_description'], - 'supportLevel': level_str, - 'internal': bitstreamformat['internal'] - } - try: - response = do_api_post(bitstreamformat_url, {}, - bitstreamformat_json_p) - bitstreamformat_id_dict[bitstreamformat['bitstream_format_id']] = \ - convert_response_to_json(response)['id'] - imported += 1 - except Exception as e: - if response.status_code == 200 or response.status_code == 201: - bitstreamformat_id_dict[bitstreamformat['bitstream_format_id']] = \ - shortDesc2Id_dict[bitstreamformat['short_description']] - logging.info('Bitstreamformatregistry with short description ' + - bitstreamformat['short_description'] + - ' already exists in database!') - else: - logging.error('POST request ' + bitstreamformat_url + ' for id: ' + - str(bitstreamformat['bitstream_format_id']) + - ' failed. 
Exception: ' + str(e)) - - # save bitstreamregistry dict as json - if save_dict: - save_dict_as_json(BITSTREAM_FORMAT_DICT, bitstreamformat_id_dict) - statistics_val = (len(bitstreamformat_json_list), imported) - statistics_dict['bitstreamformatregistry'] = statistics_val - except Exception as e: - logging.error('GET request ' + bitstreamformat_url + - ' failed. Exception: ' + str(e)) - - logging.info("Bitstream format registry was successfully imported!") - return unknown_format_id_val diff --git a/data_pump/bundle.py b/data_pump/bundle.py deleted file mode 100644 index 1b4309d..0000000 --- a/data_pump/bundle.py +++ /dev/null @@ -1,71 +0,0 @@ -import logging - -from data_pump.utils import read_json, convert_response_to_json, do_api_post, \ - save_dict_as_json -from migration_const import BUNDLE_DICT - -def import_bundle(metadata_class, - item_id_dict, - bundle_id_dict, - primary_bitstream_dict, - statistics_dict, - save_dict): - """ - Import data into database. - Mapped tables: item2bundle, bundle - """ - item2bundle_json_name = 'item2bundle.json' - bundle_json_name = 'bundle.json' - item_url = 'core/items/' - imported = 0 - # load item2bundle into dict - item2bundle_json_list = read_json(item2bundle_json_name) - statistics_val = (len(item2bundle_json_list), 0) - statistics_dict['item2bundle'] = statistics_val - item2bundle_dict = {} - if not item2bundle_json_list: - logging.info("Item2bundle JSON is empty.") - return - for item2bundle in item2bundle_json_list: - if item2bundle['item_id'] in item2bundle_dict: - item2bundle_dict[item2bundle['item_id']].append(item2bundle['bundle_id']) - else: - item2bundle_dict[item2bundle['item_id']] = [item2bundle['bundle_id']] - - # load bundles and map bundles to their primary bitstream ids - bundle_json_list = read_json(bundle_json_name) - if not bundle_json_list: - logging.info("Bundle JSON is empty.") - return - for bundle in bundle_json_list: - if bundle['primary_bitstream_id']: - primary_bitstream_dict[bundle['primary_bitstream_id']] = bundle['bundle_id'] - - # import bundle without primary bitstream id - if not item2bundle_dict: - logging.info("Bundle JSON is empty.") - return - for item in item2bundle_dict.items(): - for bundle in item[1]: - bundle_json_p = {} - metadata_bundle_dict = metadata_class.get_metadata_value(1, bundle) - if metadata_bundle_dict: - bundle_json_p['metadata'] = metadata_bundle_dict - bundle_json_p['name'] = metadata_bundle_dict['dc.title'][0]['value'] - - bundle_url = item_url - try: - bundle_url += str(item_id_dict[item[0]]) + "/bundles" - response = do_api_post(bundle_url, {}, bundle_json_p) - bundle_id_dict[bundle] = convert_response_to_json(response)['uuid'] - imported += 1 - except Exception as e: - logging.error('POST request ' + bundle_url + - ' failed. 
Exception: ' + str(e)) - - # save bundle dict as json - if save_dict: - save_dict_as_json(BUNDLE_DICT, bundle_id_dict) - statistics_val = (statistics_dict['item2bundle'][0], imported) - statistics_dict['item2bundle'] = statistics_val - logging.info("Bundle and Item2Bundle were successfully imported!") diff --git a/data_pump/collection.py b/data_pump/collection.py deleted file mode 100644 index 554bc2d..0000000 --- a/data_pump/collection.py +++ /dev/null @@ -1,134 +0,0 @@ -import logging - -from data_pump.utils import read_json, convert_response_to_json, do_api_post, \ - save_dict_as_json -from migration_const import COLLECTION_DICT - - -def import_collection(metadata_class, - handle_class, - group_id_dict, - community_id_dict, - collection_id_dict, - collection2logo_dict, - statistics_dict, - save_dict): - """ - Import data into database. - Mapped tables: collection, community2collection, metadatavalue, handle - """ - collection_json_name = 'collection.json' - com2col_json_name = 'community2collection.json' - metadata_json_name = 'metadatavalue.json' - collection_url = 'core/collections' - imported_coll = 0 - imported_group = 0 - collection_json_list = read_json(collection_json_name) - comm2coll_json_list = read_json(com2col_json_name) - coll2comm_dict = {} - - if not comm2coll_json_list: - logging.info("Community2collection JSON is empty.") - return - for comm2coll in comm2coll_json_list: - coll2comm_dict[comm2coll['collection_id']] = comm2coll['community_id'] - - # because the role DEFAULT_READ is without old group id in collection - coll2group_dict = {} - metadata_json_list = read_json(metadata_json_name) - - if metadata_json_list is not None: - for metadata in metadata_json_list: - if metadata['resource_type_id'] == 6 and \ - 'COLLECTION_' in metadata['text_value'] and\ - '_DEFAULT_READ' in metadata['text_value']: - text = metadata['text_value'] - positions = [ind for ind, ch in enumerate(text) if ch == '_'] - coll2group_dict[int(text[positions[0] + 1: positions[1]])] = \ - metadata['resource_id'] - - if not collection_json_list: - logging.info("Collection JSON is empty.") - return - for collection in collection_json_list: - collection_json_p = {} - metadata_col_dict =\ - metadata_class.get_metadata_value(3, collection['collection_id']) - if metadata_col_dict: - collection_json_p['metadata'] = metadata_col_dict - handle_col = handle_class.get_handle(3, collection['collection_id']) - if handle_col: - collection_json_p['handle'] = handle_col - params = {'parent': community_id_dict[coll2comm_dict[ - collection['collection_id']]]} - try: - response = do_api_post(collection_url, params, collection_json_p) - coll_id = convert_response_to_json(response)['id'] - collection_id_dict[collection['collection_id']] = coll_id - imported_coll += 1 - except Exception as e: - logging.error( - 'POST request ' + collection_url + ' for id: ' + - str(collection['collection_id']) + 'failed. 
Exception: ' + str(e)) - continue - - # add to collection2logo, if collection has logo - if collection['logo_bitstream_id'] is not None: - collection2logo_dict[collection['collection_id']] = \ - collection["logo_bitstream_id"] - - # greate group - # template_item_id, workflow_step_1, workflow_step_3, admin are not implemented, - # because they are null in all data - if collection['workflow_step_2']: - workflowGroups_url = collection_url + '/' + \ - coll_id + '/workflowGroups/editor' - try: - response = do_api_post(workflowGroups_url, {}, {}) - group_id_dict[collection['workflow_step_2']] = [ - convert_response_to_json(response)['id']] - imported_group += 1 - except Exception as e: - logging.error('POST request ' + workflowGroups_url + - ' failed. Exception: ' + str(e)) - if collection['submitter']: - submittersGroup_url = collection_url + '/' + \ - coll_id + '/submittersGroup' - try: - response = do_api_post(submittersGroup_url, {}, {}) - group_id_dict[collection['submitter']] = \ - [convert_response_to_json(response)['id']] - imported_group += 1 - except Exception as e: - logging.error('POST request ' + submittersGroup_url + - ' failed. Exception: ' + str(e)) - if collection['collection_id'] in coll2group_dict: - bitstreamReadGroup_url = collection_url + '/' + \ - coll_id + '/bitstreamReadGroup' - try: - response = do_api_post(bitstreamReadGroup_url, {}, {}) - group_id_dict[coll2group_dict[collection['collection_id']]] = [ - convert_response_to_json(response)['id']] - imported_group += 1 - except Exception as e: - logging.error('POST request ' + bitstreamReadGroup_url + - ' failed. Exception: ' + str(e)) - itemReadGroup_url = collection_url + '/' +\ - coll_id + '/itemReadGroup' - try: - response = do_api_post(itemReadGroup_url, {}, {}) - group_id_dict[coll2group_dict[collection['collection_id']]].append( - convert_response_to_json(response)['id']) - imported_group += 1 - except Exception as e: - logging.error('POST request ' + itemReadGroup_url - + ' failed. Exception: ' + str(e)) - - # save collection dict as json - if save_dict: - save_dict_as_json(COLLECTION_DICT, collection_id_dict) - statistics_val = (len(collection_json_list), imported_coll) - statistics_dict['collection'] = statistics_val - statistics_val = (0, statistics_dict['epersongroup'][1] + imported_group) - statistics_dict['epersongroup'] = statistics_val - logging.info("Collection and Community2collection were successfully imported!") diff --git a/data_pump/community.py b/data_pump/community.py deleted file mode 100644 index dfa4704..0000000 --- a/data_pump/community.py +++ /dev/null @@ -1,111 +0,0 @@ -import logging - -from data_pump.utils import read_json, convert_response_to_json, do_api_post, \ - save_dict_as_json -from migration_const import COMMUNITY_DICT - - -def import_community(metadata_class, - handle_class, - group_id_dict, - community_id_dict, - community2logo_dict, - statistics_dict, - save_dict): - """ - Import data into database. 
- Mapped tables: community, community2community, metadatavalue, handle - """ - community_json_name = 'community.json' - comm2comm_json_name = 'community2community.json' - community_url = 'core/communities' - imported_comm = 0 - imported_group = 0 - community_json_list = read_json(community_json_name) - comm2comm_json_list = read_json(comm2comm_json_name) - parent_dict = {} - child_dict = {} - if comm2comm_json_list is not None: - for comm2comm in comm2comm_json_list: - parent_id = comm2comm['parent_comm_id'] - child_id = comm2comm['child_comm_id'] - if parent_id in parent_dict.keys(): - parent_dict[parent_id].append(child_id) - else: - parent_dict[parent_id] = [child_id] - if child_id in child_dict.keys(): - child_dict[child_id].append(parent_id) - else: - child_dict[child_id] = parent_id - statistics_dict['community'] = (len(community_json_list), 0) - if not community_json_list: - logging.info("Community JSON is empty.") - return - counter = 0 - while community_json_list: - community_json_p = {} - # process community only when: - # comm is not parent and child - # comm is parent and not child - # parent comm exists - # else process it later - community = community_json_list[counter] - i_id = community['community_id'] - if (i_id not in parent_dict.keys() and - i_id not in child_dict.keys()) or\ - i_id not in child_dict.keys() or \ - child_dict[i_id] in community_id_dict.keys(): - # resource_type_id for community is 4 - handle_comm = handle_class.get_handle(4, community['community_id']) - if handle_comm: - community_json_p['handle'] = handle_comm - metadatavalue_comm_dict = \ - metadata_class.get_metadata_value(4, community['community_id']) - if metadatavalue_comm_dict: - community_json_p['metadata'] = metadatavalue_comm_dict - # create community - parent_id = None - if i_id in child_dict: - parent_id = {'parent': community_id_dict[child_dict[i_id]]} - try: - response = do_api_post(community_url, parent_id, community_json_p) - response_comm_id = convert_response_to_json(response)['id'] - community_id_dict[community['community_id']] = response_comm_id - imported_comm += 1 - except Exception as e: - logging.error('POST request ' + community_url + ' for id: ' + str(i_id) - + ' failed. Exception: ' + str(e)) - continue - - # add to community2logo, if community has logo - if community['logo_bitstream_id'] is not None: - community2logo_dict[i_id] = community["logo_bitstream_id"] - - # create admingroup - if community['admin'] is not None: - admin_url = community_url + '/' + response_comm_id + '/adminGroup' - try: - response = do_api_post(admin_url, {}, {}) - group_id_dict[community['admin']] = [convert_response_to_json( - response)['id']] - imported_group += 1 - except Exception as e: - logging.error('POST request ' + admin_url + - ' failed. 
Exception: ' + str(e)) - del community_json_list[counter] - else: - counter += 1 - if counter == len(community_json_list): - counter = 0 - - # save community dict as json - if save_dict: - save_dict_as_json(COMMUNITY_DICT, community_id_dict) - - if 'community' in statistics_dict: - statistics_val = (statistics_dict['community'][0], imported_comm) - statistics_dict['community'] = statistics_val - - statistics_val = (0, imported_group) - statistics_dict['epersongroup'] = statistics_val - logging.info("Community and Community2Community were successfully imported!") diff --git a/data_pump/create_jsons.py b/data_pump/create_jsons.py deleted file mode 100644 index 8ecee36..0000000 --- a/data_pump/create_jsons.py +++ /dev/null @@ -1,47 +0,0 @@ -import argparse -import logging -import os - -import psycopg2 -import json - - -def get_data_as_json(database, host, db_user, db_password): - # create database connection - conn = psycopg2.connect(database=database, - host=host, - user=db_user, - password=db_password) - logging.info("Connection was successful!") - - cursor = conn.cursor() - cursor.execute( - "SELECT table_name FROM information_schema.tables WHERE is_insertable_into = " - "'YES' AND table_schema = 'public'") - # list of tuples - table_name = cursor.fetchall() - logging.info("Processing...") - for name_t in table_name: - # access to 0. position, because name_t is tuple - name = name_t[0] - j_name = os.path.join("../data", name + ".json") - with open(j_name, 'w', encoding='utf-8') as j: - cursor.execute("SELECT json_agg(row_to_json(t)) FROM \"{}\" t".format(name)) - # access to 0. position, because the fetchone returns tuple - created_json = json.dumps(cursor.fetchone()[0]) - j.write(created_json) - logging.info("Data was successfully exported!") - conn.close() - logging.info("Disconnect from database!") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Process database connection') - parser.add_argument('--database', help='database name', - required=True, type=str) - parser.add_argument('--host', help='type of host', required=True, type=str) - parser.add_argument('--user', help='database user', required=True, type=str) - parser.add_argument('--password', help='database password', - required=True, type=str) - args = parser.parse_args() - get_data_as_json(args.database, args.host, args.user, args.password) diff --git a/data_pump/eperson.py b/data_pump/eperson.py deleted file mode 100644 index 782953d..0000000 --- a/data_pump/eperson.py +++ /dev/null @@ -1,117 +0,0 @@ -import logging - -from const import API_URL -from data_pump.utils import read_json, convert_response_to_json, do_api_post, \ - save_dict_as_json -from migration_const import EPERSON_DICT - - -def import_eperson(metadata_class, - eperson_id_dict, - email2epersonId_dict, - statistics_dict, - save_dict): - """ - Import data into database. 
- Mapped tables: eperson, metadatavalue - """ - eperson_json_name = 'eperson.json' - eperson_url = 'clarin/import/eperson' - imported_eperson = 0 - eperson_json_list = read_json(eperson_json_name) - - if not eperson_json_list: - logging.info("Eperson JSON is empty.") - return - for eperson in eperson_json_list: - metadatavalue_eperson_dict = \ - metadata_class.get_metadata_value(7, eperson['eperson_id']) - eperson_json_p = { - 'selfRegistered': eperson['self_registered'], - 'requireCertificate': eperson['require_certificate'], - 'netid': eperson['netid'], - 'canLogIn': eperson['can_log_in'], - 'lastActive': eperson['last_active'], - 'email': eperson['email'], - 'password': eperson['password'], - 'welcomeInfo': eperson['welcome_info'], - 'canEditSubmissionMetadata': eperson['can_edit_submission_metadata'] - } - - # eperson email could consist of more emails, add eperson_id into everyone - eperson_email_array = get_eperson_emails(eperson['email']) - for eperson_email in eperson_email_array: - email2epersonId_dict[eperson_email] = eperson['eperson_id'] - - if metadatavalue_eperson_dict: - eperson_json_p['metadata'] = metadatavalue_eperson_dict - params = { - 'selfRegistered': eperson['self_registered'], - 'lastActive': eperson['last_active'] - } - try: - response = do_api_post(eperson_url, params, eperson_json_p) - eperson_id_dict[eperson['eperson_id']] = convert_response_to_json( - response)['id'] - imported_eperson += 1 - except Exception as e: - logging.error('POST request ' + eperson_url + ' for id: ' + - str(eperson['eperson_id']) + - ' failed. Exception: ' + str(e)) - - # save eperson dict as json - if save_dict: - save_dict_as_json(EPERSON_DICT, eperson_id_dict) - statistics_val = (len(eperson_json_list), imported_eperson) - statistics_dict['eperson'] = statistics_val - logging.info("Eperson was successfully imported!") - - -def get_eperson_emails(email): - """ - The eperson email could consist of more email, return all of them in the array. - If the email doesn't contain `;` that means there is only one email without `;` separator. - """ - if email is None: - return [] - - if ';' not in email: - return [email] - - # email value contains of two email, take just the first one. - # e.g., test@msn.com;name@gmail.com - return email.split(';') - -def import_group2eperson(eperson_id_dict, - group_id_dict, - statistics_dict): - """ - Import data into database. - Mapped tables: epersongroup2eperson - """ - group2eperson_json_name = 'epersongroup2eperson.json' - group2eperson_url = 'clarin/eperson/groups/' - imported_group2eper = 0 - group2eperson_json_list = read_json(group2eperson_json_name) - if not group2eperson_json_list: - logging.info("Epersongroup2eperson JSON is empty.") - return - for group2eperson in group2eperson_json_list: - group_url = group2eperson_url - try: - group_url += group_id_dict[group2eperson['eperson_group_id']][0] + \ - '/epersons' - eperson_url = API_URL + 'eperson/groups/' + eperson_id_dict[ - group2eperson['eperson_id']] - response = do_api_post(group_url, {}, eperson_url) - if response.ok: - imported_group2eper += 1 - else: - raise Exception(response) - except Exception as e: - logging.error('POST request ' + - group_url + ' failed. 
Exception: ' + str(e)) - - statistics_val = (len(group2eperson_json_list), imported_group2eper) - statistics_dict['epersongroup2eperson'] = statistics_val - logging.info("Epersongroup2eperson was successfully imported!") diff --git a/data_pump/epersongroup.py b/data_pump/epersongroup.py deleted file mode 100644 index 91b7fc5..0000000 --- a/data_pump/epersongroup.py +++ /dev/null @@ -1,142 +0,0 @@ -import logging - -from const import API_URL -from data_pump.utils import read_json, convert_response_to_json, do_api_get_all, \ - do_api_post, save_dict_as_json -from migration_const import EPERSONGROUP_DICT - - -def import_epersongroup(metadata_class, - group_id_dict, - statistics_dict, - save_dict): - """ - Import data into database. - Mapped tables: epersongroup - """ - group_json_name = 'epersongroup.json' - group_url = 'eperson/groups' - imported = 0 - group_json_list = read_json(group_json_name) - # group Administrator and Anonymous already exist - # we need to remember their id if we haven't done it yet - # load_admin_anonymous_groups(var.group_id_dict) - if not group_json_list: - logging.info("Epersongroup JSON is empty.") - return - for group in group_json_list: - group_id = group['eperson_group_id'] - # group Administrator and Anonymous already exist - # group is created with dspace object too - if group_id not in (0, 1) and group_id not in group_id_dict: - # get group metadata - metadatavalue_group_dict = \ - metadata_class.get_metadata_value(6, group['eperson_group_id']) - if 'dc.title' not in metadatavalue_group_dict: - logging.error('Metadata for group ' + str(group_id) + - ' does not contain title!') - continue - name = metadatavalue_group_dict['dc.title'][0]['value'] - del metadatavalue_group_dict['dc.title'] - # the group_metadata contains the name of the group - json_p = {'name': name, 'metadata': metadatavalue_group_dict} - try: - response = do_api_post(group_url, {}, json_p) - group_id_dict[group['eperson_group_id']] = [ - convert_response_to_json(response)['id']] - imported += 1 - except Exception as e: - logging.error('POST request ' + group_url + ' for id: ' + - str(group['eperson_group_id']) + - ' failed. Exception: ' + str(e)) - - # save group dict as json - if save_dict: - save_dict_as_json(EPERSONGROUP_DICT, group_id_dict) - - if 'epersongroup' in statistics_dict: - statistics_val = (len(group_json_list), statistics_dict['epersongroup'][1] + - imported) - statistics_dict['epersongroup'] = statistics_val - else: - statistics_val = (len(group_json_list), imported) - statistics_dict['epersongroup'] = statistics_val - logging.info("Eperson group was successfully imported!") - - -def get_existing_epersongroups(group_url): - """ - Get all existing eperson groups from database. - """ - existing_data_dict = None - try: - response = do_api_get_all(group_url) - existing_data_dict = convert_response_to_json(response)['_embedded']['groups'] - except Exception as e: - logging.error('GET request ' + group_url + ' failed. Exception: ' + str(e)) - return existing_data_dict - - -def import_group2group(group_id_dict, - statistics_dict): - """ - Import data into database. 
- Mapped tables: group2group - """ - group2group_json_name = 'group2group.json' - group2group_url = 'clarin/eperson/groups' - imported = 0 - group2group_json_list = read_json(group2group_json_name) - if not group2group_json_list: - logging.info("Group2group JSON is empty.") - return - - for group2group in group2group_json_list: - parents_a = group_id_dict[group2group['parent_id']] - childs_a = group_id_dict[group2group['child_id']] - for parent in parents_a: - for child in childs_a: - parent_url = group2group_url + '/' + parent + '/subgroups' - try: - child_url = API_URL + 'eperson/groups/' + child - response = do_api_post(parent_url, {}, child_url) - if response.ok: - imported += 1 - else: - raise Exception(response) - except Exception as e: - logging.error('POST request ' + parent_url + ' for id: ' + - str(parent) + ' failed. Exception: ' + str(e)) - - statistics_val = (len(group2group_json_list), imported) - statistics_dict['group2group'] = statistics_val - logging.info("Group2group was successfully imported!") - -def load_admin_anonymous_groups(group_id_dict): - """ - Load Administrator and Anonymous groups into dict. - This data already exists in database. - Remember its id. - """ - group_url = 'eperson/groups' - existing_data_dict = get_existing_epersongroups(group_url) - if existing_data_dict is not None: - for existing_data in existing_data_dict: - if existing_data['name'] == 'Anonymous': - group_id_dict[0] = [existing_data['id']] - elif existing_data['name'] == 'Administrator': - group_id_dict[1] = [existing_data['id']] - else: - logging.error('Unrecognized eperson group ' + existing_data['name']) - -def get_existing_epersongroups(group_url): - """ - Get all existing eperson groups from database. - """ - existing_data_dict = None - try: - response = do_api_get_all(group_url) - existing_data_dict = convert_response_to_json(response)['_embedded']['groups'] - except Exception as e: - logging.error('GET request ' + group_url + ' failed. Exception: ' + str(e)) - return existing_data_dict diff --git a/data_pump/handle.py b/data_pump/handle.py deleted file mode 100644 index 48788f2..0000000 --- a/data_pump/handle.py +++ /dev/null @@ -1,98 +0,0 @@ -import logging - -from data_pump.utils import do_api_post, read_json - - -class Handle: - def __init__(self): - self.handle_dict = {} - self.imported_handle = 0 - self.read_handle() - self.import_handle_with_url() - self.import_handle_without_object() - - def read_handle(self): - """ - Read handle as json and convert it to dictionary wth tuple key: - resource_type_id and resource_type, - where value is list of jsons. - """ - handle_json_name = 'handle.json' - handle_json_list = read_json(handle_json_name) - if not handle_json_list: - logging.info('Handle JSON is empty.') - return - for handle in handle_json_list: - key = (handle['resource_type_id'], handle['resource_id']) - if key in self.handle_dict.keys(): - self.handle_dict[key].append(handle) - else: - self.handle_dict[key] = [handle] - - def import_handle_with_url(self): - """ - Import handles into database with url. - Other handles are imported by dspace objects. 
- Mapped table: handles - """ - handle_url = 'core/handles' - # handle with defined url has key (None, None) - if (None, None) not in self.handle_dict: - logging.info("Handles with url don't exist.") - return - handles_a = self.handle_dict[(None, None)] - for handle in handles_a: - handle_json_p = { - 'handle': handle['handle'], - 'url': handle['url'] - } - try: - response = do_api_post(handle_url, {}, handle_json_p) - if response.ok: - self.imported_handle += 1 - else: - raise Exception(response) - except Exception as e: - logging.error('POST response ' + handle_url + ' for handle: ' + - handle['handle'] + ' failed. Exception: ' + str(e)) - - logging.info("Handles with url were successfully imported!") - - def import_handle_without_object(self): - """ - Import handles which have not objects into database. - Other handles are imported by dspace objects. - Mapped table: handles - """ - handle_url = 'clarin/import/handle' - if (2, None) not in self.handle_dict: - logging.info("Handles without objects don't exist.") - return - - handles_a = self.handle_dict[(2, None)] - for handle in handles_a: - handle_json_p = { - 'handle': handle['handle'], - 'resourceTypeID': handle['resource_type_id'] - } - try: - do_api_post(handle_url, {}, handle_json_p) - self.imported_handle += 1 - except Exception as e: - logging.error( - 'POST response ' + handle_url + ' failed. Exception: ' + str(e)) - - logging.info("Handles without object were successfully imported!") - - def get_handle(self, obj_type_int, obj_id): - """ - Get handle based on object type and its id. - """ - if (obj_type_int, obj_id) in self.handle_dict: - self.imported_handle += 1 - return self.handle_dict[(obj_type_int, obj_id)][0]['handle'] - else: - return None - - def get_imported_handle(self): - return self.imported_handle diff --git a/data_pump/item.py b/data_pump/item.py deleted file mode 100644 index c9389f9..0000000 --- a/data_pump/item.py +++ /dev/null @@ -1,534 +0,0 @@ -import datetime -import logging - -import const -from data_pump.sequences import connect_to_db -from data_pump.utils import read_json, convert_response_to_json, do_api_post, \ - save_dict_as_json -from data_pump.var_declarations import DC_RELATION_REPLACES_ID, DC_RELATION_ISREPLACEDBY_ID, DC_IDENTIFIER_URI_ID -from support.dspace_proxy import rest_proxy -from const import API_URL -from migration_const import WORKFLOWITEM_DICT, WORKSPACEITEM_DICT, ITEM_DICT - -def import_item(metadata_class, - handle_class, - workflowitem_id_dict, - workspaceitem_id_dict, - item_id_dict, - collection_id_dict, - eperson_id_dict, - statistics_dict, - item_handle_item_metadata_dict, - save_dict): - """ - Import data into database. 
- Mapped tables: item, collection2item, workspaceitem, cwf_workflowitem, - metadata, handle - """ - item_json_name = "item.json" - workspaceitem_json_name = "workspaceitem.json" - workflowitem_json_name = 'workflowitem.json' - collection2table_name = "collection2item.json" - item_url = 'clarin/import/item' - workflowitem_url = 'clarin/import/workflowitem' - item2collection_url = 'clarin/import/item/{item_uuid}/mappedCollections' - imported_workspaceitem = 0 - imported_workflowitem = 0 - imported_item = 0 - # create dict from items by item id - item_json_list = read_json(item_json_name) - items_dict = {} - if not item_json_list: - logging.info("Item JSON is empty.") - return - for item in item_json_list: - items_dict[item['item_id']] = item - statistics_dict['item'] = (len(item_json_list), 0) - - # create item and workspaceitem - workspaceitem_json_list = read_json(workspaceitem_json_name) - if workspaceitem_json_list is not None: - for workspaceitem in workspaceitem_json_list: - item = items_dict[workspaceitem['item_id']] - import_workspaceitem(item, workspaceitem['collection_id'], - workspaceitem['multiple_titles'], - workspaceitem['published_before'], - workspaceitem['multiple_files'], - workspaceitem['stage_reached'], - workspaceitem['page_reached'], - metadata_class, - handle_class, - workspaceitem_id_dict, - item_id_dict, - collection_id_dict, - eperson_id_dict) - imported_workspaceitem += 1 - del items_dict[workspaceitem['item_id']] - - statistics_dict['workspaceitem'] = (len(workspaceitem_json_list), - imported_workspaceitem) - imported_item += imported_workspaceitem - # save workspaceitem dict as json - if save_dict: - save_dict_as_json(WORKSPACEITEM_DICT, workspaceitem_id_dict) - logging.info("Workspaceitem was successfully imported!") - else: - logging.info("Workspaceitem JSON is empty.") - # create workflowitem - # workflowitem is created from workspaceitem - # -1, because the workflowitem doesn't contain this attribute - workflowitem_json_list = read_json(workflowitem_json_name) - if workflowitem_json_list is not None: - for workflowitem in workflowitem_json_list: - item = items_dict[workflowitem['item_id']] - import_workspaceitem(item, workflowitem['collection_id'], - workflowitem['multiple_titles'], - workflowitem['published_before'], - workflowitem['multiple_files'], - -1, - -1, - metadata_class, - handle_class, - workspaceitem_id_dict, - item_id_dict, - collection_id_dict, - eperson_id_dict) - # create workflowitem from created workspaceitem - params = {'id': str(workspaceitem_id_dict[workflowitem['item_id']])} - try: - response = do_api_post(workflowitem_url, params, None) - workflowitem_id_dict[workflowitem['workflow_id']] = \ - response.headers['workflowitem_id'] - imported_workflowitem += 1 - except Exception as e: - logging.error('POST request ' + workflowitem_url + ' for id: ' + - str(workflowitem['item_id']) + ' failed. 
Exception: ' + - str(e)) - del items_dict[workflowitem['item_id']] - - # save workflow dict as json - if save_dict: - save_dict_as_json(WORKFLOWITEM_DICT, workflowitem_id_dict) - statistics_val = (len(workflowitem_json_list), imported_workflowitem) - statistics_dict['workflowitem'] = statistics_val - imported_item += imported_workflowitem - logging.info("Cwf_workflowitem was successfully imported!") - else: - logging.info("Workflowitem JSON is empty.") - - # create other items - for item in items_dict.values(): - item_json_p = { - 'discoverable': item['discoverable'], - 'inArchive': item['in_archive'], - 'lastModified': item['last_modified'], - 'withdrawn': item['withdrawn'] - } - metadatvalue_item_dict = metadata_class.get_metadata_value(2, item['item_id']) - if metadatvalue_item_dict: - item_json_p['metadata'] = metadatvalue_item_dict - handle_item = handle_class.get_handle(2, item['item_id']) - if handle_item is not None: - item_json_p['handle'] = handle_item - params = { - 'owningCollection': collection_id_dict[item['owning_collection']], - 'epersonUUID': eperson_id_dict[item['submitter_id']] - } - try: - response = do_api_post(item_url, params, item_json_p) - response_json = convert_response_to_json(response) - item_id_dict[item['item_id']] = response_json['id'] - imported_item += 1 - except Exception as e: - logging.error('POST request ' + item_url + ' for id: ' + - str(item['item_id']) + ' failed. Exception: ' + str(e)) - - # Import collection2item table - only items which are mapped in more collections - # Add another collection into Item only if another collection is not owning_collection - collection2table_json_list = read_json(collection2table_name) - coll_2_item_dict = {} - items_with_more_colls = {} - # Find items which are mapped in more collections and store them into dictionary in this way - # {'item_uuid': [collection_uuid_1, collection_uuid_2]} - for collection2table in collection2table_json_list: - # Every item should have mapped only one collection - the owning collection except the items which - # are mapped into more collections - item_uuid = item_id_dict[collection2table['item_id']] - collection_uuid = collection_id_dict[collection2table['collection_id']] - if item_uuid in coll_2_item_dict: - # Add another collection into dict to get all collections for current Item - coll_2_item_dict[item_uuid].append(collection_id_dict[collection2table['collection_id']]) - # Add item UUID and collection UUID into list in this way {`item_uuid`: `collection_uuid`} - items_with_more_colls[item_uuid] = collection_uuid - continue - coll_2_item_dict[item_uuid] = [collection_uuid] - - # Call Vanilla REST endpoint which add relation between Item and Collection into the collection2item table - for item_with_more_coll_uuid in items_with_more_colls.keys(): - # Prepare request URL - replace `{item_uuid}` with current `item_with_more_coll_uuid` - request_url = item2collection_url.replace('{item_uuid}', item_with_more_coll_uuid) - - # Prepare request body which should looks like this: - # `"https://localhost:8080/spring-rest/api/core/collections/{collection_uuid_1}" + \n - # "https://localhost:8080/spring-rest/api/core/collections/{collection_uuid_2}" - request_body = [] - collection_url = 'core/collections/' - for collection_uuid in coll_2_item_dict[item_with_more_coll_uuid]: - request_body.append(API_URL + collection_url + collection_uuid) - - do_api_post(request_url, {}, request_body) - - # save item dict as json - if save_dict: - save_dict_as_json(ITEM_DICT, item_id_dict) - statistics_val = 
(statistics_dict['item'][0], imported_item) - statistics_dict['item'] = statistics_val - - # Migrate item versions - # Get connections to database - the versions data are directly added into the database - c5_dspace = connect_to_db(database=const.CLARIN_DSPACE_NAME, - host=const.CLARIN_DSPACE_HOST, - user=const.CLARIN_DSPACE_USER, - password=const.CLARIN_DSPACE_PASSWORD) - c7_dspace = connect_to_db(database=const.CLARIN_DSPACE_7_NAME, - host=const.CLARIN_DSPACE_7_HOST, - port=const.CLARIN_DSPACE_7_PORT, - user=const.CLARIN_DSPACE_7_USER, - password=const.CLARIN_DSPACE_7_PASSWORD) - # Store all created versions in the list - do not create a version for the same item - processed_items_id = [] - # Some item versions cannot be imported into the database because they are already withdrawn and a new versions - # are stored in another repository - withdrawn_item_handles = [] - # Handle Item versions which cannot be imported because of some error - not_imported_item_handles = [] - # Migration process - migrate_item_history(metadata_class, items_dict, item_id_dict, item_handle_item_metadata_dict, c7_dspace, - processed_items_id, withdrawn_item_handles, not_imported_item_handles) - - # Check if migration was successful - if not log unsuccessful items - check_sum(c7_dspace, c5_dspace, item_id_dict, withdrawn_item_handles, not_imported_item_handles, - item_handle_item_metadata_dict) - - # Add result of version importing into statistics - statistics_dict['versions_imported'] = (-1, len(processed_items_id)) - statistics_dict['versions_not_imported_withdrawn'] = (-1, len(withdrawn_item_handles)) - statistics_dict['versions_not_imported_error'] = (-1, len(not_imported_item_handles)) - logging.info("Item and Collection2item were successfully imported!") - - -def check_sum(c7_dspace, c5_dspace, item_id_dict, withdrawn_item_handles, not_imported_item_handles, - item_handle_item_metadata_dict): - """ - Check if item versions importing was successful - Select item ids from CLARIN-DSpace5 which has some version metadata - Select items uuids from CLARIN-DSpace7 `versionitem` table where are stored item's version - Check if all items from CLARIN-DSpace5 has record in the CLARIN-DSpace7 history version table - check uuids - """ - - cursor_5 = c5_dspace.cursor() - cursor_7 = c7_dspace.cursor() - # Select item ids from CLARIN-DSpace5 which has some version metadata - cursor_5.execute("SELECT resource_id FROM metadatavalue WHERE metadata_field_id in (50,51) group by resource_id;") - # Fetch the result - clarin_5_item_ids = cursor_5.fetchall() - - # Select item uuids from CLARIN-DSpace7 which record in the `versionitem` table - cursor_7.execute("select item_id from versionitem;") - # Fetch the result - clarin_7_item_uuids = cursor_7.fetchall() - - if clarin_5_item_ids is None or clarin_7_item_uuids is None: - logging.error('Cannot check result of importing item versions.') - - # Some new version of the item is not finished yet - item_id - worklfowitem_not_imported = [] - # Some items could not be imported - uuid - not_imported_items = [] - clarin_5_ids_to_uuid = [] - # Convert item_id to uuid - for clarin_5_id in clarin_5_item_ids: - clarin_5_ids_to_uuid.append(item_id_dict[clarin_5_id[0]]) - - # Check if the clarin_5_uuid is in the clarin_7_historyversion_uuid - for clarin_7_uuid in clarin_7_item_uuids: - # clarin_5_uuid = item_id_dict[clarin_5_id] - if clarin_7_uuid[0] not in clarin_5_ids_to_uuid: - not_imported_items.append(clarin_7_uuid[0]) - - if not_imported_items: - logging.warning('Version migration MAYBE was 
not successful for the items below because the item could be' - ' a workspace or previous version is withdrawn.') - for non_imported_uuid in not_imported_items: - logging.warning(f'Please check versions for the Item with: {non_imported_uuid}') - return - logging.info('Version migration was successful.') - - -def migrate_item_history(metadata_class, - items_dict, - item_id_dict, - item_handle_item_metadata_dict, - c7_dspace, - processed_items_id, - withdrawn_item_handles, - not_imported_item_handles): - logging.info("Going to migrate versions of all items.") - - cursor_c7_dspace = c7_dspace.cursor() - admin_uuid = get_admin_uuid(cursor_c7_dspace) - - # Migrate versions for every Item - for item in items_dict.values(): - item_id = item['item_id'] - # Do not process versions of the item that have already been processed. - if item_id in processed_items_id: - continue - - # This sequence contains handles of all versions of the Item ordered from the first version to the latest one - item_version_sequence = get_item_version_sequence(item_id, items_dict, metadata_class, - item_handle_item_metadata_dict, withdrawn_item_handles, - not_imported_item_handles) - - # Do not process item which does not have any version - if item_version_sequence is None: - continue - - logging.debug(f'Going to process all versions for the item with ID: {item_id}') - # All versions of this Item is going to be processed - # Insert data into `versionhistory` table - versionhistory_new_id = get_last_id_from_table(cursor_c7_dspace, 'versionhistory', 'versionhistory_id') + 1 - cursor_c7_dspace.execute("INSERT INTO versionhistory(versionhistory_id) VALUES (" + - str(versionhistory_new_id) + ");") - # Update sequence - cursor_c7_dspace.execute(f"SELECT setval('versionhistory_seq', {versionhistory_new_id})") - c7_dspace.commit() - - # Insert data into `versionitem` with `versionhistory` id - versionitem_new_id = get_last_id_from_table(cursor_c7_dspace, 'versionitem', 'versionitem_id') + 1 - for index, item_version_handle in enumerate(item_version_sequence, 1): - # If the item is withdrawn the new version could be stored in our repo or in another. Do import that version - # only if the item is stored in our repo. 
- if item_version_handle not in item_handle_item_metadata_dict: - current_item = items_dict[item_id] - if current_item['withdrawn']: - logging.info(f'The item with handle: {item_version_handle} cannot be migrated because' - f' it is stored in another repository.') - continue - - # Get the handle of the x.th version of the Item - item_handle_id_dict = item_handle_item_metadata_dict[item_version_handle] - # Get item_id using the handle - item_id = item_handle_id_dict['item_id'] - # Get the uuid of the item using the item_id - item_uuid = item_id_dict[item_id] - # timestamp is required column in the database - timestamp = datetime.datetime.now() - cursor_c7_dspace.execute(f'INSERT INTO public.versionitem(versionitem_id, version_number, version_date, version_summary, versionhistory_id, eperson_id, item_id) VALUES (' - f'{versionitem_new_id}, ' - f'{index}, ' - f'\'{timestamp}\', ' - f'\'\', ' - f'{versionhistory_new_id}, ' - f'\'{admin_uuid}\', ' - f'\'{item_uuid}\');') - # Update sequence - cursor_c7_dspace.execute(f"SELECT setval('versionitem_seq', {versionitem_new_id})") - versionitem_new_id += 1 - processed_items_id.append(item_id) - c7_dspace.commit() - - -def get_admin_uuid(cursor): - """ - Get uuid of the admin user - """ - # Execute a SQL query to retrieve the last record's ID (assuming 'your_table' is the name of your table) - cursor.execute(f'SELECT uuid FROM eperson WHERE email like \'{const.user}\'') - - # Fetch the result - eperson_uuid = cursor.fetchone() - - uuid = '' - # Check if there is a result and extract the ID - if eperson_uuid: - uuid = eperson_uuid[0] - else: - logging.error("No eperson records in the table.") - - return uuid - - -def get_last_id_from_table(cursor, table_name, id_column): - """ - Get id of the last record from the specific table - @return: id of the last record - """ - # Execute a SQL query to retrieve the last record's ID (assuming 'your_table' is the name of your table) - cursor.execute("SELECT " + id_column + " FROM " + table_name + " ORDER BY " + id_column + " DESC LIMIT 1") - - # Fetch the result - last_record_id = cursor.fetchone() - - # Default value - the table is empty - last_id = 1 - # Check if there is a result and extract the ID - if last_record_id: - last_id = last_record_id[0] - else: - logging.info("No records in the table.") - - # Close the cursor and the database connection - return last_id - - -def get_item_version_sequence(item_id, - items_dict, - metadata_class, - item_handle_item_metadata_dict, - withdrawn_item_handles, - not_imported_item_handles): - """ - Return all versions of the item in ordered list from the first version to the latest including the handle of the - current Item - @return: list of the item versions or if the item doesn't have any version return None - """ - # The newer versions of the item - newer_versions = get_item_versions(item_id, items_dict, metadata_class, item_handle_item_metadata_dict, True, - withdrawn_item_handles, not_imported_item_handles) - # The previous versions of the item - previous_versions = get_item_versions(item_id, items_dict, metadata_class, item_handle_item_metadata_dict, False, - withdrawn_item_handles, not_imported_item_handles) - # Previous versions are in wrong order - reverse the list - previous_versions = previous_versions[::-1] - - # If this item does not have any version return a None - if len(newer_versions) == 0 and len(previous_versions) == 0: - return None - - # Get handle of the current Item - current_item_handle = getFirstMetadataValue(item_id, metadata_class, 
DC_IDENTIFIER_URI_ID) - if current_item_handle is None: - logging.error(f'Cannot find handle for the item with id: {item_id}') - not_imported_item_handles.append(item_id) - return None - - return previous_versions + [current_item_handle] + newer_versions - - -def get_item_versions(item_id, items_dict, metadata_class, item_handle_item_metadata_dict, previous_or_newer: bool, - withdrawn_item_handles, not_imported_item_handles): - """ - Return all previous or newer versions of the item using connection between `dc.relation.replaces` and - `dc.relation.isreplacedby` item metadata. - @return: list of versions or empty list - """ - # Get previous version - fetch metadata value from `dc.relation.replaces` - # Get newer version - fetch metadata value from `dc.relation.isreplaced.by` - metadata_field = DC_RELATION_REPLACES_ID - if previous_or_newer: - metadata_field = DC_RELATION_ISREPLACEDBY_ID - - list_of_version = [] - current_item_id = item_id - # current_version is handle of previous or newer item - current_version = getFirstMetadataValue(current_item_id, metadata_class, metadata_field) - while current_version is not None: - if current_version not in item_handle_item_metadata_dict: - # Check if current item is withdrawn - item = items_dict[item_id] - if item['withdrawn']: - # The item is withdrawn and stored in another repository - logging.info(f'The item with handle: {current_version} is withdrawn and will not be imported because ' - f'it is stored in another repository.') - withdrawn_item_handles.append(current_version) - else: - logging.error(f'The item with handle: {current_version} has not been imported!') - not_imported_item_handles.append(current_version) - current_version = None - continue - - list_of_version.append(current_version) - - current_item_id = item_handle_item_metadata_dict[current_version]['item_id'] - current_version = getFirstMetadataValue(current_item_id, metadata_class, metadata_field) - - return list_of_version - - -def getFirstMetadataValue(item_id, metadata_class, metadata_field_id): - if item_id is None: - return None - - # 2 = resource_type = Item - try: - # It returns a dict of metadata_values - all_metadata_values = metadata_class.metadatavalue_dict[(2, item_id)] - # because metadata value are stored in the list - for metadata_value in all_metadata_values: - if metadata_value['metadata_field_id'] != metadata_field_id: - continue - # Return first value - return metadata_value['text_value'] - # if metadata_field_id not in all_metadata_values: - # return None - except Exception as e: - logging.error(f'Cannot get first metadata from the Item with ID: {item_id} because: {e}') - return None - - -def import_workspaceitem(item, - owning_collectin_id, - multiple_titles, - published_before, - multiple_files, - stagereached, - page_reached, - metadata_class, - handle_class, - workspaceitem_id_dict, - item_id_dict, - collection_id_dict, - eperson_id_dict): - """ - Auxiliary method for import item. - Import data into database. 
- Mapped tables: workspaceitem, metadata, handle - """ - workspaceitem_url = 'clarin/import/workspaceitem' - workspaceitem_json_p = { - 'discoverable': item['discoverable'], - 'inArchive': item['in_archive'], - 'lastModified': item['last_modified'], - 'withdrawn': item['withdrawn'] - } - metadatavalue_item_dict = metadata_class.get_metadata_value(2, item['item_id']) - if metadatavalue_item_dict is not None: - workspaceitem_json_p['metadata'] = metadatavalue_item_dict - handle_workspaceitem = handle_class.get_handle(2, item['item_id']) - if handle_workspaceitem is not None: - workspaceitem_json_p['handle'] = handle_workspaceitem - # the params are workspaceitem attributes - params = { - 'owningCollection': collection_id_dict[owning_collectin_id], - 'multipleTitles': multiple_titles, - 'publishedBefore': published_before, - 'multipleFiles': multiple_files, 'stageReached': stagereached, - 'pageReached': page_reached, - 'epersonUUID': eperson_id_dict[item['submitter_id']] - } - try: - response = do_api_post(workspaceitem_url, params, workspaceitem_json_p) - workspaceitem_id = convert_response_to_json(response)['id'] - workspaceitem_id_dict[item['item_id']] = workspaceitem_id - item_url = API_URL + 'clarin/import/' + str(workspaceitem_id) + "/item" - try: - response = rest_proxy.d.api_get(item_url, None, None) - item_id_dict[item['item_id']] = convert_response_to_json(response)['id'] - except Exception as e: - logging.error('POST request ' + item_url + - ' failed. Exception: ' + str(e)) - except Exception as e: - logging.error('POST request ' + workspaceitem_url + ' for id: ' + - str(item['item_id']) + - ' failed. Exception: ' + str(e)) diff --git a/data_pump/license.py b/data_pump/license.py deleted file mode 100644 index d2b01ac..0000000 --- a/data_pump/license.py +++ /dev/null @@ -1,116 +0,0 @@ -import logging -import os - -from const import OLD_LICENSE_DEFINITION_STRING, NEW_LICENSE_DEFINITION_STRING -from migration_const import ICON_PATH -from data_pump.utils import read_json, do_api_post, convert_response_to_json - - -def import_license(eperson_id_dict, statistics_dict): - """ - Import data into database. - Mapped tables: license_label, extended_mapping, license_definitions - """ - # import license label - label_json_name = 'license_label.json' - label_url = 'core/clarinlicenselabels' - imported_label = 0 - labels_dict = {} - # import license_label - label_json_list = read_json(label_json_name) - if not label_json_list: - logging.info("License_label JSON is empty.") - return - for label in label_json_list: - label_json_p = { - 'label': label['label'], - 'title': label['title'], - 'extended': label['is_extended'], - 'icon': None - } - # find image with label name - try: - image_path = ICON_PATH + label['label'].lower() + ".png" - if os.path.exists(image_path): - with open(image_path, "rb") as image: - f = image.read() - label_json_p['icon'] = list(f) - except Exception as e: - logging.error( - "Exception while reading label image with name: " + label[ - 'label'].lower() + ".png occurred: " + str(e)) - try: - response = do_api_post(label_url, {}, label_json_p) - created_label = convert_response_to_json(response) - imported_label += 1 - del created_label['license'] - del created_label['_links'] - labels_dict[label['label_id']] = created_label - except Exception as e: - logging.error('POST request ' + label_url + - ' failed. 
Exception: ' + str(e)) - - statistics_val = (len(label_json_list), imported_label) - statistics_dict['license_label'] = statistics_val - - # import license definition and exteended mapping - license_json_name = 'license_definition.json' - license_url = 'clarin/import/license' - ext_map_json_name = 'license_label_extended_mapping.json' - # read license label extended mapping - ext_map_dict = {} - ext_map_json_list = read_json(ext_map_json_name) - if not ext_map_json_list: - logging.info("Extended_mapping JSON is empty.") - return - for ext_map in ext_map_json_list: - if ext_map['license_id'] in ext_map_dict.keys(): - ext_map_dict[ext_map['license_id']].append(labels_dict[ext_map['label_id']]) - else: - ext_map_dict[ext_map['license_id']] = [labels_dict[ext_map['label_id']]] - # import license_definition - imported_license = 0 - license_json_list = read_json(license_json_name) - if not license_json_list: - logging.info("License_definitions JSON is empty.") - return - for license_ in license_json_list: - license_json_p = { - 'name': license_['name'], - 'definition': update_license_definition(license_['definition']), - 'confirmation': license_['confirmation'], - 'requiredInfo': license_['required_info'], - 'clarinLicenseLabel': labels_dict[license_['label_id']] - } - if license_['license_id'] in ext_map_dict: - license_json_p['extendedClarinLicenseLabels'] = \ - ext_map_dict[license_['license_id']] - params = {'eperson': eperson_id_dict[license_['eperson_id']]} - try: - response = do_api_post(license_url, params, license_json_p) - if response.ok: - imported_license += 1 - else: - raise Exception(response) - except Exception as e: - logging.error('POST request ' + license_url + - ' failed. Exception: ' + str(e)) - - statistics_val = (len(license_json_list), imported_license) - statistics_dict['license_definition'] = statistics_val - logging.info("License_label, Extended_mapping, License_definitions " - "were successfully imported!") - - -def update_license_definition(current_license_definition: str): - """ - Replace license definition url from current site url to a new site url - e.g., from `https://lindat.mff.cuni.cz/repository/xmlui/page/licence-hamledt` - to `https://lindat.mff.cuni.cz/repository/static/licence-hamledt.html` - """ - # Replace old site url to a new site url - new_license_definition = current_license_definition.replace(OLD_LICENSE_DEFINITION_STRING, - NEW_LICENSE_DEFINITION_STRING) - # File name has a missing `.html` suffix -> add that suffix to the end of the definition url - new_license_definition = new_license_definition + '.html' - return new_license_definition diff --git a/data_pump/metadata.py b/data_pump/metadata.py deleted file mode 100644 index 0abf33a..0000000 --- a/data_pump/metadata.py +++ /dev/null @@ -1,302 +0,0 @@ -import logging - -from const import HANDLE_PREFIX -from data_pump.utils import read_json, convert_response_to_json, \ - do_api_get_one, do_api_get_all, do_api_post, save_dict_as_json, \ - create_dict_from_json -from data_pump.var_declarations import DC_RELATION_REPLACES_ID, DC_RELATION_ISREPLACEDBY_ID, DC_IDENTIFIER_URI_ID -from migration_const import METADATAFIELD_DICT, METADATASCHEMA_DICT - -class Metadata: - def __init__(self, statistics_dict, handle_item_metadata_dict, load_dict): - """ - Read metadatavalue as json and - convert it to dictionary with tuple key: resource_type_id and resource_id. 
- """ - self.metadatavalue_dict = {} - self.metadataschema_id_dict = {} - self.metadatafield_id_dict = {} - self.read_metadata(handle_item_metadata_dict) - - if load_dict: - self.metadataschema_id_dict = \ - create_dict_from_json(METADATASCHEMA_DICT) - self.metadatafield_id_dict = \ - create_dict_from_json(METADATAFIELD_DICT) - else: - self.import_metadataschemaregistry(statistics_dict) - self.import_metadatafieldregistry(statistics_dict) - - def read_metadata(self, handle_item_metadata_dict): - metadatavalue_json_name = 'metadatavalue.json' - metadatafield_json_name = 'metadatafieldregistry.json' - - metadatavalue_json_list = read_json(metadatavalue_json_name) - if not metadatavalue_json_list: - logging.info('Metadatavalue JSON is empty.') - return - - metadatafield_json_list = read_json(metadatafield_json_name) - sponsor_field_id = -1 - if not metadatafield_json_list: - logging.info('Metadatafield JSON is empty.') - return - - # Find out which field is `local.sponsor`, check only `sponsor` string - for metadatafield in metadatafield_json_list: - element = metadatafield['element'] - if element != 'sponsor': - continue - sponsor_field_id = metadatafield['metadata_field_id'] - - for metadatavalue in metadatavalue_json_list: - key = (metadatavalue['resource_type_id'], metadatavalue['resource_id']) - # replace separator @@ by ; - metadatavalue['text_value'] = metadatavalue['text_value'].replace("@@", ";") - # replace `local.sponsor` data sequence - # from `;;;` - # to `;;;` - if metadatavalue['metadata_field_id'] == sponsor_field_id: - metadatavalue['text_value'] = \ - self.fix_local_sponsor_sequence(metadatavalue['text_value']) - if key in self.metadatavalue_dict.keys(): - self.metadatavalue_dict[key].append(metadatavalue) - else: - self.metadatavalue_dict[key] = [metadatavalue] - - # Store item handle and item id connection in dict - if not metadatavalue['text_value'].startswith(HANDLE_PREFIX): - continue - # Insert data into handle_item_metadata_dict - version_history_metadata = {} - # If it exists just append it - if metadatavalue['text_value'] in handle_item_metadata_dict.keys(): - version_history_metadata = handle_item_metadata_dict[metadatavalue['text_value']] - - # metadata_field_id 25 is Item's handle - if metadatavalue['metadata_field_id'] == DC_IDENTIFIER_URI_ID: - version_history_metadata['item_id'] = metadatavalue['resource_id'] - handle_item_metadata_dict[metadatavalue['text_value']] = version_history_metadata - - - @staticmethod - def fix_local_sponsor_sequence(wrong_sequence_str): - """ - Replace `local.sponsor` data sequence - from `;;;;` - to `;;;;` - """ - separator = ';' - sponsor_list_max_length = 5 - - # sponsor list could have length 4 or 5 - sponsor_list = wrong_sequence_str.split(separator) - org = sponsor_list[0] - project_code = sponsor_list[1] - project_name = sponsor_list[2] - project_type = sponsor_list[3] - eu_identifier = '' - if len(sponsor_list) == sponsor_list_max_length: - # has eu_identifier value - eu_identifier = sponsor_list[4] - # compose the `local.sponsor` sequence in the right way - return separator.join( - [project_type, project_code, org, project_name, eu_identifier]) - - def import_metadataschemaregistry(self, statistics_dict, save_dict=True): - """ - Import data into database. 
- Mapped tables: metadataschemaregistry - """ - metadataschema_json_name = 'metadataschemaregistry.json' - metadataschema_url = 'core/metadataschemas' - imported = 0 - # get all existing data from database table - existing_data_dict = Metadata.get_imported_metadataschemaregistry( - metadataschema_url) - - metadataschema_json_list = read_json(metadataschema_json_name) - if not metadataschema_json_list: - logging.info("Metadataschemaregistry JSON is empty.") - return - for metadataschema in metadataschema_json_list: - metadataschema_json_p = { - 'namespace': metadataschema['namespace'], - 'prefix': metadataschema['short_id'] - } - # prefix has to be unique - try: - response = do_api_post(metadataschema_url, {}, metadataschema_json_p) - self.metadataschema_id_dict[metadataschema['metadata_schema_id']] = \ - convert_response_to_json(response)['id'] - imported += 1 - except Exception as e: - found = False - if not existing_data_dict: - logging.error('POST request ' + metadataschema_url + ' for id: ' + - str(metadataschema['metadata_schema_id']) + - ' failed. Exception: ' + str(e)) - continue - for existing_data in existing_data_dict: - if existing_data['prefix'] != metadataschema['short_id']: - continue - self.metadataschema_id_dict[metadataschema - ['metadata_schema_id']] = \ - existing_data['id'] - logging.info('Metadataschemaregistry ' - 'prefix: ' + metadataschema['short_id'] - + ' already exists in database!') - found = True - imported += 1 - break - if not found: - logging.error('POST request ' + metadataschema_url + ' for id: ' + - str(metadataschema['metadata_schema_id']) + - ' failed. Exception: ' + str(e)) - - # save metadataschema dict as json - if save_dict: - save_dict_as_json(METADATASCHEMA_DICT, - self.metadataschema_id_dict) - statistics_val = (len(metadataschema_json_list), imported) - statistics_dict['metadataschemaregistry'] = statistics_val - logging.info("MetadataSchemaRegistry was successfully imported!") - - @staticmethod - def get_imported_metadataschemaregistry(metadataschema_url): - """ - Get all existing data from the metadataschemaregistry table. - """ - existing_data_dict = None - try: - response = do_api_get_all(metadataschema_url) - existing_data_dict = convert_response_to_json(response)['_embedded'][ - 'metadataschemas'] - except Exception as e: - logging.error('GET request ' + metadataschema_url + ' failed. Exception: ' - + str(e)) - return existing_data_dict - - def import_metadatafieldregistry(self, statistics_dict, save_dict=True): - """ - Import data into database. - Mapped tables: metadatafieldregistry - """ - metadatafield_json_name = 'metadatafieldregistry.json' - metadatafield_url = 'core/metadatafields' - imported = 0 - existing_data_dict = None - try: - response = do_api_get_all(metadatafield_url) - existing_data_dict = convert_response_to_json(response)['_embedded'][ - 'metadatafields'] - except Exception as e: - logging.error('GET request ' + metadatafield_url + - ' failed.
Exception: ' + str(e)) - - metadatafield_json_list = read_json(metadatafield_json_name) - if not metadatafield_json_list: - logging.info("Metadatafieldregistry JSON is empty.") - return - for metadatafield in metadatafield_json_list: - metadatafield_json_p = { - 'element': metadatafield['element'], - 'qualifier': metadatafield['qualifier'], - 'scopeNote': metadatafield['scope_note'] - } - params = {'schemaId': self.metadataschema_id_dict[ - metadatafield['metadata_schema_id']]} - # element and qualifier have to be unique - try: - response = do_api_post(metadatafield_url, params, metadatafield_json_p) - self.metadatafield_id_dict[metadatafield['metadata_field_id']] = \ - convert_response_to_json(response)['id'] - imported += 1 - except Exception as e: - found = False - if not existing_data_dict: - logging.error('POST request ' + metadatafield_url + ' for id: ' + - str(metadatafield['metadata_field_id']) + - ' failed. Exception: ' + str(e)) - continue - for existing_data in existing_data_dict: - if metadatafield['metadata_schema_id'] not in self.metadataschema_id_dict.keys(): - continue - - if existing_data['_embedded']['schema']['id'] != self.metadataschema_id_dict[metadatafield['metadata_schema_id']] or \ - existing_data['element'] != metadatafield['element'] or \ - existing_data['qualifier'] != metadatafield['qualifier']: - continue - self.metadatafield_id_dict[metadatafield['metadata_field_id']] = \ - existing_data['id'] - logging.info('Metadatafieldregistry with element: ' + - metadatafield['element'] + - ' already exists in database!') - found = True - imported += 1 - break - if not found: - logging.error('POST request ' + metadatafield_url + ' for id: ' + - str(metadatafield['metadata_field_id']) + - ' failed. Exception: ' + str(e)) - - # save metadatafield dict as json - if save_dict: - save_dict_as_json(METADATAFIELD_DICT, self.metadatafield_id_dict) - statistics_val = (len(metadatafield_json_list), imported) - statistics_dict['metadatafieldregistry'] = statistics_val - logging.info("MetadataFieldRegistry was successfully imported!") - - def get_metadata_value(self, old_resource_type_id, old_resource_id): - """ - Get metadata value for dspace object. - """ - metadatafield_url = 'core/metadatafields' - metadataschema_url = 'core/metadataschemas' - result_dict = {} - # get all metadatavalue for object - if (old_resource_type_id, old_resource_id) not in self.metadatavalue_dict: - logging.info('Metadatavalue for resource_type_id: ' + - str(old_resource_type_id) + ' and resource_id: ' + - str(old_resource_id) + 'does not exist.') - return None - metadatavalue_obj = self.metadatavalue_dict[( - old_resource_type_id, old_resource_id)] - # create list of object metadata - for metadatavalue in metadatavalue_obj: - if metadatavalue['metadata_field_id'] not in self.metadatafield_id_dict: - continue - try: - response = do_api_get_one( - metadatafield_url, - self.metadatafield_id_dict[metadatavalue['metadata_field_id']]) - metadatafield_json = convert_response_to_json(response) - except Exception as e: - logging.error('GET request' + metadatafield_url + - ' failed. Exception: ' + str(e)) - continue - # get metadataschema - try: - response = do_api_get_one( - metadataschema_url, metadatafield_json['_embedded']['schema']['id']) - metadataschema_json = convert_response_to_json(response) - except Exception as e: - logging.error('GET request ' + metadataschema_url + - ' failed. Exception: ' + str(e)) - continue - # define and insert key and value of dict - key = metadataschema_json['prefix'] + '.' 
+ metadatafield_json['element'] - value = { - 'value': metadatavalue['text_value'], - 'language': metadatavalue['text_lang'], - 'authority': metadatavalue['authority'], - 'confidence': metadatavalue['confidence'], - 'place': metadatavalue['place'] - } - if metadatafield_json['qualifier']: - key += '.' + metadatafield_json['qualifier'] - if key in result_dict.keys(): - result_dict[key].append(value) - else: - result_dict[key] = [value] - return result_dict diff --git a/data_pump/registrationdata.py b/data_pump/registrationdata.py deleted file mode 100644 index 5731acd..0000000 --- a/data_pump/registrationdata.py +++ /dev/null @@ -1,34 +0,0 @@ -import logging - -from data_pump.utils import read_json, do_api_post - - -def import_registrationdata(statistics_dict): - """ - Import data into database. - Mapped tables: registrationdata - """ - registrationdata_json_name = 'registrationdata.json' - registrationdata_url = 'eperson/registrations' - imported_registrationdata = 0 - registrationdata_json_list = read_json(registrationdata_json_name) - if not registrationdata_json_list: - logging.info("Registrationdata JSON is empty.") - return - for registrationdata in registrationdata_json_list: - registrationdata_json_p = {'email': registrationdata['email']} - params = {'accountRequestType': 'register'} - try: - response = do_api_post(registrationdata_url, params, - registrationdata_json_p) - if response.ok: - imported_registrationdata += 1 - else: - raise Exception(response) - except Exception as e: - logging.error('POST request' + registrationdata_url + ' for email: ' + - registrationdata['email'] + ' failed. Exception: ' + str(e)) - - statistics_val = (len(registrationdata_json_list), imported_registrationdata) - statistics_dict['registrationdata'] = statistics_val - logging.info("Registration data was successfully imported!") diff --git a/data_pump/resourcepolicy.py b/data_pump/resourcepolicy.py deleted file mode 100644 index 3658454..0000000 --- a/data_pump/resourcepolicy.py +++ /dev/null @@ -1,119 +0,0 @@ -import logging -import psycopg2 - -from data_pump.utils import read_json, convert_response_to_json, do_api_post - -from const import CLARIN_DSPACE_7_NAME, CLARIN_DSPACE_7_HOST, \ - CLARIN_DSPACE_7_USER, CLARIN_DSPACE_7_PASSWORD, COMMUNITY, COLLECTION,\ - ITEM, BUNDLE, BITSTREAM -from migration_const import ACTIONS_LIST - - -def import_resource_policies(community_id_dict, - collection_id_dict, - item_id_dict, - bundle_id_dict, - bitstream_id_dict, - eperson_id_dict, - group_id_dict, - statistics_dict): - res_policy_json_name = 'resourcepolicy.json' - res_policy_url = 'authz/resourcepolicies' - res_policy_json_list = read_json(res_policy_json_name) - imported = 0 - unimported = 0 - def_read = 0 - for res_policy in res_policy_json_list: - params = {} - try: - # find object id based on its type - type = res_policy['resource_type_id'] - if type == COMMUNITY: - params['resource'] = community_id_dict[res_policy['resource_id']] - elif type == COLLECTION: - params['resource'] = collection_id_dict[res_policy['resource_id']] - elif type == ITEM: - params['resource'] = item_id_dict[res_policy['resource_id']] - elif type == BUNDLE: - params['resource'] = bundle_id_dict[res_policy['resource_id']] - elif type == BITSTREAM: - params['resource'] = bitstream_id_dict[res_policy['resource_id']] - # in resource there is action as id, but we need action as text - actionId = res_policy['action_id'] - # control, if action is entered correctly - if actionId < 0 or actionId >= len(ACTIONS_LIST): - logging.error('Cannot 
do POST request ' + res_policy_url + ' for id: ' + - str(res_policy['policy_id']) + ' because action id: ' - + str(actionId) + ' does not exist.') - unimported += 1 - continue - # create object for request - json_p = {'action': ACTIONS_LIST[actionId], 'startDate': - res_policy['start_date'], - 'endDate': res_policy['end_date'], 'name': res_policy['rpname'], - 'policyType': res_policy['rptype'], 'description': - res_policy['rpdescription']} - # resource policy has defined eperson or group, not the both - # get eperson if it is not none - if res_policy['eperson_id'] is not None: - params['eperson'] = eperson_id_dict[res_policy['eperson_id']] - # create resource policy - response = do_api_post(res_policy_url, params, json_p) - response = convert_response_to_json(response) - response['id'] - imported += 1 - continue - - # get group if it is not none - elif res_policy['epersongroup_id'] is not None: - group_list = group_id_dict[res_policy['epersongroup_id']] - if len(group_list) > 1: - def_read += 1 - for group in group_list: - params['group'] = group - response = do_api_post(res_policy_url, params, json_p) - response = convert_response_to_json(response) - response['id'] - imported += 1 - else: - logging.error('Cannot do POST request ' + res_policy_url + ' for id: ' + - str(res_policy['policy_id']) + - ' because neither eperson nor group is defined.') - unimported += 1 - continue - except Exception as e: - logging.error('POST request ' + res_policy_url + ' for id: ' + - str(res_policy['policy_id']) + ' ' - 'failed. Exception: ' + str(e)) - unimported += 1 - - # write statistic - statistics_dict['resourcepolicy'] = {'expected: ': len(res_policy_json_list), - 'imported': imported, - 'duplicated': def_read, - 'unimported': unimported} - - -def delete_all_resource_policy(): - # create database connection - conn = psycopg2.connect(database=CLARIN_DSPACE_7_NAME, - host=CLARIN_DSPACE_7_HOST, - user=CLARIN_DSPACE_7_USER, - password=CLARIN_DSPACE_7_PASSWORD) - logging.info("Connection to database " + CLARIN_DSPACE_7_NAME + " was successful!") - # get count of resourcepolicy - cursor = conn.cursor() - cursor.execute( - "SELECT COUNT(*) from public.resourcepolicy" - ) - # access to 0. position, because the fetchone returns tuple - expected = cursor.fetchone()[0] - # delete all data - cursor.execute( - "DELETE FROM public.resourcepolicy") - deleted = cursor.rowcount - conn.commit() - cursor.close() - conn.close() - # control, if we deleted all data - assert expected == deleted diff --git a/data_pump/sequences.py b/data_pump/sequences.py deleted file mode 100644 index 999a3db..0000000 --- a/data_pump/sequences.py +++ /dev/null @@ -1,131 +0,0 @@ -import os -import logging -import time - -import psycopg2 -import const - -_this_dir = os.path.dirname(os.path.abspath(__file__)) -_temp_dir = os.path.join(_this_dir, "..", "temp-files") - -# create temp dir if needed -if not os.path.exists(_temp_dir): - os.makedirs(_temp_dir, exist_ok=True) - -# standard console output -_logger = logging.getLogger("migrate_sequences") - -# testing output to file -file_path = os.path.join(_temp_dir, "__failed_sequences.txt") -file_handler = logging.FileHandler(file_path, mode="w") -file_handler.setLevel(logging.ERROR) - -_logger_test = logging.getLogger("test_migrated_sequences") -_logger_test.setLevel(logging.ERROR) -_logger_test.addHandler(file_handler) - -def migrate_sequences(): - """ - Migrate sequences from clarin 5 database to clarin 7 database. 
- """ - _logger.info("Sequence migration started.") - - # create database connection - c5_dspace_conn = connect_to_db(database=const.CLARIN_DSPACE_NAME, - host=const.CLARIN_DSPACE_HOST, - user=const.CLARIN_DSPACE_USER, - password=const.CLARIN_DSPACE_PASSWORD) - - c5_utilities_conn = connect_to_db(database=const.CLARIN_UTILITIES_NAME, - host=const.CLARIN_UTILITIES_HOST, - user=const.CLARIN_UTILITIES_USER, - password=const.CLARIN_UTILITIES_PASSWORD) - - c7_dspace = connect_to_db(database=const.CLARIN_DSPACE_7_NAME, - host=const.CLARIN_DSPACE_7_HOST, - port=const.CLARIN_DSPACE_7_PORT, - user=const.CLARIN_DSPACE_7_USER, - password=const.CLARIN_DSPACE_7_PASSWORD) - - # get all sequences from clarin-dspace database - cursor_c5_dspace = c5_dspace_conn.cursor() - cursor_c5_dspace.execute("SELECT * FROM information_schema.sequences") - c5_dspace_seq = cursor_c5_dspace.fetchall() - - # Do not import `clarin-utilities` sequences because of this issue: - # https://github.com/dataquest-dev/dspace-python-api/issues/114 - - # # get all sequences from clarin-utilities database - # cursor_c5_utilities = c5_utilities_conn.cursor() - # cursor_c5_utilities.execute("SELECT * FROM information_schema.sequences") - # c5_utilities_seq = cursor_c5_utilities.fetchall() - # - # # join all clarin5 sequences into one list as clarin 7 only has one database for sequences - clarin5_all_seq = c5_dspace_seq - - cursor_c7_dspace = c7_dspace.cursor() - cursor_c7_dspace.execute("SELECT * FROM information_schema.sequences") - c7_dspace_seq = cursor_c7_dspace.fetchall() - c7_dspace_seq_names = [seq[2] for seq in c7_dspace_seq] - - name_idx = 2 - db_idx = 0 - - # check if all sequences from clarin 5 are already present in clarin 7 - failed_seq = [] - for c5_seq in clarin5_all_seq: - - c5_seq_name = c5_seq[name_idx] - seq_db = c5_seq[db_idx] - - if c5_seq_name not in c7_dspace_seq_names: - continue - - # use cursor according to database to which sequence belongs - if seq_db == "clarin-dspace": - cursor = cursor_c5_dspace - # else: - # cursor = cursor_c5_utilities - - # get current value of given sequence - cursor.execute(f"SELECT last_value FROM {c5_seq_name}") - c5_seq_val = cursor.fetchone()[0] - - # set value of the sequence in clarin 7 dspace database - cursor_c7_dspace.execute(f"SELECT setval('{c5_seq_name}', {c5_seq_val})") - c7_dspace.commit() - - # check value of the sequence in clarin7 database - test_seq_value(cursor_c7_dspace, c5_seq_name, c5_seq_val) - - _logger.info("Sequence migration is complete.") - - -def connect_to_db(database: str, user: str, password: str, host="localhost", port=5432, max_attempt=5, conn_delay=2): - """ - Try to connect to database with given credential in fixed number of attempt. - Throws ConnectionError exception if fails to do so. - """ - for conn_ctr in range(max_attempt): - conn = psycopg2.connect(database=database, - host=host, - port=port, - user=user, - password=password) - - if conn.closed == 0: - _logger.debug(f"Connection to {database} successful.") - return conn - - _logger.warning(f"Connection to {database} failed. Next attempt [no. 
{conn_ctr}] in {conn_delay} seconds.") - time.sleep(conn_delay) - - raise ConnectionError(f"Connection to {database} could not be established in {max_attempt} attempts.") - - -def test_seq_value(cursor: psycopg2.extensions.cursor, seq_name: str, expected_val: int): - cursor.execute(f"SELECT last_value FROM {seq_name}") - seq_val = cursor.fetchone()[0] - - if seq_val != expected_val: - _logger_test.error(f"{seq_name} --> [{seq_val}] does not match expected [{expected_val}].") diff --git a/data_pump/tasklistitem.py b/data_pump/tasklistitem.py deleted file mode 100644 index 29c5ae3..0000000 --- a/data_pump/tasklistitem.py +++ /dev/null @@ -1,37 +0,0 @@ -import logging - -from data_pump.utils import read_json, do_api_post - - -def import_tasklistitem(workflowitem_id_dict, - eperson_id_dict, - statistics_dict): - """ - Import data into database. - Mapped table: tasklistitem - """ - tasklistitem_json_name = "tasklistitem.json" - tasklistitem_url = 'clarin/eperson/groups/tasklistitem' - imported_tasklistitem = 0 - tasklistitem_json_list = read_json(tasklistitem_json_name) - if not tasklistitem_json_list: - logging.info("Tasklistitem JSON is empty.") - return - for tasklistitem in tasklistitem_json_list: - try: - params = { - 'epersonUUID': eperson_id_dict[tasklistitem['eperson_id']], - 'workflowitem_id': workflowitem_id_dict[tasklistitem['workflow_id']] - } - response = do_api_post(tasklistitem_url, params, None) - if response.ok: - imported_tasklistitem += 1 - else: - raise Exception(response) - except Exception as e: - logging.error('POST request ' + tasklistitem_url + ' failed. Exception: ' + - str(e)) - - statistics_val = (len(tasklistitem_json_list), imported_tasklistitem) - statistics_dict['tasklistitem'] = statistics_val - logging.info("Tasklistitem was sucessfully imported!") diff --git a/data_pump/user_metadata.py b/data_pump/user_metadata.py deleted file mode 100644 index 88787c2..0000000 --- a/data_pump/user_metadata.py +++ /dev/null @@ -1,97 +0,0 @@ -import logging -from data_pump.utils import read_json, do_api_post - - -def import_user_metadata(bitstream_id_dict, - user_registration_id_dict, - statistics_dict): - """ - Import data into database. - Mapped tables: user_metadata, license_resource_user_allowance - """ - user_met_url = 'clarin/import/usermetadata' - user_met_json_name = 'user_metadata.json' - imported_user_met = 0 - # read license_resource_user_allowance - # mapping transaction_id to mapping_id - user_allowance_dict = {} - user_allowance_json_list = read_json("license_resource_user_allowance.json") - if not user_allowance_json_list: - logging.info("License_resource_user_allowance JSON is empty.") - return - for user_allowance in user_allowance_json_list: - user_allowance_dict[user_allowance['transaction_id']] = user_allowance - - # read license_resource_mapping - # mapping bitstream_id to mapping_id - resource_mapping_json_list = read_json('license_resource_mapping.json') - mappings_dict = {} - if not resource_mapping_json_list: - logging.info("License_resource_mapping JSON is empty.") - return - for resource_mapping in resource_mapping_json_list: - mappings_dict[resource_mapping['mapping_id']] = resource_mapping['bitstream_id'] - - # read user_metadata - user_met_json_list = read_json(user_met_json_name) - if not user_met_json_list: - logging.info("User_metadata JSON is empty.") - return - - # Group user metadata by `transaction_id`. 
The endpoint must receive list of all metadata with the same - # transaction_id` because if the endpoint will be called for every `user_metadata` there will be a huge amount - # of `license_resource_user_allowance` records with not correct mapping with the `user_metadata` table. - user_met_json_dict = {} - for user_met in user_met_json_list: - if user_met['transaction_id'] not in user_allowance_dict: - continue - - # If the user_metadata with transaction_id has some values in the list - append the list, otherwise - # create a new one - if user_met['transaction_id'] in user_met_json_dict: - user_met_json_dict[user_met['transaction_id']].append(user_met) - else: - user_met_json_dict[user_met['transaction_id']] = [user_met] - - # Go through dict and import user_metadata - for user_met_key in user_met_json_dict.keys(): - # Get list of all user_metadata following `transaction_id` - user_met_list = user_met_json_dict[user_met_key] - # Get user_registration data for importing - data_user_all_dict = user_allowance_dict[user_met_list[0]['transaction_id']] - # Get `eperson_id` for importing - eperson_id = user_met_list[0]['eperson_id'] - - # Prepare user_metadata list for request - user_met_list_request = [] - for user_met in user_met_list: - user_met_list_request.append( - {'metadataKey': user_met['metadata_key'], - 'metadataValue': user_met['metadata_value'] - }) - - try: - # Prepare params for the import endpoint - params = { - 'bitstreamUUID': bitstream_id_dict[mappings_dict[ - data_user_all_dict['mapping_id']]], - 'createdOn': data_user_all_dict['created_on'], - 'token': data_user_all_dict['token'], - 'userRegistrationId': user_registration_id_dict[eperson_id] - } - response = do_api_post(user_met_url, params, user_met_list_request) - if response.ok: - imported_user_met += len(user_met_list_request) - else: - raise Exception(response) - except Exception as e: - logging.error('POST response ' + user_met_url + - ' failed for user registration id: ' + - str(user_met['eperson_id']) + - ' and bitstream id: ' + - str(mappings_dict[data_user_all_dict['mapping_id']]) + - '. Exception: ' + str(e)) - - statistics_val = (len(user_met_json_dict), imported_user_met) - statistics_dict['user_metadata'] = statistics_val - logging.info("User metadata successfully imported!") diff --git a/data_pump/user_registration.py b/data_pump/user_registration.py deleted file mode 100644 index 22b9d4c..0000000 --- a/data_pump/user_registration.py +++ /dev/null @@ -1,50 +0,0 @@ -import logging - -from data_pump.utils import read_json, convert_response_to_json, do_api_post, \ - save_dict_as_json -from migration_const import USER_REGISTRATION_DICT - -def import_user_registration(email2epersonId_dict, - eperson_id_dict, - userRegistration_id_dict, - statistics_dict, - save_dict): - """ - Import data into database. 
- Mapped tables: user_registration - """ - user_reg_json_name = "user_registration.json" - user_reg_url = 'clarin/import/userregistration' - imported_user_reg = 0 - # read user_registration - user_reg_json_list = read_json(user_reg_json_name) - if not user_reg_json_list: - logging.info("User_registration JSON is empty.") - return - for user_reg_json in user_reg_json_list: - user_reg_json_p = { - 'email': user_reg_json['email'], - 'organization': user_reg_json['organization'], - 'confirmation': user_reg_json['confirmation'] - } - if user_reg_json['email'] in email2epersonId_dict: - user_reg_json_p['ePersonID'] = \ - eperson_id_dict[email2epersonId_dict[user_reg_json['email']]] - else: - user_reg_json_p['ePersonID'] = None - try: - response = do_api_post(user_reg_url, {}, user_reg_json_p) - userRegistration_id_dict[user_reg_json['eperson_id']] = \ - convert_response_to_json(response)['id'] - imported_user_reg += 1 - except Exception as e: - logging.error('POST request ' + user_reg_url + ' for id: ' + - str(user_reg_json['eperson_id']) + - ' failed. Exception: ' + str(e)) - - # save user registration dict as json - if save_dict: - save_dict_as_json(USER_REGISTRATION_DICT, userRegistration_id_dict) - statistics_val = (len(user_reg_json_list), imported_user_reg) - statistics_dict['user_registration'] = statistics_val - logging.info("User registration was successfully imported!") diff --git a/data_pump/utils.py b/data_pump/utils.py deleted file mode 100644 index d272320..0000000 --- a/data_pump/utils.py +++ /dev/null @@ -1,96 +0,0 @@ -import json -import os -import requests - -from support.dspace_proxy import rest_proxy -from migration_const import DATA_PATH -from migration_const import MAPPING_PATH -from const import API_URL - -_this_dir = os.path.dirname(os.path.abspath(__file__)) - -def read_json(file_name, file_path=DATA_PATH): - """ - Read data from file as json. - @param file_name: file name - @return: data as json - """ - f_path = os.path.join(_this_dir, '..', file_path, file_name) - assert os.path.exists(f_path) - with open(f_path, mode='r', encoding='utf-8') as f: - json_p = json.load(f) - return json_p - - -def convert_response_to_json(response: requests.models.Response): - """ - Convert response to json. - @param response: response from api call - @return: json created from response - """ - return json.loads(response.content.decode('utf-8')) - - -def do_api_post(url, params: dict, json_p): - """ - Insert data into database by api. - @param url: url for api post - @param params: parameters for api post - @param json_p: posted data - @return: response from api post - """ - url = API_URL + url - response = rest_proxy.d.api_post(url, params, json_p) - return response - - -def do_api_get_one(url, object_id): - """ - Get data with id from table. - @param url: url for api get - @param object_id: id of object - @return: response from api get - """ - url = API_URL + url + '/' + str(object_id) - response = rest_proxy.d.api_get(url, {}, None) - return response - -def do_api_get(url, params): - """ - Get data with id from table. - @param url: url for api get - @param object_id: id of object - @return: response from api get - """ - url = API_URL + url - response = rest_proxy.d.api_get(url, params, None) - return response - - -def do_api_get_all(url): - """ - Get all data from table. 
- @param url: url for api get - @return: response from api get - """ - url = API_URL + url - # is the default value of how many items you get when you want all data from a table - # need to increase this value, or you won't get all data - # you increase this value by param 'size' - params = {'size': 1000} - response = rest_proxy.d.api_get(url, params, None) - return response - - -def save_dict_as_json(json_name, dictionary: dict): - """ - Save data from dictionaries as json. - """ - os.makedirs(MAPPING_PATH, exist_ok=True) - with open(MAPPING_PATH + json_name, 'w') as f: - f.write(json.dumps(dictionary)) - - -def create_dict_from_json(json_name): - return {int(key): value - for key, value in read_json(json_name, MAPPING_PATH).items()} diff --git a/data_pump/var_declarations.py b/data_pump/var_declarations.py deleted file mode 100644 index 1272315..0000000 --- a/data_pump/var_declarations.py +++ /dev/null @@ -1,26 +0,0 @@ -eperson_id_dict = {} -email2epersonId_dict = {} -user_registration_id_dict = {} -group_id_dict = {} -community_id_dict = {} -community2logo_dict = {} -collection_id_dict = {} -collection2logo_dict = {} -item_id_dict = {} -workflowitem_id_dict = {} -workspaceitem_id_dict = {} -# {'item_id': handle} - for importing item versions -item_handle_item_metadata_dict = {} -bitstreamformat_id_dict = {} -primaryBitstream_dict = {} -bitstream2bundle_dict = {} -bundle_id_dict = {} -bitstream_id_dict = {} -statistics_dict = {} -unknown_format_id_val = None - -# Migration item version history -DC_RELATION_REPLACES_ID = 50 -DC_RELATION_ISREPLACEDBY_ID = 51 -DC_IDENTIFIER_URI_ID = 25 - diff --git a/expected.py b/expected.py deleted file mode 100644 index 5fcba55..0000000 --- a/expected.py +++ /dev/null @@ -1,10 +0,0 @@ -# this file contains expected values for tests -# sometimes SOMEONE changes them in main.py and then errors might not be too readable. 
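The removed `save_dict_as_json` / `create_dict_from_json` pair above exists because JSON object keys are always strings, so an `id -> uuid` mapping written with `json.dumps` comes back with string keys. A small self-contained illustration of why the helper cast keys back to `int` (the values are made up):

```
import json

mapping = {10: "some-uuid"}
restored = json.loads(json.dumps(mapping))
assert list(restored) == ["10"]                       # keys were stringified
restored = {int(k): v for k, v in restored.items()}   # what create_dict_from_json did
assert restored == mapping
```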
-# if the values are not what is present here, warnings will be shown - -exp_host = "dev-5.pc" -exp_FE_port = None -exp_BE_port = None -exp_SSL = False -exp_import_command = "docker exec -it dspace /dspace/bin/dspace oai import -c > " \ - "/dev/null 2> /dev/null" diff --git a/main.data_pump.py b/main.data_pump.py deleted file mode 100644 index ea96c07..0000000 --- a/main.data_pump.py +++ /dev/null @@ -1,181 +0,0 @@ -import argparse -import sys -import logging - -import data_pump.var_declarations as var -import migration_const as mig_const -from data_pump.bitstream import import_bitstream -from data_pump.bitstreamformatregistry import import_bitstreamformatregistry -from data_pump.bundle import import_bundle -from data_pump.collection import import_collection -from data_pump.community import import_community -from data_pump.resourcepolicy import import_resource_policies, delete_all_resource_policy -from data_pump.user_metadata import import_user_metadata -from data_pump.eperson import import_eperson, import_group2eperson -from data_pump.epersongroup import import_epersongroup, import_group2group, \ - load_admin_anonymous_groups -from data_pump.handle import Handle -from data_pump.item import import_item -from data_pump.license import import_license -from data_pump.metadata import Metadata -from data_pump.registrationdata import import_registrationdata -from data_pump.tasklistitem import import_tasklistitem -from data_pump.user_registration import import_user_registration -from data_pump.utils import read_json, create_dict_from_json -from data_pump.sequences import migrate_sequences - -logging.basicConfig(level=logging.INFO) -_logger = logging.getLogger("root") - - -def at_the_end_of_import(handle_class_p, statistics_dict): - # write statistic about handles - handle_json_list = read_json("handle.json") - statistics_dict['handle'] = (len(handle_json_list), - handle_class_p.get_imported_handle()) - # write statistic into log - _logger.info("Statistics:") - for key, value in statistics_dict.items(): - # resourcepolicy has own style of statistics - if key == 'resourcepolicy': - string = '' - for rpKey, rpValue in value.items(): - string += str(rpValue) + ' ' + rpKey + ', ' - _logger.info(key + ": " + string) - continue - _logger.info(key + ": " + str(value[0]) + - " expected and imported " + str(value[1])) - - -def load_data_into_dicts(): - var.eperson_id_dict = create_dict_from_json(mig_const.EPERSON_DICT) - var.user_registration_id_dict = create_dict_from_json(mig_const.USER_REGISTRATION_DICT) - var.group_id_dict = create_dict_from_json(mig_const.EPERSONGROUP_DICT) - var.community_id_dict = create_dict_from_json(mig_const.COMMUNITY_DICT) - var.collection_id_dict = create_dict_from_json(mig_const.COLLECTION_DICT) - var.item_id_dict = create_dict_from_json(mig_const.ITEM_DICT) - var.workspaceitem_id_dict = create_dict_from_json(mig_const.WORKSPACEITEM_DICT) - var.workflowitem_id_dict = create_dict_from_json(mig_const.WORKFLOWITEM_DICT) - var.bitstreamformat_id_dict = create_dict_from_json(mig_const.BITSTREAM_FORMAT_DICT) - var.bundle_id_dict = create_dict_from_json(mig_const.BUNDLE_DICT) - var.bitstream_id_dict = create_dict_from_json(mig_const.BITSTREAM_DICT) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Upload values into dictionaries') - parser.add_argument('--load_dict_bool', - help='bool value if we load values into dict', - required=False, type=bool, default=False) - parser.add_argument('--save_dict_bool', - help='bool value if we save dict values into 
jsons', - required=False, type=bool, default=False) - args = parser.parse_args() - - # Is the email server really off? - email_s_off = input("Please make sure your email server is turned off. " - "Otherwise unbearable amount of emails will be sent. " - "Is your EMAIL SERVER really OFF? (Y/N)") - email_s_off = email_s_off.lower() - # terminate the program - if email_s_off not in ("y", "yes"): - sys.exit() - - if args.load_dict_bool: - load_data_into_dicts() - handle_class = Handle() - metadata_class = Metadata(var.statistics_dict, var.item_handle_item_metadata_dict, args.load_dict_bool) - - _logger.info("Data migration started!") - # group Administrator and Anonymous already exist, load them - load_admin_anonymous_groups(var.group_id_dict) - import_community(metadata_class, - handle_class, - var.group_id_dict, - var.community_id_dict, - var.community2logo_dict, - var.statistics_dict, - args.save_dict_bool) - import_collection(metadata_class, - handle_class, - var.group_id_dict, - var.community_id_dict, - var.collection_id_dict, - var.collection2logo_dict, - var.statistics_dict, - args.save_dict_bool) - import_registrationdata(var.statistics_dict) - import_epersongroup(metadata_class, - var.group_id_dict, - var.statistics_dict, - args.save_dict_bool) - import_group2group(var.group_id_dict, var.statistics_dict) - import_eperson(metadata_class, - var.eperson_id_dict, - var.email2epersonId_dict, - var.statistics_dict, - args.save_dict_bool) - import_user_registration(var.email2epersonId_dict, - var.eperson_id_dict, - var.user_registration_id_dict, - var.statistics_dict, - args.save_dict_bool) - import_group2eperson(var.eperson_id_dict, - var.group_id_dict, - var.statistics_dict) - import_license(var.eperson_id_dict, var.statistics_dict) - import_item(metadata_class, - handle_class, - var.workflowitem_id_dict, - var.workspaceitem_id_dict, - var.item_id_dict, - var.collection_id_dict, - var.eperson_id_dict, - var.statistics_dict, - var.item_handle_item_metadata_dict, - args.save_dict_bool) - import_tasklistitem(var.workflowitem_id_dict, - var.eperson_id_dict, - var.statistics_dict) - var.unknown_format_id_val = import_bitstreamformatregistry( - var.bitstreamformat_id_dict, - var.unknown_format_id_val, - var.statistics_dict, - args.save_dict_bool) - import_bundle(metadata_class, - var.item_id_dict, - var.bundle_id_dict, - var.primaryBitstream_dict, - var.statistics_dict, - args.save_dict_bool) - import_bitstream(metadata_class, - var.bitstreamformat_id_dict, - var.primaryBitstream_dict, - var.bitstream2bundle_dict, - var.bundle_id_dict, - var.community2logo_dict, - var.collection2logo_dict, - var.bitstream_id_dict, - var.community_id_dict, - var.collection_id_dict, - var.unknown_format_id_val, - var.statistics_dict, - args.save_dict_bool) - import_user_metadata(var.bitstream_id_dict, - var.user_registration_id_dict, - var.statistics_dict) - # before importing of resource policies we have to delete all - # created data - delete_all_resource_policy() - import_resource_policies(var.community_id_dict, - var.collection_id_dict, - var.item_id_dict, - var.bundle_id_dict, - var.bitstream_id_dict, - var.eperson_id_dict, - var.group_id_dict, - var.statistics_dict) - # migrate sequences - migrate_sequences() - - at_the_end_of_import(handle_class, var.statistics_dict) - _logger.info("Data migration is completed!") diff --git a/migration_const.py b/migration_const.py deleted file mode 100644 index ef4f01c..0000000 --- a/migration_const.py +++ /dev/null @@ -1,25 +0,0 @@ -# CHANGE ME -DATA_PATH = "data/" 
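The removed entry point above declares `--load_dict_bool` and `--save_dict_bool` with `type=bool`, and argparse's `type=bool` treats any non-empty string (including `"False"`) as true. A short illustration of the pitfall and the usual `store_true` alternative; the flag name is reused from the old script purely as an example:

```
import argparse

# bool("False") is True, so `--save_dict_bool False` with type=bool still enables the flag.
parser = argparse.ArgumentParser()
parser.add_argument("--save_dict_bool", action="store_true")

print(parser.parse_args(["--save_dict_bool"]).save_dict_bool)  # True
print(parser.parse_args([]).save_dict_bool)                    # False
```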
-ICON_PATH = "icon/" -MAPPING_PATH = "temp-files/" -ACTIONS_LIST = ["READ", "WRITE", "OBSOLETE (DELETE)", - "ADD", "REMOVE", "WORKFLOW_STEP_1", - "WORKFLOW_STEP_2", "WORKFLOW_STEP_3", - "WORKFLOW_ABORT", "DEFAULT_BITSTREAM_READ", - "DEFAULT_ITEM_READ", "ADMIN", - "WITHDRAWN_READ"] - -# mapping dict names -EPERSON_DICT = "eperson_dict.json" -USER_REGISTRATION_DICT = "user_registration_dict.json" -EPERSONGROUP_DICT = "epersongroup_dict.json" -COMMUNITY_DICT = "community_dict.json" -COLLECTION_DICT = "collection_dict.json" -ITEM_DICT = "item_dict.json" -WORKSPACEITEM_DICT = "workspaceitem_dict.json" -WORKFLOWITEM_DICT = "workflowitem_dict.json" -BITSTREAM_FORMAT_DICT = "bitstreamformatregistry_dict.json" -BUNDLE_DICT = "bundle_dict.json" -BITSTREAM_DICT = "bitstream_dict.json" -METADATASCHEMA_DICT = "metadataschema_dict.json" -METADATAFIELD_DICT = "metadatafield_dict.json" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..7b1a61b --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,34 @@ +###### + +[tool.autopep8] +max_line_length = 90 + +###### + +[tool.ruff] +# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. +select = ["E", "F"] +ignore = ["F403", "F405", "E501", "E402", "F841", "E741"] + +# Allow autofix for all enabled rules (when `--fix`) is provided. +fixable = ["A", "B", "C", "D", "E", "F",] +unfixable = [] + +# Exclude a variety of commonly ignored directories. +exclude = [ + "__pypackages__", + ".venv", +] + +# Same as Black. +line-length = 90 + +# Allow unused variables when underscore-prefixed. +dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" + +# Assume Python 3.10. +target-version = "py38" + +[tool.ruff.mccabe] +# Unlike Flake8, default to a complexity level of 10. +max-complexity = 10 diff --git a/requirements.txt b/requirements.txt index 02e7cb0..e68a798 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ bs4 requests lxml psycopg2 -pre-commit \ No newline at end of file +pre-commit +tqdm \ No newline at end of file diff --git a/scripts/.gitignore b/scripts/.gitignore new file mode 100644 index 0000000..b121290 --- /dev/null +++ b/scripts/.gitignore @@ -0,0 +1 @@ +__* \ No newline at end of file diff --git a/scripts/Readme.md b/scripts/Readme.md new file mode 100644 index 0000000..9ea73a3 --- /dev/null +++ b/scripts/Readme.md @@ -0,0 +1,6 @@ +Copy sql dumps to `../input/`: + +``` +ls ../input/dump +clarin-dspace-8.8.23.sql clarin-utilities-8.8.23.sql +``` diff --git a/scripts/init.dspacedb5.sh b/scripts/init.dspacedb5.sh new file mode 100755 index 0000000..e975736 --- /dev/null +++ b/scripts/init.dspacedb5.sh @@ -0,0 +1,23 @@ +#!/bin/bash +echo "Starting postgres" +/usr/local/bin/docker-entrypoint.sh postgres &> ./__postgres.log & +PID=$! 
+sleep 3 + +createuser --username=postgres dspace + +echo "Importing clarin-dspace" +createdb --username=postgres --owner=dspace --encoding=UNICODE clarin-dspace +psql -U postgres clarin-dspace < ../dump/clarin-dspace-8.8.23.sql &> /dev/null +psql -U postgres clarin-dspace < ../dump/clarin-dspace-8.8.23.sql &> ./__clarin-dspace.log + +echo "Importing clarin-utilities" +createdb --username=postgres --encoding=UNICODE clarin-utilities +psql -U postgres clarin-utilities < ../dump/clarin-utilities-8.8.23.sql &> /dev/null +psql -U postgres clarin-utilities < ../dump/clarin-utilities-8.8.23.sql &> ./__clarin-utilities.log + +echo "Done, starting psql" + +# psql -U postgres +echo "Waiting for PID:$PID /usr/local/bin/docker-entrypoint.sh" +wait $PID \ No newline at end of file diff --git a/scripts/start.local.dspace.db.bat b/scripts/start.local.dspace.db.bat new file mode 100644 index 0000000..9f097ce --- /dev/null +++ b/scripts/start.local.dspace.db.bat @@ -0,0 +1,2 @@ +docker run --rm -it --name dspace-db5 -v %cd%:/dq/scripts -v %cd%/../input/dump:/dq/dump -p 5432:5432 -e POSTGRES_DB=empty -e POSTGRES_USER=postgres -e POSTGRES_PASSWORD=dspace postgres /bin/bash -c "cd /dq/scripts && ./init.dspacedb5.sh" +pause diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..2bca3fa --- /dev/null +++ b/src/.gitignore @@ -0,0 +1 @@ +__temp \ No newline at end of file diff --git a/src/dspace/__init__.py b/src/dspace/__init__.py new file mode 100644 index 0000000..bd445e6 --- /dev/null +++ b/src/dspace/__init__.py @@ -0,0 +1,5 @@ +__all__ = [ + "rest" +] + +from ._rest import rest diff --git a/src/dspace/_http.py b/src/dspace/_http.py new file mode 100644 index 0000000..fffacd3 --- /dev/null +++ b/src/dspace/_http.py @@ -0,0 +1,15 @@ +import json +import requests + +g_cnt = 0 + + +def response_to_json(response: requests.models.Response): + """ + Convert response to json. + @param response: response from api call + @return: json created from response + """ + global g_cnt + g_cnt += 1 + return json.loads(response.content.decode('utf-8')) diff --git a/src/dspace/_rest.py b/src/dspace/_rest.py new file mode 100644 index 0000000..ec043ad --- /dev/null +++ b/src/dspace/_rest.py @@ -0,0 +1,470 @@ +import logging +# from json import JSONDecodeError +from ._http import response_to_json +from .impl import client + +_logger = logging.getLogger("dspace.rest") +ANONYM_EMAIL = True + + +def ascii(s, default="unknown"): + try: + return str(s).encode("ascii", "ignore").decode("ascii") + except Exception as e: + pass + return default + + +def progress_bar(arr): + if len(arr) < 2: + return iter(arr) + try: + from tqdm import tqdm + except Exception as e: + return iter(arr) + + mininterval = 5 if len(arr) < 500 else 10 + return tqdm(arr, mininterval=mininterval, maxinterval=2 * mininterval) + + +class rest: + """ + Serves as proxy to Dspace REST API. 
+ Mostly uses attribute d which represents (slightly modified) dspace_client from + original python rest api by dspace developers + """ + + def __init__(self, endpoint: str, user: str, password: str, auth: bool = True): + _logger.info(f"Initialise connection to DSpace REST backend [{endpoint}]") + + self._acceptable_resp = [] + self._get_cnt = 0 + self._post_cnt = 0 + + client.check_response = lambda x, y: self._resp_check(x, y) + self._response_map = { + 201: lambda r: self._resp_ok(r), + 200: lambda r: self._resp_ok(r), + 500: lambda r: self._resp_error(r), + 400: lambda r: self._resp_error(r) + } + + self.client = client.DSpaceClient( + api_endpoint=endpoint, username=user, password=password) + if auth: + if not self.client.authenticate(): + _logger.error(f'Error auth to dspace REST API at [{endpoint}]!') + raise ConnectionError("Cannot connect to dspace!") + _logger.debug(f"Successfully logged in to [{endpoint}]") + _logger.info(f"DSpace REST backend is available at [{endpoint}]") + self.endpoint = endpoint.rstrip("/") + + # ======= + + @property + def get_cnt(self): + return self._get_cnt + + @property + def post_cnt(self): + return self._post_cnt + + # ======= + + def push_acceptable(self, arr: list): + self._acceptable_resp.append(arr) + + def pop_acceptable(self): + self._acceptable_resp.pop() + + # ======= + + def clarin_put_handles(self, handle_arr: list): + """ + Import handles which have not objects into database. + Other handles are imported by dspace objects. + Mapped table: handles + """ + url = 'clarin/import/handle' + arr = [{'handle': h['handle'], 'resourceTypeID': h['resource_type_id']} + for h in handle_arr] + return self._put(url, arr) + + def put_handles(self, handle_arr: list): + url = 'core/handles' + arr = [{'handle': h['handle'], 'url': h['url']} for h in handle_arr] + return self._put(url, arr) + + # ======= + + def fetch_existing_epersongroups(self): + """ + Get all existing eperson groups from database. + """ + url = 'eperson/groups' + resp = self._fetch(url, self.get_many, '_embedded') + return resp["groups"] + + def fetch_metadata_schemas(self): + """ + Gel all existing data from table metadataschemaregistry. + """ + url = 'core/metadataschemas' + arr = self._fetch(url, self.get_many, None) + if arr is None or "_embedded" not in arr: + return None + return arr["_embedded"]['metadataschemas'] + + def fetch_metadata_fields(self): + """ + """ + url = 'core/metadatafields' + arr = self._fetch(url, self.get_many, None) + if arr is None or "_embedded" not in arr: + return None + return arr["_embedded"]['metadatafields'] + + def fetch_metadata_field(self, object_id): + """ + """ + url = 'core/metadatafields' + return self._fetch(url, self.get_one, None, object_id=object_id) + + def fetch_schema(self, object_id): + """ + Gel all existing data from table metadataschemaregistry. 
+ """ + url = 'core/metadataschemas' + return self._fetch(url, self.get_one, None, object_id=object_id) + + def put_metadata_schema(self, data): + url = 'core/metadataschemas' + return list(self._iput(url, [data]))[0] + + def put_metadata_field(self, data: list, params: list): + url = 'core/metadatafields' + return list(self._iput(url, [data], [params]))[0] + + # ======= + + def put_community(self, param: dict, data: dict): + url = 'core/communities' + _logger.debug(f"Importing [{data}] using [{url}]") + arr = list(self._iput(url, [data], [param])) + if len(arr) == 0: + return None + return arr[0] + + def put_community_admin_group(self, com_id: int): + url = f'core/communities/{com_id}/adminGroup' + _logger.debug(f"Adding admin group to [{com_id}] using [{url}]") + return list(self._iput(url, [{}], [{}]))[0] + + # ======= + + def put_collection(self, param: dict, data: dict): + url = 'core/collections' + _logger.debug(f"Importing [{data}] using [{url}]") + arr = list(self._iput(url, [data], [param])) + if len(arr) == 0: + return None + return arr[0] + + def put_collection_editor_group(self, col_id: int): + url = f'core/collections/{col_id}/workflowGroups/editor' + _logger.debug(f"Adding editor group to [{col_id}] using [{url}]") + return list(self._iput(url, [{}], [{}]))[0] + + def put_collection_submitter(self, col_id: int): + url = f'core/collections/{col_id}/submittersGroup' + _logger.debug(f"Adding editor group to [{col_id}] using [{url}]") + return list(self._iput(url, [{}], [{}]))[0] + + def put_collection_bitstream_read_group(self, col_id: int): + url = f'core/collections/{col_id}/bitstreamReadGroup' + _logger.debug(f"Adding bitstream read group to [{col_id}] using [{url}]") + return list(self._iput(url, [{}], [{}]))[0] + + def put_collection_item_read_group(self, col_id: int): + url = f'core/collections/{col_id}/itemReadGroup' + _logger.debug(f"Adding item read group to [{col_id}] using [{url}]") + return list(self._iput(url, [{}], [{}]))[0] + + # ======= + + def put_registrationdata(self, param: dict, data: dict): + url = 'eperson/registrations' + _logger.debug(f"Importing [{data}] using [{url}]") + return list(self._iput(url, [data], [param]))[0] + + # ======= + + def put_eperson_group(self, param: dict, data: dict): + url = 'eperson/groups' + _logger.debug(f"Importing [{data}] using [{url}]") + return list(self._iput(url, [data], [param]))[0] + + def put_group2group(self, parent, child): + url = f'clarin/eperson/groups/{parent}/subgroups' + child_url = f'{self.endpoint}/eperson/groups/{child}' + _logger.debug(f"Importing [{parent}][{child}] using [{url}]") + return list(self._iput(url, [child_url]))[0] + + def put_eperson(self, param: dict, data: dict): + url = 'clarin/import/eperson' + _logger.debug(f"Importing [{data}] using [{url}]") + return list(self._iput(url, [data], [param]))[0] + + def put_userregistration(self, data: dict): + url = 'clarin/import/userregistration' + _logger.debug(f"Importing [{data}] using [{url}]") + return list(self._iput(url, [data]))[0] + + def put_egroup(self, gid: int, eid: int): + url = f'clarin/eperson/groups/{gid}/epersons' + _logger.debug(f"Importing group[{gid}] e:[{eid}] using [{url}]") + eperson_url = f'{self.endpoint}/eperson/groups/{eid}' + return list(self._iput(url, [eperson_url]))[0] + + # ======= + + def fetch_bitstreamregistry(self): + url = 'core/bitstreamformats' + arr = self._fetch(url, self.get_many, None) + if arr is None or "_embedded" not in arr: + return None + return arr["_embedded"]["bitstreamformats"] + + def 
put_bitstreamregistry(self, data: dict): + url = 'core/bitstreamformats' + _logger.debug(f"Importing [{data}] using [{url}]") + return list(self._iput(url, [data]))[0] + + # ======= + + def put_license_label(self, data: dict): + url = 'core/clarinlicenselabels' + _logger.debug(f"Importing [{data}] using [{url}]") + return list(self._iput(url, [data]))[0] + + def put_license(self, param: dict, data: dict): + url = 'clarin/import/license' + _logger.debug(f"Importing [{data}] using [{url}]") + return list(self._iput(url, [data], [param]))[0] + + # ======= + + def put_tasklistitem(self, param: dict): + url = 'clarin/eperson/groups/tasklistitem' + _logger.debug(f"Importing [][{param}] using [{url}]") + return list(self._iput(url, None, [param]))[0] + + # ======= + + def put_bundle(self, item_uuid: int, data: dict): + url = f'core/items/{item_uuid}/bundles' + _logger.debug(f"Importing [{data}] using [{url}]") + return list(self._iput(url, [data],))[0] + + # ======= + + def fetch_raw_item(self, uuid: str): + url = f'core/items/{uuid}' + _logger.debug(f"Fetching [{uuid}] using [{url}]") + r = self.get(url) + if not r.ok: + raise Exception(r) + return response_to_json(r) + + # ======= + + def put_usermetadata(self, params: dict, data: dict): + url = 'clarin/import/usermetadata' + _logger.debug(f"Importing [{data}] using [{url}]") + return list(self._iput(url, [data], [params]))[0] + + # ======= + + def put_resourcepolicy(self, params: dict, data: dict): + url = 'authz/resourcepolicies' + _logger.debug(f"Importing [{data}] using [{url}]") + return list(self._iput(url, [data], [params]))[0] + + # ======= + + def add_checksums(self): + """ + Fill the tables most_recent_checksum and checksum_result based + on imported bitstreams that haven't already their checksum + calculated. 
+ """ + url = 'clarin/import/core/bitstream/checksum' + _logger.debug(f"Checksums using [{url}]") + r = self.post(url) + if not r.ok: + raise Exception(r) + + def put_bitstream(self, param: dict, data: dict): + url = 'clarin/import/core/bitstream' + _logger.debug(f"Importing [][{param}] using [{url}]") + return list(self._iput(url, [data], [param]))[0] + + def put_com_logo(self, param: dict): + url = 'clarin/import/logo/community' + _logger.debug(f"Importing [][{param}] using [{url}]") + r = self.post(url, params=param, data=None) + if not r.ok: + raise Exception(r) + return response_to_json(r) + + def put_col_logo(self, param: dict): + url = 'clarin/import/logo/collection' + _logger.debug(f"Importing [][{param}] using [{url}]") + r = self.post(url, params=param, data=None) + if not r.ok: + raise Exception(r) + return response_to_json(r) + + # ======= + + def fetch_item(self, uuid: str): + url = f'clarin/import/{uuid}/item' + _logger.debug(f"Importing [] using [{url}]") + return self._fetch(url, self.get, None) + + def put_ws_item(self, param: dict, data: dict): + url = 'clarin/import/workspaceitem' + _logger.debug(f"Importing [{data}] using [{url}]") + return list(self._iput(url, [data], [param]))[0] + + def put_wf_item(self, param: dict): + url = 'clarin/import/workflowitem' + _logger.debug(f"Importing [][{param}] using [{url}]") + r = self.post(url, params=param, data=None) + if not r.ok: + raise Exception(r) + return r + + def put_item(self, param: dict, data: dict): + url = 'clarin/import/item' + _logger.debug(f"Importing [][{param}] using [{url}]") + return list(self._iput(url, [data], [param]))[0] + + def put_item_to_col(self, item_uuid: str, data: list): + url = f'clarin/import/item/{item_uuid}/mappedCollections' + _logger.debug(f"Importing [{data}] using [{url}]") + col_url = 'core/collections/' + # Prepare request body which should looks like this: + # `"https://localhost:8080/spring-rest/api/core/collections/{collection_uuid_1}" + \n + # "https://localhost:8080/spring-rest/api/core/collections/{collection_uuid_2}" + data = [f"{self.endpoint}/{col_url}/{x}" for x in data] + return list(self._iput(url, [data]))[0] + + # ======= + + def fetch_search_items(self, item_type: str = "ITEM", page: int = 0, size: int = 100): + """ + TODO(jm): make generic + """ + url = f'discover/search/objects?sort=score,DESC&size={size}&page={page}&configuration=default&dsoType={item_type}&embed=thumbnail&embed=item%2Fthumbnail' + r = self.get(url) + if not r.ok: + raise Exception(r) + return response_to_json(r) + + # ======= + + def _fetch(self, url: str, method, key: str, **kwargs): + try: + r = method(url, **kwargs) + js = response_to_json(r) + if key is None: + return js + return js[key] + except Exception as e: + _logger.error(f'GET [{url}] failed. 
Exception: [{str(e)}]') + return None + + def _put(self, url: str, arr: list, params: list = None): + return len(list(self._iput(url, arr, params))) + + def _iput(self, url: str, arr: list, params=None): + _logger.debug(f"Importing {len(arr)} using [{url}]") + if params is not None: + assert len(params) == len(arr) + + for i, data in enumerate(progress_bar(arr)): + try: + param = params[i] if params is not None else None + r = self.post(url, params=param, data=data) + if not r.ok: + raise Exception(r) + try: + js = None + if len(r.content or '') > 0: + js = response_to_json(r) + yield js + except Exception: + yield r + except Exception as e: + ascii_data = ascii(data) + if ANONYM_EMAIL: + # poor man's anonymize + if "@" in ascii_data or "email" in ascii_data: + ascii_data = ascii_data[:5] + if len(ascii_data) > 80: + ascii_data = f"{ascii_data[:70]}..." + msg_r = "" + try: + msg_r = str(r) + except Exception: + pass + + msg = f'POST [{url}] for [{ascii_data}] failed. Exception: [{str(e)}][{msg_r}]' + _logger.error(msg) + yield None + _logger.debug(f"Imported [{url}] successfully") + + # ======= + + def get_many(self, command: str, size: int = 1000): + params = {'size': size} + return self.get(command, params) + + def get_one(self, command: str, object_id: int): + url = command + '/' + str(object_id) + return self.get(url, {}) + + def get(self, command: str, params=None, data=None): + url = self.endpoint + '/' + command + self._get_cnt += 1 + return self.client.api_get(url, params, data) + + def post(self, command: str, params=None, data=None): + url = self.endpoint + '/' + command + self._post_cnt += 1 + return self.client.api_post(url, params or {}, data or {}) + + # ======= + + def _resp_check(self, r, msg): + if r is None: + _logger.error(f"Failed to receive response [{msg}] ") + raise Exception("No response from server where one was expected") + _logger.debug(f"{str(msg)}: {r.status_code}") + + # explicit accepted + for ar in self._acceptable_resp: + if r.status_code in ar: + return + + if r.status_code not in self._response_map: + _logger.warning(f"Unexpected response: {r.status_code}; [{r.url}]; {r.text}") + else: + self._response_map[r.status_code](r) + + def _resp_error(self, r): + raise ConnectionError(r.text) + + def _resp_ok(self, r): + return True diff --git a/support/dspace_interface/LICENSE.txt b/src/dspace/impl/LICENSE.txt similarity index 100% rename from support/dspace_interface/LICENSE.txt rename to src/dspace/impl/LICENSE.txt diff --git a/data_checker/__init__.py b/src/dspace/impl/__init__.py similarity index 100% rename from data_checker/__init__.py rename to src/dspace/impl/__init__.py diff --git a/support/dspace_interface/client.py b/src/dspace/impl/client.py similarity index 98% rename from support/dspace_interface/client.py rename to src/dspace/impl/client.py index 27c4463..7f959a6 100644 --- a/support/dspace_interface/client.py +++ b/src/dspace/impl/client.py @@ -26,7 +26,6 @@ import requests from requests import Request -from support.dspace_interface.response_map import check_response from .models import DSpaceObject, SimpleDSpaceObject, Bundle, Bitstream, Community, \ Collection, User, Item, Group @@ -35,6 +34,11 @@ import logging +# override in your own code if you want to do something else +def check_response(_1, _2): + pass + + def parse_json(response): """ Simple static method to handle ValueError if JSON is invalid in response body @@ -201,7 +205,7 @@ def api_post(self, url, params, data, retry=False, content_type='application/jso logging.error( 'API Post: 
Already retried... something must be wrong') else: - logging.info("API Post: Retrying request with updated CSRF token") + logging.debug("API Post: Retrying request with updated CSRF token") return self.api_post(url, params=params, data=data, retry=True) elif r.status_code == 401: r_json = r.json() @@ -212,7 +216,7 @@ def api_post(self, url, params, data, retry=False, content_type='application/jso 'API Post: Already retried... something must be wrong') self.exception401Counter = 0 else: - logging.info("API Post: Retrying request with updated CSRF token") + logging.debug("API Post: Retrying request with updated CSRF token") # try to authenticate self.authenticate() # Try to authenticate and repeat the request 3 times - @@ -259,7 +263,7 @@ def api_put(self, url, params, json_p, retry=False): if retry: logging.error('Already retried... something must be wrong') else: - logging.info("Retrying request with updated CSRF token") + logging.debug("Retrying request with updated CSRF token") return self.api_put(url, params=params, json_p=json_p, retry=True) return r @@ -294,7 +298,7 @@ def api_delete(self, url, params, retry=False): if retry: logging.error('Already retried... something must be wrong') else: - logging.info("Retrying request with updated CSRF token") + logging.debug("Retrying request with updated CSRF token") return self.api_delete(url, params=params, retry=True) return r @@ -361,7 +365,7 @@ def api_patch(self, url, operation, path, value, retry=False): if retry: logging.error('Already retried... something must be wrong') else: - logging.info("Retrying request with updated CSRF token") + logging.debug("Retrying request with updated CSRF token") return self.api_patch(url, operation, path, value, True) elif r.status_code == 200: # 200 Success @@ -455,7 +459,6 @@ def create_dso(self, url, params, data): is nice too and can always be parsed from this response later. """ r = self.api_post(url, params, data) - from support.dspace_interface.response_map import check_response check_response(r, "Creating item in dspace.") return r @@ -685,7 +688,7 @@ def create_bitstream(self, bundle=None, name=None, path=None, mime=None, if retry: logging.warning('Already retried... 
something must be wrong') else: - logging.error("Retrying request with updated CSRF token") + logging.debug("Retrying request with updated CSRF token") return self.create_bitstream(bundle, name, path, mime, metadata, True) check_response(r, "creating bitstream") diff --git a/support/dspace_interface/models.py b/src/dspace/impl/models.py similarity index 100% rename from support/dspace_interface/models.py rename to src/dspace/impl/models.py diff --git a/support/dspace_interface/original_readme.md b/src/dspace/impl/readme.md similarity index 100% rename from support/dspace_interface/original_readme.md rename to src/dspace/impl/readme.md diff --git a/src/project_settings.py b/src/project_settings.py new file mode 100644 index 0000000..dd10a5d --- /dev/null +++ b/src/project_settings.py @@ -0,0 +1,62 @@ +import os +from datetime import datetime +_this_dir = os.path.dirname(os.path.abspath(__file__)) +ts = datetime.now().strftime("%Y_%m_%d__%H.%M.%S") + +settings = { + "log_file": os.path.join(_this_dir, "../__logs", f"{ts}.txt"), + + "resume_dir": "__temp/resume/", + + "backend": { + "endpoint": "http://dev-5.pc:85/server/api/", + "user": "test@test.edu", + "password": "admin", + "authentication": True, + }, + + "ignore": { + "missing-icons": ["PUB", "RES", "ReD", "Inf"], + "epersons": [ + # ignore - empty person + 198 + ], + }, + + "db_dspace_7": { + # CLARIN-DSpace 7 database + "name": "dspace", + "host": "localhost", + # careful - NON standard port + "port": 5435, + "user": "dspace", + "password": "dspace", + }, + + "db_dspace_5": { + "name": "clarin-dspace", + "host": "localhost", + "user": "postgres", + "password": "dspace", + "port": 5432, + }, + + "db_utilities_5": { + "name": "clarin-utilities", + "host": "localhost", + "user": "postgres", + "password": "dspace", + "port": 5432, + }, + + "input": { + "datadir": os.path.join(_this_dir, "../input/data"), + "icondir": os.path.join(_this_dir, "../input/icon"), + }, + + "licenses": { + "to_replace_def_url": "https://lindat.mff.cuni.cz/repository/xmlui/page/", + # TODO(jm): replace with correct url + "replace_with_def_url": "http://dev-5.pc:85/XXX/static/", + } +} diff --git a/src/pump/__init__.py b/src/pump/__init__.py new file mode 100644 index 0000000..49dd34e --- /dev/null +++ b/src/pump/__init__.py @@ -0,0 +1,7 @@ +__all__ = [ + "repo", + "db", +] + +from ._repo import repo +from ._db import db diff --git a/src/pump/_bitstream.py b/src/pump/_bitstream.py new file mode 100644 index 0000000..acb4c40 --- /dev/null +++ b/src/pump/_bitstream.py @@ -0,0 +1,250 @@ +import logging +from ._utils import read_json, time_method, serialize, deserialize, progress_bar, log_before_import, log_after_import + +_logger = logging.getLogger("pump.bitstream") + + +class bitstreams: + """ + SQL: + Mapped tables: bitstream, bundle2bitstream, metadata, most_recent_checksum + and checksum_result + """ + TYPE = 0 + validate_table = [ + ["bitstream", { + "compare": ["checksum", "internal_id", "deleted"], + }], + ["bundle2bitstream", { + }], + ["checksum_results", { + "compare": ["result_description", "result_code"], + }], + + ] + + def __init__(self, bitstream_file_str: str, bundle2bitstream_file_str: str): + self._bs = read_json(bitstream_file_str) + self._bundle2bs = read_json(bundle2bitstream_file_str) + + self._id2uuid = {} + self._imported = { + "bitstream": 0, + "com_logo": 0, + "col_logo": 0, + } + + if len(self._bs) == 0: + _logger.info(f"Empty input: [{bitstream_file_str}].") + return + + self._bs2bundle = {} + for e in self._bundle2bs: + 
self._bs2bundle[e['bitstream_id']] = e['bundle_id'] + self._done = [] + + def __len__(self): + return len(self._bs) + + def uuid(self, b_id: int): + return self._id2uuid.get(str(b_id), None) + + @property + def imported(self): + return self._imported['bitstream'] + + @property + def imported_com_logos(self): + return self._imported['com_logo'] + + @property + def imported_col_logos(self): + return self._imported['col_logo'] + + @time_method + def import_to(self, env, cache_file, dspace, metadatas, bitstreamformatregistry, bundles, communities, collections): + if "bs" in self._done: + _logger.info("Skipping bitstream import") + else: + self._done.append("bs") + self._bitstream_import_to(env, dspace, metadatas, + bitstreamformatregistry, bundles, communities, collections) + self.serialize(cache_file) + + if "logos" in self._done: + _logger.info("Skipping logo import") + else: + self._done.append("logos") + # add logos (bitstreams) to collections and communities + self._logo2com_import_to(dspace, communities) + self._logo2col_import_to(dspace, collections) + self.serialize(cache_file) + + def _logo2col_import_to(self, dspace, collections): + if not collections.logos: + _logger.info("There are no logos for collections.") + return + + expected = len(collections.logos.items()) + log_key = "collection logos" + log_before_import(log_key, expected) + + for key, value in progress_bar(collections.logos.items()): + col_uuid = collections.uuid(key) + bs_uuid = self.uuid(value) + if col_uuid is None or bs_uuid is None: + continue + + params = { + 'collection_id': col_uuid, + 'bitstream_id': bs_uuid + } + try: + resp = dspace.put_col_logo(params) + self._imported["col_logo"] += 1 + except Exception as e: + _logger.error(f'put_col_logo [{col_uuid}]: failed. Exception: [{str(e)}]') + + log_after_import(log_key, expected, self.imported_col_logos) + + def _logo2com_import_to(self, dspace, communities): + """ + Add bitstream to community as community logo. + Logo has to exist in database. + """ + if not communities.logos: + _logger.info("There are no logos for communities.") + return + + expected = len(communities.logos.items()) + log_key = "communities logos" + log_before_import(log_key, expected) + + for key, value in progress_bar(communities.logos.items()): + com_uuid = communities.uuid(key) + bs_uuid = self.uuid(value) + if com_uuid is None or bs_uuid is None: + continue + + params = { + 'community_id': com_uuid, + 'bitstream_id': bs_uuid, + } + try: + resp = dspace.put_com_logo(params) + self._imported["com_logo"] += 1 + except Exception as e: + _logger.error(f'put_com_logo [{com_uuid}]: failed. 
Exception: [{str(e)}]') + + log_after_import(log_key, expected, self.imported_com_logos) + + def _bitstream_import_to(self, env, dspace, metadatas, bitstreamformatregistry, bundles, communities, collections): + expected = len(self) + log_key = "bitstreams" + log_before_import(log_key, expected) + + for i, b in enumerate(progress_bar(self._bs)): + b_id = b['bitstream_id'] + b_deleted = b['deleted'] + + # do bitstream checksum + # do this after every 500 imported bitstreams, + # because the server may be out of memory + if (i + 1) % 500 == 0: + try: + dspace.add_checksums() + except Exception as e: + _logger.error(f'add_checksums failed: [{str(e)}]') + + data = {} + b_meta = metadatas.value(bitstreams.TYPE, b_id, + log_missing=b_deleted is False) + if b_meta is not None: + data['metadata'] = b_meta + else: + com_logo = b_id in communities.logos.values() + col_logo = b_id in collections.logos.values() + if b_deleted or com_logo or col_logo: + log_fnc = _logger.debug + else: + log_fnc = _logger.warning + log_fnc( + f'No metadata for bitstream [{b_id}] deleted: [{b_deleted}] com logo:[{com_logo}] col logo:[{col_logo}]') + + data['sizeBytes'] = b['size_bytes'] + data['checkSum'] = { + 'checkSumAlgorithm': b['checksum_algorithm'], + 'value': b['checksum'] + } + + if not b['bitstream_format_id']: + unknown_id = bitstreamformatregistry.unknown_format_id + _logger.info(f'Using unknown format for bitstream {b_id}') + b['bitstream_format_id'] = unknown_id + + bformat_mimetype = bitstreamformatregistry.mimetype(b['bitstream_format_id']) + if bformat_mimetype is None: + _logger.critical(f'Bitstream format not found for [{b_id}]') + + params = { + 'internal_id': b['internal_id'], + 'storeNumber': b['store_number'], + 'bitstreamFormat': bformat_mimetype, + 'deleted': b['deleted'], + 'sequenceId': b['sequence_id'], + 'bundle_id': None, + 'primaryBundle_id': None + } + + # TODO(jm): fake bitstreams + TEST_DEV5 = "http://dev-5.pc" in env["backend"]["endpoint"] + if TEST_DEV5: + data['sizeBytes'] = 1748 + data['checkSum'] = { + 'checkSumAlgorithm': b['checksum_algorithm'], 'value': '8a4605be74aa9ea9d79846c1fba20a33'} + params['internal_id'] = '77893754617268908529226218097860272513' + + # if bitstream has bundle, set bundle_id from None to id + if b_id in self._bs2bundle: + bundle_int_id = self._bs2bundle[b_id] + params['bundle_id'] = bundles.uuid(bundle_int_id) + + # if bitstream is primary bitstream of some bundle, + # set primaryBundle_id from None to id + if b_id in bundles.primary: + params['primaryBundle_id'] = bundles.uuid(bundles.primary[b_id]) + try: + resp = dspace.put_bitstream(params, data) + self._id2uuid[str(b_id)] = resp['id'] + self._imported["bitstream"] += 1 + except Exception as e: + _logger.error(f'put_bitstream [{b_id}]: failed. 
Exception: [{str(e)}]') + + # do bitstream checksum for the last imported bitstreams + # these bitstreams can be less than 500, so it is not calculated in a loop + try: + dspace.add_checksums() + except Exception as e: + _logger.error(f'add_checksums failed: [{str(e)}]') + + log_after_import(log_key, expected, self.imported) + + # ============= + + def serialize(self, file_str: str): + data = { + "bs": self._bs, + "bundle2bs": self._bundle2bs, + "id2uuid": self._id2uuid, + "imported": self._imported, + "done": self._done, + } + serialize(file_str, data) + + def deserialize(self, file_str: str): + data = deserialize(file_str) + self._bs = data["bs"] + self._bundle2bs = data["bundle2bs"] + self._id2uuid = data["id2uuid"] + self._imported = data["imported"] + self._done = data["done"] diff --git a/src/pump/_bitstreamformatregistry.py b/src/pump/_bitstreamformatregistry.py new file mode 100644 index 0000000..c9e5635 --- /dev/null +++ b/src/pump/_bitstreamformatregistry.py @@ -0,0 +1,136 @@ +import logging +from ._utils import read_json, time_method, serialize, deserialize, progress_bar, log_before_import, log_after_import + +_logger = logging.getLogger("pump.bitstreamformatregistry") + + +class bitstreamformatregistry: + """ + SQL: + delete from fileextension ; delete from bitstreamformatregistry ; + """ + validate_table = [ + ["bitstreamformatregistry", { + "compare": ["mimetype", "short_description", "support_level"], + }], + ["fileextension", { + "compare": ["extension"], + }], + ] + + def __init__(self, bfr_file_str: str): + self._reg = read_json(bfr_file_str) + self._imported = { + "reg": 0, + "existed": 0, + } + + self._id2uuid = {} + self._id2mimetype = {} + self._unknown_format_id = None + + if len(self) == 0: + _logger.info(f"Empty input: [{bfr_file_str}].") + return + + def __len__(self): + return len(self._reg) + + def uuid(self, f_id: int): + assert isinstance(list(self._id2uuid.keys() or [""])[0], str) + return self._id2uuid.get(str(f_id), None) + + def mimetype(self, f_id: str): + return self._id2mimetype.get(str(f_id), None) + + @property + def imported(self): + return self._imported['reg'] + + @property + def imported_existed(self): + return self._imported['existed'] + + @property + def unknown_format_id(self): + return self._unknown_format_id + + @time_method + def import_to(self, dspace): + """ + Mapped tables: bitstreamformatregistry + """ + expected = len(self) + log_key = "bitstreamformatregistry" + log_before_import(log_key, expected) + + existing_bfr2id = {} + bfr_js = dspace.fetch_bitstreamregistry() + if bfr_js is not None: + for bf in bfr_js: + existing_bfr2id[bf['shortDescription']] = bf['id'] + if bf['description'] == 'Unknown data format': + self._unknown_format_id = bf['id'] + + map = { + 0: 'UNKNOWN', + 1: 'KNOWN', + 2: 'SUPPORTED', + } + + for bf in progress_bar(self._reg): + try: + level_str = map[bf['support_level']] + except Exception as e: + _logger.error( + f'Unsupported bitstream format registry id: [{bf["support_level"]}]') + continue + + bf_id = bf['bitstream_format_id'] + ext_id = existing_bfr2id.get(bf['short_description'], None) + + if ext_id is not None: + self._imported["existed"] += 1 + _logger.debug( + f'Bitstreamformatregistry [{bf["short_description"]}] already exists!') + else: + data = { + 'mimetype': bf['mimetype'], + 'description': bf['description'], + 'shortDescription': bf['short_description'], + 'supportLevel': level_str, + 'internal': bf['internal'] + } + try: + resp = dspace.put_bitstreamregistry(data) + ext_id = resp['id'] + 
self._imported["reg"] += 1 + except Exception as e: + _logger.error(f'put_bitstreamregistry: [{bf_id}] failed [{str(e)}]') + continue + + self._id2uuid[str(bf_id)] = ext_id + self._id2mimetype[str(bf_id)] = bf['mimetype'] + + log_after_import(f"{log_key} [existed:{self.imported_existed}]", + expected, self.imported + self.imported_existed) + + # ============= + + def serialize(self, file_str: str): + data = { + "reg": self._reg, + "id2uuid": self._id2uuid, + "imported": self._imported, + "unknown_format_id": self._unknown_format_id, + "id2mimetype": self._id2mimetype, + } + serialize(file_str, data) + + def deserialize(self, file_str: str): + data = deserialize(file_str) + self._reg = data["reg"] + self._id2uuid = data["id2uuid"] + self._imported = data["imported"] + self._unknown_format_id = data["unknown_format_id"] + self._id2mimetype = data["id2mimetype"] diff --git a/src/pump/_bundle.py b/src/pump/_bundle.py new file mode 100644 index 0000000..5bb3197 --- /dev/null +++ b/src/pump/_bundle.py @@ -0,0 +1,99 @@ +import logging +from ._utils import read_json, time_method, serialize, deserialize, progress_bar, log_before_import, log_after_import + +_logger = logging.getLogger("pump.bundle") + + +class bundles: + """ + Mapped tables: item2bundle, bundle + SQL: + """ + TYPE = 1 + validate_table = [ + ["bundle", { + }], + ] + + def __init__(self, bundle_file_str: str, item2bundle_file_str: str): + self._bundles = read_json(bundle_file_str) + self._item2bundle = read_json(item2bundle_file_str) + self._imported = { + "bundles": 0, + } + self._id2uuid = {} + + if len(self._bundles) == 0: + _logger.info(f"Empty input: [{bundle_file_str}].") + return + + self._itemid2bundle = {} + for e in self._item2bundle: + self._itemid2bundle.setdefault(e['item_id'], []).append(e['bundle_id']) + + self._primary = {} + for b in self._bundles: + primary_id = b['primary_bitstream_id'] + if primary_id: + self._primary[primary_id] = b['bundle_id'] + + def __len__(self): + return len(self._bundles) + + def uuid(self, b_id: int): + assert isinstance(list(self._id2uuid.keys() or [""])[0], str) + return self._id2uuid.get(str(b_id), None) + + @property + def primary(self): + return self._primary + + @property + def imported(self): + return self._imported['bundles'] + + @time_method + def import_to(self, dspace, metadatas, items): + expected = len(self) + log_key = "bundles" + log_before_import(log_key, expected) + + for item_id, bundle_arr in progress_bar(self._itemid2bundle.items()): + for bundle_id in bundle_arr: + data = {} + meta_bundle = metadatas.value(bundles.TYPE, bundle_id) + if meta_bundle: + data['metadata'] = meta_bundle + data['name'] = meta_bundle['dc.title'][0]['value'] + + try: + item_uuid = items.uuid(item_id) + if item_uuid is None: + _logger.critical(f'Item UUID not found for [{item_id}]') + continue + resp = dspace.put_bundle(item_uuid, data) + self._id2uuid[str(bundle_id)] = resp['uuid'] + self._imported["bundles"] += 1 + except Exception as e: + _logger.error(f'put_bundle: [{item_id}] failed [{str(e)}]') + + log_after_import(log_key, expected, self.imported) + + # ============= + + def serialize(self, file_str: str): + # not needed _itemid2bundle, _primary + data = { + "bundles": self._bundles, + "item2bundle": self._item2bundle, + "id2uuid": self._id2uuid, + "imported": self._imported, + } + serialize(file_str, data) + + def deserialize(self, file_str: str): + data = deserialize(file_str) + self._bundles = data["bundles"] + self._item2bundle = data["item2bundle"] + self._id2uuid = data["id2uuid"] + 
self._imported = data["imported"] diff --git a/src/pump/_collection.py b/src/pump/_collection.py new file mode 100644 index 0000000..5ac2d82 --- /dev/null +++ b/src/pump/_collection.py @@ -0,0 +1,185 @@ +import logging +import re +from ._group import groups +from ._utils import read_json, time_method, serialize, deserialize, progress_bar, log_before_import, log_after_import + +_logger = logging.getLogger("pump.collection") + + +class collections: + """ + SQL: + delete from cwf_collectionrole ; delete from collection ; + """ + validate_table = [ + ["collection", { + "nonnull": ["logo_bitstream_id"], + }], + ["community2collection", { + }], + ] + + TYPE = 3 + + def __init__(self, col_file_str: str, com2col_file_str: str, metadata_file_str: str): + self._col = read_json(col_file_str) + self._com2col = read_json(com2col_file_str) + self._imported = { + "col": 0, + "group": 0, + } + self._metadata_values = read_json(metadata_file_str) + self._id2uuid = {} + + self._logos = {} + self._groups_id2uuid = {} + + if len(self._col) == 0: + _logger.info(f"Empty input collections: [{col_file_str}].") + return + + if len(self._com2col) == 0: + _logger.info(f"Empty input community2collection: [{com2col_file_str}].") + return + + # because the role DEFAULT_READ is without old group id in collection + self._col2group = {} + col_def_read_rec = re.compile("COLLECTION_(.*)_DEFAULT_READ") + for meta in self._metadata_values: + if meta['resource_type_id'] != groups.TYPE: + continue + m_text = meta['text_value'] + m = col_def_read_rec.search(m_text) + if m is None: + continue + self._col2group[int(m.group(1))] = meta['resource_id'] + + def __len__(self): + return len(self._col) + + def uuid(self, com_id: int): + assert isinstance(list(self._id2uuid.keys() or [""])[0], str) + return self._id2uuid.get(str(com_id), None) + + def group_uuid(self, g_id: int): + # NOTE: we have string indices + return self._groups_id2uuid.get(str(g_id), []) + + @property + def logos(self): + return self._logos + + @property + def imported_cols(self): + return self._imported['col'] + + @property + def imported_groups(self): + return self._imported['group'] + + @property + def groups_id2uuid(self): + return self._groups_id2uuid + + @time_method + def import_to(self, dspace, handles, metadatas, coms): + expected = len(self) + log_key = "collections" + log_before_import(log_key, expected) + + coll2com = {x['collection_id']: x['community_id'] for x in self._com2col} + + for col in progress_bar(self._col): + col_id = col['collection_id'] + + data = {} + meta_col = metadatas.value(collections.TYPE, col_id) + data['metadata'] = meta_col + + handle_col = handles.get(collections.TYPE, col_id) + if handle_col is None: + _logger.critical(f"Cannot find handle for col [{col_id}]") + continue + + data['handle'] = handle_col + + # filter + data = {k: v for k, v in data.items() if v is not None} + + param = {'parent': coms.uuid(coll2com[col_id])} + + try: + resp = dspace.put_collection(param, data) + col_uuid = resp['id'] + self._id2uuid[str(col_id)] = col_uuid + self._imported["col"] += 1 + except Exception as e: + _logger.error(f'put_collection: [{col_id}] failed [{str(e)}]') + continue + + # add to collection2logo, if collection has logo + if col['logo_bitstream_id'] is not None: + self._logos[str(col_id)] = col["logo_bitstream_id"] + + # greate group + # template_item_id, workflow_step_1, workflow_step_3, admin are not implemented, + # because they are null in all data + ws2 = col['workflow_step_2'] + if ws2: + try: + resp = 
dspace.put_collection_editor_group(col_uuid) + self._groups_id2uuid[str(ws2)] = [resp['id']] + self._imported["group"] += 1 + except Exception as e: + _logger.error( + f'put_collection_editor_group: [{col_id}] failed [{str(e)}]') + + subm = col['submitter'] + if subm: + try: + resp = dspace.put_collection_submitter(col_uuid) + self._groups_id2uuid[str(subm)] = [resp['id']] + self._imported["group"] += 1 + except Exception as e: + _logger.error( + f'put_collection_submitter: [{col_id}] failed [{str(e)}]') + + if col_id in self._col2group: + group_col = self._col2group[col_id] + try: + resp = dspace.put_collection_bitstream_read_group(col_uuid) + self._groups_id2uuid.setdefault(str(group_col), []).append(resp['id']) + self._imported["group"] += 1 + except Exception as e: + _logger.error( + f'put_collection_bitstream_read_group: [{col_id}] failed [{str(e)}]') + + try: + resp = dspace.put_collection_item_read_group(col_uuid) + self._groups_id2uuid.setdefault(str(group_col), []).append(resp['id']) + self._imported["group"] += 1 + except Exception as e: + _logger.error( + f'put_collection_item_read_group: [{col_id}] failed [{str(e)}]') + + log_after_import(log_key, expected, self.imported_cols) + + # ============= + + def serialize(self, file_str: str): + data = { + "id2uuid": self._id2uuid, + "logos": self._logos, + "groups_id2uuid": self._groups_id2uuid, + "imported": self._imported, + } + serialize(file_str, data) + + def deserialize(self, file_str: str): + data = deserialize(file_str) + # TODO(jm): support older cache files + key = "id2uuid" if "id2uuid" in data else "col_created" + self._id2uuid = data[key] + self._logos = data["logos"] + self._groups_id2uuid = data["groups_id2uuid"] + self._imported = data["imported"] diff --git a/src/pump/_community.py b/src/pump/_community.py new file mode 100644 index 0000000..8c10516 --- /dev/null +++ b/src/pump/_community.py @@ -0,0 +1,185 @@ +import logging +from ._utils import read_json, time_method, serialize, deserialize, log_before_import, log_after_import + +_logger = logging.getLogger("pump.community") + + +class communities: + """ + SQL: + delete from community2community ; delete from community2collection ; delete from community ; + """ + validate_table = [ + ["community", { + }], + ["community2community", { + }], + ] + + TYPE = 4 + + def __init__(self, com_file_str: str, com2com_file_str: str): + self._com = read_json(com_file_str) + self._com2com = read_json(com2com_file_str) + self._imported = { + "com": 0, + "group": 0, + "com2com": 0, + } + + # + self._id2uuid = {} + + self._logos = {} + self._groups = {} + + def __len__(self): + return len(self._com) + + @property + def logos(self): + return self._logos + + @property + def imported_coms(self): + return self._imported['com'] + + @property + def imported_com2coms(self): + return self._imported['com2com'] + + @property + def imported_groups(self): + return self._groups + + def uuid(self, com_id: int): + assert isinstance(list(self._id2uuid.keys() or [""])[0], str) + return self._id2uuid.get(str(com_id), None) + + @time_method + def import_to(self, dspace, handles, metadata): + """ + Import data into database. 
+ Mapped tables: community, community2community, metadatavalue, handle + """ + if len(self) == 0: + _logger.info("Community JSON is empty.") + return + + expected = len(self) + log_key = "communities" + log_before_import(log_key, expected) + + parents = {} + childs = {} + for comm2comm in (self._com2com or []): + parent_id = comm2comm['parent_comm_id'] + child_id = comm2comm['child_comm_id'] + parents.setdefault(parent_id, []).append(child_id) + childs.setdefault(child_id, []).append(parent_id) + + for arr in childs.values(): + if len(arr) != 1: + _logger.critical(f"Strange child array: [{arr}]") + + coms = self._com.copy() + + iter = 0 + + i = 0 + while len(coms) > 0: + iter += 1 + + if iter > 200: + _logger.critical( + "Very likely in the process of infinite loop because importing to existing db.") + break + + data = {} + # process community only when: + # comm is not parent and child + # comm is parent and not child + # parent comm exists + # else process it later + com = coms[i] + com_id = com['community_id'] + + not_child = com_id not in childs + not_child_nor_parent = (com_id not in parents and not_child) + com_child = childs[com_id][0] if com_id in childs else None + com_child_uuid = self.uuid(com_child) + if not_child_nor_parent or not_child or com_child_uuid is not None: + + # resource_type_id for community is 4 + handle_com = handles.get(communities.TYPE, com_id) + if handle_com is None: + _logger.critical(f"Cannot find handle for com [{com_id}]") + continue + + data['handle'] = handle_com + + metadata_com = metadata.value(communities.TYPE, com_id) + + if metadata_com: + data['metadata'] = metadata_com + + # create community + parent_d = None + if com_id in childs: + parent_d = {'parent': self.uuid(com_child)} + + try: + new_com_id = dspace.put_community(parent_d, data) + # error + if new_com_id is None: + i += 1 + if i == len(coms): + i = 0 + continue + # make sure the indices are str + self._id2uuid[str(com_id)] = new_com_id['id'] + self._imported["com"] += 1 + except Exception as e: + _logger.error( + f'put_community: [{com_id}] failed. Exception: [{str(e)}]') + continue + + # add to community2logo, if community has logo + if com['logo_bitstream_id'] is not None: + self._logos[str(com_id)] = com["logo_bitstream_id"] + + # create admingroup + if com['admin'] is not None: + try: + resp = dspace.put_community_admin_group(new_com_id['id']) + self._groups[str(com['admin'])] = [resp['id']] + self._imported["group"] += 1 + except Exception as e: + _logger.error( + f'put_community_admin_group: [{new_com_id["id"]}] failed. 
Exception: [{str(e)}]') + del coms[i] + else: + i += 1 + + if i == len(coms): + i = 0 + + log_after_import(log_key, expected, self.imported_coms) + + # ============= + + def serialize(self, file_str: str): + data = { + "com_created": self._id2uuid, + "logos": self._logos, + "groups": self._groups, + "imported": self._imported, + } + serialize(file_str, data) + + def deserialize(self, file_str: str): + data = deserialize(file_str) + self._id2uuid = data["com_created"] + self._logos = data["logos"] + self._groups = data["groups"] + self._imported = data["imported"] diff --git a/src/pump/_db.py b/src/pump/_db.py new file mode 100644 index 0000000..5372e0b --- /dev/null +++ b/src/pump/_db.py @@ -0,0 +1,147 @@ +import sys +import logging +_logger = logging.getLogger("pump.db") + + +class conn: + def __init__(self, env): + self.name = env["name"] + self.host = env["host"] + self.user = env["user"] + self.port = env.get("port", 5432) + self.password = env["password"] + self._conn = None + self._cursor = None + + def connect(self): + if self._conn is not None: + return + + import psycopg2 # noqa + self._conn = psycopg2.connect( + database=self.name, host=self.host, port=self.port, user=self.user, password=self.password) + _logger.debug(f"Connection to database [{self.name}] successful!") + + def __del__(self): + self.close() + + def __enter__(self): + self.connect() + self._cursor = self._conn.cursor() + return self._cursor + + def __exit__(self, exc_type, exc_value, traceback): + if exc_type is not None: + _logger.critical( + f"An exception of type {exc_type} occurred with message: {exc_value}") + return + self._conn.commit() + return self._cursor.close() + + def close(self): + if self._conn: + self._conn.close() + self._conn = None + + +class db: + """ + TODO(jm): working but should be refactored, with semantics + """ + + def __init__(self, env: dict): + self._conn = conn(env) + + # ============= + + def fetch_all(self, sql: str, col_names: list = None): + with self._conn as cursor: + cursor.execute(sql) + arr = cursor.fetchall() + if col_names is not None: + col_names += [x[0] for x in cursor.description] + return arr + + def fetch_one(self, sql: str): + with self._conn as cursor: + cursor.execute(sql) + res = cursor.fetchone() + if res is None: + return None + + return res[0] + + def exe_sql(self, sql_text: str): + with self._conn as cursor: + sql_lines = [x.strip() for x in sql_text.splitlines() if len(x.strip()) > 0] + for sql in sql_lines: + cursor.execute(sql) + return + + # ============= + + def delete_resource_policy(self): + with self._conn as cursor: + expected = self.fetch_one("SELECT COUNT(*) from public.resourcepolicy") + + # delete all data + cursor.execute("DELETE FROM public.resourcepolicy") + deleted = cursor.rowcount + + # control, if we deleted all data + if expected != deleted: + _logger.critical( + f"Did not remove all entries from resourcepolicy table. 
Expected: {expected}, deleted: {deleted}") + sys.exit(1) + + def get_admin_uuid(self, username): + """ + Get uuid of the admin user + """ + res = self.fetch_one(f"SELECT uuid FROM eperson WHERE email like '{username}'") + + # Check if there is a result and extract the ID + if res is not None: + return res + + _logger.error(f"No eperson records in the table for {username}") + return None + + def get_last_id(self, table_name, id_column): + """ + Get id of the last record from the specific table + @return: id of the last record + """ + sql = f"SELECT {id_column} FROM {table_name} ORDER BY {id_column} DESC LIMIT 1" + last_record_id = self.fetch_one(sql) + + if not last_record_id: + _logger.info(f"No records in [{table_name}] table.") + # Default value - the table is empty + return 1 + + # Check if there is a result and extract the ID + return last_record_id + + def all_tables(self): + return self.fetch_all( + "SELECT table_name FROM information_schema.tables WHERE is_insertable_into = 'YES' AND table_schema = 'public'") + + def status(self): + d = {} + tables = self.all_tables() + for table in tables: + name = table[0] + count = self.fetch_one(f"SELECT COUNT(*) FROM {name}") + d[name] = count + zero = "" + msg = "" + for name in sorted(d.keys()): + count = d[name] + if count == 0: + zero += f"{name}," + else: + msg += f"{name: >40}: {int(count): >8d}\n" + + _logger.info(f"\n{msg}Empty tables:\n\t{zero}") + _logger.info(40 * "=") diff --git a/src/pump/_eperson.py b/src/pump/_eperson.py new file mode 100644 index 0000000..883a61c --- /dev/null +++ b/src/pump/_eperson.py @@ -0,0 +1,216 @@ +import logging +from ._utils import read_json, time_method, serialize, deserialize, progress_bar, log_before_import, log_after_import + +_logger = logging.getLogger("pump.eperson") + + +def _emails(email): + """ + The eperson email could consist of more email, return all of them in the array. + If the email doesn't contain `;` that means there is only one email without `;` separator. + """ + if email is None: + return [] + + if ';' not in email: + return [email] + + # email value contains of two email, take just the first one. + # e.g., test@msn.com;name@gmail.com + return email.split(';') + + +class epersons: + """ + Import data into database. 
+ Mapped tables: epersongroup2eperson + SQL: + delete from epersongroup2eperson ; delete from eperson where email NOT IN (SELECT email FROM eperson LIMIT 1) ; + delete from group2groupcache ; delete from group2group ; delete from resourcepolicy ; delete from community2community ; delete from community ; delete from epersongroup where permanent=false; + """ + validate_table = [ + ["eperson", { + # do not use compare because of email field (GDPR) + "compare": ["email", "netid"], + }], + + ["epersongroup2eperson", { + # do not use compare because of email field (GDPR) + "sql": { + "5": "select epersongroup.eperson_group_id, eperson.email from epersongroup2eperson inner join epersongroup ON epersongroup2eperson.eperson_group_id=epersongroup.eperson_group_id inner join eperson ON epersongroup2eperson.eperson_id=eperson.eperson_id", + "7": "select epersongroup.uuid, eperson.email from epersongroup2eperson inner join epersongroup ON epersongroup2eperson.eperson_group_id=epersongroup.uuid inner join eperson ON epersongroup2eperson.eperson_id=eperson.uuid", + "compare": "email", + } + }], + + ] + TYPE = 7 + + def __init__(self, eperson_file_str: str): + self._epersons = read_json(eperson_file_str) + self._imported = { + "p": 0, + } + + self._email2id = {} + self._id2uuid = {} + + if len(self._epersons) == 0: + _logger.info(f"Empty input: [{eperson_file_str}].") + return + + # fill mapping email -> eperson_id + for e in self._epersons: + # eperson email could consist of more emails, add eperson_id into everyone + for email in _emails(e['email']): + self._email2id[email] = e['eperson_id'] + + def __len__(self): + return len(self._epersons) + + def by_email(self, email: str): + return self._email2id.get(email, None) + + def uuid(self, eid: int): + assert isinstance(list(self._id2uuid.keys())[0], str) + return self._id2uuid.get(str(eid), None) + + @property + def imported(self): + return self._imported['p'] + + @time_method + def import_to(self, env, dspace, metadatas): + expected = len(self) + log_key = "eperson" + log_before_import(log_key, expected) + + ignore_eids = env.get("ignore", {}).get("epersons", []) + ignored = 0 + + for e in progress_bar(self._epersons): + e_id = e['eperson_id'] + + if e_id in ignore_eids: + _logger.debug(f"Skipping eperson [{e_id}]") + ignored += 1 + continue + + data = { + 'selfRegistered': e['self_registered'], + 'requireCertificate': e['require_certificate'], + 'netid': e['netid'], + 'canLogIn': e['can_log_in'], + 'lastActive': e['last_active'], + 'email': e['email'], + 'password': e['password'], + 'welcomeInfo': e['welcome_info'], + 'canEditSubmissionMetadata': e['can_edit_submission_metadata'] + } + + e_meta = metadatas.value(epersons.TYPE, e_id) + if e_meta: + data['metadata'] = e_meta + + params = { + 'selfRegistered': e['self_registered'], + 'lastActive': e['last_active'] + } + try: + resp = dspace.put_eperson(params, data) + self._id2uuid[str(e_id)] = resp['id'] + self._imported["p"] += 1 + except Exception as e: + _logger.error(f'put_eperson: [{e_id}] failed [{str(e)}]') + + log_after_import(f"{log_key} ignored:[{ignored}]", + expected, self.imported + ignored) + + # ============= + + def serialize(self, file_str: str): + data = { + "epersons": self._epersons, + "id2uuid": self._id2uuid, + "email2id": self._email2id, + "imported": self._imported, + } + serialize(file_str, data) + + def deserialize(self, file_str: str): + data = deserialize(file_str) + self._epersons = data["epersons"] + self._id2uuid = data["id2uuid"] + self._email2id = data["email2id"] + 
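# A standalone sketch mirroring the `_emails` helper defined at the top of this
# file: a v5 eperson "email" field may hold several addresses separated by ';',
# and every address must map back to the same eperson_id. Names with the
# `_sketch` suffix are illustrative only.

def _emails_sketch(email):
    """Return every address stored in a possibly ';'-separated email field."""
    if email is None:
        return []
    return email.split(';') if ';' in email else [email]


assert _emails_sketch(None) == []
assert _emails_sketch("test@msn.com") == ["test@msn.com"]
assert _emails_sketch("test@msn.com;name@gmail.com") == ["test@msn.com", "name@gmail.com"]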
self._imported = data["imported"] + + +# ============= + +class groups: + """ + Mapped tables: epersongroup2eperson + """ + + def __init__(self, egroups_file_str: str): + self._groups = read_json(egroups_file_str) + self._imported = { + "group": 0, + } + + self._id2uuid = {} + + if len(self._groups) == 0: + _logger.info(f"Empty input: [{egroups_file_str}].") + return + + def __len__(self): + return len(self._groups) + + @property + def imported(self): + return self._imported['group'] + + @time_method + def import_to(self, dspace, groups, epersons): + expected = len(self) + log_key = "epersongroup2eperson" + log_before_import(log_key, expected) + + for g in progress_bar(self._groups): + g_id = g['eperson_group_id'] + e_id = g['eperson_id'] + try: + g_uuid_list = groups.uuid(g_id) + e_uuid = epersons.uuid(e_id) + for g_uuid in g_uuid_list: + if g_uuid is None: + _logger.critical(f"Group UUID for [{g_id}] is None!") + continue + if e_uuid is None: + _logger.critical(f"Eperson UUID for [{e_id}] is None!") + continue + dspace.put_egroup(g_uuid, e_uuid) + self._imported["group"] += 1 + except Exception as e: + _logger.error(f'put_egroup: [{g_id}] failed [{str(e)}]') + + log_after_import(log_key, expected, self.imported) + + # ============= + + def serialize(self, file_str: str): + data = { + "groups": self._groups, + "id2uuid": self._id2uuid, + "imported": self._imported, + } + serialize(file_str, data) + + def deserialize(self, file_str: str): + data = deserialize(file_str) + self._groups = data["groups"] + self._id2uuid = data["id2uuid"] + self._imported = data["imported"] + + # ============= diff --git a/src/pump/_group.py b/src/pump/_group.py new file mode 100644 index 0000000..09c36c0 --- /dev/null +++ b/src/pump/_group.py @@ -0,0 +1,237 @@ +import re +import logging +from ._utils import read_json, time_method, serialize, deserialize, progress_bar, log_before_import, log_after_import + +_logger = logging.getLogger("pump.groups") + + +def _epersongroup_process(repo, v5data: list, v7data: list): + """ + v5: ['COLLECTION_17_DEFAULT_READ', 'COLLECTION_20_WORKFLOW_STEP_2'] + v7: ['COLLECTION_f3c65f29-355e-4ca2-a05b-f3e30883e09f_BITSTREAM_DEFAULT_READ'] + """ + rec = re.compile("(COLLECTION|COMMUNITY)_(\d+)_(.*)") + v5data_new = [] + for val in v5data: + m = rec.match(val) + if m is None: + v5data_new.append(val) + continue + c, c_id, role = m.groups() + uuid = repo.collections.uuid(c_id) if c == "COLLECTION" \ + else repo.communities.uuid(c_id) + if role == "WORKFLOW_STEP_2": + role = "WORKFLOW_ROLE_editor" + if role == "DEFAULT_READ": + v5data_new.append(f"{m.group(1)}_{uuid}_BITSTREAM_DEFAULT_READ") + v5data_new.append(f"{m.group(1)}_{uuid}_ITEM_DEFAULT_READ") + else: + v5data_new.append(f"{m.group(1)}_{uuid}_{role}") + _logger.info( + f"Changing v5 groups to uuid version and adding bitstream/item reads: {len(v5data)} -> {len(v5data_new)}") + + return v5data_new, v7data + + +class groups: + + validate_table = [ + ["epersongroup", { + # do not use compare because of email field (GDPR) + "nonnull": ["eperson_group_id"], + }], + + ["epersongroup", { + "sql": { + "5": "select metadatavalue.text_value from epersongroup inner join metadatavalue ON metadatavalue.resource_id=epersongroup.eperson_group_id and metadatavalue.resource_type_id=6", + "7": "select name from epersongroup", + "compare": 0, + "process": _epersongroup_process, + } + }], + + ["group2group", { + # do not use compare because of email field (GDPR) + "nonnull": ["parent_id", "child_id"], + }], + ["epersongroup2eperson", { + # do not 
use compare because of email field (GDPR) + "nonnull": ["eperson_group_id", "eperson_id"], + }], + ] + + TYPE = 6 + DEF_GID_ANON = "0" + DEF_GID_ADMIN = "1" + + def __init__(self, eperson_file_str: str, g2g_file_str: str): + self._eperson = read_json(eperson_file_str) + self._g2g = read_json(g2g_file_str) + self._imported = { + "eperson": 0, + "group": 0, + "g2g": 0, + "default_groups": 0, + "coll_groups": 0, + } + + # created during import + + # all imported group + self._id2uuid = {} + + if len(self._eperson) == 0: + _logger.info(f"Empty input collections: [{eperson_file_str}].") + + if len(self._g2g) == 0: + _logger.info(f"Empty input collections: [{g2g_file_str}].") + + @property + def imported_eperson(self): + return self._imported['eperson'] + + @property + def imported_g2g(self): + return self._imported['g2g'] + + @property + def anonymous(self): + return self.uuid(groups.DEF_GID_ANON) + + @property + def admins(self): + return self.uuid(groups.DEF_GID_ADMIN) + + def from_rest(self, dspace, ignore_other=False): + """ + Load Administrator and Anonymous groups into dict. + This data already exists in database. + Remember its id. + """ + res = dspace.fetch_existing_epersongroups() + if res is None: + return self + + other_groups = [] + for group in res: + if group['name'] == 'Anonymous': + self._id2uuid[groups.DEF_GID_ANON] = [group['id']] + continue + + if group['name'] == 'Administrator': + self._id2uuid[groups.DEF_GID_ADMIN] = [group['id']] + continue + + other_groups.append(group) + _logger.info( + f"Loaded groups [{self._id2uuid}], other groups:[{len(other_groups)}]") + return self + + def uuid(self, gid: int): + assert isinstance(list(self._id2uuid.keys())[0], str) + return self._id2uuid.get(str(gid), None) + + @time_method + def import_to(self, dspace, metadatas, coll_groups, comm_groups): + # Do not import groups which are already imported + self._id2uuid.update(coll_groups) + self._id2uuid.update(comm_groups) + self._import_eperson(dspace, metadatas) + self._import_group2group(dspace) + + def _import_eperson(self, dspace, metadatas): + """ + Import data into database. + Mapped tables: epersongroup + """ + expected = len(self._eperson) + log_key = "epersongroup" + log_before_import(log_key, expected) + + grps = [] + + for eg in progress_bar(self._eperson): + g_id = eg['eperson_group_id'] + + # group Administrator and Anonymous already exist + # group is created with dspace object too + if str(g_id) in (groups.DEF_GID_ADMIN, groups.DEF_GID_ANON): + self._imported["default_groups"] += 1 + continue + + g_uuid = self.uuid(g_id) + if g_uuid is not None: + # TODO(jm) what is this? 
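# A standalone mirror of the v5 -> v7 group-name rewrite performed by
# `_epersongroup_process` above: numeric collection/community ids are replaced
# by uuids and DEFAULT_READ is expanded into the BITSTREAM/ITEM variants.
# The id -> uuid mapping below is made up for illustration.
import re


def rewrite_v5_group_name_sketch(name, id2uuid):
    m = re.match(r"(COLLECTION|COMMUNITY)_(\d+)_(.*)", name)
    if m is None:
        return [name]
    kind, old_id, role = m.groups()
    uuid = id2uuid[old_id]
    if role == "WORKFLOW_STEP_2":
        role = "WORKFLOW_ROLE_editor"
    if role == "DEFAULT_READ":
        return [f"{kind}_{uuid}_BITSTREAM_DEFAULT_READ",
                f"{kind}_{uuid}_ITEM_DEFAULT_READ"]
    return [f"{kind}_{uuid}_{role}"]


print(rewrite_v5_group_name_sketch(
    "COLLECTION_17_DEFAULT_READ",
    {"17": "f3c65f29-355e-4ca2-a05b-f3e30883e09f"}))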
+ self._imported["coll_groups"] += 1 + continue + + # get group metadata + g_meta = metadatas.value(groups.TYPE, g_id) + if 'dc.title' not in g_meta: + _logger.error(f'Metadata for group [{g_id}] does not contain dc.title!') + continue + + name = g_meta['dc.title'][0]['value'] + del g_meta['dc.title'] + + # the group_metadata contains the name of the group + data = {'name': name, 'metadata': g_meta} + grps.append(name) + try: + # continue + resp = dspace.put_eperson_group({}, data) + self._id2uuid[str(g_id)] = [resp['id']] + self._imported["eperson"] += 1 + except Exception as e: + _logger.error(f'put_eperson_group: [{g_id}] failed [{str(e)}]') + + # sql_del = "delete from epersongroup where name='" + "' or name='".join(grps) + "' ;" + # _logger.info(sql_del) + + log_after_import(f'{log_key} [known existing:{self._imported["default_groups"]}]', + expected, self.imported_eperson + self._imported["default_groups"]) + + def _import_group2group(self, dspace): + """ + Import data into database. + Mapped tables: group2group + """ + expected = len(self._g2g) + log_key = "epersons g2g (could have children)" + log_before_import(log_key, expected) + + for g2g in progress_bar(self._g2g): + parent_a = self.uuid(g2g['parent_id']) + child_a = self.uuid(g2g['child_id']) + if parent_a is None or child_a is None: + _logger.critical( + f"Invalid uuid for [{g2g['parent_id']}] or [{g2g['child_id']}]") + continue + + for parent in parent_a: + for child in child_a: + try: + dspace.put_group2group(parent, child) + # TODO Update statistics when the collection has more group relations. + self._imported["g2g"] += 1 + except Exception as e: + _logger.error( + f'put_group2group: [{parent}][{child}] failed [{str(e)}]') + + log_after_import(log_key, expected, self.imported_g2g) + + # ============= + + def serialize(self, file_str: str): + data = { + "eperson": self._eperson, + "id2uuid": self._id2uuid, + "imported": self._imported, + } + serialize(file_str, data) + + def deserialize(self, file_str: str): + data = deserialize(file_str) + self._eperson = data["eperson"] + self._id2uuid = data["id2uuid"] + self._imported = data["imported"] diff --git a/src/pump/_handle.py b/src/pump/_handle.py new file mode 100644 index 0000000..5a83d03 --- /dev/null +++ b/src/pump/_handle.py @@ -0,0 +1,89 @@ +import logging +from ._utils import read_json, time_method, serialize, deserialize, log_before_import, log_after_import +from ._item import items + +_logger = logging.getLogger("pump.handle") + + +class handles: + """ + SQL: + delete from handle ; + """ + validate_table = [ + ["handle", { + "compare": ["handle", "resource_type_id"], + }], + ] + + def __init__(self, file_str: str): + self._handles = {} + self._imported = 0 + + js = read_json(file_str) + for h in js: + res_type_id = h['resource_type_id'] + res_id = h['resource_id'] + arr = self._handles.setdefault( + str(res_type_id), {}).setdefault(str(res_id), []) + arr.append(h) + + def __len__(self): + return len(self._handles) + + @property + def imported(self): + return self._imported + + # ============= + + def serialize(self, file_str: str): + # cannot serialize tuples as keys + d = { + "handles": self._handles, + "imported": self._imported, + } + serialize(file_str, d, sorted=False) + + def deserialize(self, file_str: str): + data = deserialize(file_str) + self._handles = data["handles"] + self._imported = data["imported"] + + # ============= + + def get_handles_by_type(self, type_id: int = None, res_id: int = None): + return self._handles.get(str(type_id), 
{}).get(str(res_id), []) + + # ============= + + @time_method + def import_to(self, dspace): + # external + arr = self.get_handles_by_type(None, None) + expected = len(arr) + log_key = "external handles" + log_before_import(log_key, expected) + cnt = dspace.put_handles(arr) + log_after_import(log_key, expected, cnt) + self._imported += cnt + + # no object + arr = self.get_handles_by_type(items.TYPE, None) + expected = len(arr) + log_key = "handles" + log_before_import(log_key, expected) + cnt = dspace.clarin_put_handles(arr) + log_after_import(log_key, expected, cnt) + self._imported += cnt + + # ============= + + def get(self, type_id: int, obj_id: int): + """ + Get handle based on object type and its id. + """ + arr = self.get_handles_by_type(type_id, obj_id) + if len(arr) == 0: + return None + return arr[0]['handle'] diff --git a/src/pump/_item.py b/src/pump/_item.py new file mode 100644 index 0000000..b82d248 --- /dev/null +++ b/src/pump/_item.py @@ -0,0 +1,595 @@ +import datetime +import logging +from ._utils import read_json, serialize, deserialize, time_method, progress_bar, log_before_import, log_after_import + +_logger = logging.getLogger("pump.item") + + +class items: + """ + SQL: + delete from workspaceitem ; + """ + TYPE = 2 + validate_table = [ + ["item", { + # do not use compare because of email field (GDPR) + "nonnull": ["in_archive", "withdrawn"], + }], + ["item2bundle", { + # do not use compare because of email field (GDPR) + "nonnull": ["bundle_id"], + }], + ["versionhistory", { + }], + ["workspaceitem", { + }], + ["collection2item", { + }], + ] + + def __init__(self, + item_file_str: str, + ws_file_str: str, + wf_file_str: str, + col2item_file_str: str): + + self._items = read_json(item_file_str) + if len(self._items) == 0: + _logger.info(f"Empty input: [{item_file_str}].") + + self._ws_items = read_json(ws_file_str) + if len(self._ws_items) == 0: + _logger.info(f"Empty input: [{ws_file_str}].") + + self._wf_items = read_json(wf_file_str) + if len(self._wf_items) == 0: + _logger.info(f"Empty input: [{wf_file_str}].") + + self._col2item = read_json(col2item_file_str) + if len(self._col2item) == 0: + _logger.info(f"Empty input: [{col2item_file_str}].") + + self._id2item = {str(e['item_id']): e for e in self._items} + self._id2uuid = {} + self._ws_id2v7id = {} + self._ws_id2uuid = {} + self._wf_id2workflow_id = {} + self._wf_item_ids = [] + self._col_id2uuid = {} + self._migrated_versions = [] + + self._imported = { + "items": 0, + "wf": 0, + "ws": 0, + "cols": 0, + "versions": 0, + } + self._done = [] + self._versions = { + "not_imported_handles": [], + "withdrawn": [], + "not_imported": [], + } + + def __len__(self): + return len(self._items) + + def find_by_uuid(self, uuid: str): + for k, item_uuid in self._id2uuid.items(): + if uuid == item_uuid: + return self._id2item[k] + return None + + def uuid(self, eid: int): + assert isinstance(list(self._id2uuid.keys() or [""])[0], str) + return self._id2uuid.get(str(eid), None) + + def wf_id(self, wfid: int): + return self._wf_id2workflow_id.get(str(wfid), None) + + @property + def imported_ws(self): + return self._imported['ws'] + + @property + def imported_wf(self): + return self._imported['wf'] + + @property + def imported_cols(self): + return self._imported['cols'] + + @property + def imported(self): + return self._imported['items'] + + def item(self, item_id: int): + return self._id2item[str(item_id)] + + @time_method + def import_to(self, cache_file, dspace, handles, metadatas, epersons, collections): + """ + Import data 
into database. + Mapped tables: item, collection2item, workspaceitem, cwf_workflowitem, + metadata, handle + """ + if "ws" in self._done: + _logger.info("Skipping workspace import") + else: + if self._ws_items is not None: + self._ws_import_to(dspace, handles, metadatas, epersons, collections) + self._done.append("ws") + self.serialize(cache_file) + + if "wf" in self._done: + _logger.info("Skipping workflow import") + else: + if self._wf_items is not None: + self._wf_import_to(dspace, handles, metadatas, epersons, collections) + self._done.append("wf") + self.serialize(cache_file) + + if "item" in self._done: + _logger.info("Skipping item import") + else: + self._item_import_to(dspace, handles, metadatas, epersons, collections) + self._done.append("item") + self.serialize(cache_file) + + if "itemcol" in self._done: + _logger.info("Skipping itemcol import") + else: + self._itemcol_import_to(dspace, handles, metadatas, epersons, collections) + self._done.append("itemcol") + self.serialize(cache_file) + + def _import_item(self, dspace, generic_item_d, item, handles, metadatas, epersons, collections, what: str) -> bool: + i_id = item['item_id'] + + data = { + 'discoverable': item['discoverable'], + 'inArchive': item['in_archive'], + 'lastModified': item['last_modified'], + 'withdrawn': item['withdrawn'] + } + i_meta = metadatas.value(items.TYPE, i_id) + if i_meta is not None: + data['metadata'] = i_meta + + i_handle = handles.get(items.TYPE, i_id) + if i_handle is not None: + data['handle'] = i_handle + else: + log_fnc = _logger.info + # workspace do not need to have handle + if what == "workspace": + log_fnc = _logger.debug + log_fnc(f"Cannot find handle for item in {what} [{i_id}]") + + # the params are workspaceitem attributes + params = { + 'owningCollection': collections.uuid(generic_item_d['collection_id']), + 'multipleTitles': generic_item_d['multiple_titles'], + 'publishedBefore': generic_item_d['published_before'], + 'multipleFiles': generic_item_d['multiple_files'], + 'stageReached': generic_item_d.get('stage_reached', -1), + 'pageReached': generic_item_d.get('page_reached', -1), + 'epersonUUID': epersons.uuid(item['submitter_id']) + } + + try: + resp = dspace.put_ws_item(params, data) + ws_id = resp['id'] + if what == "workspace": + self._ws_id2v7id[str(i_id)] = ws_id + except Exception as e: + _logger.error(f'put_ws_item: [{i_id}] failed [{str(e)}]') + return False, None + + try: + resp = dspace.fetch_item(ws_id) + i_uuid = resp['id'] + self._id2uuid[str(i_id)] = i_uuid + if what == "workspace": + self._ws_id2uuid[str(i_id)] = i_uuid + except Exception as e: + _logger.error(f'fetch_item: [{ws_id}] failed [{str(e)}]') + return False, None + + return True, ws_id + + def _ws_import_to(self, dspace, handles, metadatas, epersons, collections): + expected = len(self._ws_items) + log_key = "workspaceitems" + log_before_import(log_key, expected) + + for ws in progress_bar(self._ws_items): + item = self.item(ws['item_id']) + ret, _1 = self._import_item(dspace, ws, item, handles, + metadatas, epersons, collections, "workspace") + if ret: + self._imported["ws"] += 1 + + log_after_import(log_key, expected, self.imported_ws) + + def _wf_import_to(self, dspace, handles, metadatas, epersons, collections): + expected = len(self._wf_items) + log_key = "workflowitems" + log_before_import(log_key, expected) + + # create workflowitem + # workflowitem is created from workspaceitem + # -1, because the workflowitem doesn't contain this attribute + for wf in progress_bar(self._wf_items): + wf_id = 
wf['item_id'] + item = self.item(wf_id) + ret, ws_id = self._import_item(dspace, wf, item, handles, + metadatas, epersons, collections, "workflow") + if not ret: + continue + + # create workflowitem from created workspaceitem + params = {'id': str(ws_id)} + try: + resp = dspace.put_wf_item(params) + self._wf_id2workflow_id[str(wf['workflow_id']) + ] = resp.headers['workflowitem_id'] + self._wf_item_ids.append(wf_id) + self._imported["wf"] += 1 + except Exception as e: + _logger.error(f'put_wf_item: [{wf_id}] failed [{str(e)}]') + + log_after_import(log_key, expected, self.imported_wf) + + def _item_import_to(self, dspace, handles, metadatas, epersons, collections): + expected = len(self._items) + log_key = "items" + log_before_import(log_key, expected) + + without_col = 0 + + ws_items = 0 + wf_items = 0 + + # create other items + for item in progress_bar(self._items): + i_id = item['item_id'] + + # is it already imported in WS? + if str(i_id) in self._ws_id2v7id: + ws_items += 1 + continue + if i_id in self._wf_item_ids: + wf_items += 1 + continue + + data = { + 'discoverable': item['discoverable'], + 'inArchive': item['in_archive'], + 'lastModified': item['last_modified'], + 'withdrawn': item['withdrawn'] + } + + i_meta = metadatas.value(items.TYPE, i_id) + if i_meta: + data['metadata'] = i_meta + + i_handle = handles.get(items.TYPE, i_id) + if i_handle is None: + _logger.critical(f"Cannot find handle for item [{i_id}]") + continue + + data['handle'] = i_handle + + if item['owning_collection'] is None: + _logger.critical(f"Item without collection [{i_id}] is not valid!") + without_col += 1 + continue + + col_uuid = collections.uuid(item['owning_collection']) + params = { + 'owningCollection': col_uuid, + 'epersonUUID': epersons.uuid(item['submitter_id']), + } + + if col_uuid is None: + _logger.critical( + f"Item without collection [{i_id}] cannot be imported here") + continue + + try: + resp = dspace.put_item(params, data) + self._id2uuid[str(i_id)] = resp['id'] + self._imported["items"] += 1 + except Exception as e: + _logger.error(f'put_item: [{i_id}] failed [{str(e)}]') + + log_after_import(f'{log_key} no owning col:[{without_col}], ws items:[{ws_items}] wf items:[{wf_items}]', + expected, self.imported + without_col + ws_items + wf_items) + + def _itemcol_import_to(self, dspace, handles, metadatas, epersons, collections): + # Find items which are mapped in more collections and store them into dictionary in this way + # {'item_uuid': [collection_uuid_1, collection_uuid_2]} + for col in self._col2item: + col_item_id = col['item_id'] + # Every item should have mapped only one collection - the owning collection except the items which + # are mapped into more collections + item_uuid = self.uuid(col_item_id) + if item_uuid is None: + _logger.critical(f"Cannot find collection of item [{col_item_id}]") + continue + col_uuid = collections.uuid(col['collection_id']) + self._col_id2uuid.setdefault(item_uuid, []).append(col_uuid) + + to_import = [x for x in self._col_id2uuid.items() if len(x[1]) > 1] + expected = len(to_import) + log_key = "items coll" + log_before_import(log_key, expected) + + # Call Vanilla REST endpoint which add relation between Item and Collection into the collection2item table + for item_uuid, cols in progress_bar(to_import): + if len(cols) < 2: + continue + try: + data = self._col_id2uuid[item_uuid] + dspace.put_item_to_col(item_uuid, data) + self._imported['cols'] += 1 + except Exception as e: + _logger.error(f'put_item_to_col: [{item_uuid}] failed [{str(e)}]') + + 
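# A standalone sketch of the selection done above: every item is already linked
# to its owning collection, so only items that end up mapped to more than one
# collection uuid need the extra collection2item REST call. All uuids are made up.
col_id2uuid_sketch = {
    "item-uuid-1": ["col-uuid-a"],                # owning collection only -> skipped
    "item-uuid-2": ["col-uuid-a", "col-uuid-b"],  # mapped into a second collection -> imported
}
to_import_sketch = [(i, cols) for i, cols in col_id2uuid_sketch.items() if len(cols) > 1]
assert to_import_sketch == [("item-uuid-2", ["col-uuid-a", "col-uuid-b"])]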
log_after_import(log_key, expected, self.imported_cols) + + # ============= + + def serialize(self, file_str: str): + data = { + "items": self._items, + "ws_items": self._ws_items, + "wf_items": self._wf_items, + "col2item": self._col2item, + "id2item": self._id2item, + "id2uuid": self._id2uuid, + "ws_id2v7id": self._ws_id2v7id, + "ws_id2uuid": self._ws_id2uuid, + "wf_id2uuid": self._wf_id2workflow_id, + "wf_item_ids": self._wf_item_ids, + "col_id2uuid": self._col_id2uuid, + "imported": self._imported, + "done": self._done, + "versions": self._versions, + "migrated_versions": self._migrated_versions, + } + serialize(file_str, data) + + def deserialize(self, file_str: str): + data = deserialize(file_str) + self._items = data["items"] + self._ws_items = data["ws_items"] + self._wf_items = data["wf_items"] + self._col2item = data["col2item"] + self._id2item = data["id2item"] + self._id2uuid = data["id2uuid"] + self._ws_id2v7id = data["ws_id2v7id"] + self._ws_id2uuid = data["ws_id2uuid"] + self._wf_id2workflow_id = data["wf_id2uuid"] + self._wf_item_ids = data.get("wf_item_ids", []) + self._col_id2uuid = data["col_id2uuid"] + self._imported = data["imported"] + self._done = data["done"] + self._versions = data["versions"] + self._migrated_versions = data.get("migrated_versions", []) + + def _migrate_versions(self, env, db7, db5_dspace, metadatas): + _logger.info( + f"Migrating versions [{len(self._id2item)}], already done:[{len(self._migrated_versions)}]") + + admin_username = env["backend"]["user"] + admin_uuid = db7.get_admin_uuid(admin_username) + + self._migrated_versions = [] + + # Migrate versions for every Item + for item_id, item in progress_bar(self._id2item.items()): + # Do not process versions of the item that have already been processed. + if item_id in self._migrated_versions: + continue + + # This sequence contains handles of all versions of the Item ordered from the first version to the latest one + versions = self.get_all_versions(item_id, metadatas) + + # Do not process item which does not have any version + if len(versions or []) == 0: + continue + + _logger.debug(f'Processing all versions for the item with ID: {item_id}') + + # All versions of this Item is going to be processed + # Insert data into `versionhistory` table + versionhistory_new_id = db7.get_last_id( + 'versionhistory', 'versionhistory_id') + 1 + db7.exe_sql(f""" +INSERT INTO versionhistory(versionhistory_id) VALUES ({versionhistory_new_id}) +SELECT setval('versionhistory_seq', {versionhistory_new_id}) +""") + + # Insert data into `versionitem` with `versionhistory` id + versionitem_new_id = db7.get_last_id('versionitem', 'versionitem_id') + 1 + + for index, i_handle in enumerate(versions, 1): + # Get the handle of the x.th version of the Item + i_handle_d = metadatas.versions.get(i_handle, None) + + # If the item is withdrawn the new version could be stored in our repo or in another. Do import that version + # only if the item is stored in our repo. 
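# A sketch of the data this loop walks over: `get_all_versions` (defined further
# below) returns the handles of all versions of an item ordered oldest -> newest,
# with the current item's own handle inside the chain. One `versionitem` row is
# written per known handle, all sharing the same `versionhistory_id`.
# The handles below are made up for illustration.
versions_sketch = [
    "http://hdl.handle.net/123456/0001",  # oldest version (via dc.relation.replaces)
    "http://hdl.handle.net/123456/0002",  # the current item (dc.identifier.uri)
    "http://hdl.handle.net/123456/0003",  # newer version (via dc.relation.isreplacedby)
]
for version_number, handle in enumerate(versions_sketch, 1):
    print(version_number, handle)  # maps to `versionitem.version_number`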
+ if i_handle_d is None: + current_item = self.item(item_id) + if current_item['withdrawn']: + _logger.info( + f'The item handle: {i_handle} cannot be migrated because it is stored in another repository.') + continue + + # Get item_id using the handle + item_id = i_handle_d['item_id'] + # Get the uuid of the item using the item_id + item_uuid = self.uuid(item_id) + # timestamp is required column in the database + timestamp = datetime.datetime.now() + + db7.exe_sql( + f"INSERT INTO public.versionitem(versionitem_id, version_number, version_date, version_summary, versionhistory_id, eperson_id, item_id) VALUES (" + f"{versionitem_new_id}, " + f"{index}, " + f"'{timestamp}', " + f"'', " + f"{versionhistory_new_id}, " + f"'{admin_uuid}', " + f"'{item_uuid}');" + ) + # Update sequence + db7.exe_sql(f"SELECT setval('versionitem_seq', {versionitem_new_id})") + versionitem_new_id += 1 + self._migrated_versions.append(str(item_id)) + + _logger.info(f"Migrated versions [{len(self._migrated_versions)}]") + + def raw_after_import(self, env, db7, db5_dspace, metadatas): + # Migration process + self._migrate_versions(env, db7, db5_dspace, metadatas) + self._check_sum(db7, db5_dspace, metadatas) + + def get_newer_versions(self, item_id: int, metadatas): + return self._get_versions(item_id, metadatas, metadatas.V5_DC_RELATION_ISREPLACEDBY_ID) + + def get_older_versions(self, item_id: int, metadatas): + return self._get_versions(item_id, metadatas, metadatas.V5_DC_RELATION_REPLACES_ID) + + def _get_versions(self, item_id: int, metadatas, metadata_field: int): + """ + Return all previous or newer versions of the item using connection between `dc.relation.replaces` and + `dc.relation.isreplacedby` item metadata. + @return: list of versions or empty list + """ + + def _get_version(cur_item_id): + item_versions = metadatas.value(items.TYPE, cur_item_id, metadata_field) + if len(item_versions or []) == 0: + # _logger.debug(f"Item [{cur_item_id}] does not have any version.") + return None + return item_versions[0] + + versions = [] + cur_item_id = item_id + + # current_version is handle of previous or newer item + cur_item_version = _get_version(cur_item_id) + + while cur_item_version is not None: + # + if cur_item_version not in metadatas.versions: + # Check if current item is withdrawn + # TODO(jm): check original code - item_id + cur_item = self.item(cur_item_id) + if cur_item['withdrawn']: + # The item is withdrawn and stored in another repository + _logger.debug(f'Item [{cur_item_version}] is withdrawn') + self._versions["withdrawn"].append(cur_item_version) + else: + _logger.error( + f'The item with handle: {cur_item_version} has not been imported!') + self._versions["not_imported"].append(cur_item_version) + break + + versions.append(cur_item_version) + cur_item_id = metadatas.versions[cur_item_version]['item_id'] + cur_item_version = _get_version(cur_item_id) + + return versions + + def get_all_versions(self, item_id: int, metadatas): + """ + Return all versions of the item in ordered list from the first version to the latest including the handle of the + current Item + @return: list of the item versions or if the item doesn't have any version return None + """ + # The newer versions of the item + newer_versions = self.get_newer_versions(item_id, metadatas) + # The previous versions of the item + previous_versions = self.get_older_versions(item_id, metadatas) + # Previous versions are in wrong order - reverse the list + previous_versions = previous_versions[::-1] + + # If this item does not have any 
version return a None + if len(newer_versions) == 0 and len(previous_versions) == 0: + return None + + # Get handle of the current Item + cur_handle = metadatas.value( + items.TYPE, item_id, metadatas.V5_DC_IDENTIFIER_URI_ID) + if len(cur_handle or []) == 0: + _logger.error(f'Cannot find handle for the item with id: {item_id}') + self._versions["not_imported_handles"].append(item_id) + return None + + return previous_versions + [cur_handle[0]] + newer_versions + + def _check_sum(self, db7, db5_dspace, metadatas): + """ + Check if item versions importing was successful + Select item ids from CLARIN-DSpace5 which has some version metadata + Select items uuids from CLARIN-DSpace7 `versionitem` table where are stored item's version + Check if all items from CLARIN-DSpace5 has record in the CLARIN-DSpace7 history version table - check uuids + """ + + # Select item ids from CLARIN-DSpace5 which has some version metadata + clarin_5_item_ids = db5_dspace.fetch_all( + f"SELECT resource_id FROM metadatavalue WHERE metadata_field_id in ({metadatas.V5_DC_RELATION_REPLACES_ID},{metadatas.V5_DC_RELATION_ISREPLACEDBY_ID}) group by resource_id;" + ) + + # Select item uuids from CLARIN-DSpace7 which record in the `versionitem` table + clarin_7_item_uuids = db7.fetch_all("select item_id from versionitem") + + if clarin_5_item_ids is None or clarin_7_item_uuids is None: + _logger.error('Cannot check result of importing item versions.') + return + + clarin_7_item_uuids = set([x[0] for x in clarin_7_item_uuids]) + + # Some items could not be imported - uuid + clarin_5_ids_to_uuid = set([self.uuid(x[0]) for x in clarin_5_item_ids]) + + # Check v7 + problematic = [] + for uuid7 in clarin_7_item_uuids: + if uuid7 in clarin_5_ids_to_uuid: + continue + if uuid7 in self._ws_id2uuid.values(): + continue + # if item is in wf/ws it will have the relation stored in versionitem + # in v5, we stored it after item installation + + problematic.append(uuid7) + if len(problematic) > 0: + _logger.warning( + f'We have [{len(problematic)}] versions in v7 `versionitem` that are not expected!') + for uuid in problematic: + _logger.warning(f'UUID: {uuid}') + + # Check v5 + problematic = [] + for uuid5 in clarin_5_ids_to_uuid: + if uuid5 in clarin_7_item_uuids: + continue + # if withdrawn, we do not expect it to be in v7 versionitem + # TODO(jm): check that previous version is replaced by external item + item_d = self.find_by_uuid(uuid5) + if (item_d or {}).get('withdrawn', False): + continue + + problematic.append(uuid5) + if len(problematic) > 0: + _logger.warning( + f'We have [{len(problematic)}] versions in v5 not migrated into `versionitem`!') + for uuid in problematic: + _logger.warning(f'UUID: {uuid}') diff --git a/src/pump/_license.py b/src/pump/_license.py new file mode 100644 index 0000000..ae2da7b --- /dev/null +++ b/src/pump/_license.py @@ -0,0 +1,199 @@ +import os +import logging +from ._utils import read_json, time_method, serialize, deserialize, progress_bar, log_before_import, log_after_import + +_logger = logging.getLogger("pump.license") + + +class licenses: + + validate_table = [ + ["license_definition", { + "compare": ["name", "confirmation", "required_info"], + "db": "clarin-utilities", + }], + ["license_label", { + "compare": ["label", "title"], + "db": "clarin-utilities", + }], + ["license_label", { + "compare": ["label", "title"], + "db": "clarin-utilities", + }], + ["license_label_extended_mapping", { + "nonnull": ["license_id"], + "db": "clarin-utilities", + }], + ["license_resource_user_allowance", { + 
"nonnull": ["mapping_id"], + "db": "clarin-utilities", + }], + ["license_resource_mapping", { + "nonnull": ["license_id"], + "db": "clarin-utilities", + }], + ] + + def __init__(self, + license_labels_file_str: str, + license_defs_file_str: str, + license_map_file_str: str): + self._labels = read_json(license_labels_file_str) + self._licenses = read_json(license_defs_file_str) + self._map = read_json(license_map_file_str) + + self._license2label = {} + self._created_labels = {} + + self._imported = { + "label": 0, + "licenses": 0, + } + + if len(self._labels) == 0: + _logger.info(f"Empty input: [{license_labels_file_str}].") + if len(self._map) == 0: + _logger.info(f"Empty input: [{license_map_file_str}].") + if len(self._licenses) == 0: + _logger.info(f"Empty input: [{license_defs_file_str}].") + + def __len__(self): + return len(self._labels) + + @property + def imported_labels(self): + return self._imported['label'] + + @property + def imported_licenses(self): + return self._imported['licenses'] + + def import_to(self, env, dspace, epersons): + self._import_license_labels(env, dspace) + self._import_license_defs(env, dspace, epersons) + + @time_method + def _import_license_labels(self, env, dspace): + """ + Mapped tables: license_label + """ + expected = len(self._labels) + log_key = "license labels" + log_before_import(log_key, expected) + + no_icon_for_labels = env.get("ignore", {}).get("missing-icons", []) + + for label in progress_bar(self._labels): + l_id = label['label_id'] + l_name = label['label'] + data = { + 'label': l_name, + 'title': label['title'], + 'extended': label['is_extended'], + 'icon': None + } + + # find image with label name + icon_path = os.path.join(env["input"]["icondir"], + l_name.lower() + ".png") + try: + if l_name not in no_icon_for_labels: + with open(icon_path, "rb") as fin: + data['icon'] = list(fin.read()) + except Exception as e: + _logger.error( + f"Problem reading label icon [{os.path.abspath(icon_path)}] [{l_name}]: str(e)") + + try: + resp = dspace.put_license_label(data) + del resp['license'] + del resp['_links'] + self._created_labels[str(l_id)] = resp + self._imported["label"] += 1 + except Exception as e: + _logger.error(f'put_license_label: [{l_id}] failed [{str(e)}]') + + for m in self._map: + lic_id = m['license_id'] + lab_id = m['label_id'] + self._license2label.setdefault(str(lic_id), []).append( + self._created_labels[str(lab_id)]) + + log_after_import(log_key, expected, self.imported_labels) + + @time_method + def _import_license_defs(self, env, dspace, epersons): + expected = len(self._licenses) + log_key = "license defs" + log_before_import(log_key, expected) + + # import license_definition + for lic in progress_bar(self._licenses): + lic_id = lic['license_id'] + lab_id = lic['label_id'] + updated_def = update_license_def(env, lic['definition']) + data = { + 'name': lic['name'], + 'definition': updated_def, + 'confirmation': lic['confirmation'], + 'requiredInfo': lic['required_info'], + 'clarinLicenseLabel': self._created_labels[str(lab_id)] + } + + if lic_id in self._license2label: + data['extendedClarinLicenseLabels'] = self._license2label[lic_id] + + params = {'eperson': epersons.uuid(lic['eperson_id'])} + try: + resp = dspace.put_license(params, data) + self._imported["licenses"] += 1 + except Exception as e: + _logger.error(f'XXX: [{lic_id}] failed [{str(e)}]') + + log_after_import(log_key, expected, self.imported_licenses) + + # ============= + + def serialize(self, file_str: str): + data = { + "labels": self._labels, + 
"licenses": self._licenses, + "map": self._map, + "license2label": self._license2label, + "created_labels": self._created_labels, + "imported": self._imported, + } + serialize(file_str, data) + + def deserialize(self, file_str: str): + data = deserialize(file_str) + self._labels = data["labels"] + self._licenses = data["licenses"] + self._map = data["map"] + self._license2label = data["license2label"] + self._created_labels = data["created_labels"] + self._imported = data["imported"] + + +def update_license_def(env, lic_def_url: str): + """ + Replace license definition url from current site url to a new site url + e.g., from `https://lindat.mff.cuni.cz/repository/xmlui/page/licence-hamledt` + to `https://lindat.mff.cuni.cz/repository/static/licence-hamledt.html` + """ + env_lic = env.get("licenses", {}) + if "to_replace_def_url" not in env_lic: + _logger.info( + "License def URL is not replaced, absolute path to the new repo must math the old one!") + return lic_def_url + + # Replace old site url to a new site url + if env_lic["to_replace_def_url"] in lic_def_url: + lic_def_url = lic_def_url.replace( + env_lic["to_replace_def_url"], + env_lic["replace_with_def_url"] + ) + # File name has a missing `.html` suffix -> add that suffix to the end of the definition url + lic_def_url += '.html' + + return lic_def_url diff --git a/src/pump/_metadata.py b/src/pump/_metadata.py new file mode 100644 index 0000000..1e795db --- /dev/null +++ b/src/pump/_metadata.py @@ -0,0 +1,587 @@ +import logging +import re +from ._utils import read_json, time_method, serialize, deserialize, progress_bar, log_before_import, log_after_import + +_logger = logging.getLogger("pump.metadata") + + +def _metadatavalue_process(repo, v5data: list, v7data: list): + """ + v5: ['COLLECTION_17_DEFAULT_READ', 'COLLECTION_20_WORKFLOW_STEP_2'] + v7: ['COLLECTION_f3c65f29-355e-4ca2-a05b-f3e30883e09f_BITSTREAM_DEFAULT_READ'] + """ + + def norm_lic(text): + # normalize it, not 100% because of licence-UD-2.2 + return text.split('/')[-1].split('.')[0] + + def norm_text(text): + # this should not be a reasonable list of replacements but rather + # instance specific use cases + return text.replace("\u2028", "\n").rstrip() + + rec_complex_funds = re.compile("(euFunds|nationalFunds|ownFunds|@@Other)") + v5data_new = [] + V5_FIELD_ID_APPROX_DATE = repo.metadatas.get_field_id_by_name_v5( + "approximateDate.issued") + specific_fields = { + V5_FIELD_ID_APPROX_DATE: [], + repo.metadatas.V5_DATE_ISSUED: [], + } + + for res_id, res_type_id, text, field_id in v5data: + # ignore '0000', 15 -> we do not store unknown dates + if field_id == repo.metadatas.V5_DATE_ISSUED and text == "0000": + continue + + # ignore file preview in metadata + if field_id in metadatas.IGNORE_FIELDS: + continue + + uuid = repo.uuid(res_type_id, res_id) + if uuid is None: + _logger.debug( + f"Cannot find uuid for [{res_type_id}] [{res_id}] [{str(text)}]") + + if field_id in specific_fields.keys(): + specific_fields[field_id].append(uuid) + + field_id_v7 = repo.metadatas.get_field_id(field_id) + # + if "@@" in text: + splits = text.split("@@") + new_splits = splits + + if field_id_v7 not in (repo.metadatas.V7_FIELD_ID_PROVENANCE,): + if len(splits) == 5: + new_splits = [splits[-2], splits[1], splits[0], splits[2], splits[-1]] + # special case - older complex field impl. 
+ elif len(splits) == 4 and rec_complex_funds.search(text) is not None: + new_splits = [splits[3], splits[1], splits[0], splits[2], ''] + text = ";".join(new_splits) + + # license def + if field_id_v7 == repo.metadatas.V7_FIELD_ID_LIC: + text = norm_lic(text) + + # groups have titles in table + if field_id_v7 == repo.metadatas.V7_FIELD_ID_TITLE and res_type_id == repo.groups.TYPE: + continue + + text = norm_text(text) + v5data_new.append((uuid, text, field_id_v7)) + + # cleanup + # has local.approximateDate.issued -> ignore dc.date.issued + to_check_dates_uuids = set(specific_fields[V5_FIELD_ID_APPROX_DATE]).intersection( + set(specific_fields[repo.metadatas.V5_DATE_ISSUED]) + ) + for to_check_uuid in to_check_dates_uuids: + for i, v in enumerate(v5data_new): + if v is None: + continue + if to_check_uuid == v[0] and v[2] == repo.metadatas.V7_FIELD_DATE_ISSUED: + v5data_new[i] = None + break + v5data_new = [x for x in v5data_new if x is not None] + + v7data_new = [] + for uuid, text, field_id in v7data: + # added language description in addition to language code + if field_id == repo.metadatas.V7_FIELD_LANG_ADDED: + continue + + # should be already ignored # imported preview data + # if field_id == 147: + # continue + + # license def + if field_id == repo.metadatas.V7_FIELD_ID_LIC: + text = norm_lic(text) + + if field_id == repo.metadatas.V7_FIELD_ID_IDENTIFIER_URI: + text = text.replace("http://dev-5.pc:88/handle/", "http://hdl.handle.net/") + + text = norm_text(text) + v7data_new.append((uuid, text, field_id)) + + _logger.info( + f"Changed v5 metadata values to match v7: {len(v5data)} -> {len(v5data_new)}") + _logger.info( + f"Changed v7 metadata values to match v7: {len(v7data)} -> {len(v7data_new)}") + return v5data_new, v7data_new + + +class metadatas: + """ + SQL: + delete from metadatavalue ; delete from metadatafieldregistry ; delete from metadataschemaregistry ; + """ + + # clarin-dspace=# select * from metadatafieldregistry where metadata_field_id=176 ; + # metadata_field_id | metadata_schema_id | element | qualifier | scope_note + # -------------------+--------------------+-----------+-----------+---------------------------------------- + # 176 | 3 | bitstream | file | Files inside a bitstream if an archive + IGNORE_FIELDS = [ + 176 + ] + + validate_table = [ + ["metadataschemaregistry", { + "compare": ["namespace", "short_id"], + }], + ["metadatafieldregistry", { + "compare": ["element", "qualifier"], + }], + ["metadatavalue", { + "sql": { + "5": "select resource_id, resource_type_id, text_value, metadata_field_id from metadatavalue", + "7": "select dspace_object_id, text_value, metadata_field_id from metadatavalue", + "compare": None, + "process": _metadatavalue_process, + } + }], + ] + + def __init__(self, env, dspace, value_file_str: str, field_file_str: str, schema_file_str: str): + self._dspace = dspace + self._values = {} + + self._fields = read_json(field_file_str) + self._fields_id2v7id = {} + self._fields_id2js = {x['metadata_field_id']: x for x in self._fields} + self._v5_fields_name2id = {} + self._v7_fields_name2id = {} + for f in self.fields: + self._v5_fields_name2id[f"{f['element']}.{f['qualifier']}"] = f['metadata_field_id'] + + self._schemas = read_json(schema_file_str) + self._schemas_id2id = {} + self._schemas_id2js = {x['metadata_schema_id']: x for x in self._schemas} + + # read dynamically + self._versions = {} + + self._imported = { + "schema_imported": 0, + "schema_existed": 0, + "field_imported": 0, + "field_existed": 0, + } + + # Find out which field is 
`local.sponsor`, check only `sponsor` string + sponsor_field_id = -1 + sponsors = [x for x in self._fields if x['element'] == 'sponsor'] + if len(sponsors) != 1: + _logger.warning(f"Found [{len(sponsors)}] elements with name [sponsor]") + else: + sponsor_field_id = sponsors[0]['metadata_field_id'] + + # norm + js_value = read_json(value_file_str) + for val in js_value: + # replace separator @@ by ; + val['text_value'] = val['text_value'].replace("@@", ";") + + # replace `local.sponsor` data sequence + # from `;;;` + # to `;;;` + if val['metadata_field_id'] == sponsor_field_id: + val['text_value'] = metadatas._fix_local_sponsor(val['text_value']) + + # ignore file preview in metadata and others + orig_len = len(js_value) + js_value = [x for x in js_value if x["metadata_field_id"] + not in metadatas.IGNORE_FIELDS] + if orig_len != len(js_value): + _logger.warning( + f"Ignoring metadata fields [{metadatas.IGNORE_FIELDS}], len:[{orig_len}->{len(js_value)}]") + + # fill values + for val in js_value: + res_type_id = str(val['resource_type_id']) + res_id = str(val['resource_id']) + arr = self._values.setdefault(res_type_id, {}).setdefault(res_id, []) + arr.append(val) + + # fill values + for val in js_value: + # Store item handle and item id connection in dict + if not val['text_value'].startswith(env["dspace"]["handle_prefix"]): + continue + + # metadata_field_id 25 is Item's handle + if val['metadata_field_id'] == self.V5_DC_IDENTIFIER_URI_ID: + d = self._versions.get(val['text_value'], {}) + d['item_id'] = val['resource_id'] + self._versions[val['text_value']] = d + + def __len__(self): + return sum(len(x) for x in self._values.values()) + + # ===== + + def get_field_id_by_name_v5(self, name: str): + """ + Note: + Multiple schemas should not have the same key, v7 would not allow it. 
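        Usage sketch (ids are specific to the loaded v5 registry; the value
        shown matches the reference dump this module asserts on):

            self.get_field_id_by_name_v5("identifier.uri")   # -> 25
            self.get_field_id_by_name_v5("no.such.field")    # -> None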
+ + select * from metadatafieldregistry where metadata_field_id=XXX ; + """ + return self._v5_fields_name2id.get(name, None) + + @property + def V5_DC_RELATION_REPLACES_ID(self): + from_map = self.get_field_id_by_name_v5('relation.replaces') + assert 50 == from_map + return from_map + + @property + def V5_DC_RELATION_ISREPLACEDBY_ID(self): + from_map = self.get_field_id_by_name_v5('relation.isreplacedby') + assert 51 == from_map + return from_map + + @property + def V5_DC_IDENTIFIER_URI_ID(self): + from_map = self.get_field_id_by_name_v5('identifier.uri') + assert 25 == from_map + return from_map + + @property + def V5_DATE_ISSUED(self): + from_map = self.get_field_id_by_name_v5('date.issued') + assert 15 == from_map + return from_map + + @property + def V7_FIELD_ID_LIC(self): + return 63 + from_map = self.get_field_id_by_name('date.issued') + assert 63 == from_map + return from_map + + @property + def V7_FIELD_DATE_ISSUED(self): + return 22 + + @property + def V7_FIELD_LANG_ADDED(self): + return 149 + from_map = self.get_field_id_by_name('date.issued') + assert 149 == from_map + return from_map + + @property + def V7_FIELD_ID_IDENTIFIER_URI(self): + return 34 + from_map = self.get_field_id_by_name('identififer.uri') + assert 34 == from_map + return from_map + + @property + def V7_FIELD_ID_TITLE(self): + return 74 + from_map = self.get_field_id_by_name('title.None') + assert 74 == from_map + return from_map + + @property + def V7_FIELD_ID_PROVENANCE(self): + return 37 + from_map = self.get_field_id_by_name('provenance.None') + assert 37 == from_map + return from_map + + # ===== + + @property + def schemas(self): + return self._schemas + + @property + def fields(self): + return self._fields + + @property + def versions(self): + return self._versions + + @property + def imported_schemas(self): + return self._imported['schema_imported'] + + @property + def existed_schemas(self): + return self._imported['schema_existed'] + + @property + def imported_fields(self): + return self._imported['field_imported'] + + @property + def existed_fields(self): + return self._imported['field_existed'] + + @time_method + def import_to(self, dspace): + self._import_schema(dspace) + self._import_fields(dspace) + + # ============= + + def schema_id(self, internal_id: int): + return self._schemas_id2id.get(str(internal_id), None) + + # ============= + + def serialize(self, file_str: str): + data = { + "schemas_id2id": self._schemas_id2id, + "fields_id2v7id": self._fields_id2v7id, + "imported": self._imported, + "v5_fields_name2id": self._v5_fields_name2id, + "v7_fields_name2id": self._v7_fields_name2id, + } + serialize(file_str, data) + + def deserialize(self, file_str: str): + data = deserialize(file_str) + self._schemas_id2id = data["schemas_id2id"] + self._fields_id2v7id = data["fields_id2v7id"] + self._imported = data["imported"] + self._v5_fields_name2id = data["v5_fields_name2id"] + self._v7_fields_name2id = data["v7_fields_name2id"] + + # ============= + + @staticmethod + def _fix_local_sponsor(wrong_sequence_str): + """ + Replace `local.sponsor` data sequence + from `;;;;` + to `;;;;` + """ + sep = ';' + # sponsor list could have length 4 or 5 + sponsor_list = wrong_sequence_str.split(sep) + org, p_code, p_name, p_type = sponsor_list[0:4] + eu_id = '' if len(sponsor_list) < 5 else sponsor_list[4] + # compose the `local.sponsor` sequence in the right way + return sep.join([p_type, p_code, org, p_name, eu_id]) + + def _import_schema(self, dspace): + """ + Import data into database. 
+ Mapped tables: metadataschemaregistry + """ + expected = len(self._schemas) + log_key = "metadata schemas" + log_before_import(log_key, expected) + + # get all existing data from database table + existed_schemas = dspace.fetch_metadata_schemas() or [] + + def find_existing_with_ns(short_id: str, ns: str): + return next((e for e in existed_schemas if e['prefix'] == short_id and e['namespace'] == ns), None) + + def find_existing_prefix(short_id: str): + return next((e for e in existed_schemas if e['prefix'] == short_id), None) + + for schema in progress_bar(self._schemas): + meta_id = schema['metadata_schema_id'] + + # exists in the database + existing = find_existing_with_ns(schema['short_id'], schema['namespace']) + if existing is not None: + _logger.debug( + f'Metadataschemaregistry prefix: {schema["short_id"]} already exists!') + self._imported["schema_existed"] += 1 + self._schemas_id2id[str(meta_id)] = existing['id'] + continue + + # only prefix exists, but there is unique constraint on prefix in the databse + existing = find_existing_prefix(schema['short_id']) + if existing is not None: + _logger.warning( + f'Metadata_schema short_id {schema["short_id"]} ' + f'exists in database with different namespace: {existing["namespace"]}.') + self._imported["schema_existed"] += 1 + self._schemas_id2id[str(meta_id)] = existing['id'] + continue + + data = { + 'namespace': schema['namespace'], + 'prefix': schema['short_id'] + } + try: + resp = dspace.put_metadata_schema(data) + self._schemas_id2id[str(meta_id)] = resp['id'] + self._imported["schema_imported"] += 1 + except Exception as e: + _logger.error( + f'put_metadata_schema [{meta_id}] failed. Exception: {str(e)}') + + log_after_import(f'{log_key} [existed:{self.existed_schemas}]', + expected, self.imported_schemas + self.existed_schemas) + + def _import_fields(self, dspace): + """ + Import data into database. + Mapped tables: metadatafieldregistry + """ + expected = len(self._fields) + log_key = "metadata fields" + log_before_import(log_key, expected) + + existed_fields = dspace.fetch_metadata_fields() or [] + + def find_existing(field): + schema_id = field['metadata_schema_id'] + sch_id = self.schema_id(schema_id) + if sch_id is None: + return None + for e in existed_fields: + if e['_embedded']['schema']['id'] != sch_id or \ + e['element'] != field['element'] or \ + e['qualifier'] != field['qualifier']: + continue + return e + return None + + existing_arr = [] + for field in progress_bar(self._fields): + field_id = field["metadata_field_id"] + schema_id = field['metadata_schema_id'] + e = field['element'] + q = field['qualifier'] + + existing = find_existing(field) + if existing is not None: + _logger.debug(f'Metadatafield: {e}.{q} already exists!') + existing_arr.append(field) + ext_field_id = existing['id'] + self._imported["field_existed"] += 1 + else: + data = { + 'element': field['element'], + 'qualifier': field['qualifier'], + 'scopeNote': field['scope_note'] + } + params = {'schemaId': self.schema_id(schema_id)} + try: + resp = dspace.put_metadata_field(data, params) + ext_field_id = resp['id'] + self._imported["field_imported"] += 1 + except Exception as e: + _logger.error( + f'put_metadata_field [{str(field_id)}] failed. Exception: {str(e)}') + continue + + self._fields_id2v7id[str(field_id)] = ext_field_id + + log_after_import(f'{log_key} [existing:{self.existed_fields}]', + expected, self.imported_fields + self.existed_fields) + + def _get_key_v1(self, val): + """ + Using dspace backend. 
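        # The key built here has the form "<schema prefix>.<element>[.<qualifier>]",
        # e.g. "dc.title" (no qualifier) or "dc.identifier.uri" (examples only;
        # actual prefixes come from the schema registry fetched over REST).
        # `_get_key_v2` below derives the same key purely from the exported v5
        # JSON registries, avoiding the extra REST round-trips.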
+ """ + int_meta_field_id = val['metadata_field_id'] + try: + ext_meta_field_id = self.get_field_id(int_meta_field_id) + field_js = self._dspace.fetch_metadata_field(ext_meta_field_id) + if field_js is None: + return None + except Exception as e: + _logger.error(f'fetch_metadata_field request failed. Exception: [{str(e)}]') + return None + + # get metadataschema + try: + obj_id = field_js['_embedded']['schema']['id'] + schema_js = self._dspace.fetch_schema(obj_id) + if schema_js is None: + return None + except Exception as e: + _logger.error(f'fetch_schema request failed. Exception: [{str(e)}]') + return None + + # define and insert key and value of dict + key = schema_js['prefix'] + '.' + field_js['element'] + if field_js['qualifier']: + key += '.' + field_js['qualifier'] + return key + + def _get_key_v2(self, val): + """ + Using data. + """ + int_meta_field_id = val['metadata_field_id'] + field_js = self._fields_id2js.get(int_meta_field_id, None) + if field_js is None: + return None + # get metadataschema + schema_id = field_js["metadata_schema_id"] + schema_js = self._schemas_id2js.get(schema_id, None) + if schema_js is None: + return None + # define and insert key and value of dict + key = schema_js['short_id'] + '.' + field_js['element'] + if field_js['qualifier']: + key += '.' + field_js['qualifier'] + return key + + def value(self, res_type_id: int, res_id: int, text_for_field_id: int = None, log_missing: bool = True): + """ + Get metadata value for dspace object. + """ + res_type_id = str(res_type_id) + res_id = str(res_id) + log_miss = _logger.info if log_missing else _logger.debug + + if res_type_id not in self._values: + log_miss(f'Metadata missing [{res_type_id}] type') + return None + tp_values = self._values[res_type_id] + if res_id not in tp_values: + log_miss(f'Metadata for [{res_id}] are missing in [{res_type_id}] type') + return None + + vals = tp_values[res_id] + + vals = [x for x in vals if self.exists_field(x['metadata_field_id'])] + if len(vals) == 0: + return {} + + # special case - return only text_value + if text_for_field_id is not None: + vals = [x['text_value'] + for x in vals if x['metadata_field_id'] == text_for_field_id] + return vals + + res_d = {} + # create list of object metadata + for val in vals: + # key = self._get_key_v1(val) + key = self._get_key_v2(val) + + # if key != key2: + # _logger.critical(f"Incorrect v2 impl.") + + d = { + 'value': val['text_value'], + 'language': val['text_lang'], + 'authority': val['authority'], + 'confidence': val['confidence'], + 'place': val['place'] + } + res_d.setdefault(key, []).append(d) + + return res_d + + def exists_field(self, id: int) -> bool: + return str(id) in self._fields_id2v7id + + def get_field_id(self, id: int) -> int: + return self._fields_id2v7id[str(id)] diff --git a/src/pump/_registrationdata.py b/src/pump/_registrationdata.py new file mode 100644 index 0000000..7e656f6 --- /dev/null +++ b/src/pump/_registrationdata.py @@ -0,0 +1,73 @@ +import logging +from ._utils import read_json, time_method, serialize, deserialize, progress_bar, log_before_import, log_after_import + +_logger = logging.getLogger("pump.registrationdata") + + +class registrationdatas: + """ + SQL: + delete from registrationdata ; + """ + validate_table = [ + ["registrationdata", { + # do not use compare because of email field (GDPR) + "nonnull": ["email"], + }], + ] + + def __init__(self, col_rd_str: str): + self._rd = read_json(col_rd_str) + self._imported = { + "rd": 0, + "missing_email": 0, + } + + if len(self._rd) == 0: + 
_logger.info(f"Empty input: [{col_rd_str}].") + return + + def __len__(self): + return len(self._rd) + + @property + def imported(self): + return self._imported['rd'] + + @time_method + def import_to(self, dspace): + expected = len(self) + log_key = "registrationdata" + log_before_import(log_key, expected) + + for rd in progress_bar(self._rd): + email = rd['email'] + if email == '': + _logger.debug(f"Registration data [{rd}] ignored because of empty email.") + self._imported["missing_email"] += 1 + continue + data = {'email': email} + params = {'accountRequestType': 'register'} + try: + resp = dspace.put_registrationdata(params, data) + self._imported["rd"] += 1 + except Exception as e: + _logger.error( + f'put_registrationdata [{rd["email"]}]: failed. Exception: [{str(e)}]') + + log_after_import(f'{log_key} missing_email:[{self._imported["missing_email"]}]', + expected, self.imported + self._imported["missing_email"]) + + # ============= + + def serialize(self, file_str: str): + data = { + "rd": self._rd, + "imported": self._imported, + } + serialize(file_str, data) + + def deserialize(self, file_str: str): + data = deserialize(file_str) + self._rd = data["rd"] + self._imported = data["imported"] diff --git a/src/pump/_repo.py b/src/pump/_repo.py new file mode 100644 index 0000000..4c35b57 --- /dev/null +++ b/src/pump/_repo.py @@ -0,0 +1,281 @@ +import logging +import os + +from ._utils import time_method + +from ._handle import handles +from ._metadata import metadatas + +from ._group import groups +from ._community import communities +from ._collection import collections +from ._registrationdata import registrationdatas +from ._eperson import epersons +from ._eperson import groups as eperson_groups +from ._userregistration import userregistrations +from ._bitstreamformatregistry import bitstreamformatregistry +from ._license import licenses +from ._item import items +from ._tasklistitem import tasklistitems +from ._bundle import bundles +from ._bitstream import bitstreams +from ._resourcepolicy import resourcepolicies +from ._usermetadata import usermetadatas +from ._db import db +from ._sequences import sequences + +_logger = logging.getLogger("pump.repo") + + +class repo: + @time_method + def __init__(self, env: dict, dspace): + def _f(name): return os.path.join(env["input"]["datadir"], name) + + # load groups + self.groups = groups( + _f("epersongroup.json"), + _f("group2group.json"), + ) + self.groups.from_rest(dspace) + + # load handles + self.handles = handles(_f("handle.json")) + + # load metadata + self.metadatas = metadatas( + env, + dspace, + _f("metadatavalue.json"), + _f("metadatafieldregistry.json"), + _f("metadataschemaregistry.json"), + ) + + # load community + self.communities = communities( + _f("community.json"), + _f("community2community.json"), + ) + + self.collections = collections( + _f("collection.json"), + _f("community2collection.json"), + _f("metadatavalue.json"), + ) + + self.registrationdatas = registrationdatas( + _f("registrationdata.json") + ) + + self.epersons = epersons( + _f("eperson.json") + ) + + self.egroups = eperson_groups( + _f("epersongroup2eperson.json") + ) + + self.userregistrations = userregistrations( + _f("user_registration.json") + ) + + self.bitstreamformatregistry = bitstreamformatregistry( + _f("bitstreamformatregistry.json") + ) + + self.licenses = licenses( + _f("license_label.json"), + _f("license_definition.json"), + _f("license_label_extended_mapping.json"), + ) + + self.items = items( + _f("item.json"), + _f("workspaceitem.json"), 
+ _f("workflowitem.json"), + _f("collection2item.json"), + ) + + self.tasklistitems = tasklistitems( + _f("tasklistitem.json") + ) + + self.bundles = bundles( + _f("bundle.json"), + _f("item2bundle.json"), + ) + + self.bitstreams = bitstreams( + _f("bitstream.json"), + _f("bundle2bitstream.json"), + ) + + self.usermetadatas = usermetadatas( + _f("user_metadata.json"), + _f("license_resource_user_allowance.json"), + _f("license_resource_mapping.json") + ) + + self.resourcepolicies = resourcepolicies( + _f("resourcepolicy.json") + ) + + self.raw_db_7 = db(env["db_dspace_7"]) + self.raw_db_dspace_5 = db(env["db_dspace_5"]) + self.raw_db_utilities_5 = db(env["db_utilities_5"]) + + self.sequences = sequences() + + def _fetch_all_vals(self, db5, table_name: str, sql: str = None): + sql = f"SELECT * FROM {table_name}" + cols5 = [] + db5 = db5 or self.raw_db_dspace_5 + vals5 = db5.fetch_all(sql, col_names=cols5) + cols7 = [] + vals7 = self.raw_db_7.fetch_all(sql, col_names=cols7) + return cols5, vals5, cols7, vals7 + + def _filter_vals(self, vals, col_names, only_names): + idxs = [col_names.index(x) for x in only_names] + filtered = [] + for row in vals: + filtered.append([row[idx] for idx in idxs]) + return filtered + + def _cmp_values(self, table_name: str, vals5, only_in_5, vals7, only_in_7, do_not_show: bool): + too_many_5 = "" + too_many_7 = "" + LIMIT = 5 + if len(only_in_5) > LIMIT: + too_many_5 = f"!!! TOO MANY [{len(only_in_5)}] " + if len(only_in_7) > LIMIT: + too_many_7 = f"!!! TOO MANY [{len(only_in_7)}] " + + do_not_show = do_not_show or "CI" in os.environ or "GITHUB_ACTION" in os.environ + # assume we do not have emails that we do not want to show in db7 + if do_not_show: + only_in_5 = [x if "@" not in x else "....." for x in only_in_5] + only_in_7 = [x if "@" not in x else "....." for x in only_in_7] + + _logger.info(f"Table [{table_name}]: v5:[{len(vals5)}], v7:[{len(vals7)}]\n" + f" {too_many_5}only in v5:[{only_in_5[:LIMIT]}]\n" + f" {too_many_7}only in v7:[{only_in_7[:LIMIT]}]") + + def diff_table_cmp_cols(self, db5, table_name: str, compare_arr: list, gdpr: bool = True): + cols5, vals5, cols7, vals7 = self._fetch_all_vals(db5, table_name) + do_not_show = gdpr and "email" in compare_arr + + filtered5 = self._filter_vals(vals5, cols5, compare_arr) + vals5_cmp = ["|".join(str(x) for x in x) for x in filtered5] + filtered7 = self._filter_vals(vals7, cols7, compare_arr) + vals7_cmp = ["|".join(str(x) for x in x) for x in filtered7] + + only_in_5 = list(set(vals5_cmp).difference(vals7_cmp)) + only_in_7 = list(set(vals7_cmp).difference(vals5_cmp)) + if len(only_in_5) + len(only_in_7) == 0: + _logger.info(f"Table [{table_name: >20}] is THE SAME in v5 and v7!") + return + self._cmp_values(table_name, vals5, only_in_5, vals7, only_in_7, do_not_show) + + def diff_table_cmp_len(self, db5, table_name: str, nonnull: list = None, gdpr: bool = True): + nonnull = nonnull or [] + cols5, vals5, cols7, vals7 = self._fetch_all_vals(db5, table_name) + do_not_show = gdpr and "email" in nonnull + + msg = " OK " if len(vals5) == len(vals7) else " !!! WARN !!! " + _logger.info( + f"Table [{table_name: >20}] {msg} compared by len only v5:[{len(vals5)}], v7:[{len(vals7)}]") + + for col_name in nonnull: + vals5_cmp = [x for x in self._filter_vals( + vals5, cols5, [col_name]) if x[0] is not None] + vals7_cmp = [x for x in self._filter_vals( + vals7, cols7, [col_name]) if x[0] is not None] + + msg = " OK " if len(vals5_cmp) == len(vals7_cmp) else " !!! WARN !!! 
" + _logger.info( + f"Table [{table_name: >20}] {msg} NON NULL [{col_name:>15}] v5:[{len(vals5_cmp):3}], v7:[{len(vals7_cmp):3}]") + + def diff_table_sql(self, db5, table_name: str, sql5, sql7, compare, process_ftor): + cols5 = [] + vals5 = db5.fetch_all(sql5, col_names=cols5) + cols7 = [] + vals7 = self.raw_db_7.fetch_all(sql7, col_names=cols7) + # special case where we have different names of columns but only one column to compare + if compare == 0: + vals5_cmp = [x[0] for x in vals5 if x[0] is not None] + vals7_cmp = [x[0] for x in vals7 if x[0] is not None] + elif compare is None: + vals5_cmp = vals5 + vals7_cmp = vals7 + else: + vals5_cmp = [x[0] for x in self._filter_vals( + vals5, cols5, [compare]) if x[0] is not None] + vals7_cmp = [x[0] for x in self._filter_vals( + vals7, cols7, [compare]) if x[0] is not None] + + if process_ftor is not None: + vals5_cmp, vals7_cmp = process_ftor(self, vals5_cmp, vals7_cmp) + + only_in_5 = list(set(vals5_cmp).difference(vals7_cmp)) + only_in_7 = list(set(vals7_cmp).difference(vals5_cmp)) + self._cmp_values(table_name, vals5, only_in_5, vals7, only_in_7, False) + + def diff(self, to_validate=None): + if to_validate is None: + to_validate = [ + getattr(getattr(self, x), "validate_table") + for x in dir(self) if hasattr(getattr(self, x), "validate_table") + ] + else: + if not hasattr(to_validate, "validate_table"): + _logger.warning(f"Missing validate_table in {to_validate}") + return + to_validate = [to_validate.validate_table] + + for valid_defs in to_validate: + for table_name, defin in valid_defs: + _logger.info("=" * 10 + f" Validating {table_name} " + "=" * 10) + db5_name = defin.get("db", "clarin-dspace") + db5 = self.raw_db_dspace_5 if db5_name == "clarin-dspace" else self.raw_db_utilities_5 + + cmp = defin.get("compare", None) + if cmp is not None: + self.diff_table_cmp_cols(db5, table_name, cmp) + + cmp = defin.get("nonnull", None) + if cmp is not None: + self.diff_table_cmp_len(db5, table_name, cmp) + + # compare only len + if len(defin) == 0: + self.diff_table_cmp_len(db5, table_name) + + cmp = defin.get("sql", None) + if cmp is not None: + self.diff_table_sql( + db5, table_name, cmp["5"], cmp["7"], cmp["compare"], cmp.get("process", None)) + + # ===== + def uuid(self, res_type_id: int, res_id: int): + # find object id based on its type + try: + if res_type_id == self.communities.TYPE: + return self.communities.uuid(res_id) + if res_type_id == self.collections.TYPE: + return self.collections.uuid(res_id) + if res_type_id == self.items.TYPE: + return self.items.uuid(res_id) + if res_type_id == self.bitstreams.TYPE: + return self.bitstreams.uuid(res_id) + if res_type_id == self.bundles.TYPE: + return self.bundles.uuid(res_id) + if res_type_id == self.epersons.TYPE: + return self.epersons.uuid(res_id) + if res_type_id == self.groups.TYPE: + arr = self.groups.uuid(res_id) + if len(arr or []) > 0: + return arr[0] + except Exception as e: + return None + return None diff --git a/src/pump/_resourcepolicy.py b/src/pump/_resourcepolicy.py new file mode 100644 index 0000000..b80751c --- /dev/null +++ b/src/pump/_resourcepolicy.py @@ -0,0 +1,133 @@ +import logging +from ._utils import read_json, time_method, serialize, deserialize, progress_bar, log_before_import, log_after_import + +_logger = logging.getLogger("pump.resourcepolicy") + + +class resourcepolicies: + """ + SQL: + delete from resourcepolicy ; + """ + + def __init__(self, resourcepolicy_file_str: str): + self._respol = read_json(resourcepolicy_file_str) + if len(self._respol) == 0: + 
_logger.info(f"Empty input: [{resourcepolicy_file_str}].") + self._id2uuid = {} + self._imported = { + "respol": 0, + } + + def __len__(self): + return len(self._respol) + + def uuid(self, b_id: int): + assert isinstance(list(self._id2uuid.keys() or [""])[0], str) + return self._id2uuid[str(b_id)] + + @property + def imported(self): + return self._imported['respol'] + + @time_method + def import_to(self, env, dspace, repo): + expected = len(self) + log_key = "resourcepolicies" + log_before_import(log_key, expected) + + dspace_actions = env["dspace"]["actions"] + failed = 0 + + for res_policy in progress_bar(self._respol): + res_id = res_policy['resource_id'] + res_type_id = res_policy['resource_type_id'] + # If resourcepolicy belongs to some Item or Bundle, check if that Item/Bundle wasn't removed from the table. + # Somehow, the resourcepolicy table could still have a reference to deleted items/bundles. + if res_type_id in [repo.items.TYPE, repo.bundles.TYPE]: + if repo.uuid(res_type_id, res_id) is None: + _logger.info( + f"Cannot import resource policy [{res_id}] for the record with type [{res_type_id}] that has already been deleted.") + continue + + res_uuid = repo.uuid(res_type_id, res_id) + if res_uuid is None: + _logger.critical( + f"Cannot find uuid for [{res_type_id}] [{res_id}] [{str(res_policy)}]") + continue + params = {} + if res_uuid is not None: + params['resource'] = res_uuid + # in resource there is action as id, but we need action as text + actionId = res_policy['action_id'] + + # control, if action is entered correctly + if actionId < 0 or actionId >= len(dspace_actions): + _logger.error(f"action_id [{actionId}] is out of range.") + failed += 1 + continue + + # create object for request + data = { + 'action': dspace_actions[actionId], + 'startDate': res_policy['start_date'], + 'endDate': res_policy['end_date'], + 'name': res_policy['rpname'], + 'policyType': res_policy['rptype'], + 'description': res_policy['rpdescription'] + } + + # resource policy has defined eperson or group, not the both + # get eperson if it is not none + if res_policy['eperson_id'] is not None: + params['eperson'] = repo.epersons.uuid(res_policy['eperson_id']) + try: + resp = dspace.put_resourcepolicy(params, data) + self._imported["respol"] += 1 + except Exception as e: + _logger.error( + f'put_resourcepolicy: [{res_policy["policy_id"]}] failed [{str(e)}]') + continue + + # get group if it is not none + eg_id = res_policy['epersongroup_id'] + if eg_id is not None: + group_list1 = repo.groups.uuid(eg_id) + group_list2 = repo.collections.group_uuid(eg_id) + group_list = set(group_list1 + group_list2) + if len(group_list) == 0: + continue + imported_groups = 0 + for group in group_list: + params['group'] = group + try: + resp = dspace.put_resourcepolicy(params, data) + imported_groups += 1 + except Exception as e: + _logger.error( + f'put_resourcepolicy: [{res_policy["policy_id"]}] failed [{str(e)}]') + if imported_groups > 0: + self._imported["respol"] += 1 + continue + + _logger.error(f"Cannot import resource policy {res_policy['policy_id']} " + f"because neither eperson nor group is defined") + failed += 1 + + log_after_import(f"{log_key}, failed:[{failed}]", expected, self.imported) + + # ============= + + def serialize(self, file_str: str): + data = { + "respol": self._respol, + "id2uuid": self._id2uuid, + "imported": self._imported, + } + serialize(file_str, data) + + def deserialize(self, file_str: str): + data = deserialize(file_str) + self._respol = data["respol"] + self._id2uuid = 
data["id2uuid"] + self._imported = data["imported"] diff --git a/src/pump/_sequences.py b/src/pump/_sequences.py new file mode 100644 index 0000000..568db52 --- /dev/null +++ b/src/pump/_sequences.py @@ -0,0 +1,56 @@ +import logging + +_logger = logging.getLogger("pump.item") + + +class sequences: + def __init__(self): + pass + + def migrate(self, env, db7, db5_dspace, db5_utilities): + """ + Migrate sequences from clarin 5 database to clarin 7 database. + """ + _logger.info("Sequence migration started.") + + # get all sequences from clarin-dspace database + dspace5_seqs = db5_dspace.fetch_all("SELECT * FROM information_schema.sequences") + + key_db_idx = 0 + key_name_idx = 2 + + # Do not import `clarin-utilities` sequences because of this issue: + # https://github.com/dataquest-dev/dspace-python-api/issues/114 + # utilities5_seq = db5_utilities.fetchall("SELECT * FROM information_schema.sequences") + + db7_seqs = db7.fetch_all("SELECT * FROM information_schema.sequences") + db7_seqs_names = [seq[key_name_idx] for seq in db7_seqs] + + # check if all sequences from clarin 5 are already present in clarin 7 + for dspace5_seq in dspace5_seqs: + + dspace5_seq_db = dspace5_seq[key_db_idx] + dspace5_seq_name = dspace5_seq[key_name_idx] + + if dspace5_seq_name not in db7_seqs_names: + continue + + # use cursor according to database to which sequence belongs + if dspace5_seq_db == "clarin-dspace": + db = db5_dspace + else: + db = db5_utilities + + # get current value of given sequence + seq_val = db.fetch_one(f"SELECT last_value FROM {dspace5_seq_name}") + + # set value of the sequence in clarin 7 dspace database + db7.exe_sql(f"SELECT setval('{dspace5_seq_name}', {seq_val})") + + # check value of the sequence in clarin7 database + db7_seq_val = db7.fetch_one(f"SELECT last_value FROM {dspace5_seq_name}") + if seq_val != db7_seq_val: + _logger.error( + f"{dspace5_seq_name} --> [{seq_val}] does not match expected [{db7_seq_val}].") + + _logger.info("Sequence migration is complete.") diff --git a/src/pump/_tasklistitem.py b/src/pump/_tasklistitem.py new file mode 100644 index 0000000..df07e78 --- /dev/null +++ b/src/pump/_tasklistitem.py @@ -0,0 +1,62 @@ +import logging +from ._utils import read_json, time_method, serialize, deserialize, progress_bar, log_before_import, log_after_import + +_logger = logging.getLogger("pump.tasklistitem") + + +class tasklistitems: + """ + Mapped table: tasklistitem + + SQL: + """ + + def __init__(self, tl_file_str: str): + self._tasks = read_json(tl_file_str) or {} + self._imported = { + "tasks": 0, + } + + if len(self._tasks) == 0: + _logger.info(f"Empty input: [{tl_file_str}].") + return + + def __len__(self): + return len(self._tasks) + + @property + def imported(self): + return self._imported['tasks'] + + @time_method + def import_to(self, dspace, epersons, items): + expected = len(self) + log_key = "tasks" + log_before_import(log_key, expected) + + for task in progress_bar(self._tasks): + try: + params = { + 'epersonUUID': epersons.uuid(task['eperson_id']), + 'workflowitem_id': items.wf_id(task['workflow_id']) + } + resp = dspace.put_tasklistitem(params) + self._imported["task"] += 1 + except Exception as e: + _logger.error(f'put_tasklistitem: [{task}] failed [{str(e)}]') + + log_after_import(log_key, expected, self.imported) + + # ============= + + def serialize(self, file_str: str): + data = { + "tasks": self._tasks, + "imported": self._imported, + } + serialize(file_str, data) + + def deserialize(self, file_str: str): + data = deserialize(file_str) + 
self._tasks = data["tasks"] + self._imported = data["imported"] diff --git a/src/pump/_usermetadata.py b/src/pump/_usermetadata.py new file mode 100644 index 0000000..1eb5170 --- /dev/null +++ b/src/pump/_usermetadata.py @@ -0,0 +1,118 @@ +import logging +from ._utils import read_json, time_method, serialize, deserialize, progress_bar, log_before_import, log_after_import + +_logger = logging.getLogger("pump.usermetadata") + + +class usermetadatas: + """ + SQL: + Mapped tables: user_metadata, license_resource_user_allowance + """ + + def __init__(self, usermetadata_file_str: str, userallowance_file_str: str, resourcemapping_file_str: str): + self._umeta = read_json(usermetadata_file_str) + self._uallowance = read_json(userallowance_file_str) + self._rmap = read_json(resourcemapping_file_str) + self._id2uuid = {} + self._imported = { + "um": 0, + } + + if len(self._umeta) == 0: + _logger.info(f"Empty input: [{usermetadata_file_str}].") + + if len(self._uallowance) == 0: + _logger.info(f"Empty input: [{userallowance_file_str}].") + + if len(self._rmap) == 0: + _logger.info(f"Empty input: [{resourcemapping_file_str}].") + + # mapping transaction_id to mapping_id + self._uallowance_transid2d = {ua['transaction_id']: ua for ua in self._uallowance} + # mapping bitstream_id to mapping_id + self._rmap_id2bsid = {m["mapping_id"]: m["bitstream_id"] for m in self._rmap} + + # Group user metadata by `transaction_id`. The endpoint must receive list of all metadata with the same + # transaction_id` because if the endpoint will be called for every `user_metadata` there will be a huge amount + # of `license_resource_user_allowance` records with not correct mapping with the `user_metadata` table. + self._umeta_transid2ums = {} + for um in self._umeta: + t_id = um['transaction_id'] + if t_id not in self._uallowance_transid2d: + continue + self._umeta_transid2ums.setdefault(t_id, []).append(um) + + def __len__(self): + return len(self._umeta) + + def uuid(self, b_id: int): + assert isinstance(list(self._id2uuid.keys() or [""])[0], str) + return self._id2uuid[str(b_id)] + + @property + def imported(self): + return self._imported['um'] + + @time_method + def import_to(self, dspace, bitstreams, userregistrations): + expected = len(self._umeta_transid2ums) + log_key = "usermetadata" + log_before_import(log_key, expected) + + # Go through dict and import user_metadata + for t_id, um_arr in progress_bar(self._umeta_transid2ums.items()): + um0 = um_arr[0] + # Get user_registration data for importing + ua_d = self._uallowance_transid2d[um0['transaction_id']] + # Get `eperson_id` for importing + eperson_id = um0['eperson_id'] + map_id = ua_d['mapping_id'] + + # Prepare user_metadata list for request + data = [{'metadataKey': um['metadata_key'], + 'metadataValue': um['metadata_value'] + } for um in um_arr] + + try: + bs_id = self._rmap_id2bsid[map_id] + bs_uuid = bitstreams.uuid(bs_id) + if bs_uuid is None: + _logger.info( + f"Cannot import user metadata for mapping_id->bsid: [{map_id}]->[{bs_id}] because the bitstream has probably already been deleted.") + continue + userreg_id = userregistrations.uuid(eperson_id) + + # Prepare params for the import endpoint + params = { + 'bitstreamUUID': bs_uuid, + 'createdOn': ua_d['created_on'], + 'token': ua_d['token'], + 'userRegistrationId': userreg_id + } + resp = dspace.put_usermetadata(params, data) + self._imported['um'] += 1 + except Exception as e: + _logger.error(f'put_usermetadata: [{t_id}] failed [{str(e)}]') + + log_after_import(log_key, expected, self.imported) + + # 
============= + + def serialize(self, file_str: str): + data = { + "umeta": self._umeta, + "uallowance": self._uallowance, + "rmap": self._rmap, + "id2uuid": self._id2uuid, + "imported": self._imported, + } + serialize(file_str, data) + + def deserialize(self, file_str: str): + data = deserialize(file_str) + self._umeta = data["umeta"] + self._uallowance = data["uallowance"] + self._rmap = data["rmap"] + self._id2uuid = data["id2uuid"] + self._imported = data["imported"] diff --git a/src/pump/_userregistration.py b/src/pump/_userregistration.py new file mode 100644 index 0000000..c5f5605 --- /dev/null +++ b/src/pump/_userregistration.py @@ -0,0 +1,81 @@ +import logging +from ._utils import read_json, time_method, serialize, deserialize, progress_bar, log_before_import, log_after_import + +_logger = logging.getLogger("pump.userregistration") + + +class userregistrations: + validate_table = [ + # ["userregistration", { + # # do not use compare because of email field (GDPR) + # "compare": ["email", "netid"], + # }], + ] + + def __init__(self, ur_file_str: str): + self._ur = read_json(ur_file_str) + self._imported = { + "users": 0, + } + + self._id2uuid = {} + + if len(self._ur) == 0: + _logger.info(f"Empty input: [{ur_file_str}].") + return + + def __len__(self): + return len(self._ur) + + def uuid(self, e_id: int): + assert isinstance(list(self._id2uuid.keys() or [""])[0], str) + return self._id2uuid[str(e_id)] + + @property + def imported(self): + return self._imported['users'] + + @time_method + def import_to(self, dspace, epersons): + """ + Import data into database. + Mapped tables: user_registration + """ + expected = len(self) + log_key = "userregistration" + log_before_import(log_key, expected) + + for ur in progress_bar(self._ur): + data = { + 'email': ur['email'], + 'organization': ur['organization'], + 'confirmation': ur['confirmation'] + } + e_id = ur['eperson_id'] + e_id_by_email = epersons.by_email(ur['email']) + data['ePersonID'] = epersons.uuid( + e_id_by_email) if e_id_by_email is not None else None + try: + resp = dspace.put_userregistration(data) + self._id2uuid[str(e_id)] = resp['id'] + self._imported['users'] += 1 + except Exception as e: + _logger.error(f'put_userregistration: [{e_id}] failed [{str(e)}]') + + log_after_import(log_key, expected, self.imported) + + # ============= + + def serialize(self, file_str: str): + data = { + "ur": self._ur, + "id2uuid": self._id2uuid, + "imported": self._imported, + } + serialize(file_str, data) + + def deserialize(self, file_str: str): + data = deserialize(file_str) + self._ur = data["ur"] + self._id2uuid = data["id2uuid"] + self._imported = data["imported"] diff --git a/src/pump/_utils.py b/src/pump/_utils.py new file mode 100644 index 0000000..3709d1c --- /dev/null +++ b/src/pump/_utils.py @@ -0,0 +1,128 @@ +import json +import os +import logging +from datetime import datetime, timezone +from time import time as time_fnc +_logger = logging.getLogger("pump.utils") + + +def read_json(file_name: str): + """ + Read data from file as json. + @param file_name: file name + @return: data as json + """ + if not os.path.exists(file_name): + raise FileNotFoundError(f"File [{file_name}] does not exist.") + with open(file_name, mode='r', encoding='utf-8') as f: + return json.load(f) + + +def to_dict(arr: list): + return {int(k): v for k, v in enumerate(arr)} + + +def ts() -> str: + return str(datetime.now(timezone.utc)) + + +def time_method(func): + """ + Timer decorator will store execution time of a function into + the class which it uses. 
The time will be stored in + instance.timed + """ + + def _enclose(self, *args, **kw): + """ Enclose every function with this one. """ + start = time_fnc() + res = func(self, *args, **kw) + took = time_fnc() - start + if took > 10.: + _logger.info(f"Method [{func.__name__}] took [{round(took, 2)}] seconds.") + return res + + return _enclose + + +def time_function(d): + """ + Timer decorator will store execution time of a function into + the specified dict to timed entry. + """ + + def wrap(func): + """ Simple wrapper around time decorator. """ + + def _enclose(*args, **kw): + """ Simple wrapper around wrap :). """ + start = time_fnc() + res = func(*args, **kw) + took = time_fnc() - start + _logger.debug(f"Function [{func.__name__}] took [{round(took, 2)}] seconds.") + return res + + return _enclose + + return wrap + + +def serial_d(data): + """ + Unify serialisation data. + """ + return { + "data": data, + "timestamp": ts(), + } + + +def serialize(file_str: str, data, sorted=True): + """ + Serialize data into json file. + """ + if isinstance(data, dict): + for v in data.values(): + if isinstance(v, dict): + keys = list(v.keys()) + if len(keys) > 0: + if isinstance(keys[0], int): + _logger.critical(f"Serializing dictionary with integer keys [{file_str}] !!!") + + os.makedirs(os.path.dirname(file_str), exist_ok=True) + with open(file_str, encoding="utf-8", mode="w") as fout: + json.dump(serial_d(data), fout, indent=None, sort_keys=sorted) + + +def deserialize(file_str: str): + with open(file_str, encoding="utf-8", mode="r") as fin: + js = json.load(fin) + return js["data"] + + +IMPORT_LIMIT = None +if os.environ.get("IMPORT_LIMIT", "0") != "0": + IMPORT_LIMIT = int(os.environ["IMPORT_LIMIT"]) + _logger.critical(f"Using import limit [{IMPORT_LIMIT}]") + + +def progress_bar(arr): + if len(arr) < 2: + return iter(arr) + try: + from tqdm import tqdm + except Exception as e: + return iter(arr) + + mininterval = 5 if len(arr) < 500 else 10 + return tqdm(arr, mininterval=mininterval, maxinterval=2 * mininterval) + + +def log_before_import(msg: str, expected: int): + _logger.info("=====") + _logger.info(f"Importing [{expected: >4d}] {msg}") + + +def log_after_import(msg: str, expected: int, imported: int): + prefix = "OK " if expected == imported else "!!! WARN !!! " + _logger.info(f"{prefix}Imported [{imported: >4d}] {msg}") diff --git a/src/repo_import.py b/src/repo_import.py new file mode 100644 index 0000000..9f7339f --- /dev/null +++ b/src/repo_import.py @@ -0,0 +1,312 @@ +import sys +import time +import os +import argparse +import logging + +import settings +import project_settings +from utils import init_logging, update_settings, exists_key, set_key + +_logger = logging.getLogger() + +# env settings, update with project_settings +env = update_settings(settings.env, project_settings.settings) +init_logging(_logger, env["log_file"]) + +import dspace # noqa +import pump # noqa + + +def verify_disabled_mailserver(): + """ + Is the email server really off? + """ + email_s_off = input("Please make sure your email server is turned off. " + "Otherwise unbearable amount of emails will be sent. " + "Is your EMAIL SERVER really OFF? (Y/N)") + if email_s_off.lower() not in ("y", "yes"): + _logger.critical("The email server is not off.") + sys.exit() + + +def deserialize(resume: bool, obj, cache_file: str) -> bool: + """ + If cache file exists, deserialize it and return True. 
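+ Returns False when resume is disabled or the cache file is missing, in which case the caller runs a full import and then serializes a fresh cache.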
+ """ + if not resume: + return False + + if not os.path.exists(cache_file): + return False + obj.deserialize(cache_file) + return True + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='Import data from previous version to current DSpace') + parser.add_argument('--resume', + help='Resume by loading values into dictionary', + required=False, type=bool, default=True) + parser.add_argument('--config', + help='Update configs', + required=False, type=str, action='append') + + args = parser.parse_args() + s = time.time() + + for k, v in [x.split("=") for x in (args.config or [])]: + _logger.info(f"Updating [{k}]->[{v}]") + _1, prev_val = exists_key(k, env, True) + if isinstance(prev_val, bool): + new_val = str(v).lower() in ("true", "t", "1") + elif prev_val is None: + new_val = str(v) + else: + new_val = type(prev_val)(v) + set_key(k, new_val, env) + + # just in case + # verify_disabled_mailserver() + + # update based on env + for k, v in env["cache"].items(): + env["cache"][k] = os.path.join(env["resume_dir"], v) + + # input data directory + input_dir = env["input"]["datadir"] + if not os.path.exists(input_dir): + _logger.critical(f"Input directory [{input_dir}] does not exist - cannot import.") + sys.exit(1) + + dspace_be = dspace.rest( + env["backend"]["endpoint"], + env["backend"]["user"], + env["backend"]["password"], + env["backend"]["authentication"] + ) + + _logger.info("Loading repo objects") + repo = pump.repo(env, dspace_be) + + #### + _logger.info("New instance database status:") + repo.raw_db_7.status() + _logger.info("Reference database dspace status:") + repo.raw_db_dspace_5.status() + _logger.info("Reference database dspace-utilities status:") + repo.raw_db_utilities_5.status() + + import_sep = f"\n{40 * '*'}\n" + _logger.info("Starting import") + + # import handles + cache_file = env["cache"]["handle"] + if deserialize(args.resume, repo.handles, cache_file): + _logger.info(f"Resuming handle [{repo.handles.imported}]") + else: + repo.handles.import_to(dspace_be) + repo.handles.serialize(cache_file) + repo.diff(repo.handles) + _logger.info(import_sep) + + # import metadata + cache_file = env["cache"]["metadataschema"] + if deserialize(args.resume, repo.metadatas, cache_file): + _logger.info( + f"Resuming metadata [schemas:{repo.metadatas.imported_schemas}][fields:{repo.metadatas.imported_fields}]") + else: + repo.metadatas.import_to(dspace_be) + repo.metadatas.serialize(cache_file) + repo.diff(repo.metadatas) + _logger.info(import_sep) + + # import bitstreamformatregistry + cache_file = env["cache"]["bitstreamformat"] + if deserialize(args.resume, repo.bitstreamformatregistry, cache_file): + _logger.info( + f"Resuming bitstreamformatregistry [{repo.bitstreamformatregistry.imported}]") + else: + repo.bitstreamformatregistry.import_to(dspace_be) + repo.bitstreamformatregistry.serialize(cache_file) + repo.diff(repo.bitstreamformatregistry) + _logger.info(import_sep) + + # import community + cache_file = env["cache"]["community"] + if deserialize(args.resume, repo.communities, cache_file): + _logger.info( + f"Resuming community [coms:{repo.communities.imported_coms}][com2coms:{repo.communities.imported_com2coms}]") + else: + repo.communities.import_to(dspace_be, repo.handles, repo.metadatas) + if len(repo.communities) == repo.communities.imported_coms: + repo.communities.serialize(cache_file) + repo.diff(repo.communities) + _logger.info(import_sep) + + # import collection + cache_file = env["cache"]["collection"] + if deserialize(args.resume, 
repo.collections, cache_file): + _logger.info( + f"Resuming collection [cols:{repo.collections.imported_cols}] [groups:{repo.collections.imported_groups}]") + else: + repo.collections.import_to(dspace_be, repo.handles, + repo.metadatas, repo.communities) + repo.collections.serialize(cache_file) + repo.diff(repo.collections) + _logger.info(import_sep) + + # import registration data + cache_file = env["cache"]["registrationdata"] + if deserialize(args.resume, repo.registrationdatas, cache_file): + _logger.info(f"Resuming registrationdata [{repo.registrationdatas.imported}]") + else: + repo.registrationdatas.import_to(dspace_be) + repo.registrationdatas.serialize(cache_file) + repo.diff(repo.registrationdatas) + _logger.info(import_sep) + + # import eperson groups + cache_file = env["cache"]["epersongroup"] + if deserialize(args.resume, repo.groups, cache_file): + _logger.info( + f"Resuming epersongroup [eperson:{repo.groups.imported_eperson}] [g2g:{repo.groups.imported_g2g}]") + else: + repo.groups.import_to(dspace_be, repo.metadatas, repo.collections.groups_id2uuid, + repo.communities.imported_groups) + repo.groups.serialize(cache_file) + repo.diff(repo.groups) + _logger.info(import_sep) + + # import eperson + cache_file = env["cache"]["eperson"] + if deserialize(args.resume, repo.epersons, cache_file): + _logger.info(f"Resuming epersons [{repo.epersons.imported}]") + else: + repo.epersons.import_to(env, dspace_be, repo.metadatas) + repo.epersons.serialize(cache_file) + repo.diff(repo.epersons) + _logger.info(import_sep) + + # import userregistrations + cache_file = env["cache"]["userregistration"] + if deserialize(args.resume, repo.userregistrations, cache_file): + _logger.info(f"Resuming userregistrations [{repo.userregistrations.imported}]") + else: + repo.userregistrations.import_to(dspace_be, repo.epersons) + repo.userregistrations.serialize(cache_file) + repo.diff(repo.userregistrations) + _logger.info(import_sep) + + # import group2eperson + cache_file = env["cache"]["group2eperson"] + if deserialize(args.resume, repo.egroups, cache_file): + _logger.info(f"Resuming egroups [{repo.egroups.imported}]") + else: + repo.egroups.import_to(dspace_be, repo.groups, repo.epersons) + repo.egroups.serialize(cache_file) + repo.diff(repo.egroups) + _logger.info(import_sep) + + # import licenses + cache_file = env["cache"]["license"] + if deserialize(args.resume, repo.licenses, cache_file): + _logger.info( + f"Resuming licenses [labels:{repo.licenses.imported_labels}] [licenses:{repo.licenses.imported_licenses}]") + else: + repo.licenses.import_to(env, dspace_be, repo.epersons) + repo.licenses.serialize(cache_file) + repo.diff(repo.licenses) + _logger.info(import_sep) + + # import item + cache_file = env["cache"]["item"] + if deserialize(args.resume, repo.items, cache_file): + _logger.info(f"Resuming items [{repo.items.imported}]") + repo.items.import_to(cache_file, dspace_be, repo.handles, + repo.metadatas, repo.epersons, repo.collections) + else: + repo.items.import_to(cache_file, dspace_be, repo.handles, + repo.metadatas, repo.epersons, repo.collections) + repo.items.serialize(cache_file) + repo.items.raw_after_import( + env, repo.raw_db_7, repo.raw_db_dspace_5, repo.metadatas) + repo.diff(repo.items) + _logger.info(import_sep) + + # import tasklists + cache_file = env["cache"]["tasklistitem"] + if deserialize(args.resume, repo.tasklistitems, cache_file): + _logger.info(f"Resuming tasklistitems [{repo.tasklistitems.imported}]") + else: + repo.tasklistitems.import_to(dspace_be, repo.epersons, 
repo.items) + repo.tasklistitems.serialize(cache_file) + repo.diff(repo.tasklistitems) + _logger.info(import_sep) + + # import bundle + cache_file = env["cache"]["bundle"] + if deserialize(args.resume, repo.bundles, cache_file): + _logger.info(f"Resuming bundles [{repo.bundles.imported}]") + else: + repo.bundles.import_to(dspace_be, repo.metadatas, repo.items) + repo.bundles.serialize(cache_file) + repo.diff(repo.bundles) + _logger.info(import_sep) + + # import bitstreams + cache_file = env["cache"]["bitstream"] + if deserialize(args.resume, repo.bitstreams, cache_file): + _logger.info(f"Resuming bitstreams [{repo.bitstreams.imported}]") + repo.bitstreams.import_to( + env, cache_file, dspace_be, repo.metadatas, repo.bitstreamformatregistry, repo.bundles, repo.communities, repo.collections) + else: + repo.bitstreams.import_to( + env, cache_file, dspace_be, repo.metadatas, repo.bitstreamformatregistry, repo.bundles, repo.communities, repo.collections) + repo.bitstreams.serialize(cache_file) + repo.diff(repo.bitstreams) + _logger.info(import_sep) + + # import usermetadata + cache_file = env["cache"]["usermetadata"] + if deserialize(args.resume, repo.usermetadatas, cache_file): + _logger.info(f"Resuming usermetadatas [{repo.usermetadatas.imported}]") + else: + repo.usermetadatas.import_to(dspace_be, repo.bitstreams, repo.userregistrations) + repo.usermetadatas.serialize(cache_file) + repo.diff(repo.usermetadatas) + _logger.info(import_sep) + + # before importing of resource policies we have to delete all + # created data + repo.raw_db_7.delete_resource_policy() + + # import bitstreams + cache_file = env["cache"]["resourcepolicy"] + if deserialize(args.resume, repo.resourcepolicies, cache_file): + _logger.info(f"Resuming resourcepolicies [{repo.resourcepolicies.imported}]") + else: + repo.resourcepolicies.import_to(env, dspace_be, repo) + repo.resourcepolicies.serialize(cache_file) + repo.diff(repo.resourcepolicies) + _logger.info(import_sep) + + # migrate sequences + repo.sequences.migrate(env, repo.raw_db_7, repo.raw_db_dspace_5, + repo.raw_db_utilities_5) + + took = time.time() - s + _logger.info(f"Took [{round(took, 2)}] seconds to import all data") + _logger.info( + f"Made [{dspace_be.get_cnt}] GET requests, [{dspace_be.post_cnt}] POST requests.") + + _logger.info("New instance database status:") + repo.raw_db_7.status() + _logger.info("Reference database dspace status:") + repo.raw_db_dspace_5.status() + _logger.info("Reference database dspace-utilities status:") + repo.raw_db_utilities_5.status() + + _logger.info("Database difference") + repo.diff() diff --git a/src/settings/__init__.py b/src/settings/__init__.py new file mode 100644 index 0000000..dc35edf --- /dev/null +++ b/src/settings/__init__.py @@ -0,0 +1,15 @@ +__all__ = [ + # "dspace", + "env", +] + +env = { + +} + +# from ._dspace import +from ._cache import settings # noqa +env["cache"] = settings + +from ._dspace import settings # noqa +env["dspace"] = settings diff --git a/src/settings/_cache.py b/src/settings/_cache.py new file mode 100644 index 0000000..6de3c2d --- /dev/null +++ b/src/settings/_cache.py @@ -0,0 +1,27 @@ +# mapping dict names +settings = { + + "handle": "handle.json", + "metadataschema": "metadataschema.json", + + "community": "community.json", + "collection": "collection.json", + + "registrationdata": "registrationdata.json", + "epersongroup": "epersongroup.json", + "eperson": "eperson.json", + "group2eperson": "group2eperson.json", + + "userregistration": "user_registration.json", + "bitstreamformat": 
"bitstreamformatregistry.json", + "license": "license.json", + "item": "item.json", + + "tasklistitem": "tasklistitem.json", + + "bundle": "bundle.json", + "bitstream": "bitstream.json", + + "resourcepolicy": "resourcepolicy.json", + "usermetadata": "user_metadata.json", +} diff --git a/src/settings/_dspace.py b/src/settings/_dspace.py new file mode 100644 index 0000000..fdeebcd --- /dev/null +++ b/src/settings/_dspace.py @@ -0,0 +1,41 @@ +settings = { + + "handle_prefix": "http://hdl.handle.net/", + + "actions": ["READ", "WRITE", "OBSOLETE (DELETE)", + "ADD", "REMOVE", "WORKFLOW_STEP_1", + "WORKFLOW_STEP_2", "WORKFLOW_STEP_3", + "WORKFLOW_ABORT", "DEFAULT_BITSTREAM_READ", + "DEFAULT_ITEM_READ", "ADMIN", + "WITHDRAWN_READ"] +} + +# # there should be no need to modify this part, unless adding new tests. +# # mainly concatenates and parses settings above +# OAI_url = BE_url + "oai/" +# OAI_req = OAI_url + "request?verb=ListRecords&metadataPrefix=oai_dc&set=" +# OAI_openaire_dc = OAI_url + "openaire_data?verb=ListRecords&" \ +# "metadataPrefix=oai_dc&set=" +# OAI_openaire_datacite = OAI_url + "openaire_data?verb=ListRecords&" \ +# "metadataPrefix=oai_datacite&set=" +# OAI_olac = OAI_url + "request?verb=ListRecords&metadataPrefix=olac&set=" +# OAI_cmdi = OAI_url + "request?verb=ListRecords&metadataPrefix=cmdi&set=" +# IMPORT_DATA_PATH = "data/license_import/" +# COM = "BB-TEST-COM" +# com_UUID = None +# COL = "BB-TEST-COL" +# col_UUID = None +# ITM_prefix = "BB-TEST-ITM-" +# EMBEDDED = "_embedded" +# +# import enum +# +# +# class ItemType(enum.Enum): +# ITEM = 1 +# COMMUNITY = 2 +# COLLECTION = 3 +# +# +# # constants for resource type ID, taken from DSpace (BE) codebase +# SITE = 5 diff --git a/src/utils.py b/src/utils.py new file mode 100644 index 0000000..eaf2a33 --- /dev/null +++ b/src/utils.py @@ -0,0 +1,67 @@ +import os +import logging + + +def init_logging( + logger, + log_file: str, + console_level=logging.INFO, + file_level=logging.INFO, + format: str = '%(asctime)s:%(levelname)s: %(message)s'): + """ + Simple basic file/console logging. 
+ """ + base_log_dir = os.path.dirname(log_file) + os.makedirs(base_log_dir, exist_ok=True) + + formatter = logging.Formatter(format) + file_handler = logging.FileHandler(log_file) + file_handler.setFormatter(formatter) + file_handler.setLevel(file_level) + logger.addHandler(file_handler) + + console_handler = logging.StreamHandler() + console_handler.setLevel(console_level) + console_handler.setFormatter(formatter) + logger.addHandler(console_handler) + + logger.setLevel(logging.DEBUG) + + +def update_settings(main_env: dict, update_with: dict) -> dict: + """ + Update `main_env` with `update_with`, + if `update_with` value is a dict, update only keys which are in `main_env` + """ + env = main_env.copy() + for k, v in update_with.items(): + if isinstance(v, dict) and k in env: + env[k].update(v) + continue + env[k] = v + return env + + +def exists_key(special_format_key_str, dict_inst, return_val=False): + """ Checks whether a recursive key exists defined in dot format.""" + parts = special_format_key_str.split(".") + d = dict_inst + for part in parts: + if part is None or part not in d: + return (False, None) if return_val else False + d = d[part] + return (True, d) if return_val else True + + +def set_key(special_format_key_str, value, dict_inst): + """ Checks whether a recursive key exists defined in dot format.""" + parts = special_format_key_str.split(".") + d = dict_inst + for i, part in enumerate(parts): + if part is None or part not in d: + return False + if i != len(parts) - 1: + d = d[part] + else: + d[part] = value + return True diff --git a/support/dspace_interface/__init__.py b/support/dspace_interface/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/support/dspace_interface/readme.md b/support/dspace_interface/readme.md deleted file mode 100644 index c3f3d69..0000000 --- a/support/dspace_interface/readme.md +++ /dev/null @@ -1,5 +0,0 @@ -# This library is taken from [github repository](https://github.com/the-library-code/dspace-rest-python) - -More info can be found in original_readme. - -The library is modified. diff --git a/support/dspace_interface/response_map.py b/support/dspace_interface/response_map.py deleted file mode 100644 index 5af5fcf..0000000 --- a/support/dspace_interface/response_map.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -Defines how dspace api (rest_proxy) behaves on responses. -Add specific reactions to response_map. -""" -import logging -from json import JSONDecodeError - - -def check_response(r, additional_message): - if r is None: - logging.error("Failed to receive response. 
" + additional_message) - raise Exception("No response from server where one was expected") - logging.info(str(additional_message) + " Response " + str(r.status_code)) - if r.status_code not in response_map: - logging.warning("Unexpected response while creating item: " + - str(r.status_code) + "; " + r.url + "; " + r.text) - else: - response_map[r.status_code](r) - - -response_map = { - 201: lambda r: response_success(r), - 200: lambda r: response_success(r), - 500: lambda r: error(r), - 400: lambda r: error(r) -} - - -def error(r): - raise ConnectionError(r.text) - - -def response_success(r): - try: - r = r.json() - logging.info(f'{r["type"]} created successfully!') - except JSONDecodeError: - logging.info("request successfully") diff --git a/support/dspace_proxy.py b/support/dspace_proxy.py deleted file mode 100644 index ef2359b..0000000 --- a/support/dspace_proxy.py +++ /dev/null @@ -1,36 +0,0 @@ -import logging - -import const -from support.dspace_interface.client import DSpaceClient - - -class DspaceRESTProxy: - """ - Serves as proxy to Dspace REST API. - Mostly uses attribute d which represents (slightly modified) dspace_client from - original python rest api by dspace developers - """ - - def __init__(self): - self.response = None - self.d = DSpaceClient(api_endpoint=const.API_URL, - username=const.user, password=const.password) - if const.authentication: - authenticated = self.d.authenticate() - if not authenticated: - logging.error('Error logging in to dspace REST API at ' + - const.API_URL + '! Exiting!') - raise ConnectionError("Cannot connect to dspace!") - logging.info("Successfully logged in to dspace on " + const.API_URL) - - def get(self, command, params=None, data=None): - """ - Simple GET of url. - param command what to append to host.xx/server/api/ - """ - url = const.API_URL + command - self.response = self.d.api_get(url, params, data) - return self.response - - -rest_proxy = DspaceRESTProxy() diff --git a/support/transform/__init__.py b/support/transform/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/data_checker/README.md b/tools/data_checker/README.md similarity index 95% rename from data_checker/README.md rename to tools/data_checker/README.md index 58dc3e6..2652156 100644 --- a/data_checker/README.md +++ b/tools/data_checker/README.md @@ -1,33 +1,33 @@ -# Check of correct import of resource policies -### We check ONLY if ANONYMOUS group has same READ access to ITEMS in Dspace5 and Dspace7 after success data import! We don't check other resource policies. - -## How to use it: -1. Update `const.py` - - `use_ssl = False` - - `host = ""` - - `fe_port = ""` - - `be_port = ""` - - `be_location = "/server/"` - - - `authentication = False` - we test anonymous access - - - `CLARIN_DSPACE_NAME = "clarin-dspace"` - - `CLARIN_DSPACE_HOST = "localhost"` - - `CLARIN_DSPACE_USER = ""` - - `CLARIN_DSPACE_PASSWORD = ""` - -2. Be sure your project contains files: - **IMPORTANT:** If `data` or `temp-files` folders don't exist in the project, create them - - `temp-files/item_dict.json` - dict of mapping item IDs from Dspace5 to Dspace7 - - `data/handle.json` - data of handles from Dspace5 - -3. 
Run resource policy checker for anonymous view of items in Dspace7 based on Dspace5 resource policcies - - **NOTE:** database must be full - - **NOTE:** item_dict.json has to contain actual IDs from database of Dspace5 mapping to IDs of Dspace7 - - **NOTE:** dspace server must be running - - From the `dspace-python-api/data_checker` run command `python main.resource_policy_pump.py` - -4. Check `logs.log` in `data_checker` for `Resource policies checker of anonymous view of items` - - +# Check of correct import of resource policies +### We check ONLY if ANONYMOUS group has same READ access to ITEMS in Dspace5 and Dspace7 after success data import! We don't check other resource policies. + +## How to use it: +1. Update `const.py` + - `use_ssl = False` + - `host = ""` + - `fe_port = ""` + - `be_port = ""` + - `be_location = "/server/"` + + - `authentication = False` - we test anonymous access + + - `CLARIN_DSPACE_NAME = "clarin-dspace"` + - `CLARIN_DSPACE_HOST = "localhost"` + - `CLARIN_DSPACE_USER = ""` + - `CLARIN_DSPACE_PASSWORD = ""` + +2. Be sure your project contains files: + **IMPORTANT:** If `data` or `temp-files` folders don't exist in the project, create them + - `temp-files/item_dict.json` - dict of mapping item IDs from Dspace5 to Dspace7 + - `data/handle.json` - data of handles from Dspace5 + +3. Run resource policy checker for anonymous view of items in Dspace7 based on Dspace5 resource policcies + - **NOTE:** database must be full + - **NOTE:** item_dict.json has to contain actual IDs from database of Dspace5 mapping to IDs of Dspace7 + - **NOTE:** dspace server must be running + - From the `dspace-python-api/data_checker` run command `python check_resource_policy.py` + +4. Check `logs.log` in `data_checker` for `Resource policies checker of anonymous view of items` + + \ No newline at end of file diff --git a/data_pump/__init__.py b/tools/data_checker/__init__.py similarity index 100% rename from data_pump/__init__.py rename to tools/data_checker/__init__.py diff --git a/tools/data_checker/check_resource_policy.py b/tools/data_checker/check_resource_policy.py new file mode 100644 index 0000000..252c38e --- /dev/null +++ b/tools/data_checker/check_resource_policy.py @@ -0,0 +1,157 @@ +import os +import requests +import argparse +from tqdm import tqdm +import sys +import logging + +_this_dir = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, os.path.join(_this_dir, "../../src")) + +import pump # noqa: E402 +import dspace # noqa: E402 +from project_settings import settings # noqa: E402 +from pump._utils import read_json # noqa: E402 +from pump._item import items # noqa: E402 + +logging.basicConfig(level=logging.INFO) +_logger = logging.getLogger("resource_checker") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='Resource policies checker of anonymous view of items') + parser.add_argument('--temp-item-dict', help='item_dict.json', type=str, default=os.path.join( + _this_dir, "../../src/__temp/resume/item.json")) + parser.add_argument('--input-handle-json', help='handle.json', type=str, default=os.path.join( + _this_dir, "../../input/data/handle.json")) + args = parser.parse_args() + + _logger.info('Resource policies checker of anonymous view of items') + + if not os.path.exists(args.temp_item_dict): + _logger.critical(f"File {args.temp_item_dict} does not exist - cannot import.") + sys.exit(1) + + dspace_be = dspace.rest( + settings["backend"]["endpoint"], + settings["backend"]["user"], + settings["backend"]["password"], + 
settings["backend"]["authentication"] + ) + + db_env = settings["db_dspace_5"] + db5 = pump.db(db_env) + + items_id2uuid = read_json(args.temp_item_dict)["data"]["id2uuid"] + + # create select + # we want all resource_ids for items + # where the action is READ + # which are not workspaces or workflows + # item exists in item table + # owning group is Anonymous + sql = """ + SELECT distinct resource_id FROM public.resourcepolicy + WHERE resource_type_id = '2' + AND action_id IN (0, 9, 10) + AND NOT EXISTS (SELECT 'x' FROM public.workspaceitem WHERE + public.resourcepolicy.resource_id = public.workspaceitem.item_id) + AND NOT EXISTS (SELECT 'x' FROM public.workflowitem WHERE + public.resourcepolicy.resource_id = public.workflowitem.item_id) + AND EXISTS (SELECT 'x' FROM public.item WHERE + public.resourcepolicy.resource_id = public.item.item_id) + AND epersongroup_id = '0' + """ + sql = " ".join(x.strip() for x in sql.splitlines() if len(x.strip()) > 0) + dspace5_item_list = db5.fetch_all(sql) + + _logger.info(f"Count of items with anonymous access: {len(dspace5_item_list)}") + + # get IDs for dspace7 from IDs from dspace5 based on map + dspace7_item_list = [items_id2uuid[str(x[0])] for x in dspace5_item_list] + + # list od item IDs from dspace7 which can READ Anonymous + dspace7_item_ids_list = [] + + page_size = 50 + + # get total pages for search + # max page size for this request is 100 + js = dspace_be.fetch_search_items(size=page_size) + pages = js['_embedded']['searchResult']['page']['totalPages'] + + # get result from each page + # we don't get items which are withdrawn or discoverable + for page in tqdm(range(pages)): + js = dspace_be.fetch_search_items(page=page, size=page_size) + objects = js['_embedded']['searchResult']['_embedded']['objects'] + # add each object to result list + for item in objects: + dspace7_item_ids_list.append(item['_embedded']['indexableObject']['id']) + + _logger.info( + f"Count of items with anonymous access in Dspace7: {len(dspace7_item_ids_list)}") + + # compare expected items in dspace5 and got items from dspace7 + # log items, which we cannot find + notfound = 0 + notfound_but_visible = 0 + found = 0 + + for item_uuid in tqdm(dspace7_item_list): + if item_uuid in dspace7_item_ids_list: + dspace7_item_ids_list.remove(item_uuid) + found += 1 + continue + + # check if we really don't have access to item in Dspace7 + try: + response = dspace_be.fetch_raw_item(item_uuid) + notfound_but_visible += 1 + except Exception as e: + _logger.error(f"Item with id: {item_uuid} is not visible in DSpace7, " + f"but it is visible in DSpace5! 
" + f"Import of resource policies was incorrect!") + notfound += 1 + + _logger.info( + f"Visible in dspace5 found in dspace7:[{found}], missing visible in dspace 7: [{notfound_but_visible}], missing in dspace7: [{notfound}]") + + # now in new_item_list are items whose resource_policy + # was not found in dspace5 + # it could be because in dspace7 is using inheritance for resource policies + # check if you have access for these items in dspace5 + # based on their handles or there was import error + item_lindat_url = 'https://lindat.mff.cuni.cz/repository/xmlui/handle/' + + handles = read_json(args.input_handle_json) + + # handle has to be defined for item and item has to exist + itemuuid2handle = {} + for h in handles: + item_uuud = items_id2uuid.get(str(h['resource_id']), None) + if h['resource_type_id'] == items.TYPE and item_uuud is not None: + itemuuid2handle[item_uuud] = h['handle'] + + # do request to dspace5 for remaining items + found = 0 + errors = 0 + for item_uuid in tqdm(dspace7_item_ids_list): + if item_uuid not in itemuuid2handle: + _logger.critical(f"Item with id {item_uuid} not found") + continue + + response = requests.get(item_lindat_url + itemuuid2handle[item_uuid]) + if response.ok: + found += 1 + continue + + errors += 1 + _logger.error( + f"Item with id {item_uuid} is visible in Dspace7 but not in Dspace5!") + + _logger.info(f"Found in lindat [{found}]") + if errors > 0: + _logger.critical("!!!!!!!!!!") + sys.exit(1) diff --git a/tools/db_to_json.py b/tools/db_to_json.py new file mode 100644 index 0000000..d553779 --- /dev/null +++ b/tools/db_to_json.py @@ -0,0 +1,66 @@ +import sys +import argparse +import logging +import os +import json +from tqdm import tqdm + +logging.basicConfig(level=logging.INFO) +_logger = logging.getLogger() + +_this_dir = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, os.path.join(_this_dir, "../src/")) + + +def get_data_as_json(db, out_dir: str): + table_names = db.all_tables() + + os.makedirs(out_dir, exist_ok=True) + _logger.info(f"Exporting data to {out_dir}") + for table in tqdm(table_names): + # access to 0. 
position, because name_t is tuple + name = table[0] + file_name = os.path.join(out_dir, name + ".json") + with open(file_name, 'w', encoding='utf-8') as fout: + js = db.fetch_one(f'SELECT json_agg(row_to_json(t)) FROM "{name}" t') + json.dump(js, fout) + + _logger.info("Data successfully exported!") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Process database connection') + parser.add_argument('--database', help='database name', required=True, type=str) + parser.add_argument('--port', help='port', type=int, default=5432) + parser.add_argument('--host', help='type of host', type=str, default="localhost") + parser.add_argument('--user', help='database user', type=str) + parser.add_argument('--password', help='database password', type=str) + parser.add_argument('--output', help='output dir', type=str, + default=os.path.join(_this_dir, "../input/data")) + args = parser.parse_args() + + if args.user is None: + from project_settings import settings + db_dspace_5 = settings["db_dspace_5"] + db_utilities_5 = settings["db_utilities_5"] + if args.database == db_dspace_5["name"]: + db = db_dspace_5 + elif args.database == db_utilities_5["name"]: + db = db_utilities_5 + else: + _logger.error("Unknown database, support username and password!") + sys.exit(1) + args.user = db["user"] + args.password = db["password"] + args.host = db["host"] + + from pump import db + dspace5 = db({ + "name": args.database, + "host": args.host, + "user": args.user, + "port": 5432, + "password": args.password, + }) + + get_data_as_json(dspace5, args.output) diff --git a/localization/README.md b/tools/localization/README.md similarity index 100% rename from localization/README.md rename to tools/localization/README.md diff --git a/localization/__init__.py b/tools/localization/__init__.py similarity index 100% rename from localization/__init__.py rename to tools/localization/__init__.py diff --git a/localization/cs.csv b/tools/localization/cs.csv similarity index 100% rename from localization/cs.csv rename to tools/localization/cs.csv diff --git a/localization/csv_to_json.py b/tools/localization/csv_to_json.py similarity index 100% rename from localization/csv_to_json.py rename to tools/localization/csv_to_json.py diff --git a/localization/out.json b/tools/localization/out.json similarity index 100% rename from localization/out.json rename to tools/localization/out.json diff --git a/support/__init__.py b/tools/transform/__init__.py similarity index 100% rename from support/__init__.py rename to tools/transform/__init__.py diff --git a/support/transform/out.json b/tools/transform/out.json similarity index 100% rename from support/transform/out.json rename to tools/transform/out.json diff --git a/support/transform/transformer.py b/tools/transform/transformer.py similarity index 100% rename from support/transform/transformer.py rename to tools/transform/transformer.py diff --git a/support/transform/website_copy.txt b/tools/transform/website_copy.txt similarity index 100% rename from support/transform/website_copy.txt rename to tools/transform/website_copy.txt