From 03143de31ec20ce0c262558783ff51440bb6358a Mon Sep 17 00:00:00 2001
From: jm
Date: Fri, 27 Sep 2024 17:06:26 +0200
Subject: [PATCH] fix #135 by dynamically exporting tables to json when needed
 (keeping the current implementation as much as possible)

No json exports are required beforehand.
---
 README.md                           |  36 +++-------
 src/project_settings.py             |   2 +-
 src/pump/_repo.py                   | 100 ++++++++++++++++++----------
 src/repo_import.py                  |   6 --
 tools/export_db/README.md           |   1 +
 tools/{ => export_db}/db_to_json.py |   0
 6 files changed, 77 insertions(+), 68 deletions(-)
 create mode 100644 tools/export_db/README.md
 rename tools/{ => export_db}/db_to_json.py (100%)

diff --git a/README.md b/README.md
index d00600e..dac8c07 100644
--- a/README.md
+++ b/README.md
@@ -19,48 +19,32 @@
 because we use this endpoint for importing existing data.
 2.2. Clone submodules:
 2.2.1.: `git submodule update --init libs/dspace-rest-python/`
-
-4. Get database dump (old CLARIN-DSpace) and unzip it into `input/dump` directory in `dspace-python-api` project.
-
-5. Create CLARIN-DSpace5.* databases (dspace, utilities) from dump.
-Run `scripts/start.local.dspace.db.bat` or use `scipts/init.dspacedb5.sh` directly with your database.
+3. Get database dump (old CLARIN-DSpace) and unzip it into `input/dump` directory in `dspace-python-api` project.
 
 ***
-5. Go to the `dspace/bin` in dspace7 installation and run the command `dspace database migrate force` (force because of local types).
+4. Go to the `dspace/bin` in dspace7 installation and run the command `dspace database migrate force` (force because of local types).
 **NOTE:** `dspace database migrate force` creates default database data that may be not in database dump, so after migration, some tables may have more data than the database dump. Data from database dump that already exists in database is not migrated.
 
-6. Create an admin by running the command `dspace create-administrator` in the `dspace/bin`
+5. Create an admin by running the command `dspace create-administrator` in the `dspace/bin`
 
 ***
-7. Create JSON files from the database tables.
-**NOTE: You must do it for both databases `clarin-dspace` and `clarin-utilities`** (JSON files are stored in the `data` folder)
-- Go to `dspace-python-api` and run
-```
-pip install -r requirements.txt
-(optional on ubuntu like systems) apt install libpq-dev
-python db_to_json.py --database=clarin-dspace
-python db_to_json.py --database=clarin-utilities
-```
-
-***
-8. Prepare `dspace-python-api` project for migration
+6. Prepare `dspace-python-api` project for migration
 - copy the files used during migration into `input/` directory:
 ```
 > ls -R ./input
 input:
-data dump icon
-
-input/data:
-bitstream.json fileextension.json piwik_report.json
-bitstreamformatregistry.json ...
+dump icon
 
 input/dump:
-clarin-dspace-8.8.23.sql clarin-utilities-8.8.23.sql
+clarin-dspace.sql clarin-utilities.sql
 
 input/icon:
 aca.png by.png gplv2.png mit.png ...
 ```
+7. Install the python requirements: go to `dspace-python-api` and run `pip install -r requirements.txt` (on ubuntu-like systems you may also need `apt install libpq-dev`).
+8. Create CLARIN-DSpace5.* databases (dspace, utilities) from dump.
+Run `scripts/start.local.dspace.db.bat` or use `scripts/init.dspacedb5.sh` directly with your database.
 
 ***
 9. update `project_settings.py`
@@ -72,7 +56,7 @@ e.g.,`handle.additional.prefixes = 11858, 11234, 11372, 11346, 20.500.12801, 20.
 11. Copy `assetstore` from dspace5 to dspace7 (for bitstream import). `assetstore` is in the folder where you have installed DSpace `dspace/assetstore`.
 
 ***
-11. Import data from the json files (python-api/input/*) into dspace database (CLARIN-DSpace7.*)
+11. Import data into the dspace database (CLARIN-DSpace7.*)
 - **NOTE:** database must be up to date (`dspace database migrate force` must be called in the `dspace/bin`)
 - **NOTE:** dspace server must be running
 - run command `cd ./src && python repo_import.py`

diff --git a/src/project_settings.py b/src/project_settings.py
index 72d7607..809469b 100644
--- a/src/project_settings.py
+++ b/src/project_settings.py
@@ -51,7 +51,7 @@
     },
 
     "input": {
-        "datadir": os.path.join(_this_dir, "../input/data"),
+        "tempdbexport": os.path.join(_this_dir, "../input/tempdbexport"),
         "icondir": os.path.join(_this_dir, "../input/icon"),
     },
 
diff --git a/src/pump/_repo.py b/src/pump/_repo.py
index 176e8f2..4c95c29 100644
--- a/src/pump/_repo.py
+++ b/src/pump/_repo.py
@@ -1,5 +1,7 @@
 import logging
 import os
+import json
+import shutil
 
 from ._utils import time_method
 
@@ -27,102 +29,130 @@
 _logger = logging.getLogger("pump.repo")
 
 
+def export_table(db, table_name: str, out_f: str):
+    with open(out_f, 'w', encoding='utf-8') as fout:
+        js = db.fetch_one(f'SELECT json_agg(row_to_json(t)) FROM "{table_name}" t')
+        json.dump(js, fout)
+
+
 class repo:
     @time_method
     def __init__(self, env: dict, dspace):
-        def _f(name): return os.path.join(env["input"]["datadir"], name)
+
+        self.raw_db_dspace_5 = db(env["db_dspace_5"])
+        self.raw_db_utilities_5 = db(env["db_utilities_5"])
+
+        # remove any stale export directory from a previous run
+        if os.path.exists(env["input"]["tempdbexport"]):
+            shutil.rmtree(env["input"]["tempdbexport"])
+
+        tables_db_5 = [x for arr in self.raw_db_dspace_5.all_tables() for x in arr]
+        tables_utilities_5 = [x for arr in self.raw_db_utilities_5.all_tables()
+                              for x in arr]
+
+        def _f(table_name):
+            """ Dynamically export the table to a json file and return the path to it. """
+            os.makedirs(env["input"]["tempdbexport"], exist_ok=True)
+            out_f = os.path.join(env["input"]["tempdbexport"], f"{table_name}.json")
+            if table_name in tables_db_5:
+                db = self.raw_db_dspace_5
+            elif table_name in tables_utilities_5:
+                db = self.raw_db_utilities_5
+            else:
+                _logger.warning(f"Table [{table_name}] not found in db.")
+                raise NotImplementedError(f"Table [{table_name}] not found in db.")
+            export_table(db, table_name, out_f)
+            return out_f
 
         # load groups
         self.groups = groups(
-            _f("epersongroup.json"),
-            _f("group2group.json"),
+            _f("epersongroup"),
+            _f("group2group"),
         )
         self.groups.from_rest(dspace)
 
         # load handles
-        self.handles = handles(_f("handle.json"))
+        self.handles = handles(_f("handle"))
 
         # load metadata
         self.metadatas = metadatas(
             env,
             dspace,
-            _f("metadatavalue.json"),
-            _f("metadatafieldregistry.json"),
-            _f("metadataschemaregistry.json"),
+            _f("metadatavalue"),
+            _f("metadatafieldregistry"),
+            _f("metadataschemaregistry"),
         )
 
         # load community
         self.communities = communities(
-            _f("community.json"),
-            _f("community2community.json"),
+            _f("community"),
+            _f("community2community"),
         )
 
         self.collections = collections(
-            _f("collection.json"),
-            _f("community2collection.json"),
-            _f("metadatavalue.json"),
+            _f("collection"),
+            _f("community2collection"),
+            _f("metadatavalue"),
         )
 
         self.registrationdatas = registrationdatas(
-            _f("registrationdata.json")
+            _f("registrationdata")
        )
 
         self.epersons = epersons(
-            _f("eperson.json")
+            _f("eperson")
         )
 
         self.egroups = eperson_groups(
-            _f("epersongroup2eperson.json")
+            _f("epersongroup2eperson")
         )
 
         self.userregistrations = userregistrations(
-            _f("user_registration.json")
+            _f("user_registration")
         )
 
         self.bitstreamformatregistry = bitstreamformatregistry(
-            _f("bitstreamformatregistry.json"), _f("fileextension.json")
+            _f("bitstreamformatregistry"), _f("fileextension")
         )
 
         self.licenses = licenses(
-            _f("license_label.json"),
-            _f("license_definition.json"),
-            _f("license_label_extended_mapping.json"),
+            _f("license_label"),
+            _f("license_definition"),
+            _f("license_label_extended_mapping"),
         )
 
         self.items = items(
-            _f("item.json"),
-            _f("workspaceitem.json"),
-            _f("workflowitem.json"),
-            _f("collection2item.json"),
+            _f("item"),
+            _f("workspaceitem"),
+            _f("workflowitem"),
+            _f("collection2item"),
         )
 
         self.tasklistitems = tasklistitems(
-            _f("tasklistitem.json")
+            _f("tasklistitem")
        )
 
         self.bundles = bundles(
-            _f("bundle.json"),
-            _f("item2bundle.json"),
+            _f("bundle"),
+            _f("item2bundle"),
         )
 
         self.bitstreams = bitstreams(
-            _f("bitstream.json"),
-            _f("bundle2bitstream.json"),
+            _f("bitstream"),
+            _f("bundle2bitstream"),
         )
 
         self.usermetadatas = usermetadatas(
-            _f("user_metadata.json"),
-            _f("license_resource_user_allowance.json"),
-            _f("license_resource_mapping.json")
+            _f("user_metadata"),
+            _f("license_resource_user_allowance"),
+            _f("license_resource_mapping")
         )
 
         self.resourcepolicies = resourcepolicies(
-            _f("resourcepolicy.json")
+            _f("resourcepolicy")
         )
 
         self.raw_db_7 = db(env["db_dspace_7"])
-        self.raw_db_dspace_5 = db(env["db_dspace_5"])
-        self.raw_db_utilities_5 = db(env["db_utilities_5"])
 
         self.sequences = sequences()

diff --git a/src/repo_import.py b/src/repo_import.py
index 34a0d6d..1a57994 100644
--- a/src/repo_import.py
+++ b/src/repo_import.py
@@ -80,12 +80,6 @@ def deserialize(resume: bool, obj, cache_file: str) -> bool:
     for k, v in env["cache"].items():
         env["cache"][k] = os.path.join(env["resume_dir"], v)
 
-    # input data directory
-    input_dir = env["input"]["datadir"]
-    if not os.path.exists(input_dir):
-        _logger.critical(f"Input directory [{input_dir}] does not exist - cannot import.")
-        sys.exit(1)
-
     dspace_be = dspace.rest(
         env["backend"]["endpoint"],
         env["backend"]["user"],
diff --git a/tools/export_db/README.md b/tools/export_db/README.md
new file mode 100644
index 0000000..b8b6728
--- /dev/null
+++ b/tools/export_db/README.md
@@ -0,0 +1 @@
+Obsolete now because we need a running v5 database anyway.
\ No newline at end of file
diff --git a/tools/db_to_json.py b/tools/export_db/db_to_json.py
similarity index 100%
rename from tools/db_to_json.py
rename to tools/export_db/db_to_json.py
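For orientation, a minimal standalone sketch of the on-demand export idea the patch introduces in `_repo.py`: one `SELECT json_agg(row_to_json(t)) FROM "<table>" t` per table, written to `<table>.json`. The patch routes this through the project's `db` wrapper (`fetch_one`, `all_tables`); the sketch below assumes a plain psycopg2 connection instead, and the connection parameters, output directory, and function name are illustrative only, not taken from the repository.

```python
# Sketch of exporting one CLARIN-DSpace v5 table to JSON on demand.
# Assumes psycopg2 is installed; connection details below are placeholders.
import json
import os

import psycopg2


def export_table_to_json(conn, table_name: str, out_dir: str) -> str:
    """Dump <table_name> to <out_dir>/<table_name>.json and return the file path."""
    os.makedirs(out_dir, exist_ok=True)
    out_f = os.path.join(out_dir, f"{table_name}.json")
    with conn.cursor() as cur:
        # json_agg(row_to_json(t)) returns the whole table as a single JSON
        # array (or NULL when the table is empty), which psycopg2 hands back
        # as an already-parsed Python object.
        cur.execute(f'SELECT json_agg(row_to_json(t)) FROM "{table_name}" t')
        rows = cur.fetchone()[0]
    with open(out_f, "w", encoding="utf-8") as fout:
        json.dump(rows, fout)
    return out_f


if __name__ == "__main__":
    # Illustrative connection; point it at your local clarin-dspace v5 database.
    with psycopg2.connect(host="localhost", dbname="clarin-dspace",
                          user="dspace", password="dspace") as conn:
        print(export_table_to_json(conn, "eperson", "/tmp/tempdbexport"))
```

In the patch itself the same query runs through `db.fetch_one()` and the per-table JSON files land in `input/tempdbexport/`, so the rest of the importer keeps reading table dumps from files exactly as before; only their origin changes from a pre-generated `db_to_json.py` export to this on-the-fly export.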