
Commit

fix #135 by dynamically exporting to jsons (keep the current implementation as much as possible) when needed

No json exports are required beforehand.
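
With this change, each needed table is exported to JSON on the fly from the running CLARIN-DSpace 5 databases. A minimal sketch of that pattern, assuming a plain psycopg2 connection and a hypothetical helper name (the actual implementation goes through the project's `db` wrapper in `src/pump/_repo.py`, shown in the diff below):

```python
import json

import psycopg2  # assumption: any PostgreSQL driver works; the project uses its own db wrapper


def export_table_to_json(conn, table_name: str, out_path: str) -> None:
    """Dump one table to a JSON file using PostgreSQL's json_agg/row_to_json."""
    with conn.cursor() as cur:
        # json_agg(row_to_json(t)) aggregates every row of the table into a single JSON array
        cur.execute(f'SELECT json_agg(row_to_json(t)) FROM "{table_name}" t')
        rows = cur.fetchone()[0]  # None when the table is empty
    with open(out_path, "w", encoding="utf-8") as fout:
        json.dump(rows, fout)


# Hypothetical usage:
# conn = psycopg2.connect(dbname="clarin-dspace", user="dspace", host="localhost")
# export_table_to_json(conn, "handle", "input/tempdbexport/handle.json")
```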
jm committed Sep 27, 2024
1 parent b8d493f commit 03143de
Showing 6 changed files with 77 additions and 68 deletions.
36 changes: 10 additions & 26 deletions README.md
@@ -19,48 +19,32 @@ because we use this endpoint for importing existing data.
2.2. Clone submodules:
2.2.1.: `git submodule update --init libs/dspace-rest-python/`


4. Get database dump (old CLARIN-DSpace) and unzip it into `input/dump` directory in `dspace-python-api` project.

5. Create CLARIN-DSpace5.* databases (dspace, utilities) from dump.
Run `scripts/start.local.dspace.db.bat` or use `scripts/init.dspacedb5.sh` directly with your database.
2. Get database dump (old CLARIN-DSpace) and unzip it into `input/dump` directory in `dspace-python-api` project.

***
5. Go to the `dspace/bin` in dspace7 installation and run the command `dspace database migrate force` (force because of local types).
3. Go to the `dspace/bin` in dspace7 installation and run the command `dspace database migrate force` (force because of local types).
**NOTE:** `dspace database migrate force` creates default database data that may not be in the database dump, so after migration some tables may contain more data than the dump. Data from the database dump that already exists in the database is not migrated.

6. Create an admin by running the command `dspace create-administrator` in the `dspace/bin`
4. Create an admin by running the command `dspace create-administrator` in the `dspace/bin`

***
7. Create JSON files from the database tables.
**NOTE: You must do it for both databases `clarin-dspace` and `clarin-utilities`** (JSON files are stored in the `data` folder)
- Go to `dspace-python-api` and run
```
pip install -r requirements.txt
(optional on ubuntu like systems) apt install libpq-dev
python db_to_json.py --database=clarin-dspace
python db_to_json.py --database=clarin-utilities
```

***
8. Prepare `dspace-python-api` project for migration
5. Prepare `dspace-python-api` project for migration

- copy the files used during migration into `input/` directory:
```
> ls -R ./input
input:
data dump icon
input/data:
bitstream.json fileextension.json piwik_report.json
bitstreamformatregistry.json ...
dump icon
input/dump:
clarin-dspace-8.8.23.sql clarin-utilities-8.8.23.sql
clarin-dspace.sql clarin-utilities.sql
input/icon:
aca.png by.png gplv2.png mit.png ...
```
6.
7. Create CLARIN-DSpace5.* databases (dspace, utilities) from dump.
Run `scripts/start.local.dspace.db.bat` or use `scripts/init.dspacedb5.sh` directly with your database.

***
9. Update `project_settings.py`
@@ -72,7 +56,7 @@ e.g., `handle.additional.prefixes = 11858, 11234, 11372, 11346, 20.500.12801, 20.
11. Copy `assetstore` from dspace5 to dspace7 (for bitstream import). `assetstore` is in the folder where you have installed DSpace `dspace/assetstore`.

***
11. Import data from the json files (python-api/input/*) into dspace database (CLARIN-DSpace7.*)
11. Import
- **NOTE:** database must be up to date (`dspace database migrate force` must be called in the `dspace/bin`)
- **NOTE:** dspace server must be running
- run command `cd ./src && python repo_import.py`
2 changes: 1 addition & 1 deletion src/project_settings.py
@@ -51,7 +51,7 @@
},

"input": {
"datadir": os.path.join(_this_dir, "../input/data"),
"tempdbexport": os.path.join(_this_dir, "../input/tempdbexport"),
"icondir": os.path.join(_this_dir, "../input/icon"),
},

100 changes: 65 additions & 35 deletions src/pump/_repo.py
@@ -1,5 +1,7 @@
import logging
import os
import json
import shutil

from ._utils import time_method

@@ -27,102 +29,130 @@
_logger = logging.getLogger("pump.repo")


def export_table(db, table_name: str, out_f: str):
with open(out_f, 'w', encoding='utf-8') as fout:
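# json_agg(row_to_json(t)) aggregates every row of the table into a single JSON array (NULL when the table is empty)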
js = db.fetch_one(f'SELECT json_agg(row_to_json(t)) FROM "{table_name}" t')
json.dump(js, fout)


class repo:
@time_method
def __init__(self, env: dict, dspace):
def _f(name): return os.path.join(env["input"]["datadir"], name)

self.raw_db_dspace_5 = db(env["db_dspace_5"])
self.raw_db_utilities_5 = db(env["db_utilities_5"])

# remove directory
if os.path.exists(env["input"]["tempdbexport"]):
shutil.rmtree(env["input"]["tempdbexport"])

tables_db_5 = [x for arr in self.raw_db_dspace_5.all_tables() for x in arr]
tables_utilities_5 = [x for arr in self.raw_db_utilities_5.all_tables()
for x in arr]

def _f(table_name):
""" Dynamically export the table to json file and return path to it. """
os.makedirs(env["input"]["tempdbexport"], exist_ok=True)
out_f = os.path.join(env["input"]["tempdbexport"], f"{table_name}.json")
if table_name in tables_db_5:
db = self.raw_db_dspace_5
elif table_name in tables_utilities_5:
db = self.raw_db_utilities_5
else:
_logger.warning(f"Table [{table_name}] not found in db.")
raise NotImplementedError(f"Table [{table_name}] not found in db.")
export_table(db, table_name, out_f)
return out_f

# load groups
self.groups = groups(
_f("epersongroup.json"),
_f("group2group.json"),
_f("epersongroup"),
_f("group2group"),
)
self.groups.from_rest(dspace)

# load handles
self.handles = handles(_f("handle.json"))
self.handles = handles(_f("handle"))

# load metadata
self.metadatas = metadatas(
env,
dspace,
_f("metadatavalue.json"),
_f("metadatafieldregistry.json"),
_f("metadataschemaregistry.json"),
_f("metadatavalue"),
_f("metadatafieldregistry"),
_f("metadataschemaregistry"),
)

# load community
self.communities = communities(
_f("community.json"),
_f("community2community.json"),
_f("community"),
_f("community2community"),
)

self.collections = collections(
_f("collection.json"),
_f("community2collection.json"),
_f("metadatavalue.json"),
_f("collection"),
_f("community2collection"),
_f("metadatavalue"),
)

self.registrationdatas = registrationdatas(
_f("registrationdata.json")
_f("registrationdata")
)

self.epersons = epersons(
_f("eperson.json")
_f("eperson")
)

self.egroups = eperson_groups(
_f("epersongroup2eperson.json")
_f("epersongroup2eperson")
)

self.userregistrations = userregistrations(
_f("user_registration.json")
_f("user_registration")
)

self.bitstreamformatregistry = bitstreamformatregistry(
_f("bitstreamformatregistry.json"), _f("fileextension.json")
_f("bitstreamformatregistry"), _f("fileextension")
)

self.licenses = licenses(
_f("license_label.json"),
_f("license_definition.json"),
_f("license_label_extended_mapping.json"),
_f("license_label"),
_f("license_definition"),
_f("license_label_extended_mapping"),
)

self.items = items(
_f("item.json"),
_f("workspaceitem.json"),
_f("workflowitem.json"),
_f("collection2item.json"),
_f("item"),
_f("workspaceitem"),
_f("workflowitem"),
_f("collection2item"),
)

self.tasklistitems = tasklistitems(
_f("tasklistitem.json")
_f("tasklistitem")
)

self.bundles = bundles(
_f("bundle.json"),
_f("item2bundle.json"),
_f("bundle"),
_f("item2bundle"),
)

self.bitstreams = bitstreams(
_f("bitstream.json"),
_f("bundle2bitstream.json"),
_f("bitstream"),
_f("bundle2bitstream"),
)

self.usermetadatas = usermetadatas(
_f("user_metadata.json"),
_f("license_resource_user_allowance.json"),
_f("license_resource_mapping.json")
_f("user_metadata"),
_f("license_resource_user_allowance"),
_f("license_resource_mapping")
)

self.resourcepolicies = resourcepolicies(
_f("resourcepolicy.json")
_f("resourcepolicy")
)

self.raw_db_7 = db(env["db_dspace_7"])
self.raw_db_dspace_5 = db(env["db_dspace_5"])
self.raw_db_utilities_5 = db(env["db_utilities_5"])

self.sequences = sequences()

6 changes: 0 additions & 6 deletions src/repo_import.py
@@ -80,12 +80,6 @@ def deserialize(resume: bool, obj, cache_file: str) -> bool:
for k, v in env["cache"].items():
env["cache"][k] = os.path.join(env["resume_dir"], v)

# input data directory
input_dir = env["input"]["datadir"]
if not os.path.exists(input_dir):
_logger.critical(f"Input directory [{input_dir}] does not exist - cannot import.")
sys.exit(1)

dspace_be = dspace.rest(
env["backend"]["endpoint"],
env["backend"]["user"],
1 change: 1 addition & 0 deletions tools/export_db/README.md
@@ -0,0 +1 @@
Obsolete now because we need a running v5 database anyway.
File renamed without changes.
