generated from acdh-oeaw/template-app-next
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add baserow import/export scripts
- Loading branch information
1 parent
a302676
commit 5f7e4ee
Showing
3 changed files
with
334 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
#!/usr/bin/env python3
# type: ignore
"""Dump all tables of a Baserow database to local JSON files.

Reads the Baserow credentials from ../.env.local and writes one JSON file
per table into the `from_baserow` folder.
"""
from acdh_baserow_pyutils import BaseRowClient
import os
from dotenv import load_dotenv

load_dotenv("../.env.local")

# Baserow connection details — all expected to be set in ../.env.local.
BASEROW_BASE_URL = os.environ.get("BASEROW_BASE_URL")
BASEROW_USER = os.environ.get("BASEROW_USER")
BASEROW_PW = os.environ.get("BASEROW_PW")
BASEROW_TOKEN = os.environ.get("BASEROW_TOKEN")
BASEROW_DATABASE_ID = os.environ.get("BASEROW_DATABASE_ID")
# initialize the client
br_client = BaseRowClient(
    BASEROW_USER, BASEROW_PW, BASEROW_TOKEN, BASEROW_BASE_URL, BASEROW_DATABASE_ID
)

folder_name = "from_baserow"
# Create the output folder if missing; exist_ok avoids the crash on re-runs
# that the previously commented-out os.mkdir() call would have caused.
os.makedirs(folder_name, exist_ok=True)
br_client.dump_tables_as_json(BASEROW_DATABASE_ID, folder_name=folder_name, indent="\t")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
#!/usr/bin/env python3
# type: ignore
"""Accumulate the relational Baserow exports into nested documents and
import them into a Typesense collection."""
import logging
from dotenv import load_dotenv
import json
import argparse
import os
import typesense
from typesense.exceptions import ObjectNotFound

parser = argparse.ArgumentParser(
    prog="baserow", description="import / export data from / to baserow"
)

parser.add_argument(
    "-env",
    default="../.env.local",
    help=".env file to be used for getting typesense server details",
)
parser.add_argument(
    "-v",
    "--verbose",
    action="count",
    default=3,
    # NOTE: the help text previously claimed the default level was WARNING,
    # but with default=3 the computed level is already clamped at DEBUG.
    help="Increase the verbosity of the logging output: the default is already DEBUG (most verbose), so additional -v flags have no further effect",
)

args = parser.parse_args()

# Map the -v count to a logging level: 0 -> WARNING (30), 1 -> INFO (20),
# >= 2 -> DEBUG (10); the level is clamped at DEBUG.
logging.basicConfig(
    level=max(10, 30 - 10 * args.verbose),
    # format="%(levelname)-8s %(message)s",
)


logging.debug(f"Loading typesense access data from {args.env}")
# Resolve relative paths (the .env file, typesense-schema.json) against the
# script's own directory, independent of the caller's cwd. abspath() is
# needed because os.path.dirname(__file__) is "" when the script is invoked
# from its own directory, and os.chdir("") raises FileNotFoundError.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
load_dotenv(args.env)
||
def fatal(msg):
    """Log *msg* at CRITICAL level and terminate the process with status 1."""
    logging.fatal(msg)
    # Raise SystemExit directly instead of calling the site-provided exit()
    # builtin, which is not guaranteed to exist (e.g. under `python -S`).
    raise SystemExit(1)
|
||
|
||
# --- step 1: build the nested document structure ---------------------------
logging.info(
    "step 1: accumulate relational data into a typesense-ready nested structure"
)
|
||
|
||
def load_json(dirname, filename):
    """Load and return the parsed contents of ``<dirname>/<filename>.json``.

    Fixes the previous version, which ignored the *filename* argument and
    tried to open a literal ``(unknown).json`` for every table; also closes
    the file handle deterministically and reads it as UTF-8.
    """
    with open(f"{dirname}/{filename}.json", encoding="utf-8") as fp:
        return json.load(fp)
|
||
|
||
# get the data as it was originally exported to baserow
# (the table names are the German labels used in the Baserow database)
publications = load_json("to_baserow", "Publikation")
translations = load_json("to_baserow", "Übersetzung")
works = load_json("to_baserow", "BernhardWerk")
translators = load_json("to_baserow", "Übersetzer")

# TODO get and apply the manual edits
# NOTE(review): "BernhardWerk.json.json" below has a doubled extension —
# correct it when enabling these lines.
# publications = json.load(open("from_baserow/Publikation.json"))
# translations = json.load(open("from_baserow/Übersetzung.json"))
# works = json.load(open("from_baserow/BernhardWerk.json.json"))
# translators = json.load(open("from_baserow/Übersetzer.json"))
|
||
# create nested structures | ||
|
||
|
||
def del_empty_strings(o, field_names):
    """Remove each listed key from dict *o* whose value is falsy.

    Raises KeyError if a listed key is absent (same as the original).
    """
    for key in field_names:
        if not o[key]:
            o.pop(key)
|
||
|
||
# assign 1-indexed ids so the frontend can link to translator pages
for idx, translator in enumerate(translators, start=1):
    translator["id"] = idx

# same for the bernhard works: 1-indexed ids to allow links to work ids
for idx, work in enumerate(works, start=1):
    work["id"] = idx
    # TODO to be removed/replaced by work id
    work["yeartitle"] = str(work["year"]) + work["title"]
|
||
# resolve baserow's 1-based foreign keys into nested objects
for translation in translations:
    translation["work"] = works[translation["work"] - 1]
    translation["translators"] = [
        translators[tid - 1] for tid in translation["translators"]
    ]
    del_empty_strings(translation, ["work_display_title"])
|
||
for i, pub in enumerate(publications):
    # typesense document ids must be strings
    pub["id"] = str(i + 1)
    # resolve 1-based foreign keys into the nested translation objects
    pub["contains"] = [translations[t_id - 1] for t_id in pub["contains"]]

    # "images" is a space-separated list of image ids; filter out empty
    # strings so a publication without images yields [] — previously
    # "".split(" ") == [""] produced one bogus image entry and made
    # has_image wrongly True.
    pub["images"] = [{"id": img} for img in pub["images"].split(" ") if img]
    pub["has_image"] = len(pub["images"]) > 0
    if not pub["year_display"]:
        pub["year_display"] = str(pub["year"])

    # link each parent publication forward to this one
    for pid in pub["parents"]:
        publications[pid - 1].setdefault("later", []).append(i + 1)

    del_empty_strings(pub, ["publication_details"])

    # trim data a little: fields not needed in the search index
    del pub["exemplar_suhrkamp_berlin"]
    del pub["exemplar_oeaw"]
    del pub["original_publication"]
    del pub["zusatzinfos"]
|
||
# --- step 2: push the nested documents into typesense ----------------------
logging.info("step 2: insert nested documents into typesense")

if "TYPESENSE_ADMIN_API_KEY" not in os.environ:
    fatal("Couldn't find typesense database information in environment files")

logging.info(f"connecting to {os.environ.get('NEXT_PUBLIC_TYPESENSE_HOST')}")

# connection details for the (single-node) typesense server, all taken from
# the environment / .env file
typesense_node = {
    "host": os.environ.get("NEXT_PUBLIC_TYPESENSE_HOST"),
    "port": os.environ.get("NEXT_PUBLIC_TYPESENSE_PORT"),
    "protocol": os.environ.get("NEXT_PUBLIC_TYPESENSE_PROTOCOL"),
}
client = typesense.Client(
    {
        "api_key": os.environ.get("TYPESENSE_ADMIN_API_KEY"),
        "nodes": [typesense_node],
        "connection_timeout_seconds": 5,
    }
)
|
||
collection_name = os.environ.get("NEXT_PUBLIC_TYPESENSE_COLLECTION_NAME")

try:
    r = client.collections[collection_name].retrieve()
except ObjectNotFound:
    logging.info(f"collection '{collection_name}' does not exist yet, creating")
    with open("typesense-schema.json", encoding="utf-8") as schema_file:
        schema = json.load(schema_file)
    # A typesense collection schema names the collection via its "name"
    # field (the schema file deliberately contains a placeholder there).
    # The previous code set a bogus "collection_name" key instead, so the
    # collection would have been created under the placeholder name.
    schema["name"] = collection_name
    client.collections.create(schema)
    r = client.collections[collection_name].retrieve()
|
||
|
||
# wipe any documents left over from a previous import run
existing = r["num_documents"]
if existing > 0:
    logging.info(f"Clearing {existing} existing documents")
    r = client.collections[collection_name].documents.delete(
        {"filter_by": 'id :!= ""'}
    )
    logging.info(
        f'Cleared {r["num_deleted"]} documents from collection {collection_name}'
    )
|
||
logging.info(f"importing {len(publications)} documents")
r = client.collections[collection_name].documents.import_(publications)

# count the per-document failures reported by typesense
# (idiomatic generator sum instead of list(map(lambda ...)).count(False))
nfails = sum(1 for d in r if not d["success"])
if nfails == len(publications):
    # nothing went through at all — most likely a schema mismatch or an
    # api key without write access
    if args.verbose > 0:
        print(r)
    logging.error(
        f"Failed to insert any of the documents. Either the documents don't comply with the schema of the collection, or maybe you are using an api key that only has read access to the collection? (run the script again with --verbose to see all {nfails} errors)"
    )
    raise SystemExit(1)
elif nfails > 0:
    logging.error(f"{nfails} documents could not be inserted.")
    for doc in r:
        if not doc["success"]:
            logging.error(doc)
    raise SystemExit(1)

# NOTE(review): debug leftover? prints the first document on success
print(publications[0])
logging.info("Success!")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,141 @@ | ||
{ | ||
"name": "SET IN .env FILE NOT HERE", | ||
"fields": [ | ||
{ | ||
"name": "erstpublikation", | ||
"type": "bool", | ||
"facet": false, | ||
"optional": false, | ||
"index": true, | ||
"sort": false, | ||
"infix": false, | ||
"locale": "", | ||
"stem": false | ||
}, | ||
{ | ||
"name": "has_image", | ||
"type": "bool", | ||
"facet": false, | ||
"optional": false, | ||
"index": true, | ||
"sort": false, | ||
"infix": false, | ||
"locale": "", | ||
"stem": false | ||
}, | ||
{ | ||
"name": "year", | ||
"type": "int32", | ||
"facet": false, | ||
"optional": false, | ||
"index": true, | ||
"sort": true, | ||
"infix": false, | ||
"locale": "", | ||
"stem": false | ||
}, | ||
{ | ||
"name": "signatur", | ||
"type": "string", | ||
"facet": false, | ||
"optional": false, | ||
"index": true, | ||
"sort": false, | ||
"infix": false, | ||
"locale": "", | ||
"stem": false | ||
}, | ||
{ | ||
"name": "year_display", | ||
"type": "string", | ||
"facet": false, | ||
"optional": false, | ||
"index": true, | ||
"sort": false, | ||
"infix": false, | ||
"locale": "", | ||
"stem": false | ||
}, | ||
{ | ||
"name": "contains.work.category", | ||
"type": "string[]", | ||
"facet": true, | ||
"optional": true, | ||
"index": true, | ||
"sort": false, | ||
"infix": false, | ||
"locale": "", | ||
"stem": false | ||
}, | ||
{ | ||
"name": "contains.work.title", | ||
"type": "string[]", | ||
"facet": true, | ||
"optional": false, | ||
"index": true, | ||
"sort": false, | ||
"infix": false, | ||
"locale": "", | ||
"stem": false | ||
}, | ||
{ | ||
"name": "contains.work.yeartitle", | ||
"type": "string[]", | ||
"facet": true, | ||
"optional": false, | ||
"index": true, | ||
"sort": false, | ||
"infix": false, | ||
"locale": "", | ||
"stem": false | ||
}, | ||
{ | ||
"name": "language", | ||
"type": "string", | ||
"facet": true, | ||
"optional": false, | ||
"index": true, | ||
"sort": true, | ||
"infix": false, | ||
"locale": "", | ||
"stem": false | ||
}, | ||
{ | ||
"name": "contains.title", | ||
"type": "string[]", | ||
"facet": true, | ||
"optional": false, | ||
"index": true, | ||
"sort": false, | ||
"infix": false, | ||
"locale": "", | ||
"stem": false | ||
}, | ||
{ | ||
"name": "contains.translators.name", | ||
"type": "string[]", | ||
"facet": true, | ||
"optional": true, | ||
"index": true, | ||
"sort": false, | ||
"infix": false, | ||
"locale": "", | ||
"stem": false | ||
}, | ||
{ | ||
"name": "title", | ||
"type": "string", | ||
"facet": true, | ||
"optional": false, | ||
"index": true, | ||
"sort": true, | ||
"infix": false, | ||
"locale": "", | ||
"stem": false | ||
} | ||
], | ||
"default_sorting_field": "year", | ||
"enable_nested_fields": true, | ||
"symbols_to_index": [], | ||
"token_separators": [] | ||
} |