Skip to content

Commit

Permalink
Merge pull request #90 from arthur-schnitzler/main
Browse files Browse the repository at this point in the history
code updates
  • Loading branch information
csae8092 authored Oct 7, 2024
2 parents cbe4137 + e512ef2 commit 72902e1
Show file tree
Hide file tree
Showing 5 changed files with 279 additions and 2 deletions.
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,24 @@ lightweight APIS-PMB instance to generate TEI/XML dumps

maybe in future also with PMB CRUD included

## install

* clone the repo
* create a virtual env `python -m venv venv`
* activate it `source venv/bin/activate`
* install packages `pip install -r requirements_dev.txt`
* get a recent db-dump (`./dump_and_restore.sh`) (create a postgres-db `pmb` and run `./dump_and_restore.sh`, make sure you tunneled to production db first)
* set env variables
* run `python manage.py runserver`
* (create a superuser) `python manage.py creatsuperuser`



## jupyter notebook
* `pip install notebook` (1 time)
* set env varibles `./set_env_variables`
* `python manage.py shell_plus --lab`


## cheat-sheet

Expand Down
145 changes: 145 additions & 0 deletions issue__209_insel_import.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "e1d2654a-09a9-4cef-b833-88e386bb6b17",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from tqdm import tqdm\n",
"from django.core.exceptions import ObjectDoesNotExist\n",
"from acdh_tei_pyutils.tei import TeiReader\n",
"from acdh_tei_pyutils.utils import nsmap, normalize_string"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "19f52165-9dea-45c8-b48b-1e52e844648a",
"metadata": {},
"outputs": [],
"source": [
"doc = TeiReader(\"https://raw.githubusercontent.com/csae8092/insel-data-for-pmb/main/schoeffling_anhang_pmb.xml\")\n",
"tmp_file = \"hansi.xml\"\n",
"domain = \"insel\"\n",
"col, _ = Collection.objects.get_or_create(name=\"Die Insel\")\n",
"wrote = PersonWorkRelation.objects.get(name=\"hat geschaffen\")\n",
"translated = PersonWorkRelation.objects.get(name=\"hat übersetzt\")\n",
"label_type = LabelType.objects.get(name=\"Werk_Bibliografische-Angabe\")\n",
"work_work_type = WorkWorkRelation.objects.get(name=\"enthält\")\n",
"insel = Work.objects.get(id=\"40598\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ca2c4418-9461-43ce-98a6-3ae3e34bbbcb",
"metadata": {},
"outputs": [],
"source": [
"for i, x in enumerate(doc.any_xpath(\".//tei:biblStruct[.//tei:author]\"), start=1):\n",
" insel_id = f\"insel-werk_{i:04}\"\n",
" x.attrib[\"key\"] = insel_id\n",
"doc.tree_to_file(tmp_file)\n",
"doc = TeiReader(tmp_file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1246f0c8-132a-4ee1-8b02-8782773eb981",
"metadata": {},
"outputs": [],
"source": [
"for i, x in tqdm(enumerate(doc.any_xpath(\".//tei:biblStruct[.//tei:author]\"))):\n",
" werk_id = x.attrib[\"key\"]\n",
" werk_uri = f\"https://die-insel.foo.bar/{werk_id}\"\n",
" title = normalize_string(\" \".join(x.xpath(\".//tei:title[1]//text()\", namespaces=nsmap)))\n",
" authors = set()\n",
" for a in x.xpath(\".//tei:author/@ref\", namespaces=nsmap):\n",
" authors.add(a[4:])\n",
" translators = set()\n",
" for a in x.xpath(\".//tei:title//tei:persName[@role='translator']/@ref\", namespaces=nsmap):\n",
" translators.add(a[4:])\n",
" labels = []\n",
" for label in x.xpath(\".//tei:monogr\", namespaces=nsmap):\n",
" jg = label.xpath(\".//tei:biblScope[@unit='jg']/text()\", namespaces=nsmap)[0]\n",
" volume = label.xpath(\".//tei:biblScope[@unit='volume']/text()\", namespaces=nsmap)[0]\n",
" issue = label.xpath(\".//tei:biblScope[@unit='issue']/text()\", namespaces=nsmap)[0]\n",
" page = label.xpath(\".//tei:biblScope[@unit='page']/text()\", namespaces=nsmap)[0]\n",
" bibl_full = f\"Die Insel, Jg. {jg}, Bd. {volume}, Nr. {issue}, S. {page}\"\n",
" labels.append(bibl_full)\n",
" try:\n",
" start_year = x.xpath(\".//tei:biblScope[@unit='publ-year']\", namespaces=nsmap)[0].text\n",
" end_year = x.xpath(\".//tei:biblScope[@unit='publ-year']\", namespaces=nsmap)[-1].text\n",
" except IndexError:\n",
" start_year, end_year = False, False\n",
" try:\n",
" start_month = x.xpath(\".//tei:biblScope[@unit='publ-month']\", namespaces=nsmap)[0].text.split(\"/\")[0]\n",
" end_month = x.xpath(\".//tei:biblScope[@unit='publ-month']\", namespaces=nsmap)[-1].text.split(\"/\")[0]\n",
" except IndexError:\n",
" start_month, end_month = False, False\n",
" try:\n",
" uri = Uri.objects.get(uri=werk_uri)\n",
" entity = Work.objects.get(id=uri.entity.id)\n",
" except (ObjectDoesNotExist, AttributeError):\n",
" entity = False\n",
" if not entity:\n",
" uri, _ = Uri.objects.get_or_create(uri=werk_uri, domain=domain)\n",
" entity = Work.objects.create(name=title[:250])\n",
" entity.collection.add(col)\n",
" uri.entity = entity\n",
" # Werk_Bibliografische-Angabe\n",
" for l in labels:\n",
" label, _ = Label.objects.get_or_create(label=l, label_type=label_type, temp_entity=entity)\n",
" for item in authors:\n",
" person = Person.objects.get(id=item)\n",
" author_rel = PersonWork.objects.get_or_create(related_person=person, related_work=entity, relation_type=wrote)\n",
" for item in translators:\n",
" person = Person.objects.get(id=item)\n",
" author_rel = PersonWork.objects.get_or_create(related_person=person, related_work=entity, relation_type=translated)\n",
" # link to \"Die Insel\"\n",
" WorkWork.objects.get_or_create(related_worka=insel, related_workb=entity, relation_type=work_work_type)\n",
" if start_year and start_month:\n",
" start_date_written = f\"{start_year}-{start_month:0>2}\"\n",
" entity.start_date_written = start_date_written\n",
" if end_year and end_month:\n",
" end_date_written = f\"{end_year}-{end_month:0>2}\"\n",
" entity.end_date_written = end_date_written\n",
" entity.save()\n",
" uri.save()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f80a800d-63fc-4bc3-bcfb-8860cb91d8dd",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Django Shell-Plus",
"language": "python",
"name": "django_extensions"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
114 changes: 114 additions & 0 deletions notebooks/issue__212_aufenthaltsorte.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "f37d96db-8052-4bba-964f-ba644e57c398",
"metadata": {},
"outputs": [],
"source": [
"# run against production 2024-10-03\n",
"from django.core.exceptions import ObjectDoesNotExist\n",
"from tqdm import tqdm\n",
"from dumper.utils import gsheet_to_df\n",
"from apis_core.utils import get_object_from_pk_or_uri"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4cdb69b2-5f6a-4536-8723-c970d2e462fe",
"metadata": {},
"outputs": [],
"source": [
"relation_type = PersonPlaceRelation.objects.get(id=1181)\n",
"relation_type"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8f5018a7-b945-4c76-896b-cc39fd63715d",
"metadata": {},
"outputs": [],
"source": [
"df = gsheet_to_df(\"13ZccvGh6nZwL4h8ZKn8W9COGtRjVVZT6gu4EtyW9Bi8\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "021d8f39-c172-47cf-b0c2-51afbddc2a8a",
"metadata": {},
"outputs": [],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b272dfc-9558-4f13-a956-7de4b665336b",
"metadata": {},
"outputs": [],
"source": [
"no_places = set()\n",
"for g, ndf in df.groupby(\"source_id\"):\n",
" person = Person.objects.get(id=g)\n",
" print(person)\n",
" for i, row in tqdm(ndf.iterrows()):\n",
" try:\n",
" place = Place.objects.get(id=row[\"target_id\"])\n",
" except ObjectDoesNotExist:\n",
" no_places.add(row[\"target_id\"])\n",
" continue\n",
" rel, _ = PersonPlace.objects.get_or_create(\n",
" related_person=person,\n",
" related_place=place,\n",
" relation_type=relation_type,\n",
" start_date_written=row[\"relation_start_date_written\"],\n",
" end_date_written=row[\"relation_end_date_written\"]\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e43ed78-abd3-40cf-b2c0-6975495b98b8",
"metadata": {},
"outputs": [],
"source": [
"print(no_places)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7954d880-2ff4-4f5e-861c-28e9cf110e88",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Django Shell-Plus",
"language": "python",
"name": "django_extensions"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ django-vite>=3.0.3,<4
djangorestframework
pandas
pylobid
psycopg2
psycopg2-binary==2.9.9
pyocclient==0.6
networkx>=3.2.1,<4
icecream
Expand Down
2 changes: 1 addition & 1 deletion templates/robots.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
User-agent: *
Crawl-Delay: 2000
Disallow: /

0 comments on commit 72902e1

Please sign in to comment.