Support updating existing dictionaries via import

kernc committed Apr 21, 2021
1 parent e491c11 commit 9243dbd
Showing 7 changed files with 88 additions and 16 deletions.
2 changes: 2 additions & 0 deletions app/importing/models.py
@@ -1,6 +1,7 @@
 import sys
 from typing import List, Optional
 
+from bson import ObjectId
 from pydantic import AnyHttpUrl, FilePath, HttpUrl, root_validator, validator
 
 from app.models import BaseModel, Genre, Language, ReleasePolicy, _AutoStrEnum
@@ -25,6 +26,7 @@ class ImportJob(BaseModel):
     file: Optional[FilePath]
     state: JobStatus
     meta: _ImportMeta
+    dict_id: Optional[ObjectId]
 
     @validator('url', 'file')
     def cast_to_str(cls, v):
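A note on the new field: pydantic does not accept `bson.ObjectId` out of the box. A minimal standalone sketch (assuming pydantic v1 semantics; the project's own `BaseModel` config is not shown in this diff) of what makes such a field work:

```python
from typing import Optional

from bson import ObjectId
import pydantic


class JobSketch(pydantic.BaseModel):
    dict_id: Optional[ObjectId]

    class Config:
        # ObjectId is not a pydantic-native type; without this flag,
        # pydantic v1 rejects the field at class-definition time.
        arbitrary_types_allowed = True


job = JobSketch(dict_id=ObjectId('0123456789abcdef01234567'))
assert job.dict_id is not None
assert JobSketch().dict_id is None  # field stays optional
```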
50 changes: 44 additions & 6 deletions app/importing/ops.py
@@ -42,31 +42,50 @@ def _process_one_dict(job_id: str):
         assert response.num_bytes_downloaded == num_bytes_expected
         job.file = filename
 
-        # Parse file
+        # Parse file into dict object
         assert filename
         log.debug('Parse %s from %r', job_id, filename)
         obj = file_to_obj(filename, job.meta.sourceLanguage)
 
         # Transfer properties
         obj['_id'] = job_id
         obj['import_time'] = str(datetime.now())
         # We add job.meta properties on the base object, which in
         # router /about get overridden by meta from the file
         obj.update(job.meta.dict(exclude_none=True, exclude_unset=True))
 
+        # Check if our dict should replace entries from other dict_id
+        dict_id = job.dict_id or job_id
+        if job.dict_id is not None:
+            log.debug('Job %s replaces dict %s', job_id, dict_id)
+            obj['_id'] = dict_id
+
+            old_obj = db.dicts.find_one({'api_key': job.meta.api_key,
+                                         '_id': dict_id},
+                                        {'_id': True})
+            if old_obj is None:
+                raise Exception('E403, forbidden')
+
+            # Transfer entry ids from old dict
+            obj = _transfer_ids(obj, dict_id, db)
+
         # Extract entries separately, assign them dict id
         entries = obj.pop('entries')
         assert entries, 'No entries in dictionary'
-        obj['n_entries'] = len(entries)
         for entry in entries:
-            entry['_dict_id'] = job_id
+            entry['_dict_id'] = dict_id
 
+        obj['n_entries'] = len(entries)
+        log.debug('Insert %s with %d entries', dict_id, len(entries))
+        # Remove previous dict/entries
+        db.entry.delete_many({'_dict_id': dict_id})
+        db.dicts.delete_one({'_id': dict_id})
+
         # Insert dict, entries
-        log.debug('Insert %s with %d entries', job_id, len(entries))
         result = db.entry.insert_many(entries)
         obj['_entries'] = result.inserted_ids  # Inverse of _dict_id
         result = db.dicts.insert_one(obj)
-        assert result.inserted_id == job_id
+        assert result.inserted_id == dict_id
 
         # Mark job done
         db.import_jobs.update_one(
@@ -79,6 +98,25 @@ def _process_one_dict(job_id: str):
         log.exception('Error processing %s', job_id)
         db.import_jobs.update_one(
             {'_id': job_id}, {'$set': {'state': JobStatus.ERROR,
-                                      'error': traceback.format_exc()}})
+                                       'error': traceback.format_exc()}})
+        if settings.UPLOAD_REMOVE_ON_FAILURE and os.path.isfile(filename):
+            os.remove(filename)


+def _transfer_ids(new_obj, old_dict_id, db):
+    def entry_to_key(entry):
+        return (
+            entry['lemma'],
+            entry['partOfSpeech'],
+        )
+
+    old_entries = db.entry.find({'_dict_id': old_dict_id},
+                                {'lemma': True,
+                                 'partOfSpeech': True})
+    old_id_by_key = {entry_to_key(entry): entry['_id']
+                     for entry in old_entries}
+    for entry in new_obj['entries']:
+        id = old_id_by_key.get(entry_to_key(entry))
+        if id is not None:
+            entry['_id'] = id
+    return new_obj
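`_transfer_ids` keys entries by `(lemma, partOfSpeech)` so that re-imported entries keep their old `_id`s, while genuinely new entries get fresh ids on insert. A standalone sketch of that rule, with the Mongo cursor swapped for an in-memory list (hypothetical data):

```python
# Old entries as they would come back from db.entry.find(...):
old_entries = [
    {'_id': 1, 'lemma': 'cat', 'partOfSpeech': 'noun'},
    {'_id': 2, 'lemma': 'purr', 'partOfSpeech': 'verb'},
]
new_obj = {'entries': [
    {'lemma': 'cat', 'partOfSpeech': 'noun'},  # carried over: keeps _id 1
    {'lemma': 'dog', 'partOfSpeech': 'noun'},  # new: no _id, Mongo assigns one
]}

old_id_by_key = {(e['lemma'], e['partOfSpeech']): e['_id'] for e in old_entries}
for entry in new_obj['entries']:
    old_id = old_id_by_key.get((entry['lemma'], entry['partOfSpeech']))
    if old_id is not None:
        entry['_id'] = old_id

assert new_obj['entries'][0]['_id'] == 1
assert '_id' not in new_obj['entries'][1]
```

Note that an entry must match on both lemma and part of speech to keep its id; changing either makes it a new entry from the database's point of view.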
7 changes: 7 additions & 0 deletions app/importing/router.py
@@ -5,6 +5,7 @@
 from queue import SimpleQueue
 from typing import List, Optional
 
+from bson import ObjectId
 from fastapi import APIRouter, Depends, File, HTTPException, Query, UploadFile
 from fastapi.responses import PlainTextResponse
 
@@ -35,6 +36,11 @@ async def dict_import(
         None,
         description='URL of the dictionary to fetch and import. See <em>file=</em>.',
     ),
+    dictionary: Optional[str] = Query(
+        None,
+        description='Id of dictionary to replace.',
+        regex='^[a-f0-9]{24}$',
+    ),
     file: Optional[UploadFile] = File(
         None,
         description='Dictionary file to import. In either OntoLex/Turtle, '
@@ -70,6 +76,7 @@ async def dict_import(
     job = ImportJob(
         url=url,
         file=upload_path,
+        dict_id=dictionary and ObjectId(dictionary),
         state=JobStatus.SCHEDULED,
         meta=dict(
             release=release,
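Grounded in the test added below, a hedged client-side sketch of re-importing over an existing dictionary (the host URL and the id are placeholders; `api_key` and `release` values mirror the tests):

```python
import httpx

DICT_ID = '0123456789abcdef01234567'  # id of the dictionary to replace

with open('example.ttl', 'rb') as fd:
    response = httpx.post(
        'http://localhost:8000/import',  # assumed local deployment
        files={'file': fd},
        params={'release': 'PUBLIC',
                'api_key': 'test',
                'dictionary': DICT_ID})
assert response.status_code == 201  # CREATED; processing continues in background
```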
2 changes: 1 addition & 1 deletion app/router.py
@@ -19,7 +19,7 @@
 _DICT_PATH: str = Path(
     ...,
     description='Dictionary id.',
-    regex=r'[a-f\d]{24}',
+    regex=r'^[a-f\d]{24}$',
 )
 _OFFSET_QUERY = Query(0, ge=0)
 _LIMIT_QUERY = Query(1_000_000, ge=1)
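The `^…$` anchors matter because an unanchored pattern accepts any value that merely contains a 24-hex-digit run. A quick standalone `re` demonstration of the difference (not the FastAPI validation internals):

```python
import re

OLD = r'[a-f\d]{24}'    # unanchored
NEW = r'^[a-f\d]{24}$'  # anchored: whole value must be the 24-hex id

value = 'deadbeefdeadbeefdeadbeef/extra'
assert re.search(OLD, value)      # old pattern: trailing junk accepted
assert not re.search(NEW, value)  # new pattern: rejected
assert re.search(NEW, 'deadbeefdeadbeefdeadbeef')
```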
8 changes: 8 additions & 0 deletions tests/conftest.py
@@ -109,3 +109,11 @@ async def example_entry_ids(example_id):
     cursor = db.entry.find({'_dict_id': ObjectId(example_id)}, {'_id': True})
     ids = [str(i['_id']) for i in cursor]
     return ids
+
+
+@pytest.fixture(scope='module')
+async def entry_id(client, example_id):
+    response = await client.get(f'/lemma/{example_id}/cat',
+                                params={'offset': 0, 'limit': 1})
+    entry_id = response.json()[0]['id']
+    return entry_id
27 changes: 26 additions & 1 deletion tests/test_importing.py
@@ -1,8 +1,10 @@
+import asyncio
 from http import HTTPStatus
+from io import BytesIO
 
 import pytest
 
-from tests.conftest import verify_upload
+from tests.conftest import EXAMPLE_DIR, verify_upload
 
 pytestmark = pytest.mark.asyncio

@@ -32,3 +34,26 @@ async def test_from_url(client, example_file, httpserver):
         })
     assert response.status_code == HTTPStatus.CREATED
     await verify_upload(client, response.text)
+
+
+async def test_replace_dict(client, example_id, entry_id):
+    with open(EXAMPLE_DIR / 'example.ttl', 'rb') as fd:
+        text = fd.read()
+    fd = BytesIO(text.replace(b'type of animal', b'lalala'))
+
+    response = await client.post(
+        "/import",
+        files={'file': fd},
+        params={
+            'release': 'PUBLIC',
+            'api_key': 'test',
+            'dictionary': example_id,
+        })
+    assert response.status_code == HTTPStatus.CREATED
+    await asyncio.sleep(1)  # Wait for the background import of the replacement
+    import time
+    time.sleep(1)           # and give it a little longer to settle
+    await verify_upload(client, example_id)
+
+    response = await client.get(f'/json/{example_id}/{entry_id}')
+    assert 'lalala' in str(response.read())
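The fixed sleeps in `test_replace_dict` make the test timing-sensitive. A hedged alternative is to poll until the background import settles; a generic helper sketch, not part of this repository:

```python
import asyncio


async def eventually(check, timeout=10.0, interval=0.25):
    """Re-run an async `check` until it stops raising or `timeout` expires."""
    loop = asyncio.get_event_loop()
    deadline = loop.time() + timeout
    while True:
        try:
            return await check()
        except AssertionError:
            if loop.time() >= deadline:
                raise
            await asyncio.sleep(interval)

# Usage, assuming verify_upload raises AssertionError until the import lands:
#     await eventually(lambda: verify_upload(client, example_id))
```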
8 changes: 0 additions & 8 deletions tests/test_router_rest.py
@@ -44,14 +44,6 @@ async def test_lemma(client, example_id):
     assert 'json' in obj[0]['formats']
 
 
-@pytest.fixture(scope='module')
-async def entry_id(client, example_id):
-    response = await client.get(f'/lemma/{example_id}/cat',
-                                params={'offset': 0, 'limit': 1})
-    entry_id = response.json()[0]['id']
-    return entry_id
-
-
 async def test_entry_tei(client, example_id, entry_id):
     response = await client.get(f'/tei/{example_id}/{entry_id}')
     assert 'text/xml' in response.headers['content-type']