Support updating existing dictionaries via import

kernc committed Apr 21, 2021
1 parent e491c11 commit 9243dbd
Showing 7 changed files with 88 additions and 16 deletions.
2 changes: 2 additions & 0 deletions app/importing/models.py
@@ -1,6 +1,7 @@
 import sys
 from typing import List, Optional
 
+from bson import ObjectId
 from pydantic import AnyHttpUrl, FilePath, HttpUrl, root_validator, validator
 
 from app.models import BaseModel, Genre, Language, ReleasePolicy, _AutoStrEnum
@@ -25,6 +26,7 @@ class ImportJob(BaseModel):
     file: Optional[FilePath]
     state: JobStatus
     meta: _ImportMeta
+    dict_id: Optional[ObjectId]
 
     @validator('url', 'file')
     def cast_to_str(cls, v):
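A note on the new field: pydantic does not accept `bson.ObjectId` out of the box. A minimal standalone sketch (assuming pydantic v1 semantics; the project's own `BaseModel` config is not shown in this diff) of what makes such a field work:

```python
from typing import Optional

from bson import ObjectId
import pydantic


class JobSketch(pydantic.BaseModel):
    dict_id: Optional[ObjectId]

    class Config:
        # ObjectId is not a pydantic-native type; without this flag,
        # pydantic v1 rejects the field at class-definition time.
        arbitrary_types_allowed = True


job = JobSketch(dict_id=ObjectId('0123456789abcdef01234567'))
assert job.dict_id is not None
assert JobSketch().dict_id is None  # field stays optional
```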
50 changes: 44 additions & 6 deletions app/importing/ops.py
@@ -42,31 +42,50 @@ def _process_one_dict(job_id: str):
         assert response.num_bytes_downloaded == num_bytes_expected
         job.file = filename
 
-        # Parse file
+        # Parse file into dict object
         assert filename
         log.debug('Parse %s from %r', job_id, filename)
         obj = file_to_obj(filename, job.meta.sourceLanguage)
 
         # Transfer properties
         obj['_id'] = job_id
         obj['import_time'] = str(datetime.now())
         # We add job.meta properties on the base object, which in
         # router /about get overridden by meta from the file
         obj.update(job.meta.dict(exclude_none=True, exclude_unset=True))
 
+        # Check if our dict should replace entries from other dict_id
+        dict_id = job.dict_id or job_id
+        if job.dict_id is not None:
+            log.debug('Job %s replaces dict %s', job_id, dict_id)
+            obj['_id'] = dict_id
+
+            old_obj = db.dicts.find_one({'api_key': job.meta.api_key,
+                                         '_id': dict_id},
+                                        {'_id': True})
+            if old_obj is None:
+                raise Exception('E403, forbidden')
+
+            # Transfer entry ids from old dict
+            obj = _transfer_ids(obj, dict_id, db)
+
         # Extract entries separately, assign them dict id
         entries = obj.pop('entries')
         assert entries, 'No entries in dictionary'
-        obj['n_entries'] = len(entries)
         for entry in entries:
-            entry['_dict_id'] = job_id
+            entry['_dict_id'] = dict_id
 
+        obj['n_entries'] = len(entries)
+        log.debug('Insert %s with %d entries', dict_id, len(entries))
+        # Remove previous dict/entries
+        db.entry.delete_many({'_dict_id': dict_id})
+        db.dicts.delete_one({'_id': dict_id})
+
         # Insert dict, entries
-        log.debug('Insert %s with %d entries', job_id, len(entries))
         result = db.entry.insert_many(entries)
         obj['_entries'] = result.inserted_ids  # Inverse of _dict_id
         result = db.dicts.insert_one(obj)
-        assert result.inserted_id == job_id
+        assert result.inserted_id == dict_id
 
         # Mark job done
         db.import_jobs.update_one(
@@ -79,6 +98,25 @@ def _process_one_dict(job_id: str):
         log.exception('Error processing %s', job_id)
         db.import_jobs.update_one(
             {'_id': job_id}, {'$set': {'state': JobStatus.ERROR,
-                                      'error': traceback.format_exc()}})
+                                       'error': traceback.format_exc()}})
+        if settings.UPLOAD_REMOVE_ON_FAILURE and os.path.isfile(filename):
+            os.remove(filename)


+def _transfer_ids(new_obj, old_dict_id, db):
+    def entry_to_key(entry):
+        return (
+            entry['lemma'],
+            entry['partOfSpeech'],
+        )
+
+    old_entries = db.entry.find({'_dict_id': old_dict_id},
+                                {'lemma': True,
+                                 'partOfSpeech': True})
+    old_id_by_key = {entry_to_key(entry): entry['_id']
+                     for entry in old_entries}
+    for entry in new_obj['entries']:
+        id = old_id_by_key.get(entry_to_key(entry))
+        if id is not None:
+            entry['_id'] = id
+    return new_obj
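`_transfer_ids` keys entries by `(lemma, partOfSpeech)` so that re-imported entries keep their old `_id`s, while genuinely new entries get fresh ids on insert. A standalone sketch of that rule, with the Mongo cursor swapped for an in-memory list (hypothetical data):

```python
# Old entries as they would come back from db.entry.find(...):
old_entries = [
    {'_id': 1, 'lemma': 'cat', 'partOfSpeech': 'noun'},
    {'_id': 2, 'lemma': 'purr', 'partOfSpeech': 'verb'},
]
new_obj = {'entries': [
    {'lemma': 'cat', 'partOfSpeech': 'noun'},  # carried over: keeps _id 1
    {'lemma': 'dog', 'partOfSpeech': 'noun'},  # new: no _id, Mongo assigns one
]}

old_id_by_key = {(e['lemma'], e['partOfSpeech']): e['_id'] for e in old_entries}
for entry in new_obj['entries']:
    old_id = old_id_by_key.get((entry['lemma'], entry['partOfSpeech']))
    if old_id is not None:
        entry['_id'] = old_id

assert new_obj['entries'][0]['_id'] == 1
assert '_id' not in new_obj['entries'][1]
```

Note that an entry must match on both lemma and part of speech to keep its id; changing either makes it a new entry from the database's point of view.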
7 changes: 7 additions & 0 deletions app/importing/router.py
@@ -5,6 +5,7 @@
 from queue import SimpleQueue
 from typing import List, Optional
 
+from bson import ObjectId
 from fastapi import APIRouter, Depends, File, HTTPException, Query, UploadFile
 from fastapi.responses import PlainTextResponse
 
@@ -35,6 +36,11 @@ async def dict_import(
         None,
         description='URL of the dictionary to fetch and import. See <em>file=</em>.',
     ),
+    dictionary: Optional[str] = Query(
+        None,
+        description='Id of dictionary to replace.',
+        regex='^[a-f0-9]{24}$',
+    ),
     file: Optional[UploadFile] = File(
         None,
         description='Dictionary file to import. In either OntoLex/Turtle, '
@@ -70,6 +76,7 @@ async def dict_import(
     job = ImportJob(
         url=url,
         file=upload_path,
+        dict_id=dictionary and ObjectId(dictionary),
         state=JobStatus.SCHEDULED,
         meta=dict(
             release=release,
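Grounded in the test added below, a hedged client-side sketch of re-importing over an existing dictionary (the host URL and the id are placeholders; `api_key` and `release` values mirror the tests):

```python
import httpx

DICT_ID = '0123456789abcdef01234567'  # id of the dictionary to replace

with open('example.ttl', 'rb') as fd:
    response = httpx.post(
        'http://localhost:8000/import',  # assumed local deployment
        files={'file': fd},
        params={'release': 'PUBLIC',
                'api_key': 'test',
                'dictionary': DICT_ID})
assert response.status_code == 201  # CREATED; processing continues in background
```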
2 changes: 1 addition & 1 deletion app/router.py
@@ -19,7 +19,7 @@
 _DICT_PATH: str = Path(
     ...,
     description='Dictionary id.',
-    regex=r'[a-f\d]{24}',
+    regex=r'^[a-f\d]{24}$',
 )
 _OFFSET_QUERY = Query(0, ge=0)
 _LIMIT_QUERY = Query(1_000_000, ge=1)
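The `^…$` anchors matter because an unanchored pattern accepts any value that merely contains a 24-hex-digit run. A quick standalone `re` demonstration of the difference (not the FastAPI validation internals):

```python
import re

OLD = r'[a-f\d]{24}'    # unanchored
NEW = r'^[a-f\d]{24}$'  # anchored: whole value must be the 24-hex id

value = 'deadbeefdeadbeefdeadbeef/extra'
assert re.search(OLD, value)      # old pattern: trailing junk accepted
assert not re.search(NEW, value)  # new pattern: rejected
assert re.search(NEW, 'deadbeefdeadbeefdeadbeef')
```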
8 changes: 8 additions & 0 deletions tests/conftest.py
@@ -109,3 +109,11 @@ async def example_entry_ids(example_id):
     cursor = db.entry.find({'_dict_id': ObjectId(example_id)}, {'_id': True})
     ids = [str(i['_id']) for i in cursor]
     return ids
+
+
+@pytest.fixture(scope='module')
+async def entry_id(client, example_id):
+    response = await client.get(f'/lemma/{example_id}/cat',
+                                params={'offset': 0, 'limit': 1})
+    entry_id = response.json()[0]['id']
+    return entry_id
27 changes: 26 additions & 1 deletion tests/test_importing.py
@@ -1,8 +1,10 @@
+import asyncio
 from http import HTTPStatus
+from io import BytesIO
 
 import pytest
 
-from tests.conftest import verify_upload
+from tests.conftest import EXAMPLE_DIR, verify_upload
 
 pytestmark = pytest.mark.asyncio

@@ -32,3 +34,26 @@ async def test_from_url(client, example_file, httpserver):
         })
     assert response.status_code == HTTPStatus.CREATED
     await verify_upload(client, response.text)
+
+
+async def test_replace_dict(client, example_id, entry_id):
+    with open(EXAMPLE_DIR / 'example.ttl', 'rb') as fd:
+        text = fd.read()
+    fd = BytesIO(text.replace(b'type of animal', b'lalala'))
+
+    response = await client.post(
+        "/import",
+        files={'file': fd},
+        params={
+            'release': 'PUBLIC',
+            'api_key': 'test',
+            'dictionary': example_id,
+        })
+    assert response.status_code == HTTPStatus.CREATED
+    await asyncio.sleep(1)  # Wait for the background import of the replacement
+    import time
+    time.sleep(1)           # and give it a little longer to settle
+    await verify_upload(client, example_id)
+
+    response = await client.get(f'/json/{example_id}/{entry_id}')
+    assert 'lalala' in str(response.read())
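The fixed sleeps in `test_replace_dict` make the test timing-sensitive. A hedged alternative is to poll until the background import settles; a generic helper sketch, not part of this repository:

```python
import asyncio


async def eventually(check, timeout=10.0, interval=0.25):
    """Re-run an async `check` until it stops raising or `timeout` expires."""
    loop = asyncio.get_event_loop()
    deadline = loop.time() + timeout
    while True:
        try:
            return await check()
        except AssertionError:
            if loop.time() >= deadline:
                raise
            await asyncio.sleep(interval)

# Usage, assuming verify_upload raises AssertionError until the import lands:
#     await eventually(lambda: verify_upload(client, example_id))
```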
8 changes: 0 additions & 8 deletions tests/test_router_rest.py
@@ -44,14 +44,6 @@ async def test_lemma(client, example_id):
     assert 'json' in obj[0]['formats']
 
 
-@pytest.fixture(scope='module')
-async def entry_id(client, example_id):
-    response = await client.get(f'/lemma/{example_id}/cat',
-                                params={'offset': 0, 'limit': 1})
-    entry_id = response.json()[0]['id']
-    return entry_id
-
-
 async def test_entry_tei(client, example_id, entry_id):
     response = await client.get(f'/tei/{example_id}/{entry_id}')
     assert 'text/xml' in response.headers['content-type']