Skip to content

Commit

Permalink
add NgramMatchResource
Browse files Browse the repository at this point in the history
  • Loading branch information
fsimonjetz committed Sep 25, 2023
1 parent e6cba94 commit d4e7733
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 0 deletions.
46 changes: 46 additions & 0 deletions ebl/common/application/ngram_matcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from typing import Set, Tuple, Optional, Sequence, List


def aggregate_ngram_overlaps(
    ngrams: Set[Tuple[str, ...]], limit: Optional[int] = None
) -> Sequence[dict]:
    """Build a MongoDB aggregation pipeline that ranks documents by n-gram overlap.

    The pipeline
      1. drops documents whose ``textId.category`` equals 99,
      2. projects ``textId``, ``name``, ``stage`` and a computed ``overlap``:
         the size of the intersection between the document's ``ngrams`` and
         the given ``ngrams``, divided by the size of the smaller of the two
         (0.0 when the smaller one is empty),
      3. sorts by ``overlap`` in descending order,
      4. optionally truncates the result to ``limit`` documents.

    Args:
        ngrams: The n-grams to compare against each document's ``ngrams``.
        limit: Maximum number of documents to return. A falsy value
            (``None`` or ``0``) adds no ``$limit`` stage.

    Returns:
        The list of pipeline stages; nothing is executed here.
    """
    ngram_list = list(ngrams)

    # `$size` raises if its argument is not an array, so a single document
    # with a missing/null `ngrams` field would abort the whole aggregation.
    # Treat such documents as having no n-grams; they end up with overlap 0.0.
    document_ngrams = {"$ifNull": ["$ngrams", []]}

    pipeline: List[dict] = [
        {"$match": {"textId.category": {"$ne": 99}}},
        {
            "$project": {
                "_id": 0,
                "textId": 1,
                "name": 1,
                "stage": 1,
                "overlap": {
                    "$let": {
                        "vars": {
                            "intersection": {
                                "$size": {
                                    "$setIntersection": [
                                        document_ngrams,
                                        ngram_list,
                                    ]
                                }
                            },
                            "minLength": {
                                "$min": [
                                    {"$size": document_ngrams},
                                    # Wrapped in a list so the literal array is
                                    # passed as the operator's single argument
                                    # rather than read as an argument list.
                                    {"$size": [ngram_list]},
                                ]
                            },
                        },
                        "in": {
                            "$cond": [
                                # Avoid dividing by zero when either side is empty.
                                {"$eq": ["$$minLength", 0]},
                                0.0,
                                {"$divide": ["$$intersection", "$$minLength"]},
                            ]
                        },
                    }
                },
            }
        },
        {"$sort": {"overlap": -1}},
    ]

    if limit:
        pipeline.append({"$limit": limit})

    return pipeline
3 changes: 3 additions & 0 deletions ebl/fragmentarium/web/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from ebl.fragmentarium.web.folio_pager import FolioPagerResource
from ebl.fragmentarium.web.folios import FoliosResource
from ebl.fragmentarium.web.fragment_genre import FragmentGenreResource
from ebl.fragmentarium.web.fragment_ngram_matcher import NgramMatchResource
from ebl.fragmentarium.web.fragment_script import FragmentScriptResource
from ebl.fragmentarium.web.fragment_date import (
FragmentDateResource,
Expand Down Expand Up @@ -85,6 +86,7 @@ def create_fragmentarium_routes(api: falcon.App, context: Context):
fragment_query = FragmentsQueryResource(
context.fragment_repository, context.get_transliteration_query_factory()
)
ngram_matcher = NgramMatchResource(context.fragment_repository)
genres = GenresResource()
periods = PeriodsResource()
lemmatization = LemmatizationResource(updater)
Expand Down Expand Up @@ -124,6 +126,7 @@ def create_fragmentarium_routes(api: falcon.App, context: Context):
("/fragments/{number}/annotations", annotations),
("/fragments/{number}/photo", photo),
("/fragments/{number}/corpus", chapters),
("/fragments/{number}/ngrams", ngram_matcher),
("/genres", genres),
("/periods", periods),
("/statistics", statistics),
Expand Down
22 changes: 22 additions & 0 deletions ebl/fragmentarium/web/fragment_ngram_matcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from ebl.common.application.ngram_matcher import aggregate_ngram_overlaps
from ebl.corpus.application.text_repository import TextRepository
from ebl.fragmentarium.application.fragment_repository import FragmentRepository
from falcon import Request, Response

from ebl.transliteration.domain.museum_number import MuseumNumber


class NgramMatchResource:
    """Falcon resource returning n-gram overlap scores for a fragment.

    On GET, the n-grams of the fragment identified by ``number`` are fetched
    from the fragment repository and matched against the chapters held by the
    text repository using the pipeline from ``aggregate_ngram_overlaps``.
    """

    def __init__(
        self,
        fragment_repository: FragmentRepository,
        text_repository: TextRepository,
    ):
        self._fragment_repository = fragment_repository
        self._text_repository = text_repository

    def on_get(self, _req: Request, resp: Response, number: str) -> None:
        """Respond with the aggregation results for the given fragment.

        Args:
            _req: The incoming request (unused).
            resp: The response whose ``media`` receives the result list.
            number: Museum number of the fragment, parsed with
                ``MuseumNumber.of``.
        """
        ngrams = self._fragment_repository.get_ngrams(MuseumNumber.of(number))
        # NOTE(review): this reaches into the repository's private `_chapters`
        # collection; a public query method on TextRepository would be
        # preferable — confirm whether one exists.
        matches = self._text_repository._chapters.aggregate(
            aggregate_ngram_overlaps(ngrams)
        )
        # `aggregate` presumably returns a lazy cursor (as PyMongo does), which
        # the JSON media handler cannot serialise; materialise it into a list.
        resp.media = list(matches)

0 comments on commit d4e7733

Please sign in to comment.