Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ngram endpoints #458

Open
wants to merge 30 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
ab60932
add ngram utility functions
fsimonjetz Sep 21, 2023
8fdf215
add ngram global
fsimonjetz Sep 21, 2023
c7a7dad
add update_ngrams method
fsimonjetz Sep 21, 2023
93a1e4e
add helper functions for ngram tests
fsimonjetz Sep 21, 2023
0c06bd9
add tests for fragment ngrams
fsimonjetz Sep 21, 2023
7370db2
simplify function
fsimonjetz Sep 22, 2023
22a43ed
refactoring
fsimonjetz Sep 22, 2023
4d7f7c9
add _update_ngrams
fsimonjetz Sep 22, 2023
be0eb35
allow ngram field
fsimonjetz Sep 22, 2023
bdeeecd
add chapter ngram tests
fsimonjetz Sep 22, 2023
532555f
update importer to compute ngrams
fsimonjetz Sep 25, 2023
e6cba94
refactoring
fsimonjetz Sep 25, 2023
d4e7733
add NgramMatchResource
fsimonjetz Sep 25, 2023
24cde92
refactoring
fsimonjetz Sep 25, 2023
ba8e399
add text repo
fsimonjetz Sep 25, 2023
d5b74bc
move aggregate ngrams into TextRepo
fsimonjetz Sep 25, 2023
87cb373
add get_ngrams
fsimonjetz Sep 25, 2023
73c00b5
refactor
fsimonjetz Sep 25, 2023
bb3cfd9
add test_match_fragment_ngrams
fsimonjetz Sep 25, 2023
7142502
include textName in ngram output,
fsimonjetz Sep 25, 2023
562051f
add test_get_ngrams
fsimonjetz Sep 26, 2023
8b571cc
add test_aggregate_ngram_overlaps
fsimonjetz Sep 26, 2023
a6df055
add ChapterNgramScore and schema
fsimonjetz Sep 26, 2023
5c3ced2
fix typo
fsimonjetz Sep 26, 2023
f072015
bug fix
fsimonjetz Sep 26, 2023
f4e7f2a
reformat
fsimonjetz Sep 26, 2023
e9d5449
add None sign handling
fsimonjetz Sep 26, 2023
49f1ad2
remove debug output
fsimonjetz Sep 26, 2023
0323d48
add batch ngram update script
fsimonjetz Sep 26, 2023
f7e79d7
add fragment batch ngram update
fsimonjetz Sep 26, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ebl/common/infrastructure/ngrams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
NGRAM_N_VALUES = [1, 2, 3]
46 changes: 42 additions & 4 deletions ebl/common/query/util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Union, Dict
from typing import Union, Dict, Sequence


def flatten_field(input_: Union[str, Dict], depth=1) -> Dict:
Expand All @@ -15,9 +15,9 @@ def drop_duplicates(input_: Union[str, Dict]) -> Dict:
return {"$setUnion": [input_, []]}


def ngrams(input_: Union[str, Dict], n) -> Dict:
if n <= 1:
raise ValueError("ngram size must be 2 or more")
def ngrams(input_: Union[str, Dict], n: int) -> Dict:
if n <= 0:
raise ValueError("ngram size must be 1 or more")
return {
"$zip": {
"inputs": [
Expand All @@ -39,3 +39,41 @@ def ngrams(input_: Union[str, Dict], n) -> Dict:

def filter_array(input_, as_, cond) -> Dict:
    """Build a MongoDB ``$filter`` expression.

    ``input_`` is the array expression to filter, ``as_`` is the variable
    name bound to each element, and ``cond`` is the condition an element
    must satisfy to be kept.
    """
    specification = {"input": input_, "as": as_, "cond": cond}
    return {"$filter": specification}


def extract_ngrams(
    input_: Union[str, Dict],
    N: Sequence[int],
):
    """Build an expression yielding the distinct n-grams of ``input_``.

    For every positive size in ``N`` the n-grams of ``input_`` are built with
    ``ngrams`` and concatenated into one array. N-grams that contain ``"X"``
    or an empty string are filtered out, and the remaining array is passed
    through ``drop_duplicates``. Non-positive sizes in ``N`` are skipped.
    """
    excluded_signs = ["X", ""]

    # An n-gram is kept only when it shares no element with the excluded signs.
    shared_with_excluded = {"$setIntersection": ["$$this", excluded_signs]}
    contains_no_excluded_sign = {"$eq": [{"$size": shared_with_excluded}, 0]}

    ngrams_of_all_sizes = {
        "$concatArrays": [ngrams(input_, size) for size in N if size > 0]
    }
    kept_ngrams = filter_array(
        ngrams_of_all_sizes, "this", contains_no_excluded_sign
    )
    return drop_duplicates(kept_ngrams)


def replace_all(input_: Union[str, Dict], old: str, new: str):
    """Build a MongoDB ``$replaceAll`` expression.

    The expression replaces every occurrence of ``old`` in the string
    produced by ``input_`` with ``new``.
    """
    arguments = dict(input=input_, find=old, replacement=new)
    return {"$replaceAll": arguments}
4 changes: 4 additions & 0 deletions ebl/corpus/application/schemas.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from ebl.corpus.domain.provenance import Provenance
from marshmallow import (
EXCLUDE,
Schema,
ValidationError,
fields,
Expand Down Expand Up @@ -260,6 +261,9 @@ class DictionaryLinePaginationSchema(Schema):


class ChapterSchema(Schema):
class Meta:
unknown = EXCLUDE

text_id = fields.Nested(TextIdSchema, required=True, data_key="textId")
classification = ValueEnumField(Classification, required=True)
stage = ValueEnumField(Stage, required=True)
Expand Down
6 changes: 6 additions & 0 deletions ebl/corpus/application/text_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,9 @@
@abstractmethod
def query(self, query: dict) -> CorpusQueryResult:
    """Run ``query`` and return the matching corpus query result.

    Abstract: concrete repositories provide the implementation.
    """

@abstractmethod
def aggregate_ngram_overlaps(
    self, ngrams: Sequence[Sequence[str]], limit: Optional[int] = None
) -> Sequence[dict]:
    """Return overlap results between ``ngrams`` and the stored chapters.

    Abstract: concrete repositories provide the implementation.

    Args:
        ngrams: The n-grams to compare against, each a sequence of signs.
        limit: Optional maximum number of results to return.
    """
Dismissed Show dismissed Hide dismissed
74 changes: 74 additions & 0 deletions ebl/corpus/infrastructure/mongo_text_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,15 @@


from ebl.bibliography.infrastructure.bibliography import join_reference_documents
from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES
from ebl.common.query.query_result import CorpusQueryResult
from ebl.common.query.query_schemas import CorpusQueryResultSchema
from ebl.common.query.util import (
drop_duplicates,
extract_ngrams,
flatten_field,
replace_all,
)
from ebl.corpus.application.text_repository import TextRepository
from ebl.corpus.application.display_schemas import ChapterDisplaySchema
from ebl.corpus.application.schemas import (
Expand Down Expand Up @@ -109,6 +116,7 @@ def create(self, text: Text) -> None:

def create_chapter(self, chapter: Chapter) -> None:
    """Insert ``chapter`` into the chapters collection and update its n-grams."""
    document = ChapterSchema().dump(chapter)
    self._chapters.insert_one(document)
    # The n-grams are computed from the stored document, so this must run
    # after the insert.
    self._update_ngrams(chapter.id_)

def find(self, id_: TextId) -> Text:
try:
Expand Down Expand Up @@ -243,6 +251,7 @@ def update(self, id_: ChapterId, chapter: Chapter) -> None:
).dump(chapter)
},
)
self._update_ngrams(id_)

def query_by_transliteration(
self, query: TransliterationQuery, pagination_index: int
Expand Down Expand Up @@ -449,3 +458,68 @@ def query_corpus_by_manuscript(
]
)
return ManuscriptAttestationSchema().load(cursor, many=True)

def _update_ngrams(self, id_: ChapterId) -> None:
    """Recompute and store the ``ngrams`` field of the chapter ``id_``.

    Each entry of the chapter's ``signs`` array has its line breaks replaced
    by a ``#`` token and is split on spaces. N-grams of the sizes in
    ``NGRAM_N_VALUES`` are extracted from every entry; the per-entry results
    are flattened into one array and de-duplicated.
    """
    # TODO(review): needs to exclude colophons (reviewer note on this method).
    tokens_of_entry = {"$split": [replace_all("$$this", "\n", " # "), " "]}
    ngrams_per_entry = {
        "$map": {
            "input": "$signs",
            "in": extract_ngrams(tokens_of_entry, NGRAM_N_VALUES),
        }
    }
    merged_ngrams = drop_duplicates(flatten_field(ngrams_per_entry))

    # The update is given as a pipeline so the new value can be computed
    # from the document's own fields.
    self._chapters.update_one(
        chapter_id_query(id_),
        [{"$set": {"ngrams": merged_ngrams}}],
    )

def aggregate_ngram_overlaps(
    self, ngrams: Sequence[Sequence[str]], limit: Optional[int] = None
) -> Sequence[dict]:
    """Rank chapters by how much their n-grams overlap with ``ngrams``.

    A chapter's overlap is the number of n-grams it shares with ``ngrams``
    divided by the size of the smaller of the two n-gram collections. It is
    ``0.0`` when either collection is empty. Chapters whose
    ``textId.category`` equals 99 are excluded.

    Args:
        ngrams: The n-grams to compare the stored chapter n-grams against.
        limit: Maximum number of results. A falsy value (``None`` or ``0``)
            returns all chapters.

    Returns:
        Documents with ``textId``, ``name``, ``stage`` and ``overlap``,
        sorted by descending ``overlap``.
    """
    ngram_list = list(ngrams)
    # A chapter without an "ngrams" field would make $size raise and abort
    # the whole aggregation; treat a missing/null field as an empty array so
    # such chapters simply score 0.0.
    chapter_ngrams = {"$ifNull": ["$ngrams", []]}

    overlap = {
        "$let": {
            "vars": {
                "intersection": {
                    "$size": {"$setIntersection": [chapter_ngrams, ngram_list]}
                },
                "minLength": {
                    "$min": [
                        {"$size": chapter_ngrams},
                        # Wrapped in a list so the literal array is passed
                        # as the single argument of $size rather than being
                        # read as an argument list.
                        {"$size": [ngram_list]},
                    ]
                },
            },
            "in": {
                "$cond": [
                    # Guard against division by zero for empty n-gram sets.
                    {"$eq": ["$$minLength", 0]},
                    0.0,
                    {"$divide": ["$$intersection", "$$minLength"]},
                ]
            },
        }
    }
    pipeline: List[dict] = [
        {"$match": {"textId.category": {"$ne": 99}}},
        {
            "$project": {
                "_id": 0,
                "textId": 1,
                "name": 1,
                "stage": 1,
                "overlap": overlap,
            }
        },
        {"$sort": {"overlap": -1}},
    ]

    if limit:
        pipeline.append({"$limit": limit})

    return list(self._chapters.aggregate(pipeline))
4 changes: 4 additions & 0 deletions ebl/fragmentarium/application/fragment_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,7 @@
@abstractmethod
def list_all_fragments(self) -> Sequence[str]:
    """Return a sequence of strings identifying all fragments.

    Abstract: concrete repositories provide the implementation.
    """

@abstractmethod
def get_ngrams(self, number: MuseumNumber) -> Sequence[Sequence[str]]:
    """Return the n-grams of the fragment with museum number ``number``.

    Abstract: concrete repositories provide the implementation.
    """
Dismissed Show dismissed Hide dismissed
39 changes: 29 additions & 10 deletions ebl/fragmentarium/infrastructure/mongo_fragment_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@

from ebl.bibliography.infrastructure.bibliography import join_reference_documents
from ebl.common.domain.scopes import Scope
from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES
from ebl.common.query.query_result import QueryResult
from ebl.common.query.query_schemas import QueryResultSchema
from ebl.common.query.util import extract_ngrams, replace_all
from ebl.errors import NotFoundError
from ebl.fragmentarium.application.fragment_info_schema import FragmentInfoSchema
from ebl.fragmentarium.application.fragment_repository import FragmentRepository
Expand Down Expand Up @@ -103,13 +105,16 @@ def count_lines(self):
return 0

def create(self, fragment, sort_key=None):
    """Insert ``fragment`` and compute its n-grams.

    The document is stored under the string form of the fragment's museum
    number, without the ``joins`` field. ``_sortKey`` is only written when
    ``sort_key`` is given.

    Returns:
        The value returned by the collection's ``insert_one``.
    """
    document = {
        "_id": str(fragment.number),
        **FragmentSchema(exclude=["joins"]).dump(fragment),
        **({} if sort_key is None else {"_sortKey": sort_key}),
    }
    # Keep the insert result instead of returning right away: the n-grams
    # are computed from the stored document, so the update has to run after
    # the insert and before the result is handed back.
    id_ = self._fragments.insert_one(document)
    self._update_ngrams(fragment.number)

    return id_

def create_many(self, fragments: Sequence[Fragment]) -> Sequence[str]:
schema = FragmentSchema(exclude=["joins"])
Expand Down Expand Up @@ -295,6 +300,9 @@ def update_field(self, field, fragment):
{"$set": query if query else {field: None}},
)

if field == "transliteration":
self._update_ngrams(fragment.number)

def query_next_and_previous_folio(self, folio_name, folio_number, number):
sort_ascending = {"$sort": {"key": 1}}
sort_descending = {"$sort": {"key": -1}}
Expand Down Expand Up @@ -342,15 +350,6 @@ def get_numbers(pipeline):
else:
return result

def query_museum_numbers(self, prefix: str, number_regex: str) -> Sequence[dict]:
    """Find fragments by museum number prefix and a regex on the number part.

    Only the ``museumNumber`` field is requested in the projection.
    """
    criteria = {
        "museumNumber.prefix": prefix,
        "museumNumber.number": {"$regex": number_regex},
    }
    return self._fragments.find_many(criteria, projection={"museumNumber": True})

def query_by_sort_key(self, key: int) -> MuseumNumber:
if key < 0:
last_fragment = next(
Expand Down Expand Up @@ -423,3 +422,23 @@ def list_all_fragments(
return list(
self._fragments.get_all_values("_id", match_user_scopes(user_scopes))
)

def _update_ngrams(self, number: MuseumNumber):
    """Recompute and store the ``ngrams`` field of the fragment ``number``.

    The fragment's ``signs`` string has its line breaks replaced by a ``#``
    token and is split on spaces. N-grams of the sizes in ``NGRAM_N_VALUES``
    are then extracted from the resulting tokens.
    """
    tokens = {"$split": [replace_all("$signs", "\n", " # "), " "]}
    set_ngrams = {"$set": {"ngrams": extract_ngrams(tokens, NGRAM_N_VALUES)}}

    # The update is given as a pipeline so the new value can be computed
    # from the document's own fields.
    self._fragments.update_one(museum_number_is(number), [set_ngrams])

def get_ngrams(self, number: MuseumNumber) -> Sequence[Sequence[str]]:
    """Return the stored n-grams of the fragment with museum number ``number``.

    Only the ``ngrams`` field is requested. A fragment whose n-grams have
    not been computed (field missing or null) yields an empty list instead
    of raising ``KeyError``.
    """
    document = self._fragments.find_one(
        museum_number_is(number), projection={"ngrams": True}
    )
    return document.get("ngrams") or []
5 changes: 5 additions & 0 deletions ebl/fragmentarium/web/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from ebl.fragmentarium.web.folio_pager import FolioPagerResource
from ebl.fragmentarium.web.folios import FoliosResource
from ebl.fragmentarium.web.fragment_genre import FragmentGenreResource
from ebl.fragmentarium.web.fragment_ngram_matcher import NgramMatcherResource
from ebl.fragmentarium.web.fragment_script import FragmentScriptResource
from ebl.fragmentarium.web.fragment_date import (
FragmentDateResource,
Expand Down Expand Up @@ -85,6 +86,9 @@ def create_fragmentarium_routes(api: falcon.App, context: Context):
fragment_query = FragmentsQueryResource(
context.fragment_repository, context.get_transliteration_query_factory()
)
ngram_matcher = NgramMatcherResource(
context.fragment_repository, context.text_repository
)
genres = GenresResource()
periods = PeriodsResource()
lemmatization = LemmatizationResource(updater)
Expand Down Expand Up @@ -124,6 +128,7 @@ def create_fragmentarium_routes(api: falcon.App, context: Context):
("/fragments/{number}/annotations", annotations),
("/fragments/{number}/photo", photo),
("/fragments/{number}/corpus", chapters),
("/fragments/{number}/ngrams", ngram_matcher),
("/genres", genres),
("/periods", periods),
("/statistics", statistics),
Expand Down
19 changes: 19 additions & 0 deletions ebl/fragmentarium/web/fragment_ngram_matcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from ebl.corpus.application.text_repository import TextRepository
from ebl.fragmentarium.application.fragment_repository import FragmentRepository
from falcon import Request, Response

from ebl.transliteration.domain.museum_number import MuseumNumber


class NgramMatcherResource:
    """Resource that matches a fragment's n-grams against corpus chapters."""

    def __init__(
        self,
        fragment_repository: FragmentRepository,
        text_repository: TextRepository,
    ):
        """Store the repositories used to look up and compare n-grams."""
        self._text_repository = text_repository
        self._fragment_repository = fragment_repository

    def on_get(self, _req: Request, resp: Response, number: str) -> None:
        """Respond with the chapter overlaps for the fragment ``number``.

        The fragment's n-grams are read from the fragment repository and
        passed to the text repository; its result becomes the response media.
        """
        museum_number = MuseumNumber.of(number)
        fragment_ngrams = self._fragment_repository.get_ngrams(museum_number)
        overlaps = self._text_repository.aggregate_ngram_overlaps(fragment_ngrams)
        resp.media = overlaps
Loading
Loading