Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ngram endpoints #458

Open
wants to merge 30 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
ab60932
add ngram utility functions
fsimonjetz Sep 21, 2023
8fdf215
add ngram global
fsimonjetz Sep 21, 2023
c7a7dad
add update_ngrams method
fsimonjetz Sep 21, 2023
93a1e4e
add helper functions for ngram tests
fsimonjetz Sep 21, 2023
0c06bd9
add tests for fragment ngrams
fsimonjetz Sep 21, 2023
7370db2
simplify function
fsimonjetz Sep 22, 2023
22a43ed
refactoring
fsimonjetz Sep 22, 2023
4d7f7c9
add _update_ngrams
fsimonjetz Sep 22, 2023
be0eb35
allow ngram field
fsimonjetz Sep 22, 2023
bdeeecd
add chapter ngram tests
fsimonjetz Sep 22, 2023
532555f
update importer to compute ngrams
fsimonjetz Sep 25, 2023
e6cba94
refactoring
fsimonjetz Sep 25, 2023
d4e7733
add NgramMatchResource
fsimonjetz Sep 25, 2023
24cde92
refactoring
fsimonjetz Sep 25, 2023
ba8e399
add text repo
fsimonjetz Sep 25, 2023
d5b74bc
move aggregate ngrams into TextRepo
fsimonjetz Sep 25, 2023
87cb373
add get_ngrams
fsimonjetz Sep 25, 2023
73c00b5
refactor
fsimonjetz Sep 25, 2023
bb3cfd9
add test_match_fragment_ngrams
fsimonjetz Sep 25, 2023
7142502
include textName in ngram output
fsimonjetz Sep 25, 2023
562051f
add test_get_ngrams
fsimonjetz Sep 26, 2023
8b571cc
add test_aggregate_ngram_overlaps
fsimonjetz Sep 26, 2023
a6df055
add ChapterNgramScore and schema
fsimonjetz Sep 26, 2023
5c3ced2
fix typo
fsimonjetz Sep 26, 2023
f072015
bug fix
fsimonjetz Sep 26, 2023
f4e7f2a
reformat
fsimonjetz Sep 26, 2023
e9d5449
add None sign handling
fsimonjetz Sep 26, 2023
49f1ad2
remove debug output
fsimonjetz Sep 26, 2023
0323d48
add batch ngram update script
fsimonjetz Sep 26, 2023
f7e79d7
add fragment batch ngram update
fsimonjetz Sep 26, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ebl/common/infrastructure/ngrams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# N-gram sizes that are pre-computed and stored (uni-, bi-, and trigrams).
NGRAM_N_VALUES = [1, 2, 3]
46 changes: 42 additions & 4 deletions ebl/common/query/util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Union, Dict
from typing import Union, Dict, Sequence


def flatten_field(input_: Union[str, Dict], depth=1) -> Dict:
Expand All @@ -15,9 +15,9 @@ def drop_duplicates(input_: Union[str, Dict]) -> Dict:
return {"$setUnion": [input_, []]}


def ngrams(input_: Union[str, Dict], n) -> Dict:
if n <= 1:
raise ValueError("ngram size must be 2 or more")
def ngrams(input_: Union[str, Dict], n: int) -> Dict:
if n <= 0:
raise ValueError("ngram size must be 1 or more")
return {
"$zip": {
"inputs": [
Expand All @@ -39,3 +39,41 @@ def ngrams(input_: Union[str, Dict], n) -> Dict:

def filter_array(input_, as_, cond) -> Dict:
    """Build a MongoDB ``$filter`` aggregation expression.

    ``as_`` names the per-element variable and ``cond`` is the predicate
    expression evaluated for each element of ``input_``.
    """
    filter_spec = {"input": input_, "as": as_, "cond": cond}
    return {"$filter": filter_spec}


def extract_ngrams(
    input_: Union[str, Dict],
    N: Sequence[int],
) -> Dict:
    """Build a MongoDB expression extracting all n-grams of the sizes in *N*.

    Non-positive sizes in *N* are silently skipped.  N-grams containing an
    "X" (unreadable sign) or an empty-string token are filtered out, and
    duplicates are removed from the result.
    """
    # Tokens that make an n-gram useless for matching.
    signs_to_exclude = ["X", ""]

    # True iff the n-gram shares no element with signs_to_exclude.
    exclude_empty = {
        "$eq": [
            {
                "$size": {
                    "$setIntersection": [
                        "$$this",
                        signs_to_exclude,
                    ]
                }
            },
            0,
        ]
    }
    return drop_duplicates(
        filter_array(
            {"$concatArrays": [ngrams(input_, n) for n in N if n > 0]},
            "this",
            exclude_empty,
        )
    )


def replace_all(input_: Union[str, Dict], old: str, new: str) -> Dict:
    """Build a MongoDB ``$replaceAll`` expression.

    Replaces every occurrence of *old* in *input_* with *new*.
    """
    return {
        "$replaceAll": {
            "input": input_,
            "find": old,
            "replacement": new,
        }
    }
15 changes: 14 additions & 1 deletion ebl/corpus/application/display_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
from ebl.corpus.application.id_schemas import ChapterIdSchema
from ebl.corpus.application.record_schemas import RecordSchema
from ebl.corpus.application.schemas import LineVariantSchema, ManuscriptSchema
from ebl.corpus.domain.chapter_display import ChapterDisplay, LineDisplay
from ebl.corpus.domain.chapter_display import (
ChapterDisplay,
ChapterNgramScore,
LineDisplay,
)
from ebl.corpus.domain.record import Record
from ebl.transliteration.application.line_number_schemas import (
OneOfLineNumberSchema,
Expand Down Expand Up @@ -89,3 +93,12 @@ def add_line_indexes(self, data: dict, **kwargs) -> dict:
]

return data


class ChapterNgramScoreSchema(ChapterIdSchema):
    """(De)serializes a ChapterNgramScore: the chapter id fields inherited
    from ChapterIdSchema plus the text name and the n-gram overlap score."""

    text_name = fields.String(required=True, data_key="textName")
    score = fields.Float(required=True)

    @post_load
    def make_result(self, data: dict, **kwargs) -> ChapterNgramScore:
        # Build the frozen domain object from the validated payload.
        return ChapterNgramScore(**data)
4 changes: 4 additions & 0 deletions ebl/corpus/application/schemas.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from ebl.corpus.domain.provenance import Provenance
from marshmallow import (
EXCLUDE,
Schema,
ValidationError,
fields,
Expand Down Expand Up @@ -260,6 +261,9 @@ class DictionaryLinePaginationSchema(Schema):


class ChapterSchema(Schema):
class Meta:
unknown = EXCLUDE

text_id = fields.Nested(TextIdSchema, required=True, data_key="textId")
classification = ValueEnumField(Classification, required=True)
stage = ValueEnumField(Stage, required=True)
Expand Down
6 changes: 6 additions & 0 deletions ebl/corpus/application/text_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,9 @@
@abstractmethod
def query(self, query: dict) -> CorpusQueryResult:
...

@abstractmethod
def aggregate_ngram_overlaps(
    self, ngrams: Sequence[Sequence[str]], limit: Optional[int] = None
) -> Sequence[dict]:
    """Score chapters by their n-gram overlap with *ngrams*.

    Returns score documents sorted by descending score, at most *limit*
    of them when *limit* is given.
    """
    ...
Dismissed Show dismissed Hide dismissed
12 changes: 12 additions & 0 deletions ebl/corpus/domain/chapter_display.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,15 @@ def of_chapter(text: Text, chapter: Chapter) -> "ChapterDisplay":
chapter.record,
chapter.manuscripts,
)


@attr.s(frozen=True, auto_attribs=True)
class ChapterNgramScore(ChapterId):
    """A chapter's n-gram overlap score: the chapter id extended with the
    name of the text it belongs to and the computed score."""

    text_name: str
    score: float

    @staticmethod
    def of(chapter_id: ChapterId, text_name: str, score: float) -> "ChapterNgramScore":
        """Build a score record from an existing ChapterId."""
        return ChapterNgramScore(
            chapter_id.text_id,
            chapter_id.stage,
            chapter_id.name,
            text_name,
            score,
        )
132 changes: 101 additions & 31 deletions ebl/corpus/infrastructure/mongo_text_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,16 @@


from ebl.bibliography.infrastructure.bibliography import join_reference_documents
from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES
from ebl.common.query.query_result import CorpusQueryResult
from ebl.common.query.query_schemas import CorpusQueryResultSchema
from ebl.common.query.util import (
drop_duplicates,
extract_ngrams,
filter_array,
flatten_field,
replace_all,
)
from ebl.corpus.application.text_repository import TextRepository
from ebl.corpus.application.display_schemas import ChapterDisplaySchema
from ebl.corpus.application.schemas import (
Expand Down Expand Up @@ -37,7 +45,7 @@
chapter_id_query,
join_chapters,
join_text,
join_text_title,
join_text_names,
)
from ebl.errors import NotFoundError
from ebl.fragmentarium.infrastructure.queries import is_in_fragmentarium, join_joins
Expand Down Expand Up @@ -109,6 +117,7 @@ def create(self, text: Text) -> None:

def create_chapter(self, chapter: Chapter) -> None:
    """Insert *chapter* into the chapters collection and index its n-grams."""
    serialized = ChapterSchema().dump(chapter)
    self._chapters.insert_one(serialized)
    self._update_ngrams(chapter.id_)

def find(self, id_: TextId) -> Text:
try:
Expand Down Expand Up @@ -243,6 +252,7 @@ def update(self, id_: ChapterId, chapter: Chapter) -> None:
).dump(chapter)
},
)
self._update_ngrams(id_)

def query_by_transliteration(
self, query: TransliterationQuery, pagination_index: int
Expand All @@ -252,35 +262,8 @@ def query_by_transliteration(
cursor = self._chapters.aggregate(
[
{"$match": mongo_query},
{
"$lookup": {
"from": "texts",
"let": {
"chapterGenre": "$textId.genre",
"chapterCategory": "$textId.category",
"chapterIndex": "$textId.index",
},
"pipeline": [
{
"$match": {
"$expr": {
"$and": [
{"$eq": ["$genre", "$$chapterGenre"]},
{"$eq": ["$category", "$$chapterCategory"]},
{"$eq": ["$index", "$$chapterIndex"]},
]
}
}
},
{"$project": {"name": 1, "_id": 0}},
],
"as": "textNames",
}
},
*join_text_names(),
{"$project": {"_id": False}},
{"$addFields": {"textName": {"$first": "$textNames"}}},
{"$addFields": {"textName": "$textName.name"}},
{"$project": {"textNames": False}},
{"$skip": LIMIT * pagination_index},
{"$limit": LIMIT},
],
Expand Down Expand Up @@ -333,12 +316,12 @@ def query_by_lemma(
},
{"$unwind": "$lines"},
{"$match": lemma_query},
join_text_title(),
*join_text_names(),
filter_manuscripts_by_lemma(lemma),
{
"$project": {
"textId": True,
"textName": {"$first": "$textName.name"},
"textName": True,
"chapterName": "$name",
"stage": True,
"line": "$lines",
Expand Down Expand Up @@ -449,3 +432,90 @@ def query_corpus_by_manuscript(
]
)
return ManuscriptAttestationSchema().load(cursor, many=True)

def _update_ngrams(self, id_: ChapterId) -> None:
    """Recompute and store the ``ngrams`` field of the chapter *id_*.

    Each entry of the chapter's ``signs`` array is normalized (line breaks
    become " # " separators), split on spaces, and turned into n-grams of
    all configured sizes; the deduplicated union over all entries is
    written back via an update pipeline.

    TODO(review): colophon lines are currently included in the n-grams —
    confirm whether they should be excluded.
    """
    # Per signs entry: normalize line breaks, tokenize, extract n-grams.
    ngrams_per_manuscript = {
        "$map": {
            "input": "$signs",
            "in": extract_ngrams(
                {"$split": [replace_all("$$this", "\n", " # "), " "]},
                NGRAM_N_VALUES,
            ),
        }
    }
    # Drop null results before flattening and deduplicating.
    update_pipeline = [
        {
            "$set": {
                "ngrams": drop_duplicates(
                    flatten_field(
                        filter_array(
                            ngrams_per_manuscript,
                            "manuscriptSigns",
                            {"$ne": ["$$manuscriptSigns", None]},
                        )
                    )
                )
            }
        },
    ]

    self._chapters.update_one(
        chapter_id_query(id_),
        update_pipeline,
    )

def aggregate_ngram_overlaps(
    self, ngrams: Sequence[Sequence[str]], limit: Optional[int] = None
) -> Sequence[dict]:
    """Score every chapter by n-gram overlap with *ngrams*.

    The score is |intersection| / min(|chapter n-grams|, |query n-grams|)
    and results are sorted by descending score.  Chapters in the test
    category (99) or without stored n-grams are excluded.

    Raises ValueError when *ngrams* is empty.
    """
    if not ngrams:
        raise ValueError("ngrams must not be empty")

    candidate_ngrams = list(ngrams)
    test_chapter_category = 99  # reserved for test data; never surfaced

    match_stage = {
        "$match": {
            "textId.category": {"$ne": test_chapter_category},
            "ngrams": {"$exists": True, "$not": {"$size": 0}},
        }
    }
    score_expression = {
        "$let": {
            "vars": {
                "intersection": {
                    "$size": {"$setIntersection": ["$ngrams", candidate_ngrams]}
                },
                "minLength": {
                    "$min": [
                        {"$size": "$ngrams"},
                        len(candidate_ngrams),
                    ]
                },
            },
            # Guard against division by zero for degenerate inputs.
            "in": {
                "$cond": [
                    {"$eq": ["$$minLength", 0]},
                    0.0,
                    {"$divide": ["$$intersection", "$$minLength"]},
                ]
            },
        }
    }
    project_stage = {
        "$project": {
            "_id": 0,
            "textId": 1,
            "name": 1,
            "stage": 1,
            "score": score_expression,
        }
    }
    pipeline: List[dict] = [
        match_stage,
        project_stage,
        *join_text_names(),
        {"$sort": {"score": -1}},
    ]

    if limit:
        pipeline.append({"$limit": limit})

    return list(self._chapters.aggregate(pipeline))
49 changes: 24 additions & 25 deletions ebl/corpus/infrastructure/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,29 +187,28 @@ def join_text() -> List[dict]:
]


def join_text_title() -> dict:
return {
"$lookup": {
"from": "texts",
"let": {
"genre": "$textId.genre",
"category": "$textId.category",
"index": "$textId.index",
},
"pipeline": [
{
"$match": {
"$expr": {
"$and": [
{"$eq": ["$genre", "$$genre"]},
{"$eq": ["$category", "$$category"]},
{"$eq": ["$index", "$$index"]},
]
def join_text_names() -> List[dict]:
    """Pipeline stages that resolve each chapter's text name.

    Joins against the ``texts`` collection on the chapter's textId
    (genre, category, index) and adds the first matching name as a
    flat ``textName`` field.
    """
    id_fields = ["genre", "category", "index"]
    match_text = {
        "$match": {
            "$expr": {
                "$and": [
                    {"$eq": [f"${field}", f"$$textId.{field}"]}
                    for field in id_fields
                ]
            }
        }
    }
    lookup_stage = {
        "$lookup": {
            "from": "texts",
            "let": {"textId": "$textId"},
            "pipeline": [
                match_text,
                {"$project": {"_id": False, "name": True}},
            ],
            "as": "textName",
        }
    }
    flatten_stage = {"$addFields": {"textName": {"$first": "$textName.name"}}}
    return [lookup_stage, flatten_stage]
4 changes: 4 additions & 0 deletions ebl/fragmentarium/application/fragment_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,7 @@
@abstractmethod
def list_all_fragments(self) -> Sequence[str]:
...

@abstractmethod
def get_ngrams(self, number: MuseumNumber) -> Sequence[Sequence[str]]:
    """Return the stored sign n-grams of the fragment identified by *number*."""
    ...
Dismissed Show dismissed Hide dismissed
Loading
Loading