Skip to content

Commit

Permalink
Update Sõnaveeb scraper
Browse files Browse the repository at this point in the history
Fix search from conjugated forms
  • Loading branch information
azymohliad committed Mar 5, 2024
1 parent 526eb1c commit 3d1c0c5
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 23 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Changelog

## v0.5.0 - 2024-03-05

- Update Sõnaveeb scraper, fix search from conjugated forms.

## v0.4.0 - 2024-02-10

- Added "Delete" and "Replace" note buttons.
Expand Down
32 changes: 19 additions & 13 deletions anki_addon/sonaveeb.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@


BASE_URL = 'https://sonaveeb.ee'
FORMS_URL = 'https://sonaveeb.ee/searchwordfrag/unif/{word}'
SEARCH_URL = 'https://sonaveeb.ee/search/unif/dlall/dsall/{word}'
DETAILS_URL = 'https://sonaveeb.ee/worddetails/unif/{word_id}'

Expand Down Expand Up @@ -57,9 +58,7 @@ def _parse_search_results(self, dom, lang=None):
# Filter by language
if lang is not None:
homonyms = [r for r in homonyms if r.lang == lang]
# Parse forms
alt_forms = [b['data-word'] for b in dom.find_all('button', class_='word-form')]
return homonyms, alt_forms
return homonyms

def _parse_word_info(self, dom):
info = WordInfo()
Expand Down Expand Up @@ -89,17 +88,23 @@ def _parse_word_info(self, dom):
info.morphology.append(entry)
return info

def get_forms(self, word):
    """Query FORMS_URL for `word` and split the JSON reply into a pair.

    Returns a `(match, forms)` tuple:
      * `match` is `word` itself when it appears under the reply's
        'prefWords' key, otherwise `None`;
      * `forms` is the value stored under the reply's 'formWords' key.
    """
    self._ensure_session()
    url = FORMS_URL.format(word=word)
    payload = self._request(url).json()
    forms = payload['formWords']
    # Only report a match when the queried word is itself a preferred word.
    if word in payload['prefWords']:
        return word, forms
    return None, forms

def get_candidates(self, word, lang='et', debug=False):
# Request word lookup page
dom = self._word_lookup_dom(word)

# Save HTML page for debugging
if debug:
open(os.path.join('debug', f'lookup_{word}.html'), 'w').write(dom.prettify())

# Parse results
homonyms, alt_forms = self._parse_search_results(dom, lang='et')
return homonyms, alt_forms
homonyms = self._parse_search_results(dom, lang=lang)
return homonyms

def get_word_info_by_candidate(self, candidate, debug=False):
# Request word details page
Expand All @@ -116,13 +121,14 @@ def get_word_info_by_candidate(self, candidate, debug=False):
return word_info

def get_word_info(self, word, lang='et', debug=False):
homonyms, forms = self.get_candidates(word, lang, debug)
if len(homonyms) == 0 and len(forms) > 0:
homonyms, forms = self.get_candidates(forms[0], lang, debug)
if len(homonyms) > 0:
return self.get_word_info_by_candidate(homonyms[0], debug)
else:
match, forms = self.get_forms(word)
if match is None and len(forms) == 0:
return None
word = forms[0] if match is None else match
homonyms = self.get_candidates(word, lang, debug)
if len(homonyms) == 0:
return None
return self.get_word_info_by_candidate(homonyms[0], debug)


@dc.dataclass
Expand Down
28 changes: 18 additions & 10 deletions anki_addon/ui/main_window.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,11 +150,19 @@ def request_search(self, query):
self.set_status('Searching...')
operation = QueryOp(
parent=self,
op=lambda col: self._sonaveeb.get_candidates(query),
op=lambda col: self.search_candidates(query),
success=self.search_results_received
).failure(self.handle_search_error)
operation.run_in_background()

def search_candidates(self, query):
    """Resolve `query` to candidates plus its related word forms.

    Asks `self._sonaveeb.get_forms` for `(match, forms)`. Candidates are
    fetched with `self._sonaveeb.get_candidates(match)` only when `match`
    is not `None`; otherwise the candidate list is empty.

    Returns a `(candidates, forms)` tuple.
    """
    match, forms = self._sonaveeb.get_forms(query)
    # No exact match means no lookup; the caller still receives the forms.
    candidates = [] if match is None else self._sonaveeb.get_candidates(match)
    return candidates, forms

def search_triggered(self):
self.clear_search_results()
query = self._search.text().strip()
Expand Down Expand Up @@ -186,24 +194,24 @@ def deck_changed(self, _index):
mw.addonManager.writeConfig(__name__, self._config)

def search_results_received(self, result):
homonyms, alt_forms = result
candidates, forms = result
self._search_button.setEnabled(True)
if len(homonyms) == 0:
if len(alt_forms) == 0:
if len(candidates) == 0:
if len(forms) == 0:
self.set_status('Not found :(')
elif len(alt_forms) == 1:
self.request_search(alt_forms[0])
elif len(forms) == 1:
self.request_search(forms[0])
else:
self._form_selector.set_label('Select base form:')
self._form_selector.set_options(alt_forms)
self._form_selector.set_options(forms)
self._form_selector.show()
self._content_stack.setCurrentWidget(self._content)
else:
self._form_selector.set_options(alt_forms)
self._form_selector.set_options(forms)
self._form_selector.set_label('See also:')
self._form_selector.setVisible(len(alt_forms) > 0)
self._form_selector.setVisible(len(forms) > 0)
self._content_stack.setCurrentWidget(self._content)
for homonym in homonyms:
for homonym in candidates:
word_panel = WordInfoPanel(homonym, self._sonaveeb, self.deck_id(), self.lang_code())
self._search_results_layout.addWidget(word_panel)

Expand Down

0 comments on commit 3d1c0c5

Please sign in to comment.