-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
447 lines (373 loc) · 16.5 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
from playwright.async_api import (
Playwright, async_playwright,
)
from playwright._impl._api_types import (
TimeoutError as PWTimeOutErr,
)
import asyncio
from pathlib import Path
import re
from rich.console import Console
from rich import print as richprint
import random
from gtts import gTTS, gTTSError
import genanki
import json
# Vocabulary page that gets scraped; the two path segments are the language codes.
DUOLINGO_WORDS_URL = "https://duome.eu/vocabulary/en/it"

# One row per scraped word, shaped like:
# [
#     ['original_word', 'html_original_word', 'html_definition', 'category', 'html_category'],
#     [...]
# ]
LIST_WHOLE_WORDS_FOR_ANKI = []
# Flat list of the scraped words only.
LIST_WHOLE_WORDS = []

# Filename template of the Anki package; once formatted it looks like EN-IT_3667.apkg
ANKI_APKG_FILENAME = "{ph_lang_code}_{ph_total_words}.apkg"

# Upper-cased language codes taken from the URL, e.g. ['EN', 'IT']
FOUND_LANGS = []

# Name of the directory holding the voices, e.g. EN-IT_3667
VOICES_DIRECTORY_NAME = None

# Language code -> human readable language name.
DICT_LANGS_CODES = dict(EN="English", IT="Italian")

# Language code -> flag emoji.
DICT_COUNTRY_FLAGS = dict(EN="🇬🇧", IT="🇮🇹")
async def word2voice_using_google(
    word: str, lang_code: str
) -> None:
    '''
    Stores the voice of the given word on the given language.

    The MP3 is saved as `<voices directory>/<word>.mp3`. The voices directory is named after the stem of `ANKI_APKG_FILENAME` and that name is also stored in the global `VOICES_DIRECTORY_NAME`. If the file already exists nothing is downloaded.
    On `gTTSError` the download is retried until it succeeds, waiting between attempts (1s, doubling up to 30s).
    ### Parameters
    `word (str)`: The word that you want to store its voice.
    `lang_code (str)`: Language code of the word. e.g. `"it"` for Italian.
    '''
    global VOICES_DIRECTORY_NAME
    # The voices directory is the Anki apkg filename without its extension
    voices_directory_name = Path(ANKI_APKG_FILENAME).stem
    VOICES_DIRECTORY_NAME = voices_directory_name
    # Create the voices directory
    voices_directory = Path(voices_directory_name)
    voices_directory.mkdir(exist_ok=True)
    voice_file_path = voices_directory / f"{word}.mp3"
    # If the file exists, don't download it again
    if voice_file_path.exists():
        return
    retry_delay_seconds = 1.0
    max_retry_delay_seconds = 30.0
    # If connection error happened, try again
    while True:
        try:
            tts = gTTS(
                text=word,
                # Make sure the lang_code is lower case
                lang=lang_code.lower()
            )
            tts.save(str(voice_file_path))
            return
        except gTTSError:
            # A failed save may leave an empty/partial file behind; remove it so that
            # an interrupted run does not make the next run skip this word.
            try:
                voice_file_path.unlink()
            except FileNotFoundError:
                pass
            richprint(f"Trying to get voice of [red]{word}[/red]")
            # Back off instead of retrying immediately in a tight loop
            await asyncio.sleep(retry_delay_seconds)
            retry_delay_seconds = min(retry_delay_seconds * 2, max_retry_delay_seconds)
async def prettifier_for_anki(
    type_of_text: str,
    text: str
) -> str:
    '''
    Takes type_of_text (original, definition or category) and returns the prettified string.
    ### Parameters
    `type_of_text (str)`: Can be one of `original`, `definition` or `category` (case-insensitive).
    `text (str)`: The text to format as a HTML string. It is inserted as-is (no HTML escaping).
    ### Returns
    `prettified_string (str)`: The HTML string of the given text, a `<span>` wrapped in `<center>`.
    ### Raises
    `ValueError`: If `type_of_text` is not one of the supported types.
    '''
    original_word_size = "30px"
    original_word_font_family = "Times New Roman"
    original_word_color = "blue"
    definition_text_size = "30px"
    definition_text_font_family = "Arial"
    definition_text_color = "black"
    # Opening <span> tag for every supported type of text
    opening_span_tags = {
        "original": f'<span style="font-weight: bold; font-size: {original_word_size}; font-family: {original_word_font_family}; color: {original_word_color};">',
        "definition": f'<span style="font-size: {definition_text_size}; font-family: {definition_text_font_family}; color: {definition_text_color};">',
        "category": "<span>",
    }
    normalized_type = type_of_text.lower()
    if normalized_type not in opening_span_tags:
        # Previously an unknown type fell through every branch and failed with
        # an UnboundLocalError on return; fail with a clear message instead.
        raise ValueError(
            f"Unsupported type_of_text {type_of_text!r}, expected one of: {', '.join(opening_span_tags)}"
        )
    prettified_string = "\n".join(
        (
            "<center>",
            opening_span_tags[normalized_type],
            f"{text}",
            "</span>",
            "</center>",
        )
    )
    return prettified_string
async def pw_duome_scraper(playwright: Playwright) -> None:
    '''
    Scrapes the duome website's words, prettifies them using function `prettifier_for_anki` and stores them into the global variables `LIST_WHOLE_WORDS` (words only) and `LIST_WHOLE_WORDS_FOR_ANKI` (one row per word).

    Besides that it sets `FOUND_LANGS` from the language codes in `DUOLINGO_WORDS_URL`, formats `ANKI_APKG_FILENAME` with the language codes and the total words count, stores the voice of every word using `word2voice_using_google` and finally writes `LIST_WHOLE_WORDS` to `original_words_list.json`.
    The browser context is always closed, even when scraping fails.
    ### Parameters
    `playwright (Playwright)`: Takes the playwright's class to open the browser asynchronously.
    ### Raises
    `ValueError`: If the language codes can't be extracted from `DUOLINGO_WORDS_URL`, or the total words count can't be found on the page.
    '''
    global ANKI_APKG_FILENAME, FOUND_LANGS
    #* Extract language codes from the URL (before opening the browser, to fail early)
    extract_langcode_from_url_pattern = r"https://duome\.eu/vocabulary/([a-z]{2})/([a-z]{2})"
    lang_code_matches = re.search(
        pattern=extract_langcode_from_url_pattern,
        string=DUOLINGO_WORDS_URL
    )
    if lang_code_matches is None:
        raise ValueError(
            f"Can't extract the language codes from URL: {DUOLINGO_WORDS_URL}"
        )
    language_codes_list = [lang.upper() for lang in lang_code_matches.groups()]
    language_code = "-".join(language_codes_list)
    FOUND_LANGS = language_codes_list
    # The persistent browser profile lives next to the current script file
    current_script_path = Path(__file__).resolve()
    resolve_persistent_dir = current_script_path.parent / "PersistentContext"
    browser_type = playwright.chromium
    browser = await browser_type.launch_persistent_context(
        user_data_dir=str(resolve_persistent_dir),
        headless=False,
        channel="msedge",
        #slow_mo=10,
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57",
        args=[
            "--disable-blink-features=AutomationControlled",
            "--disable-dev-shm-usage",
            "--disable-component-extensions-with-background-pages",
        ],
        # Set locale & timezone so websites always load in English
        locale="en-US",
        timezone_id="Europe/London"
    )
    try:
        page = await browser.new_page()
        # Set viewport size like a fake laptop
        await page.set_viewport_size({"width": 1244, "height": 830})
        await page.goto(
            url=DUOLINGO_WORDS_URL,
            wait_until="load",
            # Timeout: 3 minutes
            timeout=180000,
        )
        #* Get total available words count
        #? document.querySelector("small[class='cCCC']").textContent
        total_words_element = await page.query_selector(
            selector="small[class='cCCC']"
        )
        if total_words_element is None:
            raise ValueError("Can't find the total words count element on the page")
        total_words_text = str(await total_words_element.text_content())
        total_words_match = re.search(
            pattern=r"\b\d+\b",
            string=total_words_text,
        )
        if total_words_match is None:
            raise ValueError(
                f"Can't find the total words count in text: {total_words_text!r}"
            )
        total_words = int(total_words_match.group())
        # Generate the Anki apkg filename
        ANKI_APKG_FILENAME = ANKI_APKG_FILENAME.format(
            ph_lang_code=language_code,
            ph_total_words=total_words,
        )
        #* Get a list of all words (li elements), exclude header alphabets that have class='single'
        #? document.querySelectorAll("div[id='words'] li:not(.single)")
        all_visible_words_element = await page.query_selector_all(
            selector="div[id='words'] li:not(.single)",
        )
        # Colorize the final word count: green when everything was found,
        # orange for any mismatch (fewer OR more elements than the reported total)
        if len(all_visible_words_element) == total_words:
            colorized_word_count_result = "green"
        else:
            # FFC411 = orange
            colorized_word_count_result = "#FFC411"
        richprint(
            f"[bold]Total Words Found: [{colorized_word_count_result}]{len(all_visible_words_element)}[/{colorized_word_count_result}]/[green]{total_words}[/green][/bold] (Visible Words Elements/Total Words)"
        )
        rich_console = Console()
        # Iterate through elements & access their attributes|content
        for i, word_element in enumerate(all_visible_words_element):
            # end="\r" to print on same line as before (no print on newline, updating current line)
            rich_console.print(f"[bold][red] Scarping Words:[/red] [#C5FF33]{i + 1}[/#C5FF33]/[green]{total_words}[/green][/bold]", end="\r")
            # Row of current word (word, html word, html definition, category, html category)
            current_word_row = []
            #* Access original word
            #? word_element.querySelector("span[class='hide wN']").textContent
            original_word_element = await word_element.query_selector(
                selector="span[class='hide wN']"
            )
            original_word = str(await original_word_element.text_content())
            #* Access original word with phonetic symbols (e.g. ò)
            #? word_element.querySelector("span[class='speak xs voice']").textContent
            original_phoneticword_element = await word_element.query_selector(
                selector="span[class='speak xs voice']"
            )
            original_phoneticword = str(await original_phoneticword_element.text_content())
            # Make it beautiful to be used in Anki using HTML-CSS!
            pretty_original_word = await prettifier_for_anki(
                type_of_text="original",
                text=original_phoneticword
            )
            LIST_WHOLE_WORDS.append(original_phoneticword)
            current_word_row.append(original_phoneticword)
            current_word_row.append(pretty_original_word)
            #* Store the pronunciation of the word as mp3
            await word2voice_using_google(
                word=original_phoneticword,
                lang_code=FOUND_LANGS[1]
            )
            #* Access definition (displayed on hover on the word)
            #? word_element.querySelector("span[class='wA']").getAttribute("title")
            word_definition_element = await word_element.query_selector(
                selector="span[class='wA']"
            )
            word_definition = str(
                await word_definition_element.get_attribute(
                    name="title"
                )
            )
            #* Remove the first [original_word] (and any whitespace after it) from the definition
            word_definition = re.sub(
                # The word is escaped so regex metacharacters in it are matched literally
                pattern=rf"\[{re.escape(original_word)}\]\s*",
                repl="",
                string=word_definition,
                # Only the first occurrence
                count=1,
            )
            # Make it beautiful to be used in Anki using HTML-CSS!
            pretty_word_definition = await prettifier_for_anki(
                type_of_text="definition",
                text=word_definition
            )
            current_word_row.append(pretty_word_definition)
            #* Access word category (part of speech), like: Adverb, must remove "· " from the string
            #? word_element.querySelector("small[class='cCCC wP']").textContent
            word_category_element = await word_element.query_selector(
                selector="small[class='cCCC wP']"
            )
            word_category_text = str(await word_category_element.text_content())
            #* Remove the first dot and any number of whitespaces after it
            word_category = re.sub(
                pattern=r"·\s*",
                repl="",
                string=word_category_text,
                # Only the first occurrence
                count=1,
            )
            # Make it beautiful to be used in Anki using HTML-CSS!
            pretty_word_category = await prettifier_for_anki(
                type_of_text="category",
                text=word_category
            )
            current_word_row.append(word_category)
            current_word_row.append(pretty_word_category)
            # Append the collected row of current word to the main list of words
            LIST_WHOLE_WORDS_FOR_ANKI.append(current_word_row)
    finally:
        # Always release the browser (and flush the persistent profile), even on errors
        await browser.close()
    #* Write Whole Words List to a file
    # Store the list in a file using JSON
    with open("original_words_list.json", "w", encoding="utf-8") as file:
        json.dump(LIST_WHOLE_WORDS, file)
async def anki_apkg_file_generator() -> None:
    '''
    Generates Anki's `.apkg` file named `ANKI_APKG_FILENAME` by iterating over the list of lists `LIST_WHOLE_WORDS_FOR_ANKI`.

    Reads the global variables `FOUND_LANGS`, `DICT_LANGS_CODES`, `DICT_COUNTRY_FLAGS`, `DUOLINGO_WORDS_URL` and `VOICES_DIRECTORY_NAME`. Every file inside the voices directory is added to the package as media.
    A language code that is missing from `DICT_LANGS_CODES` / `DICT_COUNTRY_FLAGS` falls back to the code itself / no flag.
    The model and deck IDs are derived from their names, so generating the package again for the same languages keeps the same IDs.
    '''
    # Local import: only this function needs it
    import zlib

    def stable_anki_id(seed: str) -> int:
        # Deterministic ID in the range [1 << 30, 1 << 31) based on the given text
        return (1 << 30) + zlib.crc32(seed.encode("utf-8")) % (1 << 30)

    def category_to_tag(category: str) -> str:
        # Tags must not contain whitespace: trim it and join the words with underscores
        return "_".join(category.split())

    native_lang_code = FOUND_LANGS[0]
    learning_lang_code = FOUND_LANGS[1]
    native_lang_name = DICT_LANGS_CODES.get(native_lang_code, native_lang_code)
    learning_lang_name = DICT_LANGS_CODES.get(learning_lang_code, learning_lang_code)
    learning_lang_flag = DICT_COUNTRY_FLAGS.get(learning_lang_code, "")
    model_name = f"Duolingo's {learning_lang_name} to {native_lang_name} Words"
    deck_name = f"Duolingo's {learning_lang_name} to {native_lang_name} Vocabulary"
    # Create a model for the Anki deck
    model = genanki.Model(
        # Stable ID, so re-generated packages reuse the same note type
        model_id=stable_anki_id(f"model:{model_name}"),
        name=model_name,
        # Define the fields that each flashcard will have.
        fields=[
            {"name": "Word"},
            {"name": "Definition"},
            {"name": "Category"},
        ],
        # Templates determine how the front and back of the flashcards will be formatted.
        templates=[
            {
                # Specifies the name of the template.
                "name": f"Duolingo {learning_lang_name}-{native_lang_name}",
                # "qfmt" is the question side: shows the {{Word}} field.
                "qfmt": "{{Word}}",
                # "afmt" is the answer side: the question side, a separator line (<hr>),
                # then the "Category" field and the "Definition" field.
                "afmt": "{{FrontSide}}<hr id='category'>{{Category}}<br>{{Definition}}",
            },
        ]
    )
    # Create the Anki deck
    deck = genanki.Deck(
        # Stable ID, so re-generated packages update the same deck
        deck_id=stable_anki_id(f"deck:{deck_name}"),
        name=deck_name,
        description=f"Scraped from: {DUOLINGO_WORDS_URL}"
    )
    language_tag = f"{learning_lang_flag}{learning_lang_name}"
    # Add cards to the deck
    for original_word, html_original_word, html_definition, category, html_category in LIST_WHOLE_WORDS_FOR_ANKI:
        # Add category as tag to distinguish each word (skipped when the category is empty)
        tags = [language_tag]
        category_as_tag = category_to_tag(category)
        if category_as_tag:
            tags.append(category_as_tag)
        note = genanki.Note(
            model=model,
            # Add note with the word's voice in the top-center of each card
            fields=[f"<center>[sound:{original_word}.mp3]</center>\n{html_original_word}", html_definition, html_category],
            tags=tags
        )
        deck.add_note(note)
    # Create a package and save it to a file
    package = genanki.Package(deck)
    # Collect all files of the voices directory (none if the directory was never created)
    list_of_all_voices_media = []
    if VOICES_DIRECTORY_NAME is not None:
        voices_dir_path = Path(VOICES_DIRECTORY_NAME)
        if voices_dir_path.is_dir():
            list_of_all_voices_media = [
                f"{VOICES_DIRECTORY_NAME}/{file.name}"
                for file in voices_dir_path.iterdir()
                if file.is_file()
            ]
    package.media_files = list_of_all_voices_media
    package.write_to_file(ANKI_APKG_FILENAME)
    richprint(f"✅ Successfully created file: {ANKI_APKG_FILENAME} from {len(LIST_WHOLE_WORDS_FOR_ANKI)} words.")
async def main():
    '''
    Scrapes the words while a Playwright session is open, then generates the apkg file once that session has been closed.
    '''
    playwright_session = async_playwright()
    async with playwright_session as playwright:
        await pw_duome_scraper(playwright=playwright)
    # The scraper has filled the global variables by now, so the package can be built
    await anki_apkg_file_generator()
# Start the event loop and run the whole workflow
asyncio.run(main())