# main.py

import requests
from tqdm import tqdm
import os
from lm_dataformat import Archive
import shutil
import spacy
import json
import glob


def download_file(url):
    ok = True
    file_name = './downloaded.txt'
    txt = ''
    response = requests.get(url, stream=True)
    total_size_in_bytes = int(response.headers.get('content-length', 0))
    block_size = 1024
    progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
    with open(file_name, 'wb') as file:
        for data in response.iter_content(block_size):
            progress_bar.update(len(data))
            file.write(data)
    progress_bar.close()
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        ok = False
    if ok:
        with open(file_name, encoding='utf8') as f:
            txt = f.read()
        # Each text has a similar disclaimer and intro attached.
        # Attempt to partition the text on these markers to strip the intro and outro.
        before, separator, after = txt.partition('GUTENBERG EBOOK')
        if after != '':
            txt = after
        before, separator, after = txt.partition('END OF')
        if separator != '':
            txt = before
    return ok, txt
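
# Minimal usage sketch for download_file (the URL below is a hypothetical example of a
# Project Gutenberg plain-text link, not one produced by this script):
#   ok, txt = download_file('https://www.gutenberg.org/files/12345/12345-0.txt')
#   if ok:
#       print(len(txt))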


def get_word_stats(txt):
    if not txt:
        return 0, 0, 0, 0, 0, 0
    sentences = 0
    words = 0
    verbs = 0
    nouns = 0
    punctuations = 0
    symbols = 0
    doc = nlp(txt)
    sentences = len(list(doc.sents))
    words = len([token.text for token in doc if not token.is_punct])
    nouns = len([token.text for token in doc if (not token.is_stop and not token.is_punct and token.pos_ == "NOUN")])
    verbs = len([token.text for token in doc if (not token.is_stop and not token.is_punct and token.pos_ == "VERB")])
    punctuations = len([token.text for token in doc if (token.is_punct or token.pos_ == "PUNCT")])
    symbols = len([token.text for token in doc if (token.pos_ == "SYM")])
    return sentences, words, verbs, nouns, punctuations, symbols
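
# Usage sketch: get_word_stats relies on the module-level `nlp` pipeline loaded below,
# so it can only be called once spacy.load(...) has run. A hypothetical call:
#   sentences, words, verbs, nouns, punctuations, symbols = get_word_stats("Ala ma kota. Kot ma Alę.")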


ar = Archive('./data')
file_name_zst = './project_gutenberg_pl_corpus.jsonl.zst'
file_name_manifest = './project_gutenberg_pl_corpus.manifest'

# Disabling some unused model components speeds processing up by up to 20%.
nlp = spacy.load("pl_core_news_md", disable=('ner', 'lemmatizer', 'textcat', 'entity_linker'))
total_len = 0
total_docs = 0
total_sentences = 0
total_words = 0
total_verbs = 0
total_nouns = 0
total_punctuations = 0
total_symbols = 0

url = 'https://gutendex.com/books?languages=pl'
# The API returns a collection of metadata for all matching books, with hrefs to each book's further details.
books = requests.get(url)
books_count = len(books.json()['results'])
if books.ok:
    for idx, book in enumerate(books.json()['results']):
        book_media_url = None
        if 'text/plain; charset=utf-8' in book['formats']:
            book_media_url = book['formats']['text/plain; charset=utf-8']
        if 'text/plain' in book['formats']:
            book_media_url = book['formats']['text/plain']
        if book_media_url:
            print(book_media_url)
            ok, txt = download_file(book_media_url)
            if ok:
                l = len(txt.strip())
                if l > 100000:
                    nlp.max_length = len(txt) + 100
                sentences, words, verbs, nouns, punctuations, symbols = get_word_stats(txt.strip())
                total_words += words
                total_verbs += verbs
                total_nouns += nouns
                total_len += l
                total_docs += 1
                total_sentences += sentences
                total_punctuations += punctuations
                total_symbols += symbols
                meta = {'url': book_media_url, 'title': book['title'], 'length': l, 'sentences': sentences, 'words': words, 'verbs': verbs, 'nouns': nouns, 'punctuations': punctuations, 'symbols': symbols}
                ar.add_data(txt.strip(), meta=meta)
                print("Added {num}/{total} ".format(num=idx + 1, total=books_count) + meta.get('url'))
        else:
            print("Skipping {num}/{total}".format(num=idx + 1, total=books_count))
ar.commit()

data_files = glob.glob('./data/*')
file_size = 0

# This works around an issue where data_files remained locked after the archive commit,
# causing an error during cleanup.
ar = None

for f in data_files:
    if f.endswith('.zst'):
        shutil.copy(f, os.path.join(file_name_zst))
        file_size = os.path.getsize(file_name_zst)
    os.remove(f)

manifest = {
    "project": "SpeakLeash",
    "name": "project_gutenberg_pl",
    "description": "Polish books collection from Project Gutenberg",
    "license": "Public Domain",
    "language": "pl",
    "file_size": file_size,
    "sources": [{"name": "project_gutenberg_pl", "url": "https://www.gutenberg.org/", "license": "Public Domain"}],
    "stats": {
        "documents": total_docs,
        "sentences": total_sentences,
        "words": total_words,
        "nouns": total_nouns,
        "verbs": total_verbs,
        "characters": total_len,
        "punctuations": total_punctuations,
        "symbols": total_symbols,
    },
}
json_manifest = json.dumps(manifest, indent=4)
with open(file_name_manifest, 'w') as mf:
    mf.write(json_manifest)
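
# A sketch of how the resulting corpus could be read back, assuming lm_dataformat's
# Reader API (Reader(path) and stream_data(get_meta=True)); not part of this script's pipeline:
#   from lm_dataformat import Reader
#   rdr = Reader(file_name_zst)
#   for text, meta in rdr.stream_data(get_meta=True):
#       print(meta['title'], len(text))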