main.py
import glob
import json
import os
import shutil

import spacy
from lm_dataformat import Archive
def get_word_stats(txt):
    # Returns (sentences, words, verbs, nouns, punctuations, symbols) for a text,
    # using the globally loaded spaCy pipeline.
    if not txt:
        return 0, 0, 0, 0, 0, 0
    doc = nlp(txt)
    sentences = len(list(doc.sents))
    words = len([token.text for token in doc if not token.is_punct])
    nouns = len([token.text for token in doc if (not token.is_stop and not token.is_punct and token.pos_ == "NOUN")])
    verbs = len([token.text for token in doc if (not token.is_stop and not token.is_punct and token.pos_ == "VERB")])
    punctuations = len([token.text for token in doc if (token.is_punct or token.pos_ == "PUNCT")])
    symbols = len([token.text for token in doc if token.pos_ == "SYM"])
    return sentences, words, verbs, nouns, punctuations, symbols
nlp = spacy.load("pl_core_news_md")  # Polish spaCy model used for tokenization and POS tagging

# Each dataset is described by a ./<name>.json metadata file, with its documents in ./<name>/*.txt
txt_datasets = glob.glob('./*.json')
for f in txt_datasets:
    ar = Archive('./data')
    data = None
    with open(f, 'r') as jf:
        data = json.load(jf)
    file_name_zst = './' + data.get("name", "") + '.zst'
    file_name_manifest = './' + data.get("name", "") + '.manifest'
    total_len = 0
    total_docs = 0
    total_sentences = 0
    total_words = 0
    total_verbs = 0
    total_nouns = 0
    total_punctuations = 0
    total_symbols = 0
    if data:
        txt_files = glob.glob(f.replace('.json', '') + '/*.txt')
        for txt_file in txt_files:
            with open(txt_file, 'r') as tf:
                print("Processing file: " + txt_file)
                txt = tf.read()
                l = len(txt)
                if l > 100000:
                    # spaCy rejects texts longer than nlp.max_length, so raise the limit first
                    nlp.max_length = l + 100
                sentences, words, verbs, nouns, punctuations, symbols = get_word_stats(txt.strip())
                total_words += words
                total_verbs += verbs
                total_nouns += nouns
                total_len += l
                total_docs += 1
                total_sentences += sentences
                total_punctuations += punctuations
                total_symbols += symbols
                meta = {'name': txt_file, 'length': l, 'sentences': sentences, 'words': words, 'verbs': verbs, 'nouns': nouns, 'punctuations': punctuations, 'symbols': symbols}
                ar.add_data(txt.strip(), meta=meta)
        ar.commit()
        # lm_dataformat writes a compressed .zst shard into ./data; move it next to the metadata
        # file under the dataset's name, and record its size for the manifest.
        data_files = glob.glob('./data/*')
        file_size = 0
        for data_file in data_files:  # do not reuse `f`, which still names the dataset's JSON file
            if data_file.endswith('.zst'):
                shutil.copy(data_file, file_name_zst)
                file_size = os.path.getsize(file_name_zst)
                os.remove(data_file)
        manifest = {"project": data.get("project", ""), "name": data.get("name", ""), "description": data.get("description", ""), "license": data.get("license", ""), "language": data.get("language", ""), "file_size": file_size, "sources": data.get("sources", []), "stats": {"documents": total_docs, "sentences": total_sentences, "words": total_words, "nouns": total_nouns, "verbs": total_verbs, "characters": total_len, "punctuations": total_punctuations, "symbols": total_symbols}}
        json_manifest = json.dumps(manifest, indent=4)
        with open(file_name_manifest, 'w') as mf:
            mf.write(json_manifest)
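
The script expects, for each dataset, a ./<name>.json metadata file next to a ./<name>/ directory of .txt documents. Below is a minimal sketch of that input layout for a hypothetical dataset called example_corpus; the field values are illustrative, and only the keys mirror the data.get(...) calls above.

import json
import os

# Hypothetical dataset layout: ./example_corpus.json plus ./example_corpus/*.txt
os.makedirs('./example_corpus', exist_ok=True)
with open('./example_corpus/doc1.txt', 'w') as tf:
    tf.write('Przykładowy tekst po polsku.')  # any UTF-8 plain-text document
with open('./example_corpus.json', 'w') as jf:
    json.dump({
        "project": "example_project",   # illustrative values; only the keys are read by the script
        "name": "example_corpus",
        "description": "Sample dataset for exercising the stats script.",
        "license": "CC0-1.0",
        "language": "pl",
        "sources": []
    }, jf, indent=4)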