# main.py

import requests
from tqdm import tqdm
import os
from lm_dataformat import Archive
import shutil
import spacy
import json
import glob


def download_file(url):
    ok = True
    file_name = './downloaded.txt'
    txt = ''
    response = requests.get(url, stream=True)
    total_size_in_bytes = int(response.headers.get('content-length', 0))
    block_size = 1024
    progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
    with open(file_name, 'wb') as file:
        for data in response.iter_content(block_size):
            progress_bar.update(len(data))
            file.write(data)
    progress_bar.close()
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        ok = False
    if ok:
        with open(file_name, encoding='utf8') as f:
            txt = f.read()
        # Each text has a similar disclaimer and intro attached.
        # Attempt to partition the text on these markers to strip the intro and outro.
        before, separator, after = txt.partition('GUTENBERG EBOOK')
        if after != '':
            txt = after
        before, separator, after = txt.partition('END OF')
        if separator != '':
            txt = before
    return ok, txt
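
# Minimal usage sketch for download_file (the URL below is a hypothetical example of a
# Project Gutenberg plain-text link, not one produced by this script):
#   ok, txt = download_file('https://www.gutenberg.org/files/12345/12345-0.txt')
#   if ok:
#       print(len(txt))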


def get_word_stats(txt):
    if not txt:
        return 0, 0, 0, 0, 0, 0
    sentences = 0
    words = 0
    verbs = 0
    nouns = 0
    punctuations = 0
    symbols = 0
    doc = nlp(txt)
    sentences = len(list(doc.sents))
    words = len([token.text for token in doc if not token.is_punct])
    nouns = len([token.text for token in doc if (not token.is_stop and not token.is_punct and token.pos_ == "NOUN")])
    verbs = len([token.text for token in doc if (not token.is_stop and not token.is_punct and token.pos_ == "VERB")])
    punctuations = len([token.text for token in doc if (token.is_punct or token.pos_ == "PUNCT")])
    symbols = len([token.text for token in doc if (token.pos_ == "SYM")])
    return sentences, words, verbs, nouns, punctuations, symbols
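
# Usage sketch: get_word_stats relies on the module-level `nlp` pipeline loaded below,
# so it can only be called once spacy.load(...) has run. A hypothetical call:
#   sentences, words, verbs, nouns, punctuations, symbols = get_word_stats("Ala ma kota. Kot ma Alę.")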


ar = Archive('./data')
file_name_zst = './project_gutenberg_pl_corpus.jsonl.zst'
file_name_manifest = './project_gutenberg_pl_corpus.manifest'

# Disabling some unused model components speeds processing up by up to 20%.
nlp = spacy.load("pl_core_news_md", disable=('ner', 'lemmatizer', 'textcat', 'entity_linker'))
total_len = 0
total_docs = 0
total_sentences = 0
total_words = 0
total_verbs = 0
total_nouns = 0
total_punctuations = 0
total_symbols = 0

url = 'https://gutendex.com/books?languages=pl'
# The API returns a collection of metadata for all matching books, with hrefs to each book's further details.
books = requests.get(url)
books_count = len(books.json()['results'])
if books.ok:
    for idx, book in enumerate(books.json()['results']):
        book_media_url = None
        if 'text/plain; charset=utf-8' in book['formats']:
            book_media_url = book['formats']['text/plain; charset=utf-8']
        if 'text/plain' in book['formats']:
            book_media_url = book['formats']['text/plain']
        if book_media_url:
            print(book_media_url)
            ok, txt = download_file(book_media_url)
            if ok:
                l = len(txt.strip())
                if l > 100000:
                    nlp.max_length = len(txt) + 100
                sentences, words, verbs, nouns, punctuations, symbols = get_word_stats(txt.strip())
                total_words += words
                total_verbs += verbs
                total_nouns += nouns
                total_len += l
                total_docs += 1
                total_sentences += sentences
                total_punctuations += punctuations
                total_symbols += symbols
                meta = {'url': book_media_url, 'title': book['title'], 'length': l, 'sentences': sentences, 'words': words, 'verbs': verbs, 'nouns': nouns, 'punctuations': punctuations, 'symbols': symbols}
                ar.add_data(txt.strip(), meta=meta)
                print("Added {num}/{total} ".format(num=idx + 1, total=books_count) + meta.get('url'))
        else:
            print("Skipping {num}/{total}".format(num=idx + 1, total=books_count))
ar.commit()

data_files = glob.glob('./data/*')
file_size = 0

# This works around an issue where data_files remained locked after the archive commit,
# causing an error during cleanup.
ar = None

for f in data_files:
    if f.endswith('.zst'):
        shutil.copy(f, os.path.join(file_name_zst))
        file_size = os.path.getsize(file_name_zst)
    os.remove(f)

manifest = {
    "project": "SpeakLeash",
    "name": "project_gutenberg_pl",
    "description": "Polish books collection from Project Gutenberg",
    "license": "Public Domain",
    "language": "pl",
    "file_size": file_size,
    "sources": [{"name": "project_gutenberg_pl", "url": "https://www.gutenberg.org/", "license": "Public Domain"}],
    "stats": {
        "documents": total_docs,
        "sentences": total_sentences,
        "words": total_words,
        "nouns": total_nouns,
        "verbs": total_verbs,
        "characters": total_len,
        "punctuations": total_punctuations,
        "symbols": total_symbols,
    },
}
json_manifest = json.dumps(manifest, indent=4)
with open(file_name_manifest, 'w') as mf:
    mf.write(json_manifest)
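
# A sketch of how the resulting corpus could be read back, assuming lm_dataformat's
# Reader API (Reader(path) and stream_data(get_meta=True)); not part of this script's pipeline:
#   from lm_dataformat import Reader
#   rdr = Reader(file_name_zst)
#   for text, meta in rdr.stream_data(get_meta=True):
#       print(meta['title'], len(text))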