This library strips all namu marks from a namu wiki document and extracts only its plain text.
- Python 3
pip install namu-wiki-extractor
import json
from namuwiki.extractor import extract_text
with open('namu_wiki.json', 'r', encoding='utf-8') as input_file:
namu_wiki = json.load(input_file)
item = namu_wiki[1]
plain_text = extract_text(item['text'])
print(plain_text)
import json
from namuwiki.extractor import extract_text
with open('namu_wiki.json', 'r', encoding='utf-8') as input_file:
namu_wiki = json.load(input_file)
item = namu_wiki[1]
document = extract_text(item['text'], separate_deletions=True, separate_footnotes=True)
print(document.text)
print(document.deletions)
print(document.footnotes)
import json
from multiprocessing import Pool
from namuwiki.extractor import extract_text
def work(document):
return {
'title': document['title'],
'content': extract_text(document['text'])
}
with open('namu_wiki.json', 'r', encoding='utf-8') as input_file:
namu_wiki = json.load(input_file)
with Pool() as pool:
items = pool.map(work, namu_wiki)
namuwiki.extractor.extract_text(source: str, separate_deletions: bool = False, separate_footnotes: bool = False) -> Union[str, Document]
This function strips all namu marks from `source` and extracts its plain text. If either `separate_deletions` or `separate_footnotes` is `True`, this returns the extracted plain text, deletions and footnotes as a `Document`. Otherwise, this returns the extracted plain text as a `str`.
- `source`: Text from a namu wiki document
- `separate_deletions`: Whether deletions should be separately extracted from the `source`
- `separate_footnotes`: Whether footnotes should be separately extracted from the `source`
- `text`: Plain text with all namu marks removed from the given `source`
- `deletions`: Separately extracted deletions from the given `source`
- `footnotes`: Separately extracted footnotes from the given `source`
A JSON dump file of namu wiki can be downloaded from here