From ab437c9d6b4c6e7c0ee1601cce49aca0fa302014 Mon Sep 17 00:00:00 2001
From: seth-js <83692925+seth-js@users.noreply.github.com>
Date: Sun, 22 Oct 2023 17:36:31 -0500
Subject: [PATCH] Reworked "create-freq.js", and changed corpus

---
 src/create-freq.js   | 90 +++++++++++++++++++++++++-------------------
 src/make-yomichan.js | 20 ++++++----
 2 files changed, 64 insertions(+), 46 deletions(-)

diff --git a/src/create-freq.js b/src/create-freq.js
index 398d9cb..4d4098d 100644
--- a/src/create-freq.js
+++ b/src/create-freq.js
@@ -22,24 +22,27 @@ for (const [lemma, info] of Object.entries(lemmaDict)) {
   }
 }
 
-// const sentences = JSON.parse(readFileSync('data/sentences/opensubtitles-es-sentences.json'));
+const sentences = JSON.parse(readFileSync('data/sentences/netflix-es-sentences.json'));
 
-const sentences = ['¡Un mundo de espadas y hechicería!', 'si Dios quiere.'];
+// const sentences = ['¡Un mundo de espadas y hechicería!', 'si Dios quiere.'];
 
-const freqList = {};
+const freqList = new Map();
 
 let totalWords = 0;
 let missedWords = 0;
 
+let sentenceLimit = 5000000;
+
+console.log('Parsing corpus...');
+
 let index = 0;
 for (const sentence of sentences) {
   index++;
 
   // log progress the first time, then every 100,000 sentences, and the last one
   if (index === 1 || index % 100000 === 0 || index === sentences.length) {
-    console.log(`(${index}/${sentences.length})`);
+    console.log(`(${index.toLocaleString()} of ${sentences.length.toLocaleString()} sentences parsed)`);
   }
 
-  // stop at 5 million
-  if (index === 5000000) {
+  if (index === sentenceLimit) {
+    console.log(`(${sentenceLimit.toLocaleString()} sentence limit reached. moving on...)`)
     break;
   }
 
@@ -51,57 +54,66 @@ for (const sentence of sentences) {
   for (const { word, surface } of customWords) {
     if (word !== '' && /\p{L}/u.test(word) && /\p{L}/u.test(surface) && !nameDict.has(word)) {
       totalWords++;
-      freqList[word] = (freqList[word] || 0) + 1;
+
+      if (freqList.has(word)) {
+        freqList.set(word, freqList.get(word) + 1);
+      } else {
+        freqList.set(word, 1);
+      }
     }
 
-    if (word === '' && /\p{L}/u.test(word) && /\p{L}/u.test(surface)) {
+    if (word === '' && /\p{L}/u.test(surface)) {
       missedWords++;
     }
   }
 }
 
-const freqArr = Object.entries(freqList)
-  .filter(([word]) => lemmaDict[word])
-  .map(([word, count]) => ({ word, count }))
-  .sort((a, b) => b.count - a.count);
+console.log('Done parsing.');
+
+const freqArr = [];
+
+for (const [word, count] of freqList) {
+  freqArr.push({ word, count });
+}
 
-const totalCount = freqArr.reduce((sum, entry) => sum + entry.count, 0);
+freqArr.sort((a, b) => b.count - a.count);
 
-const thresholds = [0.95, 0.98, 0.99];
-const coverage = new Map();
-const thousand = [];
+const nineFive = [];
+const nineEight = [];
+const nineNine = [];
+const thousand = {};
 
 let percSoFar = 0.0;
 
 for (const { word, count } of freqArr) {
-  percSoFar += count / totalCount;
+  percSoFar += count / totalWords;
 
-  for (const threshold of thresholds) {
-    if (threshold >= percSoFar) {
-      coverage.set(threshold, coverage.get(threshold) || new Set());
-      coverage.get(threshold).add(word);
-    }
+  if (0.95 >= percSoFar) {
+    nineFive.push(word);
   }
 
-  if (coverage.get(0.95).size === 1000) {
-    thousand.push(...coverage.get(0.95));
-    console.log(`The top 1000 words cover ${+(percSoFar * 100).toFixed(2)}%.`);
+  if (0.98 >= percSoFar) {
+    nineEight.push(word);
   }
-}
 
-const hundredCoverage = {};
+  if (0.99 >= percSoFar) {
+    nineNine.push(word);
+  }
 
-for (const { word, count } of freqArr) {
-  hundredCoverage[word] = count;
+  if (nineFive.length === 1000) {
+    thousand.words = [...nineFive];
+    thousand.coverage = `${+(percSoFar * 100).toFixed(2)}%`;
+  }
 }
 
 const message = `
-Your corpus is made up of ${totalCount} words.
-${coverage.get(0.95).size} words cover 95%.
-${coverage.get(0.98).size} words cover 98%.
-${coverage.get(0.99).size} words cover 99%.
+Your corpus is made up of ${totalWords.toLocaleString()} words.
+The 1000 most common words cover ${thousand.coverage}.
+${nineFive.length} words cover 95%.
+${nineEight.length} words cover 98%.
+${nineNine.length} words cover 99%.
 
-Frequency list contains ${freqArr.length} unique word(s).
+Frequency list contains ${freqArr.length.toLocaleString()} unique word(s).
 ${((totalWords - missedWords) / totalWords * 100).toFixed(2)}% of words were able to find a definition.
 `;
 
@@ -109,11 +121,11 @@
 console.log(message);
 
 const frequencies = {
-  'nine-five': Array.from(coverage.get(0.95)),
-  'nine-eight': Array.from(coverage.get(0.98)),
-  'nine-nine': Array.from(coverage.get(0.99)),
+  'nine-five': nineFive,
+  'nine-eight': nineEight,
+  'nine-nine': nineNine,
   '1k': thousand,
-  'hundred': hundredCoverage,
+  'hundred': freqArr,
 };
 
 for (const [file, data] of Object.entries(frequencies)) {
@@ -123,7 +135,7 @@ for (const [file, data] of Object.entries(frequencies)) {
 
 writeFileSync('data/freq/info.txt', message);
 
 function getWords(sentence) {
-  return sentence.split(/(?=\s)|(?<=\s)|(?=[.,!?—\]\[\)":¡])|(?<=[.,!?—\]\[\(":¡])/g)
+  return sentence.replace(/^-/, '- ').split(/(?=\s)|(?<=\s)|(?=[.,!?—\]\[\)":¡¿…])|(?<=[.,!?—\]\[\(":¡¿…])/g)
     .map(word => {
       if (/[.,!?:"]|\s/.test(word)) {
         return { word, lemma: word };
diff --git a/src/make-yomichan.js b/src/make-yomichan.js
index 6bc30e4..25ab6fa 100644
--- a/src/make-yomichan.js
+++ b/src/make-yomichan.js
@@ -10,11 +10,14 @@ const lemmaDict = JSON.parse(readFileSync('data/tidy/spanish-lemmas.json'));
 const formDict = JSON.parse(readFileSync('data/tidy/spanish-forms.json'));
 
 let popularDict;
-let frequencies = {};
+const frequencies = new Map();
 
-if (existsSync('data/freq/nine-five.json') && existsSync('data/freq/hundred.json')) {
-  popularDict = new Set(JSON.parse(readFileSync('data/freq/nine-five.json')));
-  frequencies = JSON.parse(readFileSync('data/freq/hundred.json'));
+if (existsSync('data/freq/nine-eight.json') && existsSync('data/freq/hundred.json')) {
+  popularDict = new Set(JSON.parse(readFileSync('data/freq/nine-eight.json')));
+
+  for (const { word, count } of JSON.parse(readFileSync('data/freq/hundred.json'))) {
+    frequencies.set(word, count);
+  }
 }
 
 const lemmaYomi = [];
@@ -29,7 +32,7 @@ for (const [lemma, infoMap] of allInfo) {
     const tags = [pos, ...(info.tags || [])].join(' ');
     const ipa = info.ipa || '';
     const popular = popularDict && popularDict.has(lemma) ? 'P' : '';
-    const freq = frequencies[lemma] || 0;
+    const freq = frequencies.get(lemma) || 0;
 
     // term, ipa, tags, rules, frequency, definitions, sequence, tags2
     lemmaYomi.push([lemma, ipa, tags, '', freq, glosses, 0, popular]);
@@ -55,7 +58,10 @@ for (const [form, allInfo] of Object.entries(formDict)) {
   }
 }
 
-const tagBank = Array.from(allPOS).map((pos) => [pos, 'partOfSpeech', -3, pos, 0]);
+const tagBank = [
+  ['P', 'popular', -10, 'popular term', 10],
+  ...Array.from(allPOS).map((pos) => [pos, 'partOfSpeech', -3, pos, 0])
+];
 
 const customTags = ['non-lemma', 'masculine', 'feminine', 'neuter'];
 
@@ -80,7 +86,7 @@ while (allYomi.length > 0) {
   writeFileSync(`${yomiPath}/term_bank_${bankIndex}.json`, JSON.stringify(batch));
 }
 
-const freqYomi = Object.entries(frequencies).map(([word, count]) => [word, 'freq', count]);
+const freqYomi = [...frequencies.entries()].map(([word, count]) => [word, 'freq', count]);
 
 writeFileSync(`${yomiPath}/term_meta_bank_1.json`, JSON.stringify(freqYomi));