Skip to content

Commit

Permalink
fix: Better word matching regex (#110)
Browse files Browse the repository at this point in the history
feat: word matching regex and add substitute API

Problem:
The old implementation of the word-finding RegExp also matches the fragment
"communiqu" inside the word "communiqué", which can lead to mis-hyphenation.
One-step RegExp to correctly match words is not possible.
(For details see issue #109.)

Solution:
First find all words using a transpiled form of
RegExp `/[\p{Letter}]{n,}/gui`. This RegExp is quite large.
It is minimized by collecting all characters used by patterns.
Then in replacer-function test if all characters are in `lo.alphabet`.
As a result, words containing a character that is not in `lo.alphabet` are
not hyphenated at all. By substituting foreign characters, such words
can still be hyphenated.
So also add an API to substitute characters and document this API.

Notes:
This commit requires all .wasm files to be regenerated.

Fixes #109
  • Loading branch information
mnater authored Apr 16, 2020
1 parent c27893e commit f154a36
Show file tree
Hide file tree
Showing 306 changed files with 2,270 additions and 755 deletions.
59 changes: 42 additions & 17 deletions Hyphenopoly.js
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@
"keepAlive": setProp(true, 2),
"normalize": setProp(false, 2),
"safeCopy": setProp(true, 2),
"substitutions": setProp(empty(), 2),
"timeout": setProp(1000, 2)
});

Expand Down Expand Up @@ -497,6 +498,8 @@
"msg": "found word longer than 61 characters"
});
hw = word;
} else if (lo.reNotAlphabet.test(word)) {
hw = word;
} else {
/* eslint-disable security/detect-object-injection */
hw = lo.hyphenate(
Expand All @@ -505,8 +508,8 @@
selSettings.leftminPerLang[lang],
selSettings.rightminPerLang[lang]
);
/* eslint-enable security/detect-object-injection */
}
/* eslint-enable security/detect-object-injection */
} else {
hw = hyphenateCompound(word);
}
Expand Down Expand Up @@ -574,7 +577,14 @@
const orphanController = (orphanControllerPool.has(sel))
? orphanControllerPool.get(sel)
: createOrphanController(sel);
const re = lo.re.get(sel);

/*
* Transpiled RegExp of
* /[${alphabet}\p{Letter}-]{${minwordlength},}/gui
*/
const reWord = RegExp(
`[${lo.alphabet}a-z\u00DF-\u00F6\u00F8-\u00FE\u0101\u0103\u0105\u0107\u0109\u010D\u010F\u0111\u0113\u0117\u0119\u011B\u011D\u011F\u0123\u0125\u012B\u012F\u0131\u0135\u0137\u013C\u013E\u0142\u0144\u0146\u0148\u014D\u0151\u0153\u0155\u0159\u015B\u015D\u015F\u0161\u0165\u016B\u016D\u016F\u0171\u0173\u017A\u017C\u017E\u017F\u01CE\u01D0\u01D2\u01D4\u01D6\u01D8\u01DA\u01DC\u0219\u021B\u02BC\u0390\u03AC-\u03CE\u03D0\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF\u03F2\u0430-\u044F\u0451-\u045C\u045E\u045F\u0491\u04AF\u04E9\u0561-\u0585\u0587\u0905-\u090C\u090F\u0910\u0913-\u0928\u092A-\u0930\u0932\u0933\u0935-\u0939\u093D\u0960\u0961\u0985-\u098C\u098F\u0990\u0993-\u09A8\u09AA-\u09B0\u09B2\u09B6-\u09B9\u09BD\u09CE\u09DC\u09DD\u09DF-\u09E1\u0A05-\u0A0A\u0A0F\u0A10\u0A13-\u0A28\u0A2A-\u0A30\u0A32\u0A33\u0A35\u0A36\u0A38\u0A39\u0A85-\u0A8B\u0A8F\u0A90\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2\u0AB3\u0AB5-\u0AB9\u0ABD\u0AE0\u0B05-\u0B0C\u0B0F\u0B10\u0B13-\u0B28\u0B2A-\u0B30\u0B32\u0B33\u0B35-\u0B39\u0B60\u0B61\u0B83\u0B85-\u0B8A\u0B8E-\u0B90\u0B92-\u0B95\u0B99\u0B9A\u0B9C\u0B9E\u0B9F\u0BA3\u0BA4\u0BA8-\u0BAA\u0BAE-\u0BB5\u0BB7-\u0BB9\u0C05-\u0C0C\u0C0E-\u0C10\u0C12-\u0C28\u0C2A-\u0C33\u0C35-\u0C39\u0C60\u0C61\u0C85-\u0C8C\u0C8E-\u0C90\u0C92-\u0CA8\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CBD\u0CDE\u0CE0\u0CE1\u0D05-\u0D0C\u0D0E-\u0D10\u0D12-\u0D28\u0D2A-\u0D39\u0D60\u0D61\u0D7A-\u0D7F\u0E01-\u0E2E\u0E30\u0E32\u0E33\u0E40-\u0E45\u10D0-\u10F0\u1200-\u1248\u124A-\u124D\u1250-\u1256\u1258\u125A-\u125D\u1260-\u1288\u128A-\u128D\u1290-\u12B0\u12B2-\u12B5\u12B8-\u12BE\u12C0\u12C2-\u12C5\u12C8-\u12D6\u12D8-\u1310\u1312-\u1315\u1318-\u135A\u1380-\u138F\u1E0D\u1E37\u1E41\u1E43\u1E45\u1E47\u1E6D\u1F00-\u1F07\u1F10-\u1F15\u1F20-\u1F27\u1F30-\u1F37\u1F40-\u1F45\u1F50-\u1F57\u1F60-\u1F67\u1F70-\u1F7D\u1F80-\u1F87\u1F90-\u1F97\u1FA0-\u1FA7\u1FB2-\u1FB4\u1FB6\u1FB7\u1FC2-\u1FC4\u1FC6\u1FC7\u1FD2\u1FD3\u1FD6\u1FD7\u1FE2-\u1FE7\u1FF2-\u1FF4\u1FF6\u1FF7\u2C81\u2C83\u2C85\u2C87\u2C89\u2C8D\u2C8F\u2C91\u2C93\u2C95\u2C97\u2C9
9\u2C9B\u2C9D\u2C9F\u2CA1\u2CA3\u2CA5\u2CA7\u2CA9\u2CAB\u2CAD\u2CAF\u2CB1\u2CC9\u2D80-\u2D96\u2DA0-\u2DA6\u2DA8-\u2DAE\u2DB0-\u2DB6\u2DB8-\u2DBE\u2DC0-\u2DC6\u2DC8-\u2DCE\u2DD0-\u2DD6\u2DD8-\u2DDE\uAB01-\uAB06\uAB09-\uAB0E\uAB11-\uAB16\uAB20-\uAB26\uAB28-\uAB2E-]{${minWordLength},}`, "gui"
);

/**
* Hyphenate text according to setting in sel
Expand All @@ -584,9 +594,9 @@
function hyphenateText(text) {
let tn = null;
if (C.normalize) {
tn = text.normalize("NFC").replace(re, wordHyphenator);
tn = text.normalize("NFC").replace(reWord, wordHyphenator);
} else {
tn = text.replace(re, wordHyphenator);
tn = text.replace(reWord, wordHyphenator);
}
if (selSettings.orphanControl !== 1) {
tn = tn.replace(
Expand Down Expand Up @@ -751,7 +761,8 @@
lo.exc = new Map();
}
/* eslint-enable security/detect-object-injection */
lo.re = new Map();
lo.alphabet = alphabet;
/*
 * Matches any character that is not part of the language's alphabet.
 * Deliberately NOT global: the word hyphenator calls `.test()` on this
 * RegExp, and a global RegExp keeps `lastIndex` between calls. After a
 * match in one word the next `.test()` would start mid-word and could
 * miss a foreign character at the beginning of that word.
 */
lo.reNotAlphabet = RegExp(`[^${alphabet}]`, "i");
lo.hyphenate = hyphenateFunction;
C.selectors.forEach((sel) => {
/* eslint-disable security/detect-object-injection */
Expand Down Expand Up @@ -784,16 +795,6 @@
Number(selSettings.rightminPerLang[lang]) || 0
);
/* eslint-enable security/detect-object-injection */

/*
* Find words with characters from `alphabet` and
* `Zero Width Non-Joiner` and `-` with a min length.
*
* This regexp is not perfect. It also finds parts of words
* that follow a character that is not in the `alphabet`.
* Word delimiters are not taken in account.
*/
lo.re.set(sel, RegExp(`[${alphabet}\u200C-]{${selSettings.minWordLength},}`, "gi"));
});
lo.ready = true;
// eslint-disable-next-line security/detect-object-injection
Expand Down Expand Up @@ -852,10 +853,34 @@
*/
function instantiateWasmEngine(heProm, lang) {
const wa = window.WebAssembly;

// eslint-disable-next-line require-jsdoc
function registerSubstitutions(alphalen, exp) {
/* eslint-disable security/detect-object-injection */
if (H.c.substitute && H.c.substitute[lang]) {
const subst = H.c.substitute[lang];
eachKey(subst, (sChar) => {
const sCharU = sChar.toUpperCase();
let sCharUcc = 0;
if (sCharU !== sChar) {
sCharUcc = sCharU.charCodeAt(0);
}
alphalen = exp.subst(
sChar.charCodeAt(0),
sCharUcc,
subst[sChar].charCodeAt(0)
);
});
}
return alphalen;
/* eslint-enable security/detect-object-injection */
}

// eslint-disable-next-line require-jsdoc
function handleWasm(res) {
const exp = res.instance.exports;
const alphalen = exp.conv();
let alphalen = exp.conv();
alphalen = registerSubstitutions(alphalen, exp);
const baseData = {
/* eslint-disable multiline-ternary */
"buf": exp.mem.buffer,
Expand All @@ -871,7 +896,7 @@
baseData,
exp.hyphenate
),
decode(new Uint16Array(exp.mem.buffer, 770, alphalen - 1)),
decode(new Uint16Array(exp.mem.buffer, 1026, alphalen - 1)),
baseData.lm,
baseData.rm
);
Expand Down
29 changes: 29 additions & 0 deletions docs/Setup.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ These page documents the optional fields in `setup`:
* [keepAlive](#keepalive)
* [normalize](#normalize)
* [safeCopy](#safecopy)
* [substitute](#substitute)
* [timeout](#timeout)
* [Selector Based Settings](#selector-based-settings)
* [compound](#compound)
Expand Down Expand Up @@ -269,6 +270,34 @@ To prevent soft hyphens from being copied to the clipboard, Hyphenopoly.js regis
_It does NOT remove other `hyphen`-characters!_
This feature is on by default, but it's a hack – disable it if you don't like it.

### substitute
````
type: Object
default: undefined
````
Substitute characters
````html
<script>
var Hyphenopoly = {
require: {...},
paths: {...},
setup: {
substitute: {
"en-us": {
"é": "e" //substitute "é" with "e" in "en-us"
}
},
selectors: {...}
}
};
</script>
````
If a word contains a letter that is not part of the alphabet defined in the sample file, this word is not hyphenated by default. This is the case for example with "communiqué". The letter "é" is not in the English alphabet, so the word cannot be hyphenated.
These problems can be solved with letter substitutions. If you want to use the letter "e" instead of the letter "é" for the hyphenation process, you can specify this accordingly (see example).
"communiqué" is then separated (com-mu-niqué).

The substitute object must contain language-codes as keys. The values are objects themselves, with the characters to be substituted as keys and the substituting characters as values (both lowercase only – Hyphenopoly handles all the letter casing, if necessary).

### timeout
````
type: number
Expand Down
45 changes: 39 additions & 6 deletions hyphenopoly.module.js
Original file line number Diff line number Diff line change
Expand Up @@ -255,9 +255,8 @@ function prepareLanguagesObj(
} else {
lo.exceptions = new Map();
}
/* eslint-disable security/detect-non-literal-regexp */
lo.genRegExp = new RegExp(`[${alphabet}\u200C-]{${H.c.minWordLength},}`, "gi");
/* eslint-enable security/detect-non-literal-regexp */
lo.alphabet = alphabet;
/*
 * Matches any character that is not part of the language's alphabet.
 * Deliberately NOT global: the word hyphenator calls `.test()` on this
 * RegExp, and a global RegExp keeps `lastIndex` between calls. After a
 * match in one word the next `.test()` would start mid-word and could
 * miss a foreign character at the beginning of that word.
 */
lo.reNotAlphabet = RegExp(`[^${alphabet}]`, "i");
(() => {
H.c.leftminPerLang[lang] = Math.max(
patternLeftmin,
Expand Down Expand Up @@ -323,6 +322,28 @@ function encloseHyphenateFunction(baseData, hyphenateFunc) {
* @returns {undefined}
*/
function instantiateWasmEngine(lang) {
// eslint-disable-next-line require-jsdoc
function registerSubstitutions(alphalen, exp) {
/* eslint-disable security/detect-object-injection */
if (H.c.substitute && H.c.substitute[lang]) {
const subst = H.c.substitute[lang];
Object.keys(subst).forEach((sChar) => {
const sCharU = sChar.toUpperCase();
let sCharUcc = 0;
if (sCharU !== sChar) {
sCharUcc = sCharU.charCodeAt(0);
}
alphalen = exp.subst(
sChar.charCodeAt(0),
sCharUcc,
subst[sChar].charCodeAt(0)
);
});
}
return alphalen;
/* eslint-enable security/detect-object-injection */
}

// eslint-disable-next-line require-jsdoc
function handleWasm(inst) {
const exp = inst.exports;
Expand All @@ -335,14 +356,15 @@ function instantiateWasmEngine(lang) {
"wo": (WebAssembly.Global) ? exp.uwo.value : exp.uwo
/* eslint-enable multiline-ternary */
};
const alphalen = exp.conv();
let alphalen = exp.conv();
alphalen = registerSubstitutions(alphalen, exp);
prepareLanguagesObj(
lang,
encloseHyphenateFunction(
baseData,
exp.hyphenate
),
decode(new Uint16Array(exp.mem.buffer, 770, alphalen - 1)),
decode(new Uint16Array(exp.mem.buffer, 1026, alphalen - 1)),
baseData.lm,
baseData.rm
);
Expand Down Expand Up @@ -431,6 +453,8 @@ function createWordHyphenator(lo, lang) {
if (word.length > 61) {
H.events.dispatch("error", {"msg": "found word longer than 61 characters"});
hw = word;
} else if (lo.reNotAlphabet.test(word)) {
hw = word;
} else {
hw = lo.hyphenateFunction(
word,
Expand Down Expand Up @@ -494,6 +518,14 @@ function createTextHyphenator(lang) {
? wordHyphenatorPool.get(lang)
: createWordHyphenator(lo, lang);

/*
* Transpiled RegExp of
* /[${alphabet}\p{Letter}-]{${minwordlength},}/gui
*/
const reWord = RegExp(
`[${lo.alphabet}a-z\u00DF-\u00F6\u00F8-\u00FE\u0101\u0103\u0105\u0107\u0109\u010D\u010F\u0111\u0113\u0117\u0119\u011B\u011D\u011F\u0123\u0125\u012B\u012F\u0131\u0135\u0137\u013C\u013E\u0142\u0144\u0146\u0148\u014D\u0151\u0153\u0155\u0159\u015B\u015D\u015F\u0161\u0165\u016B\u016D\u016F\u0171\u0173\u017A\u017C\u017E\u017F\u01CE\u01D0\u01D2\u01D4\u01D6\u01D8\u01DA\u01DC\u0219\u021B\u02BC\u0390\u03AC-\u03CE\u03D0\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF\u03F2\u0430-\u044F\u0451-\u045C\u045E\u045F\u0491\u04AF\u04E9\u0561-\u0585\u0587\u0905-\u090C\u090F\u0910\u0913-\u0928\u092A-\u0930\u0932\u0933\u0935-\u0939\u093D\u0960\u0961\u0985-\u098C\u098F\u0990\u0993-\u09A8\u09AA-\u09B0\u09B2\u09B6-\u09B9\u09BD\u09CE\u09DC\u09DD\u09DF-\u09E1\u0A05-\u0A0A\u0A0F\u0A10\u0A13-\u0A28\u0A2A-\u0A30\u0A32\u0A33\u0A35\u0A36\u0A38\u0A39\u0A85-\u0A8B\u0A8F\u0A90\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2\u0AB3\u0AB5-\u0AB9\u0ABD\u0AE0\u0B05-\u0B0C\u0B0F\u0B10\u0B13-\u0B28\u0B2A-\u0B30\u0B32\u0B33\u0B35-\u0B39\u0B60\u0B61\u0B83\u0B85-\u0B8A\u0B8E-\u0B90\u0B92-\u0B95\u0B99\u0B9A\u0B9C\u0B9E\u0B9F\u0BA3\u0BA4\u0BA8-\u0BAA\u0BAE-\u0BB5\u0BB7-\u0BB9\u0C05-\u0C0C\u0C0E-\u0C10\u0C12-\u0C28\u0C2A-\u0C33\u0C35-\u0C39\u0C60\u0C61\u0C85-\u0C8C\u0C8E-\u0C90\u0C92-\u0CA8\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CBD\u0CDE\u0CE0\u0CE1\u0D05-\u0D0C\u0D0E-\u0D10\u0D12-\u0D28\u0D2A-\u0D39\u0D60\u0D61\u0D7A-\u0D7F\u0E01-\u0E2E\u0E30\u0E32\u0E33\u0E40-\u0E45\u10D0-\u10F0\u1200-\u1248\u124A-\u124D\u1250-\u1256\u1258\u125A-\u125D\u1260-\u1288\u128A-\u128D\u1290-\u12B0\u12B2-\u12B5\u12B8-\u12BE\u12C0\u12C2-\u12C5\u12C8-\u12D6\u12D8-\u1310\u1312-\u1315\u1318-\u135A\u1380-\u138F\u1E0D\u1E37\u1E41\u1E43\u1E45\u1E47\u1E6D\u1F00-\u1F07\u1F10-\u1F15\u1F20-\u1F27\u1F30-\u1F37\u1F40-\u1F45\u1F50-\u1F57\u1F60-\u1F67\u1F70-\u1F7D\u1F80-\u1F87\u1F90-\u1F97\u1FA0-\u1FA7\u1FB2-\u1FB4\u1FB6\u1FB7\u1FC2-\u1FC4\u1FC6\u1FC7\u1FD2\u1FD3\u1FD6\u1FD7\u1FE2-\u1FE7\u1FF2-\u1FF4\u1FF6\u1FF7\u2C81\u2C83\u2C85\u2C87\u2C89\u2C8D\u2C8F\u2C91\u2C93\u2C95\u2C97\u2C9
9\u2C9B\u2C9D\u2C9F\u2CA1\u2CA3\u2CA5\u2CA7\u2CA9\u2CAB\u2CAD\u2CAF\u2CB1\u2CC9\u2D80-\u2D96\u2DA0-\u2DA6\u2DA8-\u2DAE\u2DB0-\u2DB6\u2DB8-\u2DBE\u2DC0-\u2DC6\u2DC8-\u2DCE\u2DD0-\u2DD6\u2DD8-\u2DDE\uAB01-\uAB06\uAB09-\uAB0E\uAB11-\uAB16\uAB20-\uAB26\uAB28-\uAB2E-]{${H.c.minWordLength},}`, "gui"
);

/**
* Hyphenate text
* @param {string} text The text
Expand All @@ -504,7 +536,7 @@ function createTextHyphenator(lang) {
if (H.c.normalize) {
text = text.normalize("NFC");
}
let tn = text.replace(lo.genRegExp, wordHyphenator);
let tn = text.replace(reWord, wordHyphenator);
if (H.c.orphanControl !== 1) {
tn = tn.replace(
// eslint-disable-next-line prefer-named-capture-group
Expand Down Expand Up @@ -621,6 +653,7 @@ H.config = ((userConfig) => {
"require": setProp([], 2),
"rightmin": setProp(0, 3),
"rightminPerLang": setProp(empty(), 2),
"substitute": setProp(empty(), 2),
"sync": setProp(false, 2)
});
const settings = Object.create(defaults);
Expand Down
Binary file modified lang/af/af.wasm
Binary file not shown.
16 changes: 8 additions & 8 deletions lang/af/src/g.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
export const to: i32 = 5060952;
export const po: i32 = 5061114;
export const to: i32 = 5061208;
export const po: i32 = 5061370;
export const pl: i32 = 55096;
export const vs: i32 = 1280;
export const pt: i32 = 48904;
export const wo: i32 = 5059884;
export const tw: i32 = 5060012;
export const hp: i32 = 5060076;
export const hw: i32 = 5060140;
export const vs: i32 = 1536;
export const pt: i32 = 49160;
export const wo: i32 = 5060140;
export const tw: i32 = 5060268;
export const hp: i32 = 5060332;
export const hw: i32 = 5060396;
export const lm: i32 = 1;
export const rm: i32 = 2;
19 changes: 17 additions & 2 deletions lang/af/src/hyphenEngine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ export const hwo: i32 = hw;
export const lmi: i32 = lm;
export const rmi: i32 = rm;

let alphabetCount: i32 = 0;

/**
* Maps BMP-charCode (16bit) to 8bit addresses
*
Expand Down Expand Up @@ -46,7 +48,6 @@ function createTranslateMap(): i32 {
let first: i32 = 0;
let second: i32 = 0;
let secondInt: i32 = 0;
let alphabetCount: i32 = 0;
i = to + 2;
while (i < po) {
first = load<u16>(i);
Expand All @@ -69,13 +70,27 @@ function createTranslateMap(): i32 {
pushToTranslateMap(first, secondInt);
}
// Add to alphabet
store<u16>(alphabetCount, first, 768);
store<u16>(alphabetCount, first, 1024);
alphabetCount += 2;
i += 4;
}
return alphabetCount >> 1;
}

/**
 * Registers a character substitution.
 * Looks up the translate-map value of the replacement char `replcc` and,
 * unless that lookup yields 255, maps `ccl` (and `ccu`, when non-zero)
 * to the same value and appends `ccl` to the alphabet stored at
 * memory offset 1024.
 * NOTE(review): 255 is presumably the translate map's "not found"
 * value - confirm against pullFromTranslateMap.
 *
 * @param ccl - char code of the (lower case) character to substitute
 * @param ccu - char code of its upper case variant, or 0 for none
 * @param replcc - char code of the substituting character
 * @returns number of 16-bit entries currently in the alphabet
 */
export function subst(ccl: i32, ccu: i32, replcc: i32): i32 {
    const target: i32 = pullFromTranslateMap(replcc);
    if (target === 255) {
        // Nothing to register: report the alphabet length unchanged
        return alphabetCount >> 1;
    }
    pushToTranslateMap(ccl, target);
    if (ccu !== 0) {
        pushToTranslateMap(ccu, target);
    }
    // Append the substituted char to the alphabet (2 bytes per entry)
    store<u16>(alphabetCount, ccl, 1024);
    alphabetCount += 2;
    return alphabetCount >> 1;
}

export function conv(): i32 {
let i: i32 = po;
const patternEnd: i32 = po + pl;
Expand Down
Binary file modified lang/as/as.wasm
Binary file not shown.
16 changes: 8 additions & 8 deletions lang/as/src/g.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
export const to: i32 = 3052;
export const po: i32 = 3346;
export const to: i32 = 3308;
export const po: i32 = 3602;
export const pl: i32 = 228;
export const vs: i32 = 1280;
export const pt: i32 = 1464;
export const wo: i32 = 2048;
export const tw: i32 = 2176;
export const hp: i32 = 2240;
export const hw: i32 = 2304;
export const vs: i32 = 1536;
export const pt: i32 = 1720;
export const wo: i32 = 2304;
export const tw: i32 = 2432;
export const hp: i32 = 2496;
export const hw: i32 = 2560;
export const lm: i32 = 1;
export const rm: i32 = 1;
Loading

0 comments on commit f154a36

Please sign in to comment.