Simplify string normalization using modern JS functions

Simplify the utility for normalizing strings now that `String.prototype.normalize` and Unicode property escapes (`\p{…}`) in regular expressions are widely available.
hypothesis · Nov 22, 2024 · afa68e9 · afa68e9
1 parent 1870fa8
commit afa68e9
Show file tree

Hide file tree

Showing 4 changed files with 47 additions and 89 deletions.
diff --git a/h/static/scripts/controllers/search-bar-controller.js b/h/static/scripts/controllers/search-bar-controller.js
@@ -3,7 +3,7 @@ import escapeHtml from 'escape-html';
 import { Controller } from '../base/controller';
 import { cloneTemplate } from '../util/dom';
 import { getLozengeValues, shouldLozengify } from '../util/search-text-parser';
-import * as stringUtil from '../util/string';
+import { stripMarks } from '../util/string';
 
 import { AutosuggestDropdownController } from './autosuggest-dropdown-controller';
 import { LozengeController } from './lozenge-controller';
@@ -13,14 +13,6 @@ const TAG_TYPE = 'TAG';
 const GROUP_TYPE = 'GROUP';
 const MAX_SUGGESTIONS = 5;
 
-/**
- * Normalize a string for use in comparisons of user input with a suggestion.
- * This causes differences in unicode composition and combining characters/accents to be ignored.
- */
-const normalizeStr = function (str) {
-  return stringUtil.fold(stringUtil.normalize(str));
-};
-
 /**
  * Controller for the search bar.
  */
@@ -85,7 +77,7 @@ export class SearchBarController extends Controller {
         return Object.assign(item, {
           type: TAG_TYPE,
           title: item.tag, // make safe
-          matchOn: normalizeStr(item.tag),
+          matchOn: stripMarks(item.tag),
           usageCount: item.count || 0,
         });
       });
@@ -113,7 +105,7 @@ export class SearchBarController extends Controller {
         return Object.assign(item, {
           type: GROUP_TYPE,
           title: item.name, // make safe
-          matchOn: normalizeStr(item.name),
+          matchOn: stripMarks(item.name),
           pubid: item.pubid,
           name: item.name,
           relationship: item.relationship,
@@ -161,7 +153,7 @@ export class SearchBarController extends Controller {
         groupVal = groupVal.slice(0, -1);
       }
 
-      const matchVal = normalizeStr(groupVal).toLowerCase();
+      const matchVal = stripMarks(groupVal).toLowerCase();
 
       // NOTE: We are pushing a pubid to lowercase here. These ids are created by us
       // in a random generation case-sensitive style. Theoretically, that means
@@ -364,7 +356,7 @@ export class SearchBarController extends Controller {
           typeFilter = GROUP_TYPE;
         }
 
-        let inputFilter = normalizeStr(currentInput);
+        let inputFilter = stripMarks(currentInput);
 
         if (typeFilter === TAG_TYPE || typeFilter === GROUP_TYPE) {
           inputFilter = inputFilter.substr(inputFilter.indexOf(':') + 1);

diff --git a/h/static/scripts/tests/util/string-test.js b/h/static/scripts/tests/util/string-test.js
@@ -1,58 +1,58 @@
-import * as stringUtil from '../../util/string';
+import { hyphenate, unhyphenate, stripMarks } from '../../util/string';
 
 describe('util/string', () => {
   describe('hyphenate', () => {
     it('converts input to kebab-case', () => {
-      assert.equal(stringUtil.hyphenate('fooBar'), 'foo-bar');
-      assert.equal(stringUtil.hyphenate('FooBar'), '-foo-bar');
+      assert.equal(hyphenate('fooBar'), 'foo-bar');
+      assert.equal(hyphenate('FooBar'), '-foo-bar');
     });
   });
 
   describe('unhyphenate', () => {
     it('converts input to camelCase', () => {
-      assert.equal(stringUtil.unhyphenate('foo-bar'), 'fooBar');
-      assert.equal(stringUtil.unhyphenate('foo-bar-'), 'fooBar');
-      assert.equal(stringUtil.unhyphenate('foo-bar-baz'), 'fooBarBaz');
-      assert.equal(stringUtil.unhyphenate('-foo-bar-baz'), 'FooBarBaz');
+      assert.equal(unhyphenate('foo-bar'), 'fooBar');
+      assert.equal(unhyphenate('foo-bar-'), 'fooBar');
+      assert.equal(unhyphenate('foo-bar-baz'), 'fooBarBaz');
+      assert.equal(unhyphenate('-foo-bar-baz'), 'FooBarBaz');
     });
   });
 
-  describe('stringUtil helpers', () => {
+  describe('stripAccents', () => {
     it('removes hungarian marks', () => {
       const text = 'Fürge rőt róka túlszökik zsíros étkű kutyán';
-      const decoded = stringUtil.fold(stringUtil.normalize(text));
+      const decoded = stripMarks(text);
       const expected = 'Furge rot roka tulszokik zsiros etku kutyan';
 
       assert.equal(decoded, expected);
     });
 
     it('removes greek marks', () => {
       const text = 'Καλημέρα κόσμε';
-      const decoded = stringUtil.fold(stringUtil.normalize(text));
+      const decoded = stripMarks(text);
       const expected = 'Καλημερα κοσμε';
 
       assert.equal(decoded, expected);
     });
 
     it('removes japanese marks', () => {
       const text = 'カタカナコンバータ';
-      const decoded = stringUtil.fold(stringUtil.normalize(text));
+      const decoded = stripMarks(text);
       const expected = 'カタカナコンハータ';
 
       assert.equal(decoded, expected);
     });
 
     it('removes marathi marks', () => {
       const text = 'काचं शक्नोम्यत्तुम';
-      const decoded = stringUtil.fold(stringUtil.normalize(text));
+      const decoded = stripMarks(text);
       const expected = 'कच शकनमयततम';
 
       assert.equal(decoded, expected);
     });
 
     it('removes thai marks', () => {
       const text = 'ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ';
-      const decoded = stringUtil.fold(stringUtil.normalize(text));
+      const decoded = stripMarks(text);
       const expected = 'ฉนกนกระจกได แตมนไมทาใหฉนเจบ';
 
       assert.equal(decoded, expected);
@@ -61,7 +61,7 @@ describe('util/string', () => {
     it('removes all marks', () => {
       const text =
         '̀ ́ ̂ ̃ ̄ ̅ ̆ ̇ ̈ ̉ ̊ ̋ ̌ ̍ ̎ ̏ ̐ ̑ ̒ ̓ ̔ ̕ ̖ ̗ ̘ ̙ ̚ ̛ ̜ ̝ ̞ ̟ ̠ ̡ ̢ ̣ ̤ ̥ ̦ ̧ ̨ ̩ ̪ ̫ ̬ ̭ ̮ ̯ ̰ ̱ ̲ ̳ ̴ ̵ ̶ ̷ ̸ ̹ ̺ ̻ ̼ ̽ ̾ ̿ ̀ ́ ͂ ̓ ̈́ ͅ ͠ ͡"';
-      const decoded = stringUtil.fold(stringUtil.normalize(text));
+      const decoded = stripMarks(text);
       const expected =
         '                                                                       "';
 

diff --git a/h/static/scripts/util/string.js b/h/static/scripts/util/string.js
diff --git a/h/static/scripts/util/string.ts b/h/static/scripts/util/string.ts
@@ -0,0 +1,28 @@
+/**
+ * Convert a `camelCase` or `CapitalCase` string to `kebab-case`.
+ *
+ * A hyphen is inserted before every ASCII uppercase letter (`A`-`Z`) and the
+ * whole result is then lower-cased. Because the hyphen is inserted before
+ * *every* uppercase letter, an input that starts with a capital produces a
+ * leading hyphen (e.g. `FooBar` becomes `-foo-bar`).
+ *
+ * @param name - The string to convert
+ * @returns The hyphenated, lower-cased string
+ */
+export function hyphenate(name: string) {
+  const uppercasePattern = /([A-Z])/g;
+  return name.replace(uppercasePattern, '-$1').toLowerCase();
+}
+
+/**
+ * Convert a `kebab-case` string to `camelCase`.
+ *
+ * Works recursively: the first hyphen is removed, the character following it
+ * is replaced by its upper-cased form, and the resulting string is processed
+ * again until it contains no hyphen. A hyphen at the very end of the string
+ * has no following character and is simply dropped (`foo-bar-` becomes
+ * `fooBar`); a hyphen at the very start capitalizes the first letter
+ * (`-foo-bar` becomes `FooBar`).
+ *
+ * @param name - The string to convert
+ * @returns The input with hyphens removed and the following characters
+ *   upper-cased; the input itself if it contains no hyphen
+ */
+export function unhyphenate(name: string) {
+  const idx = name.indexOf('-');
+  if (idx === -1) {
+    return name;
+  } else {
+    const ch = (name[idx + 1] || '').toUpperCase();
+    return unhyphenate(name.slice(0, idx) + ch + name.slice(idx + 2));
+  }
+}
+
+/**
+ * Convert a string into NFKD normalization form and remove marks (accents etc.)
+ *
+ * This function is used to normalize strings before search to ignore
+ * differences in accents etc.
+ *
+ * The string is first decomposed with `String.prototype.normalize('NFKD')`,
+ * so that precomposed characters are split into a base character followed by
+ * combining marks. Every code point in the Unicode "Mark" general category
+ * (`\p{M}`) is then deleted. No case conversion is performed here; callers
+ * that need case-insensitive matching must lower-case the result themselves.
+ *
+ * @param str - The string to normalize
+ * @returns The NFKD-normalized string with all mark characters removed
+ */
+export function stripMarks(str: string) {
+  return str.normalize('NFKD').replace(/\p{M}/gu, '');
+}