Add charSplit to ens-utils (#400)

* add splitCharacters to ens-utils
* add splitCharacters to ens-utils
* update versions
* add changesets
* revert version change
* rename label to text
* fix docs
* Refine docs
* Align naming pattern of related utility functions and refine language in unit tests
* Update NameGuard.js to use updated function names

---------

Co-authored-by: Carbon225 <[email protected]>
Co-authored-by: lightwalker.eth <[email protected]>
namehash · Sep 26, 2024 · ba89172 · ba89172
1 parent 1885ca4
commit ba89172
Show file tree

Hide file tree

Showing 6 changed files with 105 additions and 51 deletions.
diff --git a/.changeset/warm-gorillas-film.md b/.changeset/warm-gorillas-film.md
@@ -0,0 +1,6 @@
+---
+"@namehash/ens-utils": minor
+"@namehash/nameguard-js": minor
+---
+
+Add charSplit to ens-utils and use it in nameguard-js in place of splitCharacters
diff --git a/packages/ens-utils/src/ensname.test.ts b/packages/ens-utils/src/ensname.test.ts
@@ -1,6 +1,7 @@
 import { describe, it, expect } from "vitest";
 import {
   buildENSName,
+  charSplit,
   charCount,
   getDecentralizationStatus,
   getDisplayLabels,
@@ -405,43 +406,80 @@ describe("getRegistrationPotential", () => {
 });
 
 describe("charCount", () => {
-  it("empty name", () => {
-    const label = "";
-    const result = charCount(label);
+  it("counts empty strings", () => {
+    const text = "";
+    const result = charCount(text);
 
     expect(result).toBe(0);
-    expect(label.length).toBe(0);
+    expect(text.length).toBe(0);
   });
 
-  it("multi codepoint emoji", () => {
-    const label = "🧟‍♂";
-    const result = charCount(label);
+  it("counts multi codepoint emojis", () => {
+    const text = "🧟‍♂";
+    const result = charCount(text);
 
     expect(result).toBe(3);  // 3 Unicode characters
-    expect(label.length).toBe(4); // 4 UTF-16 code units
+    expect(text.length).toBe(4); // 4 UTF-16 code units
   });
 
-  it("another multi codepoint emoji", () => {
-    const label = "🤦🏼‍♂️";
-    const result = charCount(label);
+  it("counts another multi codepoint emoji", () => {
+    const text = "🤦🏼‍♂️";
+    const result = charCount(text);
 
     expect(result).toBe(5); // 5 Unicode characters
-    expect(label.length).toBe(7); // 7 UTF-16 code units
+    expect(text.length).toBe(7); // 7 UTF-16 code units
   });
 
-  it("namehash string", () => {
-    const label = "namehash";
-    const result = charCount(label);
+  it("counts ascii strings", () => {
+    const text = "namehash";
+    const result = charCount(text);
 
     expect(result).toBe(8); // 8 Unicode characters
-    expect(label.length).toBe(8); // 8 UTF-16 code units
+    expect(text.length).toBe(8); // 8 UTF-16 code units
   });
 
-  it("multi codepoint emoji 15.1", () => {
-    const label = "🏃🏿‍➡";
-    const result = charCount(label);
+  it("counts multi codepoint emoji 15.1", () => {
+    const text = "🏃🏿‍➡";
+    const result = charCount(text);
 
     expect(result).toBe(4); // 4 Unicode characters
-    expect(label.length).toBe(6);  // 6 UTF-16 code units
+    expect(text.length).toBe(6);  // 6 UTF-16 code units
+  });
+});
+
+describe("charSplit", () => {
+  it("splits empty strings", () => {
+    const text = "";
+    const result = charSplit(text);
+
+    expect(result).toEqual([]);
+  });
+
+  it("splits multi codepoint emojis", () => {
+    const text = "🧟‍♂";
+    const result = charSplit(text);
+
+    expect(result).toEqual(["🧟", "‍", "♂"]);  // 3 Unicode characters
+  });
+
+  it("splits another multi codepoint emoji", () => {
+    const text = "🤦🏼‍♂️";
+    const result = charSplit(text);
+
+    expect(result).toEqual(["🤦", "🏼", "‍", "♂", "️"]); // 5 Unicode characters
+  });
+
+  it("splits ascii strings", () => {
+    const text = "namehash";
+    const result = charSplit(text);
+
+    expect(result).toEqual(["n", "a", "m", "e", "h", "a", "s", "h"]); // 8 Unicode characters
+  });
+
+  it("splits multi codepoint emoji 15.1", () => {
+    const text = "🏃🏿‍➡";
+    const result = charSplit(text);
+
+    expect(result).toEqual(["🏃", "🏿", "‍", "➡"]); // 4 Unicode characters
   });
 });
diff --git a/packages/ens-utils/src/ensname.ts b/packages/ens-utils/src/ensname.ts
@@ -327,19 +327,39 @@ export function getRegistrationPotential(name: ENSName): RegistrationPotential {
 }
 
 /**
- * Calculates the number of characters in a label.
+ * Splits a `string` into an array of the Unicode characters it contains.
  * 
- * NOTE: This length will be the same as determined by the EthRegistrarController smart contracts.
- * These contracts calculate length using the following code that counts Unicode characters in UTF-8 encoding.
- * https://github.com/ensdomains/ens-contracts/blob/staging/contracts/ethregistrar/StringUtils.sol
+ * In JavaScript, the `.split("")` method of a `string` may give different
+ * results because it returns an array of UTF-16 code units, not Unicode
+ * characters. For example, the string "😄" (Grinning Face with Smiling Eyes) is
+ * represented by two UTF-16 code units, but is a single Unicode character.
+ * 
+ * @param text the `string` to split into Unicode characters.
+ * @returns An array of the Unicode characters contained in `text`.
+ */
+export function charSplit(text: string): string[] {
+  return [...text];
+}
+
+/**
+ * Counts the number of Unicode characters in a `string`.
  * 
- * This length may be different than the traditional ".length" property of a string in JavaScript.
- * In Javascript, the ".length" property of a string returns the number of UTF-16 code units in that string.
- * UTF-16 represents Unicode characters with codepoints higher can fit within a 16 bit value as a "surrogate pair"
- * of UTF-16 code units. This means that some Unicode characters are represented by *more than one* UTF-16 code unit.
- * @param label
- * @returns the number of characters within `label`.
+ * This count may differ from the traditional `.length` property of a
+ * `string` in JavaScript. In JavaScript, the `.length` property of a `string`
+ * returns the number of UTF-16 code units in that `string`. Some Unicode
+ * characters have codepoints too large to fit within a 16-bit value. For
+ * example, the character "😄" (Grinning Face with Smiling Eyes) has a codepoint
+ * of U+1F604, which exceeds 16 bits. Therefore, UTF-16 represents such
+ * characters using two 16-bit code units, which together are known as a
+ * "surrogate pair".
+ * 
+ * NOTE: This length will be the same as determined by the
+ * EthRegistrarController smart contracts. These contracts calculate length
+ * using the following code that counts Unicode characters in UTF-8 encoding.
+ * https://github.com/ensdomains/ens-contracts/blob/staging/contracts/ethregistrar/StringUtils.sol
+ * @param text the `string` to count the Unicode characters of.
+ * @returns the count of Unicode characters in `text`.
  */
-export function charCount(label: string) {
-  return [...label].length;
+export function charCount(text: string) {
+  return charSplit(text).length;
 }
diff --git a/packages/nameguard-js/src/confusables.ts b/packages/nameguard-js/src/confusables.ts
@@ -1,5 +1,6 @@
 import { GRAPHEME_CANONICALS } from "./data/canonicals";
-import { isCombiningChar, splitCharacters } from "./utils";
+import { isCombiningChar } from "./utils";
+import { charSplit } from "@namehash/ens-utils";
 
 /**
  * Checks if a grapheme is of the form `base character + sequence of combining marks`.
@@ -9,7 +10,7 @@ import { isCombiningChar, splitCharacters } from "./utils";
  * @returns A boolean indicating whether the grapheme has combining marks.
  */
 function graphemeHasCombiningMarks(grapheme: string): boolean {
-  const characters = splitCharacters(grapheme);
+  const characters = charSplit(grapheme);
   return (
     // has more than one character
     characters.length > 1 &&
@@ -86,7 +87,7 @@ export function isGraphemeConfusable(grapheme: string): boolean {
  * @returns The canonical form of the grapheme, or null if the canonical form is not known.
  */
 function getCanonicalGrapheme(grapheme: string): string | null {
-  const characters = splitCharacters(grapheme);
+  const characters = charSplit(grapheme);
 
   if (graphemeHasCombiningMarks(grapheme)) {
     return characters[0];
@@ -123,7 +124,7 @@ export function getCanonical(grapheme: string): string | null {
   } else {
     // If getCanonicalGrapheme failed, try looking at only the first character
     // which might be present in the confusables list (in contrast to the full grapheme)
-    return getCanonical(splitCharacters(grapheme)[0]);
+    return getCanonical(charSplit(grapheme)[0]);
   }
 }
 

diff --git a/packages/nameguard-js/src/graphemes.ts b/packages/nameguard-js/src/graphemes.ts
@@ -1,5 +1,6 @@
 import { splitGraphemes as unicodeStandardGraphemeSplit } from "text-segmentation";
-import { splitCharacters, isEmoji } from "./utils";
+import { isEmoji } from "./utils";
+import { charSplit } from "@namehash/ens-utils";
 import { HANGUL_JAMO } from "./data/hangul";
 import { INVISIBLE_JOINERS } from "./data/invisible_joiners";
 
@@ -29,7 +30,7 @@ export function splitGraphemes(name: string): string[] {
   // break up invisible characters
   let graphemesWithSplitInvisibles = [];
   for (const graphemeStr of graphemes) {
-    const graphemeCps = splitCharacters(graphemeStr);
+    const graphemeCps = charSplit(graphemeStr);
     let i = graphemeCps.length - 1;
     while (i >= 0) {
       if (INVISIBLE_JOINERS.has(graphemeCps[i])) {
@@ -59,7 +60,7 @@ export function splitGraphemes(name: string): string[] {
   // break up Hangul Jamo
   let graphemesWithSplitHangul = [];
   for (const graphemeStr of graphemes) {
-    const graphemeCps = splitCharacters(graphemeStr);
+    const graphemeCps = charSplit(graphemeStr);
     let i = 0;
     let j = 1;
     while (j < graphemeCps.length) {

diff --git a/packages/nameguard-js/src/utils.ts b/packages/nameguard-js/src/utils.ts
@@ -2,8 +2,6 @@ import { charCount } from "@namehash/ens-utils";
 import { EMOJI_SEQUENCES, EMOJI_ZWJ_SEQUENCES, EMOJI_BLOCK_STARTS, EMOJI_BLOCK_IS_EMOJI } from "./data/unicode";
 import { COMBINING } from "./data/combining";
 
-export { charCount };
-
 /**
  * Checks if the given string is a single character.
  * 
@@ -14,16 +12,6 @@ export function isCharacter(text: string): boolean {
   return charCount(text) === 1;
 }
 
-/**
- * Splits a string into an array of individual characters.
- *
- * @param text - The string to split.
- * @returns An array of individual characters.
- */
-export function splitCharacters(text: string): string[] {
-  return [...text];
-}
-
 export function isEmojiSequence(grapheme: string): boolean {
   return EMOJI_SEQUENCES.has(grapheme);
 }