diff --git a/.changeset/warm-gorillas-film.md b/.changeset/warm-gorillas-film.md
new file mode 100644
index 000000000..2ba8d6ef9
--- /dev/null
+++ b/.changeset/warm-gorillas-film.md
@@ -0,0 +1,6 @@
+---
+"@namehash/ens-utils": minor
+"@namehash/nameguard-js": minor
+---
+
+Add charSplit to ens-utils and use it in nameguard-js in place of splitCharacters
diff --git a/packages/ens-utils/src/ensname.test.ts b/packages/ens-utils/src/ensname.test.ts
index ede7a680a..cffb4f375 100644
--- a/packages/ens-utils/src/ensname.test.ts
+++ b/packages/ens-utils/src/ensname.test.ts
@@ -1,6 +1,7 @@
 import { describe, it, expect } from "vitest";
 import {
   buildENSName,
+  charSplit,
   charCount,
   getDecentralizationStatus,
   getDisplayLabels,
@@ -405,43 +406,80 @@ describe("getRegistrationPotential", () => {
 });
 
 describe("charCount", () => {
-  it("empty name", () => {
-    const label = "";
-    const result = charCount(label);
+  it("counts empty strings", () => {
+    const text = "";
+    const result = charCount(text);
 
     expect(result).toBe(0);
-    expect(label.length).toBe(0);
+    expect(text.length).toBe(0);
   });
 
-  it("multi codepoint emoji", () => {
-    const label = "🧟‍♂";
-    const result = charCount(label);
+  it("counts multi codepoint emojis", () => {
+    const text = "🧟‍♂";
+    const result = charCount(text);
 
     expect(result).toBe(3); // 3 Unicode characters
-    expect(label.length).toBe(4); // 4 UTF-16 code units
+    expect(text.length).toBe(4); // 4 UTF-16 code units
   });
 
-  it("another multi codepoint emoji", () => {
-    const label = "🤦🏼‍♂️";
-    const result = charCount(label);
+  it("counts another multi codepoint emoji", () => {
+    const text = "🤦🏼‍♂️";
+    const result = charCount(text);
 
     expect(result).toBe(5); // 5 Unicode characters
-    expect(label.length).toBe(7); // 7 UTF-16 code units
+    expect(text.length).toBe(7); // 7 UTF-16 code units
   });
 
-  it("namehash string", () => {
-    const label = "namehash";
-    const result = charCount(label);
+  it("counts ascii strings", () => {
+    const text = "namehash";
+    const result = charCount(text);
 
     expect(result).toBe(8); // 8 Unicode characters
-    expect(label.length).toBe(8); // 8 UTF-16 code units
+    expect(text.length).toBe(8); // 8 UTF-16 code units
   });
 
-  it("multi codepoint emoji 15.1", () => {
-    const label = "🏃🏿‍➡";
-    const result = charCount(label);
+  it("counts multi codepoint emoji 15.1", () => {
+    const text = "🏃🏿‍➡";
+    const result = charCount(text);
 
     expect(result).toBe(4); // 4 Unicode characters
-    expect(label.length).toBe(6); // 6 UTF-16 code units
+    expect(text.length).toBe(6); // 6 UTF-16 code units
+  });
+});
+
+describe("charSplit", () => {
+  it("splits empty strings", () => {
+    const text = "";
+    const result = charSplit(text);
+
+    expect(result).toEqual([]);
+  });
+
+  it("splits multi codepoint emojis", () => {
+    const text = "🧟‍♂";
+    const result = charSplit(text);
+
+    expect(result).toEqual(["🧟", "‍", "♂"]); // 3 Unicode characters
+  });
+
+  it("splits another multi codepoint emoji", () => {
+    const text = "🤦🏼‍♂️";
+    const result = charSplit(text);
+
+    expect(result).toEqual(["🤦", "🏼", "‍", "♂", "️"]); // 5 Unicode characters
+  });
+
+  it("splits ascii strings", () => {
+    const text = "namehash";
+    const result = charSplit(text);
+
+    expect(result).toEqual(["n", "a", "m", "e", "h", "a", "s", "h"]); // 8 Unicode characters
+  });
+
+  it("splits multi codepoint emoji 15.1", () => {
+    const text = "🏃🏿‍➡";
+    const result = charSplit(text);
+
+    expect(result).toEqual(["🏃", "🏿", "‍", "➡"]); // 4 Unicode characters
   });
 });
diff --git a/packages/ens-utils/src/ensname.ts b/packages/ens-utils/src/ensname.ts
index bf4b957c5..e89e06bef 100644
--- a/packages/ens-utils/src/ensname.ts
+++ b/packages/ens-utils/src/ensname.ts
@@ -327,19 +327,39 @@ export function getRegistrationPotential(name: ENSName): RegistrationPotential {
 }
 
 /**
- * Calculates the number of characters in a label.
+ * Splits a `string` into an array of the Unicode characters it contains.
  *
- * NOTE: This length will be the same as determined by the EthRegistrarController smart contracts.
- * These contracts calculate length using the following code that counts Unicode characters in UTF-8 encoding.
- * https://github.com/ensdomains/ens-contracts/blob/staging/contracts/ethregistrar/StringUtils.sol
+ * In JavaScript, the `.split("")` method of a `string` may give different
+ * results because it returns an array of UTF-16 code units, not Unicode
+ * characters. For example, the string "😄" (Grinning Face with Smiling Eyes) is
+ * represented by two UTF-16 code units, but is a single Unicode character.
+ *
+ * @param text the `string` to split into Unicode characters.
+ * @returns An array of the Unicode characters contained in `text`.
+ */
+export function charSplit(text: string): string[] {
+  return [...text];
+}
+
+/**
+ * Counts the number of Unicode characters in a `string`.
  *
- * This length may be different than the traditional ".length" property of a string in JavaScript.
- * In Javascript, the ".length" property of a string returns the number of UTF-16 code units in that string.
- * UTF-16 represents Unicode characters with codepoints higher can fit within a 16 bit value as a "surrogate pair"
- * of UTF-16 code units. This means that some Unicode characters are represented by *more than one* UTF-16 code unit.
- * @param label
- * @returns the number of characters within `label`.
+ * This length may be different than the traditional `.length` property of a
+ * `string` in JavaScript. In JavaScript, the `.length` property of a `string`
+ * returns the number of UTF-16 code units in that `string`. Some Unicode
+ * characters have codepoints higher than can fit within a 16 bit value. For
+ * example, the character "😄" (Grinning Face with Smiling Eyes) has a codepoint
+ * of U+1F604, which exceeds 16 bits. Therefore, UTF-16 represents such
+ * characters using *more than one* 16-bit code unit, which is known as a
+ * "surrogate pair".
+ *
+ * NOTE: This length will be the same as determined by the
+ * EthRegistrarController smart contracts. These contracts calculate length
+ * using the following code that counts Unicode characters in UTF-8 encoding.
+ * https://github.com/ensdomains/ens-contracts/blob/staging/contracts/ethregistrar/StringUtils.sol
+ * @param text the `string` to count the Unicode characters of.
+ * @returns the count of Unicode characters in `text`.
  */
-export function charCount(label: string) {
-  return [...label].length;
+export function charCount(text: string): number {
+  return charSplit(text).length;
 }
diff --git a/packages/nameguard-js/src/confusables.ts b/packages/nameguard-js/src/confusables.ts
index 48ceedcb8..27699a6e7 100644
--- a/packages/nameguard-js/src/confusables.ts
+++ b/packages/nameguard-js/src/confusables.ts
@@ -1,5 +1,6 @@
 import { GRAPHEME_CANONICALS } from "./data/canonicals";
-import { isCombiningChar, splitCharacters } from "./utils";
+import { isCombiningChar } from "./utils";
+import { charSplit } from "@namehash/ens-utils";
 
 /**
  * Checks if a grapheme is of the form `base character + sequence of combining marks`.
@@ -9,7 +10,7 @@ import { isCombiningChar, splitCharacters } from "./utils";
  * @returns A boolean indicating whether the grapheme has combining marks.
  */
 function graphemeHasCombiningMarks(grapheme: string): boolean {
-  const characters = splitCharacters(grapheme);
+  const characters = charSplit(grapheme);
   return (
     // has more than one character
     characters.length > 1 &&
@@ -86,7 +87,7 @@ export function isGraphemeConfusable(grapheme: string): boolean {
  * @returns The canonical form of the grapheme, or null if the canonical form is not known.
  */
 function getCanonicalGrapheme(grapheme: string): string | null {
-  const characters = splitCharacters(grapheme);
+  const characters = charSplit(grapheme);
 
   if (graphemeHasCombiningMarks(grapheme)) {
     return characters[0];
@@ -123,7 +124,7 @@ export function getCanonical(grapheme: string): string | null {
   } else {
     // If getCanonicalGrapheme failed, try looking at only the first character
     // which might be present in the confusables list (in contrast to the full grapheme)
-    return getCanonical(splitCharacters(grapheme)[0]);
+    return getCanonical(charSplit(grapheme)[0]);
   }
 }
 
diff --git a/packages/nameguard-js/src/graphemes.ts b/packages/nameguard-js/src/graphemes.ts
index d413cbd79..0eee3b88a 100644
--- a/packages/nameguard-js/src/graphemes.ts
+++ b/packages/nameguard-js/src/graphemes.ts
@@ -1,5 +1,6 @@
 import { splitGraphemes as unicodeStandardGraphemeSplit } from "text-segmentation";
-import { splitCharacters, isEmoji } from "./utils";
+import { isEmoji } from "./utils";
+import { charSplit } from "@namehash/ens-utils";
 import { HANGUL_JAMO } from "./data/hangul";
 import { INVISIBLE_JOINERS } from "./data/invisible_joiners";
 
@@ -29,7 +30,7 @@ export function splitGraphemes(name: string): string[] {
   // break up invisible characters
   let graphemesWithSplitInvisibles = [];
   for (const graphemeStr of graphemes) {
-    const graphemeCps = splitCharacters(graphemeStr);
+    const graphemeCps = charSplit(graphemeStr);
     let i = graphemeCps.length - 1;
     while (i >= 0) {
       if (INVISIBLE_JOINERS.has(graphemeCps[i])) {
@@ -59,7 +60,7 @@ export function splitGraphemes(name: string): string[] {
   // break up Hangul Jamo
   let graphemesWithSplitHangul = [];
   for (const graphemeStr of graphemes) {
-    const graphemeCps = splitCharacters(graphemeStr);
+    const graphemeCps = charSplit(graphemeStr);
     let i = 0;
     let j = 1;
     while (j < graphemeCps.length) {
diff --git a/packages/nameguard-js/src/utils.ts b/packages/nameguard-js/src/utils.ts
index 36bff40e6..75d3dd4e0 100644
--- a/packages/nameguard-js/src/utils.ts
+++ b/packages/nameguard-js/src/utils.ts
@@ -2,8 +2,6 @@ import { charCount } from "@namehash/ens-utils";
 import { EMOJI_SEQUENCES, EMOJI_ZWJ_SEQUENCES, EMOJI_BLOCK_STARTS, EMOJI_BLOCK_IS_EMOJI } from "./data/unicode";
 import { COMBINING } from "./data/combining";
 
-export { charCount };
-
 /**
  * Checks if the given string is a single character.
  *
@@ -14,16 +12,6 @@ export function isCharacter(text: string): boolean {
   return charCount(text) === 1;
 }
 
-/**
- * Splits a string into an array of individual characters.
- *
- * @param text - The string to split.
- * @returns An array of individual characters.
- */
-export function splitCharacters(text: string): string[] {
-  return [...text];
-}
-
 export function isEmojiSequence(grapheme: string): boolean {
   return EMOJI_SEQUENCES.has(grapheme);
 }