Skip to content

Commit

Permalink
Add charSplit to ens-utils (#400)
Browse files Browse the repository at this point in the history
* add splitCharacters to ens-utils

* add splitCharacters to ens-utils

* update versions

* add changesets

* revert version change

* rename label to text

* fix docs

* Refine docs

* Align naming pattern of related utility functions and refine language in unit tests

* Update NameGuard.js to use updated function names

---------

Co-authored-by: Carbon225 <[email protected]>
Co-authored-by: lightwalker.eth <[email protected]>
  • Loading branch information
3 people authored Sep 26, 2024
1 parent 1885ca4 commit ba89172
Show file tree
Hide file tree
Showing 6 changed files with 105 additions and 51 deletions.
6 changes: 6 additions & 0 deletions .changeset/warm-gorillas-film.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
"@namehash/ens-utils": minor
"@namehash/nameguard-js": minor
---

Add charSplit to ens-utils
78 changes: 58 additions & 20 deletions packages/ens-utils/src/ensname.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { describe, it, expect } from "vitest";
import {
buildENSName,
charSplit,
charCount,
getDecentralizationStatus,
getDisplayLabels,
Expand Down Expand Up @@ -405,43 +406,80 @@ describe("getRegistrationPotential", () => {
});

describe("charCount", () => {
it("empty name", () => {
const label = "";
const result = charCount(label);
it("counts empty strings", () => {
const text = "";
const result = charCount(text);

expect(result).toBe(0);
expect(label.length).toBe(0);
expect(text.length).toBe(0);
});

it("multi codepoint emoji", () => {
const label = "🧟‍♂";
const result = charCount(label);
it("counts multi codepoint emojis", () => {
const text = "🧟‍♂";
const result = charCount(text);

expect(result).toBe(3); // 3 Unicode characters
expect(label.length).toBe(4); // 4 UTF-16 code units
expect(text.length).toBe(4); // 4 UTF-16 code units
});

it("another multi codepoint emoji", () => {
const label = "🤦🏼‍♂️";
const result = charCount(label);
it("counts another multi codepoint emoji", () => {
const text = "🤦🏼‍♂️";
const result = charCount(text);

expect(result).toBe(5); // 5 Unicode characters
expect(label.length).toBe(7); // 7 UTF-16 code units
expect(text.length).toBe(7); // 7 UTF-16 code units
});

it("namehash string", () => {
const label = "namehash";
const result = charCount(label);
it("counts ascii strings", () => {
const text = "namehash";
const result = charCount(text);

expect(result).toBe(8); // 8 Unicode characters
expect(label.length).toBe(8); // 8 UTF-16 code units
expect(text.length).toBe(8); // 8 UTF-16 code units
});

it("multi codepoint emoji 15.1", () => {
const label = "🏃🏿‍➡";
const result = charCount(label);
it("counts multi codepoint emoji 15.1", () => {
const text = "🏃🏿‍➡";
const result = charCount(text);

expect(result).toBe(4); // 4 Unicode characters
expect(label.length).toBe(6); // 6 UTF-16 code units
expect(text.length).toBe(6); // 6 UTF-16 code units
});
});

// Verifies that charSplit breaks text into Unicode characters (code points)
// rather than UTF-16 code units or user-perceived graphemes.
describe("charSplit", () => {
  it("splits empty strings", () => {
    // No characters in, no characters out.
    expect(charSplit("")).toEqual([]);
  });

  it("splits multi codepoint emojis", () => {
    const characters = charSplit("🧟‍♂");

    // 3 Unicode characters
    expect(characters).toEqual(["🧟", "‍", "♂"]);
  });

  it("splits another multi codepoint emoji", () => {
    const characters = charSplit("🤦🏼‍♂️");

    // 5 Unicode characters
    expect(characters).toEqual(["🤦", "🏼", "‍", "♂", "️"]);
  });

  it("splits ascii strings", () => {
    const characters = charSplit("namehash");

    // 8 Unicode characters
    expect(characters).toEqual(["n", "a", "m", "e", "h", "a", "s", "h"]);
  });

  it("splits multi codepoint emoji 15.1", () => {
    const characters = charSplit("🏃🏿‍➡");

    // 4 Unicode characters
    expect(characters).toEqual(["🏃", "🏿", "‍", "➡"]);
  });
});
44 changes: 32 additions & 12 deletions packages/ens-utils/src/ensname.ts
Original file line number Diff line number Diff line change
Expand Up @@ -327,19 +327,39 @@ export function getRegistrationPotential(name: ENSName): RegistrationPotential {
}

/**
* Calculates the number of characters in a label.
* Splits a `string` into an array of the Unicode characters it contains.
*
* NOTE: This length will be the same as determined by the EthRegistrarController smart contracts.
* These contracts calculate length using the following code that counts Unicode characters in UTF-8 encoding.
* https://github.com/ensdomains/ens-contracts/blob/staging/contracts/ethregistrar/StringUtils.sol
* In JavaScript, the `.split("")` method of a `string` may give different
* results because it returns an array of UTF-16 code units, not Unicode
* characters. For example, the string "😄" (Grinning Face with Smiling Eyes) is
* represented by two UTF-16 code units, but is a single Unicode character.
*
* @param text the `string` to split into Unicode characters.
* @returns An array of the Unicode characters contained in `text`.
*/
export function charSplit(text: string): string[] {
  // Array.from consumes the string's iterator, which yields one entry per
  // Unicode character (code point) instead of one per UTF-16 code unit.
  return Array.from(text);
}

/**
* Counts the number of Unicode characters in a `string`.
*
* This length may be different than the traditional ".length" property of a string in JavaScript.
* In Javascript, the ".length" property of a string returns the number of UTF-16 code units in that string.
* UTF-16 represents Unicode characters with codepoints higher can fit within a 16 bit value as a "surrogate pair"
* of UTF-16 code units. This means that some Unicode characters are represented by *more than one* UTF-16 code unit.
* @param label
* @returns the number of characters within `label`.
* This length may be different from the traditional `.length` property of a
* `string` in JavaScript. In JavaScript, the `.length` property of a `string`
* returns the number of UTF-16 code units in that `string`. Some Unicode
* characters have code points too large to fit within a 16-bit value. For
* example, the character "😄" (Grinning Face with Smiling Eyes) has a code point
* of U+1F604, which exceeds 16 bits. Therefore, UTF-16 represents each such
* character using two 16-bit code units, which together are known as a
* "surrogate pair".
*
* NOTE: This length will be the same as determined by the
* EthRegistrarController smart contracts. These contracts calculate length
* using the following code that counts Unicode characters in UTF-8 encoding.
* https://github.com/ensdomains/ens-contracts/blob/staging/contracts/ethregistrar/StringUtils.sol
* @param text the `string` to count the Unicode characters of.
* @returns the count of Unicode characters in `text`.
*/
export function charCount(label: string) {
return [...label].length;
export function charCount(text: string) {
return charSplit(text).length;
}
9 changes: 5 additions & 4 deletions packages/nameguard-js/src/confusables.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { GRAPHEME_CANONICALS } from "./data/canonicals";
import { isCombiningChar, splitCharacters } from "./utils";
import { isCombiningChar } from "./utils";
import { charSplit } from "@namehash/ens-utils";

/**
* Checks if a grapheme is of the form `base character + sequence of combining marks`.
Expand All @@ -9,7 +10,7 @@ import { isCombiningChar, splitCharacters } from "./utils";
* @returns A boolean indicating whether the grapheme has combining marks.
*/
function graphemeHasCombiningMarks(grapheme: string): boolean {
const characters = splitCharacters(grapheme);
const characters = charSplit(grapheme);
return (
// has more than one character
characters.length > 1 &&
Expand Down Expand Up @@ -86,7 +87,7 @@ export function isGraphemeConfusable(grapheme: string): boolean {
* @returns The canonical form of the grapheme, or null if the canonical form is not known.
*/
function getCanonicalGrapheme(grapheme: string): string | null {
const characters = splitCharacters(grapheme);
const characters = charSplit(grapheme);

if (graphemeHasCombiningMarks(grapheme)) {
return characters[0];
Expand Down Expand Up @@ -123,7 +124,7 @@ export function getCanonical(grapheme: string): string | null {
} else {
// If getCanonicalGrapheme failed, try looking at only the first character
// which might be present in the confusables list (in contrast to the full grapheme)
return getCanonical(splitCharacters(grapheme)[0]);
return getCanonical(charSplit(grapheme)[0]);
}
}

Expand Down
7 changes: 4 additions & 3 deletions packages/nameguard-js/src/graphemes.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { splitGraphemes as unicodeStandardGraphemeSplit } from "text-segmentation";
import { splitCharacters, isEmoji } from "./utils";
import { isEmoji } from "./utils";
import { charSplit } from "@namehash/ens-utils";
import { HANGUL_JAMO } from "./data/hangul";
import { INVISIBLE_JOINERS } from "./data/invisible_joiners";

Expand Down Expand Up @@ -29,7 +30,7 @@ export function splitGraphemes(name: string): string[] {
// break up invisible characters
let graphemesWithSplitInvisibles = [];
for (const graphemeStr of graphemes) {
const graphemeCps = splitCharacters(graphemeStr);
const graphemeCps = charSplit(graphemeStr);
let i = graphemeCps.length - 1;
while (i >= 0) {
if (INVISIBLE_JOINERS.has(graphemeCps[i])) {
Expand Down Expand Up @@ -59,7 +60,7 @@ export function splitGraphemes(name: string): string[] {
// break up Hangul Jamo
let graphemesWithSplitHangul = [];
for (const graphemeStr of graphemes) {
const graphemeCps = splitCharacters(graphemeStr);
const graphemeCps = charSplit(graphemeStr);
let i = 0;
let j = 1;
while (j < graphemeCps.length) {
Expand Down
12 changes: 0 additions & 12 deletions packages/nameguard-js/src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@ import { charCount } from "@namehash/ens-utils";
import { EMOJI_SEQUENCES, EMOJI_ZWJ_SEQUENCES, EMOJI_BLOCK_STARTS, EMOJI_BLOCK_IS_EMOJI } from "./data/unicode";
import { COMBINING } from "./data/combining";

export { charCount };

/**
* Checks if the given string is a single character.
*
Expand All @@ -14,16 +12,6 @@ export function isCharacter(text: string): boolean {
return charCount(text) === 1;
}

/**
 * Breaks a string apart into its individual Unicode characters.
 *
 * Iteration follows the string's own iterator, so a character that is stored
 * as a UTF-16 surrogate pair is returned as a single array element.
 *
 * @param text - The string to break apart.
 * @returns An array holding each character of `text`, in order.
 */
export function splitCharacters(text: string): string[] {
  const characters: string[] = [];
  for (const character of text) {
    characters.push(character);
  }
  return characters;
}

export function isEmojiSequence(grapheme: string): boolean {
  // Membership lookup in EMOJI_SEQUENCES (imported from ./data/unicode).
  const isKnownSequence = EMOJI_SEQUENCES.has(grapheme);
  return isKnownSequence;
}
Expand Down

0 comments on commit ba89172

Please sign in to comment.