From 5ade78ddb311663df90a6e7a81ea4520a1408da9 Mon Sep 17 00:00:00 2001 From: dan Date: Tue, 10 Dec 2024 19:45:04 +0000 Subject: [PATCH] Add fast path skipping UTF8 length counting (#2819) * Harden UTF8 length test cases * Harden tests to account for new fast path * Add fast paths that skip UTF8 encoding --- packages/lexicon/src/validators/primitives.ts | 59 ++++-- packages/lexicon/tests/_scaffolds/lexicons.ts | 18 ++ packages/lexicon/tests/general.test.ts | 191 +++++++++++++++++- 3 files changed, 246 insertions(+), 22 deletions(-) diff --git a/packages/lexicon/src/validators/primitives.ts b/packages/lexicon/src/validators/primitives.ts index 007dc7be234..81728279e02 100644 --- a/packages/lexicon/src/validators/primitives.ts +++ b/packages/lexicon/src/validators/primitives.ts @@ -198,27 +198,52 @@ export function string( } // maxLength and minLength - if (typeof def.maxLength === 'number' || typeof def.minLength === 'number') { - const len = utf8Len(value) + if (typeof def.minLength === 'number' || typeof def.maxLength === 'number') { + // If the JavaScript string length * 3 is below the minimum limit, + // its UTF8 length (which is <= .length * 3) will also be below it. + if (typeof def.minLength === 'number' && value.length * 3 < def.minLength) { + return { + success: false, + error: new ValidationError( + `${path} must not be shorter than ${def.minLength} characters`, + ), + } + } - if (typeof def.maxLength === 'number') { - if (len > def.maxLength) { - return { - success: false, - error: new ValidationError( - `${path} must not be longer than ${def.maxLength} characters`, - ), + // If the JavaScript string length * 3 is within the maximum limit, + // its UTF8 length (which is <= .length * 3) will also be within it. + // When there's no minimum length, this lets us skip the UTF8 length check. 
+ let canSkipUtf8LenChecks = false + if ( + typeof def.minLength === 'undefined' && + typeof def.maxLength === 'number' && + value.length * 3 <= def.maxLength + ) { + canSkipUtf8LenChecks = true + } + + if (!canSkipUtf8LenChecks) { + const len = utf8Len(value) + + if (typeof def.maxLength === 'number') { + if (len > def.maxLength) { + return { + success: false, + error: new ValidationError( + `${path} must not be longer than ${def.maxLength} characters`, + ), + } } } - } - if (typeof def.minLength === 'number') { - if (len < def.minLength) { - return { - success: false, - error: new ValidationError( - `${path} must not be shorter than ${def.minLength} characters`, - ), + if (typeof def.minLength === 'number') { + if (len < def.minLength) { + return { + success: false, + error: new ValidationError( + `${path} must not be shorter than ${def.minLength} characters`, + ), + } } } } diff --git a/packages/lexicon/tests/_scaffolds/lexicons.ts b/packages/lexicon/tests/_scaffolds/lexicons.ts index d0cf414ccef..cc9cede6ef9 100644 --- a/packages/lexicon/tests/_scaffolds/lexicons.ts +++ b/packages/lexicon/tests/_scaffolds/lexicons.ts @@ -313,6 +313,24 @@ const lexicons: LexiconDoc[] = [ }, }, }, + { + lexicon: 1, + id: 'com.example.stringLengthNoMinLength', + defs: { + main: { + type: 'record', + record: { + type: 'object', + properties: { + string: { + type: 'string', + maxLength: 4, + }, + }, + }, + }, + }, + }, { lexicon: 1, id: 'com.example.stringLengthGrapheme', diff --git a/packages/lexicon/tests/general.test.ts b/packages/lexicon/tests/general.test.ts index 2d493a23d09..abd337a7329 100644 --- a/packages/lexicon/tests/general.test.ts +++ b/packages/lexicon/tests/general.test.ts @@ -567,26 +567,207 @@ describe('Record validation', () => { }) it('Applies string length constraint', () => { + // Shorter than two UTF8 characters + expect(() => + lex.assertValidRecord('com.example.stringLength', { + $type: 'com.example.stringLength', + string: '', + }), + 
).toThrow('Record/string must not be shorter than 2 characters') + expect(() => + lex.assertValidRecord('com.example.stringLength', { + $type: 'com.example.stringLength', + string: 'a', + }), + ).toThrow('Record/string must not be shorter than 2 characters') + + // Two to four UTF8 characters + lex.assertValidRecord('com.example.stringLength', { + $type: 'com.example.stringLength', + string: 'ab', + }) + lex.assertValidRecord('com.example.stringLength', { + $type: 'com.example.stringLength', + string: '\u0301', // Combining acute accent (2 bytes) + }) + lex.assertValidRecord('com.example.stringLength', { + $type: 'com.example.stringLength', + string: 'a\u0301', // 'a' + combining acute accent (1 + 2 bytes = 3 bytes) + }) + lex.assertValidRecord('com.example.stringLength', { + $type: 'com.example.stringLength', + string: 'aé', // 'a' (1 byte) + 'é' (2 bytes) = 3 bytes + }) + lex.assertValidRecord('com.example.stringLength', { + $type: 'com.example.stringLength', + string: 'abc', + }) lex.assertValidRecord('com.example.stringLength', { $type: 'com.example.stringLength', - string: '123', + string: '一', // CJK character (3 bytes) }) + lex.assertValidRecord('com.example.stringLength', { + $type: 'com.example.stringLength', + string: '\uD83D', // Unpaired high surrogate (3 bytes) + }) + lex.assertValidRecord('com.example.stringLength', { + $type: 'com.example.stringLength', + string: 'abcd', + }) + lex.assertValidRecord('com.example.stringLength', { + $type: 'com.example.stringLength', + string: 'éé', // 'é' + 'é' (2 + 2 bytes = 4 bytes) + }) + lex.assertValidRecord('com.example.stringLength', { + $type: 'com.example.stringLength', + string: 'aaé', // 1 + 1 + 2 = 4 bytes + }) + lex.assertValidRecord('com.example.stringLength', { + $type: 'com.example.stringLength', + string: '👋', // 4 bytes + }) + expect(() => lex.assertValidRecord('com.example.stringLength', { $type: 'com.example.stringLength', - string: '1', + string: 'abcde', }), - ).toThrow('Record/string must not be 
shorter than 2 characters') + ).toThrow('Record/string must not be longer than 4 characters') expect(() => lex.assertValidRecord('com.example.stringLength', { $type: 'com.example.stringLength', - string: '12345', + string: 'a\u0301\u0301', // 1 + (2 * 2) = 5 bytes }), ).toThrow('Record/string must not be longer than 4 characters') expect(() => lex.assertValidRecord('com.example.stringLength', { $type: 'com.example.stringLength', - string: '👨‍👩‍👧‍👧', + string: '\uD83D\uD83D', // Two unpaired high surrogates (3 * 2 = 6 bytes) + }), + ).toThrow('Record/string must not be longer than 4 characters') + expect(() => + lex.assertValidRecord('com.example.stringLength', { + $type: 'com.example.stringLength', + string: 'ééé', // 2 + 2 + 2 bytes = 6 bytes + }), + ).toThrow('Record/string must not be longer than 4 characters') + expect(() => + lex.assertValidRecord('com.example.stringLength', { + $type: 'com.example.stringLength', + string: '👋a', // 4 + 1 bytes = 5 bytes + }), + ).toThrow('Record/string must not be longer than 4 characters') + expect(() => + lex.assertValidRecord('com.example.stringLength', { + $type: 'com.example.stringLength', + string: '👨👨', // 4 + 4 = 8 bytes + }), + ).toThrow('Record/string must not be longer than 4 characters') + expect(() => + lex.assertValidRecord('com.example.stringLength', { + $type: 'com.example.stringLength', + string: '👨‍👩‍👧‍👧', // 4 emojis × 4 bytes + 3 ZWJs × 3 bytes = 25 bytes + }), + ).toThrow('Record/string must not be longer than 4 characters') + }) + + it('Applies string length constraint (no minLength)', () => { + // Shorter than two UTF8 characters + lex.assertValidRecord('com.example.stringLengthNoMinLength', { + $type: 'com.example.stringLengthNoMinLength', + string: '', + }) + lex.assertValidRecord('com.example.stringLengthNoMinLength', { + $type: 'com.example.stringLengthNoMinLength', + string: 'a', + }) + + // Two to four UTF8 characters + lex.assertValidRecord('com.example.stringLengthNoMinLength', { + $type: 
'com.example.stringLengthNoMinLength', + string: 'ab', + }) + lex.assertValidRecord('com.example.stringLengthNoMinLength', { + $type: 'com.example.stringLengthNoMinLength', + string: '\u0301', // Combining acute accent (2 bytes) + }) + lex.assertValidRecord('com.example.stringLengthNoMinLength', { + $type: 'com.example.stringLengthNoMinLength', + string: 'a\u0301', // 'a' + combining acute accent (1 + 2 bytes = 3 bytes) + }) + lex.assertValidRecord('com.example.stringLengthNoMinLength', { + $type: 'com.example.stringLengthNoMinLength', + string: 'aé', // 'a' (1 byte) + 'é' (2 bytes) = 3 bytes + }) + lex.assertValidRecord('com.example.stringLengthNoMinLength', { + $type: 'com.example.stringLengthNoMinLength', + string: 'abc', + }) + lex.assertValidRecord('com.example.stringLengthNoMinLength', { + $type: 'com.example.stringLengthNoMinLength', + string: '一', // CJK character (3 bytes) + }) + lex.assertValidRecord('com.example.stringLengthNoMinLength', { + $type: 'com.example.stringLengthNoMinLength', + string: '\uD83D', // Unpaired high surrogate (3 bytes) + }) + lex.assertValidRecord('com.example.stringLengthNoMinLength', { + $type: 'com.example.stringLengthNoMinLength', + string: 'abcd', + }) + lex.assertValidRecord('com.example.stringLengthNoMinLength', { + $type: 'com.example.stringLengthNoMinLength', + string: 'éé', // 'é' + 'é' (2 + 2 bytes = 4 bytes) + }) + lex.assertValidRecord('com.example.stringLengthNoMinLength', { + $type: 'com.example.stringLengthNoMinLength', + string: 'aaé', // 1 + 1 + 2 = 4 bytes + }) + lex.assertValidRecord('com.example.stringLengthNoMinLength', { + $type: 'com.example.stringLengthNoMinLength', + string: '👋', // 4 bytes + }) + + expect(() => + lex.assertValidRecord('com.example.stringLengthNoMinLength', { + $type: 'com.example.stringLengthNoMinLength', + string: 'abcde', + }), + ).toThrow('Record/string must not be longer than 4 characters') + expect(() => + lex.assertValidRecord('com.example.stringLengthNoMinLength', { + $type: 
'com.example.stringLengthNoMinLength', + string: 'a\u0301\u0301', // 1 + (2 * 2) = 5 bytes + }), + ).toThrow('Record/string must not be longer than 4 characters') + expect(() => + lex.assertValidRecord('com.example.stringLengthNoMinLength', { + $type: 'com.example.stringLengthNoMinLength', + string: '\uD83D\uD83D', // Two unpaired high surrogates (3 * 2 = 6 bytes) + }), + ).toThrow('Record/string must not be longer than 4 characters') + expect(() => + lex.assertValidRecord('com.example.stringLengthNoMinLength', { + $type: 'com.example.stringLengthNoMinLength', + string: 'ééé', // 2 + 2 + 2 bytes = 6 bytes + }), + ).toThrow('Record/string must not be longer than 4 characters') + expect(() => + lex.assertValidRecord('com.example.stringLengthNoMinLength', { + $type: 'com.example.stringLengthNoMinLength', + string: '👋a', // 4 + 1 bytes = 5 bytes + }), + ).toThrow('Record/string must not be longer than 4 characters') + expect(() => + lex.assertValidRecord('com.example.stringLengthNoMinLength', { + $type: 'com.example.stringLengthNoMinLength', + string: '👨👨', // 4 + 4 = 8 bytes + }), + ).toThrow('Record/string must not be longer than 4 characters') + expect(() => + lex.assertValidRecord('com.example.stringLengthNoMinLength', { + $type: 'com.example.stringLengthNoMinLength', + string: '👨‍👩‍👧‍👧', // 4 emojis × 4 bytes + 3 ZWJs × 3 bytes = 25 bytes }), ).toThrow('Record/string must not be longer than 4 characters') })