Skip to content

Commit

Permalink
Improve tag detection (#2260)
Browse files Browse the repository at this point in the history
* Allow tags to lead with and contain only numbers

* Break tags on other whitespace characters

* Export regexes from rich text detection

* Add test

* Add test

* Disallow number-only tags

* Avoid combining enclosing screen chars

* Allow full-width number sign

* Clarify tests

* Fix punctuation edge case

* Reorder

* Simplify, add another test

* Another test, comment
  • Loading branch information
estrattonbailey authored Mar 1, 2024
1 parent 2e08b69 commit 6ec8859
Show file tree
Hide file tree
Showing 7 changed files with 68 additions and 8 deletions.
5 changes: 5 additions & 0 deletions .changeset/chatty-cows-kick.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
'@atproto/api': patch
---

Export regex from rich text detection
5 changes: 5 additions & 0 deletions .changeset/lovely-pandas-pretend.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
'@atproto/api': patch
---

Disallow rare unicode whitespace characters from tags
5 changes: 5 additions & 0 deletions .changeset/quick-ducks-joke.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
'@atproto/api': patch
---

Allow tags to lead with numbers
1 change: 1 addition & 0 deletions packages/api/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ export * from './agent'
export * from './rich-text/rich-text'
export * from './rich-text/sanitization'
export * from './rich-text/unicode'
export * from './rich-text/util'
export * from './moderation'
export * from './moderation/types'
export { LABELS } from './moderation/const/labels'
Expand Down
18 changes: 13 additions & 5 deletions packages/api/src/rich-text/detection.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
import TLDs from 'tlds'
import { AppBskyRichtextFacet } from '../client'
import { UnicodeString } from './unicode'
import {
URL_REGEX,
MENTION_REGEX,
TAG_REGEX,
TRAILING_PUNCTUATION_REGEX,
} from './util'

export type Facet = AppBskyRichtextFacet.Main

Expand All @@ -9,7 +15,7 @@ export function detectFacets(text: UnicodeString): Facet[] | undefined {
const facets: Facet[] = []
{
// mentions
const re = /(^|\s|\()(@)([a-zA-Z0-9.-]+)(\b)/g
const re = MENTION_REGEX
while ((match = re.exec(text.utf16))) {
if (!isValidDomain(match[3]) && !match[3].endsWith('.test')) {
continue // probably not a handle
Expand All @@ -33,8 +39,7 @@ export function detectFacets(text: UnicodeString): Facet[] | undefined {
}
{
// links
const re =
/(^|\s|\()((https?:\/\/[\S]+)|((?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))/gim
const re = URL_REGEX
while ((match = re.exec(text.utf16))) {
let uri = match[2]
if (!uri.startsWith('http')) {
Expand Down Expand Up @@ -70,11 +75,14 @@ export function detectFacets(text: UnicodeString): Facet[] | undefined {
}
}
{
const re = /(^|\s)#((?!\ufe0f)[^\d\s]\S*)(?=\s)?/g
const re = TAG_REGEX
while ((match = re.exec(text.utf16))) {
let [, leading, tag] = match

Check warning on line 80 in packages/api/src/rich-text/detection.ts

View workflow job for this annotation

GitHub Actions / Build & Publish

'leading' is never reassigned. Use 'const' instead

tag = tag.trim().replace(/\p{P}+$/gu, '') // strip ending punctuation
if (!tag) continue

// strip ending punctuation and any spaces
tag = tag.trim().replace(TRAILING_PUNCTUATION_REGEX, '')

if (tag.length === 0 || tag.length > 64) continue

Expand Down
11 changes: 11 additions & 0 deletions packages/api/src/rich-text/util.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
/**
 * Matches an `@handle` mention.
 *
 * Capture groups: (1) the leading delimiter — start of input, a whitespace
 * character, or `(`; (2) the literal `@`; (3) the handle candidate, made of
 * ASCII letters, digits, `.` and `-`; (4) an empty group marking the trailing
 * word boundary.
 *
 * NOTE: this is a global (`g`) regex, so it carries `lastIndex` state between
 * `exec` calls.
 */
export const MENTION_REGEX = /(^|\s|\()(@)([a-zA-Z0-9.-]+)(\b)/g
/**
 * Matches a link candidate preceded by start of line, whitespace, or `(`.
 *
 * Group 2 holds the whole candidate, which is either an explicit
 * `http://` / `https://` URL (group 3) or a bare dotted name followed by any
 * non-whitespace run (group 4). For the bare form, the dotted name alone is
 * exposed as the named group `domain`.
 *
 * Flags: global, case-insensitive, multiline (so `^` also matches after a
 * line break).
 */
export const URL_REGEX =
/(^|\s|\()((https?:\/\/[\S]+)|((?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))/gim
/**
 * Matches a run of one or more Unicode punctuation characters (`\p{P}`) at the
 * very end of the input. Requires the `u` flag for the property escape.
 */
export const TRAILING_PUNCTUATION_REGEX = /\p{P}+$/gu

/**
 * Matches a hashtag candidate introduced by `#` (U+0023) or the full-width
 * number sign `＃` (U+FF03), written below as `\uFF03` so the character cannot
 * be silently folded into a plain `#` by text normalization.
 *
 * Capture groups:
 *  1. the leading delimiter: start of input or a whitespace character;
 *  2. the tag text (optional). It is `undefined` when the number sign is not
 *     followed by a valid tag, e.g. a bare `#` or a digits-only `#1`.
 *
 * Rules encoded in group 2:
 *  - it must not start with `\ufe0f` (variation selector-16, which would make
 *    the `#` part of an emoji keycap sequence rather than a tag marker);
 *  - it must contain at least one character that is not a digit, whitespace,
 *    or punctuation, so number-only and punctuation-only tags are rejected;
 *  - it ends at the first whitespace character or at any of these
 *    invisible/combining code points (list is likely incomplete):
 *    `\u00AD` soft hyphen, `\u2060` word joiner, `\u200A` hair space,
 *    `\u200B` zero width space, `\u200C` zero width non-joiner,
 *    `\u200D` zero width joiner, `\u20e2` combining enclosing screen.
 *
 * NOTE: this is a global (`g`) regex, so it carries `lastIndex` state between
 * `exec` calls.
 */
export const TAG_REGEX =
  /(^|\s)[#\uFF03]((?!\ufe0f)[^\s\u00AD\u2060\u200A\u200B\u200C\u200D\u20e2]*[^\d\s\p{P}\u00AD\u2060\u200A\u200B\u200C\u200D\u20e2]+[^\s\u00AD\u2060\u200A\u200B\u200C\u200D\u20e2]*)?/gu

Check warning on line 11 in packages/api/src/rich-text/util.ts

View workflow job for this annotation

GitHub Actions / Build & Publish

Unexpected combined character in character class

Check warning on line 11 in packages/api/src/rich-text/util.ts

View workflow job for this annotation

GitHub Actions / Build & Publish

Unexpected joined character sequence in character class
31 changes: 28 additions & 3 deletions packages/api/tests/rich-text-detection.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ describe('detectFacets', () => {
}
})

it('correctly detects tags inline', async () => {
describe('correctly detects tags inline', () => {
const inputs: [
string,
string[],
Expand All @@ -234,11 +234,13 @@ describe('detectFacets', () => {
],
],
['#1', [], []],
['#1a', ['1a'], [{ byteStart: 0, byteEnd: 3 }]],
['#tag', ['tag'], [{ byteStart: 0, byteEnd: 4 }]],
['body #tag', ['tag'], [{ byteStart: 5, byteEnd: 9 }]],
['#tag body', ['tag'], [{ byteStart: 0, byteEnd: 4 }]],
['body #tag body', ['tag'], [{ byteStart: 5, byteEnd: 9 }]],
['body #1', [], []],
['body #1a', ['1a'], [{ byteStart: 5, byteEnd: 8 }]],
['body #a1', ['a1'], [{ byteStart: 5, byteEnd: 8 }]],
['#', [], []],
['#?', [], []],
Expand All @@ -254,12 +256,18 @@ describe('detectFacets', () => {
[],
[],
],
[
'body #thisisa64characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa!',
['thisisa64characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'],
[{ byteStart: 5, byteEnd: 70 }],
],
[
'its a #double#rainbow',
['double#rainbow'],
[{ byteStart: 6, byteEnd: 21 }],
],
['##hashash', ['#hashash'], [{ byteStart: 0, byteEnd: 9 }]],
['##', [], []],
['some #n0n3s@n5e!', ['n0n3s@n5e'], [{ byteStart: 5, byteEnd: 15 }]],
[
'works #with,punctuation',
Expand Down Expand Up @@ -319,9 +327,26 @@ describe('detectFacets', () => {
},
],
],
['no match (\\u200B): #​', [], []],
['no match (\\u200Ba): #​a', [], []],
['match (a\\u200Bb): #a​b', ['a'], [{ byteStart: 18, byteEnd: 20 }]],
['match (ab\\u200B): #ab​', ['ab'], [{ byteStart: 18, byteEnd: 21 }]],
['no match (\\u20e2tag): #⃢tag', [], []],
['no match (a\\u20e2b): #a⃢b', ['a'], [{ byteStart: 21, byteEnd: 23 }]],
[
'match full width number sign (tag): #tag',
['tag'],
[{ byteStart: 36, byteEnd: 42 }],
],
[
'match full width number sign (tag): ##️⃣tag',
['#️⃣tag'],
[{ byteStart: 36, byteEnd: 49 }],
],
['no match 1?: #1?', [], []],
]

for (const [input, tags, indices] of inputs) {
it.each(inputs)('%s', async (input, tags, indices) => {
const rt = new RichText({ text: input })
await rt.detectFacets(agent)

Expand All @@ -340,7 +365,7 @@ describe('detectFacets', () => {

expect(detectedTags).toEqual(tags)
expect(detectedIndices).toEqual(indices)
}
})
})
})

Expand Down

0 comments on commit 6ec8859

Please sign in to comment.