Improve tag detection (#2260)

* Allow tags to lead with and contain only numbers
* Break tags on other whitespace characters
* Export regexes from rich text detection
* Add test
* Add test
* Disallow number-only tags
* Avoid combining enclosing screen chars
* Allow full-width number sign
* Clarify tests
* Fix punctuation edge case
* Reorder
* Simplify, add another test
* Another test, comment
bluesky-social · Mar 1, 2024 · 6ec8859 · 6ec8859
1 parent 2e08b69
commit 6ec8859
Show file tree

Hide file tree

Showing 7 changed files with 68 additions and 8 deletions.
diff --git a/.changeset/chatty-cows-kick.md b/.changeset/chatty-cows-kick.md
@@ -0,0 +1,5 @@
+---
+'@atproto/api': patch
+---
+
+Export regexes from rich text detection
diff --git a/.changeset/lovely-pandas-pretend.md b/.changeset/lovely-pandas-pretend.md
@@ -0,0 +1,5 @@
+---
+'@atproto/api': patch
+---
+
+Disallow rare Unicode whitespace characters in tags
diff --git a/.changeset/quick-ducks-joke.md b/.changeset/quick-ducks-joke.md
@@ -0,0 +1,5 @@
+---
+'@atproto/api': patch
+---
+
+Allow tags to lead with numbers
diff --git a/packages/api/src/index.ts b/packages/api/src/index.ts
@@ -14,6 +14,7 @@ export * from './agent'
 export * from './rich-text/rich-text'
 export * from './rich-text/sanitization'
 export * from './rich-text/unicode'
+export * from './rich-text/util'
 export * from './moderation'
 export * from './moderation/types'
 export { LABELS } from './moderation/const/labels'

diff --git a/packages/api/src/rich-text/detection.ts b/packages/api/src/rich-text/detection.ts
@@ -1,6 +1,12 @@
 import TLDs from 'tlds'
 import { AppBskyRichtextFacet } from '../client'
 import { UnicodeString } from './unicode'
+import {
+  URL_REGEX,
+  MENTION_REGEX,
+  TAG_REGEX,
+  TRAILING_PUNCTUATION_REGEX,
+} from './util'
 
 export type Facet = AppBskyRichtextFacet.Main
 
@@ -9,7 +15,7 @@ export function detectFacets(text: UnicodeString): Facet[] | undefined {
   const facets: Facet[] = []
   {
     // mentions
-    const re = /(^|\s|\()(@)([a-zA-Z0-9.-]+)(\b)/g
+    const re = MENTION_REGEX
     while ((match = re.exec(text.utf16))) {
       if (!isValidDomain(match[3]) && !match[3].endsWith('.test')) {
         continue // probably not a handle
@@ -33,8 +39,7 @@ export function detectFacets(text: UnicodeString): Facet[] | undefined {
   }
   {
     // links
-    const re =
-      /(^|\s|\()((https?:\/\/[\S]+)|((?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))/gim
+    const re = URL_REGEX
     while ((match = re.exec(text.utf16))) {
       let uri = match[2]
       if (!uri.startsWith('http')) {
@@ -70,11 +75,14 @@ export function detectFacets(text: UnicodeString): Facet[] | undefined {
     }
   }
   {
-    const re = /(^|\s)#((?!\ufe0f)[^\d\s]\S*)(?=\s)?/g
+    const re = TAG_REGEX
     while ((match = re.exec(text.utf16))) {
       let [, leading, tag] = match
 
-      tag = tag.trim().replace(/\p{P}+$/gu, '') // strip ending punctuation
+      if (!tag) continue
+
+      // strip ending punctuation and any spaces
+      tag = tag.trim().replace(TRAILING_PUNCTUATION_REGEX, '')
 
       if (tag.length === 0 || tag.length > 64) continue
 

diff --git a/packages/api/src/rich-text/util.ts b/packages/api/src/rich-text/util.ts
@@ -0,0 +1,11 @@
+export const MENTION_REGEX = /(^|\s|\()(@)([a-zA-Z0-9.-]+)(\b)/g
+export const URL_REGEX =
+  /(^|\s|\()((https?:\/\/[\S]+)|((?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))/gim
+export const TRAILING_PUNCTUATION_REGEX = /\p{P}+$/gu
+
+/**
+ * `(?!\ufe0f)` the tag body may not begin with U+FE0F (variation selector-16, which requests emoji presentation)
+ * `\u00AD\u2060\u200A\u200B\u200C\u200D\u20e2` soft hyphen, word joiner, hair space, zero-width space/non-joiner/joiner, combining enclosing screen: excluded from tags like whitespace (likely incomplete)
+ */
+export const TAG_REGEX =
+  /(^|\s)[#＃]((?!\ufe0f)[^\s\u00AD\u2060\u200A\u200B\u200C\u200D\u20e2]*[^\d\s\p{P}\u00AD\u2060\u200A\u200B\u200C\u200D\u20e2]+[^\s\u00AD\u2060\u200A\u200B\u200C\u200D\u20e2]*)?/gu
diff --git a/packages/api/tests/rich-text-detection.test.ts b/packages/api/tests/rich-text-detection.test.ts
@@ -218,7 +218,7 @@ describe('detectFacets', () => {
     }
   })
 
-  it('correctly detects tags inline', async () => {
+  describe('correctly detects tags inline', () => {
     const inputs: [
       string,
       string[],
@@ -234,11 +234,13 @@ describe('detectFacets', () => {
         ],
       ],
       ['#1', [], []],
+      ['#1a', ['1a'], [{ byteStart: 0, byteEnd: 3 }]],
       ['#tag', ['tag'], [{ byteStart: 0, byteEnd: 4 }]],
       ['body #tag', ['tag'], [{ byteStart: 5, byteEnd: 9 }]],
       ['#tag body', ['tag'], [{ byteStart: 0, byteEnd: 4 }]],
       ['body #tag body', ['tag'], [{ byteStart: 5, byteEnd: 9 }]],
       ['body #1', [], []],
+      ['body #1a', ['1a'], [{ byteStart: 5, byteEnd: 8 }]],
       ['body #a1', ['a1'], [{ byteStart: 5, byteEnd: 8 }]],
       ['#', [], []],
       ['#?', [], []],
@@ -254,12 +256,18 @@ describe('detectFacets', () => {
         [],
         [],
       ],
+      [
+        'body #thisisa64characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa!',
+        ['thisisa64characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'],
+        [{ byteStart: 5, byteEnd: 70 }],
+      ],
       [
         'its a #double#rainbow',
         ['double#rainbow'],
         [{ byteStart: 6, byteEnd: 21 }],
       ],
       ['##hashash', ['#hashash'], [{ byteStart: 0, byteEnd: 9 }]],
+      ['##', [], []],
       ['some #n0n3s@n5e!', ['n0n3s@n5e'], [{ byteStart: 5, byteEnd: 15 }]],
       [
         'works #with,punctuation',
@@ -319,9 +327,26 @@ describe('detectFacets', () => {
           },
         ],
       ],
+      ['no match (\\u200B): #', [], []],
+      ['no match (\\u200Ba): #a', [], []],
+      ['match (a\\u200Bb): #ab', ['a'], [{ byteStart: 18, byteEnd: 20 }]],
+      ['match (ab\\u200B): #ab', ['ab'], [{ byteStart: 18, byteEnd: 21 }]],
+      ['no match (\\u20e2tag): #⃢tag', [], []],
+      ['no match (a\\u20e2b): #a⃢b', ['a'], [{ byteStart: 21, byteEnd: 23 }]],
+      [
+        'match full width number sign (tag): ＃tag',
+        ['tag'],
+        [{ byteStart: 36, byteEnd: 42 }],
+      ],
+      [
+        'match full width number sign (tag): ＃#️⃣tag',
+        ['#️⃣tag'],
+        [{ byteStart: 36, byteEnd: 49 }],
+      ],
+      ['no match 1?: #1?', [], []],
     ]
 
-    for (const [input, tags, indices] of inputs) {
+    it.each(inputs)('%s', async (input, tags, indices) => {
       const rt = new RichText({ text: input })
       await rt.detectFacets(agent)
 
@@ -340,7 +365,7 @@ describe('detectFacets', () => {
 
       expect(detectedTags).toEqual(tags)
       expect(detectedIndices).toEqual(indices)
-    }
+    })
   })
 })