From 7dc8fd7c543ab8875f63ea068906d08b51ef4124 Mon Sep 17 00:00:00 2001
From: Eric Bailey <git@esb.lol>
Date: Mon, 25 Sep 2023 17:45:57 -0500
Subject: [PATCH] Add hashtag detection to richtext (#1651)

* add tag detection to richtext

* fix duplicate tag index error

* add utils

* fix leading space index failures, test for them

* add changeset
---
 .changeset/moody-wombats-live.md              |   5 +
 packages/api/src/rich-text/detection.ts       |  27 +++++
 packages/api/src/rich-text/rich-text.ts       |  13 +++
 .../api/tests/rich-text-detection.test.ts     | 104 ++++++++++++++++++
 4 files changed, 149 insertions(+)
 create mode 100644 .changeset/moody-wombats-live.md

diff --git a/.changeset/moody-wombats-live.md b/.changeset/moody-wombats-live.md
new file mode 100644
index 00000000000..c146f008d5c
--- /dev/null
+++ b/.changeset/moody-wombats-live.md
@@ -0,0 +1,5 @@
+---
+'@atproto/api': patch
+---
+
+Adds support for hashtags in the `RichText.detectFacets` method.
diff --git a/packages/api/src/rich-text/detection.ts b/packages/api/src/rich-text/detection.ts
index 910804ca0db..503866d7df8 100644
--- a/packages/api/src/rich-text/detection.ts
+++ b/packages/api/src/rich-text/detection.ts
@@ -69,6 +69,33 @@ export function detectFacets(text: UnicodeString): Facet[] | undefined {
       })
     }
   }
+  {
+    const re = /(?:^|\s)(#[^\d\s]\S*)(?=\s)?/g // '#', one non-digit non-space char, then any non-space run; match[0] may begin with one whitespace char
+    while ((match = re.exec(text.utf16))) {
+      let [tag] = match
+      const hasLeadingSpace = /^\s/.test(tag)
+
+      tag = tag.trim().replace(/\p{P}+$/gu, '') // strip ending punctuation
+
+      // tag still includes the leading '#'; skip it when longer than 66 UTF-16 units (NOTE(review): that allows 65 chars after '#' — confirm intended limit)
+      if (tag.length > 66) continue
+
+      const index = match.index + (hasLeadingSpace ? 1 : 0) // step over the whitespace char captured at the start of match[0]
+
+      facets.push({
+        index: {
+          byteStart: text.utf16IndexToUtf8Index(index),
+          byteEnd: text.utf16IndexToUtf8Index(index + tag.length), // end offset: position just past the tag's last char
+        },
+        features: [
+          {
+            $type: 'app.bsky.richtext.facet#tag',
+            tag,
+          },
+        ],
+      })
+    }
+  }
   return facets.length > 0 ? facets : undefined
 }
 
diff --git a/packages/api/src/rich-text/rich-text.ts b/packages/api/src/rich-text/rich-text.ts
index 46ccc7dfef1..4c041b8bb5f 100644
--- a/packages/api/src/rich-text/rich-text.ts
+++ b/packages/api/src/rich-text/rich-text.ts
@@ -100,6 +100,7 @@ import { detectFacets } from './detection'
 export type Facet = AppBskyRichtextFacet.Main
 export type FacetLink = AppBskyRichtextFacet.Link
 export type FacetMention = AppBskyRichtextFacet.Mention
+export type FacetTag = AppBskyRichtextFacet.Tag
 export type Entity = AppBskyFeedPost.Entity
 
 export interface RichTextProps {
@@ -141,6 +142,18 @@ export class RichTextSegment {
   isMention() {
     return !!this.mention
   }
+
+  get tag(): FacetTag | undefined { // the tag feature of this segment's facet, or undefined
+    const tag = this.facet?.features.find(AppBskyRichtextFacet.isTag) // undefined when there is no facet or no feature passes isTag
+    if (AppBskyRichtextFacet.isTag(tag)) { // presumably re-checked so the result narrows to FacetTag — confirm isTag is a type guard
+      return tag
+    }
+    return undefined
+  }
+
+  isTag() { // true when the `tag` getter finds a tag feature on this segment
+    return !!this.tag
+  }
 }
 
 export class RichText {
diff --git a/packages/api/tests/rich-text-detection.test.ts b/packages/api/tests/rich-text-detection.test.ts
index da81fe415b1..df2aed84889 100644
--- a/packages/api/tests/rich-text-detection.test.ts
+++ b/packages/api/tests/rich-text-detection.test.ts
@@ -1,4 +1,5 @@
 import { AtpAgent, RichText, RichTextSegment } from '../src'
+import { isTag } from '../src/client/types/app/bsky/richtext/facet'
 
 describe('detectFacets', () => {
   const agent = new AtpAgent({ service: 'http://localhost' })
@@ -208,6 +209,109 @@ describe('detectFacets', () => {
       expect(Array.from(rt.segments(), segmentToOutput)).toEqual(outputs[i])
     }
   })
+
+  it('correctly detects tags inline', async () => {
+    const inputs: [ // each case: input text, expected tag strings, expected facet byte ranges
+      string,
+      string[],
+      { byteStart: number; byteEnd: number }[],
+    ][] = [
+      ['#a', ['#a'], [{ byteStart: 0, byteEnd: 2 }]],
+      [
+        '#a #b',
+        ['#a', '#b'],
+        [
+          { byteStart: 0, byteEnd: 2 },
+          { byteStart: 3, byteEnd: 5 },
+        ],
+      ],
+      ['#1', [], []], // a digit directly after '#' is not detected as a tag
+      ['#tag', ['#tag'], [{ byteStart: 0, byteEnd: 4 }]],
+      ['body #tag', ['#tag'], [{ byteStart: 5, byteEnd: 9 }]],
+      ['#tag body', ['#tag'], [{ byteStart: 0, byteEnd: 4 }]],
+      ['body #tag body', ['#tag'], [{ byteStart: 5, byteEnd: 9 }]],
+      ['body #1', [], []],
+      ['body #a1', ['#a1'], [{ byteStart: 5, byteEnd: 8 }]], // digits are fine after the first character
+      ['#', [], []],
+      ['text #', [], []],
+      ['text # text', [], []],
+      [
+        'body #thisisa64characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa', // NOTE(review): expected span below is 66 bytes including '#', despite the "64" in the name — confirm
+        ['#thisisa64characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'],
+        [{ byteStart: 5, byteEnd: 71 }],
+      ],
+      [
+        'body #thisisa65characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab', // one character longer than the case above: rejected
+        [],
+        [],
+      ],
+      [
+        'its a #double#rainbow',
+        ['#double#rainbow'],
+        [{ byteStart: 6, byteEnd: 21 }],
+      ],
+      ['##hashash', ['##hashash'], [{ byteStart: 0, byteEnd: 9 }]],
+      ['some #n0n3s@n5e!', ['#n0n3s@n5e'], [{ byteStart: 5, byteEnd: 15 }]],
+      [
+        'works #with,punctuation', // punctuation inside the tag is kept; only trailing punctuation is stripped
+        ['#with,punctuation'],
+        [{ byteStart: 6, byteEnd: 23 }],
+      ],
+      [
+        'strips trailing #punctuation, #like. #this!',
+        ['#punctuation', '#like', '#this'],
+        [
+          { byteStart: 16, byteEnd: 28 },
+          { byteStart: 30, byteEnd: 35 },
+          { byteStart: 37, byteEnd: 42 },
+        ],
+      ],
+      [
+        'strips #multi_trailing___...',
+        ['#multi_trailing'],
+        [{ byteStart: 7, byteEnd: 22 }],
+      ],
+      [
+        'works with #🦋 emoji, and #butter🦋fly', // byte ranges below count the emoji as 4 bytes each
+        ['#🦋', '#butter🦋fly'],
+        [
+          { byteStart: 11, byteEnd: 16 },
+          { byteStart: 28, byteEnd: 42 },
+        ],
+      ],
+      [
+        '#same #same #but #diff', // repeated tags must each get their own index
+        ['#same', '#same', '#but', '#diff'],
+        [
+          { byteStart: 0, byteEnd: 5 },
+          { byteStart: 6, byteEnd: 11 },
+          { byteStart: 12, byteEnd: 16 },
+          { byteStart: 17, byteEnd: 22 },
+        ],
+      ],
+    ]
+
+    for (const [input, tags, indices] of inputs) {
+      const rt = new RichText({ text: input })
+      await rt.detectFacets(agent)
+
+      let detectedTags: string[] = []
+      let detectedIndices: { byteStart: number; byteEnd: number }[] = []
+
+      for (const { facet } of rt.segments()) {
+        if (!facet) continue // segments without a facet are skipped
+        for (const feature of facet.features) {
+          if (isTag(feature)) {
+            detectedTags.push(feature.tag)
+          }
+        }
+        detectedIndices.push(facet.index) // recorded for every facet on a segment, whether or not it carries a tag feature
+      }
+
+      expect(detectedTags).toEqual(tags)
+      expect(detectedIndices).toEqual(indices)
+    }
+  })
 })
 
 function segmentToOutput(segment: RichTextSegment): string[] {