From 7dc8fd7c543ab8875f63ea068906d08b51ef4124 Mon Sep 17 00:00:00 2001 From: Eric Bailey Date: Mon, 25 Sep 2023 17:45:57 -0500 Subject: [PATCH] Add hashtag detection to richtext (#1651) * add tag detection to richtext * fix duplicate tag index error * add utils * fix leading space index failures, test for them * add changeset --- .changeset/moody-wombats-live.md | 5 + packages/api/src/rich-text/detection.ts | 27 +++++ packages/api/src/rich-text/rich-text.ts | 13 +++ .../api/tests/rich-text-detection.test.ts | 104 ++++++++++++++++++ 4 files changed, 149 insertions(+) create mode 100644 .changeset/moody-wombats-live.md diff --git a/.changeset/moody-wombats-live.md b/.changeset/moody-wombats-live.md new file mode 100644 index 00000000000..c146f008d5c --- /dev/null +++ b/.changeset/moody-wombats-live.md @@ -0,0 +1,5 @@ +--- +'@atproto/api': patch +--- + +Adds support for hashtags in the `RichText.detectFacets` method. diff --git a/packages/api/src/rich-text/detection.ts b/packages/api/src/rich-text/detection.ts index 910804ca0db..503866d7df8 100644 --- a/packages/api/src/rich-text/detection.ts +++ b/packages/api/src/rich-text/detection.ts @@ -69,6 +69,33 @@ export function detectFacets(text: UnicodeString): Facet[] | undefined { }) } } + { + const re = /(?:^|\s)(#[^\d\s]\S*)(?=\s)?/g + while ((match = re.exec(text.utf16))) { + let [tag] = match + const hasLeadingSpace = /^\s/.test(tag) + + tag = tag.trim().replace(/\p{P}+$/gu, '') // strip ending punctuation + + // skip tags longer than 66 chars, inclusive of # + if (tag.length > 66) continue + + const index = match.index + (hasLeadingSpace ? 1 : 0) + + facets.push({ + index: { + byteStart: text.utf16IndexToUtf8Index(index), + byteEnd: text.utf16IndexToUtf8Index(index + tag.length), // inclusive of last char + }, + features: [ + { + $type: 'app.bsky.richtext.facet#tag', + tag, + }, + ], + }) + } + } return facets.length > 0 ? 
facets : undefined } diff --git a/packages/api/src/rich-text/rich-text.ts b/packages/api/src/rich-text/rich-text.ts index 46ccc7dfef1..4c041b8bb5f 100644 --- a/packages/api/src/rich-text/rich-text.ts +++ b/packages/api/src/rich-text/rich-text.ts @@ -100,6 +100,7 @@ import { detectFacets } from './detection' export type Facet = AppBskyRichtextFacet.Main export type FacetLink = AppBskyRichtextFacet.Link export type FacetMention = AppBskyRichtextFacet.Mention +export type FacetTag = AppBskyRichtextFacet.Tag export type Entity = AppBskyFeedPost.Entity export interface RichTextProps { @@ -141,6 +142,18 @@ export class RichTextSegment { isMention() { return !!this.mention } + + get tag(): FacetTag | undefined { + const tag = this.facet?.features.find(AppBskyRichtextFacet.isTag) + if (AppBskyRichtextFacet.isTag(tag)) { + return tag + } + return undefined + } + + isTag() { + return !!this.tag + } } export class RichText { diff --git a/packages/api/tests/rich-text-detection.test.ts b/packages/api/tests/rich-text-detection.test.ts index da81fe415b1..df2aed84889 100644 --- a/packages/api/tests/rich-text-detection.test.ts +++ b/packages/api/tests/rich-text-detection.test.ts @@ -1,4 +1,5 @@ import { AtpAgent, RichText, RichTextSegment } from '../src' +import { isTag } from '../src/client/types/app/bsky/richtext/facet' describe('detectFacets', () => { const agent = new AtpAgent({ service: 'http://localhost' }) @@ -208,6 +209,109 @@ describe('detectFacets', () => { expect(Array.from(rt.segments(), segmentToOutput)).toEqual(outputs[i]) } }) + + it('correctly detects tags inline', async () => { + const inputs: [ + string, + string[], + { byteStart: number; byteEnd: number }[], + ][] = [ + ['#a', ['#a'], [{ byteStart: 0, byteEnd: 2 }]], + [ + '#a #b', + ['#a', '#b'], + [ + { byteStart: 0, byteEnd: 2 }, + { byteStart: 3, byteEnd: 5 }, + ], + ], + ['#1', [], []], + ['#tag', ['#tag'], [{ byteStart: 0, byteEnd: 4 }]], + ['body #tag', ['#tag'], [{ byteStart: 5, byteEnd: 9 }]], + ['#tag 
body', ['#tag'], [{ byteStart: 0, byteEnd: 4 }]], + ['body #tag body', ['#tag'], [{ byteStart: 5, byteEnd: 9 }]], + ['body #1', [], []], + ['body #a1', ['#a1'], [{ byteStart: 5, byteEnd: 8 }]], + ['#', [], []], + ['text #', [], []], + ['text # text', [], []], + [ + 'body #thisisa64characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa', + ['#thisisa64characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'], + [{ byteStart: 5, byteEnd: 71 }], + ], + [ + 'body #thisisa65characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab', + [], + [], + ], + [ + 'its a #double#rainbow', + ['#double#rainbow'], + [{ byteStart: 6, byteEnd: 21 }], + ], + ['##hashash', ['##hashash'], [{ byteStart: 0, byteEnd: 9 }]], + ['some #n0n3s@n5e!', ['#n0n3s@n5e'], [{ byteStart: 5, byteEnd: 15 }]], + [ + 'works #with,punctuation', + ['#with,punctuation'], + [{ byteStart: 6, byteEnd: 23 }], + ], + [ + 'strips trailing #punctuation, #like. #this!', + ['#punctuation', '#like', '#this'], + [ + { byteStart: 16, byteEnd: 28 }, + { byteStart: 30, byteEnd: 35 }, + { byteStart: 37, byteEnd: 42 }, + ], + ], + [ + 'strips #multi_trailing___...', + ['#multi_trailing'], + [{ byteStart: 7, byteEnd: 22 }], + ], + [ + 'works with #🦋 emoji, and #butter🦋fly', + ['#🦋', '#butter🦋fly'], + [ + { byteStart: 11, byteEnd: 16 }, + { byteStart: 28, byteEnd: 42 }, + ], + ], + [ + '#same #same #but #diff', + ['#same', '#same', '#but', '#diff'], + [ + { byteStart: 0, byteEnd: 5 }, + { byteStart: 6, byteEnd: 11 }, + { byteStart: 12, byteEnd: 16 }, + { byteStart: 17, byteEnd: 22 }, + ], + ], + ] + + for (const [input, tags, indices] of inputs) { + const rt = new RichText({ text: input }) + await rt.detectFacets(agent) + + let detectedTags: string[] = [] + let detectedIndices: { byteStart: number; byteEnd: number }[] = [] + + for (const { facet } of rt.segments()) { + if (!facet) continue + for (const feature of facet.features) { + if (isTag(feature)) { + detectedTags.push(feature.tag) + } + } + 
detectedIndices.push(facet.index) + } + + expect(detectedTags).toEqual(tags) + expect(detectedIndices).toEqual(indices) + } + }) }) function segmentToOutput(segment: RichTextSegment): string[] {