Skip to content

Commit

Permalink
Improve matching in results from searchRepos (#1507)
Browse files Browse the repository at this point in the history
* ✨ Better results from searchRepos when matching by handle

* ✅ Adjust tests for repo search result order

* 🧹 Cleanup

* ♻️ Refactor to re-use search query builder

* 🧹 Cleanup

* ✅ Add test for pagination in search result
  • Loading branch information
foysalit authored Sep 5, 2023
1 parent 4fb32e1 commit f7186f0
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 52 deletions.
19 changes: 4 additions & 15 deletions packages/bsky/src/api/com/atproto/admin/searchRepos.ts
Original file line number Diff line number Diff line change
@@ -1,36 +1,25 @@
import { InvalidRequestError } from '@atproto/xrpc-server'
import { Server } from '../../../../lexicon'
import AppContext from '../../../../context'
import { paginate } from '../../../../db/pagination'
import { ListKeyset } from '../../../../services/actor'

export default function (server: Server, ctx: AppContext) {
server.com.atproto.admin.searchRepos({
auth: ctx.roleVerifier,
handler: async ({ params }) => {
const db = ctx.db.getPrimary()
const moderationService = ctx.services.moderation(db)
const { term = '', limit, cursor, invitedBy } = params
const { invitedBy } = params
if (invitedBy) {
throw new InvalidRequestError('The invitedBy parameter is unsupported')
}

const searchField = term.startsWith('did:') ? 'did' : 'handle'

const { ref } = db.db.dynamic
const keyset = new ListKeyset(ref('indexedAt'), ref('did'))
let resultQb = ctx.services
const { results, cursor } = await ctx.services
.actor(db)
.searchQb(searchField, term)
.selectAll()
resultQb = paginate(resultQb, { keyset, cursor, limit })

const results = await resultQb.execute()

.getSearchResults({ ...params, includeSoftDeleted: true })
return {
encoding: 'application/json',
body: {
cursor: keyset.packFromResult(results),
cursor,
repos: await moderationService.views.repo(results),
},
}
Expand Down
74 changes: 45 additions & 29 deletions packages/bsky/src/services/actor/index.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import { sql } from 'kysely'
import { Database } from '../../db'
import { DbRef, notSoftDeletedClause } from '../../db/util'
import { notSoftDeletedClause } from '../../db/util'
import { ActorViews } from './views'
import { ImageUriBuilder } from '../../image/uri'
import { Actor } from '../../db/tables/actor'
import { TimeCidKeyset } from '../../db/pagination'
import { LabelCache } from '../../label-cache'
import { TimeCidKeyset, paginate } from '../../db/pagination'
import { SearchKeyset, getUserSearchQuery } from '../util/search'

export class ActorService {
constructor(
Expand Down Expand Up @@ -77,33 +78,53 @@ export class ActorService {
})
}

searchQb(searchField: 'did' | 'handle' = 'handle', term?: string) {
async getSearchResults({
cursor,
limit = 25,
term = '',
includeSoftDeleted,
}: {
cursor?: string
limit?: number
term?: string
includeSoftDeleted?: boolean
}) {
const searchField = term.startsWith('did:') ? 'did' : 'handle'
let paginatedBuilder
const { ref } = this.db.db.dynamic
let builder = this.db.db.selectFrom('actor')

// When searchField === 'did', the term will always be a valid string because
// searchField is set to 'did' after checking that the term is a valid did
if (searchField === 'did' && term) {
return builder.where('actor.did', '=', term)
const paginationOptions = {
limit,
cursor,
direction: 'asc' as const,
}
let keyset

if (term) {
builder = builder.where((qb) => {
// Performing matching by word using "strict word similarity" operator.
// The more characters the user gives us, the more we can ratchet down
// the distance threshold for matching.
const threshold = term.length < 3 ? 0.9 : 0.8
return qb
.where(distance(term, ref('handle')), '<', threshold)
.orWhereExists((q) =>
q
.selectFrom('profile')
.whereRef('profile.creator', '=', 'actor.did')
.where(distance(term, ref('displayName')), '<', threshold),
)
if (term && searchField === 'handle') {
keyset = new SearchKeyset(sql``, sql``)
paginatedBuilder = getUserSearchQuery(this.db, {
term,
includeSoftDeleted,
...paginationOptions,
}).select('distance')
} else {
paginatedBuilder = this.db.db
.selectFrom('actor')
.select([sql<number>`0`.as('distance')])
keyset = new ListKeyset(ref('indexedAt'), ref('did'))

// When searchField === 'did', the term will always be a valid string because
// searchField is set to 'did' after checking that the term is a valid did
if (term && searchField === 'did') {
paginatedBuilder = paginatedBuilder.where('actor.did', '=', term)
}
paginatedBuilder = paginate(paginatedBuilder, {
keyset,
...paginationOptions,
})
}
return builder

const results: Actor[] = await paginatedBuilder.selectAll('actor').execute()
return { results, cursor: keyset.packFromResult(results) }
}

async getRepoRev(did: string | null): Promise<string | null> {
Expand All @@ -118,11 +139,6 @@ export class ActorService {
}

type ActorResult = Actor

// Uses pg_trgm strict word similarity to check similarity between a search term and a stored value
const distance = (term: string, ref: DbRef) =>
sql<number>`(${term} <<<-> ${ref})`

export class ListKeyset extends TimeCidKeyset<{
indexedAt: string
did: string // handles are treated identically to cids in TimeCidKeyset
Expand Down
71 changes: 63 additions & 8 deletions packages/bsky/tests/views/admin/repo-search.test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import AtpAgent from '@atproto/api'
import AtpAgent, { ComAtprotoAdminSearchRepos } from '@atproto/api'
import { wait } from '@atproto/common'
import { TestNetwork } from '@atproto/dev-env'
import { SeedClient } from '../../seeds/client'
Expand All @@ -9,6 +9,20 @@ describe('pds admin repo search views', () => {
let agent: AtpAgent
let sc: SeedClient
let headers: { [s: string]: string }
// In results that don't have a related profile record, we will only have handle but not a name
// And names are usually capitalized on each word so the comparison is done on lowercase version
const handleOrNameStartsWith =
(term: string) => (handleOrName: (string | undefined)[]) =>
!!handleOrName.find((str) =>
str?.toLowerCase().includes(term.toLowerCase()),
)
const resultToHandlesAndNames = (
result: ComAtprotoAdminSearchRepos.Response,
) =>
result.data.repos.map((u: any) => [
u.handle,
(u.relatedRecords[0] as Record<string, string | undefined>)?.displayName,
])

beforeAll(async () => {
network = await TestNetwork.create({
Expand Down Expand Up @@ -48,23 +62,64 @@ describe('pds admin repo search views', () => {
})

it('gives relevant results when searched by handle', async () => {
const term = 'car'
const result = await agent.api.com.atproto.admin.searchRepos(
{ term: 'car' },
{ term },
{ headers },
)

const shouldContain = [
'cara-wiegand69.test', // Present despite repo takedown
'eudora-dietrich4.test', // Carol Littel
'shane-torphy52.test', //Sadie Carter
'aliya-hodkiewicz.test', // Carlton Abernathy IV
// Present despite repo takedown
// First item in the array because of direct handle match
'cara-wiegand69.test',
'carlos6.test',
'aliya-hodkiewicz.test', // Carlton Abernathy IV
'eudora-dietrich4.test', // Carol Littel
'carolina-mcdermott77.test',
'shane-torphy52.test', // Sadie Carter
// Last item in the array because handle and display name none match very close to the the search term
'cayla-marquardt39.test',
]

const handles = result.data.repos.map((u) => u.handle)

const handlesAndNames = resultToHandlesAndNames(result)
const handles = handlesAndNames.map(([handle]) => handle)
// Assert that all matches are found
shouldContain.forEach((handle) => expect(handles).toContain(handle))
// Assert that the order is correct, showing the closest match by handle first
const containsTerm = handleOrNameStartsWith(term)
expect(containsTerm(handlesAndNames[0])).toBeTruthy()
expect(
containsTerm(handlesAndNames[handlesAndNames.length - 1]),
).toBeFalsy()
})

it('pagination respects matching order when searched by handle', async () => {
const term = 'car'
const resultPageOne = await agent.api.com.atproto.admin.searchRepos(
{ term, limit: 4 },
{ headers },
)
const resultPageTwo = await agent.api.com.atproto.admin.searchRepos(
{ term, limit: 4, cursor: resultPageOne.data.cursor },
{ headers },
)

const handlesAndNamesPageOne = resultToHandlesAndNames(resultPageOne)
const handlesAndNamesPageTwo = resultToHandlesAndNames(resultPageTwo)
const containsTerm = handleOrNameStartsWith(term)

// First result of first page always has matches either handle or did
expect(containsTerm(handlesAndNamesPageOne[0])).toBeTruthy()
// Since we only get 4 items per page max and know that among the test dataset
// at least 4 users have the term in handle or profile, last item in first page
// should contain the term
expect(
containsTerm(handlesAndNamesPageOne[handlesAndNamesPageOne.length - 1]),
).toBeTruthy()
// However, the last item of second page, should not contain the term
expect(
containsTerm(handlesAndNamesPageTwo[handlesAndNamesPageTwo.length - 1]),
).toBeFalsy()
})

it('gives relevant results when searched by did', async () => {
Expand Down

0 comments on commit f7186f0

Please sign in to comment.