Skip to content

Commit

Permalink
Implement nip50 fulltext searching
Browse files Browse the repository at this point in the history
This adds support for nip50 fulltext searches. This allows you to use
the nostrdb query interface for executing fulltext searches instead of
the typical `ndb_text_search` api. The benefits of this include a
standardized query interface that also further filters on other fields
in the filter.

Changelog-Added: Add nip50 search filters and queries
  • Loading branch information
jb55 committed Jan 14, 2025
1 parent 2a4eb70 commit 7e36518
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 57 deletions.
50 changes: 12 additions & 38 deletions ndb.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@ static int usage()
printf("usage: ndb [--skip-verification] [-d db_dir] <command>\n\n");
printf("commands\n\n");
printf(" stat\n");
printf(" search [--oldest-first] [--limit 42] <fulltext query>\n");
printf(" query [-k 42] [-k 1337] [-l 42] [-e abcdef...] [-a abcdef... -a bcdef...]\n");
printf(" query [-k 42] [-k 1337] [--search term] [-l 42] [-e abcdef...] [-a abcdef... -a bcdef...]\n");
printf(" profile <pubkey>\n");
printf(" print-search-keys\n");
printf(" print-kind-keys\n");
Expand Down Expand Up @@ -136,14 +135,11 @@ int main(int argc, char *argv[])
long nanos;
struct ndb_stat stat;
struct ndb_txn txn;
struct ndb_text_search_results results;
struct ndb_text_search_result *result;
const char *dir;
unsigned char *data;
size_t data_len;
struct ndb_config config;
struct timespec t1, t2;
struct ndb_text_search_config search_config;
unsigned char tmp_id[32];

// profiles
Expand All @@ -154,7 +150,6 @@ int main(int argc, char *argv[])

res = 0;
ndb_default_config(&config);
ndb_default_text_search_config(&search_config);
ndb_config_set_mapsize(&config, 1024ULL * 1024ULL * 1024ULL * 1024ULL /* 1 TiB */);

if (argc < 2) {
Expand Down Expand Up @@ -184,38 +179,7 @@ int main(int argc, char *argv[])
return 2;
}

if (argc >= 3 && !strcmp(argv[1], "search")) {
for (i = 0; i < 2; i++) {
if (!strcmp(argv[2], "--oldest-first")) {
ndb_text_search_config_set_order(&search_config, NDB_ORDER_ASCENDING);
argv++;
argc--;
} else if (!strcmp(argv[2], "--limit") || !strcmp(argv[2], "-l")) {
limit = atoi(argv[3]);
ndb_text_search_config_set_limit(&search_config, limit);
argv += 2;
argc -= 2;
}
}

ndb_begin_query(ndb, &txn);
clock_gettime(CLOCK_MONOTONIC, &t1);
ndb_text_search(&txn, argv[2], &results, &search_config);
clock_gettime(CLOCK_MONOTONIC, &t2);

nanos = (t2.tv_sec - t1.tv_sec) * (long)1e9 + (t2.tv_nsec - t1.tv_nsec);

fprintf(stderr, "%d results in %f ms\n", results.num_results, nanos/1000000.0);

// print results for now
for (i = 0; i < results.num_results; i++) {
result = &results.results[i];
//fprintf(stderr, "[%02d] ", i+1);
ndb_print_text_search_result(&txn, result);
}

ndb_end_query(&txn);
} else if (argc == 2 && !strcmp(argv[1], "stat")) {
if (argc == 2 && !strcmp(argv[1], "stat")) {
if (!ndb_stat(ndb, &stat)) {
res = 3;
goto cleanup;
Expand Down Expand Up @@ -282,6 +246,16 @@ int main(int argc, char *argv[])
ndb_filter_end_field(f);
argv += 2;
argc -= 2;
} else if (!strcmp(argv[0], "--search") || !strcmp(argv[0], "-S")) {
if (current_field) {
ndb_filter_end_field(f);
current_field = 0;
}
ndb_filter_start_field(f, NDB_FILTER_SEARCH);
ndb_filter_add_str_element(f, argv[1]);
ndb_filter_end_field(f);
argv += 2;
argc -= 2;
} else if (!strcmp(argv[0], "-e")) {
if (current_field != 'e') {
if (!ndb_filter_start_tag_field(f, 'e')) {
Expand Down
116 changes: 97 additions & 19 deletions src/nostrdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ enum ndb_query_plan {
NDB_PLAN_AUTHOR_KINDS,
NDB_PLAN_CREATED,
NDB_PLAN_TAGS,
NDB_PLAN_SEARCH,
};

// A id + u64 + timestamp
Expand Down Expand Up @@ -334,27 +335,27 @@ static int ndb_make_noted_text_search_key(unsigned char *buf, int bufsize,

static int ndb_make_text_search_key_low(unsigned char *buf, int bufsize,
int wordlen, const char *word,
uint64_t since,
int *keysize)
{
uint64_t timestamp, note_id;
timestamp = 0;
uint64_t note_id;
note_id = 0;
return ndb_make_text_search_key(buf, bufsize, 0, wordlen, word,
timestamp, note_id, keysize);
since, note_id, keysize);
}

static int ndb_make_text_search_key_high(unsigned char *buf, int bufsize,
int wordlen, const char *word,
uint64_t until,
int *keysize)
{
uint64_t timestamp, note_id;
timestamp = INT32_MAX;
uint64_t note_id;
note_id = INT32_MAX;
return ndb_make_text_search_key(buf, bufsize, 0, wordlen, word,
timestamp, note_id, keysize);
until, note_id, keysize);
}

typedef int (*ndb_text_search_key_order_fn)(unsigned char *buf, int bufsize, int wordlen, const char *word, int *keysize);
typedef int (*ndb_text_search_key_order_fn)(unsigned char *buf, int bufsize, int wordlen, const char *word, uint64_t timestamp, int *keysize);

/** From LMDB: Compare two items lexically */
static int mdb_cmp_memn(const MDB_val *a, const MDB_val *b) {
Expand Down Expand Up @@ -3044,6 +3045,43 @@ static int query_is_full(struct ndb_query_results *results, int limit)
return cursor_count(&results->cur, sizeof(struct ndb_query_result)) >= limit;
}

static int ndb_query_plan_execute_search(struct ndb_txn *txn,
struct ndb_filter *filter,
struct ndb_query_results *results,
int limit)
{
const char *search;
int i;
struct ndb_text_search_results text_results;
struct ndb_text_search_result *text_result;
struct ndb_text_search_config config;
struct ndb_query_result result;

ndb_default_text_search_config(&config);

if (!(search = ndb_filter_find_search(filter)))
return 0;

if (!ndb_text_search_with(txn, search, &text_results, &config, filter))
return 0;

for (i = 0; i < text_results.num_results; i++) {
if (query_is_full(results, limit))
break;

text_result = &text_results.results[i];

result.note = text_result->note;
result.note_size = text_result->note_size;
result.note_id = text_result->key.note_id;

if (!push_query_result(results, &result))
break;
}

return 1;
}

static int ndb_query_plan_execute_ids(struct ndb_txn *txn,
struct ndb_filter *filter,
struct ndb_query_results *results,
Expand Down Expand Up @@ -3456,15 +3494,18 @@ static int ndb_query_plan_execute_kinds(struct ndb_txn *txn,

static enum ndb_query_plan ndb_filter_plan(struct ndb_filter *filter)
{
struct ndb_filter_elements *ids, *kinds, *authors, *tags;
struct ndb_filter_elements *ids, *kinds, *authors, *tags, *search;

ids = ndb_filter_find_elements(filter, NDB_FILTER_IDS);
search = ndb_filter_find_elements(filter, NDB_FILTER_SEARCH);
kinds = ndb_filter_find_elements(filter, NDB_FILTER_KINDS);
authors = ndb_filter_find_elements(filter, NDB_FILTER_AUTHORS);
tags = ndb_filter_find_elements(filter, NDB_FILTER_TAGS);

// this is rougly similar to the heuristic in strfry's dbscan
if (ids) {
if (search) {
return NDB_PLAN_SEARCH;
} else if (ids) {
return NDB_PLAN_IDS;
} else if (kinds && authors && authors->count <= 10) {
return NDB_PLAN_AUTHOR_KINDS;
Expand All @@ -3483,6 +3524,7 @@ static const char *ndb_query_plan_name(int plan_id)
{
switch (plan_id) {
case NDB_PLAN_IDS: return "ids";
case NDB_PLAN_SEARCH: return "search";
case NDB_PLAN_KINDS: return "kinds";
case NDB_PLAN_TAGS: return "tags";
case NDB_PLAN_CREATED: return "created";
Expand Down Expand Up @@ -3518,6 +3560,11 @@ static int ndb_query_filter(struct ndb_txn *txn, struct ndb_filter *filter,
return 0;
break;

case NDB_PLAN_SEARCH:
if (!ndb_query_plan_execute_search(txn, filter, &results, limit))
return 0;
break;

// We have just kinds, just scan the kind index
case NDB_PLAN_KINDS:
if (!ndb_query_plan_execute_kinds(txn, filter, &results, limit))
Expand Down Expand Up @@ -4031,24 +4078,44 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query,
struct ndb_word *search_word;
struct ndb_note *note;
struct cursor cur;
uint64_t since, until, timestamp_op, *pint, note_size;
ndb_text_search_key_order_fn key_order_fn;
MDB_dbi text_db;
MDB_cursor *cursor;
MDB_val k, v;
int i, j, keysize, saved_size, limit;
MDB_cursor_op op, order_op;

note_size = 0;
note = 0;
saved = NULL;
ndb_text_search_results_init(results);
ndb_search_words_init(&search_words);

// search config
until = UINT64_MAX;
since = 0;
limit = MAX_TEXT_SEARCH_RESULTS;

// until, since from filter
if (filter != NULL) {
if ((pint = ndb_filter_get_int(filter, NDB_FILTER_UNTIL)))
until = *pint;

if ((pint = ndb_filter_get_int(filter, NDB_FILTER_SINCE)))
since = *pint;

if ((pint = ndb_filter_get_int(filter, NDB_FILTER_LIMIT)))
limit = *pint;
}

order_op = MDB_PREV;
key_order_fn = ndb_make_text_search_key_high;
timestamp_op = until;
if (config) {
if (config->order == NDB_ORDER_ASCENDING) {
order_op = MDB_NEXT;
// set the min timestamp value to since when ascending
timestamp_op = since;
key_order_fn = ndb_make_text_search_key_low;
}
limit = min(limit, config->limit);
Expand All @@ -4067,9 +4134,11 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query,
return 0;
}

// TODO: sort words from largest to smallest. This should complete the
// query quicker because the larger words are likely to have fewer
// entries in the search index.
// This should complete the query quicker because the larger words are
// likely to have fewer entries in the search index. This is not always
// true. Words with higher frequency (like bitcoin on nostr in 2024)
// may be slower. TODO: Skip word recursion by leveraging a minimal
// perfect hashmap of parsed words on a note
sort_largest_to_smallest(&search_words);

// for each word, we recursively find all of the submatches
Expand Down Expand Up @@ -4099,7 +4168,9 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query,
// match
if (!key_order_fn(buffer, sizeof(buffer),
search_words.words[0].word_len,
search_words.words[0].word, &keysize))
search_words.words[0].word,
timestamp_op,
&keysize))
{
// word is too big to fit in 1024-sized key
continue;
Expand Down Expand Up @@ -4172,10 +4243,12 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query,

// save the first key match, since we will continue from
// this on the next root word result
if (j == 0 && !saved) {
memcpy(saved_buf, k.mv_data, k.mv_size);
saved = saved_buf;
saved_size = k.mv_size;
if (j == 0) {
if (!saved) {
memcpy(saved_buf, k.mv_data, k.mv_size);
saved = saved_buf;
saved_size = k.mv_size;
}

// since we will be trying to match the same
// note_id on all subsequent word matches,
Expand All @@ -4185,15 +4258,20 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query,
// remaining word queries
if (filter) {
if ((note = ndb_get_note_by_key(txn,
result->key.note_id, NULL)))
result->key.note_id,
&note_size)))
{
if (!ndb_filter_matches(filter, note)) {
break;
}
result->note = note;
result->note_size = note_size;
}
}
}

result->note = note;
result->note_size = note_size;
last_candidate = *result;
last_result = &last_candidate;
}
Expand Down
4 changes: 4 additions & 0 deletions src/nostrdb.h
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,10 @@ struct ndb_text_search_key
struct ndb_text_search_result {
struct ndb_text_search_key key;
int prefix_chars;

// This is only set if we passed a filter for nip50 searches
struct ndb_note *note;
uint64_t note_size;
};

struct ndb_text_search_results {
Expand Down

0 comments on commit 7e36518

Please sign in to comment.