Skip to content

Commit

Permalink
Slightly faster keyword lookups
Browse files Browse the repository at this point in the history
It's a micro-optimization, but it seemed to give a bit of a boost to
search only the words starting with the correct letter.
  • Loading branch information
davisp committed Dec 11, 2024
1 parent 00abaf2 commit 592dc97
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 4 deletions.
58 changes: 58 additions & 0 deletions src/keywords.rs
Original file line number Diff line number Diff line change
Expand Up @@ -973,3 +973,61 @@ pub const RESERVED_FOR_IDENTIFIER: &[Keyword] = &[
Keyword::STRUCT,
Keyword::TRIM,
];

/// Sentinel index meaning "no entry".
/// NOTE(review): not referenced anywhere in this chunk — presumably used by
/// the lookup-table generator or other table consumers; confirm before removal.
pub const NA: usize = usize::MAX;

/// For each first letter `'A'..='Z'`, the index into `ALL_KEYWORDS` at which
/// keywords starting with that letter begin (entry `i` corresponds to letter
/// `b'A' + i`). A letter's bucket ends where the next letter's bucket starts,
/// or at `ALL_KEYWORDS.len()` for `'Z'`. The values are checked against the
/// actual keyword list by the `check_keyword_index_roots` test below.
#[rustfmt::skip]
pub const KEYWORD_LOOKUP_INDEX_ROOT: &[usize; 26] = &[
    0, 42, 67, 148, 198, 241, 281, 294, 305, 350, 357, 360, 390,
    430, 465, 497, 539, 543, 605, 683, 728, 761, 780, 793, 795, 796,
];

/// Map `word` (case-insensitively) to its [`Keyword`], or
/// [`Keyword::NoKeyword`] when it is not a SQL keyword.
///
/// Instead of binary-searching all of `ALL_KEYWORDS`, the search is narrowed
/// to the bucket of keywords sharing the word's first letter, using the
/// precomputed `KEYWORD_LOOKUP_INDEX_ROOT` boundaries.
pub fn lookup(word: &str) -> Keyword {
    // Every keyword is at least two characters long; reject shorter input
    // before paying for the uppercased allocation.
    if word.len() < 2 {
        return Keyword::NoKeyword;
    }

    let upper = word.to_uppercase();
    let first = upper.as_bytes()[0];
    // Keywords are plain A–Z words; anything whose uppercased form does not
    // start with an ASCII letter cannot match.
    if !first.is_ascii_uppercase() {
        return Keyword::NoKeyword;
    }

    // Bucket boundaries: the bucket for this letter runs from its root index
    // up to the next letter's root index, or to the end of the table for 'Z'
    // (where `get(letter + 1)` is out of bounds and yields `None`).
    let letter = (first - b'A') as usize;
    let start = KEYWORD_LOOKUP_INDEX_ROOT[letter];
    let end = KEYWORD_LOOKUP_INDEX_ROOT
        .get(letter + 1)
        .copied()
        .unwrap_or(ALL_KEYWORDS.len());

    match ALL_KEYWORDS[start..end].binary_search(&upper.as_str()) {
        // `binary_search` returns the offset within the bucket; translate it
        // back to an index into the full keyword table.
        Ok(offset) => ALL_KEYWORDS_INDEX[start + offset],
        Err(_) => Keyword::NoKeyword,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The precomputed root table must match the real first-letter
    /// boundaries of `ALL_KEYWORDS`, and the keyword list must be sorted
    /// (a requirement for the bucketed binary search).
    #[test]
    fn check_keyword_index_roots() {
        let mut expected = Vec::with_capacity(26);
        expected.push(0);
        for (idx, pair) in ALL_KEYWORDS.windows(2).enumerate() {
            // Sorted, strictly ascending — duplicates would also break lookup.
            assert!(pair[0] < pair[1]);
            // A first-letter change marks the start of a new bucket.
            if pair[0].as_bytes()[0] != pair[1].as_bytes()[0] {
                expected.push(idx + 1);
            }
        }
        assert_eq!(&expected, KEYWORD_LOOKUP_INDEX_ROOT);
    }

    /// Every keyword must round-trip through `lookup` to its own variant.
    #[test]
    fn check_keyword_lookup() {
        for (idx, word) in ALL_KEYWORDS.iter().enumerate() {
            assert_eq!(lookup(word), ALL_KEYWORDS_INDEX[idx]);
        }
    }
}
6 changes: 2 additions & 4 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ use crate::dialect::{
BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
SnowflakeDialect,
};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
use crate::keywords::{self, Keyword};

/// SQL Token enumeration
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
Expand Down Expand Up @@ -344,13 +344,11 @@ impl Token {
}

pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
let word_uppercase = word.to_uppercase();
Token::Word(Word {
value: word.to_string(),
quote_style,
keyword: if quote_style.is_none() {
let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
keywords::lookup(word)
} else {
Keyword::NoKeyword
},
Expand Down

0 comments on commit 592dc97

Please sign in to comment.