diff --git a/Cargo.lock b/Cargo.lock
index a1d9ab04..ed080626 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -714,11 +714,12 @@ dependencies = [
 
 [[package]]
 name = "serde_json"
-version = "1.0.120"
+version = "1.0.127"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4e0d21c9a8cae1235ad58a00c11cb40d4b1e5c784f1ef2c537876ed6ffd8b7c5"
+checksum = "8043c06d9f82bd7271361ed64f415fe5e12a77fdb52e573e7f06a516dea329ad"
 dependencies = [
  "itoa",
+ "memchr",
  "ryu",
  "serde",
 ]
@@ -780,6 +781,15 @@ dependencies = [
  "time",
 ]
 
+[[package]]
+name = "smallvec"
+version = "1.13.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "strsim"
 version = "0.11.1"
@@ -906,7 +916,7 @@ dependencies = [
 
 [[package]]
 name = "tripsu"
-version = "0.0.2"
+version = "0.1.0"
 dependencies = [
  "bitflags",
  "blake3",
@@ -917,10 +927,12 @@ dependencies = [
  "rio_turtle",
  "rstest",
  "serde",
+ "serde_json",
  "serde_yml",
  "slog",
  "slog-async",
  "slog-term",
+ "smallvec",
  "tempfile",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index a7b1a0ca..d644442d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,10 +6,12 @@
 rio_api = '0.8.4'
 rio_turtle = '0.8.4'
 rstest = '0.21.0'
+serde_json = "1.0.127"
 serde_yml = '0.0.10'
 slog = '2.7.0'
 slog-async = '2.8.0'
 slog-term = '2.9.0'
+smallvec = { version = "1.13.2", features = ["serde"] }
 tempfile = '3.10.1'
 
 [dependencies.clap]
diff --git a/docs/development-guide.md b/docs/development-guide.md
index 4cb33f99..3a71b13d 100644
--- a/docs/development-guide.md
+++ b/docs/development-guide.md
@@ -110,3 +110,11 @@ It will:
 
 **Note: If the release pipeline fails, you can just run this same command again.
 Also rerun it when you made a mistake, it will cancel the current release (works also when `--amend`ing on the current commit)**
+
+## Benchmarking performance
+
+A benchmarking script is provided in `tools/bench/benchmark.sh`, along with a Nix devshell.
+To run the benchmark in the isolated environment, run:
+
+```shell
+just nix-develop-bench bash ./tools/bench/benchmark.sh
+```
diff --git a/justfile b/justfile
index 8cb8b3b9..c8c75752 100644
--- a/justfile
+++ b/justfile
@@ -23,6 +23,13 @@ nix-develop-ci *args:
     { [ -n "${cmd:-}" ] || cmd=("zsh"); } && \
     nix develop ./tools/nix#ci --command "${cmd[@]}"
 
+# Enter nix development shell for benchmarking.
+nix-develop-bench *args:
+    cd "{{root_dir}}" && \
+    cmd=("$@") && \
+    { [ -n "${cmd:-}" ] || cmd=("zsh"); } && \
+    nix develop ./tools/nix#bench --command "${cmd[@]}"
+
 ## Standard stuff =============================================================
 # Format the code.
 format *args:
diff --git a/src/index.rs b/src/index.rs
index 039edcea..1585a21a 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -1,18 +1,87 @@
 use rio_api::parser::TriplesParser;
 use rio_turtle::TurtleError;
-use std::{io::Write, path::Path};
+use serde::{Deserialize, Serialize};
+use smallvec::{smallvec, SmallVec};
+use std::{
+    collections::HashMap,
+    hash::{DefaultHasher, Hash, Hasher},
+    path::Path,
+};
 
 use crate::{
     io,
     rdf_types::{Triple, TripleView},
 };
 
-fn index_triple(t: Triple, out: &mut impl Write) {
+/// Stores a mapping from hashed instance URIs to their types.
+/// The type URIs are stored once as a vector of strings.
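+/// This avoids storing a copy of each (potentially long) type URI for every subject.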
+/// Each subject in the map is stored as hash(subject_uri): u64
+/// and refers to its types using their vector index.
+#[derive(Serialize, Deserialize)]
+pub struct TypeIndex {
+    pub types: Vec<String>,
+    map: HashMap<u64, SmallVec<[usize; 1]>>,
+}
+
+impl TypeIndex {
+    fn hash(&self, s: &impl Hash) -> u64 {
+        let mut hasher = DefaultHasher::new();
+        s.hash(&mut hasher);
+        hasher.finish().to_le()
+    }
+
+    /// Build an index from an iterator of (subject URI, type URI) pairs.
+    pub fn from_iter<'a>(type_map: impl Iterator<Item = (&'a str, &'a str)>) -> Self {
+        let mut idx = TypeIndex::new();
+
+        type_map.for_each(|(subject_uri, type_uri)| idx.insert(subject_uri, type_uri).unwrap());
+
+        idx
+    }
+
+    pub fn new() -> Self {
+        TypeIndex {
+            types: Vec::new(),
+            map: HashMap::new(),
+        }
+    }
+
+    /// Insert an input subject-type mapping into the index.
+    /// The index only stores the hash of the subject.
+    pub fn insert(&mut self, subject_uri: &str, type_uri: &str) -> Result<(), std::io::Error> {
+        let key = self.hash(&subject_uri.to_string());
+
+        // Get the index of the type URI, registering it if new.
+        let type_idx = match self.types.iter().position(|x| *x == type_uri) {
+            Some(i) => i,
+            None => {
+                self.types.push(type_uri.to_string());
+                self.types.len() - 1
+            }
+        };
+
+        // Insert the mapping into the index.
+        match self.map.get_mut(&key) {
+            Some(v) => {
+                v.push(type_idx);
+            }
+            None => {
+                self.map.insert(key, smallvec![type_idx]);
+            }
+        }
+
+        Ok(())
+    }
+
+    pub fn get(&self, subject_key: &str) -> Option<Vec<&str>> {
+        let key = self.hash(&subject_key.to_string());
+        self.map
+            .get(&key)
+            .map(|v| v.iter().map(|i| self.types[*i].as_ref()).collect())
+    }
+}
+
+fn index_triple(t: Triple, index: &mut TypeIndex) {
     if t.predicate.iri.as_str() == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" {
-        let r = || -> std::io::Result<()> {
-            out.write_all(t.to_string().as_bytes())?;
-            out.write_all(b" .\n")
-        }();
+        let r = index.insert(&t.subject.to_string(), &t.object.to_string());
 
         if let Err(e) = r {
-            panic!("Error writting to out buffer: {e}");
+            panic!("Error inserting into type index: {e}");
@@ -20,19 +89,45 @@ fn index_triple(t: Triple, out: &mut impl Write) {
         }
     }
 }
 
-pub fn create_type_map(input: &Path, output: &Path) {
+pub fn create_type_index(input: &Path, output: &Path) {
     let buf_in = io::get_reader(input);
-    let mut buf_out = io::get_writer(output);
+    let buf_out = io::get_writer(output);
     let mut triples = io::parse_ntriples(buf_in);
+    let mut index = TypeIndex::new();
 
     while !triples.is_end() {
         let _ = triples
             .parse_step(&mut |t: TripleView| {
-                index_triple(t.into(), &mut buf_out);
+                index_triple(t.into(), &mut index);
                 Result::<(), TurtleError>::Ok(())
             })
             .inspect_err(|e| {
                 panic!("Parsing error occurred: {e}");
             });
     }
+    serde_json::to_writer(buf_out, &index).expect("Error writing index to output.");
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    // Test building a TypeIndex from (subject, type) pairs and querying it.
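+    // The types of a subject should be returned in insertion order.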
+    fn index_from_iter() {
+        let vals = vec![
+            ("<urn:Alice>", "<urn:Person>"),
+            ("<urn:Alice>", "<urn:Employee>"),
+            ("<urn:ACME>", "<urn:Organization>"),
+        ]
+        .into_iter();
+
+        let idx = TypeIndex::from_iter(vals);
+
+        assert_eq!(
+            idx.get("<urn:Alice>").unwrap(),
+            vec!["<urn:Person>", "<urn:Employee>"]
+        );
+        println!("{}", serde_json::to_string(&idx).unwrap());
+    }
+}
diff --git a/src/io.rs b/src/io.rs
index a4d05b11..24722e95 100644
--- a/src/io.rs
+++ b/src/io.rs
@@ -1,4 +1,4 @@
-use crate::rules::Rules;
+use crate::{index::TypeIndex, rules::Rules};
 use rio_turtle::NTriplesParser;
 use std::{
     fs::File,
@@ -46,7 +46,15 @@ pub fn parse_ntriples(reader: impl BufRead) -> NTriplesParser<impl BufRead> {
 pub fn parse_rules(path: &Path) -> Rules {
     return match File::open(path) {
         Ok(file) => serde_yml::from_reader(file).expect("Error parsing rules file."),
-        Err(e) => panic!("Cannot open file '{:?}': '{}'.", path, e),
+        Err(e) => panic!("Cannot open rules file '{:?}': '{}'.", path, e),
     };
 }
+
+// Parse the JSON type index.
+pub fn parse_index(path: &Path) -> TypeIndex {
+    return match File::open(path) {
+        Ok(file) => serde_json::from_reader(file).expect("Error parsing index file."),
+        Err(e) => panic!("Cannot open index file '{:?}': '{}'.", path, e),
+    };
+}
diff --git a/src/log.rs b/src/log.rs
index d7668460..5bcf3051 100644
--- a/src/log.rs
+++ b/src/log.rs
@@ -20,7 +20,7 @@ pub fn create_logger(use_stdout: bool) -> Arc<Logger> {
         .fuse();
 
     let drain = slog_async::Async::new(drain)
-        .chan_size(5_000_000)
+        .chan_size(1_000)
         .build()
         .fuse();
diff --git a/src/main.rs b/src/main.rs
index 09cc4810..e83c112e 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -10,7 +10,7 @@ mod rules;
 
 // Define the imports.
 use crate::{
-    index::create_type_map,
+    index::create_type_index,
     log::{create_logger, info},
     pseudo::pseudonymize_graph,
 };
@@ -87,7 +87,7 @@ fn main() {
     match cli.command {
         Subcommands::Index(args) => {
             info!(log, "Args: {:?}", args);
-            create_type_map(&args.input, &args.output)
+            create_type_index(&args.input, &args.output)
         }
         Subcommands::Pseudo(args) => {
             info!(log, "Args: {:?}", args);
diff --git a/src/pseudo.rs b/src/pseudo.rs
index 6fbca0ae..c1c55a3b 100644
--- a/src/pseudo.rs
+++ b/src/pseudo.rs
@@ -1,13 +1,13 @@
 use rio_api::parser::TriplesParser;
 use rio_turtle::TurtleError;
 use std::{
-    collections::HashMap,
-    io::{BufRead, Write},
+    io::Write,
     path::{Path, PathBuf},
 };
 
 use crate::{
     crypto::{new_pseudonymizer, Pseudonymize},
+    index::TypeIndex,
     io,
     log::Logger,
     rdf_types::*,
@@ -19,7 +19,7 @@
 fn process_triple(
     triple: Triple,
     rules_config: &Rules,
-    node_to_type: &HashMap<String, String>,
+    node_to_type: &mut TypeIndex,
     out: &mut impl Write,
     hasher: &dyn Pseudonymize,
 ) {
@@ -35,24 +35,6 @@ fn process_triple(
     }
 }
 
-// Create a index mapping node -> type from an input ntriples buffer
-fn load_type_map(input: impl BufRead) -> HashMap<String, String> {
-    let mut node_to_type: HashMap<String, String> = HashMap::new();
-    let mut triples = io::parse_ntriples(input);
-
-    while !triples.is_end() {
-        let _: Result<(), TurtleError> = triples.parse_step(&mut |t| {
-            node_to_type.insert(
-                t.subject.to_string().replace(['<', '>'], ""),
-                t.object.to_string().replace(['<', '>'], ""),
-            );
-            Ok(())
-        });
-    }
-
-    return node_to_type;
-}
-
 pub fn pseudonymize_graph(
     _: &Logger,
     input: &Path,
@@ -62,11 +44,10 @@ pub fn pseudonymize_graph(
     secret_path: &Option<PathBuf>,
 ) {
     let buf_input = io::get_reader(input);
-    let buf_index = io::get_reader(index_path);
     let mut buf_output = io::get_writer(output);
     let rules = io::parse_rules(rules_path);
-    let node_to_type: HashMap<String, String> = load_type_map(buf_index);
+    let mut type_index = io::parse_index(index_path);
     let secret = secret_path.as_ref().map(io::read_bytes);
 
     let pseudonymizer = new_pseudonymizer(None, secret);
@@ -80,7 +61,7 @@ pub fn pseudonymize_graph(
             process_triple(
                 t.into(),
                 &rules,
-                &node_to_type,
+                &mut type_index,
                 &mut buf_output,
                 &pseudonymizer,
             );
@@ -102,14 +83,14 @@ mod tests {
 
     #[test]
-    // Test the parsing of a triple.
-    fn encrypt_nt_file() {
+    // Test the pseudonymization of an N-Triples file.
+    fn pseudo_nt_file() {
         let logger = log::create_logger(true);
         let dir = tempdir().unwrap();
 
         let input_path = Path::new("tests/data/test.nt");
         let rules_path = Path::new("tests/data/rules.yaml");
         let output_path = dir.path().join("output.nt");
-        let type_map_path = Path::new("tests/data/type_map.nt");
+        let type_map_path = Path::new("tests/data/type_index.json");
         let key = None;
         pseudonymize_graph(
             &logger,
diff --git a/src/rules.rs b/src/rules.rs
index 0dc26e00..c0f03ecb 100644
--- a/src/rules.rs
+++ b/src/rules.rs
@@ -2,7 +2,7 @@ use crate::rdf_types::*;
 use ::std::collections::{HashMap, HashSet};
 use serde::{Deserialize, Serialize};
 
-use crate::model::TripleMask;
+use crate::{index::TypeIndex, model::TripleMask};
 
 /// Rules for pseudonymizing nodes
 #[derive(Serialize, Deserialize, Debug, Default)]
@@ -38,11 +38,7 @@ pub struct Rules {
 }
 
 /// Check all parts of the triple against rules.
-pub fn match_rules(
-    triple: &Triple,
-    rules: &Rules,
-    type_map: &HashMap<String, String>,
-) -> TripleMask {
+pub fn match_rules(triple: &Triple, rules: &Rules, type_map: &mut TypeIndex) -> TripleMask {
     let mut mask =
         match_node_rules(triple, rules, type_map) | match_object_rules(triple, rules, type_map);
 
@@ -54,17 +50,13 @@
 }
 
 /// Check triple against node-pseudonymization rules.
-pub fn match_node_rules(
-    triple: &Triple,
-    rules: &Rules,
-    type_map: &HashMap<String, String>,
-) -> TripleMask {
+pub fn match_node_rules(triple: &Triple, rules: &Rules, type_map: &mut TypeIndex) -> TripleMask {
     let pseudo_subject = match &triple.subject {
-        Subject::NamedNode(n) => match_type(&n.iri, rules, type_map),
+        Subject::NamedNode(n) => match_type(&n.to_string(), rules, type_map),
         Subject::BlankNode(_) => false,
     };
     let pseudo_object = match &triple.object {
-        Term::NamedNode(n) => match_type(&n.iri, rules, type_map),
+        Term::NamedNode(n) => match_type(&n.to_string(), rules, type_map),
         Term::BlankNode(_) => false,
         Term::Literal(_) => false,
     };
@@ -81,22 +73,24 @@
 }
 
 /// Checks triple against object-pseudonymization rules
-pub fn match_object_rules(
-    triple: &Triple,
-    rules: &Rules,
-    type_map: &HashMap<String, String>,
-) -> TripleMask {
-    if match_predicate(&triple.predicate.iri, rules) {
+pub fn match_object_rules(triple: &Triple, rules: &Rules, type_map: &mut TypeIndex) -> TripleMask {
+    if match_predicate(&triple.predicate.to_string(), rules) {
         return TripleMask::OBJECT;
     }
 
     let pseudo_object = match &triple.subject {
-        Subject::NamedNode(n) => {
-            match_type_predicate(&n.iri, &triple.predicate.iri, type_map, rules)
-        }
-        Subject::BlankNode(b) => {
-            match_type_predicate(&b.id, &triple.predicate.iri, type_map, rules)
-        }
+        Subject::NamedNode(n) => match_type_predicate(
+            &n.to_string(),
+            &triple.predicate.to_string(),
+            type_map,
+            rules,
+        ),
+        Subject::BlankNode(b) => match_type_predicate(
+            &b.to_string(),
+            &triple.predicate.to_string(),
+            type_map,
+            rules,
+        ),
     };
 
     if pseudo_object {
@@ -107,9 +101,9 @@
 }
 
 /// Check if the type of input instance URI is in the rules.
-fn match_type(subject: &str, rules: &Rules, type_map: &HashMap<String, String>) -> bool {
+fn match_type(subject: &str, rules: &Rules, type_map: &mut TypeIndex) -> bool {
     if let Some(v) = type_map.get(subject) {
-        rules.nodes.of_type.contains(v)
+        v.iter().any(|&i| rules.nodes.of_type.contains(i))
     } else {
         false
     }
@@ -124,19 +118,21 @@ fn match_predicate(predicate: &str, rules: &Rules) -> bool {
 fn match_type_predicate(
     subject: &str,
     predicate: &str,
-    type_map: &HashMap<String, String>,
+    type_map: &mut TypeIndex,
     rules: &Rules,
 ) -> bool {
-    let subject_type = match type_map.get(subject) {
-        None => return false,
-        Some(v) => v,
-    };
-    let preds = rules.objects.on_type_predicate.get(subject_type);
-    if preds.is_none() || !preds.unwrap().contains(predicate) {
+    let Some(instance_types) = type_map.get(subject) else {
         return false;
-    }
+    };
 
-    return true;
+    for typ in instance_types {
+        if let Some(preds) = rules.objects.on_type_predicate.get(typ) {
+            if preds.contains(predicate) {
+                return true;
+            }
+        }
+    }
+    return false;
 }
 
 #[cfg(test)]
@@ -148,20 +144,21 @@ mod tests {
     use super::*;
     use rio_turtle::{TurtleError, TurtleParser};
     use rstest::rstest;
     use serde_yml;
 
     // Instances used in tests
-    const NODE_IRI: &str = "Alice";
-    const PREDICATE_IRI: &str = "hasName";
+    const NODE_IRI: &str = "<Alice>";
+    const PREDICATE_IRI: &str = "<hasName>";
 
     // Helper macro to create a TypeIndex from pairs
     #[macro_export]
     macro_rules! index {
         () => {
-            ::std::collections::HashMap::new()
+            TypeIndex::new()
         };
         ($($key:expr => $value:expr),+ $(,)?) => {
-            ::std::collections::HashMap::from([
-                $((String::from($key), String::from($value))),*
-            ])
+            TypeIndex::from_iter(
+                vec![
+                    $(($key, $value)),*
+                ].into_iter())
         };
     }
@@ -171,13 +168,13 @@
     #[rstest]
     // Subject is in the rules & type index
-    #[case(index! { NODE_IRI => "Person" }, "Person", true)]
+    #[case(index! { NODE_IRI => "<Person>" }, "<Person>", true)]
     // Subject is in the type index, not in the rules
-    #[case(index! { NODE_IRI => "Person" }, "Bank", false)]
+    #[case(index! { NODE_IRI => "<Person>" }, "<Bank>", false)]
     // Subject is not in the type index
-    #[case(index! { "BankName" => "Bank" }, "Bank", false)]
+    #[case(index! { "<BankName>" => "<Bank>" }, "<Bank>", false)]
     fn type_rule(
-        #[case] index: HashMap<String, String>,
+        #[case] mut index: TypeIndex,
         #[case] rule_type: &str,
         #[case] match_expected: bool,
     ) {
         let rules = parse_rules(&format!(
@@ -189,7 +186,7 @@
             "
         ));
 
-        assert_eq!(match_type(NODE_IRI, &rules, &index), match_expected);
+        assert_eq!(match_type(NODE_IRI, &rules, &mut index), match_expected);
     }
 
     #[rstest]
@@ -210,17 +207,17 @@
     #[rstest]
     // Subject predicate in config
-    #[case("Person", "hasName", index! { NODE_IRI => "Person" }, true)]
+    #[case("<Person>", "<hasName>", index! { NODE_IRI => "<Person>" }, true)]
     // Subject in config, predicate not
-    #[case("Person", "hasAge", index! { NODE_IRI => "Person" }, false)]
+    #[case("<Person>", "<hasAge>", index! { NODE_IRI => "<Person>" }, false)]
     // Subject predicate not in config
-    #[case("Bob", "hasAge", index! { NODE_IRI => "Person" }, false)]
+    #[case("<Bob>", "<hasAge>", index! { NODE_IRI => "<Person>" }, false)]
     // Subject not in type index
-    #[case("Bob", "hasAge", index! { "Bob" => "Person" }, false)]
+    #[case("<Bob>", "<hasAge>", index! { "<Bob>" => "<Person>" }, false)]
     fn type_predicate_rule(
         #[case] rule_type: &str,
         #[case] rule_predicate: &str,
-        #[case] index: HashMap<String, String>,
+        #[case] mut index: TypeIndex,
        #[case] match_expected: bool,
    ) {
        let rules = parse_rules(&format!(
@@ -233,7 +230,7 @@
         ));
 
         assert_eq!(
-            match_type_predicate(NODE_IRI, PREDICATE_IRI, &index, &rules),
+            match_type_predicate(NODE_IRI, PREDICATE_IRI, &mut index, &rules),
             match_expected
         );
     }
@@ -254,21 +251,23 @@
         let rules: Rules = parse_rules(
             r#"
             nodes:
-              of_type: ["urn:Person"]
+              of_type: ["<urn:Person>"]
             objects:
-              on_predicate: ["urn:hasLastName"]
+              on_predicate: ["<urn:hasLastName>"]
               on_type_predicate:
-                "urn:Person": ["urn:hasAge"]
+                "<urn:Person>": ["<urn:hasAge>"]
             "#,
         );
 
-        let index = index! {
-            "urn:Alice" => "urn:Person",
-            "urn:Bob" => "urn:Person",
-            "urn:ACME" => "urn:Organization"
-        };
+        let mut index = index! {
+            "<urn:Alice>" => "<urn:Person>",
+            "<urn:Bob>" => "<urn:Person>",
+            "<urn:ACME>" => "<urn:Organization>"
+        };
+        println!("{}", serde_yml::to_string(&rules).unwrap());
+        println!("{}", serde_json::to_string(&index).unwrap());
 
         TurtleParser::new(triple.as_ref(), None)
             .parse_all(&mut |t| {
-                let mask = match_rules(&t.into(), &rules, &index);
+                let mask = match_rules(&t.into(), &rules, &mut index);
                 assert_eq!(mask.bits(), expected_mask);
                 Ok(()) as Result<(), TurtleError>
             })
diff --git a/tests/data/rules.yaml b/tests/data/rules.yaml
index 65f19b0b..46c9cb85 100644
--- a/tests/data/rules.yaml
+++ b/tests/data/rules.yaml
@@ -4,18 +4,18 @@ invert: false
 # hash URIs of people and online accounts
 nodes:
   of_type:
-    - "http://xmlns.com/foaf/0.1/Person" # All nodes which are rdf:type Person
-    - "http://xmlns.com/foaf/OnlineAccount" # "" OnlineAccount
+    - "<http://xmlns.com/foaf/0.1/Person>" # All nodes which are rdf:type Person
+    - "<http://xmlns.com/foaf/OnlineAccount>" # "" OnlineAccount
 
 objects:
   # hash accesscode values for all nodes
   on_predicate:
-    - "http://schema.org/accessCode"
+    - "<http://schema.org/accessCode>"
   #on_type: # NOTE: not currently supported
   #- "http://example.org/UserAccount"
   # hash name only for instances of person and online account
   on_type_predicate:
-    "http://xmlns.com/foaf/OnlineAccount":
-      - "http://schema.org/name"
-    "http://xmlns.com/foaf/0.1/Person":
-      - "http://schema.org/name"
+    "<http://xmlns.com/foaf/OnlineAccount>":
+      - "<http://schema.org/name>"
+    "<http://xmlns.com/foaf/0.1/Person>":
+      - "<http://schema.org/name>"
diff --git a/tests/data/type_index.json b/tests/data/type_index.json
new file mode 100644
index 00000000..abc6847c
--- /dev/null
+++ b/tests/data/type_index.json
@@ -0,0 +1 @@
+{"types":["<http://xmlns.com/foaf/0.1/Person>","<http://xmlns.com/foaf/OnlineAccount>","<http://xmlns.com/foaf/0.1/Organization>"],"map":{"15212815035200482759":[1],"130358124972442050":[0],"9932096721503705860":[1],"10729855068363610633":[2],"8283467020653172379":[0]}}
\ No newline at end of file
diff --git a/tests/data/type_map.nt b/tests/data/type_map.nt
deleted file mode 100644
index 1ff5b7ea..00000000
--- a/tests/data/type_map.nt
+++ /dev/null
@@ -1,4 +0,0 @@
-<http://example.org/Alice> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://xmlns.com/foaf/0.1/Person> .
-<http://example.org/Alice-Bank-Account> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://xmlns.com/foaf/OnlineAccount> .
-<http://example.org/Bob> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://xmlns.com/foaf/0.1/Person> .
-<http://example.org/Bob-Bank-Account> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://xmlns.com/foaf/OnlineAccount> .
\ No newline at end of file
diff --git a/tools/bench/benchmark.sh b/tools/bench/benchmark.sh
new file mode 100644
index 00000000..30db9250
--- /dev/null
+++ b/tools/bench/benchmark.sh
@@ -0,0 +1,171 @@
+#!/usr/bin/env bash
+
+# Benchmark runtime and memory usage of tripsu.
+# Compares the working directory version against a baseline branch (main by default).
+
+set -euo pipefail
+
+### Final output path
+OUTPUT="profiling.md"
+PROFILE='release'
+BUILD_ARGS=( )
+[[ "${PROFILE}" == 'release' ]] && BUILD_ARGS+=( '--release' )
+
+### Setup binaries
+
+# baseline binary
+BASE_BRANCH='main'
+
+BASE_DIR=$(mktemp -d)
+BASE_URL="$(git config --get remote.origin.url)"
+(
+    GIT_CLONE_PROTECTION_ACTIVE=false \
+    git clone \
+        --branch "${BASE_BRANCH}" \
+        "${BASE_URL}" \
+        "${BASE_DIR}" \
+    && cd "${BASE_DIR}" \
+    && just build "${BUILD_ARGS[@]}"
+)
+BASE_BIN="${BASE_DIR}/target/${PROFILE}/tripsu"
+
+# current binary
+COMP_BRANCH="$(git rev-parse --abbrev-ref HEAD)"
+just build "${BUILD_ARGS[@]}"
+COMP_BIN="./target/${PROFILE}/tripsu"
+
+# setup data
+DATA_URL="https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf/proteomes.rdf.xz"
+INPUT="/tmp/proteomes.nt"
+
+# Download data if needed
+if [ ! -f "${INPUT}" ]; then
+    curl "${DATA_URL}" \
+    | xz -dc - \
+    | rdfpipe-rs -i rdf-xml -o nt - \
+    > "${INPUT}" || rm "${INPUT}"
+fi
+
+# setup config
+RULES=$(mktemp)
+BASE_IDX=$(mktemp)
+COMP_IDX=$(mktemp)
+
+cat << EOF > "${RULES}"
+
+nodes:
+  of_type:
+    - "http://purl.uniprot.org/core/Proteome"
+    - "http://purl.uniprot.org/core/Strain"
+
+objects:
+  on_type_predicate:
+    "http://purl.uniprot.org/core/Submission_Citation":
+      - "http://purl.uniprot.org/core/author"
+
+  on_predicate:
+    - "http://purl.org/dc/terms/identifier"
+
+EOF
+
+### Commands to benchmark
+BASE_CMD_IDX="${BASE_BIN} index -o ${BASE_IDX} ${INPUT}"
+COMP_CMD_IDX="${COMP_BIN} index -o ${COMP_IDX} ${INPUT}"
+BASE_CMD_PSD="${BASE_BIN} pseudo -r ${RULES} -x ${BASE_IDX} ${INPUT}"
+COMP_CMD_PSD="${COMP_BIN} pseudo -r ${RULES} -x ${COMP_IDX} ${INPUT}"
+
+### Functions for profiling
+
+cpu_prof() {
+    local branch1=$1
+    local cmd1=$2
+    local branch2=$3
+    local cmd2=$4
+    local out=$5
+    hyperfine --export-markdown "${out}" -r 5 \
+        -n "${branch1}" "${cmd1}" \
+        -n "${branch2}" "${cmd2}"
+}
+
+mem_prof() {
+    local name=$1
+    local cmd=$2
+    local heap_out
+    heap_out=$(mktemp)
+    echo -n "$name: "
+    # shellcheck disable=SC2086
+    heaptrack -o "${heap_out}" ${cmd} >/dev/null
+    heaptrack_print "${heap_out}.zst" \
+    | grep '^peak heap memory'
+}
+
+make_report() {
+    local cpu_index=$1
+    local cpu_pseudo=$2
+    local mem_index=$3
+    local mem_pseudo=$4
+    local base_branch=$5
+
+    cat <<-MD
+	# tripsu profiling
+
+	> date: $(date -u +%Y-%m-%d)
+
+	Comparing $(git branch --show-current) against ${base_branch}.
+
+	## Timings
+
+	Run time compared using hyperfine.
+
+	### Indexing
+
+	$(cat "${cpu_index}")
+
+	### Pseudonymization
+
+	$(cat "${cpu_pseudo}")
+
+	## Memory
+
+	Heap memory usage compared using heaptrack.
+
+	### Indexing
+
+	$(cat "${mem_index}")
+
+	### Pseudonymization
+
+	$(cat "${mem_pseudo}")
+	MD
+}
+
+
+### Run profiling
+
+## Profile cpu time
+HYPF_IDX_OUT=$(mktemp)
+HYPF_PSD_OUT=$(mktemp)
+
+# indexing
+cpu_prof "${BASE_BRANCH}" "${BASE_CMD_IDX}" \
+    "${COMP_BRANCH}" "${COMP_CMD_IDX}" "${HYPF_IDX_OUT}"
+# pseudonymization
+cpu_prof "${BASE_BRANCH}" "${BASE_CMD_PSD}" \
+    "${COMP_BRANCH}" "${COMP_CMD_PSD}" "${HYPF_PSD_OUT}"
+
+## Profile memory
+HEAP_IDX_OUT=$(mktemp)
+HEAP_PSD_OUT=$(mktemp)
+
+# indexing
+mem_prof "${BASE_BRANCH}" "${BASE_CMD_IDX}" > "${HEAP_IDX_OUT}"
+mem_prof "${COMP_BRANCH}" "${COMP_CMD_IDX}" >> "${HEAP_IDX_OUT}"
+# pseudonymization
+mem_prof "${BASE_BRANCH}" "${BASE_CMD_PSD}" > "${HEAP_PSD_OUT}"
+mem_prof "${COMP_BRANCH}" "${COMP_CMD_PSD}" >> "${HEAP_PSD_OUT}"
+
+
+### Reporting
+make_report \
+    "${HYPF_IDX_OUT}" "${HYPF_PSD_OUT}" \
+    "${HEAP_IDX_OUT}" "${HEAP_PSD_OUT}" \
+    "${BASE_BRANCH}" > "${OUTPUT}"
diff --git a/tools/nix/flake.nix b/tools/nix/flake.nix
index 3e0c2b4e..d561f10f 100644
--- a/tools/nix/flake.nix
+++ b/tools/nix/flake.nix
@@ -82,6 +82,11 @@
       dasel
     ];
 
+    benchInputs = with pkgs; [
+      hyperfine
+      heaptrack
+    ];
+
     # Things needed at runtime.
     buildInputs = [];
 
@@ -98,6 +103,12 @@
       inherit buildInputs;
       nativeBuildInputs = nativeBuildInputsBasic ++ nativeBuildInputsDev;
     };
+    bench = mkShell {
+      inherit buildInputs;
+      nativeBuildInputs = nativeBuildInputsBasic
+        ++ nativeBuildInputsDev
+        ++ benchInputs;
+    };
     ci = mkShell {
       inherit buildInputs;