From bb95e2329937ae90668b80a806608fe99779dd14 Mon Sep 17 00:00:00 2001 From: Miguel Date: Thu, 30 May 2024 21:42:14 -0400 Subject: [PATCH] chore: remove wip example --- Cargo.lock | 172 ----------------- services/ai/examples/rust/nlp/Cargo.toml | 17 -- .../ai/examples/rust/nlp/examples/chatbot.rs | 173 ------------------ 3 files changed, 362 deletions(-) delete mode 100644 services/ai/examples/rust/nlp/Cargo.toml delete mode 100644 services/ai/examples/rust/nlp/examples/chatbot.rs diff --git a/Cargo.lock b/Cargo.lock index cdf2c09d6..4f33f426c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3337,12 +3337,6 @@ dependencies = [ "version_check", ] -[[package]] -name = "esaxx-rs" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" - [[package]] name = "eth-keystore" version = "0.5.0" @@ -3723,22 +3717,6 @@ dependencies = [ "workspace-hack 0.1.0", ] -[[package]] -name = "example-machine-comprehension" -version = "0.0.0" -dependencies = [ - "cdk-rust", - "ndarray", - "rand 0.8.5", - "reqwest", - "safetensors", - "safetensors-ndarray", - "serde_json", - "tokenizers", - "tokio", - "workspace-hack 0.1.0", -] - [[package]] name = "exr" version = "1.72.0" @@ -4789,23 +4767,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dfa686283ad6dd069f105e5ab091b04c62850d3e4cf5d67debad1933f55023df" -[[package]] -name = "hf-hub" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b780635574b3d92f036890d8373433d6f9fc7abb320ee42a5c25897fc8ed732" -dependencies = [ - "dirs 5.0.1", - "indicatif", - "log", - "native-tls", - "rand 0.8.5", - "serde", - "serde_json", - "thiserror", - "ureq", -] - [[package]] name = "hkdf" version = "0.12.4" @@ -7067,22 +7028,6 @@ dependencies = [ "libc", ] -[[package]] -name = "macro_rules_attribute" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a82271f7bc033d84bbca59a3ce3e4159938cb08a9c3aebbe54d215131518a13" -dependencies = [ - "macro_rules_attribute-proc_macro", - "paste", -] - -[[package]] -name = "macro_rules_attribute-proc_macro" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568" - [[package]] name = "malloc_buf" version = "0.0.6" @@ -7341,27 +7286,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "monostate" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "878c2a1f1c70e5724fa28f101ca787b6a7e8ad5c5e4ae4ca3b0fa4a419fa9075" -dependencies = [ - "monostate-impl", - "serde", -] - -[[package]] -name = "monostate-impl" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f686d68a09079e63b1d2c64aa305095887ce50565f00a922ebfaeeee0d9ba6ce" -dependencies = [ - "proc-macro2 1.0.74", - "quote 1.0.35", - "syn 2.0.46", -] - [[package]] name = "msim" version = "0.1.0" @@ -8311,28 +8235,6 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" -[[package]] -name = "onig" -version = "6.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f" -dependencies = [ - "bitflags 1.3.2", - "libc", - "once_cell", - "onig_sys", -] - -[[package]] -name = "onig_sys" -version = "69.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b829e3d7e9cc74c7e315ee8edb185bf4190da5acde74afd7fc59c35b1f086e7" -dependencies = [ - "cc", - "pkg-config", -] - [[package]] name = "oorandom" version = "11.1.3" @@ -9785,17 +9687,6 @@ dependencies = [ "rayon-core", ] -[[package]] -name = "rayon-cond" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9" -dependencies = [ - "either", - "itertools 0.11.0", - "rayon", -] - [[package]] name = "rayon-core" version = "1.12.0" @@ -11487,18 +11378,6 @@ dependencies = [ "der 0.7.8", ] -[[package]] -name = "spm_precompiled" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326" -dependencies = [ - "base64 0.13.1", - "nom", - "serde", - "unicode-segmentation", -] - [[package]] name = "stability" version = "0.2.0" @@ -12056,38 +11935,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" -[[package]] -name = "tokenizers" -version = "0.15.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dd47962b0ba36e7fd33518fbf1754d136fd1474000162bbf2a8b5fcb2d3654d" -dependencies = [ - "aho-corasick", - "derive_builder", - "esaxx-rs", - "getrandom", - "hf-hub", - "itertools 0.12.0", - "lazy_static", - "log", - "macro_rules_attribute", - "monostate", - "onig", - "paste", - "rand 0.8.5", - "rayon", - "rayon-cond", - "regex", - "regex-syntax 0.8.2", - "serde", - "serde_json", - "spm_precompiled", - "thiserror", - "unicode-normalization-alignments", - "unicode-segmentation", - "unicode_categories", -] - [[package]] name = "tokio" version = "1.37.0" @@ -12904,15 +12751,6 @@ dependencies = [ "tinyvec", ] -[[package]] -name = "unicode-normalization-alignments" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de" -dependencies = [ - "smallvec", -] - [[package]] name = "unicode-segmentation" version = "1.10.1" @@ -12937,12 +12775,6 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" -[[package]] -name = "unicode_categories" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" - [[package]] name = "universal-hash" version = "0.5.1" @@ -12984,14 +12816,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8cdd25c339e200129fe4de81451814e5228c9b771d57378817d6117cc2b3f97" dependencies = [ "base64 0.21.5", - "flate2", "log", - "native-tls", "once_cell", "rustls 0.21.10", "rustls-webpki 0.101.7", - "serde", - "serde_json", "url", "webpki-roots 0.25.3", ] diff --git a/services/ai/examples/rust/nlp/Cargo.toml b/services/ai/examples/rust/nlp/Cargo.toml deleted file mode 100644 index 0259b4344..000000000 --- a/services/ai/examples/rust/nlp/Cargo.toml +++ /dev/null @@ -1,17 +0,0 @@ -[package] -name = "example-machine-comprehension" -version = "0.0.0" -edition = "2021" -publish = false - -[dependencies] -cdk-rust = { path = "../../../../../lib/cdk-rust" } -serde_json = "1.0" -ndarray = "0.15" -rand = "0.8" -reqwest = "0.11" -safetensors = "0.4" -safetensors-ndarray = { path = "../../../../../lib/safetensors-ndarray" } -tokenizers = { version = "0.15", default-features = false, features = [ "onig", "http" ] } -tokio.workspace = true -workspace-hack = { version = "0.1", path = "../../../../../etc/workspace-hack" } diff --git a/services/ai/examples/rust/nlp/examples/chatbot.rs b/services/ai/examples/rust/nlp/examples/chatbot.rs deleted file mode 100644 index ecf333b77..000000000 --- a/services/ai/examples/rust/nlp/examples/chatbot.rs +++ /dev/null @@ -1,173 +0,0 @@ -// You need to store the model on the node. -// Export the model along with config files using the optimum cli. -// See https://huggingface.co/docs/transformers/en/serialization#exporting-a--transformers-model-to-onnx-with-cli. -// You can find more info about the model here https://huggingface.co/microsoft/DialoGPT-medium. -// `tokenizer.json` should be included when you export the model. -use std::io; -use std::io::Write; -use std::net::SocketAddr; - -use cdk_rust::schema::ResponseFrame; -use cdk_rust::transport::tcp::TcpTransport; -use cdk_rust::Builder; -use ndarray::{s, Array1, Axis}; -use safetensors::{Dtype, SafeTensors}; -use safetensors_ndarray::collection::Collection; -use tokenizers::Tokenizer; - -const EOS: usize = 50256; - -#[tokio::main] -async fn main() { - let mut args = std::env::args(); - args.next(); - - let Some(tokenizer_path) = args.next() else { - println!("missing tokenizer path argument"); - std::process::exit(1); - }; - - let tokenizer = Tokenizer::from_file(tokenizer_path).unwrap(); - - let target: SocketAddr = "127.0.0.1:4221".parse().unwrap(); - let transport = TcpTransport::new(target); - let connector = Builder::primary([0u8; 32], 2) - .transport(transport) - .build() - .unwrap(); - let (mut sender, mut receiver) = connector.connect().await.unwrap().split(); - - // Start the session. - let start_session = serde_json::to_string(&serde_json::json!( { - "model": "387cbc21bd420764043db21330ccfbaaceafa9aa6c858a0cc16d8fc611c0dbb8".to_string(), - "origin": "blake3", - "device": "cpu", - "content_format": "bin", - "model_io_encoding": "safetensors" - })) - .unwrap(); - sender - .send(start_session.into_bytes().into()) - .await - .unwrap(); - - let mut stdout = io::stdout(); - - loop { - print!("User > "); - stdout.flush().unwrap(); - - let mut input = String::new(); - io::stdin() - .read_line(&mut input) - .expect("Failed to read line"); - input = input.trim().to_string(); - - if input == "q" || input == "quit" { - break; - } - - let mut conversation = String::new(); - // This token is also the BOS. - conversation.push_str("<|endoftext|>"); - conversation.push_str(&input); - conversation.push_str("<|endoftext|>"); - - print!("Bot > "); - stdout.flush().unwrap(); - - 'inner: loop { - // Create encoding from current conversation. - let encoding = tokenizer.encode(conversation.as_str(), true).unwrap(); - - let mut safetensors = Collection::new(); - - // Gather attention mask. - let attention_mask = encoding - .get_attention_mask() - .iter() - .copied() - .map(|mask| mask as i64) - .collect::>(); - let attention_mask = Array1::from(attention_mask).insert_axis(Axis(0)); - safetensors.insert_array_i64("attention_mask".to_string(), attention_mask.into_dyn()); - - // Gather position ids. - let position_ids = encoding - .get_word_ids() - .iter() - .copied() - .map(|id| id.unwrap() as i64) - .collect::>(); - let position_ids = Array1::from(position_ids).insert_axis(Axis(0)); - safetensors.insert_array_i64("position_ids".to_string(), position_ids.into_dyn()); - - // Gather input ids. - let tokens = encoding - .get_ids() - .iter() - .map(|i| *i as i64) - .collect::>(); - let tokens = Array1::from_iter(tokens.iter().cloned()); - let input_ids = tokens.view().insert_axis(Axis(0)).to_owned(); - safetensors.insert_array_i64("input_ids".to_string(), input_ids.into_dyn()); - - // Send service a request. - let serialized_input = safetensors.serialize(&None).unwrap(); - sender.send(serialized_input.into()).await.unwrap(); - - // Read response frame. - let resp = receiver.recv().await.unwrap().unwrap(); - - // Derive output array from response data. - let output = match resp { - ResponseFrame::ServicePayload { bytes } => { - let outputs = SafeTensors::deserialize(bytes.as_ref()).unwrap(); - let view = outputs.tensor("logits").unwrap(); - assert_eq!(view.dtype(), Dtype::F32); - safetensors_ndarray::utils::deserialize_f32(view.shape(), view.data()).unwrap() - }, - ResponseFrame::Termination { reason } => { - panic!("service terminated the connection: {reason:?}") - }, - _ => panic!("expected a service payload frame"), - }; - - // Convert to ndarray::Array. - let generated_tokens = output.view(); - - // Sort logits. - let probabilities = &mut generated_tokens - .slice(s![0, -1, ..]) - .insert_axis(Axis(0)) - .to_owned() - .iter() - .cloned() - .enumerate() - .collect::>(); - probabilities - .sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Less)); - - // Greedy search. We could implement beam search. - let token = probabilities[0].0; - - // The bot is done talking. - if token == EOS { - break 'inner; - } - - // Decode token. - let token_str = tokenizer.decode(&[token as _], true).unwrap(); - - // Add to history. - conversation.push_str(&token_str); - - // Print next token from bot. - print!("{}", token_str); - stdout.flush().unwrap(); - } - println!(); - } - - println!(); -}