Official commit

LooseLab · Jul 12, 2022 · fce8cfd · fce8cfd
commit fce8cfd
Show file tree

Hide file tree

Showing 102 changed files with 10,165 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,9 @@
+/target
+.vscode/settings.json
+*Cargo.lock
+*.idea*
+/test_basecall
+*.fast5
+basecall_test.py
+*LEFT*
+*.pyc
diff --git a/Cargo.toml b/Cargo.toml
@@ -0,0 +1,36 @@
+[package]
+name = "icarust"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+tonic = "0.6"
+prost = "0.9"
+prost-types = "0.9"
+futures = {version = "0.3", default-features = false, features = ["alloc"]}
+futures-util = "0.3"
+tokio = { version = "1.0", features = ["rt-multi-thread", "macros", "sync", "time"] }
+tokio-stream = "0.1"
+toml = "0.5"
+async-stream = "0.2"
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
+rand = "0.8"
+rand_distr = "0.4"
+fnv = "1.0"
+log = "0.4.0"
+pretty_env_logger = "0.4.0"
+uuid = { version = "0.8", features = ["v4"]}
+ndarray-npy = { version = "0.8.1", default-features = false }
+ndarray = "0.15.0"
+memmap2 = "0.5.3"
+rm-frust5-api = "0.0.3"
+chrono = "0.4"
+byteorder = "1"
+clap = { version = "3.2.5", features = ["derive"] }
+
+
+[build-dependencies]
+tonic-build = "0.6"
diff --git a/Icarust b/Icarust
diff --git a/LICENSE.md b/LICENSE.md
diff --git a/Profile_tomls/config.toml b/Profile_tomls/config.toml
@@ -0,0 +1,19 @@
+output_path = "/tmp/"
+
+[parameters]
+sample_name = "test"
+experiment_name = "test_2_bacteria"
+flowcell_name = "FAQ1234"
+experiment_duration_set = 4800
+device_id = "X6"
+position = "FenceSitting"
+
+[[sample]]
+input_genome = "NC_002516.2.squiggle.npy"
+mean_read_length = 20000
+weight = 1
+
+[[sample]]
+input_genome = "NC_003997.3.squiggle.npy"
+mean_read_length = 15000
+weight = 2
diff --git a/Profile_tomls/monkeypox.toml b/Profile_tomls/monkeypox.toml
@@ -0,0 +1,15 @@
+output_path = "/tmp/"
+
+[parameters]
+sample_name = "test"
+experiment_name = "test_2_bacteria"
+flowcell_name = "FAQ1234"
+experiment_duration_set = 4800
+device_id = "Bantersaurus"
+position = "FenceSitter"
+
+[[sample]]
+input_genome = "squiggle_arrs/monkeypox/"
+mean_read_length = 20000
+weights_file = "squiggle_arrs/monkeypox/distributions.json"
+amplicon = true
diff --git a/Profile_tomls/monkeypox_barcoded.toml b/Profile_tomls/monkeypox_barcoded.toml
@@ -0,0 +1,20 @@
+output_path = "/tmp/"
+random_seed = 10
+
+[parameters]
+sample_name = "test"
+experiment_name = "test_mpv"
+flowcell_name = "FAQ1234"
+experiment_duration_set = 4800
+device_id = "Bantersaurus"
+position = "FenceSitter"
+
+[[sample]]
+name = "MonkeyPox"
+weight = 1
+input_genome = "squiggle_arrs/monkeypox_barcoded/"
+mean_read_length = 20000
+amplicon = true
+barcodes = ["Barcode01", "Barcode02", "Barcode03"]
+barcode_weights = [1, 2, 3]
+uneven = true
diff --git a/README.md b/README.md
@@ -0,0 +1,150 @@
+# Icarust
+Rust based Minknow simulator
+---
+🦀🚀
+![Lament of Icarust](img/Draper_Herbert_James_Mourning_for_Icarus.jpg "The lament of Icarus")
+Figure 1 - Accurate depiction of a man learning Rust ☠️
+### `Warning`
+Icarust is a work in progress - as such some small bugs are to be expected.
+
+## Quick start
+
+Icarust requires Rust > 1.56. Instructions for installing Rust can be found [here.](https://www.rust-lang.org/tools/install)
+
+To run Icarust with the preset config and squiggle -
+
+```zsh
+git clone https://github.com/Adoni5/Icarust
+cd Icarust
+cargo run --release -- --Profile_tomls/config.toml -v
+```
+
+## Changing Configured settings
+<details open>
+<summary></summary>
+To configure an Icarust simulation a config [TOML](https://toml.io/en/) file is passed to the initialise command. TOML files are minimal and easy to read markup files. Each line is either a key-value pair or a 'table' 
+heading.
+
+The config file is split into three sections: global settings, [Parameters](#parameters) and [Sample](#sample). An example file can be found [here.](examples/example_config.toml)
+### Global fields
+Global fields are configuration variables that apply throughout the codebase.
+
+![Global Config Section](img/global_Section_toml.png "Global Config section example.")
+|          Key |       Type      | Required | Description |
+|:-------------|:---------------:|:-----------:|:--------:|
+| output_path | string | True | The path to a directory that the resulting FAST5 and readfish unblocked_read_ids.txt file will be written to. | 
+| global_mean_read_length | int | False | If set, any samples that do not have their own read length field will use this value| 
+| random_seed | int  | False | The seed to use in any Random Number generation. If set, this makes experiments repeatable as long as the value is retained. | 
+
+### Parameters
+The parameters are applied to the "sequencer". They are used to set up the GRPC server so that clients can connect to it. They are also written out in the FAST5 files.
+
+![Parameters Config Section](img/parameters_section_toml.png "Parameters Config section example.")
+|          Key |       Type      | Required | Description |
+|:-------------|:---------------:|:-----------:|:--------:|
+| sample_name | string | True | The sample name for the simulation | 
+| experiment_name | string | True | The experiment name for the simulation| 
+| flowcell_name | string  | True | The flowcell name for the simulation | 
+| experiment_duration | int  | False | The experiment duration in minutes **CURRENTLY UNUSED** | 
+| device_id | string  | True | The device ID - can be anything. | 
+| position | string  | True | Position name. This has to match what readfish is looking for. |
+
+### Sample
+The sample configures what squiggle will be served. This is provided as an array of tables - i.e. it is possible to specify more than one sample field. An array of tables is specified by enclosing the section title in [[]].
+
+![Sample Config Section](img/sample_section_toml.png "Sample Config section example.")
+|          Key |       Type      | Required | Description |
+|:-------------|:---------------:|:-----------:|:--------:|
+| name | string | True | The sample name. | 
+| input_genome | string | True | Path to **either** the squiggle array or a directory of squiggle arrays. If a directory, all squiggle files will be considered as possible sources of reads for this sample.| 
+| mean_read_length | float  | False | The mean read length for the distribution of this sample. | 
+| weight | int  | True | The relative weight of this sample against any other sample. | 
+| weights_files | array[string]  | False | An array of paths to [distribution.json](#distributions) files, if you wish to specify relative likelihood of drawing a read from a given squiggle file. | 
+| amplicon | bool | False | Is the sample from a PCR amplicon based run. Means that read squiggle is always the complete length of a squiggle file. |
+| barcodes | array[string] | False | Array of Barcode names. Multiple Barcodes can be provided for one sample |
+| barcode_weights | array[int] | False | The relative distribution of barcodes. If not provided, any barcodes will be assigned a random likelihood. If provided, it must be the same length as the barcodes array.|
+| uneven | bool | False | Uneven likelihood of choosing a squiggle array. **currently unused**|
+</details>
+
+## Generating squiggle to serve
+<details open>
+<summary></summary>
+In the python directory a script called make_squiggle.py exists. I recommend [conda](https://conda.io/projects/conda/en/latest/user-guide/install/linux.html) in order to create the python environment to use this script. 
+
+`NB` - A python package we _currently_ use is scrappie - which depends on a few C libraries. The names of these for debian systems are listed below. 
+
+
+    libcunit1
+    libcunit1-dev
+    libhdf5
+    libhdf5-dev
+    libopenblas-base
+    libopenblas-dev
+
+These can be installed with `apt-get install`.
+
+`sudo apt-get install libcunit1 libcunit1-dev libhdf5 libhdf5-dev libopenblas-base libopenblas-dev`
+
+Now that you have all the packages required, in the python directory -
+
+
+```zsh
+cd python
+conda env create -f icarust.yaml
+```
+
+To then generate signal to be served, use the provided script, giving any reference files you wish to use as arguments, space separated. An example -
+
+```zsh
+python make_squiggle.py reference_1.fa reference_2.fa --out_dir /path/to/desired/output/squiggle
+```
+
+### Splitting the reference into multiple squiggle arrays with a bed file
+It is possible to split a reference into multiple squiggle arrays - i.e to simulate a PCR run by providing a bed file. This is only possible using one reference at a time currently.
+```zsh
+python make_squiggle.py reference_1.fa --bed_file /path/to/regions.bed --out_dir /path/to/desired/output/squiggle
+```
+
+### Distributions
+
+### `Warning` -> If a distributions.json file already exists, this will append to it.
+
+.npy files containing r9.4.1 sequence should now be present in the base directory. These files will have the name of the contig they contain sequence for.
+
+</details>
+
+# Ideology
+<details open>
+<summary>Expand</summary>
+
+![Icarust Ideology](img/Updated_Icarust_flowchart_backed.excalidraw.png "Basic flowchart of icarust architecture")
+The image above shows the structure of Icarust. The asynchronous main thread is a tokio runtime that handles GRPC requests from readfish. The core Rust package that handles this is called Tonic. When Icarust is started, the threads populate a shared Vec (think list in Python or array in JavaScript) with one ReadInfo per channel. Any actions received are sent to a separate thread to be processed, with the correct channel for the action marked as per the action type received. Finished reads are sent to a thread to be written out.
+
+### Parsing the config
+Upon initialisation Icarust uses toml-rs to deserialise the config toml into Rust structs. These are then passed through to the data servicer, to inform the threads there where to find squiggle, of any barcodes and ratios of barcodes.
+
+### Readfish connecting
+There are two servers, a manager and a position server. Readfish first queries the manager server to get the name and port of the position, then it creates a bi-directional streaming RPC request to the position port, sending actions to perform on reads and receiving read chunks as they become available.
+
+### Data generation
+When Icarust is started, three threads are created, aptly named the Data generation thread, the Data write out thread and the Process actions thread. These serve as stand in for the actual sequencer and MinKNOW. The data generation thread is a loop with a 400ms pause.
+
+Every 400ms it unlocks a shared Vec (if you come from a Python background, think of a list that can only contain one type of element). This Vec has one element for each channel. The element is a struct, which contains information about the read that the "channel" is currently "sequencing". If the read has been marked by the Process actions thread as unblocked, or if the read has been in the channel longer than the period of time required to sequence a real read, the thread will randomly select a read length from a gamma distribution, a contig to pull from using a weighted choice based on contig length, and a random start point. It then reads the signal from a memory map to the signal .npy files, and stores the signal in the struct.
+
+Barcode squiggle can be appended to the randomly selected read by specifying desired barcodes in the config TOML. The chance of choosing a barcode within a sample is also specified in the Config TOML.
+
+This Vec is shared between the Tonic end point and the Data generation thread using an ARC (atomic reference counter) and a mutex for mutual exclusion. This allows either thread to get a lock on the Vec whilst it is being read and modified.
+
+### Serving reads
+When a GetLiveReadsRequest GRPC request comes in, any actions specified in that request are sent to the process actions thread.
+If this is the first request, a new asynchronous thread is created, which runs in perpetuity. The thread gets a lock on the channels Vec. It loops through each ReadInfo and checks if the channel is marked as Stop receiving or was unblocked. If not, the amount of squiggle is worked out based on how much time in milliseconds has passed since that read was last served. If there is enough, a new HashMap (Python Dictionary, Javascript Map/Object) is created and the information and squiggle to return is added to this. Once every channel is checked, if there is data to serve, the HashMap is passed via a channel back to the main GRPC server runtime, where it is split up into 24 read chunks. These are then sent via the bi-directional stream back to the client (Presumably readfish).
+
+### Processing actions.
+The process actions thread loops infinitely, iterating a receiver, which has any received actions sent to it. If actions are found, the thread unlocks the shared ReadInfo Vec, and marks the channel that corresponds to the action according to the action type.
+
+### Writing out data.
+The data writeout thread is sent any finished reads (reads that were unblocked or have completed sequencing naturally) via the data generation thread, using message passing with channels. This thread iterates the receiver of each channel in a loop, and once 4000 reads have been accrued these are written into a fast5 file, using the VBZ compression plugin provided by ONT. The fields in the Fast5 file are populated using a mixture of the provided config field values and hardcoded values in the code base.
+
+</details>
+
+# Happy Simulating!
diff --git a/build.rs b/build.rs
@@ -0,0 +1,30 @@
+#![deny(missing_docs)]
+#![deny(missing_doc_code_examples)]
+//!
+//! Build script: runs `tonic_build` over the `.proto` files under `proto/minknow_api/`.
+fn main() -> Result<(), Box<dyn std::error::Error>> { // Build-script entry point; a failure from `compile` is propagated to the caller via `?`.
+    tonic_build::configure()
+        .build_client(false) // Turn off generation of client code.
+        .protoc_arg("--experimental_allow_proto3_optional") // Extra argument forwarded to protoc (name suggests proto3 `optional` support; confirm against protoc docs).
+        .compile(
+            &[ // First argument: the list of .proto files to compile.
+                "proto/minknow_api/minion_device.proto",
+                "proto/minknow_api/data.proto",
+                "proto/minknow_api/protocol.proto",
+                "proto/minknow_api/statistics.proto",
+                "proto/minknow_api/acquisition.proto",
+                "proto/minknow_api/manager.proto",
+                "proto/minknow_api/protocol_settings.proto",
+                "proto/minknow_api/basecaller.proto",
+                "proto/minknow_api/analysis_configuration.proto",
+                "proto/minknow_api/promethion_device.proto",
+                "proto/minknow_api/instance.proto",
+                "proto/minknow_api/log.proto",
+                "proto/minknow_api/keystore.proto",
+                "proto/minknow_api/rpc_options.proto",
+                "proto/minknow_api/device.proto",
+            ],
+            &["proto/"], // Second argument: include directory used when resolving the files above.
+        )?;
+    Ok(()) // Reached only when compilation succeeded.
+}
diff --git a/examples/example_config.toml b/examples/example_config.toml
@@ -0,0 +1,33 @@
+output_path = "/tmp"
+global_mean_read_length = 20000 #optional
+random_seed = 10
+
+[parameters]
+sample_name = "my_sample_name"
+experiment_name = "my_experiment_name"
+flowcell_name = "my_flowcell_name"
+experiment_duration = 4800
+device_id = "my_device_id"
+position = "my_position"
+
+[[sample]]
+name = "Sample1"
+input_genome = "/path/to/genome/squiggle.npy"
+mean_read_length = 40000.0
+weight = 1
+weights_files = ["/path/to/distibution_file_1.json", "/path/to/distirbution_file_2.json"] #optional
+amplicon = false # Not a PCR based run # optional
+barcodes = ["Barcode01", "Barcode02"] # optional
+barcode_weights = [1,2] # Optional
+uneven = false # Optional
+
+[[sample]]
+name = "Sample2"
+input_genome = "/path/to/genome/squiggle.npy"
+mean_read_length = 40000.0
+weight = 1
+weights_files = ["/path/to/distibution_file_1.json", "/path/to/distirbution_file_2.json"] #optional
+amplicon = false # Not a PCR based run # optional
+barcodes = ["Barcode03", "Barcode04"] # optional
+barcode_weights = [1,2] # Optional
+uneven = false # Optional
diff --git a/img/Draper_Herbert_James_Mourning_for_Icarus.jpg b/img/Draper_Herbert_James_Mourning_for_Icarus.jpg
diff --git a/img/Updated_Icarust_flowchart.excalidraw.png b/img/Updated_Icarust_flowchart.excalidraw.png
diff --git a/img/Updated_Icarust_flowchart_backed.excalidraw.png b/img/Updated_Icarust_flowchart_backed.excalidraw.png
diff --git a/img/global_Section_toml.png b/img/global_Section_toml.png
diff --git a/img/parameters_section_toml.png b/img/parameters_section_toml.png
diff --git a/img/sample_section_toml.png b/img/sample_section_toml.png