diff --git a/Cargo.lock b/Cargo.lock index c62803a..658d895 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10,9 +10,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.8.8" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42cd52102d3df161c77a887b608d7a4897d7cc112886a9537b738a887a03aaff" +checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f" dependencies = [ "cfg-if", "getrandom", @@ -32,9 +32,9 @@ dependencies = [ [[package]] name = "anstream" -version = "0.6.11" +version = "0.6.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e2e1ebcb11de5c03c67de28a7df593d32191b44939c482e97702baaaa6ab6a5" +checksum = "96b09b5178381e0874812a9b157f7fe84982617e48f71f4e3235482775e5b540" dependencies = [ "anstyle", "anstyle-parse", @@ -80,9 +80,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.79" +version = "1.0.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca" +checksum = "5ad32ce52e4161730f7098c077cd2ed6229b5804ccf99e5366be1ab72a98b4e1" [[package]] name = "array-init-cursor" @@ -90,6 +90,12 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf7d0a018de4f6aa429b9d33d69edf69072b1c5b1cb8d3e4a5f7ef898fc3eb76" +[[package]] +name = "arrayvec" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" + [[package]] name = "arrow-format" version = "0.8.1" @@ -146,7 +152,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] @@ -157,7 +163,7 @@ checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] @@ -220,9 +226,9 @@ checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" [[package]] name = "bstr" -version = "1.9.0" +version = "1.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc" +checksum = "05efc5cfd9110c8416e471df0e96702d58690178e206e61b7173706673c93706" dependencies = [ "memchr", "regex-automata 0.4.5", @@ -231,9 +237,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.15.0" +version = "3.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d32a994c2b3ca201d9b263612a374263f05e7adde37c4707f693dcd375076d1f" +checksum = "8ea184aa71bb362a1157c896979544cc23974e08fd265f29ea96b59f0b4a555b" [[package]] name = "bytecount" @@ -258,7 +264,7 @@ checksum = "965ab7eb5f8f97d2a083c799f3a1b994fc397b2fe2da5d1da1626ce15a39f2b1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] @@ -275,11 +281,10 @@ checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" [[package]] name = "cc" -version = "1.0.83" +version = "1.0.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +checksum = "02f341c093d19155a6e41631ce5971aac4e9a868262212153124c15fa22d1cdc" dependencies = [ - "jobserver", "libc", ] @@ -330,7 +335,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] @@ -420,9 +425,9 @@ checksum = "fb4b4df9df7eac447b9d7c6d62011245385d2408b31e26ec5f1ad5bc96905312" [[package]] name = "dyn-clone" -version = "1.0.16" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "545b22097d44f8a9581187cdf93de7a71e4722bf51200cfaba810865b49a495d" +checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125" [[package]] name = "either" @@ -536,7 +541,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] @@ -619,15 +624,6 @@ version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" -[[package]] -name = "jobserver" -version = "0.1.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab46a6e9526ddef3ae7f787c06f0f2600639ba80ea3eade3d8e670a2230f51d6" -dependencies = [ - "libc", -] - [[package]] name = "js-sys" version = "0.3.68" @@ -721,10 +717,11 @@ checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] name = "libradicl" -version = "0.7.0" -source = "git+https://github.com/COMBINE-lab/libradicl?branch=develop#ac09244484d9a079417185c31537b994f4ac8cac" +version = "0.8.0" +source = "git+https://github.com/COMBINE-lab/libradicl?branch=develop#aa850182902460bfe2d08f398cbfb818d4fa43a6" dependencies = [ "ahash", + "anyhow", "bio-types", "dashmap", "noodles-bam", @@ -890,6 +887,16 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-format" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3" +dependencies = [ + "arrayvec", + "itoa", +] + [[package]] name = "num-integer" version = "0.1.46" @@ -1014,7 +1021,7 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "piscem-infer" -version = "0.5.2" +version = "0.6.0" dependencies = [ "ahash", "anyhow", @@ -1025,6 +1032,7 @@ dependencies = [ "clap", "distrs", "libradicl", + "num-format", "path-tools", "rand", "rand_distr", @@ -1245,9 +1253,9 @@ checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" [[package]] name = "ryu" -version = "1.0.16" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" +checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" [[package]] name = "scopeguard" @@ -1263,9 +1271,9 @@ checksum = "6ab8598aa408498679922eff7fa985c25d58a90771bd6be794434c5277eab1a6" [[package]] name = "semver" -version = "1.0.21" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b97ed7a9823b74f99c7742f5336af7be5ecd3eeafcb1507d1fa93347b1d589b0" +checksum = "92d43fe69e652f3df9bdc2b85b2854a0825b86e4fb76bc44d945137d053639ca" [[package]] name = "seq-macro" @@ -1275,29 +1283,29 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.196" +version = "1.0.197" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "870026e60fa08c69f064aa766c10f10b1d62db9ccd4d0abb206472bee0ce3b32" +checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.196" +version = "1.0.197" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33c85360c95e7d137454dc81d9a4ed2b8efd8fbe19cee57357b32b9771fccb67" +checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] name = "serde_json" -version = "1.0.113" +version = "1.0.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69801b70b1c3dac963ecb03a364ba0ceda9cf60c71cfe475e99864759c8b8a79" +checksum = "c5f09b1bd632ef549eaa9f60a1f8de742bdbc698e6cee2095fc84dde5f549ae0" dependencies = [ "itoa", "ryu", @@ -1377,7 +1385,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] @@ -1393,9 +1401,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.49" +version = "2.0.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "915aea9e586f80826ee59f8453c1101f9d1c4b3964cd2460185ee8e299ada496" +checksum = "6ab617d94515e94ae53b8406c628598680aa0c9587474ecbe58188f7b345d66c" dependencies = [ "proc-macro2", "quote", @@ -1453,14 +1461,14 @@ checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" dependencies = [ "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] name = "thread_local" -version = "1.1.7" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152" +checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c" dependencies = [ "cfg-if", "once_cell", @@ -1485,7 +1493,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] @@ -1584,7 +1592,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", "wasm-bindgen-shared", ] @@ -1606,7 +1614,7 @@ checksum = "642f325be6301eb8107a83d12a8ac6c1e1c54345a7ef1a9261962dfefda09e66" dependencies = [ "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -1654,7 +1662,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.0", + "windows-targets 0.52.3", ] [[package]] @@ -1674,17 +1682,17 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.52.0" +version = "0.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +checksum = "d380ba1dc7187569a8a9e91ed34b8ccfc33123bbacb8c0aed2d1ad7f3ef2dc5f" dependencies = [ - "windows_aarch64_gnullvm 0.52.0", - "windows_aarch64_msvc 0.52.0", - "windows_i686_gnu 0.52.0", - "windows_i686_msvc 0.52.0", - "windows_x86_64_gnu 0.52.0", - "windows_x86_64_gnullvm 0.52.0", - "windows_x86_64_msvc 0.52.0", + "windows_aarch64_gnullvm 0.52.3", + "windows_aarch64_msvc 0.52.3", + "windows_i686_gnu 0.52.3", + "windows_i686_msvc 0.52.3", + "windows_x86_64_gnu 0.52.3", + "windows_x86_64_gnullvm 0.52.3", + "windows_x86_64_msvc 0.52.3", ] [[package]] @@ -1695,9 +1703,9 @@ checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.0" +version = "0.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" +checksum = "68e5dcfb9413f53afd9c8f86e56a7b4d86d9a2fa26090ea2dc9e40fba56c6ec6" [[package]] name = "windows_aarch64_msvc" @@ -1707,9 +1715,9 @@ checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" -version = "0.52.0" +version = "0.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" +checksum = "8dab469ebbc45798319e69eebf92308e541ce46760b49b18c6b3fe5e8965b30f" [[package]] name = "windows_i686_gnu" @@ -1719,9 +1727,9 @@ checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" -version = "0.52.0" +version = "0.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" +checksum = "2a4e9b6a7cac734a8b4138a4e1044eac3404d8326b6c0f939276560687a033fb" [[package]] name = "windows_i686_msvc" @@ -1731,9 +1739,9 @@ checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" -version = "0.52.0" +version = "0.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" +checksum = "28b0ec9c422ca95ff34a78755cfa6ad4a51371da2a5ace67500cf7ca5f232c58" [[package]] name = "windows_x86_64_gnu" @@ -1743,9 +1751,9 @@ checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" -version = "0.52.0" +version = "0.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" +checksum = "704131571ba93e89d7cd43482277d6632589b18ecf4468f591fbae0a8b101614" [[package]] name = "windows_x86_64_gnullvm" @@ -1755,9 +1763,9 @@ checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.0" +version = "0.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" +checksum = "42079295511643151e98d61c38c0acc444e52dd42ab456f7ccfd5152e8ecf21c" [[package]] name = "windows_x86_64_msvc" @@ -1767,9 +1775,9 @@ checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" -version = "0.52.0" +version = "0.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" +checksum = "0770833d60a970638e989b3fa9fd2bb1aaadcf88963d1659fd7d9990196ed2d6" [[package]] name = "zerocopy" @@ -1788,7 +1796,7 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 3b72379..e4394a4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "piscem-infer" -version = "0.5.2" +version = "0.6.0" edition = "2021" readme = "README.md" license-file = "LICENSE" @@ -21,12 +21,12 @@ include = [ # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -libradicl = { git = "https://github.com/COMBINE-lab/libradicl", branch = "develop", version = "0.7.0" } -ahash = "0.8.8" +libradicl = { git = "https://github.com/COMBINE-lab/libradicl", branch = "develop", version = "0.8.0" } +ahash = "0.8.9" distrs = "0.2.1" -anyhow = "1.0.79" +anyhow = "1.0.80" bincode = "1.3.3" -bstr = "1.9.0" +bstr = "1.9.1" clap = { version = "4.5.1", features = ["derive", "wrap_help", "cargo", "help", "usage", "error-context"] } tracing = "0.1.40" tracing-subscriber = { version = "0.3.18", default-features = true, features = ["env-filter"] } @@ -34,14 +34,15 @@ tabled = "0.15.0" rayon = "1.8.1" rand_distr = "0.4.3" rand = "0.8.5" -serde_json = "1.0.113" -serde_derive = "1.0.196" -serde = { version = "1.0.196", features = ["serde_derive", "derive"] } +serde_json = "1.0.114" +serde_derive = "1.0.197" +serde = { version = "1.0.197", features = ["serde_derive", "derive"] } arrow2 = { version = "0.18.0", features = ["io_parquet", "io_parquet_gzip", "io_parquet_zstd", "io_parquet_snappy"] } scroll = "0.12.0" snap = "1.1.1" path-tools = "0.1.0" atomic_float = "0.1.0" +num-format = "0.4.4" [[bin]] name = "piscem-infer" diff --git a/src/main.rs b/src/main.rs index 2f77789..969972e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,20 +7,23 @@ use arrow2::{ use clap::Args; use clap::{Parser, Subcommand}; -use libradicl::exit_codes; -use libradicl::rad_types; +use libradicl::rad_types::{self}; +use libradicl::{ + chunk, + header::RadPrelude, + record::{PiscemBulkReadRecord, PiscemBulkRecordContext}, +}; +use num_format::{Locale, ToFormattedString}; use path_tools::WithAdditionalExtension; -use scroll::Pread; use serde::Serialize; use serde_json::{json, Value}; use std::fs::{create_dir_all, File}; use std::io::{BufReader, Read}; use std::path::{Path, PathBuf}; use tabled::{settings::Style, Table, Tabled}; -use tracing::{error, info, warn, Level}; +use tracing::{info, warn, Level}; mod utils; -use utils::custom_rad_utils::MetaChunk; use utils::em::{adjust_ref_lengths, conditional_means, em, em_par, EMInfo, EqLabel, EqMap}; use crate::utils::em::conditional_means_from_params; @@ -28,7 +31,7 @@ use crate::utils::em::do_bootstrap; use crate::utils::em::OrientationProperty; use crate::utils::em::PackedEqMap; use crate::utils::io; -use utils::map_record_types::{LibraryType, MappingType}; +use utils::map_record_types::LibraryType; #[derive(Serialize)] struct MappedFragStats { @@ -91,7 +94,7 @@ fn build_ori_table(mapped_ori_count_global: &[u32]) -> Vec { fn process( br: &mut BufReader, nrec: usize, - frag_map_t: &rad_types::RadIntId, + record_context: &PiscemBulkRecordContext, lib_type: LibraryType, mapped_stats: &mut MappedFragStats, ) -> (EqMap, Vec) { @@ -110,9 +113,9 @@ fn process( let mut dir_ints = vec![]; //let mut dir_vec = vec![0u32, 64]; for _ in 0..nrec { - let c = MetaChunk::from_bytes(br, frag_map_t); + let c = chunk::Chunk::::from_bytes(br, record_context); for mappings in &c.reads { - let ft = MappingType::from_u8(mappings.frag_type); + let ft = rad_types::MappingType::from_u8(mappings.frag_type); let nm = mappings.positions.len(); mapped_stats.tot_mappings += nm; @@ -185,34 +188,6 @@ fn process( (eqmap, frag_lengths) } -/// Read the lengths of the reference sequences from the RAD file header. -fn read_ref_lengths(reader: &mut T) -> anyhow::Result> { - let mut rbuf = [0u8; 8]; - - // read type of length - reader.read_exact(&mut rbuf[0..1])?; - let lt = rbuf.pread::(0).unwrap() as u64; - assert_eq!(lt, 3); - - // read length of the array - reader.read_exact(&mut rbuf[0..4])?; - let len = rbuf.pread::(0).unwrap() as usize; - - // read type of entry - reader.read_exact(&mut rbuf[0..1])?; - let et = rbuf.pread::(0).unwrap() as u64; - assert_eq!(et, 3); - - let mut vec = Vec::::with_capacity(len); - for _ in 0..len { - reader.read_exact(&mut rbuf[0..4])?; - let v = rbuf.pread::(0).unwrap(); - vec.push(v); - } - - Ok(vec) -} - #[derive(Args, Serialize, Clone, Debug)] pub struct QuantOpts { /// input stem (i.e. without the .rad suffix) @@ -311,10 +286,10 @@ fn main() -> anyhow::Result<()> { input_map_info.set_extension("map_info.json"); if !input_map_info.exists() { ref_sig_json = None; - warn!(concat!("Expected the mapping info file {:?} to exist, but it doesn't. ", - "This is bad, and means that reference provenance signatures cannot be ", - "propagated to the output of piscem-infer. It is strongly recommended ", - "that you investigate why this file does not exist at the expected location."), + warn!("Expected the mapping info file {:?} to exist, but it doesn't. \ + This is bad, and means that reference provenance signatures cannot be \ + propagated to the output of piscem-infer. It is strongly recommended \ + that you investigate why this file does not exist at the expected location.", input_map_info); } else { let map_info_str = std::fs::read_to_string(&input_map_info) @@ -339,18 +314,18 @@ fn main() -> anyhow::Result<()> { let mut fl_mean = 0_f64; let mut fl_sd = 0_f64; - let hdr = rad_types::RadHeader::from_bytes(&mut br); + // read the header and tag sections from the rad file + let prelude = RadPrelude::from_bytes(&mut br)?; + info!("read header!"); - if hdr.is_paired > 0_u8 { + if prelude.hdr.is_paired > 0_u8 { info!("fragments paired in sequencing"); paired_end = true; if let (Some(flm), Some(flsd)) = (fld_mean, fld_sd) { warn!( - concat!( - "provided fragment length distribution mean and sd ({}, {}), but ", - "the RAD file contains paired-end fragments, so these will be ignored ", - "and the fragment length distribution will be estimated." - ), + "provided fragment length distribution mean and sd ({}, {}), but \ + the RAD file contains paired-end fragments, so these will be ignored \" + and the fragment length distribution will be estimated.", flm, flsd ); } @@ -362,65 +337,64 @@ fn main() -> anyhow::Result<()> { fl_sd = flsd; } else { bail!( - concat!( - "The input RAD file {} was for unpaired reads, so ", - "a fragment length distribution mean and standard deviation ", - "must be provided." - ), + "The input RAD file {} was for unpaired reads, so \ + a fragment length distribution mean and standard deviation \ + must be provided.", &input_rad.display() ); } } // file-level - let fl_tags = rad_types::TagSection::from_bytes(&mut br); - info!("read {:?} file-level tags", fl_tags.tags.len()); + info!("read {:?} file-level tags", prelude.file_tags.tags.len()); + // parse actual tags + for ft in &prelude.file_tags.tags { + info!("\tfile-level tag {}", ft.name); + } // read-level - let rl_tags = rad_types::TagSection::from_bytes(&mut br); - info!("read {:?} read-level tags", rl_tags.tags.len()); + info!("read {:?} read-level tags", prelude.read_tags.tags.len()); + // required read-level tag const FRAG_TYPE_NAME: &str = "frag_map_type"; - let mut ftt: Option = None; + let mut had_frag_map_type = false; // parse actual tags - for rt in &rl_tags.tags { - if rad_types::decode_int_type_tag(rt.typeid).is_none() { - error!("currently only RAD types 1--4 are supported for 'b' and 'u' tags."); - std::process::exit(exit_codes::EXIT_UNSUPPORTED_TAG_TYPE); - } + for rt in &prelude.read_tags.tags { info!("\tread-level tag {}", rt.name); if rt.name == FRAG_TYPE_NAME { - ftt = Some(rt.typeid); + had_frag_map_type = true; } } + if !had_frag_map_type { + bail!("read-level tag description missing required tag \"{FRAG_TYPE_NAME}\"; can't proceed."); + } // alignment-level - let al_tags = rad_types::TagSection::from_bytes(&mut br); - info!("read {:?} alignemnt-level tags", al_tags.tags.len()); + info!( + "read {:?} alignemnt-level tags", + prelude.aln_tags.tags.len() + ); + // required alignment level tags const REF_ORI_NAME: &str = "compressed_ori_ref"; const POS_NAME: &str = "pos"; const FRAGLEN_NAME: &str = "frag_len"; - let mut ref_ori_t: Option = None; - let mut pos_t: Option = None; - let mut fraglen_t: Option = None; + let mut found_ref_ori_t = false; + let mut found_pos_t = false; + let mut found_fraglen_t = false; // parse actual tags - for at in &al_tags.tags { - if rad_types::decode_int_type_tag(at.typeid).is_none() { - error!("currently only RAD types 1--4 are supported for 'b' and 'u' tags."); - std::process::exit(exit_codes::EXIT_UNSUPPORTED_TAG_TYPE); - } + for at in &prelude.aln_tags.tags { info!("\talignment-level tag {}", at.name); match at.name.as_str() { REF_ORI_NAME => { - ref_ori_t = Some(at.typeid); + found_ref_ori_t = true; } POS_NAME => { - pos_t = Some(at.typeid); + found_pos_t = true; } FRAGLEN_NAME => { - fraglen_t = Some(at.typeid); + found_fraglen_t = true; } _ => { info!("unknown alignment-level tag {}", at.name); @@ -428,39 +402,46 @@ fn main() -> anyhow::Result<()> { } } // ensure the tags we expect are found - assert!(ref_ori_t.is_some()); - assert!(pos_t.is_some()); - assert!(fraglen_t.is_some()); + assert!( + found_ref_ori_t, + "required alignment-level tag \"{}\" is missing", + REF_ORI_NAME + ); + assert!( + found_pos_t, + "required alignment-level tag \"{}\" is missing", + POS_NAME + ); + assert!( + found_fraglen_t, + "required alignment-level tag \"{}\" is missing", + FRAGLEN_NAME + ); - // parse the file-level tags + // parse the actual file-level tags const REF_LENGTHS_NAME: &str = "ref_lengths"; - let mut ref_lengths: Option> = None; - for ft in &fl_tags.tags { - if ft.name == REF_LENGTHS_NAME { - if ft.typeid != 7 { - error!("expected array type (7) but found, {}", ft.typeid); - } - info!("found frag_length file-level tag"); - - let v = read_ref_lengths(&mut br)?; - ref_lengths = Some(v); - } - } + let file_tag_map = prelude.file_tags.parse_tags_from_bytes(&mut br)?; + // get the reference lengths from the tag map + let ref_lengths = match file_tag_map.get(REF_LENGTHS_NAME) { + Some(rad_types::TagValue::ArrayU32(v)) => Some(v), + _ => None, + }; let ref_lengths = ref_lengths.expect("was not able to read reference lengths from file!"); - info!("read {} reference lengths", ref_lengths.len()); + info!( + "read {} reference lengths", + ref_lengths.len().to_formatted_string(&Locale::en) + ); - let frag_map_t = rad_types::decode_int_type_tag( - ftt.expect("no fragment mapping type description present."), - ) - .context("unknown fragment mapping type id.")?; + // extract whatever context we'll need to read the records + let tag_context = prelude.get_record_context::()?; let mut frag_stats = MappedFragStats::new(); let (eq_map, frag_lengths) = process( &mut br, - hdr.num_chunks as usize, - &frag_map_t, + prelude.hdr.num_chunks as usize, + &tag_context, lib_type, &mut frag_stats, ); @@ -488,13 +469,31 @@ fn main() -> anyhow::Result<()> { }; let quant_output = output.with_additional_extension(".quant"); - io::write_results(&quant_output, &hdr, &em_res, &ref_lengths, &eff_lengths)?; - - info!("num mapped reads = {}", frag_stats.num_mapped_reads); - info!("total mappings = {}", frag_stats.tot_mappings); - info!("number of equivalence classes = {}", eq_map.len()); + io::write_results( + &quant_output, + &prelude.hdr, + &em_res, + &ref_lengths, + &eff_lengths, + )?; + + info!( + "num mapped reads = {}", + frag_stats.num_mapped_reads.to_formatted_string(&Locale::en) + ); + info!( + "total mappings = {}", + frag_stats.tot_mappings.to_formatted_string(&Locale::en) + ); + info!( + "number of equivalence classes = {}", + eq_map.len().to_formatted_string(&Locale::en) + ); let total_weight: usize = eq_map.count_map.values().sum(); - info!("total equivalence map weight = {}", total_weight); + info!( + "total equivalence map weight = {}", + total_weight.to_formatted_string(&Locale::en) + ); { let fld_array = UInt32Array::from_vec(frag_lengths); diff --git a/src/utils.rs b/src/utils.rs index 942ae7a..893b83a 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,4 +1,3 @@ -pub mod custom_rad_utils; pub mod em; pub mod io; pub mod map_record_types; diff --git a/src/utils/custom_rad_utils.rs b/src/utils/custom_rad_utils.rs deleted file mode 100644 index 58a1206..0000000 --- a/src/utils/custom_rad_utils.rs +++ /dev/null @@ -1,97 +0,0 @@ -use libradicl::rad_types; -use scroll::Pread; -use std::io::Read; -use std::mem; - -use crate::utils::map_record_types::MetaReadRecord; - -#[derive(Debug)] -pub struct MetaChunk { - pub nbytes: u32, - pub nrec: u32, - pub reads: Vec, -} - -impl MetaChunk { - #[allow(dead_code)] - pub fn read_header(reader: &mut T) -> (u32, u32) { - let mut buf = [0u8; 8]; - - reader.read_exact(&mut buf).unwrap(); - let nbytes = buf.pread::(0).unwrap(); - let nrec = buf.pread::(4).unwrap(); - (nbytes, nrec) - } - - pub fn from_bytes(reader: &mut T, fmt: &rad_types::RadIntId) -> Self { - let mut buf = [0u8; 8]; - - reader.read_exact(&mut buf).unwrap(); - let nbytes = buf.pread::(0).unwrap(); - let nrec = buf.pread::(4).unwrap(); - let mut c = Self { - nbytes, - nrec, - reads: Vec::with_capacity(nrec as usize), - }; - - for _ in 0..(nrec as usize) { - c.reads.push(MetaReadRecord::from_bytes(reader, fmt)); - } - - c - } - - /// peeks to the first record in the buffer `buf`, and returns - /// the barcode and umi associated with this record. It is assumed - /// that there is at least one record present in the buffer. - #[allow(dead_code)] - pub fn peek_record( - buf: &[u8], - bct: &rad_types::RadIntId, - umit: &rad_types::RadIntId, - ) -> (u64, u64) { - let na_size = mem::size_of::(); - let bc_size = bct.bytes_for_type(); - - let _na = buf.pread::(0).unwrap(); - - let bc = match bct { - rad_types::RadIntId::U8 => buf.pread::(na_size).unwrap() as u64, - rad_types::RadIntId::U16 => buf.pread::(na_size).unwrap() as u64, - rad_types::RadIntId::U32 => buf.pread::(na_size).unwrap() as u64, - rad_types::RadIntId::U64 => buf.pread::(na_size).unwrap(), - }; - let umi = match umit { - rad_types::RadIntId::U8 => buf.pread::(na_size + bc_size).unwrap() as u64, - rad_types::RadIntId::U16 => buf.pread::(na_size + bc_size).unwrap() as u64, - rad_types::RadIntId::U32 => buf.pread::(na_size + bc_size).unwrap() as u64, - rad_types::RadIntId::U64 => buf.pread::(na_size + bc_size).unwrap(), - }; - (bc, umi) - } -} - -pub fn read_into_u64(reader: &mut T, rt: &rad_types::RadIntId) -> u64 { - let mut rbuf = [0u8; 8]; - - let v: u64 = match rt { - rad_types::RadIntId::U8 => { - reader.read_exact(&mut rbuf[0..1]).unwrap(); - rbuf.pread::(0).unwrap() as u64 - } - rad_types::RadIntId::U16 => { - reader.read_exact(&mut rbuf[0..2]).unwrap(); - rbuf.pread::(0).unwrap() as u64 - } - rad_types::RadIntId::U32 => { - reader.read_exact(&mut rbuf[0..4]).unwrap(); - rbuf.pread::(0).unwrap() as u64 - } - rad_types::RadIntId::U64 => { - reader.read_exact(&mut rbuf[0..8]).unwrap(); - rbuf.pread::(0).unwrap() - } - }; - v -} diff --git a/src/utils/io.rs b/src/utils/io.rs index 24a6743..b18c9d1 100644 --- a/src/utils/io.rs +++ b/src/utils/io.rs @@ -5,7 +5,7 @@ use arrow2::{ chunk::Chunk, datatypes::{Field, Schema}, }; -use libradicl::rad_types; +use libradicl::header; use path_tools::WithAdditionalExtension; use std::fs::File; use std::io::Write; @@ -15,7 +15,7 @@ use tracing::warn; pub(crate) fn write_results( output: &Path, - hdr: &rad_types::RadHeader, + hdr: &header::RadHeader, e_counts: &[f64], lengths: &[u32], eff_lengths: &[f64], diff --git a/src/utils/map_record_types.rs b/src/utils/map_record_types.rs index 4f20846..612ce8f 100644 --- a/src/utils/map_record_types.rs +++ b/src/utils/map_record_types.rs @@ -1,16 +1,12 @@ use anyhow::bail; -use libradicl::rad_types; -use scroll::Pread; +use libradicl::rad_types::MappedFragmentOrientation; use serde::Serialize; -use std::io::Read; use std::str::FromStr; -use crate::utils::custom_rad_utils::*; +// const MASK_LOWER_30_BITS: u32 = 0xC0000000; +// const MASK_UPPER_2_BITS: u32 = 0x3FFFFFFF; -const MASK_LOWER_30_BITS: u32 = 0xC0000000; -const MASK_UPPER_2_BITS: u32 = 0x3FFFFFFF; #[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize)] - pub enum LibraryType { StrandedForward, InwardStrandedForward, @@ -64,174 +60,9 @@ impl LibraryType { } } -#[derive(Debug, Copy, Clone, Eq, PartialEq)] -pub enum MappingType { - Unmapped, - SingleMapped, - MappedFirstOrphan, - MappedSecondOrphan, - MappedPair, -} - -impl MappingType { - pub fn from_u8(t: u8) -> Self { - match t { - 0 => MappingType::Unmapped, - 1 => MappingType::SingleMapped, - 2 => MappingType::MappedFirstOrphan, - 3 => MappingType::MappedSecondOrphan, - 4 => MappingType::MappedPair, - _ => MappingType::Unmapped, - } - } - - #[inline] - pub fn get_mask(&self) -> u32 { - match &self { - // if unmapped, we ignore flags all together. - MappingType::Unmapped => 0b00, - // if paired we care about both read and mate. - MappingType::MappedPair => 0b11, - // if orphan or single, we care only about read - // mate flag should be ignored. - _ => 0b10, - } - } - - #[inline] - pub fn is_orphan(&self) -> bool { - matches!( - &self, - MappingType::MappedFirstOrphan | MappingType::MappedSecondOrphan - ) - } -} - -#[derive(Debug, Copy, Clone)] -pub enum MappedFragmentOrientation { - Reverse, - Forward, - ReverseReverse, - ReverseForward, - ForwardReverse, - ForwardForward, - Unknown, -} - -impl MappedFragmentOrientation { - pub fn from_u32_paired_status(n: u32, m: MappingType) -> Self { - // if not paired, then we don't care about - // the lowest order bit so shift it off - if matches!( - m, - MappingType::SingleMapped - | MappingType::MappedFirstOrphan - | MappingType::MappedSecondOrphan - ) { - if (n & 0b10) == 2 { - MappedFragmentOrientation::Forward - } else { - MappedFragmentOrientation::Reverse - } - } else { - match n { - 0 => MappedFragmentOrientation::ReverseReverse, - 1 => MappedFragmentOrientation::ReverseForward, - 2 => MappedFragmentOrientation::ForwardReverse, - 3 => MappedFragmentOrientation::ForwardForward, - _ => MappedFragmentOrientation::Unknown, - } - } - } -} - -impl From for u32 { - fn from(item: MappedFragmentOrientation) -> Self { - match item { - MappedFragmentOrientation::ForwardReverse => 0b011, - MappedFragmentOrientation::ForwardForward => 0b101, - MappedFragmentOrientation::ReverseReverse => 0b110, - MappedFragmentOrientation::ReverseForward => 0b100, - MappedFragmentOrientation::Forward => 0b1, - MappedFragmentOrientation::Reverse => 0b10, - MappedFragmentOrientation::Unknown => 0b0, - } - } -} - -impl From for MappedFragmentOrientation { - fn from(item: u32) -> Self { - match item { - 0b011 => MappedFragmentOrientation::ForwardReverse, - 0b101 => MappedFragmentOrientation::ForwardForward, - 0b110 => MappedFragmentOrientation::ReverseReverse, - 0b100 => MappedFragmentOrientation::ReverseForward, - 0b1 => MappedFragmentOrientation::Forward, - 0b10 => MappedFragmentOrientation::Reverse, - _ => MappedFragmentOrientation::Unknown, - } - } -} - -#[derive(Debug)] -pub struct MetaReadRecord { - pub frag_type: u8, - pub dirs: Vec, - pub refs: Vec, - pub positions: Vec, - pub frag_lengths: Vec, -} - -impl MetaReadRecord { - #[allow(dead_code)] - pub fn is_empty(&self) -> bool { - self.refs.is_empty() - } - - pub fn from_bytes(reader: &mut T, frag_map_t: &rad_types::RadIntId) -> Self { - let mut rbuf = [0u8; 255]; - - reader.read_exact(&mut rbuf[0..4]).unwrap(); - let na = rbuf.pread::(0).unwrap(); - let fmt = read_into_u64(reader, frag_map_t); - let f = MappingType::from_u8(fmt as u8); - - let mut rec = Self { - frag_type: fmt as u8, - dirs: Vec::with_capacity(na as usize), - refs: Vec::with_capacity(na as usize), - positions: Vec::with_capacity(na as usize), - frag_lengths: Vec::with_capacity(na as usize), - }; - - //println!("number of records : {:?}",na); - - for _ in 0..(na as usize) { - reader.read_exact(&mut rbuf[0..4]).unwrap(); - let v = rbuf.pread::(0).unwrap(); - - let dir_int = (v & MASK_LOWER_30_BITS) >> 30; - let dir = MappedFragmentOrientation::from_u32_paired_status(dir_int, f); - rec.dirs.push(dir); - rec.refs.push(v & MASK_UPPER_2_BITS); - // position - reader.read_exact(&mut rbuf[0..4]).unwrap(); - let pos = rbuf.pread::(0).unwrap(); - rec.positions.push(pos); - // length - reader.read_exact(&mut rbuf[0..2]).unwrap(); - let flen = rbuf.pread::(0).unwrap(); - rec.frag_lengths.push(flen); - } - - rec - } -} - #[cfg(test)] mod tests { use super::LibraryType; - use anyhow::{bail, Error}; #[test] fn lib_types_parse() {