From 1d66d5b3c91601976d9c7a01da5b18d47467a530 Mon Sep 17 00:00:00 2001 From: mholt Date: Mon, 30 Oct 2023 13:01:42 -0700 Subject: [PATCH] adds source files and test files --- Cargo.lock | 1548 ++++++++++++++++++++++++++++ Cargo.toml | 38 + build.rs | 19 + src/astar_phaser.rs | 788 ++++++++++++++ src/block_gen.rs | 1063 +++++++++++++++++++ src/cli.rs | 343 ++++++ src/data_types/mod.rs | 7 + src/data_types/read_segments.rs | 258 +++++ src/data_types/reference_genome.rs | 133 +++ src/data_types/variants.rs | 758 ++++++++++++++ src/lib.rs | 19 + src/main.rs | 639 ++++++++++++ src/phaser.rs | 796 ++++++++++++++ src/read_parsing.rs | 745 +++++++++++++ src/sequence_alignment.rs | 77 ++ src/wfa_graph.rs | 1157 +++++++++++++++++++++ src/writers/block_stats.rs | 370 +++++++ src/writers/haplotag_writer.rs | 72 ++ src/writers/mod.rs | 13 + src/writers/ordered_bam_writer.rs | 355 +++++++ src/writers/ordered_vcf_writer.rs | 423 ++++++++ src/writers/phase_stats.rs | 313 ++++++ src/writers/vcf_util.rs | 54 + test_data/header_only.bam | Bin 0 -> 3019 bytes test_data/header_only.bam.bai | Bin 0 -> 1576 bytes test_data/header_only.vcf.gz | Bin 0 -> 2039 bytes test_data/header_only.vcf.gz.tbi | Bin 0 -> 80 bytes test_data/multi_smrtcell.bam | Bin 0 -> 3034 bytes test_data/multi_smrtcell.bam.bai | Bin 0 -> 1576 bytes test_data/multisample.bam | Bin 0 -> 3038 bytes test_data/multisample.bam.bai | Bin 0 -> 1576 bytes test_data/test_reference.fa | 5 + test_data/test_reference.fa.gz | Bin 0 -> 90 bytes 33 files changed, 9993 insertions(+) create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 build.rs create mode 100644 src/astar_phaser.rs create mode 100644 src/block_gen.rs create mode 100644 src/cli.rs create mode 100644 src/data_types/mod.rs create mode 100644 src/data_types/read_segments.rs create mode 100644 src/data_types/reference_genome.rs create mode 100644 src/data_types/variants.rs create mode 100644 src/lib.rs create mode 100644 src/main.rs create mode 
100644 src/phaser.rs create mode 100644 src/read_parsing.rs create mode 100644 src/sequence_alignment.rs create mode 100644 src/wfa_graph.rs create mode 100644 src/writers/block_stats.rs create mode 100644 src/writers/haplotag_writer.rs create mode 100644 src/writers/mod.rs create mode 100644 src/writers/ordered_bam_writer.rs create mode 100644 src/writers/ordered_vcf_writer.rs create mode 100644 src/writers/phase_stats.rs create mode 100644 src/writers/vcf_util.rs create mode 100644 test_data/header_only.bam create mode 100644 test_data/header_only.bam.bai create mode 100644 test_data/header_only.vcf.gz create mode 100644 test_data/header_only.vcf.gz.tbi create mode 100644 test_data/multi_smrtcell.bam create mode 100644 test_data/multi_smrtcell.bam.bai create mode 100644 test_data/multisample.bam create mode 100644 test_data/multisample.bam.bai create mode 100644 test_data/test_reference.fa create mode 100644 test_data/test_reference.fa.gz diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..25db964 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,1548 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "aho-corasick" +version = "0.7.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4f55bd91a0978cbfd91c457a164bab8b4001c833b7f323132c0a4e1922dd44e" +dependencies = [ + "memchr", +] + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anyhow" +version = "1.0.72" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b13c32d80ecc7ab747b80c3784bce54ee8a7a0cc4fbda9bf4cda2cf6fe90854" + +[[package]] +name = "approx" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cab112f0a86d568ea0e627cc1d6be74a1e9cd55214684db5561995f6dad897c6" +dependencies = [ + "num-traits", +] + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "bio" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd6b34e241a9992b9a896c7939147875818217e00a25fe0eeb0da234f0d7aafe" +dependencies = [ + "anyhow", + "approx", + "bio-types", + "bit-set", + "bv", + "bytecount", + "csv", + "custom_derive", + "enum-map", + "fxhash", + "getset", + "itertools", + "itertools-num", + "lazy_static", + "multimap", + 
"ndarray", + "newtype_derive", + "num-integer", + "num-traits", + "ordered-float", + "petgraph", + "rand", + "regex", + "serde", + "serde_derive", + "statrs", + "strum", + "strum_macros", + "thiserror", + "triple_accel", + "vec_map", +] + +[[package]] +name = "bio-types" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfa990f40a28735fa598dc3dd58d73e62e6b41458959d623903b927ba7b04c80" +dependencies = [ + "derive-new", + "lazy_static", + "regex", + "strum_macros", + "thiserror", +] + +[[package]] +name = "bit-set" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bstr" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + +[[package]] +name = "bumpalo" +version = "3.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b1ce199063694f33ffb7dd4e0ee620741495c32833cde5aa08f02a0bf96f0c8" + +[[package]] +name = "bv" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8834bb1d8ee5dc048ee3124f2c7c1afcc6bc9aed03f11e9dfd8c69470a5db340" +dependencies = [ + "feature-probe", + "serde", +] + +[[package]] +name = "bytecount" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" + +[[package]] +name = "bytemuck" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f5715e491b5a1598fc2bef5a606847b5dc1d48ea625bd3c02c00de8285591da" + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + +[[package]] +name = "cc" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" +dependencies = [ + "jobserver", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b" +dependencies = [ + "iana-time-zone", + "js-sys", + "num-integer", + "num-traits", + "time 0.1.45", + "wasm-bindgen", + "winapi", +] + +[[package]] +name = "clap" +version = "4.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69d64e88428747154bd8bc378d178377ef4dace7a5735ca1f3855be72f2c2cb5" +dependencies = [ + "atty", + "bitflags", + "clap_derive", + "clap_lex", + "once_cell", + "strsim", + "termcolor", +] + +[[package]] +name = "clap_derive" +version = "4.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c42f169caba89a7d512b5418b09864543eeb4d497416c917d7137863bd2076ad" +dependencies = [ + "heck", + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0d4198f73e42b4936b35b5bb248d81d2b595ecb170da0bac7655c54eedfa8da8" +dependencies = [ + "os_str_bytes", +] + +[[package]] +name = "cmake" +version = "0.1.49" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db34956e100b30725f2eb215f90d4871051239535632f84fea3bc92722c66b7c" +dependencies = [ + "cc", +] + +[[package]] +name = "codespan-reporting" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e" +dependencies = [ + "termcolor", + "unicode-width", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" + +[[package]] +name = "cpu-time" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9e393a7668fe1fad3075085b86c781883000b4ede868f43627b34a87c8b7ded" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "crc32fast" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "csv" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +dependencies = [ + "bstr", + "csv-core", + "itoa 0.4.8", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + +[[package]] +name = "custom_derive" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9" + 
+[[package]] +name = "cxx" +version = "1.0.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a140f260e6f3f79013b8bfc65e7ce630c9ab4388c6a89c71e07226f49487b72" +dependencies = [ + "cc", + "cxxbridge-flags", + "cxxbridge-macro", + "link-cplusplus", +] + +[[package]] +name = "cxx-build" +version = "1.0.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da6383f459341ea689374bf0a42979739dc421874f112ff26f829b8040b8e613" +dependencies = [ + "cc", + "codespan-reporting", + "once_cell", + "proc-macro2", + "quote", + "scratch", + "syn", +] + +[[package]] +name = "cxxbridge-flags" +version = "1.0.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90201c1a650e95ccff1c8c0bb5a343213bdd317c6e600a93075bca2eff54ec97" + +[[package]] +name = "cxxbridge-macro" +version = "1.0.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b75aed41bb2e6367cae39e6326ef817a851db13c13e4f3263714ca3cfb8de56" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "deranged" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7684a49fb1af197853ef7b2ee694bc1f5b4179556f1e5710e1760c5db6f5e929" + +[[package]] +name = "derive-new" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3418329ca0ad70234b9735dc4ceed10af4df60eff9c8e7b06cb5e520d92c3535" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "either" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797" + +[[package]] +name = "enum-map" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e893a7ba6116821058dec84a6fb14fb2a97cd8ce5fd0f85d5a4e760ecd7329d9" +dependencies = [ + "enum-map-derive", +] + +[[package]] +name = 
"enum-map-derive" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84278eae0af6e34ff6c1db44c11634a694aafac559ff3080e4db4e4ac35907aa" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "env_logger" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c90bf5f19754d10198ccb95b70664fc925bd1fc090a0fd9a6ebc54acc8cd6272" +dependencies = [ + "atty", + "humantime", + "log", + "regex", + "termcolor", +] + +[[package]] +name = "exitcode" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de853764b47027c2e862a995c34978ffa63c1501f2e15f987ba11bd4f9bba193" + +[[package]] +name = "feature-probe" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "835a3dc7d1ec9e75e2b5fb4ba75396837112d2060b03f7d43bc1897c7f7211da" + +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + +[[package]] +name = "flate2" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "form_urlencoded" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9c384f161156f5260c24a097c56119f9be8c798586aecc13afbcbe7b7e26bf8" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "fs-utils" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fc7a9dc005c944c98a935e7fd626faf5bf7e5a609f94bc13e42fc4a02e52593" +dependencies = [ + "quick-error", +] + +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "getrandom" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4eb1a864a501629691edf6c15a593b7a51eebaa1e8468e9ddc623de7c9b58ec6" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", +] + +[[package]] +name = "getset" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e45727250e75cc04ff2846a66397da8ef2b3db8e40e0cef4df67950a07621eb9" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "heck" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "hiphase" +version = "1.0.0" +dependencies = [ + "bio", + "bit-vec", + "chrono", + "clap", + "cpu-time", + "csv", + "env_logger", + "exitcode", + "flate2", + "lazy_static", + "log", + "priority-queue", + "rust-htslib", + "rustc-hash", + "serde", + "simple-error", + "threadpool", + "vergen", +] + +[[package]] +name = "hts-sys" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dba4fc406d3686926c84f98fd53026b625319d119e6056a40313862a6e3c4eb" 
+dependencies = [ + "cc", + "fs-utils", + "glob", + "libz-sys", +] + +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "iana-time-zone" +version = "0.1.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0722cd7114b7de04316e7ea5456a0bbb20e4adb46fd27a3697adb812cff0f37c" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca" +dependencies = [ + "cxx", + "cxx-build", +] + +[[package]] +name = "idna" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e14ddfc70884202db2244c223200c204c2bda1bc6e0998d11b5e024d657209e6" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "ieee754" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9007da9cacbd3e6343da136e98b0d2df013f553d35bdec8b518f07bea768e19c" + +[[package]] +name = "indexmap" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" +dependencies = [ + "autocfg", + "hashbrown", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itertools-num" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a872a22f9e6f7521ca557660adb96dd830e54f0f490fa115bb55dd69d38b27e7" +dependencies = [ + "num-traits", +] + +[[package]] +name = "itoa" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" + +[[package]] +name = "itoa" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" + +[[package]] +name = "jobserver" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "068b1ee6743e4d11fb9c6a1e6064b3693a1b600e7f5f5988047d98b3dc9fb90b" +dependencies = [ + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.135" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68783febc7782c6c5cb401fbda4de5a9898be1762314da0bb2c10ced61f18b0c" + +[[package]] +name = "libm" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "292a948cd991e376cf75541fe5b97a1081d713c618b4f1b9500f8844e49eb565" + +[[package]] +name = "libz-sys" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9702761c3935f8cc2f101793272e202c72b99da8f4224a19ddcf1279a6450bbf" +dependencies = [ + "cc", + "cmake", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "linear-map" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"bfae20f6b19ad527b550c223fddc3077a547fc70cda94b9b566575423fd303ee" + +[[package]] +name = "link-cplusplus" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5" +dependencies = [ + "cc", +] + +[[package]] +name = "log" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "matrixmultiply" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "add85d4dd35074e6fedc608f8c8f513a3548619a9024b751949ef0e8e45a4d84" +dependencies = [ + "rawpointer", +] + +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "miniz_oxide" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +dependencies = [ + "adler", +] + +[[package]] +name = "multimap" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +dependencies = [ + "serde", +] + +[[package]] +name = "nalgebra" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d506eb7e08d6329505faa8a3a00a5dcc6de9f76e0c77e4b75763ae3c770831ff" +dependencies = [ + "approx", + "matrixmultiply", + "nalgebra-macros", + "num-complex", + "num-rational", + "num-traits", + "rand", + "rand_distr", + "simba", + "typenum", +] + +[[package]] +name = "nalgebra-macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"01fcc0b8149b4632adc89ac3b7b31a12fb6099a0317a4eb2ebff574ef7de7218" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "ndarray" +version = "0.15.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "rawpointer", +] + +[[package]] +name = "newtype_derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac8cd24d9f185bb7223958d8c1ff7a961b74b1953fd05dba7cc568a63b3861ec" +dependencies = [ + "rustc_version", +] + +[[package]] +name = "num-complex" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ae39348c8bc5fbd7f40c727a9925f03517afd2ab27d46702108b6a7e5414c19" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "num_cpus" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "num_threads" +version = "0.1.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" +dependencies = [ + "libc", +] + +[[package]] +name = "once_cell" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1" + +[[package]] +name = "ordered-float" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f74e330193f90ec45e2b257fa3ef6df087784157ac1ad2c1e71c62837b03aa7" +dependencies = [ + "num-traits", +] + +[[package]] +name = "os_str_bytes" +version = "6.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ff7415e9ae3fff1225851df9e0d9e4e5479f947619774677a63572e55e80eff" + +[[package]] +name = "paste" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1de2e551fb905ac83f73f7aedf2f0cb4a0da7e35efa24a202a936269f1f18e1" + +[[package]] +name = "percent-encoding" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" + +[[package]] +name = "petgraph" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6d5014253a1331579ce62aa67443b4a658c5e7dd03d4bc6d302b94474888143" +dependencies = [ + "fixedbitset", + "indexmap", +] + +[[package]] +name = "pkg-config" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" + +[[package]] +name = "ppv-lite86" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" + +[[package]] +name = "priority-queue" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"815082d99af3acc75a3e67efd2a07f72e67b4e81b4344eb8ca34c6ebf3dfa9c5" +dependencies = [ + "autocfg", + "indexmap", +] + +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro2" +version = "1.0.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + +[[package]] +name = "quote" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand", +] + +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + +[[package]] +name = "regex" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" + +[[package]] +name = "regex-syntax" +version = "0.6.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" + +[[package]] +name = "rust-htslib" +version = "0.39.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "239ef7334dbf59acd56b7a6fa62a525ed7e36d6239a686ed4ff61bc794108e53" +dependencies = [ + "bio-types", + "byteorder", + "custom_derive", + "derive-new", + "hts-sys", + "ieee754", + "lazy_static", + "libc", + "linear-map", + "newtype_derive", + "regex", + "thiserror", + "url", +] + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustc_version" +version = "0.1.7" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5f5376ea5e30ce23c03eb77cbe4962b988deead10910c372b226388b594c084" +dependencies = [ + "semver", +] + +[[package]] +name = "rustversion" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" + +[[package]] +name = "ryu" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09" + +[[package]] +name = "safe_arch" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "794821e4ccb0d9f979512f9c1973480123f9bd62a90d74ab0f9426fcf8f4a529" +dependencies = [ + "bytemuck", +] + +[[package]] +name = "scratch" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1792db035ce95be60c3f8853017b3999209281c24e2ba5bc8e59bf97a0c590c1" + +[[package]] +name = "semver" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4f410fedcf71af0345d7607d246e7ad15faaadd49d240ee3b24e5dc21a820ac" + +[[package]] +name = "serde" +version = "1.0.147" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d193d69bae983fc11a79df82342761dfbf28a99fc8d203dca4c3c1b590948965" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.147" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f1d362ca8fc9c3e3a7484440752472d68a6caa98f1ab81d99b5dfe517cec852" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "simba" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0b7840f121a46d63066ee7a99fc81dcabbc6105e437cae43528cea199b5a05f" +dependencies = [ + "approx", + "num-complex", + "num-traits", + "paste", + "wide", +] + +[[package]] +name = 
"simple-error" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc47a29ce97772ca5c927f75bac34866b16d64e07f330c3248e2d7226623901b" + +[[package]] +name = "statrs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d08e5e1748192713cc281da8b16924fb46be7b0c2431854eadc785823e5696e" +dependencies = [ + "approx", + "lazy_static", + "nalgebra", + "num-traits", + "rand", +] + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "strum" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" + +[[package]] +name = "strum_macros" +version = "0.24.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + +[[package]] +name = "syn" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fcd952facd492f9be3ef0d0b7032a6e442ee9b361d4acc2b1d0c4aaa5f613a1" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "termcolor" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "thiserror" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10deb33631e3c9018b9baf9dcbbc4f737320d2b576bac10f6aefa048fa407e3e" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.37" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "982d17546b47146b28f7c22e3d08465f6b8903d0ea13c1660d9d84a6e7adcdbb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "threadpool" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa" +dependencies = [ + "num_cpus", +] + +[[package]] +name = "time" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" +dependencies = [ + "libc", + "wasi 0.10.0+wasi-snapshot-preview1", + "winapi", +] + +[[package]] +name = "time" +version = "0.3.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fdd63d58b18d663fbdf70e049f00a22c8e42be082203be7f26589213cd75ea" +dependencies = [ + "deranged", + "itoa 1.0.9", + "libc", + "num_threads", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" + +[[package]] +name = "time-macros" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb71511c991639bb078fd5bf97757e03914361c48100d52878b8e52b46fb92cd" +dependencies = [ + "time-core", +] + +[[package]] +name = "tinyvec" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" + +[[package]] +name = "triple_accel" +version = "0.4.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "22048bc95dfb2ffd05b1ff9a756290a009224b60b2f0e7525faeee7603851e63" + +[[package]] +name = "typenum" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" + +[[package]] +name = "unicode-bidi" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d54675592c1dbefd78cbd98db9bacd89886e1ca50692a0692baefffdeb92dd58" + +[[package]] +name = "unicode-ident" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" + +[[package]] +name = "unicode-normalization" +version = "0.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-width" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" + +[[package]] +name = "url" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d68c799ae75762b8c3fe375feb6600ef5602c883c5d21eb51c09f22b83c4643" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" +dependencies = [ + "serde", +] + +[[package]] +name = "vergen" +version = "8.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "bbc5ad0d9d26b2c49a5ab7da76c3e79d3ee37e7821799f8223fcb8f2f391a2e7" +dependencies = [ + "anyhow", + "rustversion", + "time 0.3.25", +] + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wasi" +version = "0.10.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.84" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.84" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.84" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.84" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = 
"wasm-bindgen-shared" +version = "0.2.84" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d" + +[[package]] +name = "wide" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae41ecad2489a1655c8ef8489444b0b113c0a0c795944a3572a0931cf7d2525c" +dependencies = [ + "bytemuck", + "safe_arch", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + 
"windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..28189e1 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,38 @@ +[package] +name = "hiphase" +version = "1.0.0" +authors = ["J. Matthew Holt "] +description = "A tool for phasing HiFi VCF files." 
+edition = "2021" +license-file="LICENSE.md" + +[build-dependencies] +vergen = { version = "8.2.4", features = ["git", "gitcl"] } + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +# Note: cmake is required, module load cmake/3.20.2 for initial compiles + +[dependencies] +bio = "1.2.0" +bit-vec = "0.6.3" +chrono = "0.4.24" +clap = { version = "4.0.13", features = ["derive"] } +cpu-time = "1.0.0" +csv = "1.1.6" +env_logger = "0.9.1" +exitcode = "1.1.2" +flate2 = "1.0.26" +lazy_static = "1.4.0" +log = "0.4.17" +priority-queue = "1.2.3" +# consider the older version if we run into build issues later +# rust-htslib = { version = "0.37.0", default-features = false } +rust-htslib = { version = "0.39.5", default-features = false, features = ["static"] } +rustc-hash = "1.1.0" +serde = "1.0.147" +simple-error = "0.2.3" +threadpool = "1.8.1" + +[profile.release] +lto = "fat" +codegen-units = 1 diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..afa5ed8 --- /dev/null +++ b/build.rs @@ -0,0 +1,19 @@ +use std::error::Error; +use vergen::EmitBuilder; + +fn main() -> Result<(), Box> { + EmitBuilder::builder() + .fail_on_error() + .all_git() + .git_describe(true, false, Some("ThisPatternShouldNotMatchAnythingEver")) + .emit()?; + + // emit build handles the git configuration and build.rs, but we also need to track the toml and src folder to catch dirty + println!("cargo:rerun-if-changed=Cargo.toml"); + println!("cargo:rerun-if-changed=src"); + + // uncomment if you ever want to easily see what this is emiting + // panic!(""); + + Ok(()) +} \ No newline at end of file diff --git a/src/astar_phaser.rs b/src/astar_phaser.rs new file mode 100644 index 0000000..bdbab4d --- /dev/null +++ b/src/astar_phaser.rs @@ -0,0 +1,788 @@ + +use crate::{block_gen::PhaseBlock, data_types::variants::VariantType}; +use crate::data_types::read_segments::ReadSegment; +use crate::data_types::variants::Variant; +use 
crate::writers::phase_stats::PhaseStats; + +use bio::data_structures::interval_tree::IntervalTree; +use log::{debug,trace}; +use priority_queue::PriorityQueue; +use std::cmp::Reverse; + +/// A node in the A* search tree. +#[derive(Eq,Hash,PartialEq)] +struct AstarNode { + /// The node index + node_index: u64, + /// The cost that is fixed that no longer needs to be re-computed. Corresponds to reads that overlap early parts of the haplotypes. + frozen_cost: u64, + /// The cost that needs to be re-computed, corresponds to reads that partially overlap this solution, but have more variants. + fluid_cost: u64, + /// An estimate of the remaining cost to extend this node to full. + heuristic_cost: u64, + /// The first haplotype in this node's solution. + h1: Vec, + /// The second haplotype in this node's solution. It can be identically to h1, but usually is not. + h2: Vec, + /// The number of heterozygous results in this node's solution. I.e. sum(h1[x] != h2[x]) + num_hets: u64 +} + +impl std::fmt::Debug for AstarNode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AstarNode") + .field("frozen_cost", &self.frozen_cost) + .field("fluid_cost", &self.fluid_cost) + .field("heuristic_cost", &self.heuristic_cost) + .field("hap.len()", &self.h1.len()) + .finish() + } +} + +impl AstarNode { + /// Returns a new empty haplotype node with a heuristic cost. + /// This should really only be used for a root node. + /// # Arguments + /// * `max_heuristic` - the estimate cost for the full phase block + pub fn new(max_heuristic: u64) -> AstarNode { + AstarNode { + node_index: 0, + frozen_cost: 0, + fluid_cost: 0, + heuristic_cost: max_heuristic, + h1: Default::default(), + h2: Default::default(), + num_hets: 0 + } + } + + /// This will create a newly extended node from haplotypes. + /// Heuristic cost must be provided, but actual cost will be calculated from the reads. 
+ /// # Arguments + /// * `node_index` - the index to use for this node, generally the index = the order of generating/encountering nodes + /// * `parent_node` - the parent node used to spawn this one in our search tree space + /// * `allele1` - the first allele, gets appended to haplotype 1 from parent + /// * `allele2` - the second allele, gets appened to haplotype 2 from parent + /// * `heuristic_cost` - the estimated cost of adding remaining variants to the haplotypes + /// * `read_segments` - all the reads that we need to evaluate the _actual_ cost so far + /// * `hap_offset` - the offset of the haplotype relative to read starts, only needed if solving subproblems + pub fn new_extended_node( + node_index: u64, + parent_node: &AstarNode, + allele1: u8, + allele2: u8, + heuristic_cost: u64, + read_segments: &IntervalTree, + hap_offset: usize + ) -> AstarNode { + // make sure we didn't goof + let mut h1: Vec = parent_node.get_h1().to_vec(); + h1.push(allele1); + let mut h2: Vec = parent_node.get_h2().to_vec(); + h2.push(allele2); + assert_eq!(h1.len(), h2.len()); + + // num hets = parent hets + 0/1 depending on allelic extensions + let num_hets = parent_node.get_num_hets()+(if allele1 == allele2 { 0 } else { 1 }); + + // copy the frozen cost and initial fluid to 0, these will get added to below + let mut frozen_cost = parent_node.get_frozen_cost(); + let mut fluid_cost = 0; + let hap_len = h1.len()+hap_offset; + for rs_interval in read_segments.find(hap_len-1..hap_len) { + // calculate the cost of this segment with this phase block so far + let rs = rs_interval.data(); + let rs_cost = std::cmp::min( + rs.score_partial_haplotype(&h1[..], hap_offset), + rs.score_partial_haplotype(&h2[..], hap_offset) + ); + + // determine where that cost gets assigned + if rs.last_allele() < hap_len { + // this one is frozen because the last allele was just added + frozen_cost += rs_cost; + } else { + // there are more alleles for this segment, so it's liquid still + fluid_cost += 
rs_cost; + } + } + + AstarNode { + node_index, + frozen_cost, + fluid_cost, + heuristic_cost, + h1, + h2, + num_hets + } + } + + pub fn get_frozen_cost(&self) -> u64 { + self.frozen_cost + } + + /// Returns the combined frozen, fluid, and heuristic cost of this node. + pub fn get_total_cost(&self) -> u64 { + self.frozen_cost + self.fluid_cost + self.heuristic_cost + } + + /// Priority is ranked by minimum total cost -> max number of hets -> earliest node index + pub fn get_priority(&self) -> (Reverse, u64, Reverse) { + (Reverse(self.get_total_cost()), self.num_hets, Reverse(self.node_index)) + } + + /// Returns a priority where cost is 0, this is primarily to trigger forced pruning + pub fn get_cleared_priority(&self) -> (Reverse, u64, Reverse) { + (Reverse(0), self.num_hets, Reverse(self.node_index)) + } + + pub fn get_h1(&self) -> &[u8] { + &self.h1[..] + } + + pub fn get_h2(&self) -> &[u8] { + &self.h2[..] + } + + pub fn get_allele_count(&self) -> usize { + self.h1.len() + } + + #[allow(dead_code)] + pub fn get_node_index(&self) -> u64 { + self.node_index + } + + pub fn get_num_hets(&self) -> u64 { + self.num_hets + } + + /// Returns true if the internal haplotypes are identical. + /// Usually only happens when near the root of the tree. + pub fn is_identical_haplotypes(&self) -> bool { + self.h1 == self.h2 + } +} + +/// Struct for tracking the length of haplotypes in our queue that are greater than some threshold. +/// This allows us to track how long a queue actually is when we know we will ignore events smaller than the threshold. +/// Currently, it does not contain the queue itself, which may be worth doing in the long term. 
+struct PQueueHapTracker { + /// The count of each haplotype size in the queue + length_counts: Vec, + /// The total number of haplotypes in the queue with length >= threshold + total_count: usize, + /// The minimum threshold for a haplotype to count + threshold: usize +} + +impl PQueueHapTracker { + /// Creates a new tracker with a given maximum haplotype size. + /// # Arguments + /// * `max_hap_length` - the maximum size of the haplotypes tracked + pub fn new(max_hap_length: usize) -> PQueueHapTracker { + PQueueHapTracker { + length_counts: vec![0; max_hap_length+1], + total_count: 0, + threshold: 0 + } + } + + /// Adds a haplotype length to our tracker + /// # Arguments + /// * `value` - the length of the haplotype getting tracked + pub fn add_hap(&mut self, value: usize) { + self.length_counts[value] += 1; + if value >= self.threshold { + self.total_count += 1; + } + } + + /// Removes a haplotype length from the tracker + /// # Arguments + /// * `value` - the length of the haplotype getting removed from tracking + pub fn remove_hap(&mut self, value: usize) { + assert!(self.length_counts[value] > 0); + self.length_counts[value] -= 1; + if value >= self.threshold { + assert!(self.total_count > 0); + self.total_count -= 1; + } + } + + /// Increased the threshold of what is included in our total count + /// # Arguments + /// * `new_threshold` - the new minimum threshold to track, must be >= current threshold + pub fn increase_threshold(&mut self, new_threshold: usize) { + assert!(new_threshold >= self.threshold); + trace!("increase_threshold => {}, size = {}", self.threshold, self.total_count); + for t in self.threshold..new_threshold { + self.total_count -= self.length_counts[t]; + } + self.threshold = new_threshold; + trace!("increase_threshold => {}, size = {}", self.threshold, self.total_count); + } + + /// Returns the total number of haplotypes in the queue with length >= the internal threshold. 
+ pub fn len(&self) -> usize { + self.total_count + } +} + +/// This calculates the heuristic estimates to be used by the full A* algorithm. +/// Given N alleles to phase, this will return a Vec with N+1 values, such that V[x] = H(x) where x is the number of set haplotype values. +/// This array is monotonically decreasing and always ends with 0. +/// Also returns a second boolean array indicating a variant is "problematic" and should be ignored (currently disabled, controlled via `bad_variants_enabled` constant). +/// This implementation uses the A* algorithm to generate the heuristic. +/// See `astar_subsolver(...)` for details. +/// # Arguments +/// * `num_variants` - the number of variants in the phase block; run-time grows linearly as the number of variants increases +/// * `max_segment_size` - the maximum number of variants to use when calculating sub-block heuristics; run-time grows a least linearly with this value +/// * `read_segments` - the reads to use when calculating the heuristics costs; run-time grows linearly with the length of this data +/// * `min_queue_size` - the minimum length of the queue +/// * `queue_increment` - the length that the queue grows as more variants are added to the solution +/// * `opt_bad_variants` - an optional set of "bad" or "ignored" variants that we should just ignore from the start +fn calculate_astar_heuristic( + num_variants: usize, max_segment_size: usize, read_segments: &IntervalTree, + min_queue_size: usize, queue_increment: usize, opt_bad_variants: Option> +) -> (Vec, Vec) { + assert!(max_segment_size >= 2); + // an extra slot is included because it makes some checks go away later + let mut heuristics: Vec = vec![0; num_variants+1]; + let mut bad_variants: Vec = match opt_bad_variants { + Some(obv) => { + assert_eq!(obv.len(), num_variants); + obv + }, + None => vec![false; num_variants] + }; + let bad_variants_enabled = false; + let mut max_clip_size: usize = 1; + for v_index in (0..num_variants).rev() { + 
debug!("solving subproblem {}..{}", v_index, v_index+max_clip_size); + let (max_estimate, solve_size): (u64, usize) = astar_subsolver( + v_index, max_clip_size, read_segments, &heuristics[..], &bad_variants[..], + min_queue_size / 10, queue_increment + ); + assert!(solve_size >= max_clip_size.min(2)); + debug!(" estimate => {} (solution distance => {}/{})", max_estimate, solve_size, max_clip_size); + + // monotonically decreasing so it's either + // 1) the cost of the next entry OR + // 2) this cost of solving this problem + the cost of solving everything after this problem + if bad_variants_enabled && solve_size < max_clip_size { + // we couldn't successfully form a partial haplotype + bad_variants[v_index] = true; + } + + if bad_variants[v_index] { + // this one was a bad variant that will get ignored (e.g., 0-cost), just copy the heuristic + heuristics[v_index] = heuristics[v_index+1]; + } else { + // we successfully navigated estimation from this point + assert!(max_estimate >= heuristics[v_index+1]); + heuristics[v_index] = max_estimate; + } + + max_clip_size = (solve_size+1).min(max_segment_size); + } + + (heuristics, bad_variants) +} + +/// An unpruned A* solver that will attempt to find the best path for a sub-problem. +/// It uses the heuristic estimates downstream to calculate this ones largest estimate. +/// For problem_size, `p`, and problem_offset, `o`, it will calculate max(`best_path(o..o+x)+H[o+x]`, for all `0 <= x <= p`). +/// This is basically the worst-case combination of two partial solutions within this region. +/// There is a very small queue used by this solver and if it reaches a maximum capacity, it will exit early without fully estimating. +/// However, as the solver goes, it will calculate the maximum heuristic encountered so far even if `x` does not get all the way to `p`. +/// While this estimator may exit early, it is not pruned, so you are guaranteed to find the best heuristic up to the point it exits. 
+/// In practice, most of them fully reach the problem size except in the problematic areas. +/// Returns tuple `(max_cost, farthest_estimate)` where max_cost is the highest heuristic cost found and farthest_estimate is how far the A* algorithm made it through the subproblem. +/// # Arguments +/// * `problem_offset` - the offset into the problem we are solving +/// * `problem_size` - the length of the sub-problem we are solving, usually constant except at the tail +/// * `read_segments` - the reads that are used to measure costs +/// * `heuristic_costs` - the heuristic costs so far, everything in `problem_offset+1..` is assumed to be already populated +/// * `bad_variants` - the bad variants encountered so far, if one is true, it will basically be ignored +/// * `min_queue_size` - the minimum length of the queue +/// * `queue_increment` - the length that the queue grows as more variants are added to the solution +fn astar_subsolver( + problem_offset: usize, problem_size: usize, read_segments: &IntervalTree, + heuristic_costs: &[u64], bad_variants: &[bool], min_queue_size: usize, queue_increment: usize +) -> (u64, usize) { + // now, the core looping algorithm + let mut pqueue: PriorityQueue, u64, Reverse)> = PriorityQueue::new(); + let mut next_node_index: u64 = 1; + + // this heuristic _should_ always be 0, because we're trying to calculate it + assert_eq!(heuristic_costs[problem_offset], 0); + // initialize with our neighbor heuristic (heuristic_costs is 1 longer than necessary, so no check needed here) + let initial_estimate = heuristic_costs[problem_offset+1]; + + // initialize this subsolver with the estimate from the _next_ node; it has to be >= that value + let initial_node = AstarNode::new(initial_estimate); + let initial_priority = initial_node.get_priority(); + pqueue.push(initial_node, initial_priority); + + let mut next_expected: usize = 0; + let mut max_cost_so_far: u64 = 0; + + // we want a base level queue length, but it also needs to grow *slightly* 
with the length of the problem + let max_visits: usize = min_queue_size + queue_increment * problem_size; + let mut nodes_visited: usize = 0; + + //we loop as long as the next entry is shorter than the problem size + while pqueue.peek().unwrap().0.get_allele_count() < problem_size && nodes_visited < max_visits { + let (top_node, _top_priority) = pqueue.pop().unwrap(); + let allele_count: usize = top_node.get_allele_count(); + nodes_visited += 1; + + if allele_count == next_expected { + // debug!("B#{} ({}/{}, {:?} {}/{}) => {:?}", phase_block.get_block_index(), next_expected, num_variants, top_priority, hap_tracker.len(), pqueue.len(), top_node); + max_cost_so_far = max_cost_so_far.max(top_node.get_total_cost()); + next_expected += 1; + } + + if bad_variants[problem_offset+allele_count] { + // the next variant we want to add to the haplotype is a bad variant, so skip it + let new_node = AstarNode::new_extended_node( + next_node_index, + &top_node, 2, 2, + heuristic_costs[problem_offset+allele_count+1], + read_segments, + problem_offset + ); + next_node_index += 1; + + // the new node should have identical total cost + assert_eq!(top_node.get_total_cost(), new_node.get_total_cost()); + + // add it to the queue + let new_priority = new_node.get_priority(); + pqueue.push(new_node, new_priority); + } else { + // we didn't exit, so we need to add all expansions of this allele + let hap_order = [(0, 1), (1, 0), (0, 0), (1, 1)]; + for &(h1, h2) in hap_order.iter() { + // we don't want to add both 0-1 and 1-0 if the haplotypes before are identical; it doubles our work + // so skip 1-0 if the haplotypes in the node are identical + if !(h1 == 1 && h2 == 0 && top_node.is_identical_haplotypes()) { + // generate a new node and add to the queue + let new_node = AstarNode::new_extended_node( + next_node_index, + &top_node, h1, h2, + heuristic_costs[problem_offset+allele_count+1], + read_segments, + problem_offset + ); + next_node_index += 1; + + // add it to the queue + let 
new_priority = new_node.get_priority(); + pqueue.push(new_node, new_priority); + } + } + } + } + + if pqueue.peek().unwrap().0.get_allele_count() == problem_size { + // loop terminated because we reached our problem size + let (top_node, _top_priority) = pqueue.peek().unwrap();//pqueue.pop().unwrap(); + max_cost_so_far = max_cost_so_far.max(top_node.get_total_cost()); + next_expected += 1; + } else { + // we exited early, so whatever max we found is what we have + } + + (max_cost_so_far, next_expected-1) +} + +/// A result for a phasing algorithm, assumes diploid solution currently. +pub struct AstarResult { + /// The first haplotype in the solution. + pub haplotype_1: Vec, + /// The second haplotype in the solution. + pub haplotype_2: Vec, + /// Optional statistics from the problem + pub statistics: PhaseStats +} + +/// Returns a phasing result by performing an A* tree search algorithm to calculate the best phase solution for a phase block. +/// This algorithm currently uses a fixed heuristic based on the distance to the end. +/// It also has a pruning strategy that fixes the priority queue size based on the farthest explored node so far. 
+/// # Arguments +/// * `phase_block` - the phase block summary information +/// * `variants` - the variants in the block, primarily passed-through to solution +/// * `read_segments` - interval tree of the reads that serve as data points for the phasing algorithm +/// * `min_queue_size` - the minimum length of the queue +/// * `queue_increment` - the length that the queue grows as more variants are added to the solution +pub fn astar_solver( + phase_block: &PhaseBlock, variants: &[Variant], read_segments: &IntervalTree, + min_queue_size: usize, queue_increment: usize +) -> AstarResult { + // we use this a lot + let num_variants: usize = variants.len(); + + // this is a sanity check for now that verifies that all read segments have all ignored variants set to 3 in that position + // there is technically a cost here, but it is negligible for our sanity while debugging + for rse in read_segments.find(0..usize::MAX) { + let segment = rse.data(); + let alleles = segment.alleles(); + for (var_index, v) in variants.iter().enumerate() { + if v.is_ignored() { + assert!(alleles[var_index] == 3); + } + } + } + + // add all the bad variants to this list that will seed the heuristic calculation list + let bad_variants: Vec = variants.iter() + .map(|v| v.is_ignored()) + .collect(); + let opt_bad_variants: Option> = Some(bad_variants); + + // our queue has a flat size + a buffer for each variant encountered so far + let mut curr_queue_size_threshold: usize = min_queue_size; + let full_prune_enabled: bool = true; // this will make sure the pqueue.len() field stays below the `max_queue_size` for conserving memory + + // set max queue size to either 2 * the highest functional queue size OR a large constant, whichever is greater + // the large constant help prevent over-use of the full pruning algorithm if the base queue values are small + let max_queue_size: usize = (2 * (min_queue_size + queue_increment * num_variants)).max(10000); + let mut min_progress: usize = 0; + let mut 
pqueue: PriorityQueue, u64, Reverse)> = PriorityQueue::new(); + let mut hap_tracker: PQueueHapTracker = PQueueHapTracker::new(num_variants); + let mut next_expected = 0; + + // calculate a heuristic by looking ahead 40 variants + // TODO: do we want to make this a parameter at some point? + let max_segment_size: usize = 40; + let (heuristic_costs, bad_variants): (Vec, Vec) = calculate_astar_heuristic( + num_variants, max_segment_size, read_segments, min_queue_size, queue_increment, opt_bad_variants + ); + debug!("Heuristics(<={}): {:?}", max_segment_size, heuristic_costs); + debug!("Bad variants: {:?}", bad_variants); + + // sanity check, make sure any ignore variants are flagged in bad_variants now + for (i, v) in variants.iter().enumerate() { + if v.is_ignored() { + assert!(bad_variants[i]); + } + } + + // statistics we want to gather + let mut num_pruned: u64 = 0; + let estimated_cost: u64 = heuristic_costs[0]; + + // now, the core looping algorithm + let initial_node = AstarNode::new(heuristic_costs[0]); + let initial_priority = initial_node.get_priority(); + pqueue.push(initial_node, initial_priority); + hap_tracker.add_hap(0); + let mut next_node_index: u64 = 1; + + //we loop as long as the next entry is shorter than the expected number of variants + while pqueue.peek().unwrap().0.get_allele_count() < num_variants { + let (top_node, top_priority) = pqueue.pop().unwrap(); + let allele_count: usize = top_node.get_allele_count(); + hap_tracker.remove_hap(allele_count); + trace!("popped: {:?} => {:?}, {:?}, {:?}", top_priority, top_node, top_node.get_h1(), top_node.get_h2()); + if allele_count == next_expected { + debug!("B#{} ({}/{}, {:?} {}/{}) => {:?}", phase_block.get_block_index(), next_expected, num_variants, top_priority, hap_tracker.len(), pqueue.len(), top_node); + next_expected += 1; + if num_pruned == 0 { + curr_queue_size_threshold += queue_increment; + assert_eq!(curr_queue_size_threshold, min_queue_size + queue_increment * next_expected); + } + } + 
+ // if a node has fewer alleles than our minimum progression, it gets cut + if allele_count < min_progress { + //println!("Pruning {:?} {:?}", top_priority, top_node); + if num_pruned == 0 { + // first time we do this, we need to clear our queue size back to the minimum + curr_queue_size_threshold = min_queue_size; + } + num_pruned += 1; + continue; + } + + if bad_variants[allele_count] { + // we are skipping this variant, generate a new node with unassigned values and add it back to the queue + let new_node = AstarNode::new_extended_node( + next_node_index, + &top_node, 2, 2, + heuristic_costs[allele_count+1], + read_segments, + 0 + ); + next_node_index += 1; + + let new_priority = new_node.get_priority(); + assert_eq!(top_node.get_total_cost(), new_node.get_total_cost()); + pqueue.push(new_node, new_priority); + hap_tracker.add_hap(allele_count+1); + } else { + // we didn't exit, so we need to add all expansions of this allele + // these are ordered such that heterozygous options come first + let hap_order = [(0, 1), (1, 0), (0, 0), (1, 1)]; + for &(h1, h2) in hap_order.iter() { + // we don't want to add both 0-1 and 1-0 if the haplotypes before are identical; it doubles our work + // so skip 1-0 if the haplotypes in the node are identical + if !(h1 == 1 && h2 == 0 && top_node.is_identical_haplotypes()) { + // generate a new node and add to the queue + let new_node = AstarNode::new_extended_node( + next_node_index, + &top_node, h1, h2, + heuristic_costs[allele_count+1], + read_segments, + 0 + ); + next_node_index += 1; + + let new_priority = new_node.get_priority(); + trace!("Pushing {:?}, {:?}, {:?}", new_node, new_node.get_h1(), new_node.get_h2()); + pqueue.push(new_node, new_priority); + hap_tracker.add_hap(allele_count+1); + } + } + } + + // check if we need to increase our minimum progression to prune off some nodes in the queue + if hap_tracker.len() > curr_queue_size_threshold && min_progress < next_expected { + min_progress += 1; + debug!("B#{} 
min_progress={}", phase_block.get_block_index(), min_progress); + hap_tracker.increase_threshold(min_progress); + + // check if our literal queue is holding too much data + if full_prune_enabled && pqueue.len() > max_queue_size { + // this means there are a lot of short haps we will eventually prune but haven't yet due to relative high cost + // mark them as the "best" priority so they get deleted right away in the next few loops + // this can be expensive, and has no functional benefit other than freeing memory + // worst case HG001 test without this got up to 60GB + let mut prune_count: usize = 0; + for (node, priority) in pqueue.iter_mut() { + if node.get_allele_count() < min_progress { + *priority = node.get_cleared_priority(); + prune_count += 1; + } + } + // when the iter_mut() is released, the pqueue automatically re-prioritizes itself, magic! + debug!("B#{} Full prune triggered for {} nodes.", phase_block.get_block_index(), prune_count); + } + } + } + + let (top_node, top_priority) = pqueue.pop().unwrap(); + let allele_count: usize = top_node.get_allele_count(); + hap_tracker.remove_hap(allele_count); + if allele_count == num_variants { + // successful full solution + debug!("B#{} ({}/{}, {:?} {}/{}) => {:?}", phase_block.get_block_index(), next_expected, num_variants, top_priority, hap_tracker.len(), pqueue.len(), top_node); + let haplotype_1 = top_node.get_h1().to_vec(); + let haplotype_2 = top_node.get_h2().to_vec(); + let actual_cost: u64 = top_node.get_total_cost(); + + // gather stats on how many variants were phased, skipped, or homozygous in this block + let mut phased_variants = 0; + let mut phased_snvs = 0; + let mut homozygous_variants = 0; + let mut skipped_variants = 0; + for (i, (&h1, &h2)) in haplotype_1.iter().zip(haplotype_2.iter()).enumerate() { + if h1 != h2 { + phased_variants += 1; + if variants[i].get_type() == VariantType::Snv { + phased_snvs += 1; + } + } else if h1 == 2 { + // they are both equal to 2 + skipped_variants += 1; + } 
else { + homozygous_variants += 1; + } + } + debug!("B#{} phased: {}, homozygous: {}, skipped: {}", phase_block.get_block_index(), phased_variants, homozygous_variants, skipped_variants); + + let statistics: PhaseStats = PhaseStats::astar_new( + num_pruned, estimated_cost, actual_cost, + phased_variants, phased_snvs, homozygous_variants, skipped_variants + ); + + // send it all back + AstarResult { + haplotype_1, + haplotype_2, + statistics + } + } else { + // given our current algorithm setup, this should never happen; we will always find _locally_ optimal paths through the tree + panic!("B#{} failed to find solution; ({}/{}), {:?} {} => {:?}", phase_block.get_block_index(), next_expected, num_variants, top_priority, pqueue.len(), top_node); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Returns two full-length reads: the first is all 0s with qual 2 and a second is all 1s with qual 3 + /// # Arguments + /// * `num_alleles` - the length of each read + fn get_simple_reads(num_alleles: usize) -> IntervalTree { + let rs1 = ReadSegment::new( + "read_name".to_string(), + vec![0; num_alleles], + vec![2; num_alleles] + ); + let rs2 = ReadSegment::new( + "read_name_2".to_string(), + vec![1; num_alleles], + vec![3; num_alleles] + ); + let seg_vec = vec![rs1, rs2]; + + let mut read_segments: IntervalTree = IntervalTree::new(); + for rs in seg_vec.into_iter() { + read_segments.insert(rs.get_range(), rs); + } + read_segments + } + + #[test] + fn test_astarnode() { + // all 0 read with quals = 2 and an all 1 read with quals = 3 + let num_alleles = 4; + + // it doesn't matter that these are wrong for testing + let heuristic_costs: Vec = (0..(num_alleles+1)).map(|i| (num_alleles - i) as u64).collect(); + let hap_offset = 0; + let read_segments = get_simple_reads(num_alleles); + + // test an all 0-hom mode + let mut current_node = AstarNode::new(heuristic_costs[0]); + for i in 0..num_alleles { + let node_index: u64 = (i as u64)+1; + let num_hets: u64 = 0; + let 
next_node = AstarNode::new_extended_node( + node_index, + ¤t_node, 0, 0, + heuristic_costs[i+1], + &read_segments, + hap_offset + ); + + let expected_cost = heuristic_costs[i+1] + 3 * node_index; + assert_eq!(next_node.get_node_index(), node_index); + + let expected_frozen = if i == num_alleles - 1 { + 3 * num_alleles as u64 + } else { + 0 + }; + assert_eq!(next_node.get_frozen_cost(), expected_frozen); + assert_eq!(next_node.get_total_cost(), expected_cost); + assert_eq!(next_node.get_priority(), (Reverse(expected_cost), num_hets, Reverse(node_index))); + assert_eq!(next_node.get_cleared_priority(), (Reverse(0), num_hets, Reverse(node_index))); + assert_eq!(next_node.get_h1().to_vec(), vec![0; i+1]); + assert_eq!(next_node.get_h2().to_vec(), vec![0; i+1]); + assert_eq!(next_node.get_allele_count(), i+1); + assert_eq!(next_node.get_num_hets(), num_hets); + assert!(next_node.is_identical_haplotypes()); + current_node = next_node; + } + + // test an all het (0|1) mode + let mut current_node = AstarNode::new(heuristic_costs[0]); + for i in 0..num_alleles { + let node_index: u64 = (i as u64)+1; + let num_hets: u64 = node_index; + let next_node = AstarNode::new_extended_node( + node_index, + ¤t_node, 0, 1, + heuristic_costs[i+1], + &read_segments, + hap_offset + ); + + let expected_cost = heuristic_costs[i+1]; + assert_eq!(next_node.get_node_index(), node_index); + + let expected_frozen = 0; + assert_eq!(next_node.get_frozen_cost(), expected_frozen); + assert_eq!(next_node.get_total_cost(), expected_cost); + assert_eq!(next_node.get_priority(), (Reverse(expected_cost), num_hets, Reverse(node_index))); + assert_eq!(next_node.get_cleared_priority(), (Reverse(0), num_hets, Reverse(node_index))); + assert_eq!(next_node.get_h1().to_vec(), vec![0; i+1]); + assert_eq!(next_node.get_h2().to_vec(), vec![1; i+1]); + assert_eq!(next_node.get_allele_count(), i+1); + assert_eq!(next_node.get_num_hets(), num_hets); + assert!(!next_node.is_identical_haplotypes()); + current_node = 
next_node; + } + + // test an all 1-hom mode + let mut current_node = AstarNode::new(heuristic_costs[0]); + for i in 0..num_alleles { + let node_index: u64 = (i as u64)+1; + let num_hets: u64 = 0; + let next_node = AstarNode::new_extended_node( + node_index, + ¤t_node, 1, 1, + heuristic_costs[i+1], + &read_segments, + hap_offset + ); + + let expected_cost = heuristic_costs[i+1] + 2 * node_index; + assert_eq!(next_node.get_node_index(), node_index); + + let expected_frozen = if i == num_alleles - 1 { + 2 * num_alleles as u64 + } else { + 0 + }; + assert_eq!(next_node.get_frozen_cost(), expected_frozen); + assert_eq!(next_node.get_total_cost(), expected_cost); + assert_eq!(next_node.get_priority(), (Reverse(expected_cost), num_hets, Reverse(node_index))); + assert_eq!(next_node.get_cleared_priority(), (Reverse(0), num_hets, Reverse(node_index))); + assert_eq!(next_node.get_h1().to_vec(), vec![1; i+1]); + assert_eq!(next_node.get_h2().to_vec(), vec![1; i+1]); + assert_eq!(next_node.get_allele_count(), i+1); + assert_eq!(next_node.get_num_hets(), num_hets); + assert!(next_node.is_identical_haplotypes()); + current_node = next_node; + } + } + + #[test] + fn test_pqueuehaptracker() { + let mut tracker: PQueueHapTracker = PQueueHapTracker::new(10); + for i in 0..11 { + tracker.add_hap(i); + } + // make sure length matches everything so far + assert_eq!(tracker.len(), 11); + + // try basic subtraction + tracker.remove_hap(3); + assert_eq!(tracker.len(), 10); + + // try threshold increasing + tracker.increase_threshold(4); + assert_eq!(tracker.len(), 7); + + // make sure removal of pre-threshold data doesn't change our len() + for i in 0..3 { + tracker.remove_hap(i); + assert_eq!(tracker.len(), 7); + } + + // make sure adding pre-threshold data doeesn't change our len() + tracker.add_hap(0); + assert_eq!(tracker.len(), 7); + + // make sure increasing threshold to the same value does nothing + tracker.increase_threshold(4); + assert_eq!(tracker.len(), 7); + } +} \ No newline 
at end of file diff --git a/src/block_gen.rs b/src/block_gen.rs new file mode 100644 index 0000000..5d86ea8 --- /dev/null +++ b/src/block_gen.rs @@ -0,0 +1,1063 @@ + +use crate::data_types::variants::{VariantType, Zygosity}; + +use log::{debug, trace, warn}; +use priority_queue::PriorityQueue; +use rust_htslib::{bam,bcf,htslib}; +use rust_htslib::bcf::record::GenotypeAllele; +use rustc_hash::{FxHashMap as HashMap, FxHashSet as HashSet}; +use simple_error::{SimpleError, bail}; +use std::cell::RefCell; +use std::cmp::Reverse; +use std::path::{Path, PathBuf}; + +/// Uses for phase block priority during multi iteration +type PhaseBlockPriority = Reverse<(u32, u64, u64)>; + +/// Gets a list of sample names from a given VCF file +/// # Arguments +/// * `filename` - the VCF file to load +/// # Errors +/// * if the filename fails to load as a VCF +/// * if the sample name fails to parse from utf8 +pub fn get_vcf_samples(filename: &Path) -> Result, Box> { + use rust_htslib::bcf::Read; + let vcf_reader: bcf::IndexedReader = bcf::IndexedReader::from_path(filename)?; + let vcf_header: bcf::header::HeaderView = vcf_reader.header().clone(); + let mut sample_names = vec![]; + for sv in vcf_header.samples().iter() { + let vcf_sample_string: String = std::str::from_utf8(sv)?.to_string(); + sample_names.push(vcf_sample_string); + } + Ok(sample_names) +} + +/// Iterates through a collection of BAM files and finds the ones matching the given sample name +/// # Arguments +/// * `all_bam_files` - the full list of BAM files to parse +/// * `sample_name` - the name we want to match +/// * `reference_filename` - the path to the reference file +/// # Errors +/// * if BAM has no RG tags +/// * if an RG tag has no SM tag +/// * if multiple SM tags are detected +pub fn get_sample_bams(all_bam_files: &[PathBuf], sample_name: &str, reference_filename: &Path) -> Result<(Vec, Vec), Box> { + use rust_htslib::bam::Read; + let mut sample_bams: Vec = vec![]; + let mut bam_indices: Vec = vec![]; + for 
(bam_index, bam_fn) in all_bam_files.iter().enumerate() { + let bam_reader = { + let mut b = bam::IndexedReader::from_path(bam_fn)?; + b.set_reference(reference_filename)?; + b + }; + let bam_header = bam::header::Header::from_template(bam_reader.header()); + let header_hashmap = bam_header.to_hashmap(); + let empty_vec = vec![]; + let read_groups = header_hashmap.get("RG").unwrap_or(&empty_vec); + if read_groups.is_empty() { + bail!("BAM file has no read groups (RG) tag: {}", bam_fn.to_string_lossy()); + } + + // there is only one read group + let mut bam_sample_name: Option = None; + for read_group in read_groups.iter() { + let rg_sample_name = match read_group.get("SM") { + Some(s) => s, + None => { + bail!("BAM file has read group with no sample name (SM) tag: {}", bam_fn.to_string_lossy()); + } + }; + match bam_sample_name.as_ref() { + Some(s) => { + if rg_sample_name != s { + bail!("BAM file with multiple sample reads groups detected, this is not supported: {}", bam_fn.to_string_lossy()); + } + }, + None => { + bam_sample_name = Some(rg_sample_name.clone()); + } + }; + } + + if bam_sample_name.unwrap() == sample_name { + sample_bams.push(bam_fn.clone()); + bam_indices.push(bam_index); + } + } + Ok((sample_bams, bam_indices)) +} + +/// Returns true if an alignment record should be filtered out, aka ignored. +/// Main reasons for ignoring are if it is unmapped, secondary, failed QC, a duplicate, or has too low of a MAPQ. +/// # Arguments +/// * `record` - the record of interest +/// * `min_mapq` - the minimum MAPQ we allow +pub fn filter_out_alignment_record(record: &bam::Record, min_mapq: u8) -> bool { + static FLAG_FILTER: u32 = + htslib::BAM_FUNMAP | htslib::BAM_FSECONDARY | htslib::BAM_FQCFAIL | htslib::BAM_FDUP; + + ((record.flags() as u32) & FLAG_FILTER) != 0 || record.mapq() < min_mapq +} + +/// Returns true if a VCF record can be included in phasing based on provided criteria. 
+/// Homozygous reference variants are always excluded unless `is_hom_allowed==true`. +/// Calls with anything missing (e.g. "./.") are also excluded. +/// Calls that match an unknown/unhandled variant type are also excluded. +/// # Arguments +/// * `record` - the variant record to check +/// * `sample_index` - the sample index, always 0 for single-sample VCFs +/// * `min_quality` - the minimum GQ for an acceptable variant +/// * `is_hom_allowed` - if true, then homozygous ALT variants are allowed, provided they meet the other criteria +/// # Errors +/// * if zygosity cannot be loaded +/// * if a call does not have a GQ tag +pub fn is_phasable_variant(record: &bcf::Record, sample_index: usize, min_quality: i32, is_hom_allowed: bool) -> Result> { + // check if this variant is heterozygous + let zygosity: Zygosity = get_variant_zygosity(record, sample_index)?; + if zygosity == Zygosity::Unknown || zygosity == Zygosity::HomozygousReference || ( + zygosity == Zygosity::HomozygousAlternate && !is_hom_allowed + ) { + // if unknown or homozygous reference, we definitely return false + // if it's homozygous alternate, we also need to check if homs are allowed + Ok(false) + } else { + // heterozygous, check if the variant call is of sufficient quality + match record.format(b"GQ").integer() { // for some reason, calling .float() here will error + Ok(all_gq) => { + let call_quality: i32 = all_gq[sample_index][0]; + if call_quality < min_quality { + return Ok(false); + } + }, + Err(_) => { + // usually means there is not a GQ tag, so skip this check + // TODO: how do we long-term want to handle this for our variants? + // trace! 
added mostly so clippy stops yelling at me + trace!("Variant found without GQ tag {:?}", record); + } + } + + // heterozygous variant, check that the type is allowed + let variant_type = get_variant_type(record)?; + match variant_type { + VariantType::Snv | + VariantType::Insertion | + VariantType::Deletion | + VariantType::Indel | + VariantType::SvInsertion | + VariantType::SvDeletion | + VariantType::TandemRepeat => { Ok(true) }, + + VariantType::SvDuplication | + VariantType::SvInversion | + VariantType::SvBreakend | + VariantType::Unknown=> { Ok(false) } + } + } +} + +/// Looks at a bcf record and return the zygosity. Any "." alleles lead to Unknown zygosity results. +/// # Arguments +/// * `record` - the record to parse +/// * `sample_index` - the sample index, always 0 for single-sample VCFs +/// # Errors +/// * if rust_htslib fails to parse the genotype +/// * if the genotype field is completely empty +pub fn get_variant_zygosity(record: &bcf::Record, sample_index: usize) -> Result> { + let all_genotypes = record.genotypes()?; + let genotype = all_genotypes.get(sample_index); + + // if the genotype field is completely empty, something is wrong with the VCF + if genotype.is_empty() { + let chromosome = match record.rid() { + Some(rid) => { + let header = record.header(); + match header.rid2name(rid) { + Ok(name) => std::str::from_utf8(name).unwrap_or("FROMUTF8_ERROR"), + Err(_e) => "RID2NAME_ERROR" + } + }, + None => "NO_RID" + }; + bail!("Encountered empty GT field for record: {}:{}", chromosome, record.pos()); + } + + let gt1 = match genotype[0] { + GenotypeAllele::Unphased(at) => at, + GenotypeAllele::Phased(at) => at, + //TODO: ignore these for now, not sure how to handle it? 
+ GenotypeAllele::UnphasedMissing => return Ok(Zygosity::Unknown), + GenotypeAllele::PhasedMissing => return Ok(Zygosity::Unknown) + }; + + let gt2 = if genotype.len() == 1 { + // if the genotype has only one entry, we will just assume that gt2 is identical to gt1 + // this basically converts all single-entry genotypes into some Homozygous state + gt1 + } else { + match genotype[1] { + GenotypeAllele::Unphased(at) => at, + GenotypeAllele::Phased(at) => at, + //TODO: ignore these for now, not sure how to handle it? + GenotypeAllele::UnphasedMissing => return Ok(Zygosity::Unknown), + GenotypeAllele::PhasedMissing => return Ok(Zygosity::Unknown) + } + }; + let zygosity = if gt1 == gt2 { + if gt1 == 0 { + Zygosity::HomozygousReference + } else { + Zygosity::HomozygousAlternate + } + } else { + Zygosity::Heterozygous + }; + Ok(zygosity) +} + +/// Returns a variant type based on the alleles in the VCF. +/// # Arguments +/// * `record` - the variant record to check +pub fn get_variant_type(record: &bcf::Record) -> Result> { + // check if this has an SVTYPE field and parse into an SV type if it does + let svtype_result = record.info("SVTYPE".as_bytes()).string(); + match svtype_result { + Ok(svtype_option) => { + if let Some(svtype) = svtype_option { + // svtype is an array of strings at this point, make sure we only get one + assert_eq!(svtype.len(), 1); + + // make sure these only have one ALT allele + let num_alleles = record.alleles().len(); + assert_eq!(num_alleles, 2); + + let svtype_str = std::str::from_utf8(svtype[0]).unwrap(); + let sv_tag = match svtype_str { + "DEL" => { + VariantType::SvDeletion + }, + "INS" => { + VariantType::SvInsertion + }, + "DUP" => { + VariantType::SvDuplication + }, + "INV" => { + VariantType::SvInversion + }, + "BND" => { + VariantType::SvBreakend + }, + _ => { + bail!("Unhandled SVTYPE tag: {:?}", svtype_str); + } + }; + return Ok(sv_tag); + }; + }, + Err(rust_htslib::errors::Error::BcfUndefinedTag{ tag: _ }) => {}, + Err(e) => { + // 
no SVTYPE entry, so we assume it matches SNV or indel models + bail!("Unexpected error: {:?}", e); + } + } + + let trid_result = record.info("TRID".as_bytes()).string(); + match trid_result { + Ok(_trid) => { + // we found a TRID field this is a tandem repeat + return Ok(VariantType::TandemRepeat); + }, + Err(rust_htslib::errors::Error::BcfUndefinedTag{ tag: _ }) => {}, + Err(e) => { + // no SVTYPE entry, so we assume it matches SNV or indel models + bail!("Unexpected error: {:?}", e); + } + } + // TODO: we may eventually need to add a check that verifies that the only REF and ALT alleles at this point are in + // the normal ACGTN alphabet + + // if we have no ALT alleles and know tags to inform us, we have know idea what this is + if record.alleles().len() <= 1 { + return Ok(VariantType::Unknown); + } + + // reference length is pulled out first, then we can look at the other alleles + let ref_len = record.alleles()[0].len(); + + // we only care about max ALT length when defining small variant type + let max_alt_len = record.alleles().iter().skip(1) + .map(|a| a.len()) + .max() + .unwrap(); + + Ok(if ref_len == 1 { + if max_alt_len == 1 { + VariantType::Snv + } else { + VariantType::Insertion + } + } else if max_alt_len == 1 { + VariantType::Deletion + } else { + VariantType::Indel + }) +} + +/// Defines a subset of the total reference space that is a single phasing problem or "block". +/// Each block has at least 1 read spanning from one variant to the next. +#[derive(Clone, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct PhaseBlock { + // NOTE: order matters here because we're deriving the comparisons + /// An index of the block, important for maintaining output order downstream. + block_index: usize, + /// The chromosome of the block. + chrom: String, + /// The chromosome index in the first VCF file + chrom_index: u32, + /// The coordinate of the first variant in the block, inclusive. 
+ start: u64, + /// The coordinate of the last variant in the block, inclusive. + end: u64, + /// The total number of variants in the block so far. + num_variants: usize, + /// The VCF index of the first variant in the block + first_variant_vcf: usize, + /// The minimum quality of variants that were included + min_quality: i32, + /// The sample name within the VCF that this block corresponds to + sample_name: String +} + +impl std::fmt::Debug for PhaseBlock { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // main purpose of custom was to munge the coordinates into a single string + let mut result = f.debug_struct("PhaseBlock"); + result.field("block_index", &self.block_index) + .field("coordinates", &format!("{}:{}-{}", self.chrom, self.start, self.end)) + .field("num_variants", &self.num_variants); + if self.min_quality > 0 { + // theres no real reason to spit this out unless it is doing something + result.field("min_quality", &self.min_quality); + } + result.field("sample_name", &self.sample_name) + .finish() + } +} + +impl PhaseBlock { + /// Initializes a phase block with no variants + /// # Arguments + /// * `block_index` - the index of this block + /// * `chrom` - the chromosome of the phase block + /// * `chrom_index` - the chromosome index in the VCF file, for ordering + /// * `min_quality` - the minimum quality to include a variant in this phase block + /// * `sample_name` - the name of the sample in the VCF(s) that this block info corresponds to + pub fn new(block_index: usize, chrom: String, chrom_index: u32, min_quality: i32, sample_name: String) -> PhaseBlock { + PhaseBlock { + block_index, + chrom, + chrom_index, + start: 0, + end: 0, + num_variants: 0, + first_variant_vcf: 0, + min_quality, + sample_name + } + } + + pub fn get_block_index(&self) -> usize { + self.block_index + } + + pub fn set_block_index(&mut self, new_index: usize) { + self.block_index = new_index; + } + + pub fn get_chrom(&self) -> &str { + &self.chrom + } 
+ + pub fn get_chrom_index(&self) -> u32 { + self.chrom_index + } + + pub fn get_start(&self) -> u64 { + self.start + } + + pub fn get_end(&self) -> u64 { + self.end + } + + pub fn get_num_variants(&self) -> usize { + self.num_variants + } + + pub fn get_first_variant_vcf(&self) -> usize { + self.first_variant_vcf + } + + pub fn get_min_quality(&self) -> i32 { + self.min_quality + } + + /// Returns the number of contained base pairs in the phase block. + pub fn bp_len(&self) -> u64 { + self.end - self.start + 1 + } + + pub fn sample_name(&self) -> &str { + &self.sample_name + } + + /// Add a single-position variant to the phase block, will panic if the chromosome does not match + /// # Arguments + /// * `chrom` - the chromosome string + /// * `pos` - the position of the variant + /// * `vcf_index` - the index of the VCF this variant comes from; use 0 if only one VCF is being used + pub fn add_locus_variant(&mut self, chrom: &str, pos: u64, vcf_index: usize) { + assert_eq!(self.chrom, chrom, "PhaseBlock chromosomes are not equal: \"{}\" \"{}\"", self.chrom, chrom); + //first condition unlikely to happen; second happens on an empty block + if self.start > pos || self.num_variants == 0 { + self.start = pos; + } + //most common case, extend to the right + if self.end < pos { + self.end = pos; + } + self.num_variants += 1; + + if self.num_variants == 1 { + self.first_variant_vcf = vcf_index; + } + } + + /// Checks if a given start/end overlaps the existing phase block + /// # Arguments + /// * `other_start` - the start position, inclusive + /// * `other_end` - the end position, exclusive + pub fn is_overlapping(&self, other_start: u64, other_end: u64) -> bool { + let max_start = self.start.max(other_start); + let min_end = (self.end+1).min(other_end); + max_start < min_end + } +} + +/// Iterator that will generate phase blocks consisting of a single "problem" to phase +pub struct PhaseBlockIterator { + /// The index of the next block to yield + next_block_index: usize, 
+ /// The primary traversal reader + ref_vcf_readers: Vec>, + /// A copy of the VCF header, cached here for performance + vcf_headers: Vec, + /// The name of the sample we care about in the VCF + sample_name: String, + /// The indices in the VCF file corresponding to `sample_name` + sample_indices: Vec, + /// Secondary traversals needed to figure out which variants can be phased + bam_readers: Vec>, + /// Index is based on the bcf::IndexedReader lookups + chrom_index: u32, + /// Position is as well, 0-based + chrom_position: u64, + /// The minimum allowed variant quality + min_quality: i32, + /// The minimum MAPQ to include a read + min_mapq: u8, + /// The minimum number of reads spanning two loci to connect them into a block + min_spanning_reads: usize, + /// if true, then supplemental mappings are allowed to join blocks + allow_supplemental_joins: bool, + /// Statistics on encountered variants while we iterate + variant_stats: HashMap<(u32, VariantType, Zygosity), usize> +} + +impl PhaseBlockIterator { + /// Creates a new `PhaseBlockIterator` from a VCF file and collection of BAM files. 
+ /// # Arguments + /// * `vcf_paths` - the VCF files to load variants from, must be zipped and indexed + /// * `bam_paths` - the BAM files to load reads from, must be indexed + /// * `reference_filename` - the reference genome filename + /// * `sample_name` - the sample name in the VCF file + /// * `min_quality` - the minimum quality to include a variant in a phase block + /// * `min_mapq` - the minimum MAPQ to include a read + /// * `min_spanning_reads` - the minimum number of reads that must span two adjacent variants to be joined into a phase block + /// * `allow_supplemental_joins` - if True, supplemental mappings are used for extending blocks + /// * `thread_pool` - a shared thread pool for BAM I/O + #[allow(clippy::too_many_arguments)] + pub fn new( + vcf_paths: &[PathBuf], bam_paths: &[PathBuf], reference_filename: &Path, + sample_name: String, + min_quality: i32, min_mapq: u8, min_spanning_reads: usize, + allow_supplemental_joins: bool, + thread_pool: &rust_htslib::tpool::ThreadPool + ) -> Result> { + // needed for header() extraction + use rust_htslib::bcf::Read; + + let mut ref_vcf_readers: Vec> = vec![]; + let mut vcf_headers: Vec = vec![]; + let mut vcf_contigs: Vec> = vec![]; + + let mut sample_indices: Vec = vec![]; + + for path in vcf_paths.iter() { + let vcf_reader: bcf::IndexedReader = bcf::IndexedReader::from_path(path)?; + let vcf_header: bcf::header::HeaderView = vcf_reader.header().clone(); + let ref_vcf_reader: RefCell = RefCell::new(vcf_reader); + + // first, check the sample names + let sample_index = { + let mut lookup_index: Option = None; + for (sample_index, &vcf_sample) in vcf_header.samples().iter().enumerate() { + let vcf_sample_string: String = std::str::from_utf8(vcf_sample).unwrap().to_string(); + if vcf_sample_string == sample_name { + lookup_index = Some(sample_index); + break; + } + } + match lookup_index { + Some(index) => { + index + }, + None => { + bail!("Sample name {:?} was not found in VCF: {:?}", sample_name, path); + } 
+ } + }; + + let contig_count: usize = vcf_header.contig_count() as usize; + let contigs: HashSet = (0..contig_count) + .map(|i| + std::str::from_utf8( + vcf_header.rid2name(i as u32).unwrap() + ).unwrap().to_string() + ) + .collect(); + + // push everything to our lists + ref_vcf_readers.push(ref_vcf_reader); + vcf_headers.push(vcf_header); + vcf_contigs.push(contigs); + sample_indices.push(sample_index); + } + + // check that our chromosome entries are a match, if not we explode + let first_chromosomes = &vcf_contigs[0]; + for other_chroms in vcf_contigs.iter().skip(1) { + if first_chromosomes != other_chroms { + bail!("Contig sets in the VCF files do not match"); + } + } + + // open up the bam files as well + let mut bam_readers: Vec> = vec![]; + for path in bam_paths.iter() { + use rust_htslib::bam::Read; + let mut bam_reader = bam::IndexedReader::from_path(path)?; + bam_reader.set_reference(reference_filename)?; + bam_reader.set_thread_pool(thread_pool)?; + bam_readers.push(RefCell::new(bam_reader)); + } + assert!(min_spanning_reads > 0); + + debug!("Sample \"{}\" VCF indices: {:?}", sample_name, sample_indices); + + Ok(PhaseBlockIterator { + next_block_index: 0, + ref_vcf_readers, + vcf_headers, + sample_name, + sample_indices, + bam_readers, + chrom_index: 0, + chrom_position: 0, + min_quality, + min_mapq, + min_spanning_reads, + allow_supplemental_joins, + variant_stats: Default::default() + }) + } + + pub fn sample_indices(&self) -> &[usize] { + &self.sample_indices + } + pub fn sample_name(&self) -> &str { + &self.sample_name + } + + /// Retrieves variant counts from all parsed variants (whether included or not), best if used after done iterating + pub fn variant_stats(&self) -> HashMap<(String, VariantType, Zygosity), usize> { + let mut ret: HashMap<(String, VariantType, Zygosity), usize> = Default::default(); + + for (&(chrom_index, variant_type, zygosity), &count) in self.variant_stats.iter() { + //get the chromosome name, we iterate based on the order 
of the first VCF provided + let chrom_name: String = std::str::from_utf8( + self.vcf_headers[0].rid2name(chrom_index).unwrap() + ).unwrap().to_string(); + + ret.insert((chrom_name, variant_type, zygosity), count); + } + + ret + } + + /// Returns the farthest position of reads that spans chrom:pos such that _at least_ `min_read_count` reads cover that position. + /// Currently, supplemental alignments are each handled separately. + /// # Arguments + /// * `chrom` - the chromosome of the locus + /// * `pos` - the position of the locus + fn get_longest_multispan(&self, chrom: &str, pos: u64) -> u64 { + use bio::bio_types::genome::AbstractInterval; + use rust_htslib::bam::Read; + let mut span_list: Vec = vec![]; + for bam_ref in self.bam_readers.iter() { + let mut bam = bam_ref.borrow_mut(); + bam.fetch((chrom, pos, pos+1)).unwrap(); + + // calling .records() is what is triggering the URL warning + for read_entry in bam.records() { + let mut read = read_entry.unwrap(); + + //make sure we care about the alignment + if filter_out_alignment_record(&read, self.min_mapq) { + continue; + } + + // see if this mapping goes farther than anything else so far + read.cache_cigar(); + let full_range = read.range(); + // assertions always checked out, can remove + // assert!(full_range.start == read.pos() as u64); + // assert!(full_range.contains(&pos)); + span_list.push(full_range.end); + } + } + + if span_list.len() < self.min_spanning_reads { + // we don't have enough reads to reach our minimum threshold, so return this position+1 + // the +1 is because we are returning the end of a half-open range that only includes pos + pos + 1 + } else { + span_list.sort(); + span_list[span_list.len() - self.min_spanning_reads] + } + } + + /// Returns true if there are at least `min_read_count` reads that connect from the given position back into the current phase block. 
+ /// # Arguments + /// * `chrom` - the chromosome of the locus + /// * `pos` - the position of the locus + fn is_supplemental_overlap(&self, chrom: &str, pos: u64, phase_block: &PhaseBlock) -> bool { + use rust_htslib::bam::Read; + use rust_htslib::bam::record::{Aux, CigarString, Record}; + use rust_htslib::bam::record::Cigar; + let mut overlap_count: usize = 0; + for bam_ref in self.bam_readers.iter() { + let mut bam = bam_ref.borrow_mut(); + bam.fetch((chrom, pos, pos+1)).unwrap(); + for read_entry in bam.records() { + let read: Record = read_entry.unwrap(); + + // make sure we care about the alignment + if filter_out_alignment_record(&read, self.min_mapq) { + continue; + } + + // check if we have any supplemental alignments + let sa_tag: &str = match read.aux(b"SA") { + Ok(value) => { + match value { + Aux::String(tag) => tag, + _ => panic!("Unexpected tag {value:?}") + } + }, + Err(_) => { + continue; + } + }; + + // there can be multiple, so split on the delimiter and handle each one separately + let sa_strings: Vec<&str> = sa_tag.split_terminator(';').collect(); + for &sa_str in sa_strings.iter() { + // we expect exactly 6 + let sa_frags: Vec<&str> = sa_str.split(',').collect(); + assert_eq!(sa_frags.len(), 6); + + let sa_chrom = sa_frags[0]; + let sa_mapq: u8 = sa_frags[4].parse().unwrap(); + if sa_chrom != chrom || sa_mapq < self.min_mapq { + // different chromosome OR mapq of the SA is too low, skip it + continue; + } + + // convert the start coordinate + CIGAR into an end coordinate + let sa_start: u64 = sa_frags[1].parse().unwrap(); + let mut sa_end: u64 = sa_start; + let cigar: CigarString = CigarString::try_from(sa_frags[3]).unwrap(); + for cigar_value in cigar.iter() { + match cigar_value { + Cigar::SoftClip(_) | Cigar::Ins(_) => {}, + Cigar::Match(c_len) | + Cigar::Del(c_len) | + Cigar::Equal(c_len) | + Cigar::Diff(c_len) => { + sa_end += *c_len as u64; + }, + _ => { + panic!("Unhandled cigar type: {cigar_value:?}"); + } + } + } + + // we have the 
SA start and end, see if it overlaps the existing block + // TODO: this isn't checking for variant overlaps, can we fix that somehow? maybe instead of phase block, we pass in a variant interval tree? + // this is fortunately not a major problem, any falsely connected blocks will get split on the back end + let overlapping: bool = phase_block.is_overlapping(sa_start, sa_end); + if overlapping { + // we found at least one overlap with the block, so increment and go to next read + overlap_count += 1; + break; + } + } + } + } + + // return true if we got enough supplements connecting us + overlap_count >= self.min_spanning_reads + } +} + +impl Iterator for PhaseBlockIterator { + type Item = Result>; + + fn next(&mut self) -> Option>> { + use rust_htslib::bcf::Read; + + // make sure we still have chromosome to iterate on + let num_contigs: u32 = self.vcf_headers[0].contig_count(); + if self.chrom_index < num_contigs { + //get the chromosome name, we iterate based on the order of the first VCF provided + let chrom_name: String = std::str::from_utf8( + self.vcf_headers[0].rid2name(self.chrom_index).unwrap() + ).unwrap().to_string(); + + // initialize with an empty block containing just this chromosome + let mut phase_block: PhaseBlock = PhaseBlock::new( + self.next_block_index, chrom_name.clone(), self.chrom_index, self.min_quality, self.sample_name.clone() + ); + self.next_block_index += 1; + + // initalize the variant queue with one variant from each VCF, any ties in position are broken by VCF input order + let mut variant_queue: PriorityQueue, Reverse)> = PriorityQueue::new(); + let mut vcf_readers: Vec<_> = self.ref_vcf_readers.iter().map(|rvr| rvr.borrow_mut()).collect(); + let mut vcf_iterators: Vec<_> = vec![]; + for (vcf_index, vcf_reader) in vcf_readers.iter_mut().enumerate() { + // fetch the corresponding chrom index for this VCF file (they are not guaranteed to match) + let chrom_index: u32 = self.vcf_headers[vcf_index].name2rid(chrom_name.as_bytes()).unwrap(); 
+ + // fetch our position in the VCF file + match vcf_reader.fetch(chrom_index, self.chrom_position, None) { + Ok(()) => { + // we have entries, so get the first one and queue it + let mut vcf_iter = vcf_reader.records().peekable(); + let first_entry = vcf_iter.peek(); + if let Some(record_result) = first_entry { + let record: &rust_htslib::bcf::Record = match record_result { + Ok(r) => r, + // we have to convert to an owned error here, and the htslib errors are not cloneable + Err(e) => return Some(Err(Box::new(SimpleError::from(e)))) + }; + let position: i64 = record.pos(); + variant_queue.push(vcf_index, (Reverse(position), Reverse(vcf_index))); + }; + + // even if the iterator is empty, we push it so things are lined up correctly + vcf_iterators.push(vcf_iter); + }, + Err(_) => { + // this usually happens when there are no entries for the chromosome + vcf_iterators.push(vcf_reader.records().peekable()); + } + }; + } + + if variant_queue.is_empty() { + // this must be an empty chromosome block because neither iterator found stuff to iterate on + self.chrom_index += 1; + return Some(Ok(phase_block)); + } + + let mut previous_pos: u64 = 0; + let mut max_span: u64 = 0; + + while !variant_queue.is_empty() { + // get the source of the next variant to process + let (pop_index, pop_priority) = variant_queue.pop().unwrap(); + let sample_index = self.sample_indices[pop_index]; + + // process this variant + let record_result = vcf_iterators[pop_index].next().unwrap(); + let record = match record_result { + Ok(r) => r, + Err(e) => return Some(Err(Box::new(e))) + }; + + let variant_pos = record.pos() as u64; + assert_eq!(variant_pos, pop_priority.0.0 as u64); // sanity check that the variant matches our position priority + if variant_pos < self.chrom_position { + // this can happen when you have very very long indels that span one of our breaks + // we have already written though, so don't write it again + // skip down to variant advancement + } else { + let include_variant 
= match is_phasable_variant(&record, sample_index, self.min_quality, false) { + Ok(iv) => iv, + Err(e) => return Some(Err(e)) + }; + + // second condition is for variants that overlap but are before our start position + if include_variant { + //heterozygous variant found + if phase_block.get_num_variants() == 0 || max_span > variant_pos { + //either: + //1 - this is a new block OR + //2 - we already found enough reads that spans _past_ this variant + phase_block.add_locus_variant(&chrom_name, variant_pos, pop_index); + } else { + //we check the reads from the most recent locus + //max_span = self.get_longest_span(&chrom_name, previous_pos); + max_span = self.get_longest_multispan(&chrom_name, previous_pos); + if max_span > variant_pos { + //new max span connects + phase_block.add_locus_variant(&chrom_name, variant_pos, pop_index); + } else if !self.allow_supplemental_joins { + // no mapping spans and we are not allowing supplemental mappings to make the join + self.chrom_position = variant_pos; + return Some(Ok(phase_block)); + } else { + //no *mappings* span both this new position and the most recent, check if we can find a supplemental mapping that does + let supplemental_overlap: bool = self.is_supplemental_overlap(&chrom_name, variant_pos, &phase_block); + if supplemental_overlap { + // we got a supplemental mapping that works, so add this locus and go on as normal + phase_block.add_locus_variant(&chrom_name, variant_pos, pop_index); + } else { + // no overlapping mapping and no supplemental either, time to end the block + self.chrom_position = variant_pos; + return Some(Ok(phase_block)); + } + } + } + + previous_pos = variant_pos; + } + + // at this point either: + // 1) we added the variant to the current block and are looping back around OR + // 2) we did NOT add the variant to the block because it isn't phasable OR + // 3) we finished a block and returned out (aka, we can't get here if we just ended a block) + // this means that these variants are safe to 
add to our stats without being double counted or previously counted + let variant_type: VariantType = match get_variant_type(&record) { + Ok(vt) => vt, + // we have to convert to an owned error here, and the htslib errors are not cloneable + Err(e) => return Some(Err(e)) + }; + let zygosity: Zygosity = match get_variant_zygosity(&record, sample_index) { + Ok(z) => z, + // we have to convert to an owned error here, and the htslib errors are not cloneable + Err(e) => return Some(Err(e)) + }; + + // update our variant stats for reporting later + let stats_entry = self.variant_stats.entry((self.chrom_index, variant_type, zygosity)).or_insert(0); + *stats_entry += 1; + } + + // requeue from the one we popped from + let next_entry = vcf_iterators[pop_index].peek(); + if let Some(record_result) = next_entry { + let record: &rust_htslib::bcf::Record = match record_result { + Ok(r) => r, + // we have to convert to an owned error here, and the htslib errors are not cloneable + Err(e) => return Some(Err(Box::new(SimpleError::from(e)))) + }; + let position: i64 = record.pos(); + variant_queue.push(pop_index, (Reverse(position), Reverse(pop_index))); + }; + } + + //we have reached the end of the current chromosome, reset to next chromosome and return what we have + self.chrom_index += 1; + self.chrom_position = 0; + Some(Ok(phase_block)) + } else { + // no chromosomes left to iterate on + None + } + } +} + +/// Iterator over multiple phase blocks iterators. +/// Output blocks are ordered by (chromosome, start_position, end_position) and re-numbered to reflect traversal order. 
pub struct MultiPhaseBlockIterator {
    /// The internal iterators we use
    sub_iterators: Vec<PhaseBlockIterator>,
    /// The priority queue for the phase blocks at the front of each iterator
    phase_block_queue: PriorityQueue<(usize, PhaseBlock), PhaseBlockPriority>,
    /// The combined block index
    joint_block_index: usize,
}

impl MultiPhaseBlockIterator {
    /// Creates a new iterator from a vector of sub-iterators, each one tied to a specific sample.
    /// # Arguments
    /// * `sub_iterators` - the original PhaseBlockIterators that this will wrap
    /// # Errors
    /// * if any sub-iterators generate errors while iterating
    pub fn new(mut sub_iterators: Vec<PhaseBlockIterator>) -> Result<MultiPhaseBlockIterator, Box<dyn std::error::Error>> {
        let mut phase_block_queue: PriorityQueue<(usize, PhaseBlock), PhaseBlockPriority> = PriorityQueue::new();

        // prime the queue with the first block from each sub-iterator
        for (index, iterator) in sub_iterators.iter_mut().enumerate() {
            let next_value = iterator.next();
            match next_value {
                Some(result) => {
                    let first_block: PhaseBlock = result?;
                    let block_priority = Self::get_block_priority(&first_block);
                    phase_block_queue.push((index, first_block), block_priority);
                },
                None => {
                    // first block is empty, which is weird but technically allowed
                    warn!("First block in iterator {} was empty.", index);
                }
            };
        }

        Ok(MultiPhaseBlockIterator {
            sub_iterators,
            phase_block_queue,
            joint_block_index: 0
        })
    }

    /// Retrieves variant counts from all parsed variants (whether included or not) across all samples.
    /// This is best if done when iteration is finished.
    /// Returns a hashmap where key is (sample_name, chromosome, variant_type, zygosity) and value is a count.
    pub fn variant_stats(&self) -> HashMap<(String, String, VariantType, Zygosity), usize> {
        // key = (sample_name, chromosome, variant_type, zygosity); value = count
        let mut ret: HashMap<(String, String, VariantType, Zygosity), usize> = Default::default();
        for pbi in self.sub_iterators.iter() {
            let sample_name = pbi.sample_name().to_string();
            let pbi_stats = pbi.variant_stats();
            for ((chrom, vt, zyg), count) in pbi_stats.into_iter() {
                ret.insert((sample_name.clone(), chrom, vt, zyg), count);
            }
        }
        ret
    }

    /// Returns the block priority for a phase block.
    /// Reverse ordering makes the PriorityQueue (a max-heap) pop the smallest
    /// (chromosome, start, end) tuple first.
    /// # Arguments
    /// * `phase_block` - the block to calculate priority for
    fn get_block_priority(phase_block: &PhaseBlock) -> PhaseBlockPriority {
        Reverse(
            (
                phase_block.get_chrom_index(),
                phase_block.get_start(),
                phase_block.get_end()
            )
        )
    }
}

impl Iterator for MultiPhaseBlockIterator {
    type Item = Result<PhaseBlock, Box<dyn std::error::Error>>;

    fn next(&mut self) -> Option<Result<PhaseBlock, Box<dyn std::error::Error>>> {
        let pq_next = self.phase_block_queue.pop();
        match pq_next {
            Some(((source_index, mut phase_block), _priority)) => {
                // get the next block and put it on the queue
                let next_item = self.sub_iterators[source_index].next();
                if let Some(next_result) = next_item {
                    // we have more in this queue, add it to the priority queue
                    let next_block = match next_result {
                        Ok(b) => b,
                        Err(e) => {
                            // sub-queue error, propagate it up the chain
                            return Some(Err(e));
                        }
                    };
                    let next_priority = Self::get_block_priority(&next_block);
                    self.phase_block_queue.push((source_index, next_block), next_priority);
                };

                // we need to update the block index based on the joint values
                phase_block.set_block_index(self.joint_block_index);
                self.joint_block_index += 1;

                // finally send back the block
                Some(Ok(phase_block))
            },
            None => {
                None
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // fortunately, none of the BAM checks actually look at the provided reference file contents
    const LOCAL_REFERENCE: &str = "./test_data/test_reference.fa";

    #[test]
    fn test_get_vcf_samples() {
        let header_only_vcf: PathBuf = "./test_data/header_only.vcf.gz".into();
        let expected_samples: Vec<String> = vec![
            "HG001".to_string(), "HG002_30x".to_string(), "HG005_30x".to_string()
        ];
        let samples = get_vcf_samples(&header_only_vcf).unwrap();
        assert_eq!(expected_samples, samples);
    }

    #[test]
    fn test_get_sample_bams() {
        let sample_name: String = "HG002-rep1".to_string();
        let all_bams: Vec<PathBuf> = vec![
            "./test_data/header_only.bam".into(),
            "./test_data/multi_smrtcell.bam".into()
        ];

        let (bams_found, bam_indices) = get_sample_bams(
            &all_bams,
            &sample_name,
            &PathBuf::from(LOCAL_REFERENCE)
        ).unwrap();
        assert_eq!(all_bams, bams_found);
        assert_eq!(vec![0, 1], bam_indices);
    }

    #[test]
    fn test_multisample_bam() {
        let sample_name: String = "HG002-rep1".to_string();
        let all_bams: Vec<PathBuf> = vec![
            "./test_data/multisample.bam".into()
        ];
        let result = get_sample_bams(
            &all_bams,
            &sample_name,
            &PathBuf::from(LOCAL_REFERENCE)
        );

        // should have an error with the following message
        assert!(result.is_err());
        let expected_error_string = "BAM file with multiple sample reads groups detected, this is not supported: ./test_data/multisample.bam".to_string();
        assert_eq!(expected_error_string, result.err().unwrap().to_string());
    }
}
diff --git a/src/cli.rs b/src/cli.rs
new file mode 100644
index 0000000..34ca2e8
--- /dev/null
+++ b/src/cli.rs
@@ -0,0 +1,343 @@

use clap::Parser;
use chrono::Datelike;
use flate2::bufread::MultiGzDecoder;
use lazy_static::lazy_static;
use log::{error, info, warn};
use std::fs::File;
use std::io::{BufReader, Read};
use std::path::{Path, PathBuf};

lazy_static! {
    /// Stores the full version string we plan to use.
+ /// # Examples + /// * `0.11.0-6bb9635-dirty` - while on a dirty branch + /// * `0.11.0-6bb9635` - with a fresh commit + pub static ref FULL_VERSION: String = format!("{}-{}", env!("CARGO_PKG_VERSION"), env!("VERGEN_GIT_DESCRIBE")); +} + +#[derive(Clone, Parser)] +#[clap(author, + version = &**FULL_VERSION, + about, + after_help = format!("Copyright (C) 2004-{} Pacific Biosciences of California, Inc. +This program comes with ABSOLUTELY NO WARRANTY; it is intended for +Research Use Only and not for use in diagnostic procedures.", chrono::Utc::now().year()))] +pub struct Settings { + /// Input alignment file in BAM format. + #[clap(required = true)] + #[clap(short = 'b')] + #[clap(long = "bam")] + #[clap(value_name = "BAM")] + #[clap(help_heading = Some("Input/Output"))] + pub bam_filenames: Vec, + + /// Output haplotagged alignment file in BAM format. + #[clap(short = 'p')] + #[clap(long = "output-bam")] + #[clap(value_name = "BAM")] + #[clap(help_heading = Some("Input/Output"))] + pub output_bam_filenames: Vec, + + /// Input variant file in VCF format. + #[clap(required = true)] + #[clap(short = 'c')] + #[clap(long = "vcf")] + #[clap(value_name = "VCF")] + #[clap(help_heading = Some("Input/Output"))] + pub vcf_filenames: Vec, + + /// Output phased variant file in VCF format. 
+ #[clap(required = true)] + #[clap(short = 'o')] + #[clap(long = "output-vcf")] + #[clap(value_name = "VCF")] + #[clap(help_heading = Some("Input/Output"))] + pub output_vcf_filenames: Vec, + + /// Reference FASTA file + #[clap(required = true)] + #[clap(short = 'r')] + #[clap(long = "reference")] + #[clap(value_name = "FASTA")] + #[clap(help_heading = Some("Input/Output"))] + pub reference_filename: PathBuf, + + /// Sample name to phase within the VCF (default: first sample) + #[clap(short = 's')] + #[clap(long = "sample-name")] + #[clap(value_name = "SAMPLE")] + #[clap(help_heading = Some("Input/Output"))] + pub sample_names: Vec, + + /// Ignore BAM file read group IDs + #[clap(long = "ignore-read-groups")] + #[clap(help_heading = Some("Input/Output"))] + pub ignore_read_groups: bool, + + /// Output summary phasing statistics file (optional, csv/tsv) + #[clap(long = "summary-file")] + #[clap(value_name = "FILE")] + #[clap(help_heading = Some("Input/Output"))] + pub summary_filename: Option, + + /// Output algorithmic statistics file (optional, csv/tsv) + #[clap(long = "stats-file")] + #[clap(value_name = "FILE")] + #[clap(help_heading = Some("Input/Output"))] + pub stats_filename: Option, + + /// Output blocks file (optional, csv/tsv) + #[clap(long = "blocks-file")] + #[clap(value_name = "FILE")] + #[clap(help_heading = Some("Input/Output"))] + pub blocks_filename: Option, + + /// Output haplotag file (optional, csv/tsv) + #[clap(long = "haplotag-file")] + #[clap(value_name = "FILE")] + #[clap(help_heading = Some("Input/Output"))] + pub haplotag_filename: Option, + + /// Number of threads for BAM I/O (default: copy `--threads`) + #[clap(long = "io-threads")] + #[clap(value_name = "THREADS")] + #[clap(help_heading = Some("Input/Output"))] + pub io_threads: Option, + + /// Number of threads to use for phasing. 
+ #[clap(short = 't')] + #[clap(long = "threads")] + #[clap(value_name = "THREADS")] + #[clap(default_value = "1")] + pub threads: usize, + + /// Enable verbose output. + #[clap(short = 'v')] + #[clap(long = "verbose")] + #[clap(action = clap::ArgAction::Count)] + pub verbosity: u8, + + /// Sets a minimum genotype quality (GQ) value to include a variant in the phasing + #[clap(long = "min-vcf-qual")] + #[clap(value_name = "GQ")] + #[clap(default_value = "0")] + #[clap(help_heading = Some("Variant Filtering"))] + pub min_variant_quality: i32, + + /// Sets a minimum MAPQ to include a read in the phasing + #[clap(long = "min-mapq")] + #[clap(value_name = "MAPQ")] + #[clap(default_value = "5")] + #[clap(help_heading = Some("Mapping Filtering"))] + pub min_mapping_quality: u8, + + /// Sets a minimum number of matched variants required for a read to get included in the scoring + #[clap(long = "min-matched-alleles")] + #[clap(value_name = "COUNT")] + #[clap(default_value = "2")] + #[clap(help_heading = Some("Mapping Filtering"))] + pub min_matched_alleles: usize, + + /// Sets a minimum number of reads to span two adjacent variants to join a phase block + #[clap(long = "min-spanning-reads")] + #[clap(value_name = "READS")] + #[clap(default_value = "1")] + #[clap(help_heading = Some("Phase Block Generation"))] + pub min_spanning_reads: usize, + + /// Disables the use of supplemental mappings to join phase blocks + #[clap(long = "no-supplemental-joins")] + #[clap(help_heading = Some("Phase Block Generation"))] + pub disable_supplemental_joins: bool, + + /// Enables the phasing and haplotagging of singleton phase blocks + #[clap(long = "phase-singletons")] + #[clap(help_heading = Some("Phase Block Generation"))] + pub phase_singletons: bool, + + /// Sets a maximum reference buffer for local realignment + #[clap(long = "max-reference-buffer")] + #[clap(value_name = "LENGTH")] + #[clap(default_value = "15")] + #[clap(help_heading = Some("Allele Assignment"))] + pub 
reference_buffer: usize, + + /// Enables global realignment with a maximum allowed CPU time before fallback to local realignment + #[clap(long = "global-realignment-cputime")] + #[clap(value_name = "SECONDS")] + #[clap(default_value = "0.0")] + #[clap(help_heading = Some("Allele Assignment"))] + pub global_realign_cputime: f32, + + /// Sets a pruning threshold on global realignment, set to 0 to disable pruning + #[clap(long = "global-pruning-distance")] + #[clap(value_name = "LENGTH")] + #[clap(default_value = "500")] + #[clap(help_heading = Some("Allele Assignment"))] + pub wfa_prune_distance: usize, + + /// Sets the minimum queue size for the phasing algorithm + #[clap(long = "phase-min-queue-size")] + #[clap(value_name = "SIZE")] + #[clap(default_value = "1000")] + #[clap(help_heading = Some("Phasing"))] + pub phase_min_queue_size: usize, + + /// Sets the queue size increment per variant in a phase block + #[clap(long = "phase-queue-increment")] + #[clap(value_name = "SIZE")] + #[clap(default_value = "3")] + #[clap(help_heading = Some("Phasing"))] + pub phase_queue_increment: usize, + + /// Skips a number of blocks (debug only); non-0 values will cause an error on VCF output + #[clap(long = "skip")] + #[clap(hide = true)] + #[clap(default_value = "0")] + pub skip_blocks: usize, + + /// Take a number of blocks (debug only); non-0 values will cause an error on VCF output + #[clap(long = "take")] + #[clap(hide = true)] + #[clap(default_value = "0")] + pub take_blocks: usize, +} + +/// Checks if a file exists and will otherwise exit +/// # Arguments +/// * `filename` - the file path to check for +/// * `label` - the label to use for error messages +fn check_required_filename(filename: &Path, label: &str) { + if !filename.exists() { + error!("{} does not exist: \"{}\"", label, filename.display()); + std::process::exit(exitcode::NOINPUT); + } else { + info!("{}: \"{}\"", label, filename.display()); + } +} + +/// Checks if the VCF file exists, is bgzipped, and has an 
index. If it fails any of those, this will exit. +/// # Argument +/// * `filename` - the VCF file path to check +/// * `label` - the label to use for error messages +fn check_required_vcf(filename: &Path, label: &str) { + // first check the filename normally + check_required_filename(filename, label); + + // now we need to check that this is a bgzipped file by just trying to read a little bit of it + // NOTE: if the user generates a gzip file (as opposed to bgzip), this will still pass :( + // in theory, indexing checks should fail + let vcf_file: File = File::open(filename).unwrap(); + let file_reader = BufReader::new(vcf_file); + let mut gz_decoder = MultiGzDecoder::new(file_reader); + let mut small_buffer: [u8; 10] = [0; 10]; + match gz_decoder.read(&mut small_buffer) { + Ok(_) => {}, + Err(e) => { + if e.to_string() == "invalid gzip header" { + error!("Error while checking {filename:?}: {e}; is the VCF bgzipped?"); + } else { + error!("Error while checking {filename:?}: {e}"); + } + std::process::exit(exitcode::IOERR); + } + }; + + // finally, verify that an index file exists, should just be tbi and csi + let known_indices = ["tbi", "csi"]; + let mut index_found: bool = false; + for &ki in known_indices.iter() { + let mut extension_path = filename.to_owned() + .into_os_string(); + extension_path.push(format!(".{ki}")); + let extension_path: PathBuf = PathBuf::from(extension_path); + index_found |= extension_path.exists(); + } + if !index_found { + error!("Error while checking {filename:?}: no tabix index found (.tbi or .csi)"); + std::process::exit(exitcode::NOINPUT); + } + +} + +pub fn get_raw_settings() -> Settings { + Settings::parse() +} + +/// Do some additional checks here, we may increase these as we go. +/// Also can modify settings if needed since we're passing it around. +/// # Arguments +/// * `settings` - the raw settings, nothing has been checked other than what clap does for us. 
+pub fn check_settings(mut settings: Settings) -> Settings { + //check for any of our required files + for filename in settings.bam_filenames.iter() { + check_required_filename(filename, "Alignment file"); + } + for filename in settings.vcf_filenames.iter() { + check_required_vcf(filename, "Variant file"); + } + + // make sure the number of inputs and outputs are identical + if settings.vcf_filenames.len() != settings.output_vcf_filenames.len() { + error!("Detected {} input VCFs and {} output VCFs, these must be equal", settings.vcf_filenames.len(), settings.output_vcf_filenames.len()); + std::process::exit(exitcode::USAGE); + } + + // if we have any phased BAM outputs, make sure we have one for each file + if !settings.output_bam_filenames.is_empty() && settings.bam_filenames.len() != settings.output_bam_filenames.len() { + error!("Detected {} input BAMs and {} output BAMs, these must be equal", settings.bam_filenames.len(), settings.output_bam_filenames.len()); + std::process::exit(exitcode::USAGE); + } + + // check optional files + check_required_filename(&settings.reference_filename, "Reference file"); + + // 0 is just a sentinel for everything + if settings.take_blocks == 0 { + settings.take_blocks = usize::MAX; + } + if settings.wfa_prune_distance == 0 { + settings.wfa_prune_distance = usize::MAX; + } + + // 0 doesn't make sense, so lets just error proof it up to 1 + if settings.min_spanning_reads == 0 { + settings.min_spanning_reads = 1; + } + if settings.min_matched_alleles == 0 { + settings.min_matched_alleles = 1; + } + + // if this is not specified, then set it to the same as processing + if settings.io_threads.is_none() { + settings.io_threads = Some(settings.threads); + } + + // dump stuff to the logger + info!("Minimum call quality: {}", settings.min_variant_quality); + info!("Minimum mapping quality: {}", settings.min_mapping_quality); + info!("Minimum matched alleles: {}", settings.min_matched_alleles); + if settings.min_matched_alleles > 2 { + 
warn!("Setting the minimum matched alleles > 2 has not been tested.") + } + info!("Minimum spanning reads: {}", settings.min_spanning_reads); + info!("Supplemental mapping block joins: {}", if settings.disable_supplemental_joins { "DISABLED" } else { "ENABLED" }); + info!("Phase singleton blocks: {}", if settings.phase_singletons { "ENABLED" } else { "DISABLED" }); + info!("Local re-alignment maximum reference buffer: +-{} bp", settings.reference_buffer); + if settings.global_realign_cputime == 0.0 { + info!("Global re-alignment: DISABLED"); + } else { + info!("Global re-alignment CPU time: {} seconds", settings.global_realign_cputime); + if settings.wfa_prune_distance == usize::MAX { + info!("Global prune distance: DISABLED"); + } else { + info!("Global prune distance: {}", settings.wfa_prune_distance); + } + } + info!("Processing threads: {}", settings.threads); + info!("I/O threads: {}", settings.io_threads.unwrap()); + + //send the settings back + settings +} diff --git a/src/data_types/mod.rs b/src/data_types/mod.rs new file mode 100644 index 0000000..72699dd --- /dev/null +++ b/src/data_types/mod.rs @@ -0,0 +1,7 @@ + +/// Contains a ReadSegment observation type +pub mod read_segments; +/// Wrapper for an in-memory reference genome +pub mod reference_genome; +/// Contains Variant type as well as supporting definitions +pub mod variants; diff --git a/src/data_types/read_segments.rs b/src/data_types/read_segments.rs new file mode 100644 index 0000000..af2c1f2 --- /dev/null +++ b/src/data_types/read_segments.rs @@ -0,0 +1,258 @@ + +/// Container for a read segment that has been converted into a variant representation +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ReadSegment { + /// the read name + read_name: String, + /// the actual alleles for the read, should always be 0, 1, 2 (ambiguous), or 3 (non-overlapping/undefined). 
+ /// anything other than 0 or 1 are basically ignored in all functions + alleles: Vec, + /// the associated quality values for converting 0 <--> 1; undefined alleles should have qual == 0 + quals: Vec, + /// the index of the first defined 0/1 allele, inclusive + first_allele: usize, + /// the index of the last defined 0/1 allele, inclusive + last_allele: usize +} + +impl ReadSegment { + /// Creates a new read segment from a set of alleles and quality values. + /// Note that the segment should have the same length as the phase block, even if the segment does not actually span the full block. + /// # Arguments + /// * `alleles` - the alleles, should all be 0, 1, 2 (ambiguous) or 3 (non-overlapping/undefined); the first and last defined values determine the length of the segment + /// * `quals` - cost to convert an allele from 0 <--> 1, any undefined/ambiguous alleles should be qual = 0 + /// # Panics + /// * if `allele.len() != quals.len()` + pub fn new(read_name: String, alleles: Vec, quals: Vec) -> ReadSegment { + assert_eq!(alleles.len(), quals.len()); + let (first_allele, _) = alleles.iter().enumerate() + .find(|(_i, &a)| a < 2).unwrap_or((alleles.len(), &2)); + let (last_allele, _) = alleles.iter().enumerate().rev() + .find(|(_i, &a)| a < 2).unwrap_or((alleles.len(), &2)); + ReadSegment { + read_name, + alleles, + quals, + first_allele, + last_allele + } + } + + /// Given a collection of read segments, this will collapse them into a single one. + /// Any ambiguous/undefined alleles will have their quality set to 0. 
+ /// # Arguments + /// * `read_segments` - the reads to collapse together + /// # Panics + /// * if `read_segments` is empty + /// * if `read_segments` are not all of equal length + pub fn collapse(read_segments: &[ReadSegment]) -> ReadSegment { + // short circuit + assert!(!read_segments.is_empty()); + if read_segments.len() == 1 { + return read_segments[0].clone(); + } + + let num_alleles: usize = read_segments[0].get_num_alleles(); + let read_name: String = read_segments[0].read_name().to_string(); + let mut alleles: Vec = vec![3; num_alleles]; + let mut quals: Vec = vec![0; num_alleles]; + for rs in read_segments.iter() { + let rs_alleles = rs.alleles(); + let rs_quals = rs.quals(); + assert_eq!(num_alleles, rs.get_num_alleles()); + assert_eq!(read_name, rs.read_name()); + + for (i, &rsa) in rs_alleles.iter().enumerate() { + // if rsa is unset, we skip everything + if rsa != 3 { + if alleles[i] == 3 { + alleles[i] = rsa; + quals[i] = rs_quals[i]; + } else if alleles[i] == 2 { + // we are already ambiguous, so quality should be 0 + } else { + // check for ambiguity + if alleles[i] == rsa { + // they match, make sure quals do also + // assert_eq!(quals[i], rs_quals[i]); + // quals won't always match in local mode, lets default to the lower + quals[i] = quals[i].min(rs_quals[i]); + assert!(quals[i] > 0); + } else { + // they don't match, change to ambiguous + alleles[i] = 2; + quals[i] = 0; + } + } + } + } + } + + // now just send it to the new function + Self::new(read_name, alleles, quals) + } + + pub fn read_name(&self) -> &str { + &self.read_name + } + + pub fn get_num_alleles(&self) -> usize { + self.alleles.len() + } + + pub fn alleles(&self) -> &[u8] { + &self.alleles[..] + } + + pub fn quals(&self) -> &[u8] { + &self.quals[..] + } + + pub fn first_allele(&self) -> usize { + self.first_allele + } + + pub fn last_allele(&self) -> usize { + self.last_allele + } + + /// Returns the range of this segment, e.g. 
[first_allele..last_allele+1) + pub fn get_range(&self) -> std::ops::Range { + self.first_allele..(self.last_allele+1) + } + + /// Returns the number of alleles that are set (i.e. non-ambiguous and overlapping, so 0 or 1) + pub fn get_num_set(&self) -> usize { + self.alleles.iter() + .filter(|&v| *v < 2) + .count() + } + + /// Given a haplotype, this will score the read against that haplotype. + /// If a haplotype has a 2, no cost is associated with that allele. + /// # Arguments + /// `haplotype` - the full haplotype to score, must have the same length as the block + pub fn score_haplotype(&self, haplotype: &[u8]) -> u64 { + assert_eq!(self.alleles.len(), haplotype.len()); + self.score_partial_haplotype(haplotype, 0) + } + + /// Given a partial haplotype, this will score the read against that haplotype. + /// The offset values is an index to where to start in our alleles for scoring. + /// For example, if offset = 10, then alleles[10..] will be compared to haplotype[0..] + /// If a haplotype has a 2, no cost is associated with that allele. 
+ /// # Arguments + /// * `haplotype` - the partial haplotype to score + /// * `offset` - the offset into the read segment to start scoring + pub fn score_partial_haplotype(&self, haplotype: &[u8], offset: usize) -> u64 { + //info!("rs {}+{} <= {}?", haplotype.len(), offset, self.alleles.len()); + assert!(haplotype.len()+offset <= self.alleles.len()); + if haplotype.len() + offset <= self.first_allele || offset > self.last_allele { + // the haplotype starts and ends before our first allele, OR + // the haplotype starts after our last allele, SO + // return 0 in either case, because there is no overlaps to score + 0 + } else { + // the minimum comparison is either the first allele OR the offset, whichever is greater + let min_compare = self.first_allele.max(offset); + + // if the allele component is greater, then we need to shift into the haplotype + let offset_shift = min_compare - offset; + + // the maximum comparison is either the last allele we have OR the end of the haplotype+start offset + let max_compare = (self.last_allele+1).min(offset+haplotype.len()); + + self.quals[min_compare..max_compare].iter().enumerate() + .filter(|(i, _q)| haplotype[*i+offset_shift] < 2 && self.alleles[*i+min_compare] != haplotype[*i+offset_shift]) + .map(|(_i, &q)| q as u64) + .sum() + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_score_haplotype() { + let rs = ReadSegment::new( + "read_name".to_string(), + vec![2, 0, 1, 0, 0, 1, 2, 1, 2, 2], + vec![0, 1, 1, 1, 1, 1, 1, 1, 0, 0] + ); + assert_eq!(rs.first_allele, 1); + assert_eq!(rs.last_allele, 7); + assert_eq!(rs.get_num_set(), 6); + + //identical except for missing value in rs + let haplotype = vec![0, 0, 1, 0, 0, 1, 1, 1, 0, 0]; + assert_eq!(rs.score_haplotype(&haplotype), 1); + + //fully empty haplotype + let haplotype = vec![2; 10]; + assert_eq!(rs.score_haplotype(&haplotype), 0); + + //fully wrong haplotype + let haplotype = vec![1, 1, 0, 1, 1, 0, 0, 0, 1, 1]; + 
assert_eq!(rs.score_haplotype(&haplotype), 7); + } + + #[test] + fn test_score_partial_haplotype() { + let rs = ReadSegment::new( + "read_name".to_string(), + vec![2, 0, 1, 0, 0, 1, 2, 1, 2, 2], + vec![0, 1, 1, 1, 1, 1, 1, 1, 0, 0] + ); + + //identical except for missing value in rs + let haplotype = vec![0, 1, 0, 0, 1, 1, 1]; + assert_eq!(rs.score_partial_haplotype(&haplotype, 1), 1); + + //fully empty haplotype + let haplotype = vec![2; 7]; + assert_eq!(rs.score_partial_haplotype(&haplotype, 2), 0); + + //fully wrong haplotype + let haplotype = vec![1, 0, 1, 1, 0, 0, 0]; + assert_eq!(rs.score_partial_haplotype(&haplotype, 1), 7); + for x in 0..haplotype.len() { + assert_eq!(rs.score_partial_haplotype(&haplotype[x..], 1+x), 7-x as u64); + } + } + + #[test] + fn test_collapse() { + let rs1 = ReadSegment::new( + "read_name".to_string(), + vec![3, 1, 0, 2, 1, 3, 3], + vec![0, 2, 1, 0, 2, 0, 0] + ); + let rs2 = ReadSegment::new( + "read_name".to_string(), + vec![3, 3, 0, 1, 0, 1, 1], + vec![0, 0, 1, 2, 2, 1, 1] + ); + let expected = ReadSegment::new( + "read_name".to_string(), + vec![3, 1, 0, 2, 2, 1, 1], + vec![0, 2, 1, 0, 0, 1, 1] + ); + + // make sure normal collapsing works + let collapsed = ReadSegment::collapse(&[rs1.clone(), rs2.clone()]); + assert_eq!(expected, collapsed); + assert_eq!(collapsed.first_allele, 1); + assert_eq!(collapsed.last_allele, 6); + + // make sure scoring works fine with the 3s present + // vec![3, 1, 0, 2, 2, 1, 1] + let haplotype = vec![0, 1, 0, 0, 0, 1, 0]; + assert_eq!(collapsed.score_haplotype(&haplotype), 1); + + // check stupid collapsing also + let collapsed = ReadSegment::collapse(&[rs1.clone()]); + assert_eq!(collapsed, rs1); + } +} \ No newline at end of file diff --git a/src/data_types/reference_genome.rs b/src/data_types/reference_genome.rs new file mode 100644 index 0000000..d1c5c10 --- /dev/null +++ b/src/data_types/reference_genome.rs @@ -0,0 +1,133 @@ + +use bio::io::fasta; +use flate2::bufread::MultiGzDecoder; +use 
log::{debug, info, warn}; +use rustc_hash::FxHashMap as HashMap; +use std::io::{BufRead, BufReader}; +use std::path::{Path, PathBuf}; + +/// Wrapper structure for a reference genome +pub struct ReferenceGenome { + /// The filename we loaded + filename: PathBuf, + /// Contains the keys in order of the reference load + contig_keys: Vec, + /// Map where keys are contig names and value is ASCII formatted sequence + contig_map: HashMap> +} + +impl ReferenceGenome { + /// Loads a reference genome from a given FASTA file + /// # Arguments + /// * `fasta_fn` - the FASTA filename, gzip is allowed + /// # Errors + /// This will pass through any error detected from loading the provided FASTA file. + /// This includes file reading and/or record reading errors. + pub fn from_fasta(fasta_fn: &Path) -> Result> { + info!("Loading {:?}...", fasta_fn); + let mut contig_keys: Vec = Default::default(); + let mut contig_map: HashMap> = Default::default(); + + // needletail can technically read FASTA and FASTQ, not sure we can check for that easy though + let fasta_file: std::fs::File = std::fs::File::open(fasta_fn)?; + let file_reader = BufReader::new(fasta_file); + let fasta_reader: fasta::Reader> = if fasta_fn.extension().unwrap_or_default() == "gz" { + debug!("Detected gzip extension, loading reference with MultiGzDecoder..."); + let gz_decoder = MultiGzDecoder::new(file_reader); + let bufreader = BufReader::new(gz_decoder); + fasta::Reader::from_bufread(Box::new(bufreader)) + } else { + debug!("Loading reference as plain-text file..."); + fasta::Reader::from_bufread(Box::new(file_reader)) + }; + + for entry in fasta_reader.records() { + let record: fasta::Record = entry?; + let seq_id: String = record.id().to_string(); + let sequence: Vec = record.seq().to_ascii_uppercase(); + + contig_keys.push(seq_id.clone()); + contig_map.insert(seq_id, sequence); + } + info!("Finished loading {} contigs.", contig_map.len()); + + Ok(ReferenceGenome { + filename: fasta_fn.to_path_buf(), + 
contig_keys, + contig_map + }) + } + + pub fn filename(&self) -> &Path { + &self.filename + } + + pub fn contig_keys(&self) -> &[String] { + &self.contig_keys + } + + /// Retrieves a reference slice from a given 0-based coordinates. + /// If `start` or `end` goes past the full contig length, it will be truncated to the full contig length. + /// # Arguments + /// * `chromosome` - the chromosome to slice from + /// * `start` - the 0-based start index (included) + /// * `end` - the 0-based end index (excluded) + /// # Panics + /// * if `chromosome` was not in the FASTA file + /// * if `start` > `end` + pub fn get_slice(&self, chromosome: &str, start: usize, end: usize) -> &[u8] { + let full_contig = self.contig_map.get(chromosome).expect("a chromosome from the reference file"); + assert!(start <= end, "start > end: {start} > {end}"); + let truncated_start = if start <= full_contig.len() { start } else { + warn!("Received get_slice({:?}, {}, {}), truncated start to {}", chromosome, start, end, full_contig.len()); + full_contig.len() + }; + let truncated_end = if end <= full_contig.len() { end } else { + warn!("Received get_slice({:?}, {}, {}), truncated end to {}", chromosome, start, end, full_contig.len()); + full_contig.len() + }; + &full_contig[truncated_start..truncated_end] + } + + /// Retrieves a full chromosome by name + /// # Arguments + /// * `chromosome` - the chromosome to slice from + /// # Panics + /// * if `chromosome` was not in the FASTA file + pub fn get_full_chromosome(&self, chromosome: &str) -> &[u8] { + let full_contig = self.contig_map.get(chromosome).expect("a chromosome from the reference file"); + full_contig + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::path::PathBuf; + #[test] + fn test_simple_reference() { + let references = vec![ + "./test_data/test_reference.fa", + "./test_data/test_reference.fa.gz" + ]; + for &reference_fn in references.iter() { + let simple_reference_fn: PathBuf = PathBuf::from(reference_fn); + let 
reference_genome = ReferenceGenome::from_fasta(&simple_reference_fn).unwrap(); + + assert_eq!(reference_genome.contig_keys(), &[ + "chr1".to_string(), + "chr2".to_string() + ]); + + //chr1 = ACGTACGT + let chr1_string: Vec = "ACGTACGT".as_bytes().to_vec(); + for i in 0..8 { + assert_eq!(reference_genome.get_slice(&"chr1", i, 8), &chr1_string[i..]); + } + + //chr2 = ACCATGTA + let chr1_string: Vec = "ACCATGTA".as_bytes().to_vec(); + assert_eq!(reference_genome.get_slice(&"chr2", 0, 8), chr1_string); + } + } +} \ No newline at end of file diff --git a/src/data_types/variants.rs b/src/data_types/variants.rs new file mode 100644 index 0000000..d5da244 --- /dev/null +++ b/src/data_types/variants.rs @@ -0,0 +1,758 @@ + +use crate::sequence_alignment::edit_distance; + +use log::trace; +use std::cmp::Ordering; + +/// All the variant types we are currently allowing +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, PartialOrd, Ord)] +pub enum VariantType { + /// REF and ALT are both length = 1 + Snv=0, + /// REF length = 1, ALT length > 1 + Insertion, + /// REF length > 1, ALT length = 1 + Deletion, + /// REF and ALT lengths > 1 + Indel, + /// Must have two alleles and be tagged with SVTYPE=INS + SvInsertion, + /// Must have two alleles and be tagged with SVTYPE=DEL + SvDeletion, + /// Must have two alleles and be tagged with SVTYPE=DUP + SvDuplication, + /// Must have two alleles and be tagged with SVTYPE=INV + SvInversion, + /// Must have two alleles and be tagged with SVTYPE=BND + SvBreakend, + /// Must have two alleles and be tagged with TRID=#### + TandemRepeat, + /// Something that doesn't match the above criteria, must be 1 or 2 alleles + Unknown // make sure Unknown is always the last one in the list +} + +/// Zygosity definitions, mostly used elsewhere +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, PartialOrd, Ord)] +pub enum Zygosity { + HomozygousReference=0, + Heterozygous, + HomozygousAlternate, + Unknown // make sure Unknown is always the last one in the 
list
}

/// A variant definition structure.
/// It currently assumes that chromosome is fixed and that the variant is a SNP.
#[derive(Debug)]
pub struct Variant {
    /// The vcf index from the input datasets
    vcf_index: usize,
    /// The type of variant represented by this entry
    variant_type: VariantType,
    /// The coordinate of the event in the VCF file, 0-based
    position: i64,
    /// The length of the reference allele
    ref_len: usize,
    /// number of reference bases prepended to each allele; the allele start (0-based) is position - prefix_len, which will be <= position
    prefix_len: usize,
    /// number of reference bases appended to each allele; the allele end (0-based, exclusive) is position + ref_len + postfix_len, which will be >= position+ref_len
    postfix_len: usize,
    /// the first allele value
    allele0: Vec<u8>,
    /// the second allele value
    allele1: Vec<u8>,

    //these only matter for multi-allelic sites; usize is "proper" type, but u8 will be nice and compact
    /// the index of allele0, typically 0 (REF)
    index_allele0: u8,
    /// the index of allele1, typically 1 (usually len(ALT) == 1, so it's 1)
    index_allele1: u8,

    // auxiliary booleans
    /// if true, flags this is a variant to ignore for _some_ reason
    is_ignored: bool
}

impl Variant {
    /// Creates a new single-nucleotide variant (SNV).
    /// For SNV variants, all alleles must be exactly 1 bp long.
    /// # Arguments
    /// * `vcf_index` - the index of the source VCF file
    /// * `position` - the coordinate of the variant in a contig
    /// * `allele0` - the first allele (usually REF)
    /// * `allele1` - the second allele (usually ALT[0])
    /// * `index_allele0` - the index for allele0, typically 0 for REF
    /// * `index_allele1` - the index for allele1, typically 1 for simple heterozygous variants
    /// # Panics
    /// * if `index_allele0 > index_allele1`
    /// * if the provided sequences do not match a single-nucleotide variant
    pub fn new_snv(vcf_index: usize, position: i64, allele0: Vec<u8>, allele1: Vec<u8>, index_allele0: u8, index_allele1: u8) -> Variant {
        // we always assume alleles come "sorted" and they are heterozygous
        assert!(index_allele0 < index_allele1);

        // SNV alleles must be length 1
        assert_eq!(allele0.len(), 1);
        assert_eq!(allele1.len(), 1);
        Variant {
            vcf_index,
            variant_type: VariantType::Snv,
            position,
            ref_len: 1,
            prefix_len: 0,
            postfix_len: 0,
            allele0,
            allele1,
            index_allele0,
            index_allele1,
            is_ignored: false
        }
    }

    /// Creates a new deletion variant.
    /// Deletions must have a REF allele longer than 1 bp, and all ALT alleles must be exactly 1 bp long.
    /// # Arguments
    /// * `vcf_index` - the index of the source VCF file
    /// * `position` - the coordinate of the variant in a contig
    /// * `ref_len` - the length of the reference allele
    /// * `allele0` - the first allele (usually REF)
    /// * `allele1` - the second allele (usually ALT[0])
    /// * `index_allele0` - the index for allele0, typically 0 for REF
    /// * `index_allele1` - the index for allele1, typically 1 for simple heterozygous variants
    /// # Panics
    /// * if `index_allele0 > index_allele1`
    /// * if the provided sequences do not match a deletion variant
    /// * if the reference allele is passed in and it does not have the same length as `ref_len`
    pub fn new_deletion(vcf_index: usize, position: i64, ref_len: usize, allele0: Vec<u8>, allele1: Vec<u8>, index_allele0: u8, index_allele1: u8) -> Variant {
        // we always assume alleles come "sorted" and they are heterozygous
        assert!(index_allele0 < index_allele1);

        // reference length must be greater than 1 to be a deletion
        assert!(ref_len > 1);

        if index_allele0 == 0 {
            // this allele is also the reference allele
            assert_eq!(allele0.len(), ref_len);
        } else {
            // this allele is not the reference, must be a multi-allelic site; but all deletion alts have len = 1
            assert!(allele0.len() == 1);
        }
        // this one must always be length 1
        assert!(allele1.len() == 1);

        // make sure the alleles start with the same thing
        // assert_eq!(allele0[0], allele1[0]);
        if allele0[0] != allele1[0] {
            /*
            Counter example to requiring alleleX[0] be equal; this is rare, but it does seem to happen
            chr12 117794450 . ACACACCAACATGCACACT T
            */
            trace!("Deletion alleles are unexpected: {position}, {ref_len}, {allele0:?}, {allele1:?}");
        }
        Variant {
            vcf_index,
            variant_type: VariantType::Deletion,
            position,
            ref_len,
            prefix_len: 0,
            postfix_len: 0,
            allele0,
            allele1,
            index_allele0,
            index_allele1,
            is_ignored: false
        }
    }

    /// Creates a new insertion variant.
    /// Insertions must have a REF allele exactly 1 bp long, and all ALT alleles must be longer than 1 bp.
    /// # Arguments
    /// * `vcf_index` - the index of the source VCF file
    /// * `position` - the coordinate of the variant in a contig
    /// * `allele0` - the first allele (usually REF)
    /// * `allele1` - the second allele (usually ALT[0])
    /// * `index_allele0` - the index for allele0, typically 0 for REF
    /// * `index_allele1` - the index for allele1, typically 1 for simple heterozygous variants
    /// # Panics
    /// * if `index_allele0 > index_allele1`
    /// * if the provided sequences do not match an insertion variant
    pub fn new_insertion(vcf_index: usize, position: i64, allele0: Vec<u8>, allele1: Vec<u8>, index_allele0: u8, index_allele1: u8) -> Variant {
        // we always assume alleles come "sorted" and they are heterozygous
        assert!(index_allele0 < index_allele1);

        if index_allele0 == 0 {
            // if reference allele is present, it must be length 1 for this type
            assert_eq!(allele0.len(), 1);
        } else {
            // allele0 isn't reference, so it must be >= 1 due to multi-allelics
            // chr1 2122634 . T C,TG 14.1
            assert!(!allele0.is_empty(), "{position} {allele0:?}");
        }
        // we have to do >= because of some multi-allelics:
        // chr1 286158 . A ATG,G 34.4
        assert!(!allele1.is_empty(), "{position} {allele1:?}");

        // make sure the alleles start with the same thing
        if allele0[0] != allele1[0] {
            // no counter example searched for yet, but probably exists, we'll leave this trace for now
            trace!("Insertion alleles are unexpected: {position}, 1, {allele0:?}, {allele1:?}");
        }
        Variant {
            vcf_index,
            variant_type: VariantType::Insertion,
            position,
            ref_len: 1,
            prefix_len: 0,
            postfix_len: 0,
            allele0,
            allele1,
            index_allele0,
            index_allele1,
            is_ignored: false
        }
    }

    /// Creates a new indel variant.
    /// All indels alleles must be more than 1 bp long.
    /// # Arguments
    /// * `vcf_index` - the index of the source VCF file
    /// * `position` - the coordinate of the variant in a contig
    /// * `ref_len` - the length of the reference allele
    /// * `allele0` - the first allele (usually REF)
    /// * `allele1` - the second allele (usually ALT[0])
    /// * `index_allele0` - the index for allele0, typically 0 for REF
    /// * `index_allele1` - the index for allele1, typically 1 for simple heterozygous variants
    /// # Panics
    /// * if `index_allele0 > index_allele1`
    /// * if the provided sequences do not match an indel variant
    /// * if the reference allele is passed in and it does not have the same length as `ref_len`
    pub fn new_indel(vcf_index: usize, position: i64, ref_len: usize, allele0: Vec<u8>, allele1: Vec<u8>, index_allele0: u8, index_allele1: u8) -> Variant {
        // we always assume alleles come "sorted" and they are heterozygous
        assert!(index_allele0 < index_allele1);

        // reference length must be greater than 1 to be an indel, but ALTs can really be any length after that (>=1 anyways)
        assert!(ref_len > 1);

        if index_allele0 == 0 {
            // this allele is also the reference allele
            assert_eq!(allele0.len(), ref_len);
        } else {
            // it's not a reference allele, since this is an indel, length can be anything >= 1
            assert!(!allele0.is_empty());
        }
        // this one just has to be >= 1
        assert!(!allele1.is_empty());

        // there's no real reason to believe in any shared sequence between alleles
        // we've seen it not work above, not worth even trying to codify warning here IMO
        // assert!(???)

        Variant {
            vcf_index,
            variant_type: VariantType::Indel,
            position,
            ref_len,
            prefix_len: 0,
            postfix_len: 0,
            allele0,
            allele1,
            index_allele0,
            index_allele1,
            is_ignored: false
        }
    }

    /// Creates a new SV deletion variant.
    /// SV deletions must have a REF allele longer than 1 bp, and all ALT alleles must be exactly 1 bp long.
    /// # Arguments
    /// * `vcf_index` - the index of the source VCF file
    /// * `position` - the coordinate of the variant in a contig
    /// * `ref_len` - the length of the reference allele
    /// * `allele0` - the first allele (usually REF)
    /// * `allele1` - the second allele (usually ALT[0])
    /// * `index_allele0` - the index for allele0, typically 0 for REF
    /// * `index_allele1` - the index for allele1, typically 1 for simple heterozygous variants
    /// # Panics
    /// * if `index_allele0 > index_allele1`
    /// * if the provided sequences do not match a deletion variant
    /// * if the reference allele is passed in and it does not have the same length as `ref_len`
    pub fn new_sv_deletion(vcf_index: usize, position: i64, ref_len: usize, allele0: Vec<u8>, allele1: Vec<u8>, index_allele0: u8, index_allele1: u8) -> Variant {
        // we always assume alleles come "sorted" and they are heterozygous
        assert!(index_allele0 < index_allele1);

        // this is one difference from plain Deletion
        assert_eq!(index_allele0, 0);
        assert_eq!(index_allele1, 1);

        // this allele is also the reference allele
        assert_eq!(allele0.len(), ref_len);

        // this one must always be length 1
        assert!(allele1.len() == 1);

        // make sure the alleles start with the same thing
        if allele0[0] != allele1[0] {
            /*
            Counter example to requiring alleleX[0] be equal; this is rare, but it does seem to happen
            chr12 117794450 . ACACACCAACATGCACACT T
            */
            trace!("Deletion alleles are unexpected: {}, {}, {:?}, {:?}", position, ref_len, allele0, allele1);
        }
        Variant {
            vcf_index,
            variant_type: VariantType::SvDeletion,
            position,
            ref_len,
            prefix_len: 0,
            postfix_len: 0,
            allele0,
            allele1,
            index_allele0,
            index_allele1,
            is_ignored: false
        }
    }

    /// Creates a new SV insertion variant.
    /// SV insertions must have a REF allele exactly 1 bp long, and all ALT alleles must be longer than 1 bp.
    /// # Arguments
    /// * `vcf_index` - the index of the source VCF file
    /// * `position` - the coordinate of the variant in a contig
    /// * `allele0` - the first allele (usually REF)
    /// * `allele1` - the second allele (usually ALT[0])
    /// * `index_allele0` - the index for allele0, typically 0 for REF
    /// * `index_allele1` - the index for allele1, typically 1 for simple heterozygous variants
    /// # Panics
    /// * if `index_allele0 > index_allele1`
    /// * if the provided sequences do not match an insertion variant
    pub fn new_sv_insertion(vcf_index: usize, position: i64, allele0: Vec<u8>, allele1: Vec<u8>, index_allele0: u8, index_allele1: u8) -> Variant {
        // we always assume alleles come "sorted" and they are heterozygous
        assert!(index_allele0 < index_allele1);

        // this is one difference from plain Insertion
        assert_eq!(index_allele0, 0);
        assert_eq!(index_allele1, 1);

        // if reference allele is present, it must be length 1 for this type
        assert_eq!(allele0.len(), 1);

        // we have to do >= because of some multi-allelics:
        // chr1 286158 . A ATG,G 34.4
        assert!(!allele1.is_empty(), "{position} {allele1:?}");

        // make sure the alleles start with the same thing
        if allele0[0] != allele1[0] {
            // no counter example searched for yet, but probably exists, we'll leave this trace for now
            trace!("Insertion alleles are unexpected: {}, {}, {:?}, {:?}", position, 1, allele0, allele1);
        }
        Variant {
            vcf_index,
            variant_type: VariantType::SvInsertion,
            position,
            ref_len: 1,
            prefix_len: 0,
            postfix_len: 0,
            allele0,
            allele1,
            index_allele0,
            index_allele1,
            is_ignored: false
        }
    }

    /// Creates a new tandem repeat variant, functionally these act very similar to indel types.
    /// All tandem repeat alleles must be at least 1 bp long by VCF definition.
    /// # Arguments
    /// * `vcf_index` - the index of the source VCF file
    /// * `position` - the coordinate of the variant in a contig
    /// * `ref_len` - the length of the reference allele
    /// * `allele0` - the first allele (usually REF)
    /// * `allele1` - the second allele (usually ALT[0])
    /// * `index_allele0` - the index for allele0, typically 0 for REF
    /// * `index_allele1` - the index for allele1, typically 1 for simple heterozygous variants
    /// # Panics
    /// * if `index_allele0 > index_allele1`
    /// * if the provided sequences do not match a tandem repeat variant
    /// * if the reference allele is passed in and it does not have the same length as `ref_len`
    pub fn new_tandem_repeat(vcf_index: usize, position: i64, ref_len: usize, allele0: Vec<u8>, allele1: Vec<u8>, index_allele0: u8, index_allele1: u8) -> Variant {
        // we always assume alleles come "sorted" and they are heterozygous
        assert!(index_allele0 < index_allele1);

        // all alleles must be >= 1 for tandem repeats, most are longer though
        assert!(ref_len >= 1);

        if index_allele0 == 0 {
            // this allele is also the reference allele
            assert_eq!(allele0.len(), ref_len);
        } else {
            // it's not a reference allele, since this is an indel, length can be anything >= 1
            assert!(!allele0.is_empty());
        }
        // this one just has to be >= 1
        assert!(!allele1.is_empty());

        Variant {
            vcf_index,
            variant_type: VariantType::TandemRepeat,
            position,
            ref_len,
            prefix_len: 0,
            postfix_len: 0,
            allele0,
            allele1,
            index_allele0,
            index_allele1,
            is_ignored: false
        }
    }

    /// This will add a prefix to each allele, generally reference genome sequence that will allow for better matching.
    /// # Arguments
    /// * `prefix` - the sequence to pre-pend to each allele
    pub fn add_reference_prefix(&mut self, prefix: &[u8]) {
        // make sure we don't set our reference start coordinate to less than 0
        let prefix_len: usize = prefix.len();
        assert!(prefix_len <= self.position as usize - self.prefix_len);

        // allele0, pre-pend is basically copy
        let mut new_allele0: Vec<u8> = Vec::with_capacity(self.allele0.len()+prefix_len);
        new_allele0.extend_from_slice(prefix);
        new_allele0.extend_from_slice(&self.allele0);
        self.allele0 = new_allele0;

        // same for allele1
        let mut new_allele1: Vec<u8> = Vec::with_capacity(self.allele1.len()+prefix_len);
        new_allele1.extend_from_slice(prefix);
        new_allele1.extend_from_slice(&self.allele1);
        self.allele1 = new_allele1;

        // finally, adjust the start coordinates
        self.prefix_len += prefix_len;
    }

    /// This will add a postfix to each allele, generally reference genome sequence that will allow for better matching.
    /// # Arguments
    /// * `postfix` - the sequence to append to each allele
    pub fn add_reference_postfix(&mut self, postfix: &[u8]) {
        // easier operation, just extend the existing vecs
        self.allele0.extend_from_slice(postfix);
        self.allele1.extend_from_slice(postfix);

        // finally, adjust the end coordinates
        self.postfix_len += postfix.len();
    }

    /// This will trim the postfix down to a smaller size.
    /// # Arguments
    /// * `truncate_amount` - the number of postfix bases to remove from the end of each allele
    pub fn truncate_reference_postfix(&mut self, truncate_amount: usize) {
        // sanity check that we are only truncating the postfix
        assert!(truncate_amount <= self.postfix_len);

        // truncate the alleles and shrink the postfix size
        self.allele0.truncate(self.allele0.len() - truncate_amount);
        self.allele1.truncate(self.allele1.len() - truncate_amount);
        self.postfix_len -= truncate_amount;
    }

    pub fn get_vcf_index(&self) -> usize {
        self.vcf_index
    }

    pub fn get_type(&self) -> VariantType {
        self.variant_type
    }

    pub fn position(&self) -> i64 {
        self.position
    }

    pub fn get_ref_len(&self) -> usize {
        self.ref_len
    }

    pub fn get_prefix_len(&self) -> usize {
        self.prefix_len
    }

    pub fn get_postfix_len(&self) -> usize {
        self.postfix_len
    }

    pub fn get_allele0(&self) -> &[u8] {
        &self.allele0
    }

    pub fn get_allele1(&self) -> &[u8] {
        &self.allele1
    }

    pub fn is_ignored(&self) -> bool {
        self.is_ignored
    }

    pub fn set_ignored(&mut self) {
        self.is_ignored = true;
    }

    /// Returns allele0 with any added reference prefix/postfix stripped off
    pub fn get_truncated_allele0(&self) -> &[u8] {
        let start: usize = self.prefix_len;
        let end: usize = self.allele0.len() - self.postfix_len;
        &self.allele0[start..end]
    }

    /// Returns allele1 with any added reference prefix/postfix stripped off
    pub fn get_truncated_allele1(&self) -> &[u8] {
        let start: usize = self.prefix_len;
        let end: usize = self.allele1.len() - self.postfix_len;
        &self.allele1[start..end]
    }

    /// This will determine the best matching allele (0 or 1) or return 2 if neither match.
    /// Primary purpose of this is to convert all variant observations into a 0/1 scheme.
    /// This method requires an exact match of the allele.
    /// # Arguments
    /// * `allele` - the allele that needs to get converted to a 0 or 1 (or 2 if neither match)
    pub fn match_allele(&self, allele: &[u8]) -> u8 {
        if allele == &self.allele0[..] {
            0
        } else if allele == &self.allele1[..] {
            1
        } else {
            2
        }
    }

    /// This will determine the closest matching allele (0 or 1) based on edit distance, or return 2 if they are equi-distant.
    /// This method does not require an exact match of the alleles.
    /// Returns a tuple of the (allele chosen, min edit distance, other edit distance).
    /// # Arguments
    /// * `allele` - the allele sequence to compare to our internal alleles
    pub fn closest_allele(&self, allele: &[u8]) -> (u8, usize, usize) {
        self.closest_allele_clip(allele, 0, 0)
    }

    /// This will determine the closest matching allele (0 or 1) based on edit distance, or return 2 if they are equi-distant.
    /// This method does not require an exact match of the alleles, and allows for you to clip bases on the internal allele sequence.
    /// This is most useful when you have to clip the provided allele due to incomplete matching.
    /// Returns a tuple of the (allele chosen, min edit distance, other edit distance).
    /// # Arguments
    /// * `allele` - the allele sequence to compare to our internal alleles
    /// * `head_clip` - number of bases clipped off the start of each internal allele (must be <= prefix_len)
    /// * `tail_clip` - number of bases clipped off the end of each internal allele (must be <= postfix_len)
    pub fn closest_allele_clip(&self, allele: &[u8], head_clip: usize, tail_clip: usize) -> (u8, usize, usize) {
        assert!(head_clip <= self.prefix_len);
        assert!(tail_clip <= self.postfix_len);
        let d0: usize = edit_distance(allele, &self.allele0[head_clip..(self.allele0.len() - tail_clip)]);
        let d1: usize = edit_distance(allele, &self.allele1[head_clip..(self.allele1.len() - tail_clip)]);
        trace!("clipping: {} {}", head_clip, tail_clip);
        trace!("obs{:?}", allele);
        trace!("a0 {:?} => {}", &self.allele0[head_clip..(self.allele0.len() - tail_clip)], d0);
        trace!("a1 {:?} => {}", &self.allele1[head_clip..(self.allele1.len() - tail_clip)], d1);
        match d0.cmp(&d1) {
            // d0 is less, return that
            Ordering::Less => (0, d0, d1),
            // d1 is less, return that
            Ordering::Greater => (1, d1, d0),
            // equidistant, so undetermined
            Ordering::Equal => (2, d0, d1)
        }
    }

    /// This will return the index allele for a given haplotype index.
    /// Input must always be 0 or 1, but it might get converted to something else at multi-allelic sites.
    /// # Arguments
    /// * `index` - must be 0 or 1
    pub fn convert_index(&self, index: u8) -> u8 {
        if index == 0 {
            self.index_allele0
        } else if index == 1 {
            self.index_allele1
        } else if index == 2 {
            // we just need some indicator that it's undetermined, this will work for now
            u8::MAX
        } else {
            panic!("index must be 0, 1, or 2");
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_snv() {
        let variant = Variant::new_snv(
            0, 1,
            b"A".to_vec(), b"C".to_vec(),
            0, 1
        );
        assert_eq!(variant.get_type(), VariantType::Snv);
        assert_eq!(variant.position(), 1);
        assert_eq!(variant.get_ref_len(), 1);
        assert_eq!(variant.match_allele(b"A"), 0);
        assert_eq!(variant.match_allele(b"C"), 1);
        assert_eq!(variant.match_allele(b"G"), 2);
        assert_eq!(variant.match_allele(b"T"), 2);
        assert_eq!(variant.convert_index(0), 0);
        assert_eq!(variant.convert_index(1), 1);
        assert_eq!(variant.convert_index(2), u8::MAX);
    }

    #[test]
    fn test_basic_deletion() {
        // this is the deletion we mostly expect
        let variant = Variant::new_deletion(
            0, 10, 3,
            b"AGT".to_vec(), b"A".to_vec(),
            0, 1
        );
        assert_eq!(variant.get_type(), VariantType::Deletion);
        assert_eq!(variant.position(), 10);
        assert_eq!(variant.get_ref_len(), 3);
        assert_eq!(variant.match_allele(b"AGT"), 0);
        assert_eq!(variant.match_allele(b"A"), 1);
        assert_eq!(variant.match_allele(b"AG"), 2);

        // multi-allelic deletion, must still be length 1 though
        let variant = Variant::new_deletion(
            0, 10, 4,
            b"C".to_vec(), b"A".to_vec(),
            1, 2
        );
        assert_eq!(variant.get_type(), VariantType::Deletion);
        assert_eq!(variant.position(), 10);
        assert_eq!(variant.get_ref_len(), 4);
        assert_eq!(variant.match_allele(b"ACCC"), 2);
        assert_eq!(variant.match_allele(b"C"), 0);
        assert_eq!(variant.match_allele(b"A"), 1);
        assert_eq!(variant.convert_index(0), 1);
        assert_eq!(variant.convert_index(1), 2);
    }

    #[test]
    fn test_basic_insertion() {
        let variant = Variant::new_insertion(
            0, 20,
            b"A".to_vec(), b"AGT".to_vec(),
            0, 1
        );
        assert_eq!(variant.get_type(), VariantType::Insertion);
        assert_eq!(variant.position(), 20);
        assert_eq!(variant.get_ref_len(), 1);
        assert_eq!(variant.match_allele(b"A"), 0);
        assert_eq!(variant.match_allele(b"AGT"), 1);
        assert_eq!(variant.match_allele(b"AG"), 2);
    }

    #[test]
    fn test_basic_indel() {
        // models AG -> A / AGT
        let variant = Variant::new_indel(
            0, 20, 2,
            b"A".to_vec(), b"AGT".to_vec(),
            1, 2
        );
        assert_eq!(variant.get_type(), VariantType::Indel);
        assert_eq!(variant.position(), 20);
        assert_eq!(variant.get_ref_len(), 2);
        assert_eq!(variant.match_allele(b"A"), 0);
        assert_eq!(variant.match_allele(b"AGT"), 1);
        assert_eq!(variant.match_allele(b"AG"), 2);
    }

    #[test]
    fn test_sv_insertion() {
        let variant = Variant::new_sv_insertion(
            0, 20,
            b"A".to_vec(), b"AGT".to_vec(),
            0, 1
        );
        assert_eq!(variant.get_type(), VariantType::SvInsertion);
        assert_eq!(variant.position(), 20);
        assert_eq!(variant.get_ref_len(), 1);

        // TODO: replace this with the matching we will do with SVs
        assert_eq!(variant.match_allele(b"A"), 0);
        assert_eq!(variant.match_allele(b"AGT"), 1);
        assert_eq!(variant.match_allele(b"AG"), 2);
    }

    #[test]
    fn test_sv_deletion() {
        let variant = Variant::new_sv_deletion(
            0, 10, 3,
            b"AGT".to_vec(), b"A".to_vec(),
            0, 1
        );
        assert_eq!(variant.get_type(), VariantType::SvDeletion);
        assert_eq!(variant.position(), 10);
        assert_eq!(variant.get_ref_len(), 3);

        // TODO: replace this with the matching we will do with SVs
        assert_eq!(variant.match_allele(b"AGT"), 0);
        assert_eq!(variant.match_allele(b"A"), 1);
        assert_eq!(variant.match_allele(b"AG"), 2);
    }

    #[test]
    fn test_tandem_repeat() {
        let variant = Variant::new_tandem_repeat(
            0, 10, 4,
            b"AAAC".to_vec(),
            b"AAACAAAC".to_vec(),
            0, 1
        );
        assert_eq!(variant.get_type(), VariantType::TandemRepeat);
        assert_eq!(variant.position(), 10);
        assert_eq!(variant.get_ref_len(), 4);

        assert_eq!(variant.match_allele(b"AAAC"), 0);
        assert_eq!(variant.match_allele(b"AAACAAAC"), 1);
        assert_eq!(variant.match_allele(b"AAACAA"), 2);
    }

    #[test]
    fn test_reference_adjustment() {
        // models AG -> A / AGT
        let mut variant = Variant::new_indel(
            0, 20, 2,
            b"A".to_vec(), b"AGT".to_vec(),
            1, 2
        );

        // make sure no fixins yet
        assert_eq!(variant.get_prefix_len(), 0);
        assert_eq!(variant.get_postfix_len(), 0);

        let prefix: Vec<u8> = b"AC".to_vec();
        variant.add_reference_prefix(&prefix);
        let postfix: Vec<u8> = b"GGCC".to_vec();
        variant.add_reference_postfix(&postfix);

        assert_eq!(variant.get_truncated_allele0(), b"A");
        assert_eq!(variant.get_truncated_allele1(), b"AGT");

        // trims off the extra 'C' we added
        variant.truncate_reference_postfix(1);

        // make sure nothing here changes
        assert_eq!(variant.get_type(), VariantType::Indel);
        assert_eq!(variant.position(), 20);
        assert_eq!(variant.get_ref_len(), 2);

        // check this new stuff
        assert_eq!(variant.get_prefix_len(), 2);
        assert_eq!(variant.get_postfix_len(), 3);

        // original alleles will not match exactly anymore
        assert_eq!(variant.match_allele(b"A"), 2);
        assert_eq!(variant.match_allele(b"AGT"), 2);
        assert_eq!(variant.match_allele(b"AG"), 2);

        // inexact without the reference data will return weird results
        assert_eq!(variant.closest_allele(b"A"), (0, 5, 7));
        assert_eq!(variant.closest_allele(b"AGT"), (0, 4, 5));
        assert_eq!(variant.closest_allele(b"AG"), (0, 4, 6));

        // now lets inexact with the extensions
        assert_eq!(variant.closest_allele(b"ACAGGC"), (0, 0, 2));
        assert_eq!(variant.closest_allele(b"ACAGTGGC"), (1, 0, 2));
        assert_eq!(variant.closest_allele(b"ACAGGGC"), (2, 1, 1));
    }
}
diff
--git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..66eff21 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,19 @@ + +/// A*-based phasing implementation +pub mod astar_phaser; +/// Functionality that iterates over VCF and BAM to form the prototype phase blocks +pub mod block_gen; +/// CLI functionality and checks +pub mod cli; +/// Contains multiple wrappers for useful data types in HiPhase +pub mod data_types; +/// Organizes primary workflow for a phase block including loading variants from VCF, loading reads from BAMs, running the phaser, and bundling the results +pub mod phaser; +/// Components for loading reads from a BAM file and converting them into haplotype observations +pub mod read_parsing; +/// Basic helpful utilities for pairwise sequence alignment +pub mod sequence_alignment; +/// Graph-based WFA - this is basically POA + WFA, but only allowing for measuring edit distance and no loops +pub mod wfa_graph; +/// Contains all the various output writer functionality +pub mod writers; diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..ee6b876 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,639 @@ + +use hiphase::block_gen::{MultiPhaseBlockIterator, PhaseBlockIterator, get_vcf_samples, get_sample_bams}; +use hiphase::cli::{Settings,check_settings,get_raw_settings}; +use hiphase::data_types::reference_genome::ReferenceGenome; +use hiphase::phaser::{HaplotagResult, PhaseResult, solve_block, singleton_block}; +use hiphase::writers::block_stats::BlockStatsCollector; +use hiphase::writers::haplotag_writer::HaplotagWriter; +use hiphase::writers::ordered_bam_writer::OrderedBamWriter; +use hiphase::writers::ordered_vcf_writer::OrderedVcfWriter; +use hiphase::writers::phase_stats::StatsWriter; +use hiphase::writers::vcf_util::build_bcf_index; + +use log::{LevelFilter, debug, error, info, warn}; +use rustc_hash::FxHashMap as HashMap; +use std::path::PathBuf; +use std::sync::{Arc, mpsc}; +use std::time::Instant; +use threadpool::ThreadPool; 
+ +fn main() { + // get the settings + let settings: Settings = get_raw_settings(); + let filter_level: LevelFilter = match settings.verbosity { + 0 => LevelFilter::Info, + 1 => LevelFilter::Debug, + _ => LevelFilter::Trace + }; + + // immediately setup logging first + env_logger::builder() + .format_timestamp_millis() + .filter_level(filter_level) + .init(); + + // okay, now we can check all the other settings + let cli_settings: Settings = check_settings(settings); + + // first we need to figure out which samples are getting phased + let mut sample_names: Vec = cli_settings.sample_names.clone(); + if sample_names.is_empty() { + // no samples were provided, so add the first one encountered + // we need to just infer that we're phasing the first one only for now + let all_sample_names = match get_vcf_samples(&cli_settings.vcf_filenames[0]) { + Ok(v) => v, + Err(e) => { + error!("Error during VCF sample name parsing: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + + // some warnings as needed + if all_sample_names.len() > 1 { + warn!("Multi-sample VCF detected, but sample name was not provided. Assuming name is {:?}.", all_sample_names[0]); + } else { + debug!("Single-sample VCF detected, but sample name was not provided. 
Assuming name is {:?}.", all_sample_names[0]); + } + sample_names.push(all_sample_names[0].clone()); + } + + // if we are ignoring read groups, we need to verify only one sample is in use + if cli_settings.ignore_read_groups && sample_names.len() > 1 { + error!("Flag --ignore-read-groups cannot be used in conjuction with multiple sample names, either add read groups or run one sample name at a time."); + std::process::exit(exitcode::USAGE); + } + + // shared thread pool for bam IO + let bam_thread_pool = match rust_htslib::tpool::ThreadPool::new(cli_settings.io_threads.unwrap() as u32) { + Ok(btp) => btp, + Err(e) => { + error!("Error while starting thread pool: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + + //here's where the fun starts + //generate blocks + let mut block_iterators: Vec = vec![]; + let mut all_used_bams = vec![]; + let mut sample_to_bams: HashMap> = Default::default(); + let mut sample_to_output_bams: HashMap> = Default::default(); + for sample_name in sample_names.iter() { + // figure out which BAMS go with the given sample + let (mut sample_bams, bam_indices) = if cli_settings.ignore_read_groups { + // if we are ignoring read groups, then we use all bams (and all indices) + ( + cli_settings.bam_filenames.clone(), + (0..cli_settings.bam_filenames.len()).collect() + ) + } else { + match get_sample_bams(&cli_settings.bam_filenames, sample_name, &cli_settings.reference_filename) { + Ok(sb) => sb, + Err(e) => { + error!("Error during BAM read group parsing: {}", e); + std::process::exit(exitcode::IOERR); + } + } + }; + sample_to_bams.insert(sample_name.clone(), sample_bams.clone()); + + // make a phase block iterator using just the sample-specific bams + let block_iterator: PhaseBlockIterator = match PhaseBlockIterator::new( + &cli_settings.vcf_filenames, + &sample_bams, + &cli_settings.reference_filename, + sample_name.clone(), + cli_settings.min_variant_quality, + cli_settings.min_mapping_quality, + cli_settings.min_spanning_reads, + 
!cli_settings.disable_supplemental_joins, + &bam_thread_pool + ) { + Ok(bi) => bi, + Err(e) => { + error!("Error during file loading: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + + // add the iterator to our list to put together + block_iterators.push(block_iterator); + + // also save the used bams, we will check these soon + all_used_bams.append(&mut sample_bams); + + // check if we need to save names for BAM writing + if !cli_settings.output_bam_filenames.is_empty() { + let mut sample_output_bams = vec![]; + for &b_index in bam_indices.iter() { + sample_output_bams.push(cli_settings.output_bam_filenames[b_index].clone()); + } + sample_to_output_bams.insert(sample_name.clone(), sample_output_bams); + } + } + + if cli_settings.bam_filenames.len() != all_used_bams.len() { + let num_provided = cli_settings.bam_filenames.len(); + let num_used = all_used_bams.len(); + error!("User provided {} BAM files, but only {} matched samples for phasing", num_provided, num_used); + error!("Please remove extra BAM files or add additional samples, BAMs matching phasing: {:?}", all_used_bams); + std::process::exit(exitcode::IOERR); + } + + // create our joint iterator + let mut block_iterator: MultiPhaseBlockIterator = match MultiPhaseBlockIterator::new(block_iterators) { + Ok(mpbi) => mpbi, + Err(e) => { + error!("Error during phase block iterator creation: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + + // this writer will write "in-order" provided we correctly pass the ordering of data to it + let mut vcf_writer: OrderedVcfWriter = match OrderedVcfWriter::new( + &cli_settings.vcf_filenames, + &cli_settings.output_vcf_filenames, + cli_settings.min_variant_quality, + &sample_names + ) { + Ok(vw) => vw, + Err(e) => { + error!("Error during VCF writer creation: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + + // this write will write reads "in-order" provided we correctly pass the ordering of data to it + let mut opt_bam_writers: Option> = if 
cli_settings.output_bam_filenames.is_empty() { + None + } else { + let mut writer_map: HashMap = Default::default(); + for sample_name in sample_names.iter() { + let sample_bams = sample_to_bams.get(sample_name).unwrap(); + let sample_output_bams = sample_to_output_bams.get(sample_name).unwrap(); + writer_map.insert( + sample_name.clone(), + match OrderedBamWriter::new( + sample_name.clone(), + &cli_settings.reference_filename, + sample_bams, + sample_output_bams, + &bam_thread_pool + ) { + Ok(bw) => bw, + Err(e) => { + error!("Error during BAM writer creation: {}", e); + std::process::exit(exitcode::IOERR); + } + } + ); + } + Some(writer_map) + }; + + // create our stats file also + let mut stats_writer: Option = match cli_settings.stats_filename { + Some(ref filename) => { + match StatsWriter::new(filename) { + Ok(sw) => Some(sw), + Err(e) => { + error!("Error during statistics writer creation: {}", e); + std::process::exit(exitcode::IOERR); + } + } + }, + None => None + }; + + // create our block stats collector + let mut block_collector: BlockStatsCollector = BlockStatsCollector::new(); + + let skip_count = cli_settings.skip_blocks; + let take_count = cli_settings.take_blocks; + let debug_run: bool = if skip_count != 0 || take_count != usize::MAX { + warn!("Debug run detected, disabling file finalizing steps."); + warn!("Blocks to skip: {}", skip_count); + warn!("Blocks to process: {}", take_count); + true + } else { + false + }; + + // create our haplotag file if necessary + let mut haplotag_writer: Option = match cli_settings.haplotag_filename { + Some(ref filename) => { + match HaplotagWriter::new(filename) { + Ok(hw) => Some(hw), + Err(e) => { + error!("Error during haplotag writer creations: {}", e); + std::process::exit(exitcode::IOERR); + } + } + }, + None => None + }; + + // controls whether singletons are deeply run, including haplotagging + let phase_singletons: bool = cli_settings.phase_singletons; + + // get our reference genome if we have one + let 
reference_genome: ReferenceGenome = match ReferenceGenome::from_fasta(&cli_settings.reference_filename) { + Ok(rg) => rg, + Err(e) => { + error!("Error during reference loading: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + + // we have to do this because we need access to the reference genome later also + let arc_reference_genome: Arc = Arc::new(reference_genome); + + //process the blocks (eventually in parallel) + let start_time: Instant = Instant::now(); + let mut total_variants: u64 = 0; + let mut results_received: u64 = 0; + + // values related to printing + const UPDATE_SPEED: u64 = 100; + + if cli_settings.threads <= 1 { + for (i, block_result) in block_iterator.by_ref().enumerate().skip(skip_count).take(take_count) { + let block = match block_result { + Ok(b) => b, + Err(e) => { + error!("Error while parsing VCF file: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + debug!("block {}: {:?} {}", i, block, block.bp_len()); + + // we likely need to separate out the phase result from the haplotag result + let sample_bams = sample_to_bams.get(block.sample_name()).unwrap(); + let (phase_result, haplotag_result): (PhaseResult, HaplotagResult) = if phase_singletons || block.get_num_variants() > 1 { + match solve_block( + &block, + &cli_settings.vcf_filenames, + sample_bams, + &arc_reference_genome, + cli_settings.reference_buffer, + cli_settings.min_matched_alleles, + cli_settings.min_mapping_quality, + cli_settings.global_realign_cputime, + cli_settings.phase_min_queue_size, + cli_settings.phase_queue_increment, + cli_settings.wfa_prune_distance + ) { + Ok(r) => r, + Err(e) => { + error!("Error while processing {:?}:", block); + error!(" {}", e); + std::process::exit(exitcode::SOFTWARE); + } + } + } else { + singleton_block(&block) + }; + + // this is only for printing + total_variants += phase_result.phase_block.get_num_variants() as u64; + results_received += 1; + + process_results( + phase_result, haplotag_result, + &mut stats_writer, &mut 
block_collector, &mut haplotag_writer, + &mut vcf_writer, &mut opt_bam_writers, + ); + + if results_received % UPDATE_SPEED == 0 { + let time_so_far: f64 = start_time.elapsed().as_secs_f64(); + let blocks_per_sec: f64 = results_received as f64 / time_so_far; + let variants_per_sec: f64 = total_variants as f64 / time_so_far; + info!("Received results for {} phase blocks: {:.4} blocks/sec, {:.4} hets/sec, writer waiting on block {}", results_received, blocks_per_sec, variants_per_sec, vcf_writer.get_wait_block()); + } + } + } else { + //set up job configuration + info!("Starting job pool with {} threads...", cli_settings.threads); + let job_slots: u64 = 40 * cli_settings.threads as u64; + let mut jobs_queued: u64 = 0; + + //we need to set up the multiprocessing components now + let pool = ThreadPool::new(cli_settings.threads); + let (tx, rx) = mpsc::channel(); + let arc_cli_settings: Arc = Arc::new(cli_settings.clone()); + let arc_sample_to_bams = Arc::new(sample_to_bams.clone()); + + for (i, block_result) in block_iterator.by_ref().enumerate().skip(skip_count).take(take_count) { + // make sure no panics encountered so far + if pool.panic_count() > 0 { + error!("Panic detected in ThreadPool, check above for details."); + std::process::exit(exitcode::SOFTWARE); + } + + if jobs_queued - results_received >= job_slots { + let (phase_result, haplotag_result): (PhaseResult, HaplotagResult) = rx.recv().unwrap(); + + // this is only for printing + total_variants += phase_result.phase_block.get_num_variants() as u64; + results_received += 1; + + process_results( + phase_result, haplotag_result, + &mut stats_writer, &mut block_collector, &mut haplotag_writer, + &mut vcf_writer, &mut opt_bam_writers + ); + + if results_received % UPDATE_SPEED == 0 { + let time_so_far: f64 = start_time.elapsed().as_secs_f64(); + let blocks_per_sec: f64 = results_received as f64 / time_so_far; + let variants_per_sec: f64 = total_variants as f64 / time_so_far; + info!("Received results for {} 
phase blocks: {:.4} blocks/sec, {:.4} hets/sec, writer waiting on block {}", results_received, blocks_per_sec, variants_per_sec, vcf_writer.get_wait_block()); + } + } + + let block = match block_result { + Ok(b) => b, + Err(e) => { + error!("Error while parsing VCF file: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + debug!("block {}: {:?} {}", i, block, block.bp_len()); + + jobs_queued += 1; + if jobs_queued % UPDATE_SPEED == 0 { + info!("Generated {} phase blocks, latest block: {:?}", jobs_queued, block); + } + + if phase_singletons || block.get_num_variants() > 1 { + let tx = tx.clone(); + let arc_cli_settings = arc_cli_settings.clone(); + let arc_reference_genome = arc_reference_genome.clone(); + let arc_sample_to_bams = arc_sample_to_bams.clone(); + + pool.execute(move|| { + let sample_bams = arc_sample_to_bams.get(block.sample_name()).unwrap(); + // dynamic errors cannot be sent via mpsc, so we need to handle errors here + let all_results = match solve_block( + &block, + &arc_cli_settings.vcf_filenames, + sample_bams, + &arc_reference_genome, + arc_cli_settings.reference_buffer, + arc_cli_settings.min_matched_alleles, + arc_cli_settings.min_mapping_quality, + arc_cli_settings.global_realign_cputime, + arc_cli_settings.phase_min_queue_size, + arc_cli_settings.phase_queue_increment, + arc_cli_settings.wfa_prune_distance + ) { + Ok(r) => r, + Err(e) => { + error!("Error while processing {:?}:", block); + error!(" {}", e); + std::process::exit(exitcode::SOFTWARE); + } + }; + tx.send(all_results).expect("channel will be there waiting for the pool"); + }); + } else { + // this is a singleton we can short-circuit here + let (phase_result, haplotag_result): (PhaseResult, HaplotagResult) = + singleton_block(&block); + + // this is only for printing + total_variants += phase_result.phase_block.get_num_variants() as u64; + results_received += 1; + + process_results( + phase_result, haplotag_result, + &mut stats_writer, &mut block_collector, &mut 
haplotag_writer, + &mut vcf_writer, &mut opt_bam_writers + ); + + if results_received % UPDATE_SPEED == 0 { + let time_so_far: f64 = start_time.elapsed().as_secs_f64(); + let blocks_per_sec: f64 = results_received as f64 / time_so_far; + let variants_per_sec: f64 = total_variants as f64 / time_so_far; + info!("Received results for {} phase blocks: {:.4} blocks/sec, {:.4} hets/sec, writer waiting on block {}", results_received, blocks_per_sec, variants_per_sec, vcf_writer.get_wait_block()); + } + } + } + + while results_received < jobs_queued { + // make sure no panics encountered so far + // TODO: if we hit deadlocks from panics, we may need to add this with some sort of suitable timeout: + // https://doc.rust-lang.org/std/sync/mpsc/struct.Receiver.html#method.recv_timeout + if pool.panic_count() > 0 { + error!("Panic detected in ThreadPool, check above for details."); + std::process::exit(exitcode::SOFTWARE); + } + + let (phase_result, haplotag_result): (PhaseResult, HaplotagResult) = rx.recv().unwrap(); + + // this is only for printing + total_variants += phase_result.phase_block.get_num_variants() as u64; + results_received += 1; + + process_results( + phase_result, haplotag_result, + &mut stats_writer, &mut block_collector, &mut haplotag_writer, + &mut vcf_writer, &mut opt_bam_writers + ); + + // do an update if we're on the mod of our speed OR it's the last one for a thread + if results_received % UPDATE_SPEED == 0 || (jobs_queued - results_received) < cli_settings.threads as u64 { + let time_so_far: f64 = start_time.elapsed().as_secs_f64(); + let blocks_per_sec: f64 = results_received as f64 / time_so_far; + let variants_per_sec: f64 = total_variants as f64 / time_so_far; + info!("Received results for {} / {} phase blocks: {:.4} blocks/sec, {:.4} hets/sec, writer waiting on block {}", results_received, jobs_queued, blocks_per_sec, variants_per_sec, vcf_writer.get_wait_block()); + } + } + } + + // if we are only doing partial files, this will not behave, so 
skip it + if !debug_run { + // we call this once at the end + match vcf_writer.write_to_end_position() { + Ok(()) => {}, + Err(e) => { + error!("Error while finalizing VCF chromosomes: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + } + + // now we drop the VCF writer, this is to close out all the VCF files before indexing + std::mem::drop(vcf_writer); + for vcf_fn in cli_settings.output_vcf_filenames.iter() { + match build_bcf_index(vcf_fn, None, cli_settings.threads as u32, true) { + Ok(()) => { + info!("Finished building index for {:?}.", vcf_fn); + }, + Err(e) => { + error!("Error while building index for {:?}: {}", vcf_fn, e); + std::process::exit(exitcode::IOERR); + } + }; + } + + if let Some(bam_writers) = opt_bam_writers.as_mut() { + // if we are only doing partial files, this will not behave, so skip it + if !debug_run { + for bam_writer in bam_writers.values_mut() { + // first finalize whichever chromosome we were on + match bam_writer.finalize_chromosome() { + Ok(()) => {}, + Err(e) => { + error!("Error while finalizing BAM chromosomes: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + + // copy reads from all the remaining chromosomes + match bam_writer.copy_remaining_chromosomes() { + Ok(()) => {}, + Err(e) => { + error!("Error while copying all remaining chromosomes: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + } + } + + // now we need to drop the bam writer, this is to close out all the BAM files before indexing + std::mem::drop(opt_bam_writers); + + // index the BAM files with .bai files + for bam_fn in cli_settings.output_bam_filenames.iter() { + match rust_htslib::bam::index::build( + bam_fn, + None, + rust_htslib::bam::index::Type::Bai, + cli_settings.threads as u32 + ) { + Ok(()) => { + info!("Finished building index for {:?}.", bam_fn); + }, + Err(e) => { + error!("Error while building index for {:?}: {}", bam_fn, e); + std::process::exit(exitcode::IOERR); + } + }; + } + } + + if let Some(ref filename) = 
cli_settings.blocks_filename { + // this will save all block information to a csv/tsv file + info!("Saving all blocks to {:?}...", filename); + match block_collector.write_blocks(filename) { + Ok(()) => {}, + Err(e) => { + error!("Error while writing blocks file: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + } + + if let Some(ref filename) = cli_settings.summary_filename { + // this will save chromosome level stats to a csv/tsv file + info!("Saving summary block statistics to {:?}...", filename); + match block_collector.write_block_stats( + &sample_names, filename, &arc_reference_genome, + block_iterator.variant_stats() + ) { + Ok(()) => {}, + Err(e) => { + error!("Error while writing summary statistics file: {}", e); + std::process::exit(exitcode::IOERR); + } + } + } + + info!("All phase blocks finished successfully after {} seconds.", start_time.elapsed().as_secs_f64()); +} + +/// Sub-routine to make sure we are always consistently processing results in an identical manner +/// This is mostly because I got tired of forgetting to change things in 3 places +/// # Argument +/// * `phase_result` - the phasing result from our algorithm +/// * `haplotag_result` - the haplotag result, does nothing if we are not haplotagging +/// * `opt_stats_writer` - mutable, optional reference to our algorithm stats writer +/// * `block_collector` - mutable reference to the block stats collector +/// * `opt_haplotag_writer` - mutable, optional reference to our haplotag writer +/// * `vcf_writer` - mutable reference to our VCF writer +/// * `opt_bam_writers` - mutable, optional reference to the BAM writers for haplotagging +fn process_results( + phase_result: PhaseResult, haplotag_result: HaplotagResult, + opt_stats_writer: &mut Option, block_collector: &mut BlockStatsCollector, + opt_haplotag_writer: &mut Option, + vcf_writer: &mut OrderedVcfWriter, opt_bam_writers: &mut Option> +) { + // common debug statements + debug!("block {} haplotypes:", 
phase_result.phase_block.get_block_index()); + debug!("{:?}", phase_result.haplotype_1); + debug!("{:?}", phase_result.haplotype_2); + + // write the stats if we have both a writer and a stats block + if let Some(stats_writer) = opt_stats_writer.as_mut() { + match stats_writer.write_stats(&phase_result) { + Ok(()) => {}, + Err(e) => { + error!("Error while writing statistics file: {}", e); + std::process::exit(exitcode::IOERR); + } + } + }; + + // save all the blocks here + for sub_block in phase_result.sub_phase_blocks.iter() { + block_collector.add_block(sub_block.clone()); + } + block_collector.add_result(&phase_result); + + match vcf_writer.write_phase_block(phase_result) { + Ok(()) => {}, + Err(e) => { + error!("Error while saving phase block: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + + if let Some(haplotag_writer) = opt_haplotag_writer.as_mut() { + match haplotag_writer.write_block(&haplotag_result) { + Ok(()) => {}, + Err(e) => { + error!("Error while writing haplotag file: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + } + + if let Some(bam_writers) = opt_bam_writers.as_mut() { + let sample_name = haplotag_result.phase_block.sample_name().to_string(); + let block_index = haplotag_result.phase_block.get_block_index(); + + // send this block to the correct writer + let bam_writer = bam_writers.get_mut(&sample_name).unwrap(); + match bam_writer.write_phase_block(haplotag_result) { + Ok(()) => {}, + Err(e) => { + error!("Error while saving haplotags: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + + // now send the skip signal to the rest + for (sn, bam_writer) in bam_writers.iter_mut() { + if sn != &sample_name { + match bam_writer.write_dummy_block(block_index) { + Ok(()) => {}, + Err(e) => { + error!("Error while saving haplotags: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + } + } + } +} \ No newline at end of file diff --git a/src/phaser.rs b/src/phaser.rs new file mode 100644 index 0000000..3255f65 --- 
/dev/null +++ b/src/phaser.rs @@ -0,0 +1,796 @@ + +use crate::astar_phaser; +use crate::block_gen::{PhaseBlock, is_phasable_variant, get_variant_type}; +use crate::data_types::read_segments::ReadSegment; +use crate::data_types::reference_genome::ReferenceGenome; +use crate::data_types::variants::{Variant, VariantType}; +use crate::read_parsing; +use crate::writers::phase_stats::{PhaseStats, ReadStats}; + +use bio::data_structures::interval_tree::IntervalTree; +use log::{debug, trace, warn}; +use priority_queue::PriorityQueue; +use rust_htslib::bcf; +use rust_htslib::bcf::record::GenotypeAllele; +use rustc_hash::FxHashMap as HashMap; +use simple_error::{SimpleError, bail}; +use std::cmp::{Ordering, Reverse}; +use std::path::PathBuf; + +/// Core function for loading variant calls from our VCF file and converting them into a `Variant` type. +/// # Arguments +/// * `region` - the phase block we need to load +/// * `vcf_paths` - the VCF files to load, must be zipped and indexed +/// * `reference_genome` - optional, the reference genome +/// * `reference_buffer` - the number of nearby bases to try to use for local realignment +/// * `is_hom_allowed` - if true, then non-reference homozygous variants will also be loaded into the second return Vec +fn load_variant_calls( + region: &PhaseBlock, + vcf_paths: &[PathBuf], + reference_genome: &ReferenceGenome, + reference_buffer: usize, + is_hom_allowed: bool +) -> Result<(Vec, Vec), Box> { + use rust_htslib::bcf::Read; + + // short circuit because otherwise bcf can throw errors + if region.get_num_variants() == 0 { + return Ok((vec![], vec![])); + } + + // initalize the variant queue with one variant from each VCF, any ties in position are broken by VCF input order + let mut variant_queue: PriorityQueue, Reverse)> = PriorityQueue::new(); + let mut vcf_readers: Vec = vcf_paths.iter() + .map(|filename| bcf::IndexedReader::from_path(filename).unwrap()) + .collect(); + let mut vcf_iterators: Vec<_> = vec![]; + let mut 
sample_indices: Vec = vec![]; + let sample_name: &str = region.sample_name(); + + // fetch the region for each VCF (if it exists) + for (vcf_index, vcf_reader) in vcf_readers.iter_mut().enumerate() { + // fetch the corresponding chrom index for this VCF file (they are not guaranteed to match) + let vcf_header: &bcf::header::HeaderView = vcf_reader.header(); + let chrom_index: u32 = vcf_header.name2rid(region.get_chrom().as_bytes())?; + + // first make sure we find the sample in this file + let mut lookup_index: Option = None; + for (sample_index, &vcf_sample) in vcf_header.samples().iter().enumerate() { + let vcf_sample_string: String = std::str::from_utf8(vcf_sample).unwrap().to_string(); + if vcf_sample_string == sample_name { + lookup_index = Some(sample_index); + break; + } + } + match lookup_index { + Some(index) => { + sample_indices.push(index); + }, + None => { + bail!("Sample name {:?} was not found in VCF: {:?}", sample_name, vcf_paths[vcf_index]); + } + }; + + // fetch our position in the VCF file + match vcf_reader.fetch(chrom_index, region.get_start(), Some(region.get_end())) { + Ok(()) => { + // we have entries, so get the first one and queue it + let mut vcf_iter = vcf_reader.records().peekable(); + let first_entry = vcf_iter.peek(); + if let Some(record_result) = first_entry { + let record: &rust_htslib::bcf::Record = match record_result { + Ok(r) => r, + // we have to convert to an owned error here, and the htslib errors are not cloneable + Err(e) => return Err(Box::new(SimpleError::from(e))) + }; + let position: i64 = record.pos(); + variant_queue.push(vcf_index, (Reverse(position), Reverse(vcf_index))); + }; + + // even if the iterator is empty, we push it so things are lined up correctly + vcf_iterators.push(vcf_iter); + }, + Err(_) => { + // this usually happens when there are no entries for the chromosome + vcf_iterators.push(vcf_reader.records().peekable()); + } + }; + } + + // parse all the records and convert them into our format + let mut 
variants: Vec = Vec::::with_capacity(region.get_num_variants()); + let mut hom_variants: Vec = vec![]; + let mut previous_het_end: usize = 0; + + while !variant_queue.is_empty() { + // get the source of the next variant to process and the sample index in that VCF file + let (pop_index, pop_priority) = variant_queue.pop().unwrap(); + let sample_index: usize = sample_indices[pop_index]; + + // process this variant + let record_result = vcf_iterators[pop_index].next().unwrap(); + let record = record_result?; + + let position: i64 = record.pos(); + assert_eq!(position, pop_priority.0.0); // sanity check that the variant matches our position priority + if position < region.get_start() as i64 { + // this can happen when you have very very long indels that span one of our breaks + // we have already written though, so don't write it again + } else { + let include_variant = is_phasable_variant(&record, sample_index, region.get_min_quality(), is_hom_allowed)?; + if include_variant { + let variant_type = get_variant_type(&record)?; + + // TODO: ideally, this would be consolidated with our Zygosity code block, but we need index_alleles further on + // possible solution is to make Zygosity types have u8 values tied to them + // low priority: not a major slowdown at this time + // get the genotypes + let all_genotypes = record.genotypes()?; + let genotype = all_genotypes.get(sample_index); + assert!(genotype.len() <= 2); + + // we don't really expect more than 255 alleles, but make sure we panic if that *does* happen + let mut index_allele0: u8 = match genotype[0] { + GenotypeAllele::Unphased(at) => at.try_into().unwrap(), + GenotypeAllele::Phased(at) => at.try_into().unwrap(), + //TODO: ignore these for now, not sure how to handle it? 
+ GenotypeAllele::UnphasedMissing => panic!("Should not happen"), + GenotypeAllele::PhasedMissing => panic!("Should not happen") + }; + + let mut index_allele1: u8 = if genotype.len() == 1 { + // TRGT can generate single-haplotype results, in this instance just copy index_allele0 and pretend it was homozygous + index_allele0 + } else { + match genotype[1] { + GenotypeAllele::Unphased(at) => at.try_into().unwrap(), + GenotypeAllele::Phased(at) => at.try_into().unwrap(), + //TODO: ignore these for now, not sure how to handle it? + GenotypeAllele::UnphasedMissing => panic!("Should not happen"), + GenotypeAllele::PhasedMissing => panic!("Should not happen") + } + }; + + // in merged VCF files, they are not always ordered, check for that here and swap if they are out of order + // technically, this can happen in any VCF I supposed + // are there any concerns to this swap? we already do many output swaps with the phasing so seems like no + if index_allele0 > index_allele1 { + std::mem::swap(&mut index_allele0, &mut index_allele1); + } + + // special case for homozygous + let is_homozygous = index_allele0 == index_allele1; + assert!(!is_homozygous || is_hom_allowed); + if is_homozygous { + // this forces our homozygous variants to load as if they were heterozygous (e.g. with a reference allele) + // this is fine because we are not phasing them, just wanting to use the sequence + // TODO: can we remove this hackery? 
I could see it being an issue later + index_allele0 = 0; + } + + let all_alleles = record.alleles(); + let ref_len: usize = all_alleles[0].len(); + let allele0: Vec = all_alleles[index_allele0 as usize].to_vec(); + let allele1: Vec = all_alleles[index_allele1 as usize].to_vec(); + + let mut new_variant = match variant_type { + VariantType::Snv => { + Variant::new_snv( + pop_index, position, allele0, allele1, index_allele0, index_allele1 + ) + }, + VariantType::Deletion => { + Variant::new_deletion( + pop_index, position, ref_len, + allele0, allele1, + index_allele0, index_allele1 + ) + }, + VariantType::Insertion => { + Variant::new_insertion( + pop_index, position, allele0, allele1, index_allele0, index_allele1 + ) + }, + VariantType::Indel => { + Variant::new_indel( + pop_index, position, ref_len, + allele0, allele1, + index_allele0, index_allele1 + ) + }, + VariantType::SvDeletion => { + Variant::new_sv_deletion( + pop_index, position, ref_len, + allele0, allele1, + index_allele0, index_allele1 + ) + }, + VariantType::SvInsertion => { + Variant::new_sv_insertion( + pop_index, position, allele0, allele1, index_allele0, index_allele1 + ) + }, + VariantType::TandemRepeat => { + Variant::new_tandem_repeat( + pop_index, position, ref_len, + allele0, allele1, + index_allele0, index_allele1 + ) + } + VariantType::SvDuplication | + VariantType::SvInversion | + VariantType::SvBreakend | + VariantType::Unknown => { + // panic here because we shouldn't allow these types unless we implement the variant + panic!("no impl for {variant_type:?}"); + } + }; + + if reference_buffer > 0 && !is_homozygous { + // we have a reference genome and a desire buffer, extend our alleles + let mut ref_prefix_start: usize = if (position as usize) > reference_buffer { + position as usize - reference_buffer + } else { + 0 + }; + let ref_postfix_start: usize = position as usize + ref_len; + + // we used to have an assertion here with the plan to remove it eventually, but it turns out + // that 
users do crazy things, so we need to convert it to a full Error + let ref_sequence = reference_genome.get_slice(region.get_chrom(), position as usize, ref_postfix_start); + if all_alleles[0] != ref_sequence { + bail!( + "Reference mismatch error: variant at {}:{} has REF allele = \"{}\", but reference genome has \"{}\".", + region.get_chrom(), position+1, + // we don't want to panic in the middle of this, so use a safe unwrapper with default + std::str::from_utf8(all_alleles[0]).unwrap_or("utf8 decode error"), + std::str::from_utf8(ref_sequence).unwrap_or("utf8 decode error") + ); + } + + // check if this variant is too close to the previous variant + if ref_prefix_start < previous_het_end { + // for the previous variant, we need to truncate it, possibly all the way down + if let Some(v) = variants.last_mut() { + let current_end: usize = v.position() as usize + v.get_ref_len() + v.get_postfix_len(); + let truncate_length: usize = (current_end - position as usize).min(v.get_postfix_len()); + v.truncate_reference_postfix(truncate_length); + } else { + panic!("This should not happen with our checks."); + } + + // for this variant, set the new end to either the previous het end OR the position, whichever is lower + // previous het end can be lower if you have overlapping indels, not much we can do to fix that without global realignment + ref_prefix_start = previous_het_end.min(position as usize); + } + + // add the prefix + let prefix: &[u8] = reference_genome.get_slice(region.get_chrom(), ref_prefix_start, position as usize); + new_variant.add_reference_prefix(prefix); + + // add the postfix + let postfix: &[u8] = reference_genome.get_slice(region.get_chrom(), ref_postfix_start, ref_postfix_start + reference_buffer); + new_variant.add_reference_postfix(postfix); + + // update the previous position to match the position + ref length + previous_het_end = position as usize + ref_len; + } + + if is_homozygous { + hom_variants.push(new_variant); + } else { + 
variants.push(new_variant); + } + } + } + + // requeue from the one we popped from + let next_entry = vcf_iterators[pop_index].peek(); + if let Some(record_result) = next_entry { + let record: &rust_htslib::bcf::Record = match record_result { + Ok(r) => r, + // we have to convert to an owned error here, and the htslib errors are not cloneable + Err(e) => return Err(Box::new(SimpleError::from(e))) + }; + let position: i64 = record.pos(); + variant_queue.push(pop_index, (Reverse(position), Reverse(pop_index))); + }; + } + + // sanity check that we found the same number of things + assert_eq!(variants.len(), region.get_num_variants()); + Ok((variants, hom_variants)) +} + +/// A result for a phasing algorithm, assumes diploid solution currently. +pub struct PhaseResult { + /// The phase block defining the problem space. + pub phase_block: PhaseBlock, + /// The variants contained in the phase block. + pub variants: Vec, + /// The first haplotype in the solution. + pub haplotype_1: Vec, + /// The second haplotype in the solution. + pub haplotype_2: Vec, + /// Store the phase block ID of the variant + pub block_ids: Vec, + /// Stores all non-empty sub-blocks + pub sub_phase_blocks: Vec, + /// Optional read statistics + pub read_statistics: Option, + /// Optional statistics from the problem + pub statistics: Option +} + +/// Calculates the span count for each juncture in the solution, ignoring homozygous variants and unassigned alleles. 
+/// # Arguments +/// * `read_segments` - the read segments used to solve the problem +/// * `haplotype_1` - the first haplotype in the solution +/// * `haplotype_2` - the second haplotype in the solution +fn get_solution_span_counts( + read_segments: &IntervalTree, + haplotype_1: &[u8], + haplotype_2: &[u8] +) -> Vec { + // this will store the total spanning reads ignoring any homozygous or unsolved variants + assert_eq!(haplotype_1.len(), haplotype_2.len()); + + // there is one less connection than the total number of variants + let mut total_span_counts: Vec = vec![0; haplotype_1.len() - 1]; + + // iterate over all the read segments + for rs_interval in read_segments.find(0..usize::MAX) { + let rs = rs_interval.data(); + + // the range returns [first_allele, last_allele+1), we need to basically remove the +1 here since we're talking junctures + let mut juncture_range = rs.get_range(); + juncture_range.end -= 1; + + // if any of the head variants were converted to homozygous, do not include because they don't provide spanning evidence anymore + while juncture_range.start < juncture_range.end && + haplotype_1[juncture_range.start] == haplotype_2[juncture_range.start] { + juncture_range.start += 1; + } + + // if any of the tail variants were converted to homozygous, do not include because they don't provide spanning evidence anymore + while juncture_range.start < juncture_range.end && + haplotype_1[juncture_range.end] == haplotype_2[juncture_range.end] { + juncture_range.end -= 1; + } + + // the range has been truncated upstream so we're only looking at junctures + for tpc in total_span_counts[juncture_range].iter_mut() { + *tpc += 1; + } + } + + total_span_counts +} + +/// Core structure of phasing that can be run on a single processor to solve a phase block. +/// This method is designed to perform data loading and then run an algorithm from another module to solve the block. +/// See `astar_phaser::astar_solver(...)` for an example solver implementation. 
+/// # Arguments +/// * `phase_problem` - the problem definition, primarily defines coordinates of the phase block we want to solve +/// * `vcf_paths` - the VCF files to load variants from, must be zipped and indexed +/// * `sample_name` - the sample name inside the VCF files +/// * `bam_paths` - the BAM files to load read observations from, must be indexed +/// * `reference_genome` - optional, the reference genome +/// * `reference_buffer` - the number of nearby bases to try to use for local realignment +/// * `min_matched_alleles` - the minimum number of matched alleles required to include a read +/// * `min_mapq` - the minimum MAPQ to include a read +/// * `global_realign_cputime` - the maximum allowed global realignment CPU time; if 0, then only local realignment is used +/// * `min_queue_size` - the minimum length of the queue +/// * `queue_increment` - the length that the queue grows as more variants are added to the solution +/// * `wfa_prune_distance` - maximum allowed distance a wavefront can lag; make smaller to reduce run-time at the cost of accuracy +#[allow(clippy::too_many_arguments)] +pub fn solve_block( + phase_problem: &PhaseBlock, vcf_paths: &[PathBuf], bam_paths: &[PathBuf], + reference_genome: &ReferenceGenome, reference_buffer: usize, + min_matched_alleles: usize, min_mapq: u8, global_realign_cputime: f32, + min_queue_size: usize, queue_increment: usize, wfa_prune_distance: usize +) -> Result<(PhaseResult, HaplotagResult), Box> { + debug!("Solving problem: {:?}", phase_problem); + + // short circuit for "empty" problems + if phase_problem.get_num_variants() == 0 { + // this should only happen for chromosomes with no het alleles + assert!(phase_problem.get_start() == 0); + assert!(phase_problem.get_end() == 0); + let empty_result = PhaseResult { + phase_block: phase_problem.clone(), + variants: vec![], + haplotype_1: vec![], + haplotype_2: vec![], + block_ids: vec![], + sub_phase_blocks: vec![], + read_statistics: None, + statistics: None + }; + 
let empty_haplotag_result: HaplotagResult = HaplotagResult { + phase_block: phase_problem.clone(), + reads: Default::default() + }; + return Ok((empty_result, empty_haplotag_result)); + } + + // homs are only used if global realignment is being attempted + let load_homs: bool = global_realign_cputime > 0.0; + + // lets extract the variants we care about from the vcf + let (mut variant_calls, mut hom_calls): (Vec, Vec) = load_variant_calls( + phase_problem, + vcf_paths, + reference_genome, reference_buffer, + load_homs + )?; + assert_eq!(variant_calls.len(), phase_problem.get_num_variants()); + + // go through all the loaded variants, including homs, and pull out the TandemRepeat coordinates that have been loaded + let mut tr_segments: IntervalTree = Default::default(); + for variant in variant_calls.iter().chain(hom_calls.iter()) { + if variant.get_type() == VariantType::TandemRepeat { + let start: i64 = variant.position(); + let ref_len: usize = variant.get_ref_len(); + let end: i64 = start + ref_len as i64; + // sometimes TRGT inserts the base before which will match an insertion and sometimes it won't - confirmed with Egor + // we can't really tell which is which though, so lets just substract 1 from the start and assume that's the best for now + // TODO: if TRGT adjusts to always have an anchor, we will need to drop the "-1" operation + tr_segments.insert((start-1)..end, 0); + } + } + + // now mark all variants contained by the STRs as ignored + for variant in variant_calls.iter_mut() { + if variant.get_type() != VariantType::TandemRepeat { + let start: i64 = variant.position(); + let ref_len: usize = variant.get_ref_len(); + let end: i64 = start + ref_len as i64; + let var_interval = start..end; + let mut is_contained: bool = false; + for segment in tr_segments.find(var_interval) { + let seg_start = segment.interval().start; + let seg_end = segment.interval().end; + if seg_start <= start && seg_end >= end { + // this segment fully contains the variant + 
is_contained = true; + break; + } + } + + if is_contained { + // we need to mark this one as unphaseable + variant.set_ignored(); + debug!("Set ignored het: {:?}", variant); + } + } + } + + // mark any homs we are going to ignore as well + for variant in hom_calls.iter_mut() { + if variant.get_type() != VariantType::TandemRepeat { + let start: i64 = variant.position(); + let ref_len: usize = variant.get_ref_len(); + let end: i64 = start + ref_len as i64; + let var_interval = start..end; + let mut is_contained: bool = false; + for segment in tr_segments.find(var_interval) { + let seg_start = segment.interval().start; + let seg_end = segment.interval().end; + if seg_start <= start && seg_end >= end { + // this segment fully contains the variant + is_contained = true; + break; + } + } + + if is_contained { + // we need to mark this one as unphaseable + variant.set_ignored(); + debug!("Set ignored hom: {:?}", variant); + } + } + } + + // reads that meet our criteria for use in phasing + let read_segments: IntervalTree; + // reads that meet our criteria EXCEPT for the minimum number of alleles is just > 0, so they can potentially be phased after we solve + let phasable_segments: IntervalTree; + let read_stats: ReadStats; + + if global_realign_cputime == 0.0 { + // we are doing local re-alignments only + (read_segments, phasable_segments, read_stats) = read_parsing::load_read_segments( + phase_problem, bam_paths, reference_genome.filename(), + &variant_calls, min_matched_alleles, min_mapq + )?; + } else { + // we are attempting global re-alignments + (read_segments, phasable_segments, read_stats) = match read_parsing::load_full_read_segments( + phase_problem, bam_paths, &variant_calls, &hom_calls, + reference_genome, min_matched_alleles, min_mapq, + global_realign_cputime, wfa_prune_distance + ) { + Ok(rs) => rs, + Err(e) => { + if e.to_string() == "max_runtime reached" { + // fall back to the local realignment approach if we run too long + warn!( + "B#{} ({}:{}-{}) 
detected excessive runtime in read parsing, reverting to local re-alignment.", + phase_problem.get_block_index(), phase_problem.get_chrom(), phase_problem.get_start(), phase_problem.get_end() + ); + read_parsing::load_read_segments( + phase_problem, bam_paths, reference_genome.filename(), + &variant_calls, min_matched_alleles, min_mapq + )? + } else { + return Err(e); + } + } + }; + } + + // read segment debugging + for (i, seg) in read_segments.find(0..usize::MAX).enumerate() { + trace!("read segment #{} => {:?}", i, seg); + } + + // okay final phase is to solve some algorithm given those read segments + let astar_result: astar_phaser::AstarResult = astar_phaser::astar_solver( + phase_problem, &variant_calls[..], &read_segments, min_queue_size, queue_increment + ); + + // get the total spanning counts after accounting for homozygous variants / missing alleles + let total_span_counts: Vec = get_solution_span_counts(&read_segments, &astar_result.haplotype_1, &astar_result.haplotype_2); + debug!("total_span_counts: {:?}", total_span_counts); + + // if we end up having no reads spanning a juncture, it's time to split the block + let block_split: Vec = total_span_counts.iter() + .map(|&tc| tc == 0) + .collect::>(); + + debug!("Block split: {:?}", block_split); + + // now, figure out what the haplotag is for each variant + let mut block_tags: Vec = vec![0; variant_calls.len()]; + let mut current_tag: usize = variant_calls[0].position() as usize; + for (i, variant) in variant_calls.iter().enumerate() { + if i > 0 && block_split[i-1] { + // this is a new block + current_tag = variant.position() as usize; + } + block_tags[i] = current_tag; + } + debug!("Block tags: {:?}", block_tags); + + // generate all of our non-empty sub-blocks now + let mut sub_phase_blocks: Vec = vec![]; + let mut current_block: PhaseBlock = PhaseBlock::new( + phase_problem.get_block_index(), + phase_problem.get_chrom().to_string(), + phase_problem.get_chrom_index(), + 
phase_problem.get_min_quality(), + phase_problem.sample_name().to_string() + ); + let mut current_tag = block_tags[0]; + for (i, variant) in variant_calls.iter().enumerate() { + let h1 = astar_result.haplotype_1[i]; + let h2 = astar_result.haplotype_2[i]; + if h1 < 2 && h2 < 2 && h1 != h2 { + // this is a heterozygous variant in our result + if current_tag != block_tags[i] { + if current_block.get_num_variants() > 0 { + // it's part of a new block though, so we need to push the old one + sub_phase_blocks.push(current_block); + current_block = PhaseBlock::new( + phase_problem.get_block_index(), + phase_problem.get_chrom().to_string(), + phase_problem.get_chrom_index(), + phase_problem.get_min_quality(), + phase_problem.sample_name().to_string() + ); + } + + // make sure we update to the new tag also + current_tag = block_tags[i]; + } + + // add the variant to the current block + current_block.add_locus_variant(phase_problem.get_chrom(), variant.position() as u64, variant.get_vcf_index()); + } + } + + // check if we have a block left to push + if current_block.get_num_variants() > 0 { + sub_phase_blocks.push(current_block); + } + debug!("sub_phase_blocks: {:?}", sub_phase_blocks); + + // last step is to haplotag the reads we loaded + let mut haplotagged_reads: HashMap = haplotag_reads( + read_segments, &astar_result.haplotype_1, &astar_result.haplotype_2, + &block_tags + ); + + // also haplotype the extra reads and add them to our result + let phasable_haplotagged_reads: HashMap = haplotag_reads( + phasable_segments, &astar_result.haplotype_1, &astar_result.haplotype_2, + &block_tags + ); + + // we could just extend here, but we want to sanity check our keys don't overlap at all + for (k, v) in phasable_haplotagged_reads.into_iter() { + // we should never have the same read name in both hashmaps + assert!(!haplotagged_reads.contains_key(&k)); + haplotagged_reads.insert(k, v); + } + + let haplotag_result: HaplotagResult = HaplotagResult { + phase_block: 
phase_problem.clone(), + reads: haplotagged_reads + }; + + // save all our results here + let phase_result: PhaseResult = PhaseResult { + phase_block: phase_problem.clone(), + variants: variant_calls, + haplotype_1: astar_result.haplotype_1, + haplotype_2: astar_result.haplotype_2, + block_ids: block_tags, + sub_phase_blocks, + read_statistics: Some(read_stats), + statistics: Some(astar_result.statistics) + }; + Ok((phase_result, haplotag_result)) +} + +/// This function generates a singleton "solution". +/// It is boilerplate for an unsolved block because that block only has one variant, and we don't care to phase it. +/// # Arguments +/// * `phase_problem` - the problem definition, primarily defines coordinates of the phase block we want to solve +pub fn singleton_block(phase_problem: &PhaseBlock) -> (PhaseResult, HaplotagResult) { + debug!("Generating empty result for singleton: {phase_problem:?}"); + + // in downstream writing, the only thing that matters is the vcf_index, so make sure we set that correctly (knowing this is a singleton) + // everything else can be garbage + let dummy_variant = Variant::new_snv( + phase_problem.get_first_variant_vcf(), + phase_problem.get_start() as i64, + vec![0], + vec![1], + 0, + 1 + ); + let num_variants = phase_problem.get_num_variants(); + let variant_calls: Vec = if num_variants == 0 { vec![] } else { vec![dummy_variant] }; + assert_eq!(variant_calls.len(), num_variants); + + // now we can make our dummy results + let phase_result: PhaseResult = PhaseResult { + phase_block: phase_problem.clone(), + variants: variant_calls, + haplotype_1: vec![2; num_variants], + haplotype_2: vec![2; num_variants], + block_ids: vec![phase_problem.get_start() as usize; num_variants], + sub_phase_blocks: vec![], // empty because this is not getting treated as a block + read_statistics: None, + statistics: None + }; + let haplotag_result: HaplotagResult = HaplotagResult { + phase_block: phase_problem.clone(), + reads: Default::default() // no 
haplotagging in this mode, so give back an empty map + }; + (phase_result, haplotag_result) +} + +/// Stores all information for a haplotag result +#[derive(Debug)] +pub struct HaplotagResult { + /// The phase block defining the problem space. + pub phase_block: PhaseBlock, + /// Indexes reads by name and returns a tuple (phase block ID, haplotag) + pub reads: HashMap +} + +/// Returns the tagging results for the reads in a HashMap. +/// Values are tuples (phase block ID (0-based), haplotag value (0 or 1)) +/// # Arguments +/// * `read_segments` - the reads to tag +/// * `haplotype_1` - the first haplotype +/// * `haplotype_2` - the second haplotype +/// * `block_tags` - tags for the blocks based on the variant IDs +/// # Panics +/// * if the haplotypes are not the same length as each other, a read segment, and/or the variant calls +/// * the block breaks is not 1 less length than the variant calls +pub fn haplotag_reads( + read_segments: IntervalTree, + haplotype_1: &[u8], haplotype_2: &[u8], block_tags: &[usize] +) -> HashMap { + // now do the tagging + let mut haplotagged_reads: HashMap = Default::default(); + for rs_interval in read_segments.find(0..usize::MAX) { + // first, see if we can resolve it to a haplotype + let rs: &ReadSegment = rs_interval.data(); + let a1_score: u64 = rs.score_haplotype(haplotype_1); + let a2_score: u64 = rs.score_haplotype(haplotype_2); + let haplotag: usize = match a1_score.cmp(&a2_score) { + Ordering::Less => 0, + Ordering::Greater => 1, + Ordering::Equal => 2 + }; + + if haplotag != 2 { + // we can resolve to a haplotype, now get the phase block index + // find the first resolved variant in our read segment + let mut first_variant: usize = rs.first_allele(); + + // while the haplotypes are equal there OR the variant is not resolved (which can happen sometimes) + while haplotype_1[first_variant] == haplotype_2[first_variant] || rs.alleles()[first_variant] >= 2 { + first_variant += 1; + } + let phase_block: usize = 
block_tags[first_variant]; + + // finally, just get the read name and make sure we haven't somehow already marked this one + let read_name: String = rs.read_name().to_string(); + assert!(!haplotagged_reads.contains_key(&read_name)); + haplotagged_reads.insert(read_name, (phase_block, haplotag)); + } + } + + haplotagged_reads +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_solution_span_counts() { + let haplotype_1 = vec![0, 1, 1, 0, 0, 0]; + let haplotype_2 = vec![1, 1, 1, 1, 0, 1]; + let test_reads = vec![ + ReadSegment::new("r1".to_string(), vec![0, 0, 0, 0, 0, 0], vec![1, 1, 1, 1, 1, 1]), // adds 1 to everything + ReadSegment::new("r2".to_string(), vec![2, 2, 2, 1, 1, 2], vec![0, 0, 0, 1, 1, 0]), // one allele is a hom, so this does nothing + ReadSegment::new("r3".to_string(), vec![1, 1, 1, 1, 2, 2], vec![1, 1, 1, 1, 0, 0]), // adds 1 to first 3 + ReadSegment::new("r4".to_string(), vec![2, 1, 1, 1, 1, 1], vec![0, 1, 1, 1, 1, 1]), // adds 1 to last 2 + ]; + let mut read_segments: IntervalTree = Default::default(); + for rs in test_reads.into_iter() { + let rs_range = rs.get_range(); + read_segments.insert(rs_range, rs); + } + let expected_result: Vec = vec![2, 2, 2, 2, 2]; + + let result = get_solution_span_counts(&read_segments, &haplotype_1, &haplotype_2); + assert_eq!(expected_result, result); + } + + #[test] + fn test_haplotag_reads() { + let haplotype_1 = vec![0, 0, 0, 0, 0, 0]; + let haplotype_2 = vec![1, 1, 1, 1, 1, 1]; + let block_tags = vec![0, 0, 0, 3, 3, 5]; + let test_reads = vec![ + ReadSegment::new("r1".to_string(), vec![0, 0, 0, 0, 0, 0], vec![1, 1, 1, 1, 1, 1]), + ReadSegment::new("r2".to_string(), vec![2, 2, 2, 1, 1, 2], vec![0, 0, 0, 1, 1, 0]), + ReadSegment::new("r3".to_string(), vec![2, 2, 2, 1, 0, 2], vec![0, 0, 0, 1, 1, 0]), + ReadSegment::new("r4".to_string(), vec![2, 2, 2, 1, 0, 1], vec![0, 0, 0, 1, 1, 1]), + ReadSegment::new("r5".to_string(), vec![2, 2, 2, 1, 0, 2], vec![0, 0, 0, 2, 1, 0]), + ]; + + let mut 
read_segments: IntervalTree = Default::default(); + for rs in test_reads.into_iter() { + let rs_range = rs.get_range(); + read_segments.insert(rs_range, rs); + } + + let haplotag_result = haplotag_reads(read_segments, &haplotype_1, &haplotype_2, &block_tags); + + // simple-ish cases + assert_eq!(haplotag_result.get("r1").unwrap(), &(0, 0)); // exact match to haplotype 1 + assert_eq!(haplotag_result.get("r2").unwrap(), &(3, 1)); // exact, but incomplete, match to haplotype 2 + assert!(!haplotag_result.contains_key("r3")); // equal to both, so unassigned + assert_eq!(haplotag_result.get("r4").unwrap(), &(3, 1)); // spans two blocks and is inexact, but closer to hap 1 starting with block 3 + assert_eq!(haplotag_result.get("r2").unwrap(), &(3, 1)); // equal by alleles, but qual on allele 1 is higher + } +} \ No newline at end of file diff --git a/src/read_parsing.rs b/src/read_parsing.rs new file mode 100644 index 0000000..de3186b --- /dev/null +++ b/src/read_parsing.rs @@ -0,0 +1,745 @@ + +use crate::block_gen::{PhaseBlock, filter_out_alignment_record}; +use crate::data_types::read_segments::ReadSegment; +use crate::data_types::reference_genome::ReferenceGenome; +use crate::data_types::variants::{Variant, VariantType}; +use crate::wfa_graph::{NodeAlleleMap, WFAGraph, WFAResult}; +use crate::writers::phase_stats::ReadStats; + +use bio::data_structures::interval_tree::IntervalTree; +use log::{debug, trace, warn}; +use rust_htslib::bam; +use rustc_hash::FxHashMap as HashMap; +use simple_error::bail; +use std::path::{Path, PathBuf}; + +/// Loads up all the reads in a particular phase region and converts them into their variant representation. +/// This version uses local re-alignment to parse the alleles. +/// Returns an interval tree containing all reads to use for phasing, a second tree containing extra reads that *can* be phased but didn't match our criteria, +/// and statistics from loading the reads. 
+/// # Arguments +/// * `phase_problem` - the phase block we are loading data for +/// * `bam_paths` - the BAM files to parse, must be indexed +/// * `reference_filename` - the reference fasta file +/// * `variant_calls` - the variants used to convert full reads into haplotype observations (`ReadSegment`) +/// * `min_matched_alleles` - the minimum number of identified alleles required for a read to be included +/// * `min_mapq` - the minimum MAPQ to consider a read +#[allow(clippy::type_complexity)] +pub fn load_read_segments( + phase_problem: &PhaseBlock, bam_paths: &[PathBuf], reference_filename: &Path, + variant_calls: &[Variant], min_matched_alleles: usize, min_mapq: u8 +) -> Result<(IntervalTree, IntervalTree, ReadStats), Box> { + use rust_htslib::bam::Read; + use rust_htslib::bam::ext::BamRecordExtensions; + + let num_variants: usize = variant_calls.len(); + let mut read_groups: HashMap> = Default::default(); + + // stats we track + let mut num_reads: u64 = 0; + let mut skipped_reads: u64 = 0; + let mut num_alleles: u64 = 0; + let mut exact_matches: [u64; VariantType::Unknown as usize + 1] = [0; VariantType::Unknown as usize + 1]; + let mut inexact_matches: [u64; VariantType::Unknown as usize + 1] = [0; VariantType::Unknown as usize + 1]; + let mut failed_matches: [u64; VariantType::Unknown as usize + 1] = [0; VariantType::Unknown as usize + 1]; + let mut allele0_matches: [u64; VariantType::Unknown as usize + 1] = [0; VariantType::Unknown as usize + 1]; + let mut allele1_matches: [u64; VariantType::Unknown as usize + 1] = [0; VariantType::Unknown as usize + 1]; + + for bam_filename in bam_paths.iter() { + let mut bam_reader = bam::IndexedReader::from_path(bam_filename)?; + bam_reader.set_reference(reference_filename)?; + bam_reader.fetch((phase_problem.get_chrom(), phase_problem.get_start(), phase_problem.get_end()+1))?; + + for read_entry in bam_reader.records() { + let mut read = read_entry?; + + //make sure we care about the alignment + if 
filter_out_alignment_record(&read, min_mapq) { + continue; + } + + //build out the cigar info + read.cache_cigar(); + + //build a lookup from reference coordinate -> sequence coordinate + let mut coordinate_lookup: HashMap = Default::default(); + let min_position = read.pos(); + let mut max_position = read.pos(); + for bp in read.aligned_pairs() { + let segment_index = bp[0]; + let ref_index = bp[1]; + coordinate_lookup.insert(ref_index, segment_index); + max_position = max_position.max(ref_index); + } + assert!(max_position >= min_position); + + // max_position is the last one that we found, so add +1 to include in the range + let aligned_range = min_position..(max_position+1); + + //.seq() returns Seq<'_> type, but we should just full decode + let read_sequence: Vec = read.seq().as_bytes(); + let read_qualities: &[u8] = read.qual(); + assert_eq!(read_sequence.len(), read_qualities.len()); + + //we will populate these with the variant level info + let mut alleles: Vec = Vec::::with_capacity(num_variants); + let mut quals: Vec = Vec::::with_capacity(num_variants); + let mut num_overlaps: usize = 0; + let mut last_deletion_end: usize = 0; + + for variant in variant_calls.iter() { + /* + - We need to split on small variants and SVs + - for small variants, do what we normally do; it may be worth seeing if the method we create for SVs will help with this other mode fails though + - for SVs, check if the read FULLY spans the locus; if so, check how much sequence is inserted/deleted in the region and turn that into an allele + - TODO: for SVs, what if it doesn't, can we use clipping somehow? 
+ */ + + trace!("{:?}", variant); + let variant_pos: i64 = variant.position(); + let variant_type: VariantType = variant.get_type(); + let vt_index = variant_type as usize; + + // regardless of variant type, we MUST populate these in the following branching logic + let mut allele: u8; + let qual: u8; + let exact_allele: bool; + let overlaps_allele: bool; + + if variant.is_ignored() { + // this variant is one marked to ignored, lets set it to undefined as opposed to ambiguous + trace!("\tMarking as undefined allele because it is flagged to be ignored"); + allele = 3; + qual = 0; + exact_allele = false; + overlaps_allele = false; + } else if variant_pos < last_deletion_end as i64 { + // check if this is within a region we have decided is a deleted + trace!("\tMarking as unknown allele because it overlaps detected SV deletion"); + // if the 0-allele is reference, mark as 0, else mark as ambiguous because it's multi-allelic call + allele = 2; + qual = 0; + exact_allele = false; + overlaps_allele = true; + } else { + match variant_type { + VariantType::Snv | + VariantType::Insertion | + VariantType::Deletion | + VariantType::Indel | + VariantType::SvInsertion | + VariantType::TandemRepeat => { + // we need these to build coordinate ranges + let ref_allele_len: usize = variant.get_ref_len(); + let prefix_len: usize = variant.get_prefix_len(); + let postfix_len: usize = variant.get_postfix_len(); + + // coordinate ranges we care about + let first_start_coordinate: usize = variant_pos as usize - prefix_len; + let last_start_coordinate: usize = variant_pos as usize + 1; // add one because we want to include variant_pos + let first_end_coordinate: usize = variant_pos as usize + ref_allele_len; + let last_end_coordinate: usize = variant_pos as usize + ref_allele_len + postfix_len + 1; // add one for same reason as above + + // first, try to find the closest start + let mut opt_closest_start: Option = None; + for sc in (first_start_coordinate..last_start_coordinate).rev() { + 
if let Some(&si) = coordinate_lookup.get(&(sc as i64)) { + opt_closest_start = Some(si as usize); + break; + } + } + + // now the closest end + let mut opt_closest_end: Option = None; + for ec in first_end_coordinate..last_end_coordinate { + if let Some(&ei) = coordinate_lookup.get(&(ec as i64)) { + opt_closest_end = Some(ei as usize); + break; + } + } + + // now find the best start coordinate with constraints + let mut start_coordinate: Option = None; + let mut start_clip: usize = 0; + let mut end_coordinate: Option = None; + let mut end_clip: usize = 0; + + if let (Some(closest_start), Some(closest_end)) = (opt_closest_start, opt_closest_end) { + for sc in first_start_coordinate..last_start_coordinate { + // always increment this + start_clip += 1; + + if let Some(&segment_index) = coordinate_lookup.get(&(sc as i64)) { + // check if it's too far away + if closest_start - segment_index as usize > 2*prefix_len { + continue; + } + + // we found a start coordinate + start_coordinate = Some(segment_index as usize); + + // now try to find an end coordinate also + for ec in (first_end_coordinate..last_end_coordinate).rev() { + // always increment this + end_clip += 1; + + if let Some(&next_index) = coordinate_lookup.get(&(ec as i64)) { + // check if it's too far away + if next_index as usize - closest_end > 2*postfix_len { + continue; + } + + // we found an end coordinate also + end_coordinate = Some(next_index as usize); + break; + } + } + break; + } + } + } else { + // the closest ones failed, we won't succeed here either + } + + if let Some(ss) = start_coordinate { + if let Some(se) = end_coordinate { + trace!("\t{}..{} = {:?} {:?}; next = {} {}", ss, se, &read_sequence[ss..se], &read_qualities[ss..se], read_sequence[se], read_qualities[se]); + + let edit_distance: usize; + allele = variant.match_allele(&read_sequence[ss..se]); + if allele == 2 { + // no exact match, do inexact matching + (allele, edit_distance, _) = 
variant.closest_allele_clip(&read_sequence[ss..se], start_clip - 1, end_clip - 1); + exact_allele = false; + } else { + edit_distance = 0; + exact_allele = true; + } + + // this approach uses harmonic mean of base quality + // * no ED penalty + // * weighted - the same weight factors are applied, but in a down-weighting approach (this is because we don't want to exceed u8::MAX) + let divisor_multiplier: i64 = match variant_type { + // these are down-weights for local mode + // SNV has highest confidence here + VariantType::Snv => 1, + + // indels tend to be pretty bad + VariantType::Deletion | + VariantType::Insertion | + VariantType::Indel => 4, + + // SVs are generally worst of all in local mode + VariantType::SvDeletion | + VariantType::SvInsertion => 4, + + // we want tandem repeats to have a higher weight than our generic indels + VariantType::TandemRepeat => 2, + + _ => { + panic!("No implementation for matching {variant_type:?}"); + } + }; + + qual = (( + (se - ss) as f64 / + read_qualities[ss..se].iter() + .map(|&q| 1.0f64 / q as f64) + .sum::() + ).max(4.0) / divisor_multiplier as f64) as u8; + + overlaps_allele = true; + trace!("\tallele = {}, qual = {}, ED = {}", allele, qual, edit_distance); + } else { + trace!("\tfailed allele match for ref extension"); + allele = 2; + qual = 0; + exact_allele = false; + overlaps_allele = true; + } + } else { + //no overlap + if aligned_range.contains(&variant_pos) { + trace!("\tOverlap, no position"); + overlaps_allele = true; + allele = 2; + } else { + // there is no alignment overlap + trace!("\tNo overlap"); + overlaps_allele = false; + allele = 3; + } + qual = 0; + exact_allele = false; + } + }, + VariantType::SvDeletion => { + // we need these to build coordinate ranges + let ref_allele_len: usize = variant.get_ref_len(); + + if aligned_range.contains(&variant_pos) { + // coordinate ranges we care about + let last_start_coordinate: usize = variant_pos as usize + 1; // add one because we want to include 
variant_pos + let first_end_coordinate: usize = variant_pos as usize + ref_allele_len; + if aligned_range.contains(&(first_end_coordinate as i64)) { + // calculate how many bases we expect to see deleted + let expected_deleted: usize = first_end_coordinate - last_start_coordinate; + + // now we need to move up and down until we find an anchor point + let mut start_anchor: usize = last_start_coordinate; + while !coordinate_lookup.contains_key(&(start_anchor as i64)) { + if start_anchor <= aligned_range.start as usize { + // fixes weird CIGARs where a mapping starts with non-matching types, e.g.: + // [SoftClip(3139), Del(798), Equal(4), ... + warn!("Reached start of read ({}) without finding start_anchor, using POS ({}) instead.", std::str::from_utf8(read.qname()).unwrap_or("utf8-decode-error"), start_anchor); + break; + } + start_anchor -= 1; + } + let mut end_anchor: usize = first_end_coordinate; + while !coordinate_lookup.contains_key(&(end_anchor as i64)) { + end_anchor += 1; + if end_anchor >= aligned_range.end as usize { + // we have not observed it, but this is a symmetrical handling of the weird CIGARs for the end + warn!("Reached end of read ({}) without finding end_anchor, using max ({}) found instead.", std::str::from_utf8(read.qname()).unwrap_or("utf8-decode-error"), end_anchor); + break; + } + } + + // count up the number of missing (i.e. 
deleted) based in the reference + let mut deleted_count: usize = 0; + for dc in start_anchor..end_anchor { + if !coordinate_lookup.contains_key(&(dc as i64)) { + deleted_count += 1; + } + } + + // it's possible to have more deleted bases than expected + // assert!(expected_deleted >= deleted_count); + + // the quality if we have exactly the right number of deleted (or not deleted) bases + let exact_allele_qual: f64 = 40.0; + // divisor for downweighting quality values, this is higher than indels currently + let deletion_factor: f64 = 8.0; + // fixes the ratios that match REF or ALT here to: REF = [0, match_window_size); ALT = (1.0 - match_window_size, 1.0 + match_window_size) + let match_window_size: f64 = 0.33; + + let deleted_ratio: f64 = deleted_count as f64 / expected_deleted as f64; + if deleted_ratio < match_window_size { + // mostly not deleted + allele = 0; + if deleted_ratio == 0.0 { + // this is pretty unlikely + qual = (exact_allele_qual / deletion_factor) as u8; + exact_allele = true; + } else { + qual = ((-10.0 * deleted_ratio.log10()) / deletion_factor).max(1.0) as u8; + exact_allele = false; + } + } else if (1.0 - deleted_ratio).abs() < match_window_size { + // mostly deleted and not over-deleted + allele = 1; + if deleted_ratio == 1.0 { + // this is pretty unlikely + qual = (exact_allele_qual / deletion_factor) as u8; + exact_allele = true; + } else { + qual = ((-10.0 * (1.0 - deleted_ratio).abs().log10()) / deletion_factor).max(1.0) as u8; + exact_allele = false; + } + + // this is getting labeled a deletion, force anything overlapping it to be reference (because it isn't there) + last_deletion_end = first_end_coordinate; + } else { + // ambiguous either because it's in between or over-deleted + allele = 2; + qual = 0; + exact_allele = false; + } + overlaps_allele = true; + } else { + // we have a partial overlap, but don't reach the far end + // mirror what we do above by marking overlap as true but otherwise a failure to match + allele = 2; + 
qual = 0; + exact_allele = false; + overlaps_allele = true; + } + } else { + // we don't overlap the start + allele = 3; + qual = 0; + exact_allele = false; + overlaps_allele = false; + } + }, + _ => { + panic!("Unhandled variant type: {variant_type:?}"); + } + }; + } + + // gather stats on the match + if overlaps_allele { + assert!(allele <= 2); + if allele == 2 { + failed_matches[vt_index] += 1; + } else { + if exact_allele { + exact_matches[vt_index] += 1; + } else { + inexact_matches[vt_index] += 1; + } + if allele == 0 { + allele0_matches[vt_index] += 1; + } else { + allele1_matches[vt_index] += 1; + } + num_overlaps += 1; + num_alleles += 1; + } + } else { + assert_eq!(allele, 3); + } + + // no matter what, we push these now + alleles.push(allele); + // make sure the quality is always at least 1 + quals.push(qual.max(1)); + } + assert_eq!(num_variants, alleles.len()); + assert_eq!(num_variants, quals.len()); + trace!("All alleles {:?}\n", alleles); + + if num_overlaps > 0 { + let read_name: String = String::from_utf8(read.qname().to_vec()).unwrap(); + let read_group: &mut Vec = read_groups.entry(read_name.clone()).or_insert(vec![]); + read_group.push(ReadSegment::new(read_name, alleles, quals)); + } else { + // this one has no overlaps, so it's just a skipped read + skipped_reads += 1; + } + } + } + + // now collapse all the reads, but only keeping those with at least 2 things set + let mut read_segments: IntervalTree = IntervalTree::new(); + let mut phasable_segments: IntervalTree = IntervalTree::new(); + for (_qname, read_group) in read_groups.iter() { + let collapsed_read: ReadSegment = ReadSegment::collapse(read_group); + let num_set: usize = collapsed_read.get_num_set(); + if num_set >= min_matched_alleles { + let segment_range = collapsed_read.get_range(); + read_segments.insert(segment_range, collapsed_read); + num_reads += read_group.len() as u64; + } else { + skipped_reads += read_group.len() as u64; + if num_set > 0 { + // even though this won't be 
used for phasing, it CAN be phased + let segment_range = collapsed_read.get_range(); + phasable_segments.insert(segment_range, collapsed_read); + } + } + } + + // sanity check this; this was before we started making sure failed alleles only applied if the mapping overlapped + // assert_eq!(num_alleles, (num_reads + skipped_reads) * (num_variants as u64)); + let segment_stats = ReadStats::new( + num_reads, skipped_reads, num_alleles, + exact_matches, inexact_matches, failed_matches, + allele0_matches, allele1_matches, + false + ); + debug!("Read segment stats: {:?}", segment_stats); + + Ok((read_segments, phasable_segments, segment_stats)) +} + +/// Loads up all the reads in a particular phase region and converts them into their variant representation. +/// This version uses global re-alignment to parse the alleles. +/// Returns an interval tree containing all reads to use for phasing, a second tree containing extra reads that *can* be phased but didn't match our criteria, +/// and statistics from loading the reads. 
+/// # Arguments +/// * `phase_problem` - the phase block we are loading data for +/// * `bam_paths` - the BAM files to parse, must be indexed +/// * `variant_calls` - the variants used to convert full reads into haplotype observations (`ReadSegment`) +/// * `hom_calls` - any homozygous variants within the region, these don't get phased but are useful for global realignment +/// * `reference_genome` - the reference genome sequences, required for this approach +/// * `min_matched_alleles` - the minimum number of identified alleles required for a read to be included +/// * `min_mapq` - the minimum MAPQ to consider a read +/// * `max_runtime` - controls the allowed runtime of the global realignment +/// * `wfa_prune_distance` - maximum allowed distance a wavefront can lag; make smaller to reduce run-time at the cost of accuracy +#[allow(clippy::too_many_arguments)] +#[allow(clippy::type_complexity)] +pub fn load_full_read_segments( + phase_problem: &PhaseBlock, bam_paths: &[PathBuf], variant_calls: &[Variant], hom_calls: &[Variant], + reference_genome: &ReferenceGenome, min_matched_alleles: usize, min_mapq: u8, + max_runtime: f32, wfa_prune_distance: usize +) -> Result<(IntervalTree, IntervalTree, ReadStats), Box> { + use rust_htslib::bam::Read; + use rust_htslib::bam::ext::BamRecordExtensions; + + let chromosome: &str = phase_problem.get_chrom(); + let num_variants: usize = variant_calls.len(); + let mut read_groups: HashMap> = Default::default(); + + // stats we track + let mut num_reads: u64 = 0; + let mut skipped_reads: u64 = 0; + let mut num_alleles: u64 = 0; + let mut exact_matches: [u64; VariantType::Unknown as usize + 1] = [0; VariantType::Unknown as usize + 1]; + let mut inexact_matches: [u64; VariantType::Unknown as usize + 1] = [0; VariantType::Unknown as usize + 1]; + let mut failed_matches: [u64; VariantType::Unknown as usize + 1] = [0; VariantType::Unknown as usize + 1]; + let mut allele0_matches: [u64; VariantType::Unknown as usize + 1] = [0; 
VariantType::Unknown as usize + 1]; + let mut allele1_matches: [u64; VariantType::Unknown as usize + 1] = [0; VariantType::Unknown as usize + 1]; + + let mut edit_distances: Vec = vec![]; + + let block_cpu_time = cpu_time::ThreadTime::now(); + + for bam_filename in bam_paths.iter() { + let mut bam_reader = bam::IndexedReader::from_path(bam_filename)?; + bam_reader.set_reference(reference_genome.filename())?; + bam_reader.fetch((chromosome, phase_problem.get_start(), phase_problem.get_end()+1))?; + + for read_entry in bam_reader.records() { + let time_elapsed: f32 = block_cpu_time.elapsed().as_secs_f32(); + if time_elapsed > max_runtime { + bail!("max_runtime reached"); + } + + let mut read = read_entry?; + + //make sure we care about the alignment + if filter_out_alignment_record(&read, min_mapq) { + continue; + } + + //build out the cigar info + read.cache_cigar(); + + //build a lookup from reference coordinate -> sequence coordinate + let mut coordinate_lookup: HashMap = Default::default(); + let mut min_position: i64 = i64::MAX; + let mut max_position: i64 = i64::MIN; + for bp in read.aligned_pairs() { + let segment_index = bp[0]; + let ref_index = bp[1]; + coordinate_lookup.insert(ref_index, segment_index); + min_position = min_position.min(ref_index); + max_position = max_position.max(ref_index); + } + assert!(max_position >= min_position); + + // max_position is the last one that we found, so add +1 to include in the range + let aligned_range = min_position..(max_position+1); + + //we will populate these with the variant level info + let mut num_overlaps: usize = 0; + let mut first_overlap: Option = None; + let mut last_overlap: usize = 0; + for (i, variant) in variant_calls.iter().enumerate() { + let variant_pos: i64 = variant.position(); + if aligned_range.contains(&variant_pos) { + if first_overlap.is_none() { + first_overlap = Some(i); + } + last_overlap = i+1; + num_overlaps += 1; + } + } + + // if this mapping overlaps no alleles, then there's no reason 
to look at it anymore + if num_overlaps == 0 { + skipped_reads += 1; + continue; + } + + // convert into a non-option + let first_overlap: usize = first_overlap.unwrap(); + assert_eq!(num_overlaps, last_overlap - first_overlap); + + // check for homozygous variants also + let mut first_hom_overlap: Option = None; + let mut last_hom_overlap: usize = 0; + for (i, variant) in hom_calls.iter().enumerate() { + let variant_pos: i64 = variant.position(); + if aligned_range.contains(&variant_pos) { + if first_hom_overlap.is_none() { + first_hom_overlap = Some(i); + } + last_hom_overlap = i+1; + } + } + let first_hom_overlap: usize = first_hom_overlap.unwrap_or(0); + + // .seq() returns Seq<'_> type, but we should just full decode + let read_sequence: Vec = read.seq().as_bytes(); + let read_qualities: &[u8] = read.qual(); + assert_eq!(read_sequence.len(), read_qualities.len()); + + // these should always exist based on how we set it up + let read_start: usize = *coordinate_lookup.get(&min_position).unwrap() as usize; + let read_end: usize = *coordinate_lookup.get(&max_position).unwrap() as usize; + + // pull out the part of the read we're aligning against + let read_align: &[u8] = &read_sequence[read_start..(read_end+1)]; + + /* + Current state: + - we have the reference genome + - we have the part of the read that aligns in `read_align`, the full read sequence in `read_sequence` + - we have the indices of the first and last variant overlaps in `first_overlap` and `last_overlap` + + We need to populate: + - alleles + - quals + - read stats (see below) + + Game plan: + - construct a graph representing just this reference location + relevant alleles + - while constructing, assign alleles to each new branch (it may be reference allele) + -- IF you have multiple alleles starting at the same coordinate (e.g. 
identical call), then do not create an in-between node; this should resolve in the tie-breaking as "identical" + -- so each branch should get a variant index + an allele assignment (0/1); reference alleles may end up with multiple 0 alleles in the event of multi-start + - align the read via POA + - look at the traversed nodes and copy the allele assignments; if anything is unassigned at the end, it gets 2; any with conflicting assignments get 2 also + - update stats according to the assignments, we can't really do exact right now (maybe we can look at score deltas from one node to the next?) + */ + let chrom_seq: &[u8] = reference_genome.get_full_chromosome(chromosome); + + // we need to also provide any preset alleles + let start_time = std::time::Instant::now(); + let (wfa_graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = + WFAGraph::from_reference_variants_with_hom( + chrom_seq, + &variant_calls[first_overlap..last_overlap], // these are both range style indices + &hom_calls[first_hom_overlap..last_hom_overlap], + min_position as usize, + max_position as usize + 1 + ).unwrap(); + + let wfa_result: WFAResult = match wfa_graph.edit_distance_with_pruning(read_align, wfa_prune_distance) { + Ok(wr) => wr, + Err(e) => { + bail!( + "Encountered WFA error for mapping \"{}\" ({}:{}): {}", + std::str::from_utf8(read.qname()).unwrap_or("QNAME_UTF8_ERROR"), + chromosome, read.pos(), e + ); + } + }; + + debug!( + "B#{} WFAGraph result ({}) => num_nodes: {}, read_len: {}, variant_overlaps: {}, edit_distance: {}", + phase_problem.get_block_index(), start_time.elapsed().as_secs_f32(), wfa_graph.get_num_nodes(), max_position-min_position+1, num_overlaps, wfa_result.score() + ); + + edit_distances.push(wfa_result.score()); + + //we will populate these with the variant level info + let mut alleles: Vec = vec![3; num_variants]; + for traversed_index in wfa_result.traversed_nodes().iter() { + for &(var_index, allele_assignment) in 
node_to_alleles.get(traversed_index).unwrap_or(&vec![]).iter() { + let correct_index: usize = first_overlap+var_index; + if alleles[correct_index] == 3 { + alleles[correct_index] = allele_assignment; + } else if alleles[correct_index] != allele_assignment { + alleles[correct_index] = 2; + } + } + } + + // go through the result counting assigned and setting qualities + let mut quals: Vec = vec![0; num_variants]; + for (i, a) in alleles.iter_mut().enumerate() { + let variant_type: VariantType = variant_calls[i].get_type(); + let vt_index: usize = variant_type as usize; + if *a == 3 { + // no overlaps for this allele + } else if *a == 2 { + // overlaps, but ambiguous matching + failed_matches[vt_index] += 1; + } else { + // we got a match, figure out the quality for it + quals[i] = match variant_type { + // these weights are up-weights for global re-alignments + // SNVs tend to always be the cleanest + VariantType::Snv => 8, + + // these are probably the noisiest of the bunch + VariantType::Deletion | + VariantType::Insertion | + VariantType::Indel => 1, + + // these should be pretty high confidence because they have a lot of bases to make them work + VariantType::SvDeletion | + VariantType::SvInsertion => 2, + + // we want tandem repeats to have higher confidence than random indels + VariantType::TandemRepeat => 4, + + _ => { + panic!("No implementation for matching {variant_type:?}"); + } + }; + + // gather stats on the match + let exact_allele = false; // TODO: figure this out + if exact_allele { + exact_matches[vt_index] += 1; + } else { + inexact_matches[vt_index] += 1; + } + if *a == 0 { + allele0_matches[vt_index] += 1; + } else { + allele1_matches[vt_index] += 1; + } + num_alleles += 1; + } + } + + // need to check what these were before + assert_eq!(num_variants, alleles.len()); + assert_eq!(num_variants, quals.len()); + trace!("All alleles {:?}\n", alleles); + + let read_name: String = String::from_utf8(read.qname().to_vec()).unwrap(); + let read_group: &mut 
Vec = read_groups.entry(read_name.clone()).or_insert(vec![]); + read_group.push(ReadSegment::new(read_name, alleles, quals)); + } + } + + // now collapse all the reads, but only keeping those with at least 2 things set + let mut read_segments: IntervalTree = IntervalTree::new(); + let mut phasable_segments: IntervalTree = IntervalTree::new(); + for (_qname, read_group) in read_groups.iter() { + let collapsed_read: ReadSegment = ReadSegment::collapse(read_group); + let num_set: usize = collapsed_read.get_num_set(); + if num_set >= min_matched_alleles { + let segment_range = collapsed_read.get_range(); + read_segments.insert(segment_range, collapsed_read); + num_reads += read_group.len() as u64; + } else { + skipped_reads += read_group.len() as u64; + if num_set > 0 { + // even though this won't be used for phasing, it CAN be phased + let segment_range = collapsed_read.get_range(); + phasable_segments.insert(segment_range, collapsed_read); + } + } + } + + // sanity check this; this was before we started making sure failed alleles only applied if the mapping overlapped + // assert_eq!(num_alleles, (num_reads + skipped_reads) * (num_variants as u64)); + let segment_stats = ReadStats::new( + num_reads, skipped_reads, num_alleles, + exact_matches, inexact_matches, failed_matches, + allele0_matches, allele1_matches, + true + ); + debug!("Read segment stats: {:?}", segment_stats); + debug!("Edit distances: {:?}", edit_distances); + + Ok((read_segments, phasable_segments, segment_stats)) +} \ No newline at end of file diff --git a/src/sequence_alignment.rs b/src/sequence_alignment.rs new file mode 100644 index 0000000..0ff9721 --- /dev/null +++ b/src/sequence_alignment.rs @@ -0,0 +1,77 @@ + +/// Returns the edit distance between two u8 Vecs by doing the full grid calculation. +/// This version is row-based (rows are length of v1) for the main loop. 
/// # Arguments
/// * `v1` - the first sequence
/// * `v2` - the second sequence
pub fn edit_distance(v1: &[u8], v2: &[u8]) -> usize {
    // structured such that each "row" is the length of v1 (i.e. v1 is conceptually on the x-axis)
    let l1: usize = v1.len();
    let mut row: Vec<usize> = vec![0; l1+1];
    // the first row is 0..=l1: cost of deleting i characters of v1 against an empty prefix of v2
    let mut prev_row: Vec<usize> = (0..l1+1).collect();

    // go through each row
    for (i, &c2) in v2.iter().enumerate() {
        row[0] = i+1;
        for (j, &c1) in v1.iter().enumerate() {
            // 0-cost diagonal on a match, 1-cost substitution otherwise
            let sub_cost: usize = if c1 == c2 { 0 } else { 1 };
            row[j+1] = (prev_row[j+1] + 1) // skip a character in v2
                .min(row[j] + 1)           // skip a character in v1
                .min(prev_row[j] + sub_cost); // diagonal match/mismatch
        }

        // swap the rows at the end of each iteration
        std::mem::swap(&mut row, &mut prev_row);
    }

    prev_row[l1]
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_edit_distance() {
        let v1: Vec<u8> = vec![0, 1, 2, 4, 5];
        let v2: Vec<u8> = vec![0, 1, 3, 4, 5];
        let v3: Vec<u8> = vec![1, 2, 3, 5];
        let v4: Vec<u8> = vec![];

        assert_eq!(edit_distance(&v1, &v1), 0);
        assert_eq!(edit_distance(&v1, &v2), 1);
        assert_eq!(edit_distance(&v1, &v3), 2);
        assert_eq!(edit_distance(&v1, &v4), 5);

        assert_eq!(edit_distance(&v2, &v2), 0);
        assert_eq!(edit_distance(&v2, &v3), 3);
        assert_eq!(edit_distance(&v2, &v4), 5);

        assert_eq!(edit_distance(&v3, &v3), 0);
        assert_eq!(edit_distance(&v3, &v4), 4);

        assert_eq!(edit_distance(&v4, &v4), 0);
    }

    #[test]
    fn test_edit_error_001() {
        let v1 = [65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 67, 65, 65, 65];
        let v2 = [65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 67, 65, 65, 65, 65, 65, 65, 67, 65, 65, 65];
        let v3 = [65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 67, 65, 65, 65];

        assert_eq!(edit_distance(&v1, &v3), 1);
        assert_eq!(edit_distance(&v2, &v3), 1);
        assert_eq!(edit_distance(&v3, &v1), 1);
        assert_eq!(edit_distance(&v3, &v2), 1);
    }
}
file diff --git a/src/wfa_graph.rs b/src/wfa_graph.rs new file mode 100644 index 0000000..513f0bf --- /dev/null +++ b/src/wfa_graph.rs @@ -0,0 +1,1157 @@ + +use crate::data_types::variants::Variant; + +use bit_vec::BitVec; +#[allow(unused_imports)] +use log::{debug, trace, warn}; +use priority_queue::PriorityQueue; +use simple_error::bail; +use std::cmp::Reverse; +use rustc_hash::FxHashMap as HashMap; + +pub type NodeAlleleMap = HashMap>; + +/// Contains the core data that represents a "node". +/// Most functional logic will not be in the struct, this is mostly a container. +#[derive(Debug)] +struct WFANode { + /// this node's index + node_index: usize, + /// the sequence contained by the POA node + sequence: Vec, + /// contains the indices of the parent nodes, sorted + parent_nodes: Vec +} + +impl WFANode { + /// Create a new WFANode and performs sanity checks on inputs. + pub fn new(node_index: usize, sequence: Vec, mut parent_nodes: Vec) -> WFANode { + parent_nodes.sort(); + WFANode { + node_index, + sequence, + parent_nodes + } + } + + #[allow(dead_code)] + pub fn node_index(&self) -> usize { + self.node_index + } + + pub fn sequence(&self) -> &[u8] { + &self.sequence + } + + #[allow(dead_code)] + pub fn parent_nodes(&self) -> &[usize] { + &self.parent_nodes + } +} + +/// Contains functionality for building a Partial-Order Alignment (POA) graph and then aligning a sequence to it. +/// Assumes that the last node added to the graph is the "target" or "destination" for mapping. +#[derive(Default)] +pub struct WFAGraph { + /// all the nodes in the graph so far + nodes: Vec, + /// all the edges from one node to the next + edges: Vec> +} + +impl WFAGraph { + /// Creates a new empty graph + pub fn new() -> WFAGraph { + WFAGraph { + nodes: Default::default(), + edges: Default::default() + } + } + + /// Constructs a WFA graph using only heterozygous variant types. This is the het-only entry point for graph construction. 
+ /// # Arguments + /// * `reference` - the reference sequence that is the backbone for the graph + /// * `variants` - a set of heterozygous variants that we are trying to assign to a read + /// * `ref_start` - the reference start coordinate, used for offsetting variant positions; 0-based inclusive + /// * `ref_end` - the reference end coordinate, used for offsetting variant positions; 0-based exclusive + /// # Errors + /// * if there are any errors from adding a node to the graph + pub fn from_reference_variants(reference: &[u8], variants: &[Variant], ref_start: usize, ref_end: usize) -> + Result<(WFAGraph, NodeAlleleMap), Box> { + Self::from_reference_variants_with_hom( + reference, + variants, + &[], + ref_start, + ref_end + ) + } + + /// Constructs a WFA graph using both heterozygous and homozygous variant types. This is the het/hom entry point for graph construction. + /// # Arguments + /// * `reference` - the reference sequence that is the backbone for the graph + /// * `variants` - a set of heterozygous variants that we are trying to assign to a read + /// * `hom_variants` - a set of homozygous variants that we are not trying to assign to a read, but they make alignment better + /// * `ref_start` - the reference start coordinate, used for offsetting variant positions; 0-based inclusive + /// * `ref_end` - the reference end coordinate, used for offsetting variant positions; 0-based exclusive + /// # Errors + /// * if there are any errors from adding a node to the graph + pub fn from_reference_variants_with_hom(reference: &[u8], variants: &[Variant], hom_variants: &[Variant], ref_start: usize, ref_end: usize) -> + Result<(WFAGraph, NodeAlleleMap), Box> { + + let mut graph: WFAGraph = Default::default(); + let mut node_to_alleles: NodeAlleleMap = Default::default(); + + let mut previous_end: usize = ref_start; + let mut reference_index: usize; + + // this tracks nodes that need to be reconnected next, i.e. 
parents for the _next_ reference node + // initial one is empty because there is no start node yet + let mut reference_reconnect: Vec = vec![]; + // marks the alleles that should be tied to the next reference insertion + let mut reference_alleles: Vec<(usize, u8)> = vec![]; + + // this is a queue where the key is reconnect position and value is the node to reconnect at that juncture + let mut reconnect_queue: PriorityQueue> = PriorityQueue::new(); + + let mut all_variants: Vec<(&Variant, Option)> = Vec::with_capacity(variants.len() + hom_variants.len()); + for (variant_index, variant) in variants.iter().enumerate() { + all_variants.push((variant, Some(variant_index))); + } + for variant in hom_variants.iter() { + all_variants.push((variant, None)); + } + all_variants.sort_by(|v1, v2| v1.0.position().cmp(&v2.0.position())); + + for (variant, variant_index) in all_variants.iter() { + if variant.is_ignored() { + // I don't think we need a trace message here for now + continue; + } + + // look at where this variant is + let variant_pos: usize = variant.position() as usize; + let ref_len: usize = variant.get_ref_len(); + if variant_pos + ref_len > ref_end { + // this variant end after our reference block, so ignore it + trace!("Ignoring variant ending at {} after ref_end {}", variant_pos+ref_len, ref_end); + continue; + } + + // while we have something to reconnect that reconnects BEFORE the next variant, handle it + while (reconnect_queue.peek().unwrap_or((&usize::MAX, &Reverse(usize::MAX))).1).0 <= variant_pos { + // get the next thing that needs to reconnect before the next variant + let (alt_index, Reverse(alt_reconnect)) = reconnect_queue.pop().unwrap(); + assert!(alt_reconnect > previous_end); + + // first, we have to build up the reference node up until the reconnect point + let ref_sequence: Vec = reference[previous_end..alt_reconnect].to_vec(); + reference_index = graph.add_node(ref_sequence, reference_reconnect)?; + if !reference_alleles.is_empty() { + 
node_to_alleles.insert(reference_index, reference_alleles); + reference_alleles = vec![]; + } + previous_end = alt_reconnect; + + // now prep the next one by marking that reference node plus the alt_index we are reconnecting + reference_reconnect = vec![reference_index, alt_index]; + + // also check if any other reconnects have an identical reconnect point + while reconnect_queue.peek().unwrap_or((&usize::MAX, &Reverse(usize::MAX))).1.0 == alt_reconnect { + let (ai2, Reverse(ar2)) = reconnect_queue.pop().unwrap(); + assert_eq!(alt_reconnect, ar2); + reference_reconnect.push(ai2); + } + } + + // at this point, any reconnections before this point have been resolved + + // check if the reference ended upstream of this variant (which is usually true) OR + // if the graph is currently empty, indicating that there is a variant at position 0 and we need a dummy start node + if previous_end < variant_pos || graph.get_num_nodes() == 0 { + // we need to catch up the reference, add a node representing all sequence up to this point + let ref_sequence: Vec = reference[previous_end..variant_pos].to_vec(); + + // now add the reference node that catches us up to this variant + reference_index = graph.add_node(ref_sequence, reference_reconnect)?; + if !reference_alleles.is_empty() { + node_to_alleles.insert(reference_index, reference_alleles); + reference_alleles = vec![]; + } + + // set these fields for the next reference node that gets added + reference_reconnect = vec![reference_index]; + previous_end = variant_pos; + } else { + assert!(previous_end == variant_pos); + // in this situation, we have already generated the sequence up to this variant, likely because two variants start at the same location + // we should not have to do anything special because we already know the upstream index + } + + // now add the alt allele(s) + if variant.convert_index(0) != 0 { + // allele0 is an alt, so this must be multi-allelic; basically do the same thing we would do for allele1 + // add 
// the sequence exactly with just the immediately upstream reference node
                let alt_sequence: Vec<u8> = variant.get_truncated_allele0().to_vec();
                let parent_nodes: Vec<usize> = reference_reconnect.clone();
                let alt_index: usize = graph.add_node(alt_sequence, parent_nodes)?;
                let alt_reconnect: usize = variant_pos + ref_len;

                // also mark this alt node as having this particular allele0
                if let Some(vi) = variant_index {
                    node_to_alleles.insert(alt_index, vec![(*vi, 0)]);
                }

                // now we need to mark this new node for reconnection downstream
                reconnect_queue.push(alt_index, Reverse(alt_reconnect));
            } else {
                // the 0 allele is just reference, so add it to the reference allele set
                if let Some(vi) = variant_index {
                    reference_alleles.push((*vi, 0));
                }
            }

            // allele1 is *always* an alt, add the sequence exactly with just the immediately upstream reference node
            let alt_sequence: Vec<u8> = variant.get_truncated_allele1().to_vec();
            let parent_nodes: Vec<usize> = reference_reconnect.clone();
            let alt_index: usize = graph.add_node(alt_sequence, parent_nodes)?;
            let alt_reconnect: usize = variant_pos + ref_len;

            // also mark this alt node as having this particular allele
            if let Some(vi) = variant_index {
                node_to_alleles.insert(alt_index, vec![(*vi, 1)]);
            }

            // now we need to mark this new node for reconnection downstream
            reconnect_queue.push(alt_index, Reverse(alt_reconnect));
        }

        // reconnect everything downstream from here
        while !reconnect_queue.is_empty() {
            let (alt_index, Reverse(alt_reconnect)) = reconnect_queue.pop().unwrap();
            assert!(alt_reconnect > previous_end);
            let ref_sequence: Vec<u8> = reference[previous_end..alt_reconnect].to_vec();
            reference_index = graph.add_node(ref_sequence, reference_reconnect)?;
            if !reference_alleles.is_empty() {
                node_to_alleles.insert(reference_index, reference_alleles);
                reference_alleles = vec![];
            }
            previous_end = alt_reconnect;

            // now prep the next one
            reference_reconnect = vec![reference_index, alt_index];
            // pull any other alt nodes that reconnect at exactly this same reference position
            while reconnect_queue.peek().unwrap_or((&usize::MAX, &Reverse(usize::MAX))).1.0 == alt_reconnect {
                let (ai2, Reverse(ar2)) = reconnect_queue.pop().unwrap();
                assert_eq!(alt_reconnect, ar2);
                reference_reconnect.push(ai2);
            }
        }

        // now we just have one last reference node to add
        assert!(previous_end <= ref_end);
        let ref_sequence: Vec<u8> = reference[previous_end..ref_end].to_vec();
        graph.add_node(ref_sequence, reference_reconnect)?;

        // make sure we didn't have any loose reference alleles hanging about, I don't think this can happen unless users enter weird stuff
        assert!(reference_alleles.is_empty());

        Ok((graph, node_to_alleles))
    }

    /// Returns the number of nodes currently in the graph.
    pub fn get_num_nodes(&self) -> usize {
        self.nodes.len()
    }

    /// Adds a node to the graph and returns its index as a Result.
    /// # Arguments
    /// * `sequence` - the vector of sequence to add with this node
    /// * `parent_nodes` - the index of any upstream nodes in the graph
    /// # Errors
    /// * if the first node inserted has parents; first node is assumed root, so this would break that assumption
    /// * if any subsequent node is parent-less; all nodes must stem from the root
    /// * if any parent node has an index >= this node's index; this is a DAG only
    pub fn add_node(&mut self, sequence: Vec<u8>, parent_nodes: Vec<usize>) -> Result<usize, Box<dyn std::error::Error>> {
        let new_index: usize = self.nodes.len();

        // sanity checks on what is being added, node wise anyways
        if new_index == 0 {
            // this is the first node, it should not have any parents
            if !parent_nodes.is_empty() {
                bail!("First node must have no parent nodes.");
            }
        } else {
            // this is a non-first node, it MUST have a parent
            if parent_nodes.is_empty() {
                bail!("All nodes after the first must have at least one parent node.");
            }
            // make sure all parent node indices come before this node
            for &pn in parent_nodes.iter() {
                if new_index <= pn {
                    bail!("All parent nodes must come before this node.");
                }
            }
        }

        // add any new edges from parents
        for &p_index in parent_nodes.iter() {
            self.edges[p_index].push(new_index);
        }

        // add the new node with an empty set of edges coming from it
        let new_node: WFANode = WFANode::new(new_index, sequence, parent_nodes);
        self.nodes.push(new_node);
        self.edges.push(vec![]);

        Ok(new_index)
    }

    /// Calculates the edit distance of `other_sequence` onto this graph, returning both the score and the traversed nodes to achieve that score.
    /// # Arguments
    /// * `other_sequence` - the sequence being aligned to this graph
    /// # Errors
    /// * if the maximum edit distance is reached; this is a safeguard from run-away loops
    pub fn edit_distance(&self, other_sequence: &[u8]) -> Result<WFAResult, Box<dyn std::error::Error>> {
        self.edit_distance_with_pruning(other_sequence, usize::MAX)
    }

    /// Calculates the edit distance of `other_sequence` onto this graph using the WFA algorithm, returning both the score and the traversed nodes to achieve that score.
    /// If multiple paths exist that are equal, all nodes along each path are returned, allowing us to mark ambiguity.
    /// This mode will prune wavefronts that fall too far behind the farthest, leading to potentially incorrect results under certain conditions.
    /// # Arguments
    /// * `other_sequence` - the sequence being aligned to this graph
    /// * `prune_distance` - if a wavefront is behind the farthest wavefront by this distance, it will be pruned; set to usize::MAX to disable pruning
    /// # Errors
    /// * if the maximum edit distance is reached; this is a safeguard from run-away loops
    pub fn edit_distance_with_pruning(&self, other_sequence: &[u8], prune_distance: usize) -> Result<WFAResult, Box<dyn std::error::Error>> {
        // We will structure the algorithm mentally such that X-axis is the graph and Y-axis is `other_sequence`.
        // This means we are iterating on columns representing characters in the graph.
        // Each column will be other_len long.

        // each node *may* have a set of active wavefronts indicating progression - HashMap of (at most) length N
        // these active wavefronts must all be the same ED, but may not be adjacent (if one branch has a big indel) - HashMap based on start position in other_sequence
        // each start position should have current distance into the `other_sequence` along with a set of upstream nodes - (a, Vec(b)) where a is an offset and b is index of set(s)
        let mut active_wavefronts: HashMap<usize, HashMap<isize, Vec<(usize, usize)>>> = Default::default();
        let mut next_wavefronts: HashMap<usize, HashMap<isize, Vec<(usize, usize)>>> = Default::default();
        let mut max_wavefronts: HashMap<usize, HashMap<isize, usize>> = Default::default();

        // we also need to track which nodes are traversed by our particular path
        let mut treeset_to_index: HashMap<BitVec, usize> = HashMap::default();
        let mut index_to_treeset: Vec<BitVec> = vec![];

        // we always start in node 0, so make that set
        let mut base_bitvec: BitVec = BitVec::from_elem(self.nodes.len(), false);
        base_bitvec.set(0, true);
        assert!(treeset_to_index.insert(base_bitvec.clone(), 0).is_none());
        index_to_treeset.push(base_bitvec);

        let base_hashset_index: usize = 0;

        // insert the starting wavefront at `other_sequence`[0] + offset = 0, with set containing just node #0
        let mut initial_wavefront: HashMap<isize, Vec<(usize, usize)>> = Default::default();
        initial_wavefront.insert(0, vec![(0, base_hashset_index)]);
        // this goes into node 0
        active_wavefronts.insert(0, initial_wavefront);

        // in a given loop, these will track any nodes that were actively moving wavefronts
        let mut min_active_wavefront: usize;
        let mut max_active_wavefront: usize;

        // each loop of WFA will increase our edit distance by 1
        let mut edit_distance: usize = 0;
        let max_edit_distance: usize = 100000;
        let mut farthest_progression: usize = 0;
        let mut min_progression: usize = 0;

        let mut encountered_nodes: HashMap<usize, bool> = Default::default();

        loop {
            /*
             * Outline of the core POA-WFA algorithm
             * 1. Extend all current wavefronts and create splits - this is loop 1, traversed in node order
             * 2. REMOVED - Back-propagate maximum progressions - this is loop 2, traversed in reverse node order;
             *    this was a net drag on the runtime AND would get wrong results on occasion
             * 3. Increase edit distance to match splits
             */
            min_active_wavefront = usize::MAX;
            max_active_wavefront = 0;

            // trace!("WFAGraph ed={} start: farthest_progression = {}, set_len = {}", edit_distance, farthest_progression, index_to_treeset.len());

            // we can iterate over our nodes in order because they are DAGs entered in order
            let mut wavefronts_scanned = 0;
            for (node_index, node) in self.nodes.iter().enumerate() {
                /*
                 * Outline of this core extension loop:
                 * 1. Push all wavefronts for this node forward
                 * 2. Collapse all wavefronts on the same diagonal such that only the best remain.
                 *    - These get consolidated into a single WF. If it's worse than the best so far, we remove it from consideration.
                 * 3. If a diagonal hits the end of this node's sequence, copy it into all of the children nodes.
                 *    - If it's the final node and has progress through all sequence, we instead mark it as finished.
                 * 4. Once we reach the end, generate any splits that must have ed = ed+1
                 */

                // check if this node has *any* active wavefronts
                if !active_wavefronts.contains_key(&node_index) {
                    continue;
                }

                // if this entry doesn't exist, set it
                if let std::collections::hash_map::Entry::Vacant(e) = encountered_nodes.entry(node_index) {
                    e.insert(true);
                    trace!("WFAGraph n#{} start: min ed = {}", node_index, edit_distance);
                }

                // update our min & max
                min_active_wavefront = min_active_wavefront.min(node_index);
                max_active_wavefront = max_active_wavefront.max(node_index);

                let node_sequence: &[u8] = node.sequence();
                let node_length: usize = node_sequence.len();

                // pull out the active wavefront for this node
                let mut wavefront: HashMap<isize, Vec<(usize, usize)>> = active_wavefronts.remove(&node_index).unwrap();
                let maxfront: &mut HashMap<isize, usize> = max_wavefronts.entry(node_index).or_insert_with(Default::default);

                // `other_start` represent the first position in `other_sequence` for this WF diagonal,
                // and it *can* be negative when we "delete" more node sequence than other sequence
                // `offset` (below) represents the offset into the current node we are comparing currently
                // if `other_start` is negative, then the corresponding `offset` values must be positive enough to overcome it (e.g. >= 0 when added)
                for (other_start, vec_waves) in wavefront.iter_mut() {
                    wavefronts_scanned += 1;

                    // first extend all wavefronts as far as possible, tracking the farthest
                    let mut max_offset: usize = 0;
                    for (offset, _hashset_index) in vec_waves.iter_mut() {
                        // get the position in `other_sequence` we are currently comparing against
                        assert!(other_start + *offset as isize >= 0);
                        let mut other_position: usize = (other_start + *offset as isize) as usize;

                        // now extend as far as we can, making sure to check for boundaries and inequality in bases
                        while *offset < node_length &&
                            other_position < other_sequence.len() &&
                            node_sequence[*offset] == other_sequence[other_position] {
                            *offset += 1;
                            other_position += 1;
                        }
                        max_offset = max_offset.max(*offset);
                    }

                    // if we go along a diagonal and max is less than we've seen before, then this is a suboptimal solution we ignore
                    let maxfront_record: &mut usize = maxfront.entry(*other_start).or_insert(0);
                    if max_offset < *maxfront_record || (other_start + max_offset as isize) < min_progression as isize {
                        // skips_triggered += 1;
                        // vec_waves.clear();
                        continue;
                    }
                    *maxfront_record = max_offset;

                    // double check this truth
                    assert!(other_start + max_offset as isize >= 0);
                    farthest_progression = farthest_progression.max((other_start + max_offset as isize) as usize);

                    // now collapse down everything that made it to the max_offset
                    let best_offset = max_offset;
                    let mut best_sets: Vec<usize> = vec![];
                    for &(o, s) in vec_waves.iter() {
                        if o == best_offset {
                            best_sets.push(s);
                        }
                    }

                    // remove duplicates
                    best_sets.sort();
                    best_sets.dedup();

                    let best_set: usize = if best_sets.len() > 1 {
                        // we have multiple bests, collapse them into a single index
                        let mut set_union: BitVec = BitVec::from_elem(self.nodes.len(), false);
                        for &set_index in best_sets.iter() {
                            let other_set: &BitVec = &index_to_treeset[set_index];
                            set_union.or(other_set);
                        }

                        // get the index of this entry (or create one if necessary)
                        let new_set_index: usize = match treeset_to_index.get(&set_union) {
                            Some(i) => { *i },
                            None => {
                                index_to_treeset.push(set_union.clone());
                                treeset_to_index.insert(set_union, index_to_treeset.len() - 1);
                                index_to_treeset.len() - 1
                            }
                        };
                        new_set_index
                    } else {
                        // only one remains, just copy it
                        best_sets[0]
                    };

                    if max_offset == node_length {
                        // we are at the end of this node, do different things depending on if this is the final node or not
                        if node_index == self.nodes.len() - 1 {
                            assert!(other_start + max_offset as isize >= 0);
                            if ((other_start + max_offset as isize) as usize) < other_sequence.len() {
                                // we are *not* at the end of other sequence, but we *are* at the end of the graph
                                // now we would normally split this into three waves on this node, but only the +1 is valid in this situation
                                let node_wf: &mut HashMap<isize, Vec<(usize, usize)>> = next_wavefronts.entry(node_index).or_insert_with(Default::default);
                                // +1 on diagonal - graph does not advance, other does (other has relative insertion); other_start is one more, but offset does not increase
                                let plus_diagonal: &mut Vec<(usize, usize)> = node_wf.entry(*other_start + 1).or_insert(vec![]);
                                plus_diagonal.push((max_offset, best_set));
                            } else {
                                // we are at the end of both final node and the other sequence
                                // we will handle anything below
                            }
                        } else {
                            assert!(other_start + max_offset as isize >= 0);

                            // we are not in the final node, so we need to push this to successor nodes for more extension
                            // the `new_offset` tells our algorithm which base we're comparing and orients us to a diagonal
                            let new_offset: isize = other_start + max_offset as isize;
                            for &successor_index in self.edges[node_index].iter() {
                                let node_wf: &mut HashMap<isize, Vec<(usize, usize)>> = active_wavefronts.entry(successor_index).or_insert_with(Default::default);
                                let copy_diagonal: &mut Vec<(usize, usize)> = node_wf.entry(new_offset).or_insert(vec![]);

                                // the successor set should include the best + the successors node index
                                let current_set: &BitVec = &index_to_treeset[best_set];
                                let mut new_set: BitVec = BitVec::from_elem(self.nodes.len(), false);
                                new_set.set(successor_index, true);
                                new_set.or(current_set);

                                // get the index of this entry (or create one if necessary)
                                let new_set_index: usize = match treeset_to_index.get(&new_set) {
                                    Some(i) => { *i },
                                    None => {
                                        index_to_treeset.push(new_set.clone());
                                        treeset_to_index.insert(new_set, index_to_treeset.len() - 1);
                                        index_to_treeset.len() - 1
                                    }
                                };
                                copy_diagonal.push((0, new_set_index));
                            }
                        }
                    } else {
                        // now we split this into three waves on this node
                        let node_wf: &mut HashMap<isize, Vec<(usize, usize)>> = next_wavefronts.entry(node_index).or_insert_with(Default::default);
                        // -1 on diagonal - graph advances, other does not (other has relative deletion); other_start is one less, but the offset increases still
                        let minus_diagonal: &mut Vec<(usize, usize)> = node_wf.entry(*other_start - 1).or_insert(vec![]);
                        minus_diagonal.push((max_offset + 1, best_set));

                        // these two can only happen if sequence remains in other
                        assert!(*other_start + max_offset as isize >= 0);
                        if ((*other_start + max_offset as isize) as usize) < other_sequence.len() {
                            // +0 on diagonal - both node and other advance with mismatch; other_start does not change, but offset increases +1
                            let zero_diagonal: &mut Vec<(usize, usize)> = node_wf.entry(*other_start).or_insert(vec![]);
                            zero_diagonal.push((max_offset + 1, best_set));

                            // +1 on diagonal - graph does not advance, other does (other has relative insertion); other_start is one more, but offset does not increase
                            let plus_diagonal: &mut Vec<(usize, usize)> = node_wf.entry(*other_start + 1).or_insert(vec![]);
                            plus_diagonal.push((max_offset, best_set));
                        }
                    }
                }

                if node_index == self.nodes.len() - 1 {
                    // we are at the last node, check if we reached the end
                    // this will store any final results
                    let mut final_hashsets: Vec<usize> = vec![];
                    for (other_start, vec_waves) in wavefront.iter() {
                        for &(offset, hashset_index) in vec_waves.iter() {
                            // if we are at the end of the node AND our sequence
                            assert!(other_start + offset as isize >= 0);
                            if offset == node_length && (other_start + offset as isize) as usize == other_sequence.len() {
                                final_hashsets.push(hashset_index);
                            }
                        }
                    }

                    if !final_hashsets.is_empty() {
                        // we've reached the end through one or more means, collapse and return
                        // remove duplicates
                        final_hashsets.sort();
                        final_hashsets.dedup();

                        // now collapse the non-duplicates if necessary
                        let best_set: usize = if final_hashsets.len() > 1 {
                            // we have more than one, merge them all together and then return that one
                            let mut set_union: BitVec = BitVec::from_elem(self.nodes.len(), false);
                            for &set_index in final_hashsets.iter() {
                                let other_set: &BitVec = &index_to_treeset[set_index];
                                set_union.or(other_set);
                            }

                            // get the index of this entry (or create one if necessary)
                            let new_set_index: usize = match treeset_to_index.get(&set_union) {
                                Some(i) => { *i },
                                None => {
                                    index_to_treeset.push(set_union.clone());
                                    treeset_to_index.insert(set_union, index_to_treeset.len() - 1);
                                    index_to_treeset.len() - 1
                                }
                            };
                            new_set_index
                        } else {
                            // only one exists, just copy it
                            final_hashsets[0]
                        };

                        let sorted_traversed_nodes: Vec<usize> = index_to_treeset[best_set].iter()
                            .enumerate()
                            .filter(|(_i, b)| *b)
                            .map(|(i, _b)| i)
                            .collect();
                        return Ok(WFAResult {
                            score: edit_distance,
                            traversed_nodes: sorted_traversed_nodes
                        });
                    }
                }
            }

            // end of loop - increase ED and update active wavefronts
            edit_distance += 1;
            active_wavefronts = next_wavefronts;
            next_wavefronts = Default::default();

            if farthest_progression > prune_distance {
                min_progression = farthest_progression - prune_distance;
            }

            trace!("edit_distance => {}, wave_fronts scanned => {}, active_indices={}..{}", edit_distance, wavefronts_scanned, min_active_wavefront, max_active_wavefront);
+ + // safety while debugging + if edit_distance > max_edit_distance { + bail!("Max_edit_distance ({}) reached during WFA solving", max_edit_distance); + } + } + } +} + +/// Container for POA results. +#[derive(Debug, Eq, PartialEq)] +pub struct WFAResult { + /// The score of the best match from the alignment + score: usize, + /// Nodes that were traversed to get this best match; conflicting node results indicate a tie in which branch should be traversed. + traversed_nodes: Vec +} + +impl WFAResult { + pub fn score(&self) -> usize { + self.score + } + + pub fn traversed_nodes(&self) -> &[usize] { + &self.traversed_nodes + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_single_node() { + // create a new graph and add a single node to it + let mut graph: WFAGraph = WFAGraph::new(); + let v1: Vec = vec![0, 1, 2, 4, 5]; + graph.add_node(v1.clone(), vec![]).unwrap(); + + // test sequences + let v2: Vec = vec![0, 1, 3, 4, 5]; + let v3: Vec = vec![1, 2, 3, 5]; + let v4: Vec = vec![]; + + // check the nodes in the first one, but just score for the rest + assert_eq!(graph.edit_distance(&v1).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0] }); + assert_eq!(graph.edit_distance(&v2).unwrap().score(), 1); + assert_eq!(graph.edit_distance(&v3).unwrap().score(), 2); + assert_eq!(graph.edit_distance(&v4).unwrap().score(), 5); + } + + #[test] + fn test_two_node_single_path() { + // this is the base sequence + let v1: Vec = vec![0, 1, 2, 4, 5]; + for split_point in 0..v1.len() { + // split the sequence at various points and verify everything is still correct + let mut graph: WFAGraph = WFAGraph::new(); + graph.add_node(v1[0..split_point].to_vec(), vec![]).unwrap(); + graph.add_node(v1[split_point..].to_vec(), vec![0]).unwrap(); + + // test sequences + let v2: Vec = vec![0, 1, 3, 4, 5]; + let v3: Vec = vec![1, 2, 3, 5]; + let v4: Vec = vec![]; + + // check the nodes in the first one, but just score for the rest + 
// every split of a single path must produce identical scores and the full {0, 1} traversal
            assert_eq!(graph.edit_distance(&v1).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1] });
            assert_eq!(graph.edit_distance(&v2).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1] });
            assert_eq!(graph.edit_distance(&v3).unwrap(), WFAResult { score: 2, traversed_nodes: vec![0, 1] });
            assert_eq!(graph.edit_distance(&v4).unwrap(), WFAResult { score: 5, traversed_nodes: vec![0, 1] });
        }
    }

    #[test]
    fn test_basic_variant() {
        // create a graph where index 2 is an SNV change to either 2 or 3
        let mut graph: WFAGraph = WFAGraph::new();
        let v1: Vec<u8> = vec![0, 1, 2, 4, 5];
        graph.add_node(v1[0..2].to_vec(), vec![]).unwrap();
        graph.add_node(vec![2], vec![0]).unwrap();
        graph.add_node(vec![3], vec![0]).unwrap();
        graph.add_node(v1[3..].to_vec(), vec![1, 2]).unwrap();

        // test sequences
        // v2 is in the graph
        let v2: Vec<u8> = vec![0, 1, 3, 4, 5];
        // v3 is 2 away from v1, 3 away from v2
        let v3: Vec<u8> = vec![1, 2, 3, 5];
        // 5 away from both
        let v4: Vec<u8> = vec![];
        // 1 away from both
        let v5: Vec<u8> = vec![0, 1, 4, 5];

        // check the nodes in the first one, but just score for the rest
        assert_eq!(graph.edit_distance(&v1).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 3] });
        assert_eq!(graph.edit_distance(&v2).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3] });
        assert_eq!(graph.edit_distance(&v3).unwrap(), WFAResult { score: 2, traversed_nodes: vec![0, 1, 3] });
        assert_eq!(graph.edit_distance(&v4).unwrap(), WFAResult { score: 5, traversed_nodes: vec![0, 1, 2, 3] });
        assert_eq!(graph.edit_distance(&v5).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 2, 3] });
    }

    #[test]
    fn test_triple_split() {
        let v1 = vec![0, 1, 2, 3, 4, 5];
        let v2 = vec![0, 1, 2, 4, 4, 5];
        let v3 = vec![0, 1, 4, 4, 5];

        // this construct splits the middle into 3 separate alleles (2, 3), (2, 4), and (-, 4)
        let mut graph: WFAGraph = WFAGraph::new();
        let root = graph.add_node(v1[0..2].to_vec(), vec![]).unwrap();
        let s1 = graph.add_node(v1[2..4].to_vec(), vec![root]).unwrap();
        let s2 = graph.add_node(v2[2..4].to_vec(), vec![root]).unwrap();
        let s3 = graph.add_node(v3[2..3].to_vec(), vec![root]).unwrap();
        let tail = graph.add_node(v1[4..].to_vec(), vec![s1, s2, s3]).unwrap();

        assert_eq!(graph.edit_distance(&v1).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s1, tail] });
        assert_eq!(graph.edit_distance(&v2).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s2, tail] });
        assert_eq!(graph.edit_distance(&v3).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s3, tail] });
    }

    #[test]
    fn test_nested_split() {
        let v1 = vec![0, 1, 2, 3, 4, 5];
        let v2 = vec![0, 1, 2, 4, 4, 5];
        let v3 = vec![0, 1, 4, 4, 5];

        // this construct pairs (2, 4) with (-, 4)
        let mut graph: WFAGraph = WFAGraph::new();
        let root = graph.add_node(v1[0..2].to_vec(), vec![]).unwrap();
        // (2, 3) still alone
        let s1 = graph.add_node(v1[2..4].to_vec(), vec![root]).unwrap();
        // s2 contains just (2, )
        let s2 = graph.add_node(v2[2..3].to_vec(), vec![root]).unwrap();
        // s3 contains just (4, ), but it allows you to come directly from root to enable the deletion in v3
        let s3 = graph.add_node(v2[3..4].to_vec(), vec![root, s2]).unwrap();
        // tail is the same
        let tail = graph.add_node(v1[4..].to_vec(), vec![s1, s3]).unwrap();

        assert_eq!(graph.edit_distance(&v1).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s1, tail] });
        assert_eq!(graph.edit_distance(&v2).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s2, s3, tail] });
        assert_eq!(graph.edit_distance(&v3).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s3, tail] });
    }

    #[test]
    fn test_double_split() {
        let v1 = vec![0, 1, 2, 3, 4, 5];
        let v2 = vec![0, 1, 2, 4, 4, 5];
        let v3 = vec![0, 1, 4, 4, 5];

        // this construct separate the deletion event from the SNV event with a "gap" in the middle
        let mut graph: WFAGraph = WFAGraph::new();
        let root = graph.add_node(v1[0..2].to_vec(), vec![]).unwrap();
        // s1 just contains (2, )
        let s1 = graph.add_node(v1[2..3].to_vec(), vec![root]).unwrap();
        // s2 is an empty join
        let s2 = graph.add_node(vec![], vec![root, s1]).unwrap();
        // s3 contains just (3, )
        let s3 = graph.add_node(v1[3..4].to_vec(), vec![s2]).unwrap();
        // s4 contains just (4, )
        let s4 = graph.add_node(v2[3..4].to_vec(), vec![s2]).unwrap();
        // tail picks up the last two bases
        let tail = graph.add_node(v1[4..].to_vec(), vec![s3, s4]).unwrap();

        assert_eq!(graph.edit_distance(&v1).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s1, s2, s3, tail] });
        assert_eq!(graph.edit_distance(&v2).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s1, s2, s4, tail] });
        assert_eq!(graph.edit_distance(&v3).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s2, s4, tail] });
    }

    #[test]
    fn test_overlapping_split() {
        let v1 = vec![0, 1, 2, 3, 4, 5];
        let v2 = vec![0, 3, 4, 5];
        let v3 = vec![0, 1, 4, 5];

        /*
        Graph structure of overlapping splits, both of which delete the "2":
             -> -> -v
        0 -> 1 -> 2 -> 3 -> 4,5
             -> -> -^
        */
        let mut graph: WFAGraph = WFAGraph::new();
        let root = graph.add_node(v1[0..1].to_vec(), vec![]).unwrap();
        // represents the (1, )
        let s1 = graph.add_node(v1[1..2].to_vec(), vec![root]).unwrap();
        // represents the (2, )
        let s2 = graph.add_node(v1[2..3].to_vec(), vec![s1]).unwrap();
        // represents the (3, )
        let s3 = graph.add_node(v1[3..4].to_vec(), vec![root, s2]).unwrap();
        // represent the tail
        let tail = graph.add_node(v1[4..].to_vec(), vec![s1, s3]).unwrap();

        assert_eq!(graph.edit_distance(&v1).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s1, s2, s3, tail] });
        assert_eq!(graph.edit_distance(&v2).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s3, tail] });
        assert_eq!(graph.edit_distance(&v3).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s1, tail] });
    }

    #[test]
    fn test_simple_snv() {
        let reference = "AAA".as_bytes();
        let variants = vec![Variant::new_snv(0, 1, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1)];

        let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) =
            WFAGraph::from_reference_variants(&reference, &variants, 0, reference.len()).unwrap();

        // check the alignments first
        assert_eq!(graph.get_num_nodes(), 4);
        // remember ALT alleles get added before reference alleles
        assert_eq!(graph.edit_distance(&reference).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3] });
        assert_eq!(graph.edit_distance("ACA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 3] });
        assert_eq!(graph.edit_distance("AA".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 2, 3] });

        // now check our lookup tables
        assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]);
        assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![(0, 1)]);
        assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![(0, 0)]);
        assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![]);
    }

    #[test]
    fn test_multiple_variants() {
        // two A>C SNVs with reference in between
        let reference = "AAAAA".as_bytes();
        let variants = vec![
            // vcf_index, position, allele0, allele1, index_allele0, index_allele1
            Variant::new_snv(0, 1, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1),
            Variant::new_snv(0, 3, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1)
        ];

        let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) =
            WFAGraph::from_reference_variants(&reference, &variants, 0, reference.len()).unwrap();

        // check the alignments first
        assert_eq!(graph.get_num_nodes(), 7);
        // remember ALT alleles get added before reference alleles
        /*
        REF: 0 -> 2 -> 3 -> 5 -> 6
        ALT:   -> 1 -^   -> 4 -^
        */
        assert_eq!(graph.edit_distance(&reference).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3, 5, 6] });
        assert_eq!(graph.edit_distance("ACAAA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 3, 5, 6] });
        assert_eq!(graph.edit_distance("AAACA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3, 4, 6] });
        assert_eq!(graph.edit_distance("ACACA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 3, 4, 6] });
        assert_eq!(graph.edit_distance("AAA".as_bytes()).unwrap(), WFAResult { score: 2, traversed_nodes: vec![0, 1, 2, 3, 4, 5, 6] });
        assert_eq!(graph.edit_distance("AGAGA".as_bytes()).unwrap(), WFAResult { score: 2, traversed_nodes: vec![0, 1, 2, 3, 4, 5, 6] });

        // check some mismatches on reference real quick also
        assert_eq!(graph.edit_distance("GAAAA".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 2, 3, 5, 6] });
        assert_eq!(graph.edit_distance("ACAGAA".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 3, 5, 6] });

        // now check our lookup tables
        assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]);
        assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![(0, 1)]);
        assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![(0, 0)]);
        assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![]);
        assert_eq!(*node_to_alleles.get(&4).unwrap_or(&vec![]), vec![(1, 1)]);
        assert_eq!(*node_to_alleles.get(&5).unwrap_or(&vec![]), vec![(1, 0)]);
        assert_eq!(*node_to_alleles.get(&6).unwrap_or(&vec![]), vec![]);
    }

    #[test]
    fn test_overlapping_variants() {
        let reference = "ACGTA".as_bytes();
        let variants = vec![
            // vcf_index, position, ref_len, allele0, allele1, index_allele0, index_allele1
            Variant::new_deletion(0, 1, 2, "CG".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1),
            Variant::new_deletion(0, 2, 2, "GT".as_bytes().to_vec(), "G".as_bytes().to_vec(), 0, 1)
        ];

        let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) =
            WFAGraph::from_reference_variants(&reference, &variants, 0, reference.len()).unwrap();

        // check the alignments first
        assert_eq!(graph.get_num_nodes(), 7);
        // remember ALT alleles get added before reference alleles
        /*
        REF: 0 -> 2 -> 4 -> 5 -> 6
        ALT:   -> 1 -^
                  -> 3 -^
        */
        assert_eq!(graph.edit_distance(&reference).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 4, 5, 6] });
        assert_eq!(graph.edit_distance("ACTA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 5, 6] });
        assert_eq!(graph.edit_distance("ACGA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3, 6] });
        // ACGTA -> AGTA, 1 ed; ACTA -> AGTA, 1 ed; so first allele is ambiguous, other should be reference
        assert_eq!(graph.edit_distance("AGTA".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 2, 4, 5, 6] });
        // AA -> AGTA, 2 ed; AA -> ACTA, 2 ed; AA -> ACGTA, 3 ed; but this should still lead to fully ambiguous solution without node 4 (it's skipped in both 2-ed paths)
        assert_eq!(graph.edit_distance("AA".as_bytes()).unwrap(), WFAResult { score: 2, traversed_nodes: vec![0, 1, 2, 3, 5, 6] });

        // now check our lookup tables
        assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]);
        assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![(0, 1)]);
        assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![(0, 0)]);
        assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![(1, 1)]);
        assert_eq!(*node_to_alleles.get(&4).unwrap_or(&vec![]), vec![(1, 0)]);
        assert_eq!(*node_to_alleles.get(&5).unwrap_or(&vec![]), vec![]);
        assert_eq!(*node_to_alleles.get(&6).unwrap_or(&vec![]), vec![]);
    }

    #[test]
    fn test_identical_insertions() {
        // this can happen if we pair something like DV with pbmm2 and there is a "large" insertion
        let reference = "ACGTA".as_bytes();
        let variants = vec![
            // vcf_index, position, allele0, allele1, index_allele0, index_allele1
            Variant::new_insertion(0, 2, "G".as_bytes().to_vec(), "GT".as_bytes().to_vec(), 0, 1),
            Variant::new_insertion(1, 2, "G".as_bytes().to_vec(), "GT".as_bytes().to_vec(), 0, 1)
        ];

        let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) =
            WFAGraph::from_reference_variants(&reference, &variants, 0, reference.len()).unwrap();

        // check the alignments first
        assert_eq!(graph.get_num_nodes(), 5);
        /*
        REF: 0 -> 3 -> 4
        ALT:   -> 1 -^
               -> 2 -^
        */
        assert_eq!(graph.edit_distance(&reference).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 3, 4] });
        assert_eq!(graph.edit_distance("ACGTTA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 2, 4] });
        // tests an insertion of the wrong character, this should be fully ambiguous
        assert_eq!(graph.edit_distance("ACGATA".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 2, 3, 4] });

        // now check our lookup tables
        assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]);
        assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![(0, 1)]);
        assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![(1, 1)]);
        assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![(0, 0), (1, 0)]); // this one has BOTH reference alleles
        assert_eq!(*node_to_alleles.get(&4).unwrap_or(&vec![]), vec![]);
    }

    #[test]
    fn test_multiallelic_indel() {
        // tests GT -> G/GTT
        let reference = "ACGTA".as_bytes();
        let variants = vec![
            // vcf_index, position, ref_len, allele0, allele1, index_allele0, index_allele1
            Variant::new_indel(0, 2, 2, "G".as_bytes().to_vec(), "GTT".as_bytes().to_vec(), 1, 2)
        ];

        let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) =
            WFAGraph::from_reference_variants(&reference, &variants, 0, reference.len()).unwrap();

        // check the alignments first
        assert_eq!(graph.get_num_nodes(), 5);
        /*
        REF: 0 -> 3 -> 4
        ALT:   -> 1 -^
               -> 2 -^
        */
        assert_eq!(graph.edit_distance(&reference).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 3, 4] });
        assert_eq!(graph.edit_distance("ACGA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 4] });
        assert_eq!(graph.edit_distance("ACGTTA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 4] });
        // we can't really get an ambiguous allele assignment here, but we can do ambiguous branches
        assert_eq!(graph.edit_distance("ACGGA".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 3, 4] });
        assert_eq!(graph.edit_distance("ACGGTA".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 2, 3, 4] });

        // now check our lookup tables
        assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]);
        assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![(0, 0)]); // ALT0
        assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![(0, 1)]); // ALT1
        assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![]); // we still need to inject a reference allele, but it shouldn't have an assoc. with the variant
        assert_eq!(*node_to_alleles.get(&4).unwrap_or(&vec![]), vec![]);
    }

    #[test]
    fn test_partial_reference() {
        // we prepended and appended "AA" to the basic SNV test
        let reference = "AAAAAAA".as_bytes();
        // variant coordinate shifted +2
        let variants = vec![Variant::new_snv(0, 3, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1)];

        let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) =
            WFAGraph::from_reference_variants(&reference, &variants, 2, reference.len()-2).unwrap();

        // check the alignments first
        assert_eq!(graph.get_num_nodes(), 4);
        // remember ALT alleles get added before reference alleles
        assert_eq!(graph.edit_distance(&reference[2..5]).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3] });
        assert_eq!(graph.edit_distance("ACA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 3] });
        assert_eq!(graph.edit_distance("AA".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 2, 3] });

        // now check our lookup tables
        assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]);
        assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![(0, 1)]);
        assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![(0, 0)]);
        assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![]);
    }

    #[test]
    fn test_complex_problem() {
        let reference = "AACGTTGACGTCC".as_bytes(); // we skip 2 on the front, 1 on the back
        let variants = vec![
            // vcf_index, position, ref_len, allele0, allele1, index_allele0, index_allele1
            // 3: GTTG>G
            Variant::new_deletion(0, 3, 4, "GTTG".as_bytes().to_vec(), "G".as_bytes().to_vec(), 0, 1),
            // 4: TT>T
            Variant::new_deletion(0, 4, 2, "TT".as_bytes().to_vec(), "T".as_bytes().to_vec(), 0, 1),
            // vcf_index, position, allele0, allele1, index_allele0, index_allele1
            // 6: G>[A,C]
            Variant::new_snv(0, 6, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 1, 2)
        ];

        let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) =
            WFAGraph::from_reference_variants(&reference, &variants, 2, reference.len()-1).unwrap();

        // check the alignments first
        assert_eq!(graph.get_num_nodes(), 9);
        /*
        // this tests the scenario where one variant rejoins at the start of another
        // here, the TT>T joins right before the G>[A,C] so node 3 and 4 are *both* parents of 5, 6, and 7
        REF: 0 -> 2 -> 4 -> 7 -> 8
        ALT:   -> 1 -^
                  -> 3 -|
                       -> 5 -^
                       -> 6 -^
        */
        // remember ALT alleles get added before reference alleles
        assert_eq!(graph.edit_distance("CGTTGACGTC".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 4, 7, 8] }); //reference
        assert_eq!(graph.edit_distance("CGACGTC".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 8] }); //first variant only
        assert_eq!(graph.edit_distance("CGTGACGTC".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3, 7, 8] }); //second variant only
        assert_eq!(graph.edit_distance("CGTTAACGTC".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 4, 5, 8] }); //third-0 variant only
        assert_eq!(graph.edit_distance("CGTTCACGTC".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 4, 6, 8] }); //third-1 variant only
        assert_eq!(graph.edit_distance("CGTAACGTC".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3, 5, 8] }); //second and third-0 variants
        assert_eq!(graph.edit_distance("CGTCACGTC".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3, 6, 8] }); //second and third-1 variants

        // now try some inexacts
        assert_eq!(graph.edit_distance("CGGACGTC".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 2, 3, 7, 8] }); //delete both Ts, ambiguous deletion call
        assert_eq!(graph.edit_distance("CGTACGTC".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 2, 3, 5, 6, 7, 8] }); //delete TG, ambiguous in a lot of ways

        // now check our lookup tables
        assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]);
        assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![(0, 1)]);
        assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![(0, 0)]);
        assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![(1, 1)]);
        assert_eq!(*node_to_alleles.get(&4).unwrap_or(&vec![]), vec![(1, 0)]);
        assert_eq!(*node_to_alleles.get(&5).unwrap_or(&vec![]), vec![(2, 0)]);
        assert_eq!(*node_to_alleles.get(&6).unwrap_or(&vec![]), vec![(2, 1)]);
        assert_eq!(*node_to_alleles.get(&7).unwrap_or(&vec![]), vec![]);
        assert_eq!(*node_to_alleles.get(&8).unwrap_or(&vec![]), vec![]);
    }

    ////////////////////////////////////////////////////////////////////////////////
    // After here are mostly edge case bug tests
    ////////////////////////////////////////////////////////////////////////////////


    // tests when a variant goes past the provided reference, we should ignore it basically
#[test] + fn test_span_ref_end() { + // tests GT -> G/GTT + let reference = "ACGTA".as_bytes(); + let variants = vec![ + // vcf_index, position, ref_len, allele0, allele1, index_allele0, index_allele1 + Variant::new_deletion(0, 3, 3, "TAG".as_bytes().to_vec(), "T".as_bytes().to_vec(), 0, 1) + ]; + + let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = + WFAGraph::from_reference_variants(&reference, &variants, 0, reference.len()).unwrap(); + + // check the alignments first + assert_eq!(graph.get_num_nodes(), 1); + + // now check our lookup tables + assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]); + } + + #[test] + fn test_hom_variants() { + // add a hom variant at base 1; these can be traversed, but provide no index lookup because they are not a "variant" + let reference = "AAAAA".as_bytes(); + let variants = vec![Variant::new_snv(0, 3, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1)]; + let hom_variants = vec![Variant::new_snv(0, 1, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1)]; + + let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = + WFAGraph::from_reference_variants_with_hom(&reference, &variants, &hom_variants, 0, reference.len()).unwrap(); + + // check the alignments first + assert_eq!(graph.get_num_nodes(), 7); + // remember ALT alleles get added before reference alleles + assert_eq!(graph.edit_distance("AAAAA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3, 5, 6] }); + assert_eq!(graph.edit_distance("ACAAA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 3, 5, 6] }); + assert_eq!(graph.edit_distance("ACACA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 3, 4, 6] }); + assert_eq!(graph.edit_distance("ACAA".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 3, 4, 5, 6] }); + + // now check our lookup tables + assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]); + 
assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![]); + assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![]); + assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![]); + assert_eq!(*node_to_alleles.get(&4).unwrap_or(&vec![]), vec![(0, 1)]); + assert_eq!(*node_to_alleles.get(&5).unwrap_or(&vec![]), vec![(0, 0)]); + assert_eq!(*node_to_alleles.get(&6).unwrap_or(&vec![]), vec![]); + } + + #[test] + fn test_variant_at_start() { + let reference = "AAA".as_bytes(); + let variants = vec![Variant::new_snv(0, 0, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1)]; + + let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = + WFAGraph::from_reference_variants(&reference, &variants, 0, reference.len()).unwrap(); + + // check the alignments first + assert_eq!(graph.get_num_nodes(), 4); + // remember ALT alleles get added before reference alleles + assert_eq!(graph.edit_distance(&reference).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3] }); + assert_eq!(graph.edit_distance("CAA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 3] }); + assert_eq!(graph.edit_distance("AA".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 2, 3] }); + + // now check our lookup tables + assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]); + assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![(0, 1)]); + assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![(0, 0)]); + assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![]); + } + + #[test] + fn test_variant_at_end() { + let reference = "AAA".as_bytes(); + let variants = vec![Variant::new_snv(0, 2, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1)]; + + let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = + WFAGraph::from_reference_variants(&reference, &variants, 0, reference.len()).unwrap(); + + // check the alignments first + assert_eq!(graph.get_num_nodes(), 4); + // remember ALT alleles get added 
before reference alleles + assert_eq!(graph.edit_distance(&reference).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3] }); + assert_eq!(graph.edit_distance("AAC".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 3] }); + assert_eq!(graph.edit_distance("AA".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 2, 3] }); + + // now check our lookup tables + assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]); + assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![(0, 1)]); + assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![(0, 0)]); + assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![]); + } +} \ No newline at end of file diff --git a/src/writers/block_stats.rs b/src/writers/block_stats.rs new file mode 100644 index 0000000..5293076 --- /dev/null +++ b/src/writers/block_stats.rs @@ -0,0 +1,370 @@ + +use log::debug; +use rustc_hash::FxHashMap as HashMap; +use serde::Serialize; +use std::fs::File; +use std::path::Path; + +use crate::block_gen::PhaseBlock; +use crate::data_types::reference_genome::ReferenceGenome; +use crate::data_types::variants::{VariantType, Zygosity}; +use crate::phaser::PhaseResult; + +/// This is a wrapper for writing out any stats to a file +#[derive(Default)] +pub struct BlockStatsCollector { + /// Blocks that will be written out eventually + blocks: Vec, + /// Tracks the number of phased SNVs, key is (sample_name, chromosome) + phased_snvs: HashMap<(String, String), usize> +} + +/// Contains all the data written to each row of our stats file +#[derive(Serialize)] +struct BlockRow { + /// The index of the block + source_block_index: usize, + /// the sample name tied to the block + sample_name: String, + /// Phase set ID - usually the position first variant in the block + phase_block_id: u64, + /// the chromosome of the block + chrom: String, + /// the position of the first allele + start: u64, + /// the position of the last allele + end: u64, + /// 
the number of variants in the block + num_variants: usize +} + +#[derive(Serialize)] +struct SummaryRow { + /// the sample name + sample_name: String, + /// The chromosome or "all" + chromosome: String, + /// The total number of variants + num_variants: usize, + /// The total number of heterozygous variants + num_heterozygous: usize, + /// The number of phased heterozygous variants + num_phased: usize, + /// The number of unphased heterozygous variants + num_unphased: usize, + /// The number of heterozygous SNVs + num_het_snv: usize, + /// The number of phased, heterozygous SNVs + num_phased_snv: usize, + /// The total number of blocks + num_blocks: usize, + /// The number of blocks with only 1 phased variant + num_singletons: usize, + /// variants per block stats + variants_per_block_median: usize, + variants_per_block_mean: usize, + variants_per_block_min: usize, + variants_per_block_max: usize, + variants_per_block_sum: usize, + /// basepairs per block stats + basepairs_per_block_median: u64, + basepairs_per_block_mean: u64, + basepairs_per_block_min: u64, + basepairs_per_block_max: u64, + basepairs_per_block_sum: u64, + /// block NG50 + block_ng50: Option +} + +impl BlockStatsCollector { + /// Creates a new writer for a given filename + /// # Arguments + /// * `csv_filename` - the path to write all stats to + pub fn new() -> BlockStatsCollector { + Self::default() + } + + /// Adds a block to our collection + /// # Arguments + /// * `block` - the block to add, no checks are performed on the input + pub fn add_block(&mut self, block: PhaseBlock) { + self.blocks.push(block); + } + + /// Adds a phase result to our statistics + /// # Arguments + /// * `chrom` - the chromosome for the result + /// * `result` + pub fn add_result(&mut self, result: &PhaseResult) { + if let Some(stats) = result.statistics.as_ref() { + if let Some(count) = stats.phased_snvs() { + let sample_name: String = result.phase_block.sample_name().to_string(); + let chrom: String = 
result.phase_block.get_chrom().to_string(); + *self.phased_snvs.entry((sample_name, chrom)).or_insert(0) += count as usize; + } + } + } + + /// Will write all blocks to a CSV filename in order + /// # Arguments + /// * `filename` - the filename for the output (tsv/csv) + pub fn write_blocks(&mut self, filename: &Path) -> csv::Result<()> { + // modify the delimiter to "," if it ends with .csv + let is_csv: bool = filename.extension().unwrap_or_default() == "csv"; + let delimiter: u8 = if is_csv { b',' } else { b'\t' }; + let mut csv_writer: csv::Writer = csv::WriterBuilder::new() + .delimiter(delimiter) + .from_path(filename)?; + + // make sure we go through the blocks in order + self.blocks.sort(); + for block in self.blocks.iter() { + let block_row = BlockRow { + source_block_index: block.get_block_index(), + sample_name: block.sample_name().to_string(), + phase_block_id: block.get_start()+1, + chrom: block.get_chrom().to_string(), + start: block.get_start()+1, + end: block.get_end()+1, + num_variants: block.get_num_variants() + }; + csv_writer.serialize(&block_row)?; + } + csv_writer.flush()?; + Ok(()) + } + + /// Will write out a file containing chromosome level block statistics and overall statistics. 
+ /// # Arguments + /// * `filename` - the filename for the output (tsv/csv) + /// * `reference_genome` - the reference genome, without this we can't determine lengths + /// * `variant_counts` - the variant count data from parsing; key is (sample, chromosome, variant type, zygosity), value is a count + pub fn write_block_stats( + &self, + sample_order: &[String], + filename: &Path, + reference_genome: &ReferenceGenome, + variant_counts: HashMap<(String, String, VariantType, Zygosity), usize> + ) -> csv::Result<()> { + // modify the delimiter to "," if it ends with .csv + let is_csv: bool = filename.extension().unwrap_or_default() == "csv"; + let delimiter: u8 = if is_csv { b',' } else { b'\t' }; + let mut csv_writer: csv::Writer = csv::WriterBuilder::new() + .delimiter(delimiter) + .from_path(filename)?; + + // calculate the total reference length assuming we have a reference + let total_contig_length: u64 = { + let mut contig_sum: u64 = 0; + for contig_name in reference_genome.contig_keys().iter() { + let contig_length = reference_genome.get_full_chromosome(contig_name).len(); + contig_sum += contig_length as u64; + } + contig_sum + }; + + for sample_name in sample_order.iter() { + // go through all the blocks and generate chromosome level and overall stats + let mut blocks_by_chrom: HashMap> = Default::default(); + let mut all_sample_blocks: Vec = Default::default(); + + for block in self.blocks.iter() { + if block.sample_name() == sample_name { + let chrom: String = block.get_chrom().to_string(); + blocks_by_chrom.entry(chrom).or_insert(vec![]).push(block.clone()); + all_sample_blocks.push(block.clone()); + } + } + + // these are from parsing the VCFs + let mut num_variants: HashMap = Default::default(); + let mut num_heterozygous: HashMap = Default::default(); + let mut num_het_snv: HashMap = Default::default(); + let mut ordered_iteration: Vec<_> = variant_counts.iter().collect(); + ordered_iteration.sort(); + for ((sample, chrom, variant_type, zygosity), 
&count) in ordered_iteration.iter() { + debug!("{} {} {:?} {:?} => {}", sample, chrom, variant_type, zygosity, count); + if sample == sample_name && *variant_type != VariantType::Unknown && *zygosity != Zygosity::HomozygousReference && *zygosity != Zygosity::Unknown { + // okay, we're working with something we can count now + *num_variants.entry(chrom.clone()).or_insert(0) += count; + + if *zygosity == Zygosity::Heterozygous { + *num_heterozygous.entry(chrom.clone()).or_insert(0) += count; + if *variant_type == VariantType::Snv { + *num_het_snv.entry(chrom.clone()).or_insert(0) += count; + } + } + } + } + + // best place to pull contig order from is our reference genome + for contig in reference_genome.contig_keys().iter() { + let contig_length: u64 = reference_genome.get_full_chromosome(contig).len() as u64; + let chrow_stats_row: SummaryRow = self.generate_summary_row( + sample_name, + contig, + blocks_by_chrom.get(contig).unwrap_or(&vec![]), + *num_variants.get(contig).unwrap_or(&0), + *num_heterozygous.get(contig).unwrap_or(&0), + *num_het_snv.get(contig).unwrap_or(&0), + *self.phased_snvs.get(&(sample_name.clone(), contig.clone())).unwrap_or(&0), + contig_length + ); + csv_writer.serialize(&chrow_stats_row)?; + } + + let all_stats_row: SummaryRow = self.generate_summary_row( + sample_name, + "all", + &all_sample_blocks, + num_variants.values().sum(), + num_heterozygous.values().sum(), + num_het_snv.values().sum(), + // phased_snvs is keyed by (sample_name, chromosome); restrict the "all" total to the current sample, + // otherwise multi-sample runs would report every sample's phased SNVs on each sample's summary row + self.phased_snvs.iter().filter(|((sample, _chrom), _count)| sample == sample_name).map(|(_key, &count)| count).sum(), + total_contig_length + ); + csv_writer.serialize(&all_stats_row)?; + } + + csv_writer.flush()?; + Ok(()) + } + + /// Utility function for building a summary row from given data. 
+ /// # Arguments + /// * `sample_name` - pass-through sample ID + /// * `chrom` - pass-through chromosome name + /// * `blocks` - the phase blocks that get parsed into a summary row for the sample/chromosome + /// * `num_variants` - total number of variants + /// * `num_heterozygous` - total number of heterozygous variants + /// * `num_het_snv` - total number of heterozygous SNVs + /// * `num_phased_snv` - total number of output phased SNVs + /// * `contig_length` - length of the chromosome + #[allow(clippy::too_many_arguments)] + fn generate_summary_row( + &self, sample_name: &str, chrom: &str, blocks: &[PhaseBlock], + num_variants: usize, num_heterozygous: usize, num_het_snv: usize, num_phased_snv: usize, + contig_length: u64 + ) -> SummaryRow { + // make sure every block has the correct sample name + assert!(blocks.iter().all(|b| b.sample_name() == sample_name)); + + // these are derivable from our results + let num_blocks = blocks.len(); + let num_singletons = blocks.iter() + .filter(|block| block.get_num_variants() == 1) + .count(); + + // collect the block variant stats, this is only counting heterozygous variants + let mut block_variants: Vec = blocks.iter() + .map(|b| b.get_num_variants()) + .collect(); + + // collect the block length stats + let mut block_lengths: Vec = blocks.iter() + .map(|b| b.bp_len()) + .collect(); + + // let total_heterozygous: usize = num_heterozygous.iter().map(|(_chrom, &count)| count).sum(); + let num_phased: usize = block_variants.iter().sum(); + let num_unphased = num_heterozygous - num_phased; + + block_variants.sort(); + let variants_per_block_median: usize = if block_variants.is_empty() { 0 } else { block_variants[block_variants.len() / 2] }; + let variants_per_block_mean: usize = if block_variants.is_empty() { 0 } else { block_variants.iter().sum::() / block_variants.len() }; + let variants_per_block_min: usize = *block_variants.iter().min().unwrap_or(&0); + let variants_per_block_max: usize = 
*block_variants.iter().max().unwrap_or(&0); + let variants_per_block_sum: usize = block_variants.iter().sum(); + + block_lengths.sort(); + let basepairs_per_block_median: u64 = if block_lengths.is_empty() { 0 } else { block_lengths[block_lengths.len() / 2] }; + let basepairs_per_block_mean: u64 = if block_lengths.is_empty() { 0 } else { block_lengths.iter().sum::() / block_lengths.len() as u64 }; + let basepairs_per_block_min: u64 = *block_lengths.iter().min().unwrap_or(&0); + let basepairs_per_block_max: u64 = *block_lengths.iter().max().unwrap_or(&0); + let basepairs_per_block_sum: u64 = block_lengths.iter().sum(); + + let block_ng50: Option = if contig_length != 0 { + Some(calculate_block_ng50(&block_lengths, contig_length)) + } else { + None + }; + + SummaryRow { + sample_name: sample_name.to_string(), + chromosome: chrom.to_string(), + num_variants, + num_heterozygous, + num_phased, + num_unphased, + num_het_snv, + num_phased_snv, + num_blocks, + num_singletons, + variants_per_block_median, + variants_per_block_mean, + variants_per_block_min, + variants_per_block_max, + variants_per_block_sum, + basepairs_per_block_median, + basepairs_per_block_mean, + basepairs_per_block_min, + basepairs_per_block_max, + basepairs_per_block_sum, + block_ng50 + } + } +} + +/// Helper subroutine for calculating block NG50 from a list of sorted blocks. +/// # Arguments +/// * `sorted_blocks` - block sizes sorted in ascending order +/// * `contig_length` - the total contig length, half is needed to reach NG50 +/// # Panics +/// * if while iterating it detects unsorted blocks +fn calculate_block_ng50(sorted_blocks: &[u64], contig_length: u64) -> u64 { + let mut last_block_size: u64 = u64::MAX; + let mut length_sum: u64 = 0; + + // add one to handle odd values (e.g. 
rounding up) + let target_length: u64 = (contig_length + 1) / 2; + + for &block_size in sorted_blocks.iter().rev() { + // we're going in reverse, so block sizes *should* be monotonically decreasing + assert!(block_size <= last_block_size); + last_block_size = block_size; + + // add in the block and check again half the total length + length_sum += block_size; + if length_sum >= target_length { + // we made it, return the size of this block + return block_size; + } + } + + // we didn't make it + 0 +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_calculate_block_ng50() { + // odd block length + let contig_length: u64 = 21; + let blocks: Vec = vec![1, 2, 3, 4, 10]; + let bad_blocks: Vec = vec![2]; + let good_blocks: Vec = vec![9, 10]; + + assert_eq!(calculate_block_ng50(&blocks, contig_length), 4); + assert_eq!(calculate_block_ng50(&bad_blocks, contig_length), 0); + assert_eq!(calculate_block_ng50(&good_blocks, contig_length), 9); + + // same but now even block length + let contig_length: u64 = 20; + assert_eq!(calculate_block_ng50(&blocks, contig_length), 10); + assert_eq!(calculate_block_ng50(&bad_blocks, contig_length), 0); + assert_eq!(calculate_block_ng50(&good_blocks, contig_length), 10); + } +} \ No newline at end of file diff --git a/src/writers/haplotag_writer.rs b/src/writers/haplotag_writer.rs new file mode 100644 index 0000000..877dc5b --- /dev/null +++ b/src/writers/haplotag_writer.rs @@ -0,0 +1,72 @@ + +use serde::Serialize; +use std::fs::File; +use std::path::Path; + +use crate::phaser::HaplotagResult; + +/// This is a wrapper for writing out any stats to a file +pub struct HaplotagWriter { + /// Handle for the CSV writer + csv_writer: csv::Writer +} + +/// Contains all the data written to each row of our stats file +#[derive(Serialize)] +struct HaplotagRow { + /// The index of the block + source_block_index: usize, + /// the sample name tied to the block + sample_name: String, + /// the chromosome of the block + chrom: String, + 
/// Phase set ID - usually the position first variant in the block + phase_block_id: u64, + /// the read name that is assigned + read_name: String, + /// the haplotype the read is assigned to + haplotag: u8 +} + +impl HaplotagWriter { + /// Creates a new writer for a given filename + /// # Arguments + /// * `filename` - the path to write all stats to + pub fn new(filename: &Path) -> csv::Result { + // modify the delimiter to "," if it ends with .csv + let is_csv: bool = filename.extension().unwrap_or_default() == "csv"; + let delimiter: u8 = if is_csv { b',' } else { b'\t' }; + let csv_writer: csv::Writer = csv::WriterBuilder::new() + .delimiter(delimiter) + .from_path(filename)?; + Ok(HaplotagWriter { + csv_writer + }) + } + + /// Writes the haplotag results from a given block. + /// # Arguments + /// * `haplotag_result` - the HaplotagResult from phasing, in-order is not required + /// # Errors + /// * if the csv_writer has any errors + pub fn write_block(&mut self, haplotag_result: &HaplotagResult) -> Result<(), Box> { + let source_block_index: usize = haplotag_result.phase_block.get_block_index(); + let sample_name: String = haplotag_result.phase_block.sample_name().to_string(); + let chrom: String = haplotag_result.phase_block.get_chrom().to_string(); + for (read_name, &(phase_block_id_0, haplotag_0)) in haplotag_result.reads.iter() { + // stored values are 0-based; the output convention is 1-based (haplotags 1/2) + let phase_block_id: u64 = (phase_block_id_0 + 1).try_into()?; + let haplotag: u8 = (haplotag_0 + 1).try_into()?; + let row: HaplotagRow = HaplotagRow { + source_block_index, + sample_name: sample_name.clone(), + chrom: chrom.clone(), + phase_block_id, + read_name: read_name.clone(), + haplotag + }; + self.csv_writer.serialize(&row)?; + } + // flush once per block instead of once per row; per-row flushing defeats the writer's buffering + self.csv_writer.flush()?; + Ok(()) + } +} \ No newline at end of file diff --git a/src/writers/mod.rs b/src/writers/mod.rs new file mode 100644 index 0000000..9b179a0 --- /dev/null +++ b/src/writers/mod.rs @@ -0,0 +1,13 @@ + +/// Contains writer for phase block stats, both the phase blocks 
themselves and the summary +pub mod block_stats; +/// Contains the writer for haplotag results +pub mod haplotag_writer; +/// Contains writer for BAM files +pub mod ordered_bam_writer; +/// Contains writer for VCF files +pub mod ordered_vcf_writer; +/// Contains writer for phasing statistics for underlying algorithms +pub mod phase_stats; +/// Contains additional VCF utilities +pub mod vcf_util; \ No newline at end of file diff --git a/src/writers/ordered_bam_writer.rs b/src/writers/ordered_bam_writer.rs new file mode 100644 index 0000000..14c3832 --- /dev/null +++ b/src/writers/ordered_bam_writer.rs @@ -0,0 +1,355 @@ + +use crate::phaser::HaplotagResult; + +use log::{debug, trace, warn}; +use rust_htslib::bam; +use rust_htslib::bam::Read; +use rustc_hash::{FxHashMap as HashMap, FxHashSet as HashSet}; +use simple_error::bail; +use std::cell::RefCell; +use std::path::{PathBuf, Path}; + +/// Structure that maintains order of phase problems while writing solutions. +pub struct OrderedBamWriter { + /// each bam writer is responsible for a specific sample + sample_name: String, + /// the template BAMs we are reading from and copying/modifying + ref_bam_readers: Vec>, + /// the BAMs we are writing to + ref_bam_writers: Vec>, + /// the data that may be cached because we are waiting on earlier results + map_store: HashMap, + /// set of blocks that are indicated to skip + skip_set: HashSet, + /// the index of data we are waiting for + current_index: usize, + /// the most recently written chromosome + current_chrom: String, + /// the most recently written POS field + current_pos: u64, + /// tracks chromosomes that have been finalized + finished_chroms: HashSet +} + +impl OrderedBamWriter { + /// Creates a new `OrderedBamWriter` using template BAMs. 
+ /// # Arguments + /// * `sample_name` - the sample this BAM writer corresponds to + /// * `reference_filename` - the path to the reference genome + /// * `input_bams` - the template VCF file containing unphased variants + /// * `output_bams` - the VCF file that will get created containing our phase solutions + /// * `thread_pool` - a shared thread pool for BAM I/O + pub fn new( + sample_name: String, reference_filename: &Path, + input_bams: &[PathBuf], output_bams: &[PathBuf], + thread_pool: &rust_htslib::tpool::ThreadPool + ) -> Result { + // log this + debug!("Creating BAM writer for {}:", sample_name); + debug!("\tInputs: {:?}", input_bams); + debug!("\tOutputs: {:?}", output_bams); + + // get all the stuff we need for reading the VCF setup + let mut ref_bam_readers: Vec> = vec![]; + let mut ref_bam_writers: Vec> = vec![]; + for (i, path) in input_bams.iter().enumerate() { + let mut bam_reader: bam::IndexedReader = bam::IndexedReader::from_path(path)?; + bam_reader.set_reference(reference_filename)?; + bam_reader.set_thread_pool(thread_pool)?; + let bam_header: bam::HeaderView = bam_reader.header().clone(); + let ref_bam_reader: RefCell = RefCell::new(bam_reader); + + //now setup the outputs, we want to do the header stuff here also + let mut output_header: bam::header::Header = bam::header::Header::from_template(&bam_header); + let cli_string: String = std::env::args().collect::>().join(" "); + let cli_version: &str = &crate::cli::FULL_VERSION; + + let mut cli_record = bam::header::HeaderRecord::new("PG".as_bytes()); + cli_record.push_tag("PN".as_bytes(), &"hiphase"); + cli_record.push_tag("ID".as_bytes(), &format!("hiphase-v{cli_version}")); + cli_record.push_tag("VN".as_bytes(), &cli_version); + cli_record.push_tag("CL".as_bytes(), &cli_string); + output_header.push_record(&cli_record); + + // TODO: do we need to add something to the BAM headers? 
I'm not sure yet, we'll come back to this + // output_header.push_record(r#"##FORMAT="#.as_bytes()); + let bam_format: bam::Format = if output_bams[i].extension().unwrap_or_default() == "cram" { + bam::Format::Cram + } else { + bam::Format::Bam + }; + let mut bam_writer: bam::Writer = bam::Writer::from_path( + &output_bams[i], + &output_header, + bam_format + )?; + bam_writer.set_reference(reference_filename)?; + bam_writer.set_thread_pool(thread_pool)?; + let ref_bam_writer: RefCell = RefCell::new(bam_writer); + + // push everything into the vecs for storage + ref_bam_readers.push(ref_bam_reader); + ref_bam_writers.push(ref_bam_writer); + } + + Ok(OrderedBamWriter { + sample_name, + ref_bam_readers, + ref_bam_writers, + map_store: Default::default(), + skip_set: Default::default(), + current_index: 0, + current_chrom: "".to_string(), + current_pos: 0, + finished_chroms: Default::default() + }) + } + + /// Returns the phase block result that the writer is currently waiting to receive. + pub fn get_wait_block(&self) -> usize { + self.current_index + } + + /// Adds a phase block result to our queue for writing. + /// # Arguments + /// * `phase_result` - a phasing result that will be written in the correct order with other blocks + pub fn write_phase_block(&mut self, haplotag_result: HaplotagResult) -> Result<(), Box> { + let block_index: usize = haplotag_result.phase_block.get_block_index(); + if block_index < self.current_index { + bail!("Block index is smaller than next expected index"); + } + if haplotag_result.phase_block.sample_name() != self.sample_name { + bail!("Received haplotag result for sample other than the one specified"); + } + match self.map_store.insert(block_index, haplotag_result) { + None => {}, + Some(_) => { + bail!("Block index was already present in the map_store"); + } + }; + self.drain_map_store()?; + Ok(()) + } + + /// Adds a dummy block result to our queue for writing. 
+ /// This is only necessary because we have block indices that are shared across samples, but this only cares about one sample. + /// # Arguments + /// * `phase_result` - a phasing result that will be written in the correct order with other blocks + pub fn write_dummy_block(&mut self, block_index: usize) -> Result<(), Box> { + if block_index < self.current_index { + bail!("Block index is smaller than next expected index"); + } + + // add to the skip list and drain away + self.skip_set.insert(block_index); + self.drain_map_store()?; + Ok(()) + } + + /// This will drain phase block solutions in the correct order if they have been received. + /// It drains as far as it can given the current results and then stops to wait for more data. + fn drain_map_store(&mut self) -> Result<(), Box> { + while !self.map_store.is_empty() { + match self.map_store.remove(&self.current_index) { + Some(haplotag_result) => { + trace!("Draining {}", self.current_index); + + let chrom_result: &str = haplotag_result.phase_block.get_chrom(); + if chrom_result != self.current_chrom { + if self.current_chrom.is_empty() { + // this is the first block, so lets set chrom and move on + self.current_chrom = chrom_result.to_string(); + } else { + // the next block is on a different chromosome, so we need to finalize this chromosome + self.finalize_chromosome()?; + + // now setup for the next chromosome + self.current_chrom = chrom_result.to_string(); + self.current_pos = 0; + } + } else { + // the chromosome matches previous, I don't think we actually have to do anything + // as long as our assumptions are correct + } + + /* + General plan: loop over the writers and write each BAM separately + - get all of the index-specific readers/writers ready + - adjust haplotype_index to only correspond to the current vcf_index + - we may need to adjust our errors to allow for empty blocks; we can make a dummy VCF to test with + - ^ we should make a dummy VCF anyways just to verify that nothing in our output 
changes by adding an empty VCF + */ + let start_pos = self.current_pos; + let end_pos = haplotag_result.phase_block.get_end(); + + for (bam_index, ref_bam_writer) in self.ref_bam_writers.iter().enumerate() { + // prep the writer + let mut bam_writer = ref_bam_writer.borrow_mut(); + + // we _should_ just have to iterate from the last written through the end of the block + let mut bam_reader = self.ref_bam_readers[bam_index].borrow_mut(); + + // key is a read name, value is (block_id, haplotype) + let read_block_lookup = &haplotag_result.reads; + + // annoyingly, the bam reader (here) uses normal rust conventions for 0..len (end is exclusive), whereas the VCF reader is inclusive + // so to fix, we need to add the +1 as shown below + match bam_reader.fetch(bam::FetchDefinition::RegionString(chrom_result.as_bytes(), start_pos as i64, end_pos as i64 + 1)) { + Ok(()) => { + for record_result in bam_reader.records() { + // set up the record for the new BAM file + let mut record = record_result?; + let record_pos = record.pos(); + if record_pos < start_pos as i64 { + // this can happen when you have reads that overlap the location but don't start *after* the position + // we have already written though, so don't write it again + continue; + } + + // this may need to be <=, hard to tell yet + assert!(record_pos <= end_pos as i64); + + // now check if the read name has a lookup + let read_name = std::str::from_utf8(record.qname()).unwrap(); + match read_block_lookup.get(read_name) { + Some((phase_block_id, haplotag)) => { + // we have a match, modify phase info + // phase_block_id is 0-based, so add 1 to it + record.push_aux("PS".as_bytes(), bam::record::Aux::I32((phase_block_id + 1).try_into()?))?; + // haplotag is 0/1 and we want 1/2 in the BAM, so add 1 to it + record.push_aux("HP".as_bytes(), bam::record::Aux::U8((haplotag + 1).try_into()?))?; + bam_writer.write(&record)?; + }, + None => { + // no match, so just copy the read over + bam_writer.write(&record)?; + } + }; + 
} + }, + Err(e) => { + if end_pos == 0 { + warn!("Empty problem block received, no read mappings on chromosome {}", chrom_result); + } else { + warn!("Received \'{}\', while seeking to {}:{}-{} in bam #{}, likely no reads in region", e, chrom_result, start_pos, end_pos, bam_index); + //return Err(e); + } + } + }; + } + + // set up for the next block we get + self.current_pos = end_pos+1; + self.current_index += 1; + }, + None => { + // it's not in the received blocks, check if it is in the skip set + if self.skip_set.remove(&self.current_index) { + // we did find it here + self.current_index += 1; + } else { + // it's not there either, time to end the looping for now + break; + } + } + }; + } + Ok(()) + } + + /// Writes all remaining variants for a chromosome. + /// It should be called after all phase blocks for a chromosome are received. + /// # Panics + /// * if the current chromosome has been previously finalized + pub fn finalize_chromosome(&mut self) -> Result<(), rust_htslib::errors::Error> { + // make sure we haven't done this chromosome before + assert!(!self.finished_chroms.contains(&self.current_chrom)); + + // finalize the area + let start_pos = self.current_pos; + for (bam_index, ref_bam_writer) in self.ref_bam_writers.iter().enumerate() { + // prep the writer + let mut bam_writer = ref_bam_writer.borrow_mut(); + let mut bam_reader = self.ref_bam_readers[bam_index].borrow_mut(); + + //match vcf_reader.fetch(chrom_index, start_pos, None) { + match bam_reader.fetch(bam::FetchDefinition::RegionString(self.current_chrom.as_bytes(), start_pos as i64, i64::MAX)) { + Ok(()) => { + for record_result in bam_reader.records() { + // set up the record for the new BAM file + let record = record_result?; + let record_pos = record.pos(); + if record_pos < start_pos as i64 { + // this can happen when you have reads that overlap the location but don't start *after* the position + // we have already written though, so don't write it again + continue; + } + + // nothing 
left should be tagged + bam_writer.write(&record)?; + + // adding this last bit should prevent double writes by accident from a user + self.current_pos = self.current_pos.max(record_pos as u64 + 1); + } + }, + Err(e) => { + warn!("Received \'{}\', likely caused by no trailing variants detected for {}:{}-END", e, self.current_chrom, start_pos); + } + }; + } + + self.finished_chroms.insert(self.current_chrom.clone()); + Ok(()) + } + + pub fn copy_remaining_chromosomes(&mut self) -> Result<(), rust_htslib::errors::Error> { + // go through each BAM and just copy any chromosomes we didn't do + for (bam_index, ref_bam_writer) in self.ref_bam_writers.iter().enumerate() { + // prep the writer + let mut bam_writer = ref_bam_writer.borrow_mut(); + let mut bam_reader = self.ref_bam_readers[bam_index].borrow_mut(); + debug!("Finalizing bam #{}...", bam_index); + + // first, find anything that didn't copy + let mut remaining_contigs: Vec = vec![]; + let mut remaining_contigs_copy: Vec = vec![]; // we have to do this because there is no clone for FetchDefs + let target_names: Vec = bam_reader.header().target_names().iter() + .map(|s| String::from_utf8(s.to_vec()).unwrap()) + .collect(); + for tn in target_names.iter() { + if !self.finished_chroms.contains(tn) { + // this target has not been completed yet + remaining_contigs.push(bam::FetchDefinition::String(tn.as_bytes())); + remaining_contigs_copy.push(bam::FetchDefinition::String(tn.as_bytes())); + } + } + + // add unmapped at the end + remaining_contigs.push(bam::FetchDefinition::Unmapped); + remaining_contigs_copy.push(bam::FetchDefinition::Unmapped); + + debug!("Remaining contigs: {:?}", remaining_contigs); + + // now go through each, and fully copy everything + for (fetch_index, fetch_definition) in remaining_contigs.into_iter().enumerate() { + match bam_reader.fetch(fetch_definition) { + Ok(()) => { + for record_result in bam_reader.records() { + // set up the record for the new BAM file + let record = record_result?; + + 
// nothing left should be tagged + bam_writer.write(&record)?; + } + }, + Err(e) => { + warn!("Received \'{}\', likely caused by no reads detected for {:?}", e, remaining_contigs_copy[fetch_index]); + } + }; + } + } + + // if we make it here, everything should be good + Ok(()) + } +} diff --git a/src/writers/ordered_vcf_writer.rs b/src/writers/ordered_vcf_writer.rs new file mode 100644 index 0000000..976130b --- /dev/null +++ b/src/writers/ordered_vcf_writer.rs @@ -0,0 +1,423 @@ + +use crate::block_gen::is_phasable_variant; +use crate::phaser::PhaseResult; + +use log::{warn,debug,trace}; +use rust_htslib::bcf; +use rust_htslib::bcf::Read; +use rust_htslib::bcf::record::GenotypeAllele; +use rustc_hash::FxHashMap as HashMap; +use simple_error::bail; +use std::cell::RefCell; +use std::collections::VecDeque; +use std::io; +use std::path::PathBuf; + +/// Mostly for keeping the necessary information for a phase together when we store the results in memory prior to writing +#[derive(Debug)] +struct SingleVariantPhase { + /// haplotype 1 + h1: u8, + /// haplotype 2 + h2: u8, + /// phase block ID + block_id: usize +} + +/// Structure that maintains order of phase problems while writing solutions. +pub struct OrderedVcfWriter { + /// the template VCF we are reading from and copying if no phases are given + ref_vcf_readers: Vec>, + /// A copy of the input VCF header, cached here for performance + vcf_headers: Vec, + /// the VCF we are writing to + ref_vcf_writers: Vec>, + /// the data that may be cached because we are waiting on earlier results + map_store: HashMap, + /// the index of data we are waiting for + current_index: usize, + /// the most recently written chromosome + current_chrom: String, + /// the most recently written POS field + current_pos: u64, + /// the distance you can write for each sample + current_positions: HashMap, + /// The minimum quality for variants that were phased. If this is different from what is generating blocks, there will likely be a panic. 
+ min_quality: i32, + /// Per VCF file, there is a hashmap of sample_name -> index in VCF file + sample_indices: Vec>, + /// Per VCF file, there is a hashmap of sample_name -> queue of variants + phase_queues: Vec>> +} + +impl OrderedVcfWriter { + /// Creates a new `OrderedVcfWriter` using template VCFs. + /// # Arguments + /// * `input_vcfs` - the template VCF files containing unphased variants + /// * `output_vcfs` - the VCF files that will get created containing our phase solutions + /// * `min_quality` - the minimum quality that indicates a variant to phase + /// * `sample_name` - the sample getting phased + pub fn new(input_vcfs: &[PathBuf], output_vcfs: &[PathBuf], min_quality: i32, sample_names: &[String]) -> Result> { + //get all the stuff we need for reading the VCF setup + let mut ref_vcf_readers: Vec> = vec![]; + let mut vcf_headers: Vec = vec![]; + let mut ref_vcf_writers: Vec> = vec![]; + let mut sample_indices: Vec> = vec![]; + let mut phase_queues: Vec>> = vec![]; + for (i, path) in input_vcfs.iter().enumerate() { + let vcf_reader: bcf::IndexedReader = bcf::IndexedReader::from_path(path)?; + let vcf_header: bcf::header::HeaderView = vcf_reader.header().clone(); + let ref_vcf_reader: RefCell = RefCell::new(vcf_reader); + + // first make sure we find the sample in this file + let mut vcf_sample_hash: HashMap = Default::default(); + let mut vcf_phase_queue: HashMap> = Default::default(); + for sample_name in sample_names.iter() { + let mut lookup_index: Option = None; + for (sample_index, &vcf_sample) in vcf_header.samples().iter().enumerate() { + let vcf_sample_string: String = std::str::from_utf8(vcf_sample).unwrap().to_string(); + if &vcf_sample_string == sample_name { + lookup_index = Some(sample_index); + break; + } + } + let lookup_index: usize = match lookup_index { + Some(index) => { + index + }, + None => { + bail!("Sample name {:?} was not found in VCF: {:?}", sample_name, path); + } + }; + + // add the sample lookup and initialize to an empty 
queue + vcf_sample_hash.insert(sample_name.to_string(), lookup_index); + vcf_phase_queue.insert(sample_name.to_string(), Default::default()); + } + + //now setup the outputs, we want to do the header stuff here also + let mut output_header: bcf::header::Header = bcf::header::Header::from_template(&vcf_header); + let cli_string: String = std::env::args().collect::>().join(" "); + let cli_version: &str = &crate::cli::FULL_VERSION; + output_header.push_record(format!(r#"##HiPhase_version="{cli_version}""#).as_bytes()); + output_header.push_record(format!(r#"##HiPhase_command="{cli_string}""#).as_bytes()); + output_header.push_record(r#"##FORMAT="#.as_bytes()); + output_header.push_record(r#"##FORMAT="#.as_bytes()); + let vcf_writer: bcf::Writer = bcf::Writer::from_path( + &output_vcfs[i], + &output_header, + false, + bcf::Format::Vcf + )?; + let ref_vcf_writer: RefCell = RefCell::new(vcf_writer); + + // push everything into the vecs for storage + ref_vcf_readers.push(ref_vcf_reader); + vcf_headers.push(vcf_header); + ref_vcf_writers.push(ref_vcf_writer); + sample_indices.push(vcf_sample_hash); + phase_queues.push(vcf_phase_queue); + } + + // default all of them to start at 0 + let mut current_positions: HashMap = Default::default(); + for sample in sample_names.iter() { + current_positions.insert(sample.clone(), 0); + } + + Ok(OrderedVcfWriter { + ref_vcf_readers, + vcf_headers, + ref_vcf_writers, + map_store: Default::default(), + current_index: 0, + current_chrom: "".to_string(), + current_pos: 0, + current_positions, + min_quality, + sample_indices, + phase_queues + }) + } + + /// Returns the phase block result that the writer is currently waiting to receive. + pub fn get_wait_block(&self) -> usize { + self.current_index + } + + /// Adds a phase block result to our queue for writing. 
+ /// # Arguments + /// * `phase_result` - a phasing result that will be written in the correct order with other blocks + pub fn write_phase_block(&mut self, phase_result: PhaseResult) -> Result<(), Box> { + let block_index: usize = phase_result.phase_block.get_block_index(); + if block_index < self.current_index { + return Err(Box::new(io::Error::new(io::ErrorKind::Other, "Block index is smaller than next expected index"))); + } + match self.map_store.insert(block_index, phase_result) { + None => {}, + Some(_) => { + return Err(Box::new(io::Error::new(io::ErrorKind::Other, "Block index was already present in the map_store"))); + } + }; + self.drain_map_store() + } + + /// This will drain phase block solutions in the correct order if they have been received. + /// It drains as far as it can given the current results and then stops to wait for more data. + fn drain_map_store(&mut self) -> Result<(), Box> { + + /* + * New plan for multi-sample VCFs: + * 1. we check if we have the next phase result like we already do + * 2. we append all of the relevant info from that phase block into the variant queue for a given sample + * - do we have one queue per sample? or one per sample-VCF combination? + * - probably sample-VCF combination + * 3. we iterate over each VCF from current position to the minimum end position of all sample blocks so far + * - this _should_ guarantee at least one sample queue is empty + * - for each variant, we check if it's getting phase for the sample and modify if so + * - this will pop an entry from the variant queue for the sample + * 4. 
at the end of a chromosome: + * - iterate as normal + * - check that all queues are empty + * - put the new info into the queue, rinse and repeat + */ + + while !self.map_store.is_empty() { + match self.map_store.remove(&self.current_index) { + Some(phase_result) => { + trace!("Draining {}", self.current_index); + + let chrom_result: &str = phase_result.phase_block.get_chrom(); + if chrom_result != self.current_chrom { + if self.current_index == 0 { + // this is the first block, so lets set chrom and move on + self.current_chrom = chrom_result.to_string(); + } else { + // the next block is on a different chromosome, so we need to finalize this chromosome + self.write_to_end_position()?; + + // now setup for the next chromosome + self.current_chrom = chrom_result.to_string(); + self.current_pos = 0; + for (_k, v) in self.current_positions.iter_mut() { + // set all current positions back to 0 + *v = 0; + } + } + } else { + // the chromosome matches previous, I don't think we actually have to do anything + // as long as our assumptions are correct + } + + let sample_name = phase_result.phase_block.sample_name(); + for (vcf_index, phase_queue) in self.phase_queues.iter_mut().enumerate() { + let sample_queue: &mut VecDeque = phase_queue.get_mut(sample_name).unwrap(); + let mut previous_block_id: usize = 0; + for (haplotype_index, &h1_index) in phase_result.haplotype_1.iter().enumerate() { + if vcf_index == phase_result.variants[haplotype_index].get_vcf_index() { + // h1 and h2 are just internal representations + let h2_index: u8 = phase_result.haplotype_2[haplotype_index]; + + // convert them to the file representations where possible + let h1 = phase_result.variants[haplotype_index].convert_index(h1_index); + let h2 = phase_result.variants[haplotype_index].convert_index(h2_index); + + // add one here because we need it to be 1-based + let block_id: usize = phase_result.block_ids[haplotype_index]+1; + if haplotype_index == 0 || block_id != previous_block_id { + 
debug!("New block ID found for {}: {}", self.current_index, block_id); + } + previous_block_id = block_id; + + sample_queue.push_back(SingleVariantPhase { h1, h2, block_id }); + } else { + // this variant is not a part of this VCF file + } + } + } + + // update the minimum position for this sample and then tell the writer to do what it can + *self.current_positions.get_mut(sample_name).unwrap() = phase_result.phase_block.get_end(); + self.write_to_min_position()?; + self.current_index += 1; + }, + None => { + break; + } + }; + } + Ok(()) + } + + /// This will trigger the writer to try to write everything that remains on this chromosome + /// # Errors + /// * if we have any issues writing the VCF file + /// * if when we finish writing, there are still blocks in the queue + pub fn write_to_end_position(&mut self) -> Result<(), Box> { + self.write_to_position(u64::MAX)?; + + // sanity check to make sure all queues are empty once we get to this point + for hm in self.phase_queues.iter() { + for (_sample_name, queue) in hm.iter() { + if !queue.is_empty() { + bail!("Finished writing chromosome, but variant queues are not empty"); + } + } + } + + Ok(()) + } + + /// This will trigger to writer to write everything up to the minimum position on this chromosome + /// # Errors + /// * if we have any issues writing the VCF file + fn write_to_min_position(&mut self) -> Result<(), Box> { + let min_position: u64 = *self.current_positions.values().min().unwrap(); + self.write_to_position(min_position) + } + + /// This should write out all VCF lines up to a certain position and then break. 
+ /// # Arguments + /// * `final_position` - the last position that is included in this write + /// # Errors + /// * if we have any issues writing the VCF file + fn write_to_position(&mut self, final_position: u64) -> Result<(), Box> { + if self.current_pos == final_position { + // we can't write anything new + return Ok(()); + } + + // + let start_pos = self.current_pos; + for (vcf_index, ref_vcf_writer) in self.ref_vcf_writers.iter().enumerate() { + // prep the writer + let mut vcf_writer = ref_vcf_writer.borrow_mut(); + let mut vcf_reader = self.ref_vcf_readers[vcf_index].borrow_mut(); + let chrom_index: u32 = self.vcf_headers[vcf_index].name2rid(self.current_chrom.as_bytes())?; + // let sample_index: usize = self.sample_indices[vcf_index]; + + match vcf_reader.fetch(chrom_index, start_pos, Some(final_position)) { + Ok(()) => { + for record_result in vcf_reader.records() { + // set up the record for the new VCF file + let mut record = record_result?; + vcf_writer.translate(&mut record); + let record_pos = record.pos(); + if record_pos < start_pos as i64 { + // this can happen when you have very very long indels that span one of our breaks + // we have already written though, so don't write it again + continue; + } + + // we now have to iterate over each sample and modify the entries as necessary + let vcf_sample_indices = &self.sample_indices[vcf_index]; + let mut changes_made: bool = false; + + // initialize the alleles array to be the same, these may change inside the loop + let mut alleles = vec![]; + let record_gt = record.genotypes().unwrap(); + for si in 0..record.sample_count() { + let genotype = record_gt.get(si as usize); + match genotype.len() { + 0 => bail!("Encountered empty genotype record at position {}", record.pos()), + 1 => { + // TRGT can make single-allele GT calls, just copy it over as normal + // it will not be modified below because it is a homozygous allele + alleles.push(genotype[0]); + }, + 2 => { + // this is 99.99999999% path + 
alleles.push(genotype[0]); + alleles.push(genotype[1]); + }, + gt_len => { + // we do not have 3+ GT entries implemented + bail!("Encountered GT of length {} at position {}", gt_len, record.pos()) + } + } + } + + // initially empty PS blocks also + let mut ps_blocks: Vec> = vec![".".as_bytes().to_vec(); record.sample_count() as usize]; + let mut phase_flags: Vec> = vec![".".as_bytes().to_vec(); record.sample_count() as usize]; + let mut flagged_variants: bool = false; + + for (sample_name, &sample_index) in vcf_sample_indices.iter() { + // now see how we handle this variant + let include_variant = is_phasable_variant(&record, sample_index, self.min_quality, false)?; + if include_variant { + // sanity checks + assert!(u64::try_from(record_pos).unwrap() >= start_pos); + assert!(u64::try_from(record_pos).unwrap() <= final_position); + + let variant_to_write = match self.phase_queues[vcf_index].get_mut(sample_name).unwrap().pop_front() { + Some(v) => v, + None => { + bail!("Variant requested from empty queue during VCF writing"); + } + }; + + // these are already converted to the VCF entries, so just compare to see if we need to overwrite + let h1 = variant_to_write.h1; + let h2 = variant_to_write.h2; + if h1 == h2 { + // algorithm decided it was better if these were homozygous allele + // for now, we will just write out the original record + + if h1 == u8::MAX { + // these were intentionally ignored by HiPhase, mark it as such + phase_flags[sample_index] = "TR_OVERLAP".as_bytes().to_vec(); + flagged_variants = true; + } + + } else { + // we need to alter the genotypes for this sample to phased + let sample_gt_offset: usize = 2 * sample_index; + alleles[sample_gt_offset] = GenotypeAllele::Unphased(h1 as i32); + alleles[sample_gt_offset + 1] = GenotypeAllele::Phased(h2 as i32); + + // the push_format_string expects &[u8] bytes so we have to: + // 1. convert the output to a String + // 2. interpret that to bytes + // 3. 
convert to a Vec for ownership + ps_blocks[sample_index] = variant_to_write.block_id + .to_string().as_bytes().to_vec(); + changes_made = true; + } + } else { + // this variant is not included in phasing, so we can just leave it as is + } + } + + if changes_made { + // if we altered something, then alter the record and add PS + record.push_genotypes(&alleles)?; + record.push_format_string("PS".as_bytes(), &ps_blocks).unwrap(); + } + if flagged_variants { + // we have at least one variant that was ignored, use PF = PhaseFlag + record.push_format_string("PF".as_bytes(), &phase_flags).unwrap(); + } + // all modifications have been made, write it out + vcf_writer.write(&record)?; + } + }, + Err(e) => { + if final_position == 0 { + warn!("Empty problem block received, no heterozygous variants on chromosome {}", self.current_chrom); + } else { + warn!("Received \'{}\', while seeking to {}:{}-{} in vcf #{}, likely no variants present", e, self.current_chrom, start_pos, final_position, vcf_index); + } + } + } + } + + // we wrote out things at final_position, so go one past it + if final_position == u64::MAX { + self.current_pos = final_position; + } else { + self.current_pos = final_position+1; + } + Ok(()) + } +} diff --git a/src/writers/phase_stats.rs b/src/writers/phase_stats.rs new file mode 100644 index 0000000..a9c909b --- /dev/null +++ b/src/writers/phase_stats.rs @@ -0,0 +1,313 @@ + +use crate::data_types::variants::VariantType; +use crate::phaser::PhaseResult; + +use serde::Serialize; +use std::fs::File; +use std::path::Path; + +/// Contains statistics on the loading and parsing of reads into alleles. 
+#[derive(Debug)] +pub struct ReadStats { + /// The number of reads loaded + num_reads: u64, + /// The number of reads without any determined alleles + skipped_reads: u64, + /// The number of alleles successfully loaded + num_alleles: u64, + /// Records the number of exact matches found for a type + exact_matches: [u64; VariantType::Unknown as usize + 1], + /// Records the number of inexact matches found for a type + inexact_matches: [u64; VariantType::Unknown as usize + 1], + /// Records the number of no-matches found for a type + failed_matches: [u64; VariantType::Unknown as usize + 1], + /// Records the number of matches to allele 0 + allele0_matches: [u64; VariantType::Unknown as usize + 1], + /// Records the number of matches to allele 1 + allele1_matches: [u64; VariantType::Unknown as usize + 1], + /// If true, then global realignment was used to pull out reads + is_global_realignment: bool +} + +impl ReadStats { + /// Creates a new `ReadStats` struct and does some sanity checks. + /// # Arguments + /// * `num_reads` - the number of reads loaded + /// * `skipped_reads` - the number of reads that were skipped during loading + /// * `num_alleles` - the number of alleles successfully (i.e. 
un-ambiguously) loaded + /// * `exact_matches` - the number of exact matches for each type + /// * `inexact_matches` - the number of inexact matches for each type + /// * `failed_matches` - the number of ambiguous or no-sequence alleles + /// * `allele0_matches` - the number of alleles assigned to allele 0 + /// * `allele1_matches` - the number of alleles assigned to allele 1 + /// * `is_global_realignment` - if True, then global realignment was used to transform reads into alleles + /// # Panics + /// * if `num_reads` > `num_alleles`, because that would imply some reads have no alleles + /// * if `num_alleles != exact_matches.sum() + inexact_matches.sum()`, because that would imply some alleles are not being counted correctly somewhere + /// * if `num_alleles != allele0_matches.sum() + allele1_matches.sum()`, because that would imply some alleles are not being counted correctly somewhere + #[allow(clippy::too_many_arguments)] + pub fn new( + num_reads: u64, skipped_reads: u64, num_alleles: u64, + exact_matches: [u64; VariantType::Unknown as usize + 1], + inexact_matches: [u64; VariantType::Unknown as usize + 1], + failed_matches: [u64; VariantType::Unknown as usize + 1], + allele0_matches: [u64; VariantType::Unknown as usize + 1], + allele1_matches: [u64; VariantType::Unknown as usize + 1], + is_global_realignment: bool + ) -> ReadStats { + assert!(num_alleles >= num_reads); + assert_eq!(num_alleles, exact_matches.iter().sum::() + inexact_matches.iter().sum::()); + assert_eq!(num_alleles, allele0_matches.iter().sum::() + allele1_matches.iter().sum::()); + ReadStats { + num_reads, + skipped_reads, + num_alleles, + exact_matches, + inexact_matches, + failed_matches, + allele0_matches, + allele1_matches, + is_global_realignment + } + } +} + +/// Contains any statistics from the phasing problem solver that may be relevant +pub struct PhaseStats { + /// The number of solutions that were pruned during calculation + pruned_solutions: Option, + /// For heuristic solvers, 
the estimate cost prior to computation + estimated_cost: Option, + /// The actual cost of the solution + actual_cost: Option, + /// The number of phased variants + phased_variants: Option, + /// The number of phased SNV variants + phased_snvs: Option, + /// The number of variants where the phasing solution turned them homozygous + homozygous_variants: Option, + /// The number of ignored variants + skipped_variants: Option +} + +impl PhaseStats { + /// Creates phase stats for an A* algorithm solution + /// # Arguments + /// * `pruned_solutions` - the number of solutions that were pruned from the solution space to reduce memory consumption and run-time; if this is 0, then we have a guaranteed best solution + /// * `estimated_cost` - the estimated cost of this phase block calculated from the heuristic + /// * `actual_cost` - the actual cost of the final solution + /// * `phased_variants` - the number of variants that were phased in the solution (e.g. not converted to homozygous) + /// * `homozygous_variants` - the number of variants that were converted to homozygous in the solution + /// * `skipped_variants` - the number of variants ignored in the solution + /// # Panics + /// * if `actual_cost < estimated_cost`, because that would imply something broken in our heuristics that makes A* no longer work + pub fn astar_new( + pruned_solutions: u64, estimated_cost: u64, actual_cost: u64, + phased_variants: u64, phased_snvs: u64, homozygous_variants: u64, skipped_variants: u64 + ) -> PhaseStats { + assert!(actual_cost >= estimated_cost); + PhaseStats { + pruned_solutions: Some(pruned_solutions), + estimated_cost: Some(estimated_cost), + actual_cost: Some(actual_cost), + phased_variants: Some(phased_variants), + phased_snvs: Some(phased_snvs), + homozygous_variants: Some(homozygous_variants), + skipped_variants: Some(skipped_variants) + } + } + + pub fn get_pruned_solutions(&self) -> Option { + self.pruned_solutions + } + + pub fn phased_snvs(&self) -> Option { + 
self.phased_snvs + } + + /// Returns the ratio of estimated_cost / actual_cost. In a perfect world, this is very near 1.0. + pub fn get_cost_ratio(&self) -> Option { + match self.estimated_cost { + Some(ec) => { + self.actual_cost.map(|ac| + if ac == 0 { + assert_eq!(ec, 0); + 1.0 + } else { + ec as f64 / ac as f64 + } + ) + }, + None => None + } + } +} + +/// This is a wrapper for writing out any stats to a file +pub struct StatsWriter { + /// Handle for the CSV writer + csv_writer: csv::Writer +} + +/// Contains all the data written to each row of our stats file +#[derive(Serialize)] +struct CsvRow { + /// The index of the block + block_index: usize, + /// the sample for the block + sample_name: String, + /// the chromosome of the block + chrom: String, + /// the position of the first allele + start: u64, + /// the position of the last allele + end: u64, + /// the number of variants in the block + num_variants: u64, + /// the number of reads in the block + num_reads: Option, + /// the number of skipped reads in the block + skipped_reads: Option, + /// The number of variants loaded + num_alleles: Option, + /// Records the number of exact matches found for a type + allele_matches: Option, + /// Records the number of inexact matches found for a type + allele_partials: Option, + /// Records the number of no-matches found for a type + allele_failures: Option, + /// Records the number of assignments to allele0 for a type + allele0_assigned: Option, + /// Records the number of assignments to allele1 for a type + allele1_assigned: Option, + /// if True, then global realignment was used to transform reads to alleles for this block + is_global_realignment: Option, + /// The number of solutions that were pruned during calculation + pruned_solutions: Option, + /// For heuristic solvers, the estimate cost prior to computation + estimated_cost: Option, + /// The actual cost of the solution + actual_cost: Option, + /// The estimated / actual cost ratio + cost_ratio: Option, + 
/// The number of phased variants + phased_variants: Option, + /// The number of variants where the phasing solution turned them homozygous + homozygous_variants: Option, + /// The number of ignored variants + skipped_variants: Option +} + +impl StatsWriter { + /// Creates a new writer for a given filename + /// # Arguments + /// * `filename` - the path to write all stats to + pub fn new(filename: &Path) -> csv::Result { + // modify the delimiter to "," if it ends with .csv + let is_csv: bool = filename.extension().unwrap_or_default() == "csv"; + let delimiter: u8 = if is_csv { b',' } else { b'\t' }; + let csv_writer: csv::Writer = csv::WriterBuilder::new() + .delimiter(delimiter) + .from_path(filename)?; + Ok(StatsWriter { + csv_writer + }) + } + + /// Will write stats to a CSV file for us + /// # Arguments + /// * `phase_result` - the phasing results, which wraps block metadata, the read statistics, and the phasing statistics + pub fn write_stats(&mut self, phase_result: &PhaseResult) -> csv::Result<()> { + let num_reads; + let skipped_reads; + let num_alleles; + let allele_matches; + let allele_partials; + let allele_failures; + let allele0_assigned; + let allele1_assigned; + let is_global_realignment; + match &phase_result.read_statistics { + Some(rs) => { + num_reads = Some(rs.num_reads); + skipped_reads = Some(rs.skipped_reads); + num_alleles = Some(rs.num_alleles); + allele_matches = Some(format!("{:?}", rs.exact_matches)); + allele_partials = Some(format!("{:?}", rs.inexact_matches)); + allele_failures = Some(format!("{:?}", rs.failed_matches)); + allele0_assigned = Some(format!("{:?}", rs.allele0_matches)); + allele1_assigned = Some(format!("{:?}", rs.allele1_matches)); + is_global_realignment = Some(rs.is_global_realignment); + }, + None => { + num_reads = None; + skipped_reads = None; + num_alleles = None; + allele_matches = None; + allele_partials = None; + allele_failures = None; + allele0_assigned = None; + allele1_assigned = None; + 
is_global_realignment = None; + } + }; + + let pruned_solutions; + let estimated_cost; + let actual_cost; + let cost_ratio; + let phased_variants; + let homozygous_variants; + let skipped_variants; + + match &phase_result.statistics { + Some(ps) => { + pruned_solutions = ps.pruned_solutions; + estimated_cost = ps.estimated_cost; + actual_cost = ps.actual_cost; + cost_ratio = ps.get_cost_ratio(); + phased_variants = ps.phased_variants; + homozygous_variants = ps.homozygous_variants; + skipped_variants = ps.skipped_variants; + }, + None => { + pruned_solutions = None; + estimated_cost = None; + actual_cost = None; + cost_ratio = None; + phased_variants = None; + homozygous_variants = None; + skipped_variants = None; + } + }; + + let row: CsvRow = CsvRow { + block_index: phase_result.phase_block.get_block_index(), + sample_name: phase_result.phase_block.sample_name().to_string(), + chrom: phase_result.phase_block.get_chrom().to_string(), + start: phase_result.phase_block.get_start(), + end: phase_result.phase_block.get_end(), + num_variants: phase_result.phase_block.get_num_variants() as u64, + num_reads, + skipped_reads, + num_alleles, + allele_matches, + allele_partials, + allele_failures, + allele0_assigned, + allele1_assigned, + is_global_realignment, + pruned_solutions, + estimated_cost, + actual_cost, + cost_ratio, + phased_variants, + homozygous_variants, + skipped_variants + }; + + self.csv_writer.serialize(&row)?; + self.csv_writer.flush()?; + Ok(()) + } +} \ No newline at end of file diff --git a/src/writers/vcf_util.rs b/src/writers/vcf_util.rs new file mode 100644 index 0000000..d57d3c4 --- /dev/null +++ b/src/writers/vcf_util.rs @@ -0,0 +1,54 @@ + +// This was all provided by Daniel Baker as a way to get VCF indexing since rust_htslib does not have a user-friendly method yet. +// Parts have been tweaked for readability. 
+ +/// The error type we can generate from trying to index +#[derive(Debug)] +pub struct BcfBuildError { + pub msg: String, +} + +impl BcfBuildError { + pub fn error_message(error: i32) -> &'static str { + match error { + -1 => "indexing failed", + -2 => "opening @fn failed", + -3 => "format not indexable", + -4 => "failed to create and/or save the index", + _ => "unknown error", + } + } +} +impl std::error::Error for BcfBuildError {} + +impl std::fmt::Display for BcfBuildError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "BcfBuildError{{msg: {}}}", self.msg) + } +} + +/// Build a bcf or vcf.gz index. +/// Builds tbi or csi depending on if build_tbi is set. +pub fn build_bcf_index>( + bcf_path: P, + idx_path: Option

, + n_threads: u32, + build_tbi: bool, +) -> Result<(), BcfBuildError> { + let min_shift = if build_tbi {0} else {14}; + let idx_path_cstr = idx_path.map(|p| rust_htslib::utils::path_to_cstring(&p).expect("path_to_cstring")); + let ret = unsafe { + rust_htslib::htslib::bcf_index_build3( + rust_htslib::utils::path_to_cstring(&bcf_path).unwrap().as_ptr(), + idx_path_cstr.map_or(std::ptr::null(), |p| p.as_ptr()), + min_shift, + n_threads as i32, + ) + }; + match ret { + 0 => Ok(()), + e => Err(BcfBuildError { + msg: format!("Failed to build bcf index. Error: {e:?}/{}", BcfBuildError::error_message(e)), + }), + } +} diff --git a/test_data/header_only.bam b/test_data/header_only.bam new file mode 100644 index 0000000000000000000000000000000000000000..00ada6557cd0c8da709d820dfdffcfaedb1c9dbe GIT binary patch literal 3019 zcmV;+3pDf}iwFb&00000{{{d;LjnM<3w@b;Y@Sse$4}p`m$wsS2si|F{$U8lmGe9I z^VWs6T^WV8D=UmmF}1LfO*UHF@eU-QMuHI&i8Ap5;kF4zLJTBIKop`0nu#%Z89|J~ zL=s~#Y64;~DCc?4dEV#ph5fbD&+l@6mvecaGd4QibM1Q=+j#D#=C+v@4V&iNmeys9 zi_1IrT)D7+S99somLBHAd}!m`1Z zK(=?PVz}bY7oG3O6s!~$}KOF zm@a!Yq%0x1N+q@!H&1cSg`kR@mcHm$b>ujAZBdXD+}qM3($QX}WL}uk>hNiL zwvrTX1QIJ-la6bJ#C)|Bu9V{>CwSSM=ekr`N^+H?a4t*hkMgYeN>-?xl_V%cxom=n zN_!xw)?9C?8a+Oi;*PC6)9*&V#J z8p@T+Vx=z~j60&3s;p-#RB?V6UD=V7T(moPB4HKeDsnO!a@pbJm4IAFt}Nst`6pF6 zak>J$BM{DW#3^zToFF%80tyC5I!^$$A1CO&gP4+gSvo|z0=6R{t?{S=Dmfss9`fix z7-b+&6;clC`)HpF%w1jqjd&`6kb)prxjv51Mqa)HboE7EAnZSOn0MHHYB1*%ks4d&C*2pJwPaLH(3M$vNctE@WU6>%57jBLSz7+DIE%1)Y|WBVi$rRWP_u zG>=u(QcI|xsw^mtnvI^ zItUalv9l>YgY|i`kMrkZ{X(}RZe#qCxWopcfKm!~h)*YtaT*#>k!!*ZVh0+{p?&8f zCk-p;V+JKx1bNJWwjQ2TsYA$UeAMO#0lo^z)BzH9n3QpT!r+rG?PJ6h)=wRz1mudp zJ(nig7dU6G^vN9#(!6I$Ao9g+s})VxMxF7oJbbLQK(hT++CO_mln zZ@Q&Ad_zgchxHyFFCuO4rusV~{-)2bdBRnbRToJ7(vWYEFp_8=J635MG^V8Zm{KLwsm@q}o6BLN^G)BQoomFYS z@XksNCDNrr;@<{VrJ870{v44@=zP<64tA#1dB(X_FjSpxkJc(UPaX4Vgoi%@txiWf zC!8!Dvnh=g5(>m@TCY70!$tuT)fE=oB0qkuC0z8hhMlq1#)&peyYnsI8uJ#-mo{lb zq>9JD#MriuCbM9rzO;%N4w);^~=(Yz4c 
zb1_d=J4N#=)Z8@J+Bz}1>B1ee6C1|I=SC)%cDBx0Uf8#5Wy7AOk=d*EuI##a&(8h3 z(;t@(jEqgrY?_?eeE#Hx8;FbbR{HhC`N-VF1s6@sj28%zTkkj@85^CO7$2RQn%KH+ zV(Z-GmYEHDn2(&Zb#!`S>*$3O=YDhpaX!qmf6dvc*6ir`*yNVx>~t%$IW@F#Hj8y- zVc-76#l0)d*_rPDvv(!Z!_*v~%Ky5qy=T{fb?YzPJ@iw?SRZ37d=6v%PaZu!4g|aA z;9?p`Ht@vLtK&ei+dj1|4K(}Balaml1H+y>Z#WGs)B8Ttj04A>I&|+~9C-He$Ic$? zk4`xozV$x?X+-ROW5?x8um|q=XU-)1&O4v(Pb0-{{MbT1((L5%lk$;aXYRkWFO4i) z*!}9NG;-{NmmkSTo}I9GeQz8I&IT@iCLa;|(_=qxq>*5U_YL&M5Vo_&-*AmMtEYy27k{{~h70c^>>sz+j2GTR z*s;TPEV}sq?=@;zcJY~a*0og^zj9BG!E*$p&oUp~~RVo@l`P2XT_ZEF6+3z7Zd3q3gp4-@R= zclB0z@jZh5#T9w9Unpwh)kamn`1-(J{#&E2U#R+@53)Kh=zsoOfEVUBbO_il6#lvI zcJ;xV0(E0y}Wa={W}v2<+d#Z4`LaFO+X!a#fAXLjCT0IIj<{^B42!TC$Q08pFQA_u^>iM4L20hO7$Z505))V3i2LJ}Wo0^r-k`j9Et zCblqG2U)Jx08=9PxCCfqalAeNG(sHT?gJo{_IJGigzR2|Ooccu0H##%aoy{EHDE#o zp9Ybe;N$Q80E9SZ-&qGTXxJ-y>L4WXnuqJa1n^AVl+GG%u~`Q^1b93ES)pzk1R4R{ zIRG?53 zFss!A0E9TsSPei(`;p@S2+lvdt`4kwivY7OIT?VgOHKppd z@L{^!LF8HcJb^SqN{;jbjbQ4($a(0ir$8Lp8a#?L0yrIM1m~Yd8lfJ)K(h+YZ$bfv zl>8BC1XBl20EG$xE};sBS$$#%fM9C5AAk_x{e1w0S)JDeAjI(sxJ+a_oCABi2pYeC zZxM9Ag0iI}jVj1KezG@ySO?iHW9VuCLgX+1sCE=(UpU!omjjKE#JgIo?mVRA=Fb6B z0UR^Ebr77d{VLE10UkQU>L57(L2pkTgoU$Z5cmk${o-Dr5ps3L-nw%gaA^kh8DFbYOPU^E0qLtr!nMneF52mk<~O#zVr literal 0 HcmV?d00001 diff --git a/test_data/header_only.vcf.gz b/test_data/header_only.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..6e74d72b7cccf23b44944a8216bd6e9d4aef0ac7 GIT binary patch literal 2039 zcmVs^k|+V20kXMG zuxXp7onYrFI5|hjG3wY^`y`!y`@y-IjulQ`+FX95DDqEHx4(aSe(~+(>fqn!#|M{(U%$RTPTjOE9~Wg)AN+DyRmELdEmGSRkCU!TvtNE4 z+FYf(qe)c_&CXKYEPg(g^?mWs%u-b>=4Ji8TmIJ-UAag_*%fWNOKqyB>BCE(7tN0^ zPm>oukDB_foGtmHtk<5NCv7>Y7sYK_uGFSUwe)+Ns^&*AomAEGr;B-;x_ML0Myq^k z#({rb#x|dH#VkE8<_D%|Q}r=M4TPCgyK|Mc{5o7w|-|Lx~v zIykQv={~hX*C+L?cx<|I`CajkyQVFkx>Woq7xSX~`Ec7*<+PYoRjN|=&vmGuuRb1r zTP<<+?ZpaT7j0SJXOo^r#$VnN-ceJ}R;*4dd7-Q~PZvwwBd zNtI@OQL6HLs>*rO%(|j&p6Xee6^mw-k7?C(saq7~+S@4aC&zEidzzJ#`tQ^JGJrZw 
zS`ol{-^VFEZZdQ8oZ4==0(_tk=H34Ow5b>6{VKZYyoL8ws_z%`1GFJJZxMeTG1xN* zksQV`Lt6%7!04RIFt%q5f_2V~8Mk8sI0~5a<97_3U|i%3!#l=BvMw@>nXqH50}7t} zn29@v0!7QfV+KqQ5l7A@5uk^h3EpQR6rPYBnatLp&ox-`mM~5hgdkaZbmm+nC5*jmUiVaFI^!jSW`OScOi*h8iwxXZS;)>`nKVRp&3o=uE_ zGR!X5&IAb7XA<?wd-3pWnSgJn=mGIe*vF?Y`1oLv}W5FumYC}sk)YZGI_GrllE6FGOpxXmgV ztwi*>QzCA+!kG*kkBY^34GP9t#UkbkA97{fnxGjk5{=8L3$`Yi3>!naBMEjXj+qog zHpN>T#Iarw0CE==*llbX=|`L2&t@HnHe&(PsAYd%ZRxgU|iYk8Cy63KYE4Q0h^L7B$of z)Ut-6j1_Ede&|`#v&*qiK|!E^1~k;mV6D|qH%P1+fY?`zK_fwiO1Y7sXSrvIeNnqGe)G zLzNv9g4Ix&R`QxxD?+7+Muoi}=#Vs2M$4>4>vFvw#zdey6u0jSjqLRrgpS5OvtB`~DA6|A*Z6?c@Qi4RV5 zYX#~+jX-plcSZ{!1c8Fnv=v3Y$XL)&ky*!48+fF+!8>*48w>{OC`(eYq_qr@*cN(q zjFLAE7^`}fePB^Tt$8y_UIu__URj`^%{$~}BL+hnDo3NBU5Vu75aGyZsMuB@)KCs@ zE&&haqQx2{it5-cI~FOh=2e!!k%mU8EU=`ZQUf7q$&$?ts6O~aQP{|;Q@K6|HCOTx z#cIXinv>qAKhlN7Ja1VtI?&07)zh0OU1PwySq4 zii1?CJvg|y9cZY0@+DT=RX$7xbugDh2FIw)BPj29|H|8k&1=xXvlNC`uC4W(yuq}~ z5neenHZO7qcN_$R8f*uiy*9XB%$QpZ#6tyVi6$(aeG#(6-KB%vG z*ovB12oW?^Y|neEJub18M{^8JPJ`vBOzOJ>wo(@^WM2pGU^lq=L(sa8-JsNlEo!iQ zaAUUG_ME7m$Edx@Vkd`C18O~-s2zDm(pXVDjxMT+V8H^ua{g>&JeUKSK~<(a0)yvQ zG{5oepZh<_v-f&8$&1(n*vu?&@}KM8AB0u9j()rP{BiH{^VhxejW#tf+k4H+32SQs`0x(d&a|F6SzW1G5ypohaxzIUJh&es7d Pl}FPf&A<#c0z?1+^TQC- literal 0 HcmV?d00001 diff --git a/test_data/multi_smrtcell.bam b/test_data/multi_smrtcell.bam new file mode 100644 index 0000000000000000000000000000000000000000..5ed6c692a2021ad748dd9103a05db0a9a21ecc64 GIT binary patch literal 3034 zcmV<03nla)iwFb&00000{{{d;LjnN33)Pu>Y@Sse$4}p`m$wsS2si@j{KF88E9ZCa zr*mO#S4N@jN(*CCOf76=laLqwd2N&s1t8i$ke6Ec0+0ca zcfxC}i$Jvj7o@b-1)wwFNJ}S58YTmlJFdMg0-FKtrPAKY0&p2nnxr$*6oAiwbkZwJ zwiqC96KTCIM@ZBrnmc34L_{57wdB4?+R>AeMoHm{q#ZuDTuDnMqT7VLCvqigSpqzl z(n+O@AWMLgLR+P%2wVoZv{JZ|MwS5W3|T|P#E2|@qm)u)Y!PJPD`UC$P8LBHz2=Ub zGQ~i)x2s~f;?5VH@5mIa5LQv~?~|E=(WVsi+=b)9a?4GN=9cHL-nqEYZUfCrU6ae% zlAIJ=3MA&r=7cp4%K4I)l}2K!EJo#) z7fDQ)y&6)MkX)q_T3TOP@pD@yIOjr8MNUg!^sCx(oV%7N$O-OkX%XpYuTnBEOlfua 
zG(B5M3O53Ym90s~wL)UPS_)Unagr0fY|e9Csw^eBN>VtNrS(U7R(vHZRL)8g6rx-< zK}4lJkW_MA=_;=(bBq?rtBN2gSlJGo6;_u_+SG-*R2OnudnyY?ljY74jyvg*pyXxWB(yHAJCOvd z3yRxP&vpdMl~$ycgmSWyZ*Q%Df|ZUiy4*S)!EyzaaD+pt>>-Z4zy@tukV_{W6s+tH zURn+1%4MC$-OKcA{_zS7Le9>Q~{M75Lpj- z^dO8fkf#bMhxL85&jsc#uYg886+lQqkgHrD$7dri-vK)MA}^?P^a|-el zzBEu@MfcJRjwOuAOAG@rX$BT9A!TgG1L893dr`iab_87N5TQBDk%gPS;4Af6E^qQc4N}=Q&@_ZLEu!(iv#L|&~Q%G&34XlDr%gB+ikjE+* z+$WmHDr%`E)K66wnCE@yJd9AJ6yqk)GL?{wgFM>z9;-+5p31-Qasd^WF$pdpPbJoP z{w^H^3YXZ~6raKRJlV(jbFqG*+Yz@heo0(n15rRJg*(Kjlg2m=4XDU9VF$4Tjpoq4 zbCHvV74$KKk}HBdW~2|G;6I6qsdrfv+My zKd7CB`YAtJLhe)j2p-hZr~Eqy<4gTb0^ihp%&)gtJ(;%-hnH?z3Lf+_-+-)!_I=`# zL;GoYlhN3}NYf{%o~GwvyQXRgbq~pVS504j-ws%wg9T9)i=hr;ps@q@thi!gEu%4<5Bz|egH%M|^Kna!G zFz_6;DwSDVC9w^YBshoDppZF%mT!%DljcjC zv>{T(V_4E*;n~GCs9=d}_;u;}>rvF4kM=*AwR(X2&kNWNd1*K#1IW$N7el;n}g#;faZ{ znQdb;v*TN*HtIpX;k=pQ$+4N?i^tCY&_?2XkZ1pz(-T9}!=oePTbt99Lz&Hq6zk6Y z3kwJTSDE{?~P_J-ZI9TYvfPfgdx* zdKhEjDwr zj=`G^>zT_jc&cIRWp~2hg@ Se`ZTGQ+OxWDUH?uzL>GaiJtXf3Q)-qEM0>zsA_w z)clDTBKzIvx^fQQTi8qQ=&thOGYk8ZEAnW+P}IgNjjDd}QH8zqmquN`Q1w3^Vs&27 z|LivaFU)V?FtA@J{IlQg=!2&c_Uf_ToP!qWDDbFXC|}?Bsv4Jt z`rZ3bULW-M!H!Qf>b#KuEqAjzFXZ=wKLq-P{62C$;D!8dxEI(jIzc0c&1v}mkfNBxjkAQ3uyQL2R-6HmQPaTNh z!Em{P&VwXgNmfC&|R3Pf&#kH7W;5aO7AdmYH2VXx|{gOJ3757mJQ;OV+4oi*HI zvkrI&@I(NzLfzO8Gy=Gz4`_tQPw4?5Jfl$qSq0yJ8qf%x_}F^@2)TM@ zEtqb|?w?Nq8UcLwZFP{<{Hi_xLge?Zu7j|LvupYR2&SIz1|XQae`i+>gdgba!Buq) z6*SK6tpg{6^Bw>~fL~t?K(^G7sgT_p&Zq+$;_wH7MgWi8Uk5(;m^>M1gaGea13;+x z?$ZGXk)H@k>qF$vuLdB*aircpGAq=+-a5!?e$pzS5vIGn2Y}%G3j+XzPF#yjg*XlY zQ;NR{2xlv6AXA~V^8lDItJQq~ggDMx4M0fy(Gvg&&Og1b4y=2V0JAPR6@aWuP6h#H zT{7DQ8X>^0UZ4@OdjR z0*#QwJBL`^c}U4kp8=)}+dYIzKRkdk8)0E8}?-^=PCME=~%0EEc@a~l95@|Bwa2$8>Ny6PZAK6_eM zy&l2GE2o3(hUu<61KQ_}&!OzO2LK33{N(-y>uZBIK7O)u?r5<80%Fw^U{fvt03VA8 c1ONa4009360763o02=@U00000000000Q7{kX#fBK literal 0 HcmV?d00001 diff --git a/test_data/multi_smrtcell.bam.bai b/test_data/multi_smrtcell.bam.bai new file mode 100644 index 
0000000000000000000000000000000000000000..3a17d3953a10ae362b40cf11d4ea85c8e8c1c40b GIT binary patch literal 1576 ecmZ>A^kh8DFbYOPU^E0qLtr!nMneF52mk<~O#zVr literal 0 HcmV?d00001 diff --git a/test_data/multisample.bam b/test_data/multisample.bam new file mode 100644 index 0000000000000000000000000000000000000000..6abe8c6ca7f3d3ff198f1dc78c0861cc5eae60b6 GIT binary patch literal 3038 zcmV<43nBC$iwFb&00000{{{d;LjnN73)Pu>Y@Sse$4}p`m$#G45O4(4`G+AGSI+O; zPv^qgu8cz4l@`XPm|ED#CM&J&_Ks;ljRYem5@q5A!fg|bgcwMafG9*0G!tX+GJ+U| zi6q8g)C9y}P|ow7^SsaH3;(;ncKZ2U&hK(A?{h|mC%b0OWNgy~o15FFhG@_7>#{kR`xLp{-I>1TF(yS}9yfBTImGhOD7tVni0dQA#N?wg|HDm9gA=CyOA9UUNrI znPMQ@+f^}Kap#N9cVr4y2&<_0_sLAbXj2M$?xJyFx#gxsbIbGB>|9)Ew}IxRuF2(W zNlpqb1rl>*bHW-2<$Oub7-u*TBVH0yoUB1&R2CD0pfyp6)o>+|oUC}YN+U5<7Nc^@ zizKGYUJWTrNUl-|Ev+xD__-|;oO2nj=K^z=S3o143LvB)$W^Y7M?*JWrkrxR2j~(V6cApx|IR$wN zUmB>dqI>BD#}dZmC58c*Gy@BlkTSO80dX1iy(nKyI|43sh|nD7$V3TMzEdWy(;^Y& z3v14~Pci23A3*W#mX$$YT`@ z?i0;p6}8k7>Zd9T%=12U9!4lqig6QYnMz2;K_2aUkJY1jPvu{Dxqynxm;@J)rxI&C zf0qsdg-h&giqBwup6uiNxmdr@?TFhLza%cPfheGq!X4t%Nn@Ob22|vlu!GovMssN2 zxyVVw3i_Bq$rV8!GoY=9CspbYG8!MX`9XlM0y1@ggdHYjoS!iGq)Yo4afS6$2Ppx$ z;&0ETN%jSf-$qXGppqtasN)jybdWi?iAs}B3N1P0aeht;$gO+({Mb%?pnsRRz*iBU zAJoo5{gfXqA@`|%1P^NIQ~sTU@umJIfp2O)=GR-Sp3GZ^!%H_U1rK_eZ$MT<`#y2W zq5ZVH$!P3fr0Ek>Pt)_bJ6t;DJ;u?x67YnUHf4vTfjl*@(U^-oI^3N3_N`%fHeHjY z#m$>;sSe*z((z%vhsTRZ+q?Y~5k3g%_ z(as4cOUGef#7J>8ed*AGQ=Di{ z2=2LxuIXvtt)uIyN<0AVhAx<9x%&@a)*=@WjN} z%(k(a+3~GY8}%UHaQ@8j$g_XV>4~A~;n9)tttT$AuY9cc$zvzRfnWy@EvA8FeNQ~SIt~=O?GxM5K(k+;@T-A1FzmSt2h+eZ zz3}N~960vW;d}ezz_X7&c3yvPbjsP_t^e*zBVz9vIU#3)J#fcAawgfgPkOpHjTF1_ zBlG!4v(rXT%}0ivv+s(YG_q`d_p7VY$g%fdbu=G&cJku&-Eky1>$~ikd_?SzkNvEX zMuHt#=+0IBgMa+b zt{V0o{K1|YETzzg}^a38Q=$nRZG0{udM z@AxW+uaMt!ZvecIzpnS?YYV(ruxr1}*qS!Ze_w=m3U<670M#P49|74Sc55F1x<%}X zo;nc0$BVN7@D{O6YwCakcyJW}rbX-s@Ik@(i~Rru=Pz{wKrLd6901!QHgro3sLa%D zs{jb5whaIflK4my0N*0kgG{*=vHAWw$a1v?m=eLqt64v0TU|tG>F^;AAjowAjC0!QXR;kVXyA0gOJ3757&VS;F-E9 
zoi*HIvkrI&@I(NzLfzC4Gy=G@4`_tQPwfF9I1|XPvU}skigdgba zp;dJa6*SK4tpg{6^Bw>~fZtdRK(^G7sgT_p&#VI*;_wH7MgWgKPzOHvm^=k&gaGee z13;+x?lS-gk)H@k>qF!(tOg*&aircpGAq=+-a5!?e)1}y5vIGn2Y}%Givs|JPF#yj zg*XlYQ;NR@2xlv6AXA~V^8lDItJQq~ggDM#4M0fy(Gvg&&OfuR4y=2N0JAPR4S=jm zP5}XCT{7DQ8X>^0UZ4@OdjR}+dYIzKRkdk8)0E8}?-^=PCME?9M0EEc@dpiIj@|Bwb2$8>Jy6PZA zK6`psy&l2Gt7m}hhUu<63)<(+&!O!32LT94{M7yi>uZBIKYp@v?`*LD0dA^kh8DFbYOPU^E0qLtr!nMneF52mk<~O#zVr literal 0 HcmV?d00001 diff --git a/test_data/test_reference.fa b/test_data/test_reference.fa new file mode 100644 index 0000000..c6f5c6d --- /dev/null +++ b/test_data/test_reference.fa @@ -0,0 +1,5 @@ +>chr1 +acgt +ACGT +>chr2 +AccATGTA diff --git a/test_data/test_reference.fa.gz b/test_data/test_reference.fa.gz new file mode 100644 index 0000000000000000000000000000000000000000..08107fba98f49920307503ff28ed7d637baf7a06 GIT binary patch literal 90 zcmb2|=3rp}f&Xj_PR>jWwhWB&3=jU>C1(^FawR6GmvA{cyN7Ur_(oig$;pl(?jep` V)0RrL$pbaXqv@7rU