From 1d66d5b3c91601976d9c7a01da5b18d47467a530 Mon Sep 17 00:00:00 2001 From: mholt Date: Mon, 30 Oct 2023 13:01:42 -0700 Subject: [PATCH] adds source files and test files --- Cargo.lock | 1548 ++++++++++++++++++++++++++++ Cargo.toml | 38 + build.rs | 19 + src/astar_phaser.rs | 788 ++++++++++++++ src/block_gen.rs | 1063 +++++++++++++++++++ src/cli.rs | 343 ++++++ src/data_types/mod.rs | 7 + src/data_types/read_segments.rs | 258 +++++ src/data_types/reference_genome.rs | 133 +++ src/data_types/variants.rs | 758 ++++++++++++++ src/lib.rs | 19 + src/main.rs | 639 ++++++++++++ src/phaser.rs | 796 ++++++++++++++ src/read_parsing.rs | 745 +++++++++++++ src/sequence_alignment.rs | 77 ++ src/wfa_graph.rs | 1157 +++++++++++++++++++++ src/writers/block_stats.rs | 370 +++++++ src/writers/haplotag_writer.rs | 72 ++ src/writers/mod.rs | 13 + src/writers/ordered_bam_writer.rs | 355 +++++++ src/writers/ordered_vcf_writer.rs | 423 ++++++++ src/writers/phase_stats.rs | 313 ++++++ src/writers/vcf_util.rs | 54 + test_data/header_only.bam | Bin 0 -> 3019 bytes test_data/header_only.bam.bai | Bin 0 -> 1576 bytes test_data/header_only.vcf.gz | Bin 0 -> 2039 bytes test_data/header_only.vcf.gz.tbi | Bin 0 -> 80 bytes test_data/multi_smrtcell.bam | Bin 0 -> 3034 bytes test_data/multi_smrtcell.bam.bai | Bin 0 -> 1576 bytes test_data/multisample.bam | Bin 0 -> 3038 bytes test_data/multisample.bam.bai | Bin 0 -> 1576 bytes test_data/test_reference.fa | 5 + test_data/test_reference.fa.gz | Bin 0 -> 90 bytes 33 files changed, 9993 insertions(+) create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 build.rs create mode 100644 src/astar_phaser.rs create mode 100644 src/block_gen.rs create mode 100644 src/cli.rs create mode 100644 src/data_types/mod.rs create mode 100644 src/data_types/read_segments.rs create mode 100644 src/data_types/reference_genome.rs create mode 100644 src/data_types/variants.rs create mode 100644 src/lib.rs create mode 100644 src/main.rs create mode 
100644 src/phaser.rs create mode 100644 src/read_parsing.rs create mode 100644 src/sequence_alignment.rs create mode 100644 src/wfa_graph.rs create mode 100644 src/writers/block_stats.rs create mode 100644 src/writers/haplotag_writer.rs create mode 100644 src/writers/mod.rs create mode 100644 src/writers/ordered_bam_writer.rs create mode 100644 src/writers/ordered_vcf_writer.rs create mode 100644 src/writers/phase_stats.rs create mode 100644 src/writers/vcf_util.rs create mode 100644 test_data/header_only.bam create mode 100644 test_data/header_only.bam.bai create mode 100644 test_data/header_only.vcf.gz create mode 100644 test_data/header_only.vcf.gz.tbi create mode 100644 test_data/multi_smrtcell.bam create mode 100644 test_data/multi_smrtcell.bam.bai create mode 100644 test_data/multisample.bam create mode 100644 test_data/multisample.bam.bai create mode 100644 test_data/test_reference.fa create mode 100644 test_data/test_reference.fa.gz diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..25db964 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,1548 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "aho-corasick" +version = "0.7.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4f55bd91a0978cbfd91c457a164bab8b4001c833b7f323132c0a4e1922dd44e" +dependencies = [ + "memchr", +] + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anyhow" +version = "1.0.72" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b13c32d80ecc7ab747b80c3784bce54ee8a7a0cc4fbda9bf4cda2cf6fe90854" + +[[package]] +name = "approx" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cab112f0a86d568ea0e627cc1d6be74a1e9cd55214684db5561995f6dad897c6" +dependencies = [ + "num-traits", +] + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "bio" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd6b34e241a9992b9a896c7939147875818217e00a25fe0eeb0da234f0d7aafe" +dependencies = [ + "anyhow", + "approx", + "bio-types", + "bit-set", + "bv", + "bytecount", + "csv", + "custom_derive", + "enum-map", + "fxhash", + "getset", + "itertools", + "itertools-num", + "lazy_static", + "multimap", + 
"ndarray", + "newtype_derive", + "num-integer", + "num-traits", + "ordered-float", + "petgraph", + "rand", + "regex", + "serde", + "serde_derive", + "statrs", + "strum", + "strum_macros", + "thiserror", + "triple_accel", + "vec_map", +] + +[[package]] +name = "bio-types" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfa990f40a28735fa598dc3dd58d73e62e6b41458959d623903b927ba7b04c80" +dependencies = [ + "derive-new", + "lazy_static", + "regex", + "strum_macros", + "thiserror", +] + +[[package]] +name = "bit-set" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bstr" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + +[[package]] +name = "bumpalo" +version = "3.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b1ce199063694f33ffb7dd4e0ee620741495c32833cde5aa08f02a0bf96f0c8" + +[[package]] +name = "bv" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8834bb1d8ee5dc048ee3124f2c7c1afcc6bc9aed03f11e9dfd8c69470a5db340" +dependencies = [ + "feature-probe", + "serde", +] + +[[package]] +name = "bytecount" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" + +[[package]] +name = "bytemuck" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f5715e491b5a1598fc2bef5a606847b5dc1d48ea625bd3c02c00de8285591da" + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + +[[package]] +name = "cc" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" +dependencies = [ + "jobserver", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b" +dependencies = [ + "iana-time-zone", + "js-sys", + "num-integer", + "num-traits", + "time 0.1.45", + "wasm-bindgen", + "winapi", +] + +[[package]] +name = "clap" +version = "4.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69d64e88428747154bd8bc378d178377ef4dace7a5735ca1f3855be72f2c2cb5" +dependencies = [ + "atty", + "bitflags", + "clap_derive", + "clap_lex", + "once_cell", + "strsim", + "termcolor", +] + +[[package]] +name = "clap_derive" +version = "4.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c42f169caba89a7d512b5418b09864543eeb4d497416c917d7137863bd2076ad" +dependencies = [ + "heck", + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0d4198f73e42b4936b35b5bb248d81d2b595ecb170da0bac7655c54eedfa8da8" +dependencies = [ + "os_str_bytes", +] + +[[package]] +name = "cmake" +version = "0.1.49" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db34956e100b30725f2eb215f90d4871051239535632f84fea3bc92722c66b7c" +dependencies = [ + "cc", +] + +[[package]] +name = "codespan-reporting" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e" +dependencies = [ + "termcolor", + "unicode-width", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" + +[[package]] +name = "cpu-time" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9e393a7668fe1fad3075085b86c781883000b4ede868f43627b34a87c8b7ded" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "crc32fast" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "csv" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +dependencies = [ + "bstr", + "csv-core", + "itoa 0.4.8", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + +[[package]] +name = "custom_derive" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9" + 
+[[package]] +name = "cxx" +version = "1.0.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a140f260e6f3f79013b8bfc65e7ce630c9ab4388c6a89c71e07226f49487b72" +dependencies = [ + "cc", + "cxxbridge-flags", + "cxxbridge-macro", + "link-cplusplus", +] + +[[package]] +name = "cxx-build" +version = "1.0.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da6383f459341ea689374bf0a42979739dc421874f112ff26f829b8040b8e613" +dependencies = [ + "cc", + "codespan-reporting", + "once_cell", + "proc-macro2", + "quote", + "scratch", + "syn", +] + +[[package]] +name = "cxxbridge-flags" +version = "1.0.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90201c1a650e95ccff1c8c0bb5a343213bdd317c6e600a93075bca2eff54ec97" + +[[package]] +name = "cxxbridge-macro" +version = "1.0.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b75aed41bb2e6367cae39e6326ef817a851db13c13e4f3263714ca3cfb8de56" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "deranged" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7684a49fb1af197853ef7b2ee694bc1f5b4179556f1e5710e1760c5db6f5e929" + +[[package]] +name = "derive-new" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3418329ca0ad70234b9735dc4ceed10af4df60eff9c8e7b06cb5e520d92c3535" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "either" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797" + +[[package]] +name = "enum-map" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e893a7ba6116821058dec84a6fb14fb2a97cd8ce5fd0f85d5a4e760ecd7329d9" +dependencies = [ + "enum-map-derive", +] + +[[package]] +name = 
"enum-map-derive" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84278eae0af6e34ff6c1db44c11634a694aafac559ff3080e4db4e4ac35907aa" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "env_logger" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c90bf5f19754d10198ccb95b70664fc925bd1fc090a0fd9a6ebc54acc8cd6272" +dependencies = [ + "atty", + "humantime", + "log", + "regex", + "termcolor", +] + +[[package]] +name = "exitcode" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de853764b47027c2e862a995c34978ffa63c1501f2e15f987ba11bd4f9bba193" + +[[package]] +name = "feature-probe" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "835a3dc7d1ec9e75e2b5fb4ba75396837112d2060b03f7d43bc1897c7f7211da" + +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + +[[package]] +name = "flate2" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "form_urlencoded" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9c384f161156f5260c24a097c56119f9be8c798586aecc13afbcbe7b7e26bf8" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "fs-utils" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fc7a9dc005c944c98a935e7fd626faf5bf7e5a609f94bc13e42fc4a02e52593" +dependencies = [ + "quick-error", +] + +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "getrandom" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4eb1a864a501629691edf6c15a593b7a51eebaa1e8468e9ddc623de7c9b58ec6" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", +] + +[[package]] +name = "getset" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e45727250e75cc04ff2846a66397da8ef2b3db8e40e0cef4df67950a07621eb9" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "heck" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "hiphase" +version = "1.0.0" +dependencies = [ + "bio", + "bit-vec", + "chrono", + "clap", + "cpu-time", + "csv", + "env_logger", + "exitcode", + "flate2", + "lazy_static", + "log", + "priority-queue", + "rust-htslib", + "rustc-hash", + "serde", + "simple-error", + "threadpool", + "vergen", +] + +[[package]] +name = "hts-sys" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dba4fc406d3686926c84f98fd53026b625319d119e6056a40313862a6e3c4eb" 
+dependencies = [ + "cc", + "fs-utils", + "glob", + "libz-sys", +] + +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "iana-time-zone" +version = "0.1.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0722cd7114b7de04316e7ea5456a0bbb20e4adb46fd27a3697adb812cff0f37c" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca" +dependencies = [ + "cxx", + "cxx-build", +] + +[[package]] +name = "idna" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e14ddfc70884202db2244c223200c204c2bda1bc6e0998d11b5e024d657209e6" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "ieee754" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9007da9cacbd3e6343da136e98b0d2df013f553d35bdec8b518f07bea768e19c" + +[[package]] +name = "indexmap" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" +dependencies = [ + "autocfg", + "hashbrown", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itertools-num" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a872a22f9e6f7521ca557660adb96dd830e54f0f490fa115bb55dd69d38b27e7" +dependencies = [ + "num-traits", +] + +[[package]] +name = "itoa" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" + +[[package]] +name = "itoa" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" + +[[package]] +name = "jobserver" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "068b1ee6743e4d11fb9c6a1e6064b3693a1b600e7f5f5988047d98b3dc9fb90b" +dependencies = [ + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.135" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68783febc7782c6c5cb401fbda4de5a9898be1762314da0bb2c10ced61f18b0c" + +[[package]] +name = "libm" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "292a948cd991e376cf75541fe5b97a1081d713c618b4f1b9500f8844e49eb565" + +[[package]] +name = "libz-sys" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9702761c3935f8cc2f101793272e202c72b99da8f4224a19ddcf1279a6450bbf" +dependencies = [ + "cc", + "cmake", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "linear-map" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"bfae20f6b19ad527b550c223fddc3077a547fc70cda94b9b566575423fd303ee" + +[[package]] +name = "link-cplusplus" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5" +dependencies = [ + "cc", +] + +[[package]] +name = "log" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "matrixmultiply" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "add85d4dd35074e6fedc608f8c8f513a3548619a9024b751949ef0e8e45a4d84" +dependencies = [ + "rawpointer", +] + +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "miniz_oxide" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +dependencies = [ + "adler", +] + +[[package]] +name = "multimap" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +dependencies = [ + "serde", +] + +[[package]] +name = "nalgebra" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d506eb7e08d6329505faa8a3a00a5dcc6de9f76e0c77e4b75763ae3c770831ff" +dependencies = [ + "approx", + "matrixmultiply", + "nalgebra-macros", + "num-complex", + "num-rational", + "num-traits", + "rand", + "rand_distr", + "simba", + "typenum", +] + +[[package]] +name = "nalgebra-macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"01fcc0b8149b4632adc89ac3b7b31a12fb6099a0317a4eb2ebff574ef7de7218" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "ndarray" +version = "0.15.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "rawpointer", +] + +[[package]] +name = "newtype_derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac8cd24d9f185bb7223958d8c1ff7a961b74b1953fd05dba7cc568a63b3861ec" +dependencies = [ + "rustc_version", +] + +[[package]] +name = "num-complex" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ae39348c8bc5fbd7f40c727a9925f03517afd2ab27d46702108b6a7e5414c19" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "num_cpus" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "num_threads" +version = "0.1.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" +dependencies = [ + "libc", +] + +[[package]] +name = "once_cell" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1" + +[[package]] +name = "ordered-float" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f74e330193f90ec45e2b257fa3ef6df087784157ac1ad2c1e71c62837b03aa7" +dependencies = [ + "num-traits", +] + +[[package]] +name = "os_str_bytes" +version = "6.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ff7415e9ae3fff1225851df9e0d9e4e5479f947619774677a63572e55e80eff" + +[[package]] +name = "paste" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1de2e551fb905ac83f73f7aedf2f0cb4a0da7e35efa24a202a936269f1f18e1" + +[[package]] +name = "percent-encoding" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" + +[[package]] +name = "petgraph" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6d5014253a1331579ce62aa67443b4a658c5e7dd03d4bc6d302b94474888143" +dependencies = [ + "fixedbitset", + "indexmap", +] + +[[package]] +name = "pkg-config" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" + +[[package]] +name = "ppv-lite86" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" + +[[package]] +name = "priority-queue" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"815082d99af3acc75a3e67efd2a07f72e67b4e81b4344eb8ca34c6ebf3dfa9c5" +dependencies = [ + "autocfg", + "indexmap", +] + +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro2" +version = "1.0.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + +[[package]] +name = "quote" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand", +] + +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + +[[package]] +name = "regex" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" + +[[package]] +name = "regex-syntax" +version = "0.6.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" + +[[package]] +name = "rust-htslib" +version = "0.39.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "239ef7334dbf59acd56b7a6fa62a525ed7e36d6239a686ed4ff61bc794108e53" +dependencies = [ + "bio-types", + "byteorder", + "custom_derive", + "derive-new", + "hts-sys", + "ieee754", + "lazy_static", + "libc", + "linear-map", + "newtype_derive", + "regex", + "thiserror", + "url", +] + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustc_version" +version = "0.1.7" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5f5376ea5e30ce23c03eb77cbe4962b988deead10910c372b226388b594c084" +dependencies = [ + "semver", +] + +[[package]] +name = "rustversion" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" + +[[package]] +name = "ryu" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09" + +[[package]] +name = "safe_arch" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "794821e4ccb0d9f979512f9c1973480123f9bd62a90d74ab0f9426fcf8f4a529" +dependencies = [ + "bytemuck", +] + +[[package]] +name = "scratch" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1792db035ce95be60c3f8853017b3999209281c24e2ba5bc8e59bf97a0c590c1" + +[[package]] +name = "semver" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4f410fedcf71af0345d7607d246e7ad15faaadd49d240ee3b24e5dc21a820ac" + +[[package]] +name = "serde" +version = "1.0.147" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d193d69bae983fc11a79df82342761dfbf28a99fc8d203dca4c3c1b590948965" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.147" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f1d362ca8fc9c3e3a7484440752472d68a6caa98f1ab81d99b5dfe517cec852" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "simba" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0b7840f121a46d63066ee7a99fc81dcabbc6105e437cae43528cea199b5a05f" +dependencies = [ + "approx", + "num-complex", + "num-traits", + "paste", + "wide", +] + +[[package]] +name = 
"simple-error" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc47a29ce97772ca5c927f75bac34866b16d64e07f330c3248e2d7226623901b" + +[[package]] +name = "statrs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d08e5e1748192713cc281da8b16924fb46be7b0c2431854eadc785823e5696e" +dependencies = [ + "approx", + "lazy_static", + "nalgebra", + "num-traits", + "rand", +] + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "strum" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" + +[[package]] +name = "strum_macros" +version = "0.24.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + +[[package]] +name = "syn" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fcd952facd492f9be3ef0d0b7032a6e442ee9b361d4acc2b1d0c4aaa5f613a1" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "termcolor" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "thiserror" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10deb33631e3c9018b9baf9dcbbc4f737320d2b576bac10f6aefa048fa407e3e" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.37" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "982d17546b47146b28f7c22e3d08465f6b8903d0ea13c1660d9d84a6e7adcdbb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "threadpool" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa" +dependencies = [ + "num_cpus", +] + +[[package]] +name = "time" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" +dependencies = [ + "libc", + "wasi 0.10.0+wasi-snapshot-preview1", + "winapi", +] + +[[package]] +name = "time" +version = "0.3.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fdd63d58b18d663fbdf70e049f00a22c8e42be082203be7f26589213cd75ea" +dependencies = [ + "deranged", + "itoa 1.0.9", + "libc", + "num_threads", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" + +[[package]] +name = "time-macros" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb71511c991639bb078fd5bf97757e03914361c48100d52878b8e52b46fb92cd" +dependencies = [ + "time-core", +] + +[[package]] +name = "tinyvec" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" + +[[package]] +name = "triple_accel" +version = "0.4.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "22048bc95dfb2ffd05b1ff9a756290a009224b60b2f0e7525faeee7603851e63" + +[[package]] +name = "typenum" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" + +[[package]] +name = "unicode-bidi" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d54675592c1dbefd78cbd98db9bacd89886e1ca50692a0692baefffdeb92dd58" + +[[package]] +name = "unicode-ident" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" + +[[package]] +name = "unicode-normalization" +version = "0.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-width" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" + +[[package]] +name = "url" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d68c799ae75762b8c3fe375feb6600ef5602c883c5d21eb51c09f22b83c4643" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" +dependencies = [ + "serde", +] + +[[package]] +name = "vergen" +version = "8.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "bbc5ad0d9d26b2c49a5ab7da76c3e79d3ee37e7821799f8223fcb8f2f391a2e7" +dependencies = [ + "anyhow", + "rustversion", + "time 0.3.25", +] + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wasi" +version = "0.10.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.84" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.84" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.84" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.84" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = 
"wasm-bindgen-shared" +version = "0.2.84" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d" + +[[package]] +name = "wide" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae41ecad2489a1655c8ef8489444b0b113c0a0c795944a3572a0931cf7d2525c" +dependencies = [ + "bytemuck", + "safe_arch", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + 
"windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..28189e1 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,38 @@ +[package] +name = "hiphase" +version = "1.0.0" +authors = ["J. Matthew Holt "] +description = "A tool for phasing HiFi VCF files." 
+edition = "2021" +license-file="LICENSE.md" + +[build-dependencies] +vergen = { version = "8.2.4", features = ["git", "gitcl"] } + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +# Note: cmake is required, module load cmake/3.20.2 for initial compiles + +[dependencies] +bio = "1.2.0" +bit-vec = "0.6.3" +chrono = "0.4.24" +clap = { version = "4.0.13", features = ["derive"] } +cpu-time = "1.0.0" +csv = "1.1.6" +env_logger = "0.9.1" +exitcode = "1.1.2" +flate2 = "1.0.26" +lazy_static = "1.4.0" +log = "0.4.17" +priority-queue = "1.2.3" +# consider the older version if we run into build issues later +# rust-htslib = { version = "0.37.0", default-features = false } +rust-htslib = { version = "0.39.5", default-features = false, features = ["static"] } +rustc-hash = "1.1.0" +serde = "1.0.147" +simple-error = "0.2.3" +threadpool = "1.8.1" + +[profile.release] +lto = "fat" +codegen-units = 1 diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..afa5ed8 --- /dev/null +++ b/build.rs @@ -0,0 +1,19 @@ +use std::error::Error; +use vergen::EmitBuilder; + +fn main() -> Result<(), Box> { + EmitBuilder::builder() + .fail_on_error() + .all_git() + .git_describe(true, false, Some("ThisPatternShouldNotMatchAnythingEver")) + .emit()?; + + // emit build handles the git configuration and build.rs, but we also need to track the toml and src folder to catch dirty + println!("cargo:rerun-if-changed=Cargo.toml"); + println!("cargo:rerun-if-changed=src"); + + // uncomment if you ever want to easily see what this is emiting + // panic!(""); + + Ok(()) +} \ No newline at end of file diff --git a/src/astar_phaser.rs b/src/astar_phaser.rs new file mode 100644 index 0000000..bdbab4d --- /dev/null +++ b/src/astar_phaser.rs @@ -0,0 +1,788 @@ + +use crate::{block_gen::PhaseBlock, data_types::variants::VariantType}; +use crate::data_types::read_segments::ReadSegment; +use crate::data_types::variants::Variant; +use 
crate::writers::phase_stats::PhaseStats; + +use bio::data_structures::interval_tree::IntervalTree; +use log::{debug,trace}; +use priority_queue::PriorityQueue; +use std::cmp::Reverse; + +/// A node in the A* search tree. +#[derive(Eq,Hash,PartialEq)] +struct AstarNode { + /// The node index + node_index: u64, + /// The cost that is fixed that no longer needs to be re-computed. Corresponds to reads that overlap early parts of the haplotypes. + frozen_cost: u64, + /// The cost that needs to be re-computed, corresponds to reads that partially overlap this solution, but have more variants. + fluid_cost: u64, + /// An estimate of the remaining cost to extend this node to full. + heuristic_cost: u64, + /// The first haplotype in this node's solution. + h1: Vec, + /// The second haplotype in this node's solution. It can be identically to h1, but usually is not. + h2: Vec, + /// The number of heterozygous results in this node's solution. I.e. sum(h1[x] != h2[x]) + num_hets: u64 +} + +impl std::fmt::Debug for AstarNode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AstarNode") + .field("frozen_cost", &self.frozen_cost) + .field("fluid_cost", &self.fluid_cost) + .field("heuristic_cost", &self.heuristic_cost) + .field("hap.len()", &self.h1.len()) + .finish() + } +} + +impl AstarNode { + /// Returns a new empty haplotype node with a heuristic cost. + /// This should really only be used for a root node. + /// # Arguments + /// * `max_heuristic` - the estimate cost for the full phase block + pub fn new(max_heuristic: u64) -> AstarNode { + AstarNode { + node_index: 0, + frozen_cost: 0, + fluid_cost: 0, + heuristic_cost: max_heuristic, + h1: Default::default(), + h2: Default::default(), + num_hets: 0 + } + } + + /// This will create a newly extended node from haplotypes. + /// Heuristic cost must be provided, but actual cost will be calculated from the reads. 
+ /// # Arguments + /// * `node_index` - the index to use for this node, generally the index = the order of generating/encountering nodes + /// * `parent_node` - the parent node used to spawn this one in our search tree space + /// * `allele1` - the first allele, gets appended to haplotype 1 from parent + /// * `allele2` - the second allele, gets appened to haplotype 2 from parent + /// * `heuristic_cost` - the estimated cost of adding remaining variants to the haplotypes + /// * `read_segments` - all the reads that we need to evaluate the _actual_ cost so far + /// * `hap_offset` - the offset of the haplotype relative to read starts, only needed if solving subproblems + pub fn new_extended_node( + node_index: u64, + parent_node: &AstarNode, + allele1: u8, + allele2: u8, + heuristic_cost: u64, + read_segments: &IntervalTree, + hap_offset: usize + ) -> AstarNode { + // make sure we didn't goof + let mut h1: Vec = parent_node.get_h1().to_vec(); + h1.push(allele1); + let mut h2: Vec = parent_node.get_h2().to_vec(); + h2.push(allele2); + assert_eq!(h1.len(), h2.len()); + + // num hets = parent hets + 0/1 depending on allelic extensions + let num_hets = parent_node.get_num_hets()+(if allele1 == allele2 { 0 } else { 1 }); + + // copy the frozen cost and initial fluid to 0, these will get added to below + let mut frozen_cost = parent_node.get_frozen_cost(); + let mut fluid_cost = 0; + let hap_len = h1.len()+hap_offset; + for rs_interval in read_segments.find(hap_len-1..hap_len) { + // calculate the cost of this segment with this phase block so far + let rs = rs_interval.data(); + let rs_cost = std::cmp::min( + rs.score_partial_haplotype(&h1[..], hap_offset), + rs.score_partial_haplotype(&h2[..], hap_offset) + ); + + // determine where that cost gets assigned + if rs.last_allele() < hap_len { + // this one is frozen because the last allele was just added + frozen_cost += rs_cost; + } else { + // there are more alleles for this segment, so it's liquid still + fluid_cost += 
rs_cost; + } + } + + AstarNode { + node_index, + frozen_cost, + fluid_cost, + heuristic_cost, + h1, + h2, + num_hets + } + } + + pub fn get_frozen_cost(&self) -> u64 { + self.frozen_cost + } + + /// Returns the combined frozen, fluid, and heuristic cost of this node. + pub fn get_total_cost(&self) -> u64 { + self.frozen_cost + self.fluid_cost + self.heuristic_cost + } + + /// Priority is ranked by minimum total cost -> max number of hets -> earliest node index + pub fn get_priority(&self) -> (Reverse, u64, Reverse) { + (Reverse(self.get_total_cost()), self.num_hets, Reverse(self.node_index)) + } + + /// Returns a priority where cost is 0, this is primarily to trigger forced pruning + pub fn get_cleared_priority(&self) -> (Reverse, u64, Reverse) { + (Reverse(0), self.num_hets, Reverse(self.node_index)) + } + + pub fn get_h1(&self) -> &[u8] { + &self.h1[..] + } + + pub fn get_h2(&self) -> &[u8] { + &self.h2[..] + } + + pub fn get_allele_count(&self) -> usize { + self.h1.len() + } + + #[allow(dead_code)] + pub fn get_node_index(&self) -> u64 { + self.node_index + } + + pub fn get_num_hets(&self) -> u64 { + self.num_hets + } + + /// Returns true if the internal haplotypes are identical. + /// Usually only happens when near the root of the tree. + pub fn is_identical_haplotypes(&self) -> bool { + self.h1 == self.h2 + } +} + +/// Struct for tracking the length of haplotypes in our queue that are greater than some threshold. +/// This allows us to track how long a queue actually is when we know we will ignore events smaller than the threshold. +/// Currently, it does not contain the queue itself, which may be worth doing in the long term. 
+struct PQueueHapTracker { + /// The count of each haplotype size in the queue + length_counts: Vec, + /// The total number of haplotypes in the queue with length >= threshold + total_count: usize, + /// The minimum threshold for a haplotype to count + threshold: usize +} + +impl PQueueHapTracker { + /// Creates a new tracker with a given maximum haplotype size. + /// # Arguments + /// * `max_hap_length` - the maximum size of the haplotypes tracked + pub fn new(max_hap_length: usize) -> PQueueHapTracker { + PQueueHapTracker { + length_counts: vec![0; max_hap_length+1], + total_count: 0, + threshold: 0 + } + } + + /// Adds a haplotype length to our tracker + /// # Arguments + /// * `value` - the length of the haplotype getting tracked + pub fn add_hap(&mut self, value: usize) { + self.length_counts[value] += 1; + if value >= self.threshold { + self.total_count += 1; + } + } + + /// Removes a haplotype length from the tracker + /// # Arguments + /// * `value` - the length of the haplotype getting removed from tracking + pub fn remove_hap(&mut self, value: usize) { + assert!(self.length_counts[value] > 0); + self.length_counts[value] -= 1; + if value >= self.threshold { + assert!(self.total_count > 0); + self.total_count -= 1; + } + } + + /// Increased the threshold of what is included in our total count + /// # Arguments + /// * `new_threshold` - the new minimum threshold to track, must be >= current threshold + pub fn increase_threshold(&mut self, new_threshold: usize) { + assert!(new_threshold >= self.threshold); + trace!("increase_threshold => {}, size = {}", self.threshold, self.total_count); + for t in self.threshold..new_threshold { + self.total_count -= self.length_counts[t]; + } + self.threshold = new_threshold; + trace!("increase_threshold => {}, size = {}", self.threshold, self.total_count); + } + + /// Returns the total number of haplotypes in the queue with length >= the internal threshold. 
+ pub fn len(&self) -> usize { + self.total_count + } +} + +/// This calculates the heuristic estimates to be used by the full A* algorithm. +/// Given N alleles to phase, this will return a Vec with N+1 values, such that V[x] = H(x) where x is the number of set haplotype values. +/// This array is monotonically decreasing and always ends with 0. +/// Also returns a second boolean array indicating a variant is "problematic" and should be ignored (currently disabled, controlled via `bad_variants_enabled` constant). +/// This implementation uses the A* algorithm to generate the heuristic. +/// See `astar_subsolver(...)` for details. +/// # Arguments +/// * `num_variants` - the number of variants in the phase block; run-time grows linearly as the number of variants increases +/// * `max_segment_size` - the maximum number of variants to use when calculating sub-block heuristics; run-time grows a least linearly with this value +/// * `read_segments` - the reads to use when calculating the heuristics costs; run-time grows linearly with the length of this data +/// * `min_queue_size` - the minimum length of the queue +/// * `queue_increment` - the length that the queue grows as more variants are added to the solution +/// * `opt_bad_variants` - an optional set of "bad" or "ignored" variants that we should just ignore from the start +fn calculate_astar_heuristic( + num_variants: usize, max_segment_size: usize, read_segments: &IntervalTree, + min_queue_size: usize, queue_increment: usize, opt_bad_variants: Option> +) -> (Vec, Vec) { + assert!(max_segment_size >= 2); + // an extra slot is included because it makes some checks go away later + let mut heuristics: Vec = vec![0; num_variants+1]; + let mut bad_variants: Vec = match opt_bad_variants { + Some(obv) => { + assert_eq!(obv.len(), num_variants); + obv + }, + None => vec![false; num_variants] + }; + let bad_variants_enabled = false; + let mut max_clip_size: usize = 1; + for v_index in (0..num_variants).rev() { + 
debug!("solving subproblem {}..{}", v_index, v_index+max_clip_size); + let (max_estimate, solve_size): (u64, usize) = astar_subsolver( + v_index, max_clip_size, read_segments, &heuristics[..], &bad_variants[..], + min_queue_size / 10, queue_increment + ); + assert!(solve_size >= max_clip_size.min(2)); + debug!(" estimate => {} (solution distance => {}/{})", max_estimate, solve_size, max_clip_size); + + // monotonically decreasing so it's either + // 1) the cost of the next entry OR + // 2) this cost of solving this problem + the cost of solving everything after this problem + if bad_variants_enabled && solve_size < max_clip_size { + // we couldn't successfully form a partial haplotype + bad_variants[v_index] = true; + } + + if bad_variants[v_index] { + // this one was a bad variant that will get ignored (e.g., 0-cost), just copy the heuristic + heuristics[v_index] = heuristics[v_index+1]; + } else { + // we successfully navigated estimation from this point + assert!(max_estimate >= heuristics[v_index+1]); + heuristics[v_index] = max_estimate; + } + + max_clip_size = (solve_size+1).min(max_segment_size); + } + + (heuristics, bad_variants) +} + +/// An unpruned A* solver that will attempt to find the best path for a sub-problem. +/// It uses the heuristic estimates downstream to calculate this ones largest estimate. +/// For problem_size, `p`, and problem_offset, `o`, it will calculate max(`best_path(o..o+x)+H[o+x]`, for all `0 <= x <= p`). +/// This is basically the worst-case combination of two partial solutions within this region. +/// There is a very small queue used by this solver and if it reaches a maximum capacity, it will exit early without fully estimating. +/// However, as the solver goes, it will calculate the maximum heuristic encountered so far even if `x` does not get all the way to `p`. +/// While this estimator may exit early, it is not pruned, so you are guaranteed to find the best heuristic up to the point it exits. 
+/// In practice, most of them fully reach the problem size except in the problematic areas. +/// Returns tuple `(max_cost, farthest_estimate)` where max_cost is the highest heuristic cost found and farthest_estimate is how far the A* algorithm made it through the subproblem. +/// # Arguments +/// * `problem_offset` - the offset into the problem we are solving +/// * `problem_size` - the length of the sub-problem we are solving, usually constant except at the tail +/// * `read_segments` - the reads that are used to measure costs +/// * `heuristic_costs` - the heuristic costs so far, everything in `problem_offset+1..` is assumed to be already populated +/// * `bad_variants` - the bad variants encountered so far, if one is true, it will basically be ignored +/// * `min_queue_size` - the minimum length of the queue +/// * `queue_increment` - the length that the queue grows as more variants are added to the solution +fn astar_subsolver( + problem_offset: usize, problem_size: usize, read_segments: &IntervalTree, + heuristic_costs: &[u64], bad_variants: &[bool], min_queue_size: usize, queue_increment: usize +) -> (u64, usize) { + // now, the core looping algorithm + let mut pqueue: PriorityQueue, u64, Reverse)> = PriorityQueue::new(); + let mut next_node_index: u64 = 1; + + // this heuristic _should_ always be 0, because we're trying to calculate it + assert_eq!(heuristic_costs[problem_offset], 0); + // initialize with our neighbor heuristic (heuristic_costs is 1 longer than necessary, so no check needed here) + let initial_estimate = heuristic_costs[problem_offset+1]; + + // initialize this subsolver with the estimate from the _next_ node; it has to be >= that value + let initial_node = AstarNode::new(initial_estimate); + let initial_priority = initial_node.get_priority(); + pqueue.push(initial_node, initial_priority); + + let mut next_expected: usize = 0; + let mut max_cost_so_far: u64 = 0; + + // we want a base level queue length, but it also needs to grow *slightly* 
with the length of the problem + let max_visits: usize = min_queue_size + queue_increment * problem_size; + let mut nodes_visited: usize = 0; + + //we loop as long as the next entry is shorter than the problem size + while pqueue.peek().unwrap().0.get_allele_count() < problem_size && nodes_visited < max_visits { + let (top_node, _top_priority) = pqueue.pop().unwrap(); + let allele_count: usize = top_node.get_allele_count(); + nodes_visited += 1; + + if allele_count == next_expected { + // debug!("B#{} ({}/{}, {:?} {}/{}) => {:?}", phase_block.get_block_index(), next_expected, num_variants, top_priority, hap_tracker.len(), pqueue.len(), top_node); + max_cost_so_far = max_cost_so_far.max(top_node.get_total_cost()); + next_expected += 1; + } + + if bad_variants[problem_offset+allele_count] { + // the next variant we want to add to the haplotype is a bad variant, so skip it + let new_node = AstarNode::new_extended_node( + next_node_index, + &top_node, 2, 2, + heuristic_costs[problem_offset+allele_count+1], + read_segments, + problem_offset + ); + next_node_index += 1; + + // the new node should have identical total cost + assert_eq!(top_node.get_total_cost(), new_node.get_total_cost()); + + // add it to the queue + let new_priority = new_node.get_priority(); + pqueue.push(new_node, new_priority); + } else { + // we didn't exit, so we need to add all expansions of this allele + let hap_order = [(0, 1), (1, 0), (0, 0), (1, 1)]; + for &(h1, h2) in hap_order.iter() { + // we don't want to add both 0-1 and 1-0 if the haplotypes before are identical; it doubles our work + // so skip 1-0 if the haplotypes in the node are identical + if !(h1 == 1 && h2 == 0 && top_node.is_identical_haplotypes()) { + // generate a new node and add to the queue + let new_node = AstarNode::new_extended_node( + next_node_index, + &top_node, h1, h2, + heuristic_costs[problem_offset+allele_count+1], + read_segments, + problem_offset + ); + next_node_index += 1; + + // add it to the queue + let 
new_priority = new_node.get_priority(); + pqueue.push(new_node, new_priority); + } + } + } + } + + if pqueue.peek().unwrap().0.get_allele_count() == problem_size { + // loop terminated because we reached our problem size + let (top_node, _top_priority) = pqueue.peek().unwrap();//pqueue.pop().unwrap(); + max_cost_so_far = max_cost_so_far.max(top_node.get_total_cost()); + next_expected += 1; + } else { + // we exited early, so whatever max we found is what we have + } + + (max_cost_so_far, next_expected-1) +} + +/// A result for a phasing algorithm, assumes diploid solution currently. +pub struct AstarResult { + /// The first haplotype in the solution. + pub haplotype_1: Vec, + /// The second haplotype in the solution. + pub haplotype_2: Vec, + /// Optional statistics from the problem + pub statistics: PhaseStats +} + +/// Returns a phasing result by performing an A* tree search algorithm to calculate the best phase solution for a phase block. +/// This algorithm currently uses a fixed heuristic based on the distance to the end. +/// It also has a pruning strategy that fixes the priority queue size based on the farthest explored node so far. 
+/// # Arguments +/// * `phase_block` - the phase block summary information +/// * `variants` - the variants in the block, primarily passed-through to solution +/// * `read_segments` - interval tree of the reads that serve as data points for the phasing algorithm +/// * `min_queue_size` - the minimum length of the queue +/// * `queue_increment` - the length that the queue grows as more variants are added to the solution +pub fn astar_solver( + phase_block: &PhaseBlock, variants: &[Variant], read_segments: &IntervalTree, + min_queue_size: usize, queue_increment: usize +) -> AstarResult { + // we use this a lot + let num_variants: usize = variants.len(); + + // this is a sanity check for now that verifies that all read segments have all ignored variants set to 3 in that position + // there is technically a cost here, but it is negligible for our sanity while debugging + for rse in read_segments.find(0..usize::MAX) { + let segment = rse.data(); + let alleles = segment.alleles(); + for (var_index, v) in variants.iter().enumerate() { + if v.is_ignored() { + assert!(alleles[var_index] == 3); + } + } + } + + // add all the bad variants to this list that will seed the heuristic calculation list + let bad_variants: Vec = variants.iter() + .map(|v| v.is_ignored()) + .collect(); + let opt_bad_variants: Option> = Some(bad_variants); + + // our queue has a flat size + a buffer for each variant encountered so far + let mut curr_queue_size_threshold: usize = min_queue_size; + let full_prune_enabled: bool = true; // this will make sure the pqueue.len() field stays below the `max_queue_size` for conserving memory + + // set max queue size to either 2 * the highest functional queue size OR a large constant, whichever is greater + // the large constant help prevent over-use of the full pruning algorithm if the base queue values are small + let max_queue_size: usize = (2 * (min_queue_size + queue_increment * num_variants)).max(10000); + let mut min_progress: usize = 0; + let mut 
pqueue: PriorityQueue, u64, Reverse)> = PriorityQueue::new(); + let mut hap_tracker: PQueueHapTracker = PQueueHapTracker::new(num_variants); + let mut next_expected = 0; + + // calculate a heuristic by looking ahead 40 variants + // TODO: do we want to make this a parameter at some point? + let max_segment_size: usize = 40; + let (heuristic_costs, bad_variants): (Vec, Vec) = calculate_astar_heuristic( + num_variants, max_segment_size, read_segments, min_queue_size, queue_increment, opt_bad_variants + ); + debug!("Heuristics(<={}): {:?}", max_segment_size, heuristic_costs); + debug!("Bad variants: {:?}", bad_variants); + + // sanity check, make sure any ignore variants are flagged in bad_variants now + for (i, v) in variants.iter().enumerate() { + if v.is_ignored() { + assert!(bad_variants[i]); + } + } + + // statistics we want to gather + let mut num_pruned: u64 = 0; + let estimated_cost: u64 = heuristic_costs[0]; + + // now, the core looping algorithm + let initial_node = AstarNode::new(heuristic_costs[0]); + let initial_priority = initial_node.get_priority(); + pqueue.push(initial_node, initial_priority); + hap_tracker.add_hap(0); + let mut next_node_index: u64 = 1; + + //we loop as long as the next entry is shorter than the expected number of variants + while pqueue.peek().unwrap().0.get_allele_count() < num_variants { + let (top_node, top_priority) = pqueue.pop().unwrap(); + let allele_count: usize = top_node.get_allele_count(); + hap_tracker.remove_hap(allele_count); + trace!("popped: {:?} => {:?}, {:?}, {:?}", top_priority, top_node, top_node.get_h1(), top_node.get_h2()); + if allele_count == next_expected { + debug!("B#{} ({}/{}, {:?} {}/{}) => {:?}", phase_block.get_block_index(), next_expected, num_variants, top_priority, hap_tracker.len(), pqueue.len(), top_node); + next_expected += 1; + if num_pruned == 0 { + curr_queue_size_threshold += queue_increment; + assert_eq!(curr_queue_size_threshold, min_queue_size + queue_increment * next_expected); + } + } + 
+ // if a node has fewer alleles than our minimum progression, it gets cut + if allele_count < min_progress { + //println!("Pruning {:?} {:?}", top_priority, top_node); + if num_pruned == 0 { + // first time we do this, we need to clear our queue size back to the minimum + curr_queue_size_threshold = min_queue_size; + } + num_pruned += 1; + continue; + } + + if bad_variants[allele_count] { + // we are skipping this variant, generate a new node with unassigned values and add it back to the queue + let new_node = AstarNode::new_extended_node( + next_node_index, + &top_node, 2, 2, + heuristic_costs[allele_count+1], + read_segments, + 0 + ); + next_node_index += 1; + + let new_priority = new_node.get_priority(); + assert_eq!(top_node.get_total_cost(), new_node.get_total_cost()); + pqueue.push(new_node, new_priority); + hap_tracker.add_hap(allele_count+1); + } else { + // we didn't exit, so we need to add all expansions of this allele + // these are ordered such that heterozygous options come first + let hap_order = [(0, 1), (1, 0), (0, 0), (1, 1)]; + for &(h1, h2) in hap_order.iter() { + // we don't want to add both 0-1 and 1-0 if the haplotypes before are identical; it doubles our work + // so skip 1-0 if the haplotypes in the node are identical + if !(h1 == 1 && h2 == 0 && top_node.is_identical_haplotypes()) { + // generate a new node and add to the queue + let new_node = AstarNode::new_extended_node( + next_node_index, + &top_node, h1, h2, + heuristic_costs[allele_count+1], + read_segments, + 0 + ); + next_node_index += 1; + + let new_priority = new_node.get_priority(); + trace!("Pushing {:?}, {:?}, {:?}", new_node, new_node.get_h1(), new_node.get_h2()); + pqueue.push(new_node, new_priority); + hap_tracker.add_hap(allele_count+1); + } + } + } + + // check if we need to increase our minimum progression to prune off some nodes in the queue + if hap_tracker.len() > curr_queue_size_threshold && min_progress < next_expected { + min_progress += 1; + debug!("B#{} 
min_progress={}", phase_block.get_block_index(), min_progress); + hap_tracker.increase_threshold(min_progress); + + // check if our literal queue is holding too much data + if full_prune_enabled && pqueue.len() > max_queue_size { + // this means there are a lot of short haps we will eventually prune but haven't yet due to relative high cost + // mark them as the "best" priority so they get deleted right away in the next few loops + // this can be expensive, and has no functional benefit other than freeing memory + // worst case HG001 test without this got up to 60GB + let mut prune_count: usize = 0; + for (node, priority) in pqueue.iter_mut() { + if node.get_allele_count() < min_progress { + *priority = node.get_cleared_priority(); + prune_count += 1; + } + } + // when the iter_mut() is released, the pqueue automatically re-prioritizes itself, magic! + debug!("B#{} Full prune triggered for {} nodes.", phase_block.get_block_index(), prune_count); + } + } + } + + let (top_node, top_priority) = pqueue.pop().unwrap(); + let allele_count: usize = top_node.get_allele_count(); + hap_tracker.remove_hap(allele_count); + if allele_count == num_variants { + // successful full solution + debug!("B#{} ({}/{}, {:?} {}/{}) => {:?}", phase_block.get_block_index(), next_expected, num_variants, top_priority, hap_tracker.len(), pqueue.len(), top_node); + let haplotype_1 = top_node.get_h1().to_vec(); + let haplotype_2 = top_node.get_h2().to_vec(); + let actual_cost: u64 = top_node.get_total_cost(); + + // gather stats on how many variants were phased, skipped, or homozygous in this block + let mut phased_variants = 0; + let mut phased_snvs = 0; + let mut homozygous_variants = 0; + let mut skipped_variants = 0; + for (i, (&h1, &h2)) in haplotype_1.iter().zip(haplotype_2.iter()).enumerate() { + if h1 != h2 { + phased_variants += 1; + if variants[i].get_type() == VariantType::Snv { + phased_snvs += 1; + } + } else if h1 == 2 { + // they are both equal to 2 + skipped_variants += 1; + } 
else { + homozygous_variants += 1; + } + } + debug!("B#{} phased: {}, homozygous: {}, skipped: {}", phase_block.get_block_index(), phased_variants, homozygous_variants, skipped_variants); + + let statistics: PhaseStats = PhaseStats::astar_new( + num_pruned, estimated_cost, actual_cost, + phased_variants, phased_snvs, homozygous_variants, skipped_variants + ); + + // send it all back + AstarResult { + haplotype_1, + haplotype_2, + statistics + } + } else { + // given our current algorithm setup, this should never happen; we will always find _locally_ optimal paths through the tree + panic!("B#{} failed to find solution; ({}/{}), {:?} {} => {:?}", phase_block.get_block_index(), next_expected, num_variants, top_priority, pqueue.len(), top_node); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Returns two full-length reads: the first is all 0s with qual 2 and a second is all 1s with qual 3 + /// # Arguments + /// * `num_alleles` - the length of each read + fn get_simple_reads(num_alleles: usize) -> IntervalTree { + let rs1 = ReadSegment::new( + "read_name".to_string(), + vec![0; num_alleles], + vec![2; num_alleles] + ); + let rs2 = ReadSegment::new( + "read_name_2".to_string(), + vec![1; num_alleles], + vec![3; num_alleles] + ); + let seg_vec = vec![rs1, rs2]; + + let mut read_segments: IntervalTree = IntervalTree::new(); + for rs in seg_vec.into_iter() { + read_segments.insert(rs.get_range(), rs); + } + read_segments + } + + #[test] + fn test_astarnode() { + // all 0 read with quals = 2 and an all 1 read with quals = 3 + let num_alleles = 4; + + // it doesn't matter that these are wrong for testing + let heuristic_costs: Vec = (0..(num_alleles+1)).map(|i| (num_alleles - i) as u64).collect(); + let hap_offset = 0; + let read_segments = get_simple_reads(num_alleles); + + // test an all 0-hom mode + let mut current_node = AstarNode::new(heuristic_costs[0]); + for i in 0..num_alleles { + let node_index: u64 = (i as u64)+1; + let num_hets: u64 = 0; + let 
next_node = AstarNode::new_extended_node( + node_index, + ¤t_node, 0, 0, + heuristic_costs[i+1], + &read_segments, + hap_offset + ); + + let expected_cost = heuristic_costs[i+1] + 3 * node_index; + assert_eq!(next_node.get_node_index(), node_index); + + let expected_frozen = if i == num_alleles - 1 { + 3 * num_alleles as u64 + } else { + 0 + }; + assert_eq!(next_node.get_frozen_cost(), expected_frozen); + assert_eq!(next_node.get_total_cost(), expected_cost); + assert_eq!(next_node.get_priority(), (Reverse(expected_cost), num_hets, Reverse(node_index))); + assert_eq!(next_node.get_cleared_priority(), (Reverse(0), num_hets, Reverse(node_index))); + assert_eq!(next_node.get_h1().to_vec(), vec![0; i+1]); + assert_eq!(next_node.get_h2().to_vec(), vec![0; i+1]); + assert_eq!(next_node.get_allele_count(), i+1); + assert_eq!(next_node.get_num_hets(), num_hets); + assert!(next_node.is_identical_haplotypes()); + current_node = next_node; + } + + // test an all het (0|1) mode + let mut current_node = AstarNode::new(heuristic_costs[0]); + for i in 0..num_alleles { + let node_index: u64 = (i as u64)+1; + let num_hets: u64 = node_index; + let next_node = AstarNode::new_extended_node( + node_index, + ¤t_node, 0, 1, + heuristic_costs[i+1], + &read_segments, + hap_offset + ); + + let expected_cost = heuristic_costs[i+1]; + assert_eq!(next_node.get_node_index(), node_index); + + let expected_frozen = 0; + assert_eq!(next_node.get_frozen_cost(), expected_frozen); + assert_eq!(next_node.get_total_cost(), expected_cost); + assert_eq!(next_node.get_priority(), (Reverse(expected_cost), num_hets, Reverse(node_index))); + assert_eq!(next_node.get_cleared_priority(), (Reverse(0), num_hets, Reverse(node_index))); + assert_eq!(next_node.get_h1().to_vec(), vec![0; i+1]); + assert_eq!(next_node.get_h2().to_vec(), vec![1; i+1]); + assert_eq!(next_node.get_allele_count(), i+1); + assert_eq!(next_node.get_num_hets(), num_hets); + assert!(!next_node.is_identical_haplotypes()); + current_node = 
next_node; + } + + // test an all 1-hom mode + let mut current_node = AstarNode::new(heuristic_costs[0]); + for i in 0..num_alleles { + let node_index: u64 = (i as u64)+1; + let num_hets: u64 = 0; + let next_node = AstarNode::new_extended_node( + node_index, + ¤t_node, 1, 1, + heuristic_costs[i+1], + &read_segments, + hap_offset + ); + + let expected_cost = heuristic_costs[i+1] + 2 * node_index; + assert_eq!(next_node.get_node_index(), node_index); + + let expected_frozen = if i == num_alleles - 1 { + 2 * num_alleles as u64 + } else { + 0 + }; + assert_eq!(next_node.get_frozen_cost(), expected_frozen); + assert_eq!(next_node.get_total_cost(), expected_cost); + assert_eq!(next_node.get_priority(), (Reverse(expected_cost), num_hets, Reverse(node_index))); + assert_eq!(next_node.get_cleared_priority(), (Reverse(0), num_hets, Reverse(node_index))); + assert_eq!(next_node.get_h1().to_vec(), vec![1; i+1]); + assert_eq!(next_node.get_h2().to_vec(), vec![1; i+1]); + assert_eq!(next_node.get_allele_count(), i+1); + assert_eq!(next_node.get_num_hets(), num_hets); + assert!(next_node.is_identical_haplotypes()); + current_node = next_node; + } + } + + #[test] + fn test_pqueuehaptracker() { + let mut tracker: PQueueHapTracker = PQueueHapTracker::new(10); + for i in 0..11 { + tracker.add_hap(i); + } + // make sure length matches everything so far + assert_eq!(tracker.len(), 11); + + // try basic subtraction + tracker.remove_hap(3); + assert_eq!(tracker.len(), 10); + + // try threshold increasing + tracker.increase_threshold(4); + assert_eq!(tracker.len(), 7); + + // make sure removal of pre-threshold data doesn't change our len() + for i in 0..3 { + tracker.remove_hap(i); + assert_eq!(tracker.len(), 7); + } + + // make sure adding pre-threshold data doeesn't change our len() + tracker.add_hap(0); + assert_eq!(tracker.len(), 7); + + // make sure increasing threshold to the same value does nothing + tracker.increase_threshold(4); + assert_eq!(tracker.len(), 7); + } +} \ No newline 
at end of file diff --git a/src/block_gen.rs b/src/block_gen.rs new file mode 100644 index 0000000..5d86ea8 --- /dev/null +++ b/src/block_gen.rs @@ -0,0 +1,1063 @@ + +use crate::data_types::variants::{VariantType, Zygosity}; + +use log::{debug, trace, warn}; +use priority_queue::PriorityQueue; +use rust_htslib::{bam,bcf,htslib}; +use rust_htslib::bcf::record::GenotypeAllele; +use rustc_hash::{FxHashMap as HashMap, FxHashSet as HashSet}; +use simple_error::{SimpleError, bail}; +use std::cell::RefCell; +use std::cmp::Reverse; +use std::path::{Path, PathBuf}; + +/// Uses for phase block priority during multi iteration +type PhaseBlockPriority = Reverse<(u32, u64, u64)>; + +/// Gets a list of sample names from a given VCF file +/// # Arguments +/// * `filename` - the VCF file to load +/// # Errors +/// * if the filename fails to load as a VCF +/// * if the sample name fails to parse from utf8 +pub fn get_vcf_samples(filename: &Path) -> Result, Box> { + use rust_htslib::bcf::Read; + let vcf_reader: bcf::IndexedReader = bcf::IndexedReader::from_path(filename)?; + let vcf_header: bcf::header::HeaderView = vcf_reader.header().clone(); + let mut sample_names = vec![]; + for sv in vcf_header.samples().iter() { + let vcf_sample_string: String = std::str::from_utf8(sv)?.to_string(); + sample_names.push(vcf_sample_string); + } + Ok(sample_names) +} + +/// Iterates through a collection of BAM files and finds the ones matching the given sample name +/// # Arguments +/// * `all_bam_files` - the full list of BAM files to parse +/// * `sample_name` - the name we want to match +/// * `reference_filename` - the path to the reference file +/// # Errors +/// * if BAM has no RG tags +/// * if an RG tag has no SM tag +/// * if multiple SM tags are detected +pub fn get_sample_bams(all_bam_files: &[PathBuf], sample_name: &str, reference_filename: &Path) -> Result<(Vec, Vec), Box> { + use rust_htslib::bam::Read; + let mut sample_bams: Vec = vec![]; + let mut bam_indices: Vec = vec![]; + for 
(bam_index, bam_fn) in all_bam_files.iter().enumerate() { + let bam_reader = { + let mut b = bam::IndexedReader::from_path(bam_fn)?; + b.set_reference(reference_filename)?; + b + }; + let bam_header = bam::header::Header::from_template(bam_reader.header()); + let header_hashmap = bam_header.to_hashmap(); + let empty_vec = vec![]; + let read_groups = header_hashmap.get("RG").unwrap_or(&empty_vec); + if read_groups.is_empty() { + bail!("BAM file has no read groups (RG) tag: {}", bam_fn.to_string_lossy()); + } + + // there is only one read group + let mut bam_sample_name: Option = None; + for read_group in read_groups.iter() { + let rg_sample_name = match read_group.get("SM") { + Some(s) => s, + None => { + bail!("BAM file has read group with no sample name (SM) tag: {}", bam_fn.to_string_lossy()); + } + }; + match bam_sample_name.as_ref() { + Some(s) => { + if rg_sample_name != s { + bail!("BAM file with multiple sample reads groups detected, this is not supported: {}", bam_fn.to_string_lossy()); + } + }, + None => { + bam_sample_name = Some(rg_sample_name.clone()); + } + }; + } + + if bam_sample_name.unwrap() == sample_name { + sample_bams.push(bam_fn.clone()); + bam_indices.push(bam_index); + } + } + Ok((sample_bams, bam_indices)) +} + +/// Returns true if an alignment record should be filtered out, aka ignored. +/// Main reasons for ignoring are if it is unmapped, secondary, failed QC, a duplicate, or has too low of a MAPQ. +/// # Arguments +/// * `record` - the record of interest +/// * `min_mapq` - the minimum MAPQ we allow +pub fn filter_out_alignment_record(record: &bam::Record, min_mapq: u8) -> bool { + static FLAG_FILTER: u32 = + htslib::BAM_FUNMAP | htslib::BAM_FSECONDARY | htslib::BAM_FQCFAIL | htslib::BAM_FDUP; + + ((record.flags() as u32) & FLAG_FILTER) != 0 || record.mapq() < min_mapq +} + +/// Returns true if a VCF record can be included in phasing based on provided criteria. 
+/// Homozygous reference variants are always excluded unless `is_hom_allowed==true`. +/// Calls with anything missing (e.g. "./.") are also excluded. +/// Calls that match an unknown/unhandled variant type are also excluded. +/// # Arguments +/// * `record` - the variant record to check +/// * `sample_index` - the sample index, always 0 for single-sample VCFs +/// * `min_quality` - the minimum GQ for an acceptable variant +/// * `is_hom_allowed` - if true, then homozygous ALT variants are allowed, provided they meet the other criteria +/// # Errors +/// * if zygosity cannot be loaded +/// * if a call does not have a GQ tag +pub fn is_phasable_variant(record: &bcf::Record, sample_index: usize, min_quality: i32, is_hom_allowed: bool) -> Result> { + // check if this variant is heterozygous + let zygosity: Zygosity = get_variant_zygosity(record, sample_index)?; + if zygosity == Zygosity::Unknown || zygosity == Zygosity::HomozygousReference || ( + zygosity == Zygosity::HomozygousAlternate && !is_hom_allowed + ) { + // if unknown or homozygous reference, we definitely return false + // if it's homozygous alternate, we also need to check if homs are allowed + Ok(false) + } else { + // heterozygous, check if the variant call is of sufficient quality + match record.format(b"GQ").integer() { // for some reason, calling .float() here will error + Ok(all_gq) => { + let call_quality: i32 = all_gq[sample_index][0]; + if call_quality < min_quality { + return Ok(false); + } + }, + Err(_) => { + // usually means there is not a GQ tag, so skip this check + // TODO: how do we long-term want to handle this for our variants? + // trace! 
added mostly so clippy stops yelling at me + trace!("Variant found without GQ tag {:?}", record); + } + } + + // heterozygous variant, check that the type is allowed + let variant_type = get_variant_type(record)?; + match variant_type { + VariantType::Snv | + VariantType::Insertion | + VariantType::Deletion | + VariantType::Indel | + VariantType::SvInsertion | + VariantType::SvDeletion | + VariantType::TandemRepeat => { Ok(true) }, + + VariantType::SvDuplication | + VariantType::SvInversion | + VariantType::SvBreakend | + VariantType::Unknown=> { Ok(false) } + } + } +} + +/// Looks at a bcf record and return the zygosity. Any "." alleles lead to Unknown zygosity results. +/// # Arguments +/// * `record` - the record to parse +/// * `sample_index` - the sample index, always 0 for single-sample VCFs +/// # Errors +/// * if rust_htslib fails to parse the genotype +/// * if the genotype field is completely empty +pub fn get_variant_zygosity(record: &bcf::Record, sample_index: usize) -> Result> { + let all_genotypes = record.genotypes()?; + let genotype = all_genotypes.get(sample_index); + + // if the genotype field is completely empty, something is wrong with the VCF + if genotype.is_empty() { + let chromosome = match record.rid() { + Some(rid) => { + let header = record.header(); + match header.rid2name(rid) { + Ok(name) => std::str::from_utf8(name).unwrap_or("FROMUTF8_ERROR"), + Err(_e) => "RID2NAME_ERROR" + } + }, + None => "NO_RID" + }; + bail!("Encountered empty GT field for record: {}:{}", chromosome, record.pos()); + } + + let gt1 = match genotype[0] { + GenotypeAllele::Unphased(at) => at, + GenotypeAllele::Phased(at) => at, + //TODO: ignore these for now, not sure how to handle it? 
+ GenotypeAllele::UnphasedMissing => return Ok(Zygosity::Unknown), + GenotypeAllele::PhasedMissing => return Ok(Zygosity::Unknown) + }; + + let gt2 = if genotype.len() == 1 { + // if the genotype has only one entry, we will just assume that gt2 is identical to gt1 + // this basically converts all single-entry genotypes into some Homozygous state + gt1 + } else { + match genotype[1] { + GenotypeAllele::Unphased(at) => at, + GenotypeAllele::Phased(at) => at, + //TODO: ignore these for now, not sure how to handle it? + GenotypeAllele::UnphasedMissing => return Ok(Zygosity::Unknown), + GenotypeAllele::PhasedMissing => return Ok(Zygosity::Unknown) + } + }; + let zygosity = if gt1 == gt2 { + if gt1 == 0 { + Zygosity::HomozygousReference + } else { + Zygosity::HomozygousAlternate + } + } else { + Zygosity::Heterozygous + }; + Ok(zygosity) +} + +/// Returns a variant type based on the alleles in the VCF. +/// # Arguments +/// * `record` - the variant record to check +pub fn get_variant_type(record: &bcf::Record) -> Result> { + // check if this has an SVTYPE field and parse into an SV type if it does + let svtype_result = record.info("SVTYPE".as_bytes()).string(); + match svtype_result { + Ok(svtype_option) => { + if let Some(svtype) = svtype_option { + // svtype is an array of strings at this point, make sure we only get one + assert_eq!(svtype.len(), 1); + + // make sure these only have one ALT allele + let num_alleles = record.alleles().len(); + assert_eq!(num_alleles, 2); + + let svtype_str = std::str::from_utf8(svtype[0]).unwrap(); + let sv_tag = match svtype_str { + "DEL" => { + VariantType::SvDeletion + }, + "INS" => { + VariantType::SvInsertion + }, + "DUP" => { + VariantType::SvDuplication + }, + "INV" => { + VariantType::SvInversion + }, + "BND" => { + VariantType::SvBreakend + }, + _ => { + bail!("Unhandled SVTYPE tag: {:?}", svtype_str); + } + }; + return Ok(sv_tag); + }; + }, + Err(rust_htslib::errors::Error::BcfUndefinedTag{ tag: _ }) => {}, + Err(e) => { + // 
no SVTYPE entry, so we assume it matches SNV or indel models + bail!("Unexpected error: {:?}", e); + } + } + + let trid_result = record.info("TRID".as_bytes()).string(); + match trid_result { + Ok(_trid) => { + // we found a TRID field this is a tandem repeat + return Ok(VariantType::TandemRepeat); + }, + Err(rust_htslib::errors::Error::BcfUndefinedTag{ tag: _ }) => {}, + Err(e) => { + // no SVTYPE entry, so we assume it matches SNV or indel models + bail!("Unexpected error: {:?}", e); + } + } + // TODO: we may eventually need to add a check that verifies that the only REF and ALT alleles at this point are in + // the normal ACGTN alphabet + + // if we have no ALT alleles and know tags to inform us, we have know idea what this is + if record.alleles().len() <= 1 { + return Ok(VariantType::Unknown); + } + + // reference length is pulled out first, then we can look at the other alleles + let ref_len = record.alleles()[0].len(); + + // we only care about max ALT length when defining small variant type + let max_alt_len = record.alleles().iter().skip(1) + .map(|a| a.len()) + .max() + .unwrap(); + + Ok(if ref_len == 1 { + if max_alt_len == 1 { + VariantType::Snv + } else { + VariantType::Insertion + } + } else if max_alt_len == 1 { + VariantType::Deletion + } else { + VariantType::Indel + }) +} + +/// Defines a subset of the total reference space that is a single phasing problem or "block". +/// Each block has at least 1 read spanning from one variant to the next. +#[derive(Clone, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct PhaseBlock { + // NOTE: order matters here because we're deriving the comparisons + /// An index of the block, important for maintaining output order downstream. + block_index: usize, + /// The chromosome of the block. + chrom: String, + /// The chromosome index in the first VCF file + chrom_index: u32, + /// The coordinate of the first variant in the block, inclusive. 
+ start: u64, + /// The coordinate of the last variant in the block, inclusive. + end: u64, + /// The total number of variants in the block so far. + num_variants: usize, + /// The VCF index of the first variant in the block + first_variant_vcf: usize, + /// The minimum quality of variants that were included + min_quality: i32, + /// The sample name within the VCF that this block corresponds to + sample_name: String +} + +impl std::fmt::Debug for PhaseBlock { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // main purpose of custom was to munge the coordinates into a single string + let mut result = f.debug_struct("PhaseBlock"); + result.field("block_index", &self.block_index) + .field("coordinates", &format!("{}:{}-{}", self.chrom, self.start, self.end)) + .field("num_variants", &self.num_variants); + if self.min_quality > 0 { + // theres no real reason to spit this out unless it is doing something + result.field("min_quality", &self.min_quality); + } + result.field("sample_name", &self.sample_name) + .finish() + } +} + +impl PhaseBlock { + /// Initializes a phase block with no variants + /// # Arguments + /// * `block_index` - the index of this block + /// * `chrom` - the chromosome of the phase block + /// * `chrom_index` - the chromosome index in the VCF file, for ordering + /// * `min_quality` - the minimum quality to include a variant in this phase block + /// * `sample_name` - the name of the sample in the VCF(s) that this block info corresponds to + pub fn new(block_index: usize, chrom: String, chrom_index: u32, min_quality: i32, sample_name: String) -> PhaseBlock { + PhaseBlock { + block_index, + chrom, + chrom_index, + start: 0, + end: 0, + num_variants: 0, + first_variant_vcf: 0, + min_quality, + sample_name + } + } + + pub fn get_block_index(&self) -> usize { + self.block_index + } + + pub fn set_block_index(&mut self, new_index: usize) { + self.block_index = new_index; + } + + pub fn get_chrom(&self) -> &str { + &self.chrom + } 
+ + pub fn get_chrom_index(&self) -> u32 { + self.chrom_index + } + + pub fn get_start(&self) -> u64 { + self.start + } + + pub fn get_end(&self) -> u64 { + self.end + } + + pub fn get_num_variants(&self) -> usize { + self.num_variants + } + + pub fn get_first_variant_vcf(&self) -> usize { + self.first_variant_vcf + } + + pub fn get_min_quality(&self) -> i32 { + self.min_quality + } + + /// Returns the number of contained base pairs in the phase block. + pub fn bp_len(&self) -> u64 { + self.end - self.start + 1 + } + + pub fn sample_name(&self) -> &str { + &self.sample_name + } + + /// Add a single-position variant to the phase block, will panic if the chromosome does not match + /// # Arguments + /// * `chrom` - the chromosome string + /// * `pos` - the position of the variant + /// * `vcf_index` - the index of the VCF this variant comes from; use 0 if only one VCF is being used + pub fn add_locus_variant(&mut self, chrom: &str, pos: u64, vcf_index: usize) { + assert_eq!(self.chrom, chrom, "PhaseBlock chromosomes are not equal: \"{}\" \"{}\"", self.chrom, chrom); + //first condition unlikely to happen; second happens on an empty block + if self.start > pos || self.num_variants == 0 { + self.start = pos; + } + //most common case, extend to the right + if self.end < pos { + self.end = pos; + } + self.num_variants += 1; + + if self.num_variants == 1 { + self.first_variant_vcf = vcf_index; + } + } + + /// Checks if a given start/end overlaps the existing phase block + /// # Arguments + /// * `other_start` - the start position, inclusive + /// * `other_end` - the end position, exclusive + pub fn is_overlapping(&self, other_start: u64, other_end: u64) -> bool { + let max_start = self.start.max(other_start); + let min_end = (self.end+1).min(other_end); + max_start < min_end + } +} + +/// Iterator that will generate phase blocks consisting of a single "problem" to phase +pub struct PhaseBlockIterator { + /// The index of the next block to yield + next_block_index: usize, 
+ /// The primary traversal reader + ref_vcf_readers: Vec>, + /// A copy of the VCF header, cached here for performance + vcf_headers: Vec, + /// The name of the sample we care about in the VCF + sample_name: String, + /// The indices in the VCF file corresponding to `sample_name` + sample_indices: Vec, + /// Secondary traversals needed to figure out which variants can be phased + bam_readers: Vec>, + /// Index is based on the bcf::IndexedReader lookups + chrom_index: u32, + /// Position is as well, 0-based + chrom_position: u64, + /// The minimum allowed variant quality + min_quality: i32, + /// The minimum MAPQ to include a read + min_mapq: u8, + /// The minimum number of reads spanning two loci to connect them into a block + min_spanning_reads: usize, + /// if true, then supplemental mappings are allowed to join blocks + allow_supplemental_joins: bool, + /// Statistics on encountered variants while we iterate + variant_stats: HashMap<(u32, VariantType, Zygosity), usize> +} + +impl PhaseBlockIterator { + /// Creates a new `PhaseBlockIterator` from a VCF file and collection of BAM files. 
+ /// # Arguments + /// * `vcf_paths` - the VCF files to load variants from, must be zipped and indexed + /// * `bam_paths` - the BAM files to load reads from, must be indexed + /// * `reference_filename` - the reference genome filename + /// * `sample_name` - the sample name in the VCF file + /// * `min_quality` - the minimum quality to include a variant in a phase block + /// * `min_mapq` - the minimum MAPQ to include a read + /// * `min_spanning_reads` - the minimum number of reads that must span two adjacent variants to be joined into a phase block + /// * `allow_supplemental_joins` - if True, supplemental mappings are used for extending blocks + /// * `thread_pool` - a shared thread pool for BAM I/O + #[allow(clippy::too_many_arguments)] + pub fn new( + vcf_paths: &[PathBuf], bam_paths: &[PathBuf], reference_filename: &Path, + sample_name: String, + min_quality: i32, min_mapq: u8, min_spanning_reads: usize, + allow_supplemental_joins: bool, + thread_pool: &rust_htslib::tpool::ThreadPool + ) -> Result> { + // needed for header() extraction + use rust_htslib::bcf::Read; + + let mut ref_vcf_readers: Vec> = vec![]; + let mut vcf_headers: Vec = vec![]; + let mut vcf_contigs: Vec> = vec![]; + + let mut sample_indices: Vec = vec![]; + + for path in vcf_paths.iter() { + let vcf_reader: bcf::IndexedReader = bcf::IndexedReader::from_path(path)?; + let vcf_header: bcf::header::HeaderView = vcf_reader.header().clone(); + let ref_vcf_reader: RefCell = RefCell::new(vcf_reader); + + // first, check the sample names + let sample_index = { + let mut lookup_index: Option = None; + for (sample_index, &vcf_sample) in vcf_header.samples().iter().enumerate() { + let vcf_sample_string: String = std::str::from_utf8(vcf_sample).unwrap().to_string(); + if vcf_sample_string == sample_name { + lookup_index = Some(sample_index); + break; + } + } + match lookup_index { + Some(index) => { + index + }, + None => { + bail!("Sample name {:?} was not found in VCF: {:?}", sample_name, path); + } 
+ } + }; + + let contig_count: usize = vcf_header.contig_count() as usize; + let contigs: HashSet = (0..contig_count) + .map(|i| + std::str::from_utf8( + vcf_header.rid2name(i as u32).unwrap() + ).unwrap().to_string() + ) + .collect(); + + // push everything to our lists + ref_vcf_readers.push(ref_vcf_reader); + vcf_headers.push(vcf_header); + vcf_contigs.push(contigs); + sample_indices.push(sample_index); + } + + // check that our chromosome entries are a match, if not we explode + let first_chromosomes = &vcf_contigs[0]; + for other_chroms in vcf_contigs.iter().skip(1) { + if first_chromosomes != other_chroms { + bail!("Contig sets in the VCF files do not match"); + } + } + + // open up the bam files as well + let mut bam_readers: Vec> = vec![]; + for path in bam_paths.iter() { + use rust_htslib::bam::Read; + let mut bam_reader = bam::IndexedReader::from_path(path)?; + bam_reader.set_reference(reference_filename)?; + bam_reader.set_thread_pool(thread_pool)?; + bam_readers.push(RefCell::new(bam_reader)); + } + assert!(min_spanning_reads > 0); + + debug!("Sample \"{}\" VCF indices: {:?}", sample_name, sample_indices); + + Ok(PhaseBlockIterator { + next_block_index: 0, + ref_vcf_readers, + vcf_headers, + sample_name, + sample_indices, + bam_readers, + chrom_index: 0, + chrom_position: 0, + min_quality, + min_mapq, + min_spanning_reads, + allow_supplemental_joins, + variant_stats: Default::default() + }) + } + + pub fn sample_indices(&self) -> &[usize] { + &self.sample_indices + } + pub fn sample_name(&self) -> &str { + &self.sample_name + } + + /// Retrieves variant counts from all parsed variants (whether included or not), best if used after done iterating + pub fn variant_stats(&self) -> HashMap<(String, VariantType, Zygosity), usize> { + let mut ret: HashMap<(String, VariantType, Zygosity), usize> = Default::default(); + + for (&(chrom_index, variant_type, zygosity), &count) in self.variant_stats.iter() { + //get the chromosome name, we iterate based on the order 
of the first VCF provided + let chrom_name: String = std::str::from_utf8( + self.vcf_headers[0].rid2name(chrom_index).unwrap() + ).unwrap().to_string(); + + ret.insert((chrom_name, variant_type, zygosity), count); + } + + ret + } + + /// Returns the farthest position of reads that spans chrom:pos such that _at least_ `min_read_count` reads cover that position. + /// Currently, supplemental alignments are each handled separately. + /// # Arguments + /// * `chrom` - the chromosome of the locus + /// * `pos` - the position of the locus + fn get_longest_multispan(&self, chrom: &str, pos: u64) -> u64 { + use bio::bio_types::genome::AbstractInterval; + use rust_htslib::bam::Read; + let mut span_list: Vec = vec![]; + for bam_ref in self.bam_readers.iter() { + let mut bam = bam_ref.borrow_mut(); + bam.fetch((chrom, pos, pos+1)).unwrap(); + + // calling .records() is what is triggering the URL warning + for read_entry in bam.records() { + let mut read = read_entry.unwrap(); + + //make sure we care about the alignment + if filter_out_alignment_record(&read, self.min_mapq) { + continue; + } + + // see if this mapping goes farther than anything else so far + read.cache_cigar(); + let full_range = read.range(); + // assertions always checked out, can remove + // assert!(full_range.start == read.pos() as u64); + // assert!(full_range.contains(&pos)); + span_list.push(full_range.end); + } + } + + if span_list.len() < self.min_spanning_reads { + // we don't have enough reads to reach our minimum threshold, so return this position+1 + // the +1 is because we are returning the end of a half-open range that only includes pos + pos + 1 + } else { + span_list.sort(); + span_list[span_list.len() - self.min_spanning_reads] + } + } + + /// Returns true if there are at least `min_read_count` reads that connect from the given position back into the current phase block. 
+ /// # Arguments + /// * `chrom` - the chromosome of the locus + /// * `pos` - the position of the locus + fn is_supplemental_overlap(&self, chrom: &str, pos: u64, phase_block: &PhaseBlock) -> bool { + use rust_htslib::bam::Read; + use rust_htslib::bam::record::{Aux, CigarString, Record}; + use rust_htslib::bam::record::Cigar; + let mut overlap_count: usize = 0; + for bam_ref in self.bam_readers.iter() { + let mut bam = bam_ref.borrow_mut(); + bam.fetch((chrom, pos, pos+1)).unwrap(); + for read_entry in bam.records() { + let read: Record = read_entry.unwrap(); + + // make sure we care about the alignment + if filter_out_alignment_record(&read, self.min_mapq) { + continue; + } + + // check if we have any supplemental alignments + let sa_tag: &str = match read.aux(b"SA") { + Ok(value) => { + match value { + Aux::String(tag) => tag, + _ => panic!("Unexpected tag {value:?}") + } + }, + Err(_) => { + continue; + } + }; + + // there can be multiple, so split on the delimiter and handle each one separately + let sa_strings: Vec<&str> = sa_tag.split_terminator(';').collect(); + for &sa_str in sa_strings.iter() { + // we expect exactly 6 + let sa_frags: Vec<&str> = sa_str.split(',').collect(); + assert_eq!(sa_frags.len(), 6); + + let sa_chrom = sa_frags[0]; + let sa_mapq: u8 = sa_frags[4].parse().unwrap(); + if sa_chrom != chrom || sa_mapq < self.min_mapq { + // different chromosome OR mapq of the SA is too low, skip it + continue; + } + + // convert the start coordinate + CIGAR into an end coordinate + let sa_start: u64 = sa_frags[1].parse().unwrap(); + let mut sa_end: u64 = sa_start; + let cigar: CigarString = CigarString::try_from(sa_frags[3]).unwrap(); + for cigar_value in cigar.iter() { + match cigar_value { + Cigar::SoftClip(_) | Cigar::Ins(_) => {}, + Cigar::Match(c_len) | + Cigar::Del(c_len) | + Cigar::Equal(c_len) | + Cigar::Diff(c_len) => { + sa_end += *c_len as u64; + }, + _ => { + panic!("Unhandled cigar type: {cigar_value:?}"); + } + } + } + + // we have the 
SA start and end, see if it overlaps the existing block + // TODO: this isn't checking for variant overlaps, can we fix that somehow? maybe instead of phase block, we pass in a variant interval tree? + // this is fortunately not a major problem, any falsely connected blocks will get split on the back end + let overlapping: bool = phase_block.is_overlapping(sa_start, sa_end); + if overlapping { + // we found at least one overlap with the block, so increment and go to next read + overlap_count += 1; + break; + } + } + } + } + + // return true if we got enough supplements connecting us + overlap_count >= self.min_spanning_reads + } +} + +impl Iterator for PhaseBlockIterator { + type Item = Result>; + + fn next(&mut self) -> Option>> { + use rust_htslib::bcf::Read; + + // make sure we still have chromosome to iterate on + let num_contigs: u32 = self.vcf_headers[0].contig_count(); + if self.chrom_index < num_contigs { + //get the chromosome name, we iterate based on the order of the first VCF provided + let chrom_name: String = std::str::from_utf8( + self.vcf_headers[0].rid2name(self.chrom_index).unwrap() + ).unwrap().to_string(); + + // initialize with an empty block containing just this chromosome + let mut phase_block: PhaseBlock = PhaseBlock::new( + self.next_block_index, chrom_name.clone(), self.chrom_index, self.min_quality, self.sample_name.clone() + ); + self.next_block_index += 1; + + // initalize the variant queue with one variant from each VCF, any ties in position are broken by VCF input order + let mut variant_queue: PriorityQueue, Reverse)> = PriorityQueue::new(); + let mut vcf_readers: Vec<_> = self.ref_vcf_readers.iter().map(|rvr| rvr.borrow_mut()).collect(); + let mut vcf_iterators: Vec<_> = vec![]; + for (vcf_index, vcf_reader) in vcf_readers.iter_mut().enumerate() { + // fetch the corresponding chrom index for this VCF file (they are not guaranteed to match) + let chrom_index: u32 = self.vcf_headers[vcf_index].name2rid(chrom_name.as_bytes()).unwrap(); 
+ + // fetch our position in the VCF file + match vcf_reader.fetch(chrom_index, self.chrom_position, None) { + Ok(()) => { + // we have entries, so get the first one and queue it + let mut vcf_iter = vcf_reader.records().peekable(); + let first_entry = vcf_iter.peek(); + if let Some(record_result) = first_entry { + let record: &rust_htslib::bcf::Record = match record_result { + Ok(r) => r, + // we have to convert to an owned error here, and the htslib errors are not cloneable + Err(e) => return Some(Err(Box::new(SimpleError::from(e)))) + }; + let position: i64 = record.pos(); + variant_queue.push(vcf_index, (Reverse(position), Reverse(vcf_index))); + }; + + // even if the iterator is empty, we push it so things are lined up correctly + vcf_iterators.push(vcf_iter); + }, + Err(_) => { + // this usually happens when there are no entries for the chromosome + vcf_iterators.push(vcf_reader.records().peekable()); + } + }; + } + + if variant_queue.is_empty() { + // this must be an empty chromosome block because neither iterator found stuff to iterate on + self.chrom_index += 1; + return Some(Ok(phase_block)); + } + + let mut previous_pos: u64 = 0; + let mut max_span: u64 = 0; + + while !variant_queue.is_empty() { + // get the source of the next variant to process + let (pop_index, pop_priority) = variant_queue.pop().unwrap(); + let sample_index = self.sample_indices[pop_index]; + + // process this variant + let record_result = vcf_iterators[pop_index].next().unwrap(); + let record = match record_result { + Ok(r) => r, + Err(e) => return Some(Err(Box::new(e))) + }; + + let variant_pos = record.pos() as u64; + assert_eq!(variant_pos, pop_priority.0.0 as u64); // sanity check that the variant matches our position priority + if variant_pos < self.chrom_position { + // this can happen when you have very very long indels that span one of our breaks + // we have already written though, so don't write it again + // skip down to variant advancement + } else { + let include_variant 
= match is_phasable_variant(&record, sample_index, self.min_quality, false) { + Ok(iv) => iv, + Err(e) => return Some(Err(e)) + }; + + // second condition is for variants that overlap but are before our start position + if include_variant { + //heterozygous variant found + if phase_block.get_num_variants() == 0 || max_span > variant_pos { + //either: + //1 - this is a new block OR + //2 - we already found enough reads that spans _past_ this variant + phase_block.add_locus_variant(&chrom_name, variant_pos, pop_index); + } else { + //we check the reads from the most recent locus + //max_span = self.get_longest_span(&chrom_name, previous_pos); + max_span = self.get_longest_multispan(&chrom_name, previous_pos); + if max_span > variant_pos { + //new max span connects + phase_block.add_locus_variant(&chrom_name, variant_pos, pop_index); + } else if !self.allow_supplemental_joins { + // no mapping spans and we are not allowing supplemental mappings to make the join + self.chrom_position = variant_pos; + return Some(Ok(phase_block)); + } else { + //no *mappings* span both this new position and the most recent, check if we can find a supplemental mapping that does + let supplemental_overlap: bool = self.is_supplemental_overlap(&chrom_name, variant_pos, &phase_block); + if supplemental_overlap { + // we got a supplemental mapping that works, so add this locus and go on as normal + phase_block.add_locus_variant(&chrom_name, variant_pos, pop_index); + } else { + // no overlapping mapping and no supplemental either, time to end the block + self.chrom_position = variant_pos; + return Some(Ok(phase_block)); + } + } + } + + previous_pos = variant_pos; + } + + // at this point either: + // 1) we added the variant to the current block and are looping back around OR + // 2) we did NOT add the variant to the block because it isn't phasable OR + // 3) we finished a block and returned out (aka, we can't get here if we just ended a block) + // this means that these variants are safe to 
add to our stats without being double counted or previously counted + let variant_type: VariantType = match get_variant_type(&record) { + Ok(vt) => vt, + // we have to convert to an owned error here, and the htslib errors are not cloneable + Err(e) => return Some(Err(e)) + }; + let zygosity: Zygosity = match get_variant_zygosity(&record, sample_index) { + Ok(z) => z, + // we have to convert to an owned error here, and the htslib errors are not cloneable + Err(e) => return Some(Err(e)) + }; + + // update our variant stats for reporting later + let stats_entry = self.variant_stats.entry((self.chrom_index, variant_type, zygosity)).or_insert(0); + *stats_entry += 1; + } + + // requeue from the one we popped from + let next_entry = vcf_iterators[pop_index].peek(); + if let Some(record_result) = next_entry { + let record: &rust_htslib::bcf::Record = match record_result { + Ok(r) => r, + // we have to convert to an owned error here, and the htslib errors are not cloneable + Err(e) => return Some(Err(Box::new(SimpleError::from(e)))) + }; + let position: i64 = record.pos(); + variant_queue.push(pop_index, (Reverse(position), Reverse(pop_index))); + }; + } + + //we have reached the end of the current chromosome, reset to next chromosome and return what we have + self.chrom_index += 1; + self.chrom_position = 0; + Some(Ok(phase_block)) + } else { + // no chromosomes left to iterate on + None + } + } +} + +/// Iterator over multiple phase blocks iterators. +/// Output blocks are ordered by (chromosome, start_position, end_position) and re-numbered to reflect traversal order. 
pub struct MultiPhaseBlockIterator {
    /// The internal iterators we use
    sub_iterators: Vec<PhaseBlockIterator>,
    /// The priority queue for the phase blocks at the front of each iterator
    phase_block_queue: PriorityQueue<(usize, PhaseBlock), PhaseBlockPriority>,
    /// The combined block index
    joint_block_index: usize,
}

impl MultiPhaseBlockIterator {
    /// Creates a new iterator from a vector of sub-iterators, each one tied to a specific sample.
    /// # Arguments
    /// * `sub_iterators` - the original PhaseBlockIterators that this will wrap
    /// # Errors
    /// * if any sub-iterators generate errors while iterating
    pub fn new(mut sub_iterators: Vec<PhaseBlockIterator>) -> Result<MultiPhaseBlockIterator, Box<dyn std::error::Error>> {
        let mut phase_block_queue: PriorityQueue<(usize, PhaseBlock), PhaseBlockPriority> = PriorityQueue::new();

        // prime the queue with the first block from each sub-iterator
        for (index, iterator) in sub_iterators.iter_mut().enumerate() {
            let next_value = iterator.next();
            match next_value {
                Some(result) => {
                    let first_block: PhaseBlock = result?;
                    let block_priority = Self::get_block_priority(&first_block);
                    phase_block_queue.push((index, first_block), block_priority);
                },
                None => {
                    // first block is empty, which is weird but technically allowed
                    warn!("First block in iterator {} was empty.", index);
                }
            };
        }

        Ok(MultiPhaseBlockIterator {
            sub_iterators,
            phase_block_queue,
            joint_block_index: 0
        })
    }

    /// Retrieves variant counts from all parsed variants (whether included or not) across all samples.
    /// This is best if done when iteration is finished.
    /// Returns a hashmap where key is (sample_name, chromosome, variant_type, zygosity) and value is a count.
    pub fn variant_stats(&self) -> HashMap<(String, String, VariantType, Zygosity), usize> {
        // key = (sample_name, chromosome, variant_type, zygosity); value = count
        let mut ret: HashMap<(String, String, VariantType, Zygosity), usize> = Default::default();
        for pbi in self.sub_iterators.iter() {
            let sample_name = pbi.sample_name().to_string();
            let pbi_stats = pbi.variant_stats();
            for ((chrom, vt, zyg), count) in pbi_stats.into_iter() {
                ret.insert((sample_name.clone(), chrom, vt, zyg), count);
            }
        }
        ret
    }

    /// Returns the block priority for a phase block.
    /// Reverse ordering makes the PriorityQueue (a max-heap) pop the smallest
    /// (chromosome, start, end) tuple first.
    /// # Arguments
    /// * `phase_block` - the block to calculate priority for
    fn get_block_priority(phase_block: &PhaseBlock) -> PhaseBlockPriority {
        Reverse(
            (
                phase_block.get_chrom_index(),
                phase_block.get_start(),
                phase_block.get_end()
            )
        )
    }
}

impl Iterator for MultiPhaseBlockIterator {
    type Item = Result<PhaseBlock, Box<dyn std::error::Error>>;

    fn next(&mut self) -> Option<Result<PhaseBlock, Box<dyn std::error::Error>>> {
        let pq_next = self.phase_block_queue.pop();
        match pq_next {
            Some(((source_index, mut phase_block), _priority)) => {
                // get the next block and put it on the queue
                let next_item = self.sub_iterators[source_index].next();
                if let Some(next_result) = next_item {
                    // we have more in this queue, add it to the priority queue
                    let next_block = match next_result {
                        Ok(b) => b,
                        Err(e) => {
                            // sub-queue error, propagate it up the chain
                            return Some(Err(e));
                        }
                    };
                    let next_priority = Self::get_block_priority(&next_block);
                    self.phase_block_queue.push((source_index, next_block), next_priority);
                };

                // we need to update the block index based on the joint values
                phase_block.set_block_index(self.joint_block_index);
                self.joint_block_index += 1;

                // finally send back the block
                Some(Ok(phase_block))
            },
            None => {
                None
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // fortunately, none of the BAM checks actually look at the provided reference file contents
    const LOCAL_REFERENCE: &str = "./test_data/test_reference.fa";

    #[test]
    fn test_get_vcf_samples() {
        let header_only_vcf: PathBuf = "./test_data/header_only.vcf.gz".into();
        let expected_samples: Vec<String> = vec![
            "HG001".to_string(), "HG002_30x".to_string(), "HG005_30x".to_string()
        ];
        let samples = get_vcf_samples(&header_only_vcf).unwrap();
        assert_eq!(expected_samples, samples);
    }

    #[test]
    fn test_get_sample_bams() {
        let sample_name: String = "HG002-rep1".to_string();
        let all_bams: Vec<PathBuf> = vec![
            "./test_data/header_only.bam".into(),
            "./test_data/multi_smrtcell.bam".into()
        ];

        let (bams_found, bam_indices) = get_sample_bams(
            &all_bams,
            &sample_name,
            &PathBuf::from(LOCAL_REFERENCE)
        ).unwrap();
        assert_eq!(all_bams, bams_found);
        assert_eq!(vec![0, 1], bam_indices);
    }

    #[test]
    fn test_multisample_bam() {
        let sample_name: String = "HG002-rep1".to_string();
        let all_bams: Vec<PathBuf> = vec![
            "./test_data/multisample.bam".into()
        ];
        let result = get_sample_bams(
            &all_bams,
            &sample_name,
            &PathBuf::from(LOCAL_REFERENCE)
        );

        // should have an error with the following message
        assert!(result.is_err());
        let expected_error_string = "BAM file with multiple sample reads groups detected, this is not supported: ./test_data/multisample.bam".to_string();
        assert_eq!(expected_error_string, result.err().unwrap().to_string());
    }
}
diff --git a/src/cli.rs b/src/cli.rs
new file mode 100644
index 0000000..34ca2e8
--- /dev/null
+++ b/src/cli.rs
@@ -0,0 +1,343 @@

use clap::Parser;
use chrono::Datelike;
use flate2::bufread::MultiGzDecoder;
use lazy_static::lazy_static;
use log::{error, info, warn};
use std::fs::File;
use std::io::{BufReader, Read};
use std::path::{Path, PathBuf};

lazy_static! {
    /// Stores the full version string we plan to use.
+ /// # Examples + /// * `0.11.0-6bb9635-dirty` - while on a dirty branch + /// * `0.11.0-6bb9635` - with a fresh commit + pub static ref FULL_VERSION: String = format!("{}-{}", env!("CARGO_PKG_VERSION"), env!("VERGEN_GIT_DESCRIBE")); +} + +#[derive(Clone, Parser)] +#[clap(author, + version = &**FULL_VERSION, + about, + after_help = format!("Copyright (C) 2004-{} Pacific Biosciences of California, Inc. +This program comes with ABSOLUTELY NO WARRANTY; it is intended for +Research Use Only and not for use in diagnostic procedures.", chrono::Utc::now().year()))] +pub struct Settings { + /// Input alignment file in BAM format. + #[clap(required = true)] + #[clap(short = 'b')] + #[clap(long = "bam")] + #[clap(value_name = "BAM")] + #[clap(help_heading = Some("Input/Output"))] + pub bam_filenames: Vec, + + /// Output haplotagged alignment file in BAM format. + #[clap(short = 'p')] + #[clap(long = "output-bam")] + #[clap(value_name = "BAM")] + #[clap(help_heading = Some("Input/Output"))] + pub output_bam_filenames: Vec, + + /// Input variant file in VCF format. + #[clap(required = true)] + #[clap(short = 'c')] + #[clap(long = "vcf")] + #[clap(value_name = "VCF")] + #[clap(help_heading = Some("Input/Output"))] + pub vcf_filenames: Vec, + + /// Output phased variant file in VCF format. 
+ #[clap(required = true)] + #[clap(short = 'o')] + #[clap(long = "output-vcf")] + #[clap(value_name = "VCF")] + #[clap(help_heading = Some("Input/Output"))] + pub output_vcf_filenames: Vec, + + /// Reference FASTA file + #[clap(required = true)] + #[clap(short = 'r')] + #[clap(long = "reference")] + #[clap(value_name = "FASTA")] + #[clap(help_heading = Some("Input/Output"))] + pub reference_filename: PathBuf, + + /// Sample name to phase within the VCF (default: first sample) + #[clap(short = 's')] + #[clap(long = "sample-name")] + #[clap(value_name = "SAMPLE")] + #[clap(help_heading = Some("Input/Output"))] + pub sample_names: Vec, + + /// Ignore BAM file read group IDs + #[clap(long = "ignore-read-groups")] + #[clap(help_heading = Some("Input/Output"))] + pub ignore_read_groups: bool, + + /// Output summary phasing statistics file (optional, csv/tsv) + #[clap(long = "summary-file")] + #[clap(value_name = "FILE")] + #[clap(help_heading = Some("Input/Output"))] + pub summary_filename: Option, + + /// Output algorithmic statistics file (optional, csv/tsv) + #[clap(long = "stats-file")] + #[clap(value_name = "FILE")] + #[clap(help_heading = Some("Input/Output"))] + pub stats_filename: Option, + + /// Output blocks file (optional, csv/tsv) + #[clap(long = "blocks-file")] + #[clap(value_name = "FILE")] + #[clap(help_heading = Some("Input/Output"))] + pub blocks_filename: Option, + + /// Output haplotag file (optional, csv/tsv) + #[clap(long = "haplotag-file")] + #[clap(value_name = "FILE")] + #[clap(help_heading = Some("Input/Output"))] + pub haplotag_filename: Option, + + /// Number of threads for BAM I/O (default: copy `--threads`) + #[clap(long = "io-threads")] + #[clap(value_name = "THREADS")] + #[clap(help_heading = Some("Input/Output"))] + pub io_threads: Option, + + /// Number of threads to use for phasing. 
+ #[clap(short = 't')] + #[clap(long = "threads")] + #[clap(value_name = "THREADS")] + #[clap(default_value = "1")] + pub threads: usize, + + /// Enable verbose output. + #[clap(short = 'v')] + #[clap(long = "verbose")] + #[clap(action = clap::ArgAction::Count)] + pub verbosity: u8, + + /// Sets a minimum genotype quality (GQ) value to include a variant in the phasing + #[clap(long = "min-vcf-qual")] + #[clap(value_name = "GQ")] + #[clap(default_value = "0")] + #[clap(help_heading = Some("Variant Filtering"))] + pub min_variant_quality: i32, + + /// Sets a minimum MAPQ to include a read in the phasing + #[clap(long = "min-mapq")] + #[clap(value_name = "MAPQ")] + #[clap(default_value = "5")] + #[clap(help_heading = Some("Mapping Filtering"))] + pub min_mapping_quality: u8, + + /// Sets a minimum number of matched variants required for a read to get included in the scoring + #[clap(long = "min-matched-alleles")] + #[clap(value_name = "COUNT")] + #[clap(default_value = "2")] + #[clap(help_heading = Some("Mapping Filtering"))] + pub min_matched_alleles: usize, + + /// Sets a minimum number of reads to span two adjacent variants to join a phase block + #[clap(long = "min-spanning-reads")] + #[clap(value_name = "READS")] + #[clap(default_value = "1")] + #[clap(help_heading = Some("Phase Block Generation"))] + pub min_spanning_reads: usize, + + /// Disables the use of supplemental mappings to join phase blocks + #[clap(long = "no-supplemental-joins")] + #[clap(help_heading = Some("Phase Block Generation"))] + pub disable_supplemental_joins: bool, + + /// Enables the phasing and haplotagging of singleton phase blocks + #[clap(long = "phase-singletons")] + #[clap(help_heading = Some("Phase Block Generation"))] + pub phase_singletons: bool, + + /// Sets a maximum reference buffer for local realignment + #[clap(long = "max-reference-buffer")] + #[clap(value_name = "LENGTH")] + #[clap(default_value = "15")] + #[clap(help_heading = Some("Allele Assignment"))] + pub 
reference_buffer: usize, + + /// Enables global realignment with a maximum allowed CPU time before fallback to local realignment + #[clap(long = "global-realignment-cputime")] + #[clap(value_name = "SECONDS")] + #[clap(default_value = "0.0")] + #[clap(help_heading = Some("Allele Assignment"))] + pub global_realign_cputime: f32, + + /// Sets a pruning threshold on global realignment, set to 0 to disable pruning + #[clap(long = "global-pruning-distance")] + #[clap(value_name = "LENGTH")] + #[clap(default_value = "500")] + #[clap(help_heading = Some("Allele Assignment"))] + pub wfa_prune_distance: usize, + + /// Sets the minimum queue size for the phasing algorithm + #[clap(long = "phase-min-queue-size")] + #[clap(value_name = "SIZE")] + #[clap(default_value = "1000")] + #[clap(help_heading = Some("Phasing"))] + pub phase_min_queue_size: usize, + + /// Sets the queue size increment per variant in a phase block + #[clap(long = "phase-queue-increment")] + #[clap(value_name = "SIZE")] + #[clap(default_value = "3")] + #[clap(help_heading = Some("Phasing"))] + pub phase_queue_increment: usize, + + /// Skips a number of blocks (debug only); non-0 values will cause an error on VCF output + #[clap(long = "skip")] + #[clap(hide = true)] + #[clap(default_value = "0")] + pub skip_blocks: usize, + + /// Take a number of blocks (debug only); non-0 values will cause an error on VCF output + #[clap(long = "take")] + #[clap(hide = true)] + #[clap(default_value = "0")] + pub take_blocks: usize, +} + +/// Checks if a file exists and will otherwise exit +/// # Arguments +/// * `filename` - the file path to check for +/// * `label` - the label to use for error messages +fn check_required_filename(filename: &Path, label: &str) { + if !filename.exists() { + error!("{} does not exist: \"{}\"", label, filename.display()); + std::process::exit(exitcode::NOINPUT); + } else { + info!("{}: \"{}\"", label, filename.display()); + } +} + +/// Checks if the VCF file exists, is bgzipped, and has an 
index. If it fails any of those, this will exit. +/// # Argument +/// * `filename` - the VCF file path to check +/// * `label` - the label to use for error messages +fn check_required_vcf(filename: &Path, label: &str) { + // first check the filename normally + check_required_filename(filename, label); + + // now we need to check that this is a bgzipped file by just trying to read a little bit of it + // NOTE: if the user generates a gzip file (as opposed to bgzip), this will still pass :( + // in theory, indexing checks should fail + let vcf_file: File = File::open(filename).unwrap(); + let file_reader = BufReader::new(vcf_file); + let mut gz_decoder = MultiGzDecoder::new(file_reader); + let mut small_buffer: [u8; 10] = [0; 10]; + match gz_decoder.read(&mut small_buffer) { + Ok(_) => {}, + Err(e) => { + if e.to_string() == "invalid gzip header" { + error!("Error while checking {filename:?}: {e}; is the VCF bgzipped?"); + } else { + error!("Error while checking {filename:?}: {e}"); + } + std::process::exit(exitcode::IOERR); + } + }; + + // finally, verify that an index file exists, should just be tbi and csi + let known_indices = ["tbi", "csi"]; + let mut index_found: bool = false; + for &ki in known_indices.iter() { + let mut extension_path = filename.to_owned() + .into_os_string(); + extension_path.push(format!(".{ki}")); + let extension_path: PathBuf = PathBuf::from(extension_path); + index_found |= extension_path.exists(); + } + if !index_found { + error!("Error while checking {filename:?}: no tabix index found (.tbi or .csi)"); + std::process::exit(exitcode::NOINPUT); + } + +} + +pub fn get_raw_settings() -> Settings { + Settings::parse() +} + +/// Do some additional checks here, we may increase these as we go. +/// Also can modify settings if needed since we're passing it around. +/// # Arguments +/// * `settings` - the raw settings, nothing has been checked other than what clap does for us. 
+pub fn check_settings(mut settings: Settings) -> Settings { + //check for any of our required files + for filename in settings.bam_filenames.iter() { + check_required_filename(filename, "Alignment file"); + } + for filename in settings.vcf_filenames.iter() { + check_required_vcf(filename, "Variant file"); + } + + // make sure the number of inputs and outputs are identical + if settings.vcf_filenames.len() != settings.output_vcf_filenames.len() { + error!("Detected {} input VCFs and {} output VCFs, these must be equal", settings.vcf_filenames.len(), settings.output_vcf_filenames.len()); + std::process::exit(exitcode::USAGE); + } + + // if we have any phased BAM outputs, make sure we have one for each file + if !settings.output_bam_filenames.is_empty() && settings.bam_filenames.len() != settings.output_bam_filenames.len() { + error!("Detected {} input BAMs and {} output BAMs, these must be equal", settings.bam_filenames.len(), settings.output_bam_filenames.len()); + std::process::exit(exitcode::USAGE); + } + + // check optional files + check_required_filename(&settings.reference_filename, "Reference file"); + + // 0 is just a sentinel for everything + if settings.take_blocks == 0 { + settings.take_blocks = usize::MAX; + } + if settings.wfa_prune_distance == 0 { + settings.wfa_prune_distance = usize::MAX; + } + + // 0 doesn't make sense, so lets just error proof it up to 1 + if settings.min_spanning_reads == 0 { + settings.min_spanning_reads = 1; + } + if settings.min_matched_alleles == 0 { + settings.min_matched_alleles = 1; + } + + // if this is not specified, then set it to the same as processing + if settings.io_threads.is_none() { + settings.io_threads = Some(settings.threads); + } + + // dump stuff to the logger + info!("Minimum call quality: {}", settings.min_variant_quality); + info!("Minimum mapping quality: {}", settings.min_mapping_quality); + info!("Minimum matched alleles: {}", settings.min_matched_alleles); + if settings.min_matched_alleles > 2 { + 
warn!("Setting the minimum matched alleles > 2 has not been tested.") + } + info!("Minimum spanning reads: {}", settings.min_spanning_reads); + info!("Supplemental mapping block joins: {}", if settings.disable_supplemental_joins { "DISABLED" } else { "ENABLED" }); + info!("Phase singleton blocks: {}", if settings.phase_singletons { "ENABLED" } else { "DISABLED" }); + info!("Local re-alignment maximum reference buffer: +-{} bp", settings.reference_buffer); + if settings.global_realign_cputime == 0.0 { + info!("Global re-alignment: DISABLED"); + } else { + info!("Global re-alignment CPU time: {} seconds", settings.global_realign_cputime); + if settings.wfa_prune_distance == usize::MAX { + info!("Global prune distance: DISABLED"); + } else { + info!("Global prune distance: {}", settings.wfa_prune_distance); + } + } + info!("Processing threads: {}", settings.threads); + info!("I/O threads: {}", settings.io_threads.unwrap()); + + //send the settings back + settings +} diff --git a/src/data_types/mod.rs b/src/data_types/mod.rs new file mode 100644 index 0000000..72699dd --- /dev/null +++ b/src/data_types/mod.rs @@ -0,0 +1,7 @@ + +/// Contains a ReadSegment observation type +pub mod read_segments; +/// Wrapper for an in-memory reference genome +pub mod reference_genome; +/// Contains Variant type as well as supporting definitions +pub mod variants; diff --git a/src/data_types/read_segments.rs b/src/data_types/read_segments.rs new file mode 100644 index 0000000..af2c1f2 --- /dev/null +++ b/src/data_types/read_segments.rs @@ -0,0 +1,258 @@ + +/// Container for a read segment that has been converted into a variant representation +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ReadSegment { + /// the read name + read_name: String, + /// the actual alleles for the read, should always be 0, 1, 2 (ambiguous), or 3 (non-overlapping/undefined). 
+ /// anything other than 0 or 1 are basically ignored in all functions + alleles: Vec, + /// the associated quality values for converting 0 <--> 1; undefined alleles should have qual == 0 + quals: Vec, + /// the index of the first defined 0/1 allele, inclusive + first_allele: usize, + /// the index of the last defined 0/1 allele, inclusive + last_allele: usize +} + +impl ReadSegment { + /// Creates a new read segment from a set of alleles and quality values. + /// Note that the segment should have the same length as the phase block, even if the segment does not actually span the full block. + /// # Arguments + /// * `alleles` - the alleles, should all be 0, 1, 2 (ambiguous) or 3 (non-overlapping/undefined); the first and last defined values determine the length of the segment + /// * `quals` - cost to convert an allele from 0 <--> 1, any undefined/ambiguous alleles should be qual = 0 + /// # Panics + /// * if `allele.len() != quals.len()` + pub fn new(read_name: String, alleles: Vec, quals: Vec) -> ReadSegment { + assert_eq!(alleles.len(), quals.len()); + let (first_allele, _) = alleles.iter().enumerate() + .find(|(_i, &a)| a < 2).unwrap_or((alleles.len(), &2)); + let (last_allele, _) = alleles.iter().enumerate().rev() + .find(|(_i, &a)| a < 2).unwrap_or((alleles.len(), &2)); + ReadSegment { + read_name, + alleles, + quals, + first_allele, + last_allele + } + } + + /// Given a collection of read segments, this will collapse them into a single one. + /// Any ambiguous/undefined alleles will have their quality set to 0. 
+ /// # Arguments + /// * `read_segments` - the reads to collapse together + /// # Panics + /// * if `read_segments` is empty + /// * if `read_segments` are not all of equal length + pub fn collapse(read_segments: &[ReadSegment]) -> ReadSegment { + // short circuit + assert!(!read_segments.is_empty()); + if read_segments.len() == 1 { + return read_segments[0].clone(); + } + + let num_alleles: usize = read_segments[0].get_num_alleles(); + let read_name: String = read_segments[0].read_name().to_string(); + let mut alleles: Vec = vec![3; num_alleles]; + let mut quals: Vec = vec![0; num_alleles]; + for rs in read_segments.iter() { + let rs_alleles = rs.alleles(); + let rs_quals = rs.quals(); + assert_eq!(num_alleles, rs.get_num_alleles()); + assert_eq!(read_name, rs.read_name()); + + for (i, &rsa) in rs_alleles.iter().enumerate() { + // if rsa is unset, we skip everything + if rsa != 3 { + if alleles[i] == 3 { + alleles[i] = rsa; + quals[i] = rs_quals[i]; + } else if alleles[i] == 2 { + // we are already ambiguous, so quality should be 0 + } else { + // check for ambiguity + if alleles[i] == rsa { + // they match, make sure quals do also + // assert_eq!(quals[i], rs_quals[i]); + // quals won't always match in local mode, lets default to the lower + quals[i] = quals[i].min(rs_quals[i]); + assert!(quals[i] > 0); + } else { + // they don't match, change to ambiguous + alleles[i] = 2; + quals[i] = 0; + } + } + } + } + } + + // now just send it to the new function + Self::new(read_name, alleles, quals) + } + + pub fn read_name(&self) -> &str { + &self.read_name + } + + pub fn get_num_alleles(&self) -> usize { + self.alleles.len() + } + + pub fn alleles(&self) -> &[u8] { + &self.alleles[..] + } + + pub fn quals(&self) -> &[u8] { + &self.quals[..] + } + + pub fn first_allele(&self) -> usize { + self.first_allele + } + + pub fn last_allele(&self) -> usize { + self.last_allele + } + + /// Returns the range of this segment, e.g. 
[first_allele..last_allele+1) + pub fn get_range(&self) -> std::ops::Range { + self.first_allele..(self.last_allele+1) + } + + /// Returns the number of alleles that are set (i.e. non-ambiguous and overlapping, so 0 or 1) + pub fn get_num_set(&self) -> usize { + self.alleles.iter() + .filter(|&v| *v < 2) + .count() + } + + /// Given a haplotype, this will score the read against that haplotype. + /// If a haplotype has a 2, no cost is associated with that allele. + /// # Arguments + /// `haplotype` - the full haplotype to score, must have the same length as the block + pub fn score_haplotype(&self, haplotype: &[u8]) -> u64 { + assert_eq!(self.alleles.len(), haplotype.len()); + self.score_partial_haplotype(haplotype, 0) + } + + /// Given a partial haplotype, this will score the read against that haplotype. + /// The offset values is an index to where to start in our alleles for scoring. + /// For example, if offset = 10, then alleles[10..] will be compared to haplotype[0..] + /// If a haplotype has a 2, no cost is associated with that allele. 
+ /// # Arguments + /// * `haplotype` - the partial haplotype to score + /// * `offset` - the offset into the read segment to start scoring + pub fn score_partial_haplotype(&self, haplotype: &[u8], offset: usize) -> u64 { + //info!("rs {}+{} <= {}?", haplotype.len(), offset, self.alleles.len()); + assert!(haplotype.len()+offset <= self.alleles.len()); + if haplotype.len() + offset <= self.first_allele || offset > self.last_allele { + // the haplotype starts and ends before our first allele, OR + // the haplotype starts after our last allele, SO + // return 0 in either case, because there is no overlaps to score + 0 + } else { + // the minimum comparison is either the first allele OR the offset, whichever is greater + let min_compare = self.first_allele.max(offset); + + // if the allele component is greater, then we need to shift into the haplotype + let offset_shift = min_compare - offset; + + // the maximum comparison is either the last allele we have OR the end of the haplotype+start offset + let max_compare = (self.last_allele+1).min(offset+haplotype.len()); + + self.quals[min_compare..max_compare].iter().enumerate() + .filter(|(i, _q)| haplotype[*i+offset_shift] < 2 && self.alleles[*i+min_compare] != haplotype[*i+offset_shift]) + .map(|(_i, &q)| q as u64) + .sum() + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_score_haplotype() { + let rs = ReadSegment::new( + "read_name".to_string(), + vec![2, 0, 1, 0, 0, 1, 2, 1, 2, 2], + vec![0, 1, 1, 1, 1, 1, 1, 1, 0, 0] + ); + assert_eq!(rs.first_allele, 1); + assert_eq!(rs.last_allele, 7); + assert_eq!(rs.get_num_set(), 6); + + //identical except for missing value in rs + let haplotype = vec![0, 0, 1, 0, 0, 1, 1, 1, 0, 0]; + assert_eq!(rs.score_haplotype(&haplotype), 1); + + //fully empty haplotype + let haplotype = vec![2; 10]; + assert_eq!(rs.score_haplotype(&haplotype), 0); + + //fully wrong haplotype + let haplotype = vec![1, 1, 0, 1, 1, 0, 0, 0, 1, 1]; + 
assert_eq!(rs.score_haplotype(&haplotype), 7); + } + + #[test] + fn test_score_partial_haplotype() { + let rs = ReadSegment::new( + "read_name".to_string(), + vec![2, 0, 1, 0, 0, 1, 2, 1, 2, 2], + vec![0, 1, 1, 1, 1, 1, 1, 1, 0, 0] + ); + + //identical except for missing value in rs + let haplotype = vec![0, 1, 0, 0, 1, 1, 1]; + assert_eq!(rs.score_partial_haplotype(&haplotype, 1), 1); + + //fully empty haplotype + let haplotype = vec![2; 7]; + assert_eq!(rs.score_partial_haplotype(&haplotype, 2), 0); + + //fully wrong haplotype + let haplotype = vec![1, 0, 1, 1, 0, 0, 0]; + assert_eq!(rs.score_partial_haplotype(&haplotype, 1), 7); + for x in 0..haplotype.len() { + assert_eq!(rs.score_partial_haplotype(&haplotype[x..], 1+x), 7-x as u64); + } + } + + #[test] + fn test_collapse() { + let rs1 = ReadSegment::new( + "read_name".to_string(), + vec![3, 1, 0, 2, 1, 3, 3], + vec![0, 2, 1, 0, 2, 0, 0] + ); + let rs2 = ReadSegment::new( + "read_name".to_string(), + vec![3, 3, 0, 1, 0, 1, 1], + vec![0, 0, 1, 2, 2, 1, 1] + ); + let expected = ReadSegment::new( + "read_name".to_string(), + vec![3, 1, 0, 2, 2, 1, 1], + vec![0, 2, 1, 0, 0, 1, 1] + ); + + // make sure normal collapsing works + let collapsed = ReadSegment::collapse(&[rs1.clone(), rs2.clone()]); + assert_eq!(expected, collapsed); + assert_eq!(collapsed.first_allele, 1); + assert_eq!(collapsed.last_allele, 6); + + // make sure scoring works fine with the 3s present + // vec![3, 1, 0, 2, 2, 1, 1] + let haplotype = vec![0, 1, 0, 0, 0, 1, 0]; + assert_eq!(collapsed.score_haplotype(&haplotype), 1); + + // check stupid collapsing also + let collapsed = ReadSegment::collapse(&[rs1.clone()]); + assert_eq!(collapsed, rs1); + } +} \ No newline at end of file diff --git a/src/data_types/reference_genome.rs b/src/data_types/reference_genome.rs new file mode 100644 index 0000000..d1c5c10 --- /dev/null +++ b/src/data_types/reference_genome.rs @@ -0,0 +1,133 @@ + +use bio::io::fasta; +use flate2::bufread::MultiGzDecoder; +use 
log::{debug, info, warn}; +use rustc_hash::FxHashMap as HashMap; +use std::io::{BufRead, BufReader}; +use std::path::{Path, PathBuf}; + +/// Wrapper structure for a reference genome +pub struct ReferenceGenome { + /// The filename we loaded + filename: PathBuf, + /// Contains the keys in order of the reference load + contig_keys: Vec, + /// Map where keys are contig names and value is ASCII formatted sequence + contig_map: HashMap> +} + +impl ReferenceGenome { + /// Loads a reference genome from a given FASTA file + /// # Arguments + /// * `fasta_fn` - the FASTA filename, gzip is allowed + /// # Errors + /// This will pass through any error detected from loading the provided FASTA file. + /// This includes file reading and/or record reading errors. + pub fn from_fasta(fasta_fn: &Path) -> Result> { + info!("Loading {:?}...", fasta_fn); + let mut contig_keys: Vec = Default::default(); + let mut contig_map: HashMap> = Default::default(); + + // needletail can technically read FASTA and FASTQ, not sure we can check for that easy though + let fasta_file: std::fs::File = std::fs::File::open(fasta_fn)?; + let file_reader = BufReader::new(fasta_file); + let fasta_reader: fasta::Reader> = if fasta_fn.extension().unwrap_or_default() == "gz" { + debug!("Detected gzip extension, loading reference with MultiGzDecoder..."); + let gz_decoder = MultiGzDecoder::new(file_reader); + let bufreader = BufReader::new(gz_decoder); + fasta::Reader::from_bufread(Box::new(bufreader)) + } else { + debug!("Loading reference as plain-text file..."); + fasta::Reader::from_bufread(Box::new(file_reader)) + }; + + for entry in fasta_reader.records() { + let record: fasta::Record = entry?; + let seq_id: String = record.id().to_string(); + let sequence: Vec = record.seq().to_ascii_uppercase(); + + contig_keys.push(seq_id.clone()); + contig_map.insert(seq_id, sequence); + } + info!("Finished loading {} contigs.", contig_map.len()); + + Ok(ReferenceGenome { + filename: fasta_fn.to_path_buf(), + 
contig_keys, + contig_map + }) + } + + pub fn filename(&self) -> &Path { + &self.filename + } + + pub fn contig_keys(&self) -> &[String] { + &self.contig_keys + } + + /// Retrieves a reference slice from a given 0-based coordinates. + /// If `start` or `end` goes past the full contig length, it will be truncated to the full contig length. + /// # Arguments + /// * `chromosome` - the chromosome to slice from + /// * `start` - the 0-based start index (included) + /// * `end` - the 0-based end index (excluded) + /// # Panics + /// * if `chromosome` was not in the FASTA file + /// * if `start` > `end` + pub fn get_slice(&self, chromosome: &str, start: usize, end: usize) -> &[u8] { + let full_contig = self.contig_map.get(chromosome).expect("a chromosome from the reference file"); + assert!(start <= end, "start > end: {start} > {end}"); + let truncated_start = if start <= full_contig.len() { start } else { + warn!("Received get_slice({:?}, {}, {}), truncated start to {}", chromosome, start, end, full_contig.len()); + full_contig.len() + }; + let truncated_end = if end <= full_contig.len() { end } else { + warn!("Received get_slice({:?}, {}, {}), truncated end to {}", chromosome, start, end, full_contig.len()); + full_contig.len() + }; + &full_contig[truncated_start..truncated_end] + } + + /// Retrieves a full chromosome by name + /// # Arguments + /// * `chromosome` - the chromosome to slice from + /// # Panics + /// * if `chromosome` was not in the FASTA file + pub fn get_full_chromosome(&self, chromosome: &str) -> &[u8] { + let full_contig = self.contig_map.get(chromosome).expect("a chromosome from the reference file"); + full_contig + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::path::PathBuf; + #[test] + fn test_simple_reference() { + let references = vec![ + "./test_data/test_reference.fa", + "./test_data/test_reference.fa.gz" + ]; + for &reference_fn in references.iter() { + let simple_reference_fn: PathBuf = PathBuf::from(reference_fn); + let 
reference_genome = ReferenceGenome::from_fasta(&simple_reference_fn).unwrap(); + + assert_eq!(reference_genome.contig_keys(), &[ + "chr1".to_string(), + "chr2".to_string() + ]); + + //chr1 = ACGTACGT + let chr1_string: Vec = "ACGTACGT".as_bytes().to_vec(); + for i in 0..8 { + assert_eq!(reference_genome.get_slice(&"chr1", i, 8), &chr1_string[i..]); + } + + //chr2 = ACCATGTA + let chr1_string: Vec = "ACCATGTA".as_bytes().to_vec(); + assert_eq!(reference_genome.get_slice(&"chr2", 0, 8), chr1_string); + } + } +} \ No newline at end of file diff --git a/src/data_types/variants.rs b/src/data_types/variants.rs new file mode 100644 index 0000000..d5da244 --- /dev/null +++ b/src/data_types/variants.rs @@ -0,0 +1,758 @@ + +use crate::sequence_alignment::edit_distance; + +use log::trace; +use std::cmp::Ordering; + +/// All the variant types we are currently allowing +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, PartialOrd, Ord)] +pub enum VariantType { + /// REF and ALT are both length = 1 + Snv=0, + /// REF length = 1, ALT length > 1 + Insertion, + /// REF length > 1, ALT length = 1 + Deletion, + /// REF and ALT lengths > 1 + Indel, + /// Must have two alleles and be tagged with SVTYPE=INS + SvInsertion, + /// Must have two alleles and be tagged with SVTYPE=DEL + SvDeletion, + /// Must have two alleles and be tagged with SVTYPE=DUP + SvDuplication, + /// Must have two alleles and be tagged with SVTYPE=INV + SvInversion, + /// Must have two alleles and be tagged with SVTYPE=BND + SvBreakend, + /// Must have two alleles and be tagged with TRID=#### + TandemRepeat, + /// Something that doesn't match the above criteria, must be 1 or 2 alleles + Unknown // make sure Unknown is always the last one in the list +} + +/// Zygosity definitions, mostly used elsewhere +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, PartialOrd, Ord)] +pub enum Zygosity { + HomozygousReference=0, + Heterozygous, + HomozygousAlternate, + Unknown // make sure Unknown is always the last one in the 
list
}

/// A variant definition structure.
/// It currently assumes that chromosome is fixed and that the variant is a SNP.
#[derive(Debug)]
pub struct Variant {
    /// The vcf index from the input datasets
    vcf_index: usize,
    /// The type of variant represented by this entry
    variant_type: VariantType,
    /// The coordinate of the event in the VCF file, 0-based
    position: i64,
    /// The length of the reference allele
    ref_len: usize,
    /// number of reference bases prepended to each allele; the allele start (0-based) is position - prefix_len, which will be <= position
    prefix_len: usize,
    /// number of reference bases appended to each allele; the allele end (0-based, exclusive) is position + ref_len + postfix_len, which will be >= position+ref_len
    postfix_len: usize,
    /// the first allele value
    allele0: Vec<u8>,
    /// the second allele value
    allele1: Vec<u8>,

    //these only matter for multi-allelic sites; usize is "proper" type, but u8 will be nice and compact
    /// the index of allele0, typically 0 (REF)
    index_allele0: u8,
    /// the index of allele1, typically 1 (usually len(ALT) == 1, so it's 1)
    index_allele1: u8,

    // auxiliary booleans
    /// if true, flags this is a variant to ignore for _some_ reason
    is_ignored: bool
}

impl Variant {
    /// Creates a new single-nucleotide variant (SNV).
    /// For SNV variants, all alleles must be exactly 1 bp long.
    /// # Arguments
    /// * `vcf_index` - the index of the source VCF file
    /// * `position` - the coordinate of the variant in a contig
    /// * `allele0` - the first allele (usually REF)
    /// * `allele1` - the second allele (usually ALT[0])
    /// * `index_allele0` - the index for allele0, typically 0 for REF
    /// * `index_allele1` - the index for allele1, typically 1 for simple heterozygous variants
    /// # Panics
    /// * if `index_allele0 > index_allele1`
    /// * if the provided sequences do not match a single-nucleotide variant
    pub fn new_snv(vcf_index: usize, position: i64, allele0: Vec<u8>, allele1: Vec<u8>, index_allele0: u8, index_allele1: u8) -> Variant {
        // we always assume alleles come "sorted" and they are heterozygous
        assert!(index_allele0 < index_allele1);

        // SNV alleles must be length 1
        assert_eq!(allele0.len(), 1);
        assert_eq!(allele1.len(), 1);
        Variant {
            vcf_index,
            variant_type: VariantType::Snv,
            position,
            ref_len: 1,
            prefix_len: 0,
            postfix_len: 0,
            allele0,
            allele1,
            index_allele0,
            index_allele1,
            is_ignored: false
        }
    }

    /// Creates a new deletion variant.
    /// Deletions must have a REF allele longer than 1 bp, and all ALT alleles must be exactly 1 bp long.
    /// # Arguments
    /// * `vcf_index` - the index of the source VCF file
    /// * `position` - the coordinate of the variant in a contig
    /// * `ref_len` - the length of the reference allele
    /// * `allele0` - the first allele (usually REF)
    /// * `allele1` - the second allele (usually ALT[0])
    /// * `index_allele0` - the index for allele0, typically 0 for REF
    /// * `index_allele1` - the index for allele1, typically 1 for simple heterozygous variants
    /// # Panics
    /// * if `index_allele0 > index_allele1`
    /// * if the provided sequences do not match a deletion variant
    /// * if the reference allele is passed in and it does not have the same length as `ref_len`
    pub fn new_deletion(vcf_index: usize, position: i64, ref_len: usize, allele0: Vec<u8>, allele1: Vec<u8>, index_allele0: u8, index_allele1: u8) -> Variant {
        // we always assume alleles come "sorted" and they are heterozygous
        assert!(index_allele0 < index_allele1);

        // reference length must be greater than 1 to be a deletion
        assert!(ref_len > 1);

        if index_allele0 == 0 {
            // this allele is also the reference allele
            assert_eq!(allele0.len(), ref_len);
        } else {
            // this allele is not the reference, must be a multi-allelic site; but all deletion alts have len = 1
            assert!(allele0.len() == 1);
        }
        // this one must always be length 1
        assert!(allele1.len() == 1);

        // make sure the alleles start with the same thing
        // assert_eq!(allele0[0], allele1[0]);
        if allele0[0] != allele1[0] {
            /*
            Counter example to requiring alleleX[0] be equal; this is rare, but it does seem to happen
            chr12 117794450 . ACACACCAACATGCACACT T
            */
            trace!("Deletion alleles are unexpected: {position}, {ref_len}, {allele0:?}, {allele1:?}");
        }
        Variant {
            vcf_index,
            variant_type: VariantType::Deletion,
            position,
            ref_len,
            prefix_len: 0,
            postfix_len: 0,
            allele0,
            allele1,
            index_allele0,
            index_allele1,
            is_ignored: false
        }
    }

    /// Creates a new insertion variant.
    /// Insertions must have a REF allele exactly 1 bp long, and all ALT alleles must be longer than 1 bp.
    /// # Arguments
    /// * `vcf_index` - the index of the source VCF file
    /// * `position` - the coordinate of the variant in a contig
    /// * `allele0` - the first allele (usually REF)
    /// * `allele1` - the second allele (usually ALT[0])
    /// * `index_allele0` - the index for allele0, typically 0 for REF
    /// * `index_allele1` - the index for allele1, typically 1 for simple heterozygous variants
    /// # Panics
    /// * if `index_allele0 > index_allele1`
    /// * if the provided sequences do not match an insertion variant
    pub fn new_insertion(vcf_index: usize, position: i64, allele0: Vec<u8>, allele1: Vec<u8>, index_allele0: u8, index_allele1: u8) -> Variant {
        // we always assume alleles come "sorted" and they are heterozygous
        assert!(index_allele0 < index_allele1);

        if index_allele0 == 0 {
            // if reference allele is present, it must be length 1 for this type
            assert_eq!(allele0.len(), 1);
        } else {
            // allele0 isn't reference, so it must be >= 1 due to multi-allelics
            // chr1 2122634 . T C,TG 14.1
            assert!(!allele0.is_empty(), "{position} {allele0:?}");
        }
        // we have to do >= because of some multi-allelics:
        // chr1 286158 . A ATG,G 34.4
        assert!(!allele1.is_empty(), "{position} {allele1:?}");

        // make sure the alleles start with the same thing
        if allele0[0] != allele1[0] {
            // no counter example searched for yet, but probably exists, we'll leave this trace for now
            trace!("Insertion alleles are unexpected: {position}, 1, {allele0:?}, {allele1:?}");
        }
        Variant {
            vcf_index,
            variant_type: VariantType::Insertion,
            position,
            ref_len: 1,
            prefix_len: 0,
            postfix_len: 0,
            allele0,
            allele1,
            index_allele0,
            index_allele1,
            is_ignored: false
        }
    }

    /// Creates a new indel variant.
    /// All indels alleles must be more than 1 bp long.
    /// # Arguments
    /// * `vcf_index` - the index of the source VCF file
    /// * `position` - the coordinate of the variant in a contig
    /// * `ref_len` - the length of the reference allele
    /// * `allele0` - the first allele (usually REF)
    /// * `allele1` - the second allele (usually ALT[0])
    /// * `index_allele0` - the index for allele0, typically 0 for REF
    /// * `index_allele1` - the index for allele1, typically 1 for simple heterozygous variants
    /// # Panics
    /// * if `index_allele0 > index_allele1`
    /// * if the provided sequences do not match an indel variant
    /// * if the reference allele is passed in and it does not have the same length as `ref_len`
    pub fn new_indel(vcf_index: usize, position: i64, ref_len: usize, allele0: Vec<u8>, allele1: Vec<u8>, index_allele0: u8, index_allele1: u8) -> Variant {
        // we always assume alleles come "sorted" and they are heterozygous
        assert!(index_allele0 < index_allele1);

        // reference length must be greater than 1 to be an indel, but ALTs can really be any length after that (>=1 anyways)
        assert!(ref_len > 1);

        if index_allele0 == 0 {
            // this allele is also the reference allele
            assert_eq!(allele0.len(), ref_len);
        } else {
            // it's not a reference allele, since this is an indel, length can be anything >= 1
            assert!(!allele0.is_empty());
        }
        // this one just has to be >= 1
        assert!(!allele1.is_empty());

        // there's no real reason to believe in any shared sequence between alleles
        // we've seen it not work above, not worth even trying to codify warning here IMO
        // assert!(???)

        Variant {
            vcf_index,
            variant_type: VariantType::Indel,
            position,
            ref_len,
            prefix_len: 0,
            postfix_len: 0,
            allele0,
            allele1,
            index_allele0,
            index_allele1,
            is_ignored: false
        }
    }

    /// Creates a new SV deletion variant.
    /// SV deletions must have a REF allele longer than 1 bp, and all ALT alleles must be exactly 1 bp long.
    /// # Arguments
    /// * `vcf_index` - the index of the source VCF file
    /// * `position` - the coordinate of the variant in a contig
    /// * `ref_len` - the length of the reference allele
    /// * `allele0` - the first allele (usually REF)
    /// * `allele1` - the second allele (usually ALT[0])
    /// * `index_allele0` - the index for allele0, typically 0 for REF
    /// * `index_allele1` - the index for allele1, typically 1 for simple heterozygous variants
    /// # Panics
    /// * if `index_allele0 > index_allele1`
    /// * if the provided sequences do not match a deletion variant
    /// * if the reference allele is passed in and it does not have the same length as `ref_len`
    pub fn new_sv_deletion(vcf_index: usize, position: i64, ref_len: usize, allele0: Vec<u8>, allele1: Vec<u8>, index_allele0: u8, index_allele1: u8) -> Variant {
        // we always assume alleles come "sorted" and they are heterozygous
        assert!(index_allele0 < index_allele1);

        // this is one difference from plain Deletion
        assert_eq!(index_allele0, 0);
        assert_eq!(index_allele1, 1);

        // this allele is also the reference allele
        assert_eq!(allele0.len(), ref_len);

        // this one must always be length 1
        assert!(allele1.len() == 1);

        // make sure the alleles start with the same thing
        if allele0[0] != allele1[0] {
            /*
            Counter example to requiring alleleX[0] be equal; this is rare, but it does seem to happen
            chr12 117794450 . ACACACCAACATGCACACT T
            */
            trace!("Deletion alleles are unexpected: {}, {}, {:?}, {:?}", position, ref_len, allele0, allele1);
        }
        Variant {
            vcf_index,
            variant_type: VariantType::SvDeletion,
            position,
            ref_len,
            prefix_len: 0,
            postfix_len: 0,
            allele0,
            allele1,
            index_allele0,
            index_allele1,
            is_ignored: false
        }
    }

    /// Creates a new SV insertion variant.
    /// SV insertions must have a REF allele exactly 1 bp long, and all ALT alleles must be longer than 1 bp.
    /// # Arguments
    /// * `vcf_index` - the index of the source VCF file
    /// * `position` - the coordinate of the variant in a contig
    /// * `allele0` - the first allele (usually REF)
    /// * `allele1` - the second allele (usually ALT[0])
    /// * `index_allele0` - the index for allele0, typically 0 for REF
    /// * `index_allele1` - the index for allele1, typically 1 for simple heterozygous variants
    /// # Panics
    /// * if `index_allele0 > index_allele1`
    /// * if the provided sequences do not match an insertion variant
    pub fn new_sv_insertion(vcf_index: usize, position: i64, allele0: Vec<u8>, allele1: Vec<u8>, index_allele0: u8, index_allele1: u8) -> Variant {
        // we always assume alleles come "sorted" and they are heterozygous
        assert!(index_allele0 < index_allele1);

        // this is one difference from plain Insertion
        assert_eq!(index_allele0, 0);
        assert_eq!(index_allele1, 1);

        // if reference allele is present, it must be length 1 for this type
        assert_eq!(allele0.len(), 1);

        // we have to do >= because of some multi-allelics:
        // chr1 286158 . A ATG,G 34.4
        assert!(!allele1.is_empty(), "{position} {allele1:?}");

        // make sure the alleles start with the same thing
        if allele0[0] != allele1[0] {
            // no counter example searched for yet, but probably exists, we'll leave this trace for now
            trace!("Insertion alleles are unexpected: {}, {}, {:?}, {:?}", position, 1, allele0, allele1);
        }
        Variant {
            vcf_index,
            variant_type: VariantType::SvInsertion,
            position,
            ref_len: 1,
            prefix_len: 0,
            postfix_len: 0,
            allele0,
            allele1,
            index_allele0,
            index_allele1,
            is_ignored: false
        }
    }

    /// Creates a new tandem repeat variant, functionally these act very similar to indel types.
    /// All tandem repeat alleles must be at least 1 bp long by VCF definition.
    /// # Arguments
    /// * `vcf_index` - the index of the source VCF file
    /// * `position` - the coordinate of the variant in a contig
    /// * `ref_len` - the length of the reference allele
    /// * `allele0` - the first allele (usually REF)
    /// * `allele1` - the second allele (usually ALT[0])
    /// * `index_allele0` - the index for allele0, typically 0 for REF
    /// * `index_allele1` - the index for allele1, typically 1 for simple heterozygous variants
    /// # Panics
    /// * if `index_allele0 > index_allele1`
    /// * if the provided sequences do not match a tandem repeat variant
    /// * if the reference allele is passed in and it does not have the same length as `ref_len`
    pub fn new_tandem_repeat(vcf_index: usize, position: i64, ref_len: usize, allele0: Vec<u8>, allele1: Vec<u8>, index_allele0: u8, index_allele1: u8) -> Variant {
        // we always assume alleles come "sorted" and they are heterozygous
        assert!(index_allele0 < index_allele1);

        // all alleles must be >= 1 for tandem repeats, most are longer though
        assert!(ref_len >= 1);

        if index_allele0 == 0 {
            // this allele is also the reference allele
            assert_eq!(allele0.len(), ref_len);
        } else {
            // it's not a reference allele, since this is an indel, length can be anything >= 1
            assert!(!allele0.is_empty());
        }
        // this one just has to be >= 1
        assert!(!allele1.is_empty());

        Variant {
            vcf_index,
            variant_type: VariantType::TandemRepeat,
            position,
            ref_len,
            prefix_len: 0,
            postfix_len: 0,
            allele0,
            allele1,
            index_allele0,
            index_allele1,
            is_ignored: false
        }
    }

    /// This will add a prefix to each allele, generally reference genome sequence that will allow for better matching.
    /// # Arguments
    /// * `prefix` - the sequence to pre-pend to each allele
    pub fn add_reference_prefix(&mut self, prefix: &[u8]) {
        // make sure we don't set our reference start coordinate to less than 0
        let prefix_len: usize = prefix.len();
        assert!(prefix_len <= self.position as usize - self.prefix_len);

        // allele0, pre-pend is basically copy
        let mut new_allele0: Vec<u8> = Vec::with_capacity(self.allele0.len()+prefix_len);
        new_allele0.extend_from_slice(prefix);
        new_allele0.extend_from_slice(&self.allele0);
        self.allele0 = new_allele0;

        // same for allele1
        let mut new_allele1: Vec<u8> = Vec::with_capacity(self.allele1.len()+prefix_len);
        new_allele1.extend_from_slice(prefix);
        new_allele1.extend_from_slice(&self.allele1);
        self.allele1 = new_allele1;

        // finally, adjust the start coordinates
        self.prefix_len += prefix_len;
    }

    /// This will add a postfix to each allele, generally reference genome sequence that will allow for better matching.
    /// # Arguments
    /// * `postfix` - the sequence to append to each allele
    pub fn add_reference_postfix(&mut self, postfix: &[u8]) {
        // easier operation, just extend the existing vecs
        self.allele0.extend_from_slice(postfix);
        self.allele1.extend_from_slice(postfix);

        // finally, adjust the end coordinates
        self.postfix_len += postfix.len();
    }

    /// This will trim the postfix down to a smaller size.
    /// # Arguments
    /// * `truncate_amount` - the number of postfix bases to remove from the end of each allele
    pub fn truncate_reference_postfix(&mut self, truncate_amount: usize) {
        // sanity check that we are only truncating the postfix
        assert!(truncate_amount <= self.postfix_len);

        // truncate the alleles and shrink the postfix size
        self.allele0.truncate(self.allele0.len() - truncate_amount);
        self.allele1.truncate(self.allele1.len() - truncate_amount);
        self.postfix_len -= truncate_amount;
    }

    pub fn get_vcf_index(&self) -> usize {
        self.vcf_index
    }

    pub fn get_type(&self) -> VariantType {
        self.variant_type
    }

    pub fn position(&self) -> i64 {
        self.position
    }

    pub fn get_ref_len(&self) -> usize {
        self.ref_len
    }

    pub fn get_prefix_len(&self) -> usize {
        self.prefix_len
    }

    pub fn get_postfix_len(&self) -> usize {
        self.postfix_len
    }

    pub fn get_allele0(&self) -> &[u8] {
        &self.allele0
    }

    pub fn get_allele1(&self) -> &[u8] {
        &self.allele1
    }

    pub fn is_ignored(&self) -> bool {
        self.is_ignored
    }

    pub fn set_ignored(&mut self) {
        self.is_ignored = true;
    }

    /// Returns allele0 with any added reference prefix/postfix stripped off
    pub fn get_truncated_allele0(&self) -> &[u8] {
        let start: usize = self.prefix_len;
        let end: usize = self.allele0.len() - self.postfix_len;
        &self.allele0[start..end]
    }

    /// Returns allele1 with any added reference prefix/postfix stripped off
    pub fn get_truncated_allele1(&self) -> &[u8] {
        let start: usize = self.prefix_len;
        let end: usize = self.allele1.len() - self.postfix_len;
        &self.allele1[start..end]
    }

    /// This will determine the best matching allele (0 or 1) or return 2 if neither match.
    /// Primary purpose of this is to convert all variant observations into a 0/1 scheme.
    /// This method requires an exact match of the allele.
    /// # Arguments
    /// * `allele` - the allele that needs to get converted to a 0 or 1 (or 2 if neither match)
    pub fn match_allele(&self, allele: &[u8]) -> u8 {
        if allele == &self.allele0[..] {
            0
        } else if allele == &self.allele1[..] {
            1
        } else {
            2
        }
    }

    /// This will determine the closest matching allele (0 or 1) based on edit distance, or return 2 if they are equi-distant.
    /// This method does not require an exact match of the alleles.
    /// Returns a tuple of the (allele chosen, min edit distance, other edit distance).
    /// # Arguments
    /// * `allele` - the allele sequence to compare to our internal alleles
    pub fn closest_allele(&self, allele: &[u8]) -> (u8, usize, usize) {
        self.closest_allele_clip(allele, 0, 0)
    }

    /// This will determine the closest matching allele (0 or 1) based on edit distance, or return 2 if they are equi-distant.
    /// This method does not require an exact match of the alleles, and allows for you to clip bases on the internal allele sequence.
    /// This is most useful when you have to clip the provided allele due to incomplete matching.
    /// Returns a tuple of the (allele chosen, min edit distance, other edit distance).
    /// # Arguments
    /// * `allele` - the allele sequence to compare to our internal alleles
    /// * `head_clip` - number of bases clipped off the start of each internal allele (must be <= prefix_len)
    /// * `tail_clip` - number of bases clipped off the end of each internal allele (must be <= postfix_len)
    pub fn closest_allele_clip(&self, allele: &[u8], head_clip: usize, tail_clip: usize) -> (u8, usize, usize) {
        assert!(head_clip <= self.prefix_len);
        assert!(tail_clip <= self.postfix_len);
        let d0: usize = edit_distance(allele, &self.allele0[head_clip..(self.allele0.len() - tail_clip)]);
        let d1: usize = edit_distance(allele, &self.allele1[head_clip..(self.allele1.len() - tail_clip)]);
        trace!("clipping: {} {}", head_clip, tail_clip);
        trace!("obs{:?}", allele);
        trace!("a0 {:?} => {}", &self.allele0[head_clip..(self.allele0.len() - tail_clip)], d0);
        trace!("a1 {:?} => {}", &self.allele1[head_clip..(self.allele1.len() - tail_clip)], d1);
        match d0.cmp(&d1) {
            // d0 is less, return that
            Ordering::Less => (0, d0, d1),
            // d1 is less, return that
            Ordering::Greater => (1, d1, d0),
            // equidistant, so undetermined
            Ordering::Equal => (2, d0, d1)
        }
    }

    /// This will return the index allele for a given haplotype index.
    /// Input must always be 0 or 1, but it might get converted to something else at multi-allelic sites.
    /// # Arguments
    /// * `index` - must be 0 or 1
    pub fn convert_index(&self, index: u8) -> u8 {
        if index == 0 {
            self.index_allele0
        } else if index == 1 {
            self.index_allele1
        } else if index == 2 {
            // we just need some indicator that it's undetermined, this will work for now
            u8::MAX
        } else {
            panic!("index must be 0, 1, or 2");
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_snv() {
        let variant = Variant::new_snv(
            0, 1,
            b"A".to_vec(), b"C".to_vec(),
            0, 1
        );
        assert_eq!(variant.get_type(), VariantType::Snv);
        assert_eq!(variant.position(), 1);
        assert_eq!(variant.get_ref_len(), 1);
        assert_eq!(variant.match_allele(b"A"), 0);
        assert_eq!(variant.match_allele(b"C"), 1);
        assert_eq!(variant.match_allele(b"G"), 2);
        assert_eq!(variant.match_allele(b"T"), 2);
        assert_eq!(variant.convert_index(0), 0);
        assert_eq!(variant.convert_index(1), 1);
        assert_eq!(variant.convert_index(2), u8::MAX);
    }

    #[test]
    fn test_basic_deletion() {
        // this is the deletion we mostly expect
        let variant = Variant::new_deletion(
            0, 10, 3,
            b"AGT".to_vec(), b"A".to_vec(),
            0, 1
        );
        assert_eq!(variant.get_type(), VariantType::Deletion);
        assert_eq!(variant.position(), 10);
        assert_eq!(variant.get_ref_len(), 3);
        assert_eq!(variant.match_allele(b"AGT"), 0);
        assert_eq!(variant.match_allele(b"A"), 1);
        assert_eq!(variant.match_allele(b"AG"), 2);

        // multi-allelic deletion, must still be length 1 though
        let variant = Variant::new_deletion(
            0, 10, 4,
            b"C".to_vec(), b"A".to_vec(),
            1, 2
        );
        assert_eq!(variant.get_type(), VariantType::Deletion);
        assert_eq!(variant.position(), 10);
        assert_eq!(variant.get_ref_len(), 4);
        assert_eq!(variant.match_allele(b"ACCC"), 2);
        assert_eq!(variant.match_allele(b"C"), 0);
        assert_eq!(variant.match_allele(b"A"), 1);
        assert_eq!(variant.convert_index(0), 1);
        assert_eq!(variant.convert_index(1), 2);
    }

    #[test]
    fn test_basic_insertion() {
        let variant = Variant::new_insertion(
            0, 20,
            b"A".to_vec(), b"AGT".to_vec(),
            0, 1
        );
        assert_eq!(variant.get_type(), VariantType::Insertion);
        assert_eq!(variant.position(), 20);
        assert_eq!(variant.get_ref_len(), 1);
        assert_eq!(variant.match_allele(b"A"), 0);
        assert_eq!(variant.match_allele(b"AGT"), 1);
        assert_eq!(variant.match_allele(b"AG"), 2);
    }

    #[test]
    fn test_basic_indel() {
        // models AG -> A / AGT
        let variant = Variant::new_indel(
            0, 20, 2,
            b"A".to_vec(), b"AGT".to_vec(),
            1, 2
        );
        assert_eq!(variant.get_type(), VariantType::Indel);
        assert_eq!(variant.position(), 20);
        assert_eq!(variant.get_ref_len(), 2);
        assert_eq!(variant.match_allele(b"A"), 0);
        assert_eq!(variant.match_allele(b"AGT"), 1);
        assert_eq!(variant.match_allele(b"AG"), 2);
    }

    #[test]
    fn test_sv_insertion() {
        let variant = Variant::new_sv_insertion(
            0, 20,
            b"A".to_vec(), b"AGT".to_vec(),
            0, 1
        );
        assert_eq!(variant.get_type(), VariantType::SvInsertion);
        assert_eq!(variant.position(), 20);
        assert_eq!(variant.get_ref_len(), 1);

        // TODO: replace this with the matching we will do with SVs
        assert_eq!(variant.match_allele(b"A"), 0);
        assert_eq!(variant.match_allele(b"AGT"), 1);
        assert_eq!(variant.match_allele(b"AG"), 2);
    }

    #[test]
    fn test_sv_deletion() {
        let variant = Variant::new_sv_deletion(
            0, 10, 3,
            b"AGT".to_vec(), b"A".to_vec(),
            0, 1
        );
        assert_eq!(variant.get_type(), VariantType::SvDeletion);
        assert_eq!(variant.position(), 10);
        assert_eq!(variant.get_ref_len(), 3);

        // TODO: replace this with the matching we will do with SVs
        assert_eq!(variant.match_allele(b"AGT"), 0);
        assert_eq!(variant.match_allele(b"A"), 1);
        assert_eq!(variant.match_allele(b"AG"), 2);
    }

    #[test]
    fn test_tandem_repeat() {
        let variant = Variant::new_tandem_repeat(
            0, 10, 4,
            b"AAAC".to_vec(),
            b"AAACAAAC".to_vec(),
            0, 1
        );
        assert_eq!(variant.get_type(), VariantType::TandemRepeat);
        assert_eq!(variant.position(), 10);
        assert_eq!(variant.get_ref_len(), 4);

        assert_eq!(variant.match_allele(b"AAAC"), 0);
        assert_eq!(variant.match_allele(b"AAACAAAC"), 1);
        assert_eq!(variant.match_allele(b"AAACAA"), 2);
    }

    #[test]
    fn test_reference_adjustment() {
        // models AG -> A / AGT
        let mut variant = Variant::new_indel(
            0, 20, 2,
            b"A".to_vec(), b"AGT".to_vec(),
            1, 2
        );

        // make sure no fixins yet
        assert_eq!(variant.get_prefix_len(), 0);
        assert_eq!(variant.get_postfix_len(), 0);

        let prefix: Vec<u8> = b"AC".to_vec();
        variant.add_reference_prefix(&prefix);
        let postfix: Vec<u8> = b"GGCC".to_vec();
        variant.add_reference_postfix(&postfix);

        assert_eq!(variant.get_truncated_allele0(), b"A");
        assert_eq!(variant.get_truncated_allele1(), b"AGT");

        // trims off the extra 'C' we added
        variant.truncate_reference_postfix(1);

        // make sure nothing here changes
        assert_eq!(variant.get_type(), VariantType::Indel);
        assert_eq!(variant.position(), 20);
        assert_eq!(variant.get_ref_len(), 2);

        // check this new stuff
        assert_eq!(variant.get_prefix_len(), 2);
        assert_eq!(variant.get_postfix_len(), 3);

        // original alleles will not match exactly anymore
        assert_eq!(variant.match_allele(b"A"), 2);
        assert_eq!(variant.match_allele(b"AGT"), 2);
        assert_eq!(variant.match_allele(b"AG"), 2);

        // inexact without the reference data will return weird results
        assert_eq!(variant.closest_allele(b"A"), (0, 5, 7));
        assert_eq!(variant.closest_allele(b"AGT"), (0, 4, 5));
        assert_eq!(variant.closest_allele(b"AG"), (0, 4, 6));

        // now lets inexact with the extensions
        assert_eq!(variant.closest_allele(b"ACAGGC"), (0, 0, 2));
        assert_eq!(variant.closest_allele(b"ACAGTGGC"), (1, 0, 2));
        assert_eq!(variant.closest_allele(b"ACAGGGC"), (2, 1, 1));
    }
}
diff
--git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..66eff21 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,19 @@ + +/// A*-based phasing implementation +pub mod astar_phaser; +/// Functionality that iterates over VCF and BAM to form the prototype phase blocks +pub mod block_gen; +/// CLI functionality and checks +pub mod cli; +/// Contains multiple wrappers for useful data types in HiPhase +pub mod data_types; +/// Organizes primary workflow for a phase block including loading variants from VCF, loading reads from BAMs, running the phaser, and bundling the results +pub mod phaser; +/// Components for loading reads from a BAM file and converting them into haplotype observations +pub mod read_parsing; +/// Basic helpful utilities for pairwise sequence alignment +pub mod sequence_alignment; +/// Graph-based WFA - this is basically POA + WFA, but only allowing for measuring edit distance and no loops +pub mod wfa_graph; +/// Contains all the various output writer functionality +pub mod writers; diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..ee6b876 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,639 @@ + +use hiphase::block_gen::{MultiPhaseBlockIterator, PhaseBlockIterator, get_vcf_samples, get_sample_bams}; +use hiphase::cli::{Settings,check_settings,get_raw_settings}; +use hiphase::data_types::reference_genome::ReferenceGenome; +use hiphase::phaser::{HaplotagResult, PhaseResult, solve_block, singleton_block}; +use hiphase::writers::block_stats::BlockStatsCollector; +use hiphase::writers::haplotag_writer::HaplotagWriter; +use hiphase::writers::ordered_bam_writer::OrderedBamWriter; +use hiphase::writers::ordered_vcf_writer::OrderedVcfWriter; +use hiphase::writers::phase_stats::StatsWriter; +use hiphase::writers::vcf_util::build_bcf_index; + +use log::{LevelFilter, debug, error, info, warn}; +use rustc_hash::FxHashMap as HashMap; +use std::path::PathBuf; +use std::sync::{Arc, mpsc}; +use std::time::Instant; +use threadpool::ThreadPool; 
+ +fn main() { + // get the settings + let settings: Settings = get_raw_settings(); + let filter_level: LevelFilter = match settings.verbosity { + 0 => LevelFilter::Info, + 1 => LevelFilter::Debug, + _ => LevelFilter::Trace + }; + + // immediately setup logging first + env_logger::builder() + .format_timestamp_millis() + .filter_level(filter_level) + .init(); + + // okay, now we can check all the other settings + let cli_settings: Settings = check_settings(settings); + + // first we need to figure out which samples are getting phased + let mut sample_names: Vec = cli_settings.sample_names.clone(); + if sample_names.is_empty() { + // no samples were provided, so add the first one encountered + // we need to just infer that we're phasing the first one only for now + let all_sample_names = match get_vcf_samples(&cli_settings.vcf_filenames[0]) { + Ok(v) => v, + Err(e) => { + error!("Error during VCF sample name parsing: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + + // some warnings as needed + if all_sample_names.len() > 1 { + warn!("Multi-sample VCF detected, but sample name was not provided. Assuming name is {:?}.", all_sample_names[0]); + } else { + debug!("Single-sample VCF detected, but sample name was not provided. 
Assuming name is {:?}.", all_sample_names[0]); + } + sample_names.push(all_sample_names[0].clone()); + } + + // if we are ignoring read groups, we need to verify only one sample is in use + if cli_settings.ignore_read_groups && sample_names.len() > 1 { + error!("Flag --ignore-read-groups cannot be used in conjuction with multiple sample names, either add read groups or run one sample name at a time."); + std::process::exit(exitcode::USAGE); + } + + // shared thread pool for bam IO + let bam_thread_pool = match rust_htslib::tpool::ThreadPool::new(cli_settings.io_threads.unwrap() as u32) { + Ok(btp) => btp, + Err(e) => { + error!("Error while starting thread pool: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + + //here's where the fun starts + //generate blocks + let mut block_iterators: Vec = vec![]; + let mut all_used_bams = vec![]; + let mut sample_to_bams: HashMap> = Default::default(); + let mut sample_to_output_bams: HashMap> = Default::default(); + for sample_name in sample_names.iter() { + // figure out which BAMS go with the given sample + let (mut sample_bams, bam_indices) = if cli_settings.ignore_read_groups { + // if we are ignoring read groups, then we use all bams (and all indices) + ( + cli_settings.bam_filenames.clone(), + (0..cli_settings.bam_filenames.len()).collect() + ) + } else { + match get_sample_bams(&cli_settings.bam_filenames, sample_name, &cli_settings.reference_filename) { + Ok(sb) => sb, + Err(e) => { + error!("Error during BAM read group parsing: {}", e); + std::process::exit(exitcode::IOERR); + } + } + }; + sample_to_bams.insert(sample_name.clone(), sample_bams.clone()); + + // make a phase block iterator using just the sample-specific bams + let block_iterator: PhaseBlockIterator = match PhaseBlockIterator::new( + &cli_settings.vcf_filenames, + &sample_bams, + &cli_settings.reference_filename, + sample_name.clone(), + cli_settings.min_variant_quality, + cli_settings.min_mapping_quality, + cli_settings.min_spanning_reads, + 
!cli_settings.disable_supplemental_joins, + &bam_thread_pool + ) { + Ok(bi) => bi, + Err(e) => { + error!("Error during file loading: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + + // add the iterator to our list to put together + block_iterators.push(block_iterator); + + // also save the used bams, we will check these soon + all_used_bams.append(&mut sample_bams); + + // check if we need to save names for BAM writing + if !cli_settings.output_bam_filenames.is_empty() { + let mut sample_output_bams = vec![]; + for &b_index in bam_indices.iter() { + sample_output_bams.push(cli_settings.output_bam_filenames[b_index].clone()); + } + sample_to_output_bams.insert(sample_name.clone(), sample_output_bams); + } + } + + if cli_settings.bam_filenames.len() != all_used_bams.len() { + let num_provided = cli_settings.bam_filenames.len(); + let num_used = all_used_bams.len(); + error!("User provided {} BAM files, but only {} matched samples for phasing", num_provided, num_used); + error!("Please remove extra BAM files or add additional samples, BAMs matching phasing: {:?}", all_used_bams); + std::process::exit(exitcode::IOERR); + } + + // create our joint iterator + let mut block_iterator: MultiPhaseBlockIterator = match MultiPhaseBlockIterator::new(block_iterators) { + Ok(mpbi) => mpbi, + Err(e) => { + error!("Error during phase block iterator creation: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + + // this writer will write "in-order" provided we correctly pass the ordering of data to it + let mut vcf_writer: OrderedVcfWriter = match OrderedVcfWriter::new( + &cli_settings.vcf_filenames, + &cli_settings.output_vcf_filenames, + cli_settings.min_variant_quality, + &sample_names + ) { + Ok(vw) => vw, + Err(e) => { + error!("Error during VCF writer creation: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + + // this write will write reads "in-order" provided we correctly pass the ordering of data to it + let mut opt_bam_writers: Option> = if 
cli_settings.output_bam_filenames.is_empty() { + None + } else { + let mut writer_map: HashMap = Default::default(); + for sample_name in sample_names.iter() { + let sample_bams = sample_to_bams.get(sample_name).unwrap(); + let sample_output_bams = sample_to_output_bams.get(sample_name).unwrap(); + writer_map.insert( + sample_name.clone(), + match OrderedBamWriter::new( + sample_name.clone(), + &cli_settings.reference_filename, + sample_bams, + sample_output_bams, + &bam_thread_pool + ) { + Ok(bw) => bw, + Err(e) => { + error!("Error during BAM writer creation: {}", e); + std::process::exit(exitcode::IOERR); + } + } + ); + } + Some(writer_map) + }; + + // create our stats file also + let mut stats_writer: Option = match cli_settings.stats_filename { + Some(ref filename) => { + match StatsWriter::new(filename) { + Ok(sw) => Some(sw), + Err(e) => { + error!("Error during statistics writer creation: {}", e); + std::process::exit(exitcode::IOERR); + } + } + }, + None => None + }; + + // create our block stats collector + let mut block_collector: BlockStatsCollector = BlockStatsCollector::new(); + + let skip_count = cli_settings.skip_blocks; + let take_count = cli_settings.take_blocks; + let debug_run: bool = if skip_count != 0 || take_count != usize::MAX { + warn!("Debug run detected, disabling file finalizing steps."); + warn!("Blocks to skip: {}", skip_count); + warn!("Blocks to process: {}", take_count); + true + } else { + false + }; + + // create our haplotag file if necessary + let mut haplotag_writer: Option = match cli_settings.haplotag_filename { + Some(ref filename) => { + match HaplotagWriter::new(filename) { + Ok(hw) => Some(hw), + Err(e) => { + error!("Error during haplotag writer creations: {}", e); + std::process::exit(exitcode::IOERR); + } + } + }, + None => None + }; + + // controls whether singletons are deeply run, including haplotagging + let phase_singletons: bool = cli_settings.phase_singletons; + + // get our reference genome if we have one + let 
reference_genome: ReferenceGenome = match ReferenceGenome::from_fasta(&cli_settings.reference_filename) { + Ok(rg) => rg, + Err(e) => { + error!("Error during reference loading: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + + // we have to do this because we need access to the reference genome later also + let arc_reference_genome: Arc = Arc::new(reference_genome); + + //process the blocks (eventually in parallel) + let start_time: Instant = Instant::now(); + let mut total_variants: u64 = 0; + let mut results_received: u64 = 0; + + // values related to printing + const UPDATE_SPEED: u64 = 100; + + if cli_settings.threads <= 1 { + for (i, block_result) in block_iterator.by_ref().enumerate().skip(skip_count).take(take_count) { + let block = match block_result { + Ok(b) => b, + Err(e) => { + error!("Error while parsing VCF file: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + debug!("block {}: {:?} {}", i, block, block.bp_len()); + + // we likely need to separate out the phase result from the haplotag result + let sample_bams = sample_to_bams.get(block.sample_name()).unwrap(); + let (phase_result, haplotag_result): (PhaseResult, HaplotagResult) = if phase_singletons || block.get_num_variants() > 1 { + match solve_block( + &block, + &cli_settings.vcf_filenames, + sample_bams, + &arc_reference_genome, + cli_settings.reference_buffer, + cli_settings.min_matched_alleles, + cli_settings.min_mapping_quality, + cli_settings.global_realign_cputime, + cli_settings.phase_min_queue_size, + cli_settings.phase_queue_increment, + cli_settings.wfa_prune_distance + ) { + Ok(r) => r, + Err(e) => { + error!("Error while processing {:?}:", block); + error!(" {}", e); + std::process::exit(exitcode::SOFTWARE); + } + } + } else { + singleton_block(&block) + }; + + // this is only for printing + total_variants += phase_result.phase_block.get_num_variants() as u64; + results_received += 1; + + process_results( + phase_result, haplotag_result, + &mut stats_writer, &mut 
block_collector, &mut haplotag_writer, + &mut vcf_writer, &mut opt_bam_writers, + ); + + if results_received % UPDATE_SPEED == 0 { + let time_so_far: f64 = start_time.elapsed().as_secs_f64(); + let blocks_per_sec: f64 = results_received as f64 / time_so_far; + let variants_per_sec: f64 = total_variants as f64 / time_so_far; + info!("Received results for {} phase blocks: {:.4} blocks/sec, {:.4} hets/sec, writer waiting on block {}", results_received, blocks_per_sec, variants_per_sec, vcf_writer.get_wait_block()); + } + } + } else { + //set up job configuration + info!("Starting job pool with {} threads...", cli_settings.threads); + let job_slots: u64 = 40 * cli_settings.threads as u64; + let mut jobs_queued: u64 = 0; + + //we need to set up the multiprocessing components now + let pool = ThreadPool::new(cli_settings.threads); + let (tx, rx) = mpsc::channel(); + let arc_cli_settings: Arc = Arc::new(cli_settings.clone()); + let arc_sample_to_bams = Arc::new(sample_to_bams.clone()); + + for (i, block_result) in block_iterator.by_ref().enumerate().skip(skip_count).take(take_count) { + // make sure no panics encountered so far + if pool.panic_count() > 0 { + error!("Panic detected in ThreadPool, check above for details."); + std::process::exit(exitcode::SOFTWARE); + } + + if jobs_queued - results_received >= job_slots { + let (phase_result, haplotag_result): (PhaseResult, HaplotagResult) = rx.recv().unwrap(); + + // this is only for printing + total_variants += phase_result.phase_block.get_num_variants() as u64; + results_received += 1; + + process_results( + phase_result, haplotag_result, + &mut stats_writer, &mut block_collector, &mut haplotag_writer, + &mut vcf_writer, &mut opt_bam_writers + ); + + if results_received % UPDATE_SPEED == 0 { + let time_so_far: f64 = start_time.elapsed().as_secs_f64(); + let blocks_per_sec: f64 = results_received as f64 / time_so_far; + let variants_per_sec: f64 = total_variants as f64 / time_so_far; + info!("Received results for {} 
phase blocks: {:.4} blocks/sec, {:.4} hets/sec, writer waiting on block {}", results_received, blocks_per_sec, variants_per_sec, vcf_writer.get_wait_block()); + } + } + + let block = match block_result { + Ok(b) => b, + Err(e) => { + error!("Error while parsing VCF file: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + debug!("block {}: {:?} {}", i, block, block.bp_len()); + + jobs_queued += 1; + if jobs_queued % UPDATE_SPEED == 0 { + info!("Generated {} phase blocks, latest block: {:?}", jobs_queued, block); + } + + if phase_singletons || block.get_num_variants() > 1 { + let tx = tx.clone(); + let arc_cli_settings = arc_cli_settings.clone(); + let arc_reference_genome = arc_reference_genome.clone(); + let arc_sample_to_bams = arc_sample_to_bams.clone(); + + pool.execute(move|| { + let sample_bams = arc_sample_to_bams.get(block.sample_name()).unwrap(); + // dynamic errors cannot be sent via mpsc, so we need to handle errors here + let all_results = match solve_block( + &block, + &arc_cli_settings.vcf_filenames, + sample_bams, + &arc_reference_genome, + arc_cli_settings.reference_buffer, + arc_cli_settings.min_matched_alleles, + arc_cli_settings.min_mapping_quality, + arc_cli_settings.global_realign_cputime, + arc_cli_settings.phase_min_queue_size, + arc_cli_settings.phase_queue_increment, + arc_cli_settings.wfa_prune_distance + ) { + Ok(r) => r, + Err(e) => { + error!("Error while processing {:?}:", block); + error!(" {}", e); + std::process::exit(exitcode::SOFTWARE); + } + }; + tx.send(all_results).expect("channel will be there waiting for the pool"); + }); + } else { + // this is a singleton we can short-circuit here + let (phase_result, haplotag_result): (PhaseResult, HaplotagResult) = + singleton_block(&block); + + // this is only for printing + total_variants += phase_result.phase_block.get_num_variants() as u64; + results_received += 1; + + process_results( + phase_result, haplotag_result, + &mut stats_writer, &mut block_collector, &mut 
haplotag_writer, + &mut vcf_writer, &mut opt_bam_writers + ); + + if results_received % UPDATE_SPEED == 0 { + let time_so_far: f64 = start_time.elapsed().as_secs_f64(); + let blocks_per_sec: f64 = results_received as f64 / time_so_far; + let variants_per_sec: f64 = total_variants as f64 / time_so_far; + info!("Received results for {} phase blocks: {:.4} blocks/sec, {:.4} hets/sec, writer waiting on block {}", results_received, blocks_per_sec, variants_per_sec, vcf_writer.get_wait_block()); + } + } + } + + while results_received < jobs_queued { + // make sure no panics encountered so far + // TODO: if we hit deadlocks from panics, we may need to add this with some sort of suitable timeout: + // https://doc.rust-lang.org/std/sync/mpsc/struct.Receiver.html#method.recv_timeout + if pool.panic_count() > 0 { + error!("Panic detected in ThreadPool, check above for details."); + std::process::exit(exitcode::SOFTWARE); + } + + let (phase_result, haplotag_result): (PhaseResult, HaplotagResult) = rx.recv().unwrap(); + + // this is only for printing + total_variants += phase_result.phase_block.get_num_variants() as u64; + results_received += 1; + + process_results( + phase_result, haplotag_result, + &mut stats_writer, &mut block_collector, &mut haplotag_writer, + &mut vcf_writer, &mut opt_bam_writers + ); + + // do an update if we're on the mod of our speed OR it's the last one for a thread + if results_received % UPDATE_SPEED == 0 || (jobs_queued - results_received) < cli_settings.threads as u64 { + let time_so_far: f64 = start_time.elapsed().as_secs_f64(); + let blocks_per_sec: f64 = results_received as f64 / time_so_far; + let variants_per_sec: f64 = total_variants as f64 / time_so_far; + info!("Received results for {} / {} phase blocks: {:.4} blocks/sec, {:.4} hets/sec, writer waiting on block {}", results_received, jobs_queued, blocks_per_sec, variants_per_sec, vcf_writer.get_wait_block()); + } + } + } + + // if we are only doing partial files, this will not behave, so 
skip it + if !debug_run { + // we call this once at the end + match vcf_writer.write_to_end_position() { + Ok(()) => {}, + Err(e) => { + error!("Error while finalizing VCF chromosomes: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + } + + // now we drop the VCF writer, this is to close out all the VCF files before indexing + std::mem::drop(vcf_writer); + for vcf_fn in cli_settings.output_vcf_filenames.iter() { + match build_bcf_index(vcf_fn, None, cli_settings.threads as u32, true) { + Ok(()) => { + info!("Finished building index for {:?}.", vcf_fn); + }, + Err(e) => { + error!("Error while building index for {:?}: {}", vcf_fn, e); + std::process::exit(exitcode::IOERR); + } + }; + } + + if let Some(bam_writers) = opt_bam_writers.as_mut() { + // if we are only doing partial files, this will not behave, so skip it + if !debug_run { + for bam_writer in bam_writers.values_mut() { + // first finalize whichever chromosome we were on + match bam_writer.finalize_chromosome() { + Ok(()) => {}, + Err(e) => { + error!("Error while finalizing BAM chromosomes: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + + // copy reads from all the remaining chromosomes + match bam_writer.copy_remaining_chromosomes() { + Ok(()) => {}, + Err(e) => { + error!("Error while copying all remaining chromosomes: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + } + } + + // now we need to drop the bam writer, this is to close out all the BAM files before indexing + std::mem::drop(opt_bam_writers); + + // index the BAM files with .bai files + for bam_fn in cli_settings.output_bam_filenames.iter() { + match rust_htslib::bam::index::build( + bam_fn, + None, + rust_htslib::bam::index::Type::Bai, + cli_settings.threads as u32 + ) { + Ok(()) => { + info!("Finished building index for {:?}.", bam_fn); + }, + Err(e) => { + error!("Error while building index for {:?}: {}", bam_fn, e); + std::process::exit(exitcode::IOERR); + } + }; + } + } + + if let Some(ref filename) = 
cli_settings.blocks_filename { + // this will save all block information to a csv/tsv file + info!("Saving all blocks to {:?}...", filename); + match block_collector.write_blocks(filename) { + Ok(()) => {}, + Err(e) => { + error!("Error while writing blocks file: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + } + + if let Some(ref filename) = cli_settings.summary_filename { + // this will save chromosome level stats to a csv/tsv file + info!("Saving summary block statistics to {:?}...", filename); + match block_collector.write_block_stats( + &sample_names, filename, &arc_reference_genome, + block_iterator.variant_stats() + ) { + Ok(()) => {}, + Err(e) => { + error!("Error while writing summary statistics file: {}", e); + std::process::exit(exitcode::IOERR); + } + } + } + + info!("All phase blocks finished successfully after {} seconds.", start_time.elapsed().as_secs_f64()); +} + +/// Sub-routine to make sure we are always consistently processing results in an identical manner +/// This is mostly because I got tired of forgetting to change things in 3 places +/// # Argument +/// * `phase_result` - the phasing result from our algorithm +/// * `haplotag_result` - the haplotag result, does nothing if we are not haplotagging +/// * `opt_stats_writer` - mutable, optional reference to our algorithm stats writer +/// * `block_collector` - mutable reference to the block stats collector +/// * `opt_haplotag_writer` - mutable, optional reference to our haplotag writer +/// * `vcf_writer` - mutable reference to our VCF writer +/// * `opt_bam_writers` - mutable, optional reference to the BAM writers for haplotagging +fn process_results( + phase_result: PhaseResult, haplotag_result: HaplotagResult, + opt_stats_writer: &mut Option, block_collector: &mut BlockStatsCollector, + opt_haplotag_writer: &mut Option, + vcf_writer: &mut OrderedVcfWriter, opt_bam_writers: &mut Option> +) { + // common debug statements + debug!("block {} haplotypes:", 
phase_result.phase_block.get_block_index()); + debug!("{:?}", phase_result.haplotype_1); + debug!("{:?}", phase_result.haplotype_2); + + // write the stats if we have both a writer and a stats block + if let Some(stats_writer) = opt_stats_writer.as_mut() { + match stats_writer.write_stats(&phase_result) { + Ok(()) => {}, + Err(e) => { + error!("Error while writing statistics file: {}", e); + std::process::exit(exitcode::IOERR); + } + } + }; + + // save all the blocks here + for sub_block in phase_result.sub_phase_blocks.iter() { + block_collector.add_block(sub_block.clone()); + } + block_collector.add_result(&phase_result); + + match vcf_writer.write_phase_block(phase_result) { + Ok(()) => {}, + Err(e) => { + error!("Error while saving phase block: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + + if let Some(haplotag_writer) = opt_haplotag_writer.as_mut() { + match haplotag_writer.write_block(&haplotag_result) { + Ok(()) => {}, + Err(e) => { + error!("Error while writing haplotag file: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + } + + if let Some(bam_writers) = opt_bam_writers.as_mut() { + let sample_name = haplotag_result.phase_block.sample_name().to_string(); + let block_index = haplotag_result.phase_block.get_block_index(); + + // send this block to the correct writer + let bam_writer = bam_writers.get_mut(&sample_name).unwrap(); + match bam_writer.write_phase_block(haplotag_result) { + Ok(()) => {}, + Err(e) => { + error!("Error while saving haplotags: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + + // now send the skip signal to the rest + for (sn, bam_writer) in bam_writers.iter_mut() { + if sn != &sample_name { + match bam_writer.write_dummy_block(block_index) { + Ok(()) => {}, + Err(e) => { + error!("Error while saving haplotags: {}", e); + std::process::exit(exitcode::IOERR); + } + }; + } + } + } +} \ No newline at end of file diff --git a/src/phaser.rs b/src/phaser.rs new file mode 100644 index 0000000..3255f65 --- 
/dev/null +++ b/src/phaser.rs @@ -0,0 +1,796 @@ + +use crate::astar_phaser; +use crate::block_gen::{PhaseBlock, is_phasable_variant, get_variant_type}; +use crate::data_types::read_segments::ReadSegment; +use crate::data_types::reference_genome::ReferenceGenome; +use crate::data_types::variants::{Variant, VariantType}; +use crate::read_parsing; +use crate::writers::phase_stats::{PhaseStats, ReadStats}; + +use bio::data_structures::interval_tree::IntervalTree; +use log::{debug, trace, warn}; +use priority_queue::PriorityQueue; +use rust_htslib::bcf; +use rust_htslib::bcf::record::GenotypeAllele; +use rustc_hash::FxHashMap as HashMap; +use simple_error::{SimpleError, bail}; +use std::cmp::{Ordering, Reverse}; +use std::path::PathBuf; + +/// Core function for loading variant calls from our VCF file and converting them into a `Variant` type. +/// # Arguments +/// * `region` - the phase block we need to load +/// * `vcf_paths` - the VCF files to load, must be zipped and indexed +/// * `reference_genome` - optional, the reference genome +/// * `reference_buffer` - the number of nearby bases to try to use for local realignment +/// * `is_hom_allowed` - if true, then non-reference homozygous variants will also be loaded into the second return Vec +fn load_variant_calls( + region: &PhaseBlock, + vcf_paths: &[PathBuf], + reference_genome: &ReferenceGenome, + reference_buffer: usize, + is_hom_allowed: bool +) -> Result<(Vec, Vec), Box> { + use rust_htslib::bcf::Read; + + // short circuit because otherwise bcf can throw errors + if region.get_num_variants() == 0 { + return Ok((vec![], vec![])); + } + + // initalize the variant queue with one variant from each VCF, any ties in position are broken by VCF input order + let mut variant_queue: PriorityQueue, Reverse)> = PriorityQueue::new(); + let mut vcf_readers: Vec = vcf_paths.iter() + .map(|filename| bcf::IndexedReader::from_path(filename).unwrap()) + .collect(); + let mut vcf_iterators: Vec<_> = vec![]; + let mut 
sample_indices: Vec = vec![]; + let sample_name: &str = region.sample_name(); + + // fetch the region for each VCF (if it exists) + for (vcf_index, vcf_reader) in vcf_readers.iter_mut().enumerate() { + // fetch the corresponding chrom index for this VCF file (they are not guaranteed to match) + let vcf_header: &bcf::header::HeaderView = vcf_reader.header(); + let chrom_index: u32 = vcf_header.name2rid(region.get_chrom().as_bytes())?; + + // first make sure we find the sample in this file + let mut lookup_index: Option = None; + for (sample_index, &vcf_sample) in vcf_header.samples().iter().enumerate() { + let vcf_sample_string: String = std::str::from_utf8(vcf_sample).unwrap().to_string(); + if vcf_sample_string == sample_name { + lookup_index = Some(sample_index); + break; + } + } + match lookup_index { + Some(index) => { + sample_indices.push(index); + }, + None => { + bail!("Sample name {:?} was not found in VCF: {:?}", sample_name, vcf_paths[vcf_index]); + } + }; + + // fetch our position in the VCF file + match vcf_reader.fetch(chrom_index, region.get_start(), Some(region.get_end())) { + Ok(()) => { + // we have entries, so get the first one and queue it + let mut vcf_iter = vcf_reader.records().peekable(); + let first_entry = vcf_iter.peek(); + if let Some(record_result) = first_entry { + let record: &rust_htslib::bcf::Record = match record_result { + Ok(r) => r, + // we have to convert to an owned error here, and the htslib errors are not cloneable + Err(e) => return Err(Box::new(SimpleError::from(e))) + }; + let position: i64 = record.pos(); + variant_queue.push(vcf_index, (Reverse(position), Reverse(vcf_index))); + }; + + // even if the iterator is empty, we push it so things are lined up correctly + vcf_iterators.push(vcf_iter); + }, + Err(_) => { + // this usually happens when there are no entries for the chromosome + vcf_iterators.push(vcf_reader.records().peekable()); + } + }; + } + + // parse all the records and convert them into our format + let mut 
variants: Vec = Vec::::with_capacity(region.get_num_variants()); + let mut hom_variants: Vec = vec![]; + let mut previous_het_end: usize = 0; + + while !variant_queue.is_empty() { + // get the source of the next variant to process and the sample index in that VCF file + let (pop_index, pop_priority) = variant_queue.pop().unwrap(); + let sample_index: usize = sample_indices[pop_index]; + + // process this variant + let record_result = vcf_iterators[pop_index].next().unwrap(); + let record = record_result?; + + let position: i64 = record.pos(); + assert_eq!(position, pop_priority.0.0); // sanity check that the variant matches our position priority + if position < region.get_start() as i64 { + // this can happen when you have very very long indels that span one of our breaks + // we have already written though, so don't write it again + } else { + let include_variant = is_phasable_variant(&record, sample_index, region.get_min_quality(), is_hom_allowed)?; + if include_variant { + let variant_type = get_variant_type(&record)?; + + // TODO: ideally, this would be consolidated with our Zygosity code block, but we need index_alleles further on + // possible solution is to make Zygosity types have u8 values tied to them + // low priority: not a major slowdown at this time + // get the genotypes + let all_genotypes = record.genotypes()?; + let genotype = all_genotypes.get(sample_index); + assert!(genotype.len() <= 2); + + // we don't really expect more than 255 alleles, but make sure we panic if that *does* happen + let mut index_allele0: u8 = match genotype[0] { + GenotypeAllele::Unphased(at) => at.try_into().unwrap(), + GenotypeAllele::Phased(at) => at.try_into().unwrap(), + //TODO: ignore these for now, not sure how to handle it? 
+ GenotypeAllele::UnphasedMissing => panic!("Should not happen"), + GenotypeAllele::PhasedMissing => panic!("Should not happen") + }; + + let mut index_allele1: u8 = if genotype.len() == 1 { + // TRGT can generate single-haplotype results, in this instance just copy index_allele0 and pretend it was homozygous + index_allele0 + } else { + match genotype[1] { + GenotypeAllele::Unphased(at) => at.try_into().unwrap(), + GenotypeAllele::Phased(at) => at.try_into().unwrap(), + //TODO: ignore these for now, not sure how to handle it? + GenotypeAllele::UnphasedMissing => panic!("Should not happen"), + GenotypeAllele::PhasedMissing => panic!("Should not happen") + } + }; + + // in merged VCF files, they are not always ordered, check for that here and swap if they are out of order + // technically, this can happen in any VCF I supposed + // are there any concerns to this swap? we already do many output swaps with the phasing so seems like no + if index_allele0 > index_allele1 { + std::mem::swap(&mut index_allele0, &mut index_allele1); + } + + // special case for homozygous + let is_homozygous = index_allele0 == index_allele1; + assert!(!is_homozygous || is_hom_allowed); + if is_homozygous { + // this forces our homozygous variants to load as if they were heterozygous (e.g. with a reference allele) + // this is fine because we are not phasing them, just wanting to use the sequence + // TODO: can we remove this hackery? 
I could see it being an issue later + index_allele0 = 0; + } + + let all_alleles = record.alleles(); + let ref_len: usize = all_alleles[0].len(); + let allele0: Vec = all_alleles[index_allele0 as usize].to_vec(); + let allele1: Vec = all_alleles[index_allele1 as usize].to_vec(); + + let mut new_variant = match variant_type { + VariantType::Snv => { + Variant::new_snv( + pop_index, position, allele0, allele1, index_allele0, index_allele1 + ) + }, + VariantType::Deletion => { + Variant::new_deletion( + pop_index, position, ref_len, + allele0, allele1, + index_allele0, index_allele1 + ) + }, + VariantType::Insertion => { + Variant::new_insertion( + pop_index, position, allele0, allele1, index_allele0, index_allele1 + ) + }, + VariantType::Indel => { + Variant::new_indel( + pop_index, position, ref_len, + allele0, allele1, + index_allele0, index_allele1 + ) + }, + VariantType::SvDeletion => { + Variant::new_sv_deletion( + pop_index, position, ref_len, + allele0, allele1, + index_allele0, index_allele1 + ) + }, + VariantType::SvInsertion => { + Variant::new_sv_insertion( + pop_index, position, allele0, allele1, index_allele0, index_allele1 + ) + }, + VariantType::TandemRepeat => { + Variant::new_tandem_repeat( + pop_index, position, ref_len, + allele0, allele1, + index_allele0, index_allele1 + ) + } + VariantType::SvDuplication | + VariantType::SvInversion | + VariantType::SvBreakend | + VariantType::Unknown => { + // panic here because we shouldn't allow these types unless we implement the variant + panic!("no impl for {variant_type:?}"); + } + }; + + if reference_buffer > 0 && !is_homozygous { + // we have a reference genome and a desire buffer, extend our alleles + let mut ref_prefix_start: usize = if (position as usize) > reference_buffer { + position as usize - reference_buffer + } else { + 0 + }; + let ref_postfix_start: usize = position as usize + ref_len; + + // we used to have an assertion here with the plan to remove it eventually, but it turns out + // that 
users do crazy things, so we need to convert it to a full Error + let ref_sequence = reference_genome.get_slice(region.get_chrom(), position as usize, ref_postfix_start); + if all_alleles[0] != ref_sequence { + bail!( + "Reference mismatch error: variant at {}:{} has REF allele = \"{}\", but reference genome has \"{}\".", + region.get_chrom(), position+1, + // we don't want to panic in the middle of this, so use a safe unwrapper with default + std::str::from_utf8(all_alleles[0]).unwrap_or("utf8 decode error"), + std::str::from_utf8(ref_sequence).unwrap_or("utf8 decode error") + ); + } + + // check if this variant is too close to the previous variant + if ref_prefix_start < previous_het_end { + // for the previous variant, we need to truncate it, possibly all the way down + if let Some(v) = variants.last_mut() { + let current_end: usize = v.position() as usize + v.get_ref_len() + v.get_postfix_len(); + let truncate_length: usize = (current_end - position as usize).min(v.get_postfix_len()); + v.truncate_reference_postfix(truncate_length); + } else { + panic!("This should not happen with our checks."); + } + + // for this variant, set the new end to either the previous het end OR the position, whichever is lower + // previous het end can be lower if you have overlapping indels, not much we can do to fix that without global realignment + ref_prefix_start = previous_het_end.min(position as usize); + } + + // add the prefix + let prefix: &[u8] = reference_genome.get_slice(region.get_chrom(), ref_prefix_start, position as usize); + new_variant.add_reference_prefix(prefix); + + // add the postfix + let postfix: &[u8] = reference_genome.get_slice(region.get_chrom(), ref_postfix_start, ref_postfix_start + reference_buffer); + new_variant.add_reference_postfix(postfix); + + // update the previous position to match the position + ref length + previous_het_end = position as usize + ref_len; + } + + if is_homozygous { + hom_variants.push(new_variant); + } else { + 
variants.push(new_variant); + } + } + } + + // requeue from the one we popped from + let next_entry = vcf_iterators[pop_index].peek(); + if let Some(record_result) = next_entry { + let record: &rust_htslib::bcf::Record = match record_result { + Ok(r) => r, + // we have to convert to an owned error here, and the htslib errors are not cloneable + Err(e) => return Err(Box::new(SimpleError::from(e))) + }; + let position: i64 = record.pos(); + variant_queue.push(pop_index, (Reverse(position), Reverse(pop_index))); + }; + } + + // sanity check that we found the same number of things + assert_eq!(variants.len(), region.get_num_variants()); + Ok((variants, hom_variants)) +} + +/// A result for a phasing algorithm, assumes diploid solution currently. +pub struct PhaseResult { + /// The phase block defining the problem space. + pub phase_block: PhaseBlock, + /// The variants contained in the phase block. + pub variants: Vec, + /// The first haplotype in the solution. + pub haplotype_1: Vec, + /// The second haplotype in the solution. + pub haplotype_2: Vec, + /// Store the phase block ID of the variant + pub block_ids: Vec, + /// Stores all non-empty sub-blocks + pub sub_phase_blocks: Vec, + /// Optional read statistics + pub read_statistics: Option, + /// Optional statistics from the problem + pub statistics: Option +} + +/// Calculates the span count for each juncture in the solution, ignoring homozygous variants and unassigned alleles. 
+/// # Arguments +/// * `read_segments` - the read segments used to solve the problem +/// * `haplotype_1` - the first haplotype in the solution +/// * `haplotype_2` - the second haplotype in the solution +fn get_solution_span_counts( + read_segments: &IntervalTree, + haplotype_1: &[u8], + haplotype_2: &[u8] +) -> Vec { + // this will store the total spanning reads ignoring any homozygous or unsolved variants + assert_eq!(haplotype_1.len(), haplotype_2.len()); + + // there is one less connection than the total number of variants + let mut total_span_counts: Vec = vec![0; haplotype_1.len() - 1]; + + // iterate over all the read segments + for rs_interval in read_segments.find(0..usize::MAX) { + let rs = rs_interval.data(); + + // the range returns [first_allele, last_allele+1), we need to basically remove the +1 here since we're talking junctures + let mut juncture_range = rs.get_range(); + juncture_range.end -= 1; + + // if any of the head variants were converted to homozygous, do not include because they don't provide spanning evidence anymore + while juncture_range.start < juncture_range.end && + haplotype_1[juncture_range.start] == haplotype_2[juncture_range.start] { + juncture_range.start += 1; + } + + // if any of the tail variants were converted to homozygous, do not include because they don't provide spanning evidence anymore + while juncture_range.start < juncture_range.end && + haplotype_1[juncture_range.end] == haplotype_2[juncture_range.end] { + juncture_range.end -= 1; + } + + // the range has been truncated upstream so we're only looking at junctures + for tpc in total_span_counts[juncture_range].iter_mut() { + *tpc += 1; + } + } + + total_span_counts +} + +/// Core structure of phasing that can be run on a single processor to solve a phase block. +/// This method is designed to perform data loading and then run an algorithm from another module to solve the block. +/// See `astar_phaser::astar_solver(...)` for an example solver implementation. 
+/// # Arguments +/// * `phase_problem` - the problem definition, primarily defines coordinates of the phase block we want to solve +/// * `vcf_paths` - the VCF files to load variants from, must be zipped and indexed +/// * `sample_name` - the sample name inside the VCF files +/// * `bam_paths` - the BAM files to load read observations from, must be indexed +/// * `reference_genome` - optional, the reference genome +/// * `reference_buffer` - the number of nearby bases to try to use for local realignment +/// * `min_matched_alleles` - the minimum number of matched alleles required to include a read +/// * `min_mapq` - the minimum MAPQ to include a read +/// * `global_realign_cputime` - the maximum allowed global realignment CPU time; if 0, then only local realignment is used +/// * `min_queue_size` - the minimum length of the queue +/// * `queue_increment` - the length that the queue grows as more variants are added to the solution +/// * `wfa_prune_distance` - maximum allowed distance a wavefront can lag; make smaller to reduce run-time at the cost of accuracy +#[allow(clippy::too_many_arguments)] +pub fn solve_block( + phase_problem: &PhaseBlock, vcf_paths: &[PathBuf], bam_paths: &[PathBuf], + reference_genome: &ReferenceGenome, reference_buffer: usize, + min_matched_alleles: usize, min_mapq: u8, global_realign_cputime: f32, + min_queue_size: usize, queue_increment: usize, wfa_prune_distance: usize +) -> Result<(PhaseResult, HaplotagResult), Box> { + debug!("Solving problem: {:?}", phase_problem); + + // short circuit for "empty" problems + if phase_problem.get_num_variants() == 0 { + // this should only happen for chromosomes with no het alleles + assert!(phase_problem.get_start() == 0); + assert!(phase_problem.get_end() == 0); + let empty_result = PhaseResult { + phase_block: phase_problem.clone(), + variants: vec![], + haplotype_1: vec![], + haplotype_2: vec![], + block_ids: vec![], + sub_phase_blocks: vec![], + read_statistics: None, + statistics: None + }; + 
let empty_haplotag_result: HaplotagResult = HaplotagResult { + phase_block: phase_problem.clone(), + reads: Default::default() + }; + return Ok((empty_result, empty_haplotag_result)); + } + + // homs are only used if global realignment is being attempted + let load_homs: bool = global_realign_cputime > 0.0; + + // lets extract the variants we care about from the vcf + let (mut variant_calls, mut hom_calls): (Vec, Vec) = load_variant_calls( + phase_problem, + vcf_paths, + reference_genome, reference_buffer, + load_homs + )?; + assert_eq!(variant_calls.len(), phase_problem.get_num_variants()); + + // go through all the loaded variants, including homs, and pull out the TandemRepeat coordinates that have been loaded + let mut tr_segments: IntervalTree = Default::default(); + for variant in variant_calls.iter().chain(hom_calls.iter()) { + if variant.get_type() == VariantType::TandemRepeat { + let start: i64 = variant.position(); + let ref_len: usize = variant.get_ref_len(); + let end: i64 = start + ref_len as i64; + // sometimes TRGT inserts the base before which will match an insertion and sometimes it won't - confirmed with Egor + // we can't really tell which is which though, so lets just substract 1 from the start and assume that's the best for now + // TODO: if TRGT adjusts to always have an anchor, we will need to drop the "-1" operation + tr_segments.insert((start-1)..end, 0); + } + } + + // now mark all variants contained by the STRs as ignored + for variant in variant_calls.iter_mut() { + if variant.get_type() != VariantType::TandemRepeat { + let start: i64 = variant.position(); + let ref_len: usize = variant.get_ref_len(); + let end: i64 = start + ref_len as i64; + let var_interval = start..end; + let mut is_contained: bool = false; + for segment in tr_segments.find(var_interval) { + let seg_start = segment.interval().start; + let seg_end = segment.interval().end; + if seg_start <= start && seg_end >= end { + // this segment fully contains the variant + 
is_contained = true; + break; + } + } + + if is_contained { + // we need to mark this one as unphaseable + variant.set_ignored(); + debug!("Set ignored het: {:?}", variant); + } + } + } + + // mark any homs we are going to ignore as well + for variant in hom_calls.iter_mut() { + if variant.get_type() != VariantType::TandemRepeat { + let start: i64 = variant.position(); + let ref_len: usize = variant.get_ref_len(); + let end: i64 = start + ref_len as i64; + let var_interval = start..end; + let mut is_contained: bool = false; + for segment in tr_segments.find(var_interval) { + let seg_start = segment.interval().start; + let seg_end = segment.interval().end; + if seg_start <= start && seg_end >= end { + // this segment fully contains the variant + is_contained = true; + break; + } + } + + if is_contained { + // we need to mark this one as unphaseable + variant.set_ignored(); + debug!("Set ignored hom: {:?}", variant); + } + } + } + + // reads that meet our criteria for use in phasing + let read_segments: IntervalTree; + // reads that meet our criteria EXCEPT for the minimum number of alleles is just > 0, so they can potentially be phased after we solve + let phasable_segments: IntervalTree; + let read_stats: ReadStats; + + if global_realign_cputime == 0.0 { + // we are doing local re-alignments only + (read_segments, phasable_segments, read_stats) = read_parsing::load_read_segments( + phase_problem, bam_paths, reference_genome.filename(), + &variant_calls, min_matched_alleles, min_mapq + )?; + } else { + // we are attempting global re-alignments + (read_segments, phasable_segments, read_stats) = match read_parsing::load_full_read_segments( + phase_problem, bam_paths, &variant_calls, &hom_calls, + reference_genome, min_matched_alleles, min_mapq, + global_realign_cputime, wfa_prune_distance + ) { + Ok(rs) => rs, + Err(e) => { + if e.to_string() == "max_runtime reached" { + // fall back to the local realignment approach if we run too long + warn!( + "B#{} ({}:{}-{}) 
detected excessive runtime in read parsing, reverting to local re-alignment.", + phase_problem.get_block_index(), phase_problem.get_chrom(), phase_problem.get_start(), phase_problem.get_end() + ); + read_parsing::load_read_segments( + phase_problem, bam_paths, reference_genome.filename(), + &variant_calls, min_matched_alleles, min_mapq + )? + } else { + return Err(e); + } + } + }; + } + + // read segment debugging + for (i, seg) in read_segments.find(0..usize::MAX).enumerate() { + trace!("read segment #{} => {:?}", i, seg); + } + + // okay final phase is to solve some algorithm given those read segments + let astar_result: astar_phaser::AstarResult = astar_phaser::astar_solver( + phase_problem, &variant_calls[..], &read_segments, min_queue_size, queue_increment + ); + + // get the total spanning counts after accounting for homozygous variants / missing alleles + let total_span_counts: Vec = get_solution_span_counts(&read_segments, &astar_result.haplotype_1, &astar_result.haplotype_2); + debug!("total_span_counts: {:?}", total_span_counts); + + // if we end up having no reads spanning a juncture, it's time to split the block + let block_split: Vec = total_span_counts.iter() + .map(|&tc| tc == 0) + .collect::>(); + + debug!("Block split: {:?}", block_split); + + // now, figure out what the haplotag is for each variant + let mut block_tags: Vec = vec![0; variant_calls.len()]; + let mut current_tag: usize = variant_calls[0].position() as usize; + for (i, variant) in variant_calls.iter().enumerate() { + if i > 0 && block_split[i-1] { + // this is a new block + current_tag = variant.position() as usize; + } + block_tags[i] = current_tag; + } + debug!("Block tags: {:?}", block_tags); + + // generate all of our non-empty sub-blocks now + let mut sub_phase_blocks: Vec = vec![]; + let mut current_block: PhaseBlock = PhaseBlock::new( + phase_problem.get_block_index(), + phase_problem.get_chrom().to_string(), + phase_problem.get_chrom_index(), + 
phase_problem.get_min_quality(), + phase_problem.sample_name().to_string() + ); + let mut current_tag = block_tags[0]; + for (i, variant) in variant_calls.iter().enumerate() { + let h1 = astar_result.haplotype_1[i]; + let h2 = astar_result.haplotype_2[i]; + if h1 < 2 && h2 < 2 && h1 != h2 { + // this is a heterozygous variant in our result + if current_tag != block_tags[i] { + if current_block.get_num_variants() > 0 { + // it's part of a new block though, so we need to push the old one + sub_phase_blocks.push(current_block); + current_block = PhaseBlock::new( + phase_problem.get_block_index(), + phase_problem.get_chrom().to_string(), + phase_problem.get_chrom_index(), + phase_problem.get_min_quality(), + phase_problem.sample_name().to_string() + ); + } + + // make sure we update to the new tag also + current_tag = block_tags[i]; + } + + // add the variant to the current block + current_block.add_locus_variant(phase_problem.get_chrom(), variant.position() as u64, variant.get_vcf_index()); + } + } + + // check if we have a block left to push + if current_block.get_num_variants() > 0 { + sub_phase_blocks.push(current_block); + } + debug!("sub_phase_blocks: {:?}", sub_phase_blocks); + + // last step is to haplotag the reads we loaded + let mut haplotagged_reads: HashMap = haplotag_reads( + read_segments, &astar_result.haplotype_1, &astar_result.haplotype_2, + &block_tags + ); + + // also haplotype the extra reads and add them to our result + let phasable_haplotagged_reads: HashMap = haplotag_reads( + phasable_segments, &astar_result.haplotype_1, &astar_result.haplotype_2, + &block_tags + ); + + // we could just extend here, but we want to sanity check our keys don't overlap at all + for (k, v) in phasable_haplotagged_reads.into_iter() { + // we should never have the same read name in both hashmaps + assert!(!haplotagged_reads.contains_key(&k)); + haplotagged_reads.insert(k, v); + } + + let haplotag_result: HaplotagResult = HaplotagResult { + phase_block: 
phase_problem.clone(), + reads: haplotagged_reads + }; + + // save all our results here + let phase_result: PhaseResult = PhaseResult { + phase_block: phase_problem.clone(), + variants: variant_calls, + haplotype_1: astar_result.haplotype_1, + haplotype_2: astar_result.haplotype_2, + block_ids: block_tags, + sub_phase_blocks, + read_statistics: Some(read_stats), + statistics: Some(astar_result.statistics) + }; + Ok((phase_result, haplotag_result)) +} + +/// This function generates a singleton "solution". +/// It is boilerplate for an unsolved block because that block only has one variant, and we don't care to phase it. +/// # Arguments +/// * `phase_problem` - the problem definition, primarily defines coordinates of the phase block we want to solve +pub fn singleton_block(phase_problem: &PhaseBlock) -> (PhaseResult, HaplotagResult) { + debug!("Generating empty result for singleton: {phase_problem:?}"); + + // in downstream writing, the only thing that matters is the vcf_index, so make sure we set that correctly (knowing this is a singleton) + // everything else can be garbage + let dummy_variant = Variant::new_snv( + phase_problem.get_first_variant_vcf(), + phase_problem.get_start() as i64, + vec![0], + vec![1], + 0, + 1 + ); + let num_variants = phase_problem.get_num_variants(); + let variant_calls: Vec = if num_variants == 0 { vec![] } else { vec![dummy_variant] }; + assert_eq!(variant_calls.len(), num_variants); + + // now we can make our dummy results + let phase_result: PhaseResult = PhaseResult { + phase_block: phase_problem.clone(), + variants: variant_calls, + haplotype_1: vec![2; num_variants], + haplotype_2: vec![2; num_variants], + block_ids: vec![phase_problem.get_start() as usize; num_variants], + sub_phase_blocks: vec![], // empty because this is not getting treated as a block + read_statistics: None, + statistics: None + }; + let haplotag_result: HaplotagResult = HaplotagResult { + phase_block: phase_problem.clone(), + reads: Default::default() // no 
haplotagging in this mode, so give back an empty map + }; + (phase_result, haplotag_result) +} + +/// Stores all information for a haplotag result +#[derive(Debug)] +pub struct HaplotagResult { + /// The phase block defining the problem space. + pub phase_block: PhaseBlock, + /// Indexes reads by name and returns a tuple (phase block ID, haplotag) + pub reads: HashMap +} + +/// Returns the tagging results for the reads in a HashMap. +/// Values are tuples (phase block ID (0-based), haplotag value (0 or 1)) +/// # Arguments +/// * `read_segments` - the reads to tag +/// * `haplotype_1` - the first haplotype +/// * `haplotype_2` - the second haplotype +/// * `block_tags` - tags for the blocks based on the variant IDs +/// # Panics +/// * if the haplotypes are not the same length as each other, a read segment, and/or the variant calls +/// * the block breaks is not 1 less length than the variant calls +pub fn haplotag_reads( + read_segments: IntervalTree, + haplotype_1: &[u8], haplotype_2: &[u8], block_tags: &[usize] +) -> HashMap { + // now do the tagging + let mut haplotagged_reads: HashMap = Default::default(); + for rs_interval in read_segments.find(0..usize::MAX) { + // first, see if we can resolve it to a haplotype + let rs: &ReadSegment = rs_interval.data(); + let a1_score: u64 = rs.score_haplotype(haplotype_1); + let a2_score: u64 = rs.score_haplotype(haplotype_2); + let haplotag: usize = match a1_score.cmp(&a2_score) { + Ordering::Less => 0, + Ordering::Greater => 1, + Ordering::Equal => 2 + }; + + if haplotag != 2 { + // we can resolve to a haplotype, now get the phase block index + // find the first resolved variant in our read segment + let mut first_variant: usize = rs.first_allele(); + + // while the haplotypes are equal there OR the variant is not resolved (which can happen sometimes) + while haplotype_1[first_variant] == haplotype_2[first_variant] || rs.alleles()[first_variant] >= 2 { + first_variant += 1; + } + let phase_block: usize = 
block_tags[first_variant]; + + // finally, just get the read name and make sure we haven't somehow already marked this one + let read_name: String = rs.read_name().to_string(); + assert!(!haplotagged_reads.contains_key(&read_name)); + haplotagged_reads.insert(read_name, (phase_block, haplotag)); + } + } + + haplotagged_reads +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_solution_span_counts() { + let haplotype_1 = vec![0, 1, 1, 0, 0, 0]; + let haplotype_2 = vec![1, 1, 1, 1, 0, 1]; + let test_reads = vec![ + ReadSegment::new("r1".to_string(), vec![0, 0, 0, 0, 0, 0], vec![1, 1, 1, 1, 1, 1]), // adds 1 to everything + ReadSegment::new("r2".to_string(), vec![2, 2, 2, 1, 1, 2], vec![0, 0, 0, 1, 1, 0]), // one allele is a hom, so this does nothing + ReadSegment::new("r3".to_string(), vec![1, 1, 1, 1, 2, 2], vec![1, 1, 1, 1, 0, 0]), // adds 1 to first 3 + ReadSegment::new("r4".to_string(), vec![2, 1, 1, 1, 1, 1], vec![0, 1, 1, 1, 1, 1]), // adds 1 to last 2 + ]; + let mut read_segments: IntervalTree = Default::default(); + for rs in test_reads.into_iter() { + let rs_range = rs.get_range(); + read_segments.insert(rs_range, rs); + } + let expected_result: Vec = vec![2, 2, 2, 2, 2]; + + let result = get_solution_span_counts(&read_segments, &haplotype_1, &haplotype_2); + assert_eq!(expected_result, result); + } + + #[test] + fn test_haplotag_reads() { + let haplotype_1 = vec![0, 0, 0, 0, 0, 0]; + let haplotype_2 = vec![1, 1, 1, 1, 1, 1]; + let block_tags = vec![0, 0, 0, 3, 3, 5]; + let test_reads = vec![ + ReadSegment::new("r1".to_string(), vec![0, 0, 0, 0, 0, 0], vec![1, 1, 1, 1, 1, 1]), + ReadSegment::new("r2".to_string(), vec![2, 2, 2, 1, 1, 2], vec![0, 0, 0, 1, 1, 0]), + ReadSegment::new("r3".to_string(), vec![2, 2, 2, 1, 0, 2], vec![0, 0, 0, 1, 1, 0]), + ReadSegment::new("r4".to_string(), vec![2, 2, 2, 1, 0, 1], vec![0, 0, 0, 1, 1, 1]), + ReadSegment::new("r5".to_string(), vec![2, 2, 2, 1, 0, 2], vec![0, 0, 0, 2, 1, 0]), + ]; + + let mut 
read_segments: IntervalTree = Default::default(); + for rs in test_reads.into_iter() { + let rs_range = rs.get_range(); + read_segments.insert(rs_range, rs); + } + + let haplotag_result = haplotag_reads(read_segments, &haplotype_1, &haplotype_2, &block_tags); + + // simple-ish cases + assert_eq!(haplotag_result.get("r1").unwrap(), &(0, 0)); // exact match to haplotype 1 + assert_eq!(haplotag_result.get("r2").unwrap(), &(3, 1)); // exact, but incomplete, match to haplotype 2 + assert!(!haplotag_result.contains_key("r3")); // equal to both, so unassigned + assert_eq!(haplotag_result.get("r4").unwrap(), &(3, 1)); // spans two blocks and is inexact, but closer to hap 1 starting with block 3 + assert_eq!(haplotag_result.get("r2").unwrap(), &(3, 1)); // equal by alleles, but qual on allele 1 is higher + } +} \ No newline at end of file diff --git a/src/read_parsing.rs b/src/read_parsing.rs new file mode 100644 index 0000000..de3186b --- /dev/null +++ b/src/read_parsing.rs @@ -0,0 +1,745 @@ + +use crate::block_gen::{PhaseBlock, filter_out_alignment_record}; +use crate::data_types::read_segments::ReadSegment; +use crate::data_types::reference_genome::ReferenceGenome; +use crate::data_types::variants::{Variant, VariantType}; +use crate::wfa_graph::{NodeAlleleMap, WFAGraph, WFAResult}; +use crate::writers::phase_stats::ReadStats; + +use bio::data_structures::interval_tree::IntervalTree; +use log::{debug, trace, warn}; +use rust_htslib::bam; +use rustc_hash::FxHashMap as HashMap; +use simple_error::bail; +use std::path::{Path, PathBuf}; + +/// Loads up all the reads in a particular phase region and converts them into their variant representation. +/// This version uses local re-alignment to parse the alleles. +/// Returns an interval tree containing all reads to use for phasing, a second tree containing extra reads that *can* be phased but didn't match our criteria, +/// and statistics from loading the reads. 
+/// # Arguments +/// * `phase_problem` - the phase block we are loading data for +/// * `bam_paths` - the BAM files to parse, must be indexed +/// * `reference_filename` - the reference fasta file +/// * `variant_calls` - the variants used to convert full reads into haplotype observations (`ReadSegment`) +/// * `min_matched_alleles` - the minimum number of identified alleles required for a read to be included +/// * `min_mapq` - the minimum MAPQ to consider a read +#[allow(clippy::type_complexity)] +pub fn load_read_segments( + phase_problem: &PhaseBlock, bam_paths: &[PathBuf], reference_filename: &Path, + variant_calls: &[Variant], min_matched_alleles: usize, min_mapq: u8 +) -> Result<(IntervalTree, IntervalTree, ReadStats), Box> { + use rust_htslib::bam::Read; + use rust_htslib::bam::ext::BamRecordExtensions; + + let num_variants: usize = variant_calls.len(); + let mut read_groups: HashMap> = Default::default(); + + // stats we track + let mut num_reads: u64 = 0; + let mut skipped_reads: u64 = 0; + let mut num_alleles: u64 = 0; + let mut exact_matches: [u64; VariantType::Unknown as usize + 1] = [0; VariantType::Unknown as usize + 1]; + let mut inexact_matches: [u64; VariantType::Unknown as usize + 1] = [0; VariantType::Unknown as usize + 1]; + let mut failed_matches: [u64; VariantType::Unknown as usize + 1] = [0; VariantType::Unknown as usize + 1]; + let mut allele0_matches: [u64; VariantType::Unknown as usize + 1] = [0; VariantType::Unknown as usize + 1]; + let mut allele1_matches: [u64; VariantType::Unknown as usize + 1] = [0; VariantType::Unknown as usize + 1]; + + for bam_filename in bam_paths.iter() { + let mut bam_reader = bam::IndexedReader::from_path(bam_filename)?; + bam_reader.set_reference(reference_filename)?; + bam_reader.fetch((phase_problem.get_chrom(), phase_problem.get_start(), phase_problem.get_end()+1))?; + + for read_entry in bam_reader.records() { + let mut read = read_entry?; + + //make sure we care about the alignment + if 
filter_out_alignment_record(&read, min_mapq) { + continue; + } + + //build out the cigar info + read.cache_cigar(); + + //build a lookup from reference coordinate -> sequence coordinate + let mut coordinate_lookup: HashMap = Default::default(); + let min_position = read.pos(); + let mut max_position = read.pos(); + for bp in read.aligned_pairs() { + let segment_index = bp[0]; + let ref_index = bp[1]; + coordinate_lookup.insert(ref_index, segment_index); + max_position = max_position.max(ref_index); + } + assert!(max_position >= min_position); + + // max_position is the last one that we found, so add +1 to include in the range + let aligned_range = min_position..(max_position+1); + + //.seq() returns Seq<'_> type, but we should just full decode + let read_sequence: Vec = read.seq().as_bytes(); + let read_qualities: &[u8] = read.qual(); + assert_eq!(read_sequence.len(), read_qualities.len()); + + //we will populate these with the variant level info + let mut alleles: Vec = Vec::::with_capacity(num_variants); + let mut quals: Vec = Vec::::with_capacity(num_variants); + let mut num_overlaps: usize = 0; + let mut last_deletion_end: usize = 0; + + for variant in variant_calls.iter() { + /* + - We need to split on small variants and SVs + - for small variants, do what we normally do; it may be worth seeing if the method we create for SVs will help with this other mode fails though + - for SVs, check if the read FULLY spans the locus; if so, check how much sequence is inserted/deleted in the region and turn that into an allele + - TODO: for SVs, what if it doesn't, can we use clipping somehow? 
+ */ + + trace!("{:?}", variant); + let variant_pos: i64 = variant.position(); + let variant_type: VariantType = variant.get_type(); + let vt_index = variant_type as usize; + + // regardless of variant type, we MUST populate these in the following branching logic + let mut allele: u8; + let qual: u8; + let exact_allele: bool; + let overlaps_allele: bool; + + if variant.is_ignored() { + // this variant is one marked to ignored, lets set it to undefined as opposed to ambiguous + trace!("\tMarking as undefined allele because it is flagged to be ignored"); + allele = 3; + qual = 0; + exact_allele = false; + overlaps_allele = false; + } else if variant_pos < last_deletion_end as i64 { + // check if this is within a region we have decided is a deleted + trace!("\tMarking as unknown allele because it overlaps detected SV deletion"); + // if the 0-allele is reference, mark as 0, else mark as ambiguous because it's multi-allelic call + allele = 2; + qual = 0; + exact_allele = false; + overlaps_allele = true; + } else { + match variant_type { + VariantType::Snv | + VariantType::Insertion | + VariantType::Deletion | + VariantType::Indel | + VariantType::SvInsertion | + VariantType::TandemRepeat => { + // we need these to build coordinate ranges + let ref_allele_len: usize = variant.get_ref_len(); + let prefix_len: usize = variant.get_prefix_len(); + let postfix_len: usize = variant.get_postfix_len(); + + // coordinate ranges we care about + let first_start_coordinate: usize = variant_pos as usize - prefix_len; + let last_start_coordinate: usize = variant_pos as usize + 1; // add one because we want to include variant_pos + let first_end_coordinate: usize = variant_pos as usize + ref_allele_len; + let last_end_coordinate: usize = variant_pos as usize + ref_allele_len + postfix_len + 1; // add one for same reason as above + + // first, try to find the closest start + let mut opt_closest_start: Option = None; + for sc in (first_start_coordinate..last_start_coordinate).rev() { + 
if let Some(&si) = coordinate_lookup.get(&(sc as i64)) { + opt_closest_start = Some(si as usize); + break; + } + } + + // now the closest end + let mut opt_closest_end: Option = None; + for ec in first_end_coordinate..last_end_coordinate { + if let Some(&ei) = coordinate_lookup.get(&(ec as i64)) { + opt_closest_end = Some(ei as usize); + break; + } + } + + // now find the best start coordinate with constraints + let mut start_coordinate: Option = None; + let mut start_clip: usize = 0; + let mut end_coordinate: Option = None; + let mut end_clip: usize = 0; + + if let (Some(closest_start), Some(closest_end)) = (opt_closest_start, opt_closest_end) { + for sc in first_start_coordinate..last_start_coordinate { + // always increment this + start_clip += 1; + + if let Some(&segment_index) = coordinate_lookup.get(&(sc as i64)) { + // check if it's too far away + if closest_start - segment_index as usize > 2*prefix_len { + continue; + } + + // we found a start coordinate + start_coordinate = Some(segment_index as usize); + + // now try to find an end coordinate also + for ec in (first_end_coordinate..last_end_coordinate).rev() { + // always increment this + end_clip += 1; + + if let Some(&next_index) = coordinate_lookup.get(&(ec as i64)) { + // check if it's too far away + if next_index as usize - closest_end > 2*postfix_len { + continue; + } + + // we found an end coordinate also + end_coordinate = Some(next_index as usize); + break; + } + } + break; + } + } + } else { + // the closest ones failed, we won't succeed here either + } + + if let Some(ss) = start_coordinate { + if let Some(se) = end_coordinate { + trace!("\t{}..{} = {:?} {:?}; next = {} {}", ss, se, &read_sequence[ss..se], &read_qualities[ss..se], read_sequence[se], read_qualities[se]); + + let edit_distance: usize; + allele = variant.match_allele(&read_sequence[ss..se]); + if allele == 2 { + // no exact match, do inexact matching + (allele, edit_distance, _) = 
variant.closest_allele_clip(&read_sequence[ss..se], start_clip - 1, end_clip - 1); + exact_allele = false; + } else { + edit_distance = 0; + exact_allele = true; + } + + // this approach uses harmonic mean of base quality + // * no ED penalty + // * weighted - the same weight factors are applied, but in a down-weighting approach (this is because we don't want to exceed u8::MAX) + let divisor_multiplier: i64 = match variant_type { + // these are down-weights for local mode + // SNV has highest confidence here + VariantType::Snv => 1, + + // indels tend to be pretty bad + VariantType::Deletion | + VariantType::Insertion | + VariantType::Indel => 4, + + // SVs are generally worst of all in local mode + VariantType::SvDeletion | + VariantType::SvInsertion => 4, + + // we want tandem repeats to have a higher weight than our generic indels + VariantType::TandemRepeat => 2, + + _ => { + panic!("No implementation for matching {variant_type:?}"); + } + }; + + qual = (( + (se - ss) as f64 / + read_qualities[ss..se].iter() + .map(|&q| 1.0f64 / q as f64) + .sum::() + ).max(4.0) / divisor_multiplier as f64) as u8; + + overlaps_allele = true; + trace!("\tallele = {}, qual = {}, ED = {}", allele, qual, edit_distance); + } else { + trace!("\tfailed allele match for ref extension"); + allele = 2; + qual = 0; + exact_allele = false; + overlaps_allele = true; + } + } else { + //no overlap + if aligned_range.contains(&variant_pos) { + trace!("\tOverlap, no position"); + overlaps_allele = true; + allele = 2; + } else { + // there is no alignment overlap + trace!("\tNo overlap"); + overlaps_allele = false; + allele = 3; + } + qual = 0; + exact_allele = false; + } + }, + VariantType::SvDeletion => { + // we need these to build coordinate ranges + let ref_allele_len: usize = variant.get_ref_len(); + + if aligned_range.contains(&variant_pos) { + // coordinate ranges we care about + let last_start_coordinate: usize = variant_pos as usize + 1; // add one because we want to include 
variant_pos + let first_end_coordinate: usize = variant_pos as usize + ref_allele_len; + if aligned_range.contains(&(first_end_coordinate as i64)) { + // calculate how many bases we expect to see deleted + let expected_deleted: usize = first_end_coordinate - last_start_coordinate; + + // now we need to move up and down until we find an anchor point + let mut start_anchor: usize = last_start_coordinate; + while !coordinate_lookup.contains_key(&(start_anchor as i64)) { + if start_anchor <= aligned_range.start as usize { + // fixes weird CIGARs where a mapping starts with non-matching types, e.g.: + // [SoftClip(3139), Del(798), Equal(4), ... + warn!("Reached start of read ({}) without finding start_anchor, using POS ({}) instead.", std::str::from_utf8(read.qname()).unwrap_or("utf8-decode-error"), start_anchor); + break; + } + start_anchor -= 1; + } + let mut end_anchor: usize = first_end_coordinate; + while !coordinate_lookup.contains_key(&(end_anchor as i64)) { + end_anchor += 1; + if end_anchor >= aligned_range.end as usize { + // we have not observed it, but this is a symmetrical handling of the weird CIGARs for the end + warn!("Reached end of read ({}) without finding end_anchor, using max ({}) found instead.", std::str::from_utf8(read.qname()).unwrap_or("utf8-decode-error"), end_anchor); + break; + } + } + + // count up the number of missing (i.e. 
deleted) based in the reference + let mut deleted_count: usize = 0; + for dc in start_anchor..end_anchor { + if !coordinate_lookup.contains_key(&(dc as i64)) { + deleted_count += 1; + } + } + + // it's possible to have more deleted bases than expected + // assert!(expected_deleted >= deleted_count); + + // the quality if we have exactly the right number of deleted (or not deleted) bases + let exact_allele_qual: f64 = 40.0; + // divisor for downweighting quality values, this is higher than indels currently + let deletion_factor: f64 = 8.0; + // fixes the ratios that match REF or ALT here to: REF = [0, match_window_size); ALT = (1.0 - match_window_size, 1.0 + match_window_size) + let match_window_size: f64 = 0.33; + + let deleted_ratio: f64 = deleted_count as f64 / expected_deleted as f64; + if deleted_ratio < match_window_size { + // mostly not deleted + allele = 0; + if deleted_ratio == 0.0 { + // this is pretty unlikely + qual = (exact_allele_qual / deletion_factor) as u8; + exact_allele = true; + } else { + qual = ((-10.0 * deleted_ratio.log10()) / deletion_factor).max(1.0) as u8; + exact_allele = false; + } + } else if (1.0 - deleted_ratio).abs() < match_window_size { + // mostly deleted and not over-deleted + allele = 1; + if deleted_ratio == 1.0 { + // this is pretty unlikely + qual = (exact_allele_qual / deletion_factor) as u8; + exact_allele = true; + } else { + qual = ((-10.0 * (1.0 - deleted_ratio).abs().log10()) / deletion_factor).max(1.0) as u8; + exact_allele = false; + } + + // this is getting labeled a deletion, force anything overlapping it to be reference (because it isn't there) + last_deletion_end = first_end_coordinate; + } else { + // ambiguous either because it's in between or over-deleted + allele = 2; + qual = 0; + exact_allele = false; + } + overlaps_allele = true; + } else { + // we have a partial overlap, but don't reach the far end + // mirror what we do above by marking overlap as true but otherwise a failure to match + allele = 2; + 
qual = 0; + exact_allele = false; + overlaps_allele = true; + } + } else { + // we don't overlap the start + allele = 3; + qual = 0; + exact_allele = false; + overlaps_allele = false; + } + }, + _ => { + panic!("Unhandled variant type: {variant_type:?}"); + } + }; + } + + // gather stats on the match + if overlaps_allele { + assert!(allele <= 2); + if allele == 2 { + failed_matches[vt_index] += 1; + } else { + if exact_allele { + exact_matches[vt_index] += 1; + } else { + inexact_matches[vt_index] += 1; + } + if allele == 0 { + allele0_matches[vt_index] += 1; + } else { + allele1_matches[vt_index] += 1; + } + num_overlaps += 1; + num_alleles += 1; + } + } else { + assert_eq!(allele, 3); + } + + // no matter what, we push these now + alleles.push(allele); + // make sure the quality is always at least 1 + quals.push(qual.max(1)); + } + assert_eq!(num_variants, alleles.len()); + assert_eq!(num_variants, quals.len()); + trace!("All alleles {:?}\n", alleles); + + if num_overlaps > 0 { + let read_name: String = String::from_utf8(read.qname().to_vec()).unwrap(); + let read_group: &mut Vec = read_groups.entry(read_name.clone()).or_insert(vec![]); + read_group.push(ReadSegment::new(read_name, alleles, quals)); + } else { + // this one has no overlaps, so it's just a skipped read + skipped_reads += 1; + } + } + } + + // now collapse all the reads, but only keeping those with at least 2 things set + let mut read_segments: IntervalTree = IntervalTree::new(); + let mut phasable_segments: IntervalTree = IntervalTree::new(); + for (_qname, read_group) in read_groups.iter() { + let collapsed_read: ReadSegment = ReadSegment::collapse(read_group); + let num_set: usize = collapsed_read.get_num_set(); + if num_set >= min_matched_alleles { + let segment_range = collapsed_read.get_range(); + read_segments.insert(segment_range, collapsed_read); + num_reads += read_group.len() as u64; + } else { + skipped_reads += read_group.len() as u64; + if num_set > 0 { + // even though this won't be 
used for phasing, it CAN be phased + let segment_range = collapsed_read.get_range(); + phasable_segments.insert(segment_range, collapsed_read); + } + } + } + + // sanity check this; this was before we started making sure failed alleles only applied if the mapping overlapped + // assert_eq!(num_alleles, (num_reads + skipped_reads) * (num_variants as u64)); + let segment_stats = ReadStats::new( + num_reads, skipped_reads, num_alleles, + exact_matches, inexact_matches, failed_matches, + allele0_matches, allele1_matches, + false + ); + debug!("Read segment stats: {:?}", segment_stats); + + Ok((read_segments, phasable_segments, segment_stats)) +} + +/// Loads up all the reads in a particular phase region and converts them into their variant representation. +/// This version uses global re-alignment to parse the alleles. +/// Returns an interval tree containing all reads to use for phasing, a second tree containing extra reads that *can* be phased but didn't match our criteria, +/// and statistics from loading the reads. 
+/// # Arguments +/// * `phase_problem` - the phase block we are loading data for +/// * `bam_paths` - the BAM files to parse, must be indexed +/// * `variant_calls` - the variants used to convert full reads into haplotype observations (`ReadSegment`) +/// * `hom_calls` - any homozygous variants within the region, these don't get phased but are useful for global realignment +/// * `reference_genome` - the reference genome sequences, required for this approach +/// * `min_matched_alleles` - the minimum number of identified alleles required for a read to be included +/// * `min_mapq` - the minimum MAPQ to consider a read +/// * `max_runtime` - controls the allowed runtime of the global realignment +/// * `wfa_prune_distance` - maximum allowed distance a wavefront can lag; make smaller to reduce run-time at the cost of accuracy +#[allow(clippy::too_many_arguments)] +#[allow(clippy::type_complexity)] +pub fn load_full_read_segments( + phase_problem: &PhaseBlock, bam_paths: &[PathBuf], variant_calls: &[Variant], hom_calls: &[Variant], + reference_genome: &ReferenceGenome, min_matched_alleles: usize, min_mapq: u8, + max_runtime: f32, wfa_prune_distance: usize +) -> Result<(IntervalTree, IntervalTree, ReadStats), Box> { + use rust_htslib::bam::Read; + use rust_htslib::bam::ext::BamRecordExtensions; + + let chromosome: &str = phase_problem.get_chrom(); + let num_variants: usize = variant_calls.len(); + let mut read_groups: HashMap> = Default::default(); + + // stats we track + let mut num_reads: u64 = 0; + let mut skipped_reads: u64 = 0; + let mut num_alleles: u64 = 0; + let mut exact_matches: [u64; VariantType::Unknown as usize + 1] = [0; VariantType::Unknown as usize + 1]; + let mut inexact_matches: [u64; VariantType::Unknown as usize + 1] = [0; VariantType::Unknown as usize + 1]; + let mut failed_matches: [u64; VariantType::Unknown as usize + 1] = [0; VariantType::Unknown as usize + 1]; + let mut allele0_matches: [u64; VariantType::Unknown as usize + 1] = [0; 
VariantType::Unknown as usize + 1]; + let mut allele1_matches: [u64; VariantType::Unknown as usize + 1] = [0; VariantType::Unknown as usize + 1]; + + let mut edit_distances: Vec = vec![]; + + let block_cpu_time = cpu_time::ThreadTime::now(); + + for bam_filename in bam_paths.iter() { + let mut bam_reader = bam::IndexedReader::from_path(bam_filename)?; + bam_reader.set_reference(reference_genome.filename())?; + bam_reader.fetch((chromosome, phase_problem.get_start(), phase_problem.get_end()+1))?; + + for read_entry in bam_reader.records() { + let time_elapsed: f32 = block_cpu_time.elapsed().as_secs_f32(); + if time_elapsed > max_runtime { + bail!("max_runtime reached"); + } + + let mut read = read_entry?; + + //make sure we care about the alignment + if filter_out_alignment_record(&read, min_mapq) { + continue; + } + + //build out the cigar info + read.cache_cigar(); + + //build a lookup from reference coordinate -> sequence coordinate + let mut coordinate_lookup: HashMap = Default::default(); + let mut min_position: i64 = i64::MAX; + let mut max_position: i64 = i64::MIN; + for bp in read.aligned_pairs() { + let segment_index = bp[0]; + let ref_index = bp[1]; + coordinate_lookup.insert(ref_index, segment_index); + min_position = min_position.min(ref_index); + max_position = max_position.max(ref_index); + } + assert!(max_position >= min_position); + + // max_position is the last one that we found, so add +1 to include in the range + let aligned_range = min_position..(max_position+1); + + //we will populate these with the variant level info + let mut num_overlaps: usize = 0; + let mut first_overlap: Option = None; + let mut last_overlap: usize = 0; + for (i, variant) in variant_calls.iter().enumerate() { + let variant_pos: i64 = variant.position(); + if aligned_range.contains(&variant_pos) { + if first_overlap.is_none() { + first_overlap = Some(i); + } + last_overlap = i+1; + num_overlaps += 1; + } + } + + // if this mapping overlaps no alleles, then there's no reason 
to look at it anymore + if num_overlaps == 0 { + skipped_reads += 1; + continue; + } + + // convert into a non-option + let first_overlap: usize = first_overlap.unwrap(); + assert_eq!(num_overlaps, last_overlap - first_overlap); + + // check for homozygous variants also + let mut first_hom_overlap: Option = None; + let mut last_hom_overlap: usize = 0; + for (i, variant) in hom_calls.iter().enumerate() { + let variant_pos: i64 = variant.position(); + if aligned_range.contains(&variant_pos) { + if first_hom_overlap.is_none() { + first_hom_overlap = Some(i); + } + last_hom_overlap = i+1; + } + } + let first_hom_overlap: usize = first_hom_overlap.unwrap_or(0); + + // .seq() returns Seq<'_> type, but we should just full decode + let read_sequence: Vec = read.seq().as_bytes(); + let read_qualities: &[u8] = read.qual(); + assert_eq!(read_sequence.len(), read_qualities.len()); + + // these should always exist based on how we set it up + let read_start: usize = *coordinate_lookup.get(&min_position).unwrap() as usize; + let read_end: usize = *coordinate_lookup.get(&max_position).unwrap() as usize; + + // pull out the part of the read we're aligning against + let read_align: &[u8] = &read_sequence[read_start..(read_end+1)]; + + /* + Current state: + - we have the reference genome + - we have the part of the read that aligns in `read_align`, the full read sequence in `read_sequence` + - we have the indices of the first and last variant overlaps in `first_overlap` and `last_overlap` + + We need to populate: + - alleles + - quals + - read stats (see below) + + Game plan: + - construct a graph representing just this reference location + relevant alleles + - while constructing, assign alleles to each new branch (it may be reference allele) + -- IF you have multiple alleles starting at the same coordinate (e.g. 
identical call), then do not create an in-between node; this should resolve in the tie-breaking as "identical" + -- so each branch should get a variant index + an allele assignment (0/1); reference alleles may end up with multiple 0 alleles in the event of multi-start + - align the read via POA + - look at the traversed nodes and copy the allele assignments; if anything is unassigned at the end, it gets 2; any with conflicting assignments get 2 also + - update stats according to the assignments, we can't really do exact right now (maybe we can look at score deltas from one node to the next?) + */ + let chrom_seq: &[u8] = reference_genome.get_full_chromosome(chromosome); + + // we need to also provide any preset alleles + let start_time = std::time::Instant::now(); + let (wfa_graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = + WFAGraph::from_reference_variants_with_hom( + chrom_seq, + &variant_calls[first_overlap..last_overlap], // these are both range style indices + &hom_calls[first_hom_overlap..last_hom_overlap], + min_position as usize, + max_position as usize + 1 + ).unwrap(); + + let wfa_result: WFAResult = match wfa_graph.edit_distance_with_pruning(read_align, wfa_prune_distance) { + Ok(wr) => wr, + Err(e) => { + bail!( + "Encountered WFA error for mapping \"{}\" ({}:{}): {}", + std::str::from_utf8(read.qname()).unwrap_or("QNAME_UTF8_ERROR"), + chromosome, read.pos(), e + ); + } + }; + + debug!( + "B#{} WFAGraph result ({}) => num_nodes: {}, read_len: {}, variant_overlaps: {}, edit_distance: {}", + phase_problem.get_block_index(), start_time.elapsed().as_secs_f32(), wfa_graph.get_num_nodes(), max_position-min_position+1, num_overlaps, wfa_result.score() + ); + + edit_distances.push(wfa_result.score()); + + //we will populate these with the variant level info + let mut alleles: Vec = vec![3; num_variants]; + for traversed_index in wfa_result.traversed_nodes().iter() { + for &(var_index, allele_assignment) in 
node_to_alleles.get(traversed_index).unwrap_or(&vec![]).iter() { + let correct_index: usize = first_overlap+var_index; + if alleles[correct_index] == 3 { + alleles[correct_index] = allele_assignment; + } else if alleles[correct_index] != allele_assignment { + alleles[correct_index] = 2; + } + } + } + + // go through the result counting assigned and setting qualities + let mut quals: Vec = vec![0; num_variants]; + for (i, a) in alleles.iter_mut().enumerate() { + let variant_type: VariantType = variant_calls[i].get_type(); + let vt_index: usize = variant_type as usize; + if *a == 3 { + // no overlaps for this allele + } else if *a == 2 { + // overlaps, but ambiguous matching + failed_matches[vt_index] += 1; + } else { + // we got a match, figure out the quality for it + quals[i] = match variant_type { + // these weights are up-weights for global re-alignments + // SNVs tend to always be the cleanest + VariantType::Snv => 8, + + // these are probably the noisiest of the bunch + VariantType::Deletion | + VariantType::Insertion | + VariantType::Indel => 1, + + // these should be pretty high confidence because they have a lot of bases to make them work + VariantType::SvDeletion | + VariantType::SvInsertion => 2, + + // we want tandem repeats to have higher confidence than random indels + VariantType::TandemRepeat => 4, + + _ => { + panic!("No implementation for matching {variant_type:?}"); + } + }; + + // gather stats on the match + let exact_allele = false; // TODO: figure this out + if exact_allele { + exact_matches[vt_index] += 1; + } else { + inexact_matches[vt_index] += 1; + } + if *a == 0 { + allele0_matches[vt_index] += 1; + } else { + allele1_matches[vt_index] += 1; + } + num_alleles += 1; + } + } + + // need to check what these were before + assert_eq!(num_variants, alleles.len()); + assert_eq!(num_variants, quals.len()); + trace!("All alleles {:?}\n", alleles); + + let read_name: String = String::from_utf8(read.qname().to_vec()).unwrap(); + let read_group: &mut 
Vec = read_groups.entry(read_name.clone()).or_insert(vec![]); + read_group.push(ReadSegment::new(read_name, alleles, quals)); + } + } + + // now collapse all the reads, but only keeping those with at least 2 things set + let mut read_segments: IntervalTree = IntervalTree::new(); + let mut phasable_segments: IntervalTree = IntervalTree::new(); + for (_qname, read_group) in read_groups.iter() { + let collapsed_read: ReadSegment = ReadSegment::collapse(read_group); + let num_set: usize = collapsed_read.get_num_set(); + if num_set >= min_matched_alleles { + let segment_range = collapsed_read.get_range(); + read_segments.insert(segment_range, collapsed_read); + num_reads += read_group.len() as u64; + } else { + skipped_reads += read_group.len() as u64; + if num_set > 0 { + // even though this won't be used for phasing, it CAN be phased + let segment_range = collapsed_read.get_range(); + phasable_segments.insert(segment_range, collapsed_read); + } + } + } + + // sanity check this; this was before we started making sure failed alleles only applied if the mapping overlapped + // assert_eq!(num_alleles, (num_reads + skipped_reads) * (num_variants as u64)); + let segment_stats = ReadStats::new( + num_reads, skipped_reads, num_alleles, + exact_matches, inexact_matches, failed_matches, + allele0_matches, allele1_matches, + true + ); + debug!("Read segment stats: {:?}", segment_stats); + debug!("Edit distances: {:?}", edit_distances); + + Ok((read_segments, phasable_segments, segment_stats)) +} \ No newline at end of file diff --git a/src/sequence_alignment.rs b/src/sequence_alignment.rs new file mode 100644 index 0000000..0ff9721 --- /dev/null +++ b/src/sequence_alignment.rs @@ -0,0 +1,77 @@ + +/// Returns the edit distance between two u8 Vecs by doing the full grid calculation. +/// This version is row-based (rows are length of v1) for the main loop. 
/// # Arguments
/// * `v1` - the first sequence
/// * `v2` - the second sequence
pub fn edit_distance(v1: &[u8], v2: &[u8]) -> usize {
    // structured such that each "row" is the length of v1 (i.e. v1 is conceptually on the x-axis)
    let l1: usize = v1.len();
    let mut row: Vec<usize> = vec![0; l1+1];
    // the first row is 0..=l1: cost of deleting i characters of v1 against an empty prefix of v2
    let mut prev_row: Vec<usize> = (0..l1+1).collect();

    // go through each row
    for (i, &c2) in v2.iter().enumerate() {
        row[0] = i+1;
        for (j, &c1) in v1.iter().enumerate() {
            // 0-cost diagonal on a match, 1-cost substitution otherwise
            let sub_cost: usize = if c1 == c2 { 0 } else { 1 };
            row[j+1] = (prev_row[j+1] + 1) // skip a character in v2
                .min(row[j] + 1)           // skip a character in v1
                .min(prev_row[j] + sub_cost); // diagonal match/mismatch
        }

        // swap the rows at the end of each iteration
        std::mem::swap(&mut row, &mut prev_row);
    }

    prev_row[l1]
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_edit_distance() {
        let v1: Vec<u8> = vec![0, 1, 2, 4, 5];
        let v2: Vec<u8> = vec![0, 1, 3, 4, 5];
        let v3: Vec<u8> = vec![1, 2, 3, 5];
        let v4: Vec<u8> = vec![];

        assert_eq!(edit_distance(&v1, &v1), 0);
        assert_eq!(edit_distance(&v1, &v2), 1);
        assert_eq!(edit_distance(&v1, &v3), 2);
        assert_eq!(edit_distance(&v1, &v4), 5);

        assert_eq!(edit_distance(&v2, &v2), 0);
        assert_eq!(edit_distance(&v2, &v3), 3);
        assert_eq!(edit_distance(&v2, &v4), 5);

        assert_eq!(edit_distance(&v3, &v3), 0);
        assert_eq!(edit_distance(&v3, &v4), 4);

        assert_eq!(edit_distance(&v4, &v4), 0);
    }

    #[test]
    fn test_edit_error_001() {
        let v1 = [65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 67, 65, 65, 65];
        let v2 = [65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 67, 65, 65, 65, 65, 65, 65, 67, 65, 65, 65];
        let v3 = [65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 67, 65, 65, 65];

        assert_eq!(edit_distance(&v1, &v3), 1);
        assert_eq!(edit_distance(&v2, &v3), 1);
        assert_eq!(edit_distance(&v3, &v1), 1);
        assert_eq!(edit_distance(&v3, &v2), 1);
    }
}
file diff --git a/src/wfa_graph.rs b/src/wfa_graph.rs new file mode 100644 index 0000000..513f0bf --- /dev/null +++ b/src/wfa_graph.rs @@ -0,0 +1,1157 @@ + +use crate::data_types::variants::Variant; + +use bit_vec::BitVec; +#[allow(unused_imports)] +use log::{debug, trace, warn}; +use priority_queue::PriorityQueue; +use simple_error::bail; +use std::cmp::Reverse; +use rustc_hash::FxHashMap as HashMap; + +pub type NodeAlleleMap = HashMap>; + +/// Contains the core data that represents a "node". +/// Most functional logic will not be in the struct, this is mostly a container. +#[derive(Debug)] +struct WFANode { + /// this node's index + node_index: usize, + /// the sequence contained by the POA node + sequence: Vec, + /// contains the indices of the parent nodes, sorted + parent_nodes: Vec +} + +impl WFANode { + /// Create a new WFANode and performs sanity checks on inputs. + pub fn new(node_index: usize, sequence: Vec, mut parent_nodes: Vec) -> WFANode { + parent_nodes.sort(); + WFANode { + node_index, + sequence, + parent_nodes + } + } + + #[allow(dead_code)] + pub fn node_index(&self) -> usize { + self.node_index + } + + pub fn sequence(&self) -> &[u8] { + &self.sequence + } + + #[allow(dead_code)] + pub fn parent_nodes(&self) -> &[usize] { + &self.parent_nodes + } +} + +/// Contains functionality for building a Partial-Order Alignment (POA) graph and then aligning a sequence to it. +/// Assumes that the last node added to the graph is the "target" or "destination" for mapping. +#[derive(Default)] +pub struct WFAGraph { + /// all the nodes in the graph so far + nodes: Vec, + /// all the edges from one node to the next + edges: Vec> +} + +impl WFAGraph { + /// Creates a new empty graph + pub fn new() -> WFAGraph { + WFAGraph { + nodes: Default::default(), + edges: Default::default() + } + } + + /// Constructs a WFA graph using only heterozygous variant types. This is the het-only entry point for graph construction. 
+ /// # Arguments + /// * `reference` - the reference sequence that is the backbone for the graph + /// * `variants` - a set of heterozygous variants that we are trying to assign to a read + /// * `ref_start` - the reference start coordinate, used for offsetting variant positions; 0-based inclusive + /// * `ref_end` - the reference end coordinate, used for offsetting variant positions; 0-based exclusive + /// # Errors + /// * if there are any errors from adding a node to the graph + pub fn from_reference_variants(reference: &[u8], variants: &[Variant], ref_start: usize, ref_end: usize) -> + Result<(WFAGraph, NodeAlleleMap), Box> { + Self::from_reference_variants_with_hom( + reference, + variants, + &[], + ref_start, + ref_end + ) + } + + /// Constructs a WFA graph using both heterozygous and homozygous variant types. This is the het/hom entry point for graph construction. + /// # Arguments + /// * `reference` - the reference sequence that is the backbone for the graph + /// * `variants` - a set of heterozygous variants that we are trying to assign to a read + /// * `hom_variants` - a set of homozygous variants that we are not trying to assign to a read, but they make alignment better + /// * `ref_start` - the reference start coordinate, used for offsetting variant positions; 0-based inclusive + /// * `ref_end` - the reference end coordinate, used for offsetting variant positions; 0-based exclusive + /// # Errors + /// * if there are any errors from adding a node to the graph + pub fn from_reference_variants_with_hom(reference: &[u8], variants: &[Variant], hom_variants: &[Variant], ref_start: usize, ref_end: usize) -> + Result<(WFAGraph, NodeAlleleMap), Box> { + + let mut graph: WFAGraph = Default::default(); + let mut node_to_alleles: NodeAlleleMap = Default::default(); + + let mut previous_end: usize = ref_start; + let mut reference_index: usize; + + // this tracks nodes that need to be reconnected next, i.e. 
parents for the _next_ reference node + // initial one is empty because there is no start node yet + let mut reference_reconnect: Vec = vec![]; + // marks the alleles that should be tied to the next reference insertion + let mut reference_alleles: Vec<(usize, u8)> = vec![]; + + // this is a queue where the key is reconnect position and value is the node to reconnect at that juncture + let mut reconnect_queue: PriorityQueue> = PriorityQueue::new(); + + let mut all_variants: Vec<(&Variant, Option)> = Vec::with_capacity(variants.len() + hom_variants.len()); + for (variant_index, variant) in variants.iter().enumerate() { + all_variants.push((variant, Some(variant_index))); + } + for variant in hom_variants.iter() { + all_variants.push((variant, None)); + } + all_variants.sort_by(|v1, v2| v1.0.position().cmp(&v2.0.position())); + + for (variant, variant_index) in all_variants.iter() { + if variant.is_ignored() { + // I don't think we need a trace message here for now + continue; + } + + // look at where this variant is + let variant_pos: usize = variant.position() as usize; + let ref_len: usize = variant.get_ref_len(); + if variant_pos + ref_len > ref_end { + // this variant end after our reference block, so ignore it + trace!("Ignoring variant ending at {} after ref_end {}", variant_pos+ref_len, ref_end); + continue; + } + + // while we have something to reconnect that reconnects BEFORE the next variant, handle it + while (reconnect_queue.peek().unwrap_or((&usize::MAX, &Reverse(usize::MAX))).1).0 <= variant_pos { + // get the next thing that needs to reconnect before the next variant + let (alt_index, Reverse(alt_reconnect)) = reconnect_queue.pop().unwrap(); + assert!(alt_reconnect > previous_end); + + // first, we have to build up the reference node up until the reconnect point + let ref_sequence: Vec = reference[previous_end..alt_reconnect].to_vec(); + reference_index = graph.add_node(ref_sequence, reference_reconnect)?; + if !reference_alleles.is_empty() { + 
node_to_alleles.insert(reference_index, reference_alleles); + reference_alleles = vec![]; + } + previous_end = alt_reconnect; + + // now prep the next one by marking that reference node plus the alt_index we are reconnecting + reference_reconnect = vec![reference_index, alt_index]; + + // also check if any other reconnects have an identical reconnect point + while reconnect_queue.peek().unwrap_or((&usize::MAX, &Reverse(usize::MAX))).1.0 == alt_reconnect { + let (ai2, Reverse(ar2)) = reconnect_queue.pop().unwrap(); + assert_eq!(alt_reconnect, ar2); + reference_reconnect.push(ai2); + } + } + + // at this point, any reconnections before this point have been resolved + + // check if the reference ended upstream of this variant (which is usually true) OR + // if the graph is currently empty, indicating that there is a variant at position 0 and we need a dummy start node + if previous_end < variant_pos || graph.get_num_nodes() == 0 { + // we need to catch up the reference, add a node representing all sequence up to this point + let ref_sequence: Vec = reference[previous_end..variant_pos].to_vec(); + + // now add the reference node that catches us up to this variant + reference_index = graph.add_node(ref_sequence, reference_reconnect)?; + if !reference_alleles.is_empty() { + node_to_alleles.insert(reference_index, reference_alleles); + reference_alleles = vec![]; + } + + // set these fields for the next reference node that gets added + reference_reconnect = vec![reference_index]; + previous_end = variant_pos; + } else { + assert!(previous_end == variant_pos); + // in this situation, we have already generated the sequence up to this variant, likely because two variants start at the same location + // we should not have to do anything special because we already know the upstream index + } + + // now add the alt allele(s) + if variant.convert_index(0) != 0 { + // allele0 is an alt, so this must be multi-allelic; basically do the same thing we would do for allele1 + // add 
// the sequence exactly with just the immediately upstream reference node
                let alt_sequence: Vec<u8> = variant.get_truncated_allele0().to_vec();
                let parent_nodes: Vec<usize> = reference_reconnect.clone();
                let alt_index: usize = graph.add_node(alt_sequence, parent_nodes)?;
                let alt_reconnect: usize = variant_pos + ref_len;

                // also mark this alt node as having this particular allele0
                if let Some(vi) = variant_index {
                    node_to_alleles.insert(alt_index, vec![(*vi, 0)]);
                }

                // now we need to mark this new node for reconnection downstream
                reconnect_queue.push(alt_index, Reverse(alt_reconnect));
            } else {
                // the 0 allele is just reference, so add it to the reference allele set
                if let Some(vi) = variant_index {
                    reference_alleles.push((*vi, 0));
                }
            }

            // allele1 is *always* an alt, add the sequence exactly with just the immediately upstream reference node
            let alt_sequence: Vec<u8> = variant.get_truncated_allele1().to_vec();
            let parent_nodes: Vec<usize> = reference_reconnect.clone();
            let alt_index: usize = graph.add_node(alt_sequence, parent_nodes)?;
            let alt_reconnect: usize = variant_pos + ref_len;

            // also mark this alt node as having this particular allele
            if let Some(vi) = variant_index {
                node_to_alleles.insert(alt_index, vec![(*vi, 1)]);
            }

            // now we need to mark this new node for reconnection downstream
            reconnect_queue.push(alt_index, Reverse(alt_reconnect));
        }

        // reconnect everything downstream from here
        while !reconnect_queue.is_empty() {
            let (alt_index, Reverse(alt_reconnect)) = reconnect_queue.pop().unwrap();
            assert!(alt_reconnect > previous_end);
            let ref_sequence: Vec<u8> = reference[previous_end..alt_reconnect].to_vec();
            reference_index = graph.add_node(ref_sequence, reference_reconnect)?;
            if !reference_alleles.is_empty() {
                node_to_alleles.insert(reference_index, reference_alleles);
                reference_alleles = vec![];
            }
            previous_end = alt_reconnect;

            // now prep the next one
            reference_reconnect = vec![reference_index, alt_index];
            // pull any other alt nodes that reconnect at exactly this same reference position
            while reconnect_queue.peek().unwrap_or((&usize::MAX, &Reverse(usize::MAX))).1.0 == alt_reconnect {
                let (ai2, Reverse(ar2)) = reconnect_queue.pop().unwrap();
                assert_eq!(alt_reconnect, ar2);
                reference_reconnect.push(ai2);
            }
        }

        // now we just have one last reference node to add
        assert!(previous_end <= ref_end);
        let ref_sequence: Vec<u8> = reference[previous_end..ref_end].to_vec();
        graph.add_node(ref_sequence, reference_reconnect)?;

        // make sure we didn't have any loose reference alleles hanging about, I don't think this can happen unless users enter weird stuff
        assert!(reference_alleles.is_empty());

        Ok((graph, node_to_alleles))
    }

    /// Returns the number of nodes currently in the graph.
    pub fn get_num_nodes(&self) -> usize {
        self.nodes.len()
    }

    /// Adds a node to the graph and returns its index as a Result.
    /// # Arguments
    /// * `sequence` - the vector of sequence to add with this node
    /// * `parent_nodes` - the index of any upstream nodes in the graph
    /// # Errors
    /// * if the first node inserted has parents; first node is assumed root, so this would break that assumption
    /// * if any subsequent node is parent-less; all nodes must stem from the root
    /// * if any parent node has an index >= this node's index; this is a DAG only
    pub fn add_node(&mut self, sequence: Vec<u8>, parent_nodes: Vec<usize>) -> Result<usize, Box<dyn std::error::Error>> {
        let new_index: usize = self.nodes.len();

        // sanity checks on what is being added, node wise anyways
        if new_index == 0 {
            // this is the first node, it should not have any parents
            if !parent_nodes.is_empty() {
                bail!("First node must have no parent nodes.");
            }
        } else {
            // this is a non-first node, it MUST have a parent
            if parent_nodes.is_empty() {
                bail!("All nodes after the first must have at least one parent node.");
            }
            // make sure all parent node indices come before this node
            for &pn in parent_nodes.iter() {
                if new_index <= pn {
                    bail!("All parent nodes must come before this node.");
                }
            }
        }

        // add any new edges from parents
        for &p_index in parent_nodes.iter() {
            self.edges[p_index].push(new_index);
        }

        // add the new node with an empty set of edges coming from it
        let new_node: WFANode = WFANode::new(new_index, sequence, parent_nodes);
        self.nodes.push(new_node);
        self.edges.push(vec![]);

        Ok(new_index)
    }

    /// Calculates the edit distance of `other_sequence` onto this graph, returning both the score and the traversed nodes to achieve that score.
    /// # Arguments
    /// * `other_sequence` - the sequence being aligned to this graph
    /// # Errors
    /// * if the maximum edit distance is reached; this is a safeguard from run-away loops
    pub fn edit_distance(&self, other_sequence: &[u8]) -> Result<WFAResult, Box<dyn std::error::Error>> {
        self.edit_distance_with_pruning(other_sequence, usize::MAX)
    }

    /// Calculates the edit distance of `other_sequence` onto this graph using the WFA algorithm, returning both the score and the traversed nodes to achieve that score.
    /// If multiple paths exist that are equal, all nodes along each path are returned, allowing us to mark ambiguity.
    /// This mode will prune wavefronts that fall too far behind the farthest, leading to potentially incorrect results under certain conditions.
    /// # Arguments
    /// * `other_sequence` - the sequence being aligned to this graph
    /// * `prune_distance` - if a wavefront is behind the farthest wavefront by this distance, it will be pruned; set to usize::MAX to disable pruning
    /// # Errors
    /// * if the maximum edit distance is reached; this is a safeguard from run-away loops
    pub fn edit_distance_with_pruning(&self, other_sequence: &[u8], prune_distance: usize) -> Result<WFAResult, Box<dyn std::error::Error>> {
        // We will structure the algorithm mentally such that X-axis is the graph and Y-axis is `other_sequence`.
        // This means we are iterating on columns representing characters in the graph.
        // Each column will be other_len long.

        // each node *may* have a set of active wavefronts indicating progression - HashMap of (at most) length N
        // these active wavefronts must all be the same ED, but may not be adjacent (if one branch has a big indel) - HashMap based on start position in other_sequence
        // each start position should have current distance into the `other_sequence` along with a set of upstream nodes - (a, Vec(b)) where a is an offset and b is index of set(s)
        let mut active_wavefronts: HashMap<usize, HashMap<isize, Vec<(usize, usize)>>> = Default::default();
        let mut next_wavefronts: HashMap<usize, HashMap<isize, Vec<(usize, usize)>>> = Default::default();
        let mut max_wavefronts: HashMap<usize, HashMap<isize, usize>> = Default::default();

        // we also need to track which nodes are traversed by our particular path
        let mut treeset_to_index: HashMap<BitVec, usize> = HashMap::default();
        let mut index_to_treeset: Vec<BitVec> = vec![];

        // we always start in node 0, so make that set
        let mut base_bitvec: BitVec = BitVec::from_elem(self.nodes.len(), false);
        base_bitvec.set(0, true);
        assert!(treeset_to_index.insert(base_bitvec.clone(), 0).is_none());
        index_to_treeset.push(base_bitvec);

        let base_hashset_index: usize = 0;

        // insert the starting wavefront at `other_sequence`[0] + offset = 0, with set containing just node #0
        let mut initial_wavefront: HashMap<isize, Vec<(usize, usize)>> = Default::default();
        initial_wavefront.insert(0, vec![(0, base_hashset_index)]);
        // this goes into node 0
        active_wavefronts.insert(0, initial_wavefront);

        // in a given loop, these will track any nodes that were actively moving wavefronts
        let mut min_active_wavefront: usize;
        let mut max_active_wavefront: usize;

        // each loop of WFA will increase our edit distance by 1
        let mut edit_distance: usize = 0;
        let max_edit_distance: usize = 100000;
        let mut farthest_progression: usize = 0;
        let mut min_progression: usize = 0;

        let mut encountered_nodes: HashMap<usize, bool> = Default::default();

        loop {
            /*
             * Outline of the core POA-WFA algorithm
             * 1. Extend all current wavefronts and create splits - this is loop 1, traversed in node order
             * 2. REMOVED - Back-propagate maximum progressions - this is loop 2, traversed in reverse node order;
             *    this was a net drag on the runtime AND would get wrong results on occasion
             * 3. Increase edit distance to match splits
             */
            min_active_wavefront = usize::MAX;
            max_active_wavefront = 0;

            // trace!("WFAGraph ed={} start: farthest_progression = {}, set_len = {}", edit_distance, farthest_progression, index_to_treeset.len());

            // we can iterate over our nodes in order because they are DAGs entered in order
            let mut wavefronts_scanned = 0;
            for (node_index, node) in self.nodes.iter().enumerate() {
                /*
                 * Outline of this core extension loop:
                 * 1. Push all wavefronts for this node forward
                 * 2. Collapse all wavefronts on the same diagonal such that only the best remain.
                 *    - These get consolidated into a single WF. If it's worse than the best so far, we remove it from consideration.
                 * 3. If a diagonal hits the end of this node's sequence, copy it into all of the children nodes.
                 *    - If it's the final node and has progress through all sequence, we instead mark it as finished.
                 * 4. Once we reach the end, generate any splits that must have ed = ed+1
                 */

                // check if this node has *any* active wavefronts
                if !active_wavefronts.contains_key(&node_index) {
                    continue;
                }

                // if this entry doesn't exist, set it
                if let std::collections::hash_map::Entry::Vacant(e) = encountered_nodes.entry(node_index) {
                    e.insert(true);
                    trace!("WFAGraph n#{} start: min ed = {}", node_index, edit_distance);
                }

                // update our min & max
                min_active_wavefront = min_active_wavefront.min(node_index);
                max_active_wavefront = max_active_wavefront.max(node_index);

                let node_sequence: &[u8] = node.sequence();
                let node_length: usize = node_sequence.len();

                // pull out the active wavefront for this node
                let mut wavefront: HashMap<isize, Vec<(usize, usize)>> = active_wavefronts.remove(&node_index).unwrap();
                let maxfront: &mut HashMap<isize, usize> = max_wavefronts.entry(node_index).or_insert_with(Default::default);

                // `other_start` represent the first position in `other_sequence` for this WF diagonal,
                // and it *can* be negative when we "delete" more node sequence than other sequence
                // `offset` (below) represents the offset into the current node we are comparing currently
                // if `other_start` is negative, then the corresponding `offset` values must be positive enough to overcome it (e.g. >= 0 when added)
                for (other_start, vec_waves) in wavefront.iter_mut() {
                    wavefronts_scanned += 1;

                    // first extend all wavefronts as far as possible, tracking the farthest
                    let mut max_offset: usize = 0;
                    for (offset, _hashset_index) in vec_waves.iter_mut() {
                        // get the position in `other_sequence` we are currently comparing against
                        assert!(other_start + *offset as isize >= 0);
                        let mut other_position: usize = (other_start + *offset as isize) as usize;

                        // now extend as far as we can, making sure to check for boundaries and inequality in bases
                        while *offset < node_length &&
                            other_position < other_sequence.len() &&
                            node_sequence[*offset] == other_sequence[other_position] {
                            *offset += 1;
                            other_position += 1;
                        }
                        max_offset = max_offset.max(*offset);
                    }

                    // if we go along a diagonal and max is less than we've seen before, then this is a suboptimal solution we ignore
                    let maxfront_record: &mut usize = maxfront.entry(*other_start).or_insert(0);
                    if max_offset < *maxfront_record || (other_start + max_offset as isize) < min_progression as isize {
                        // skips_triggered += 1;
                        // vec_waves.clear();
                        continue;
                    }
                    *maxfront_record = max_offset;

                    // double check this truth
                    assert!(other_start + max_offset as isize >= 0);
                    farthest_progression = farthest_progression.max((other_start + max_offset as isize) as usize);

                    // now collapse down everything that made it to the max_offset
                    let best_offset = max_offset;
                    let mut best_sets: Vec<usize> = vec![];
                    for &(o, s) in vec_waves.iter() {
                        if o == best_offset {
                            best_sets.push(s);
                        }
                    }

                    // remove duplicates
                    best_sets.sort();
                    best_sets.dedup();

                    let best_set: usize = if best_sets.len() > 1 {
                        // we have multiple bests, collapse them into a single index
                        let mut set_union: BitVec = BitVec::from_elem(self.nodes.len(), false);
                        for &set_index in best_sets.iter() {
                            let other_set: &BitVec = &index_to_treeset[set_index];
                            set_union.or(other_set);
                        }

                        // get the index of this entry (or create one if necessary)
                        let new_set_index: usize = match treeset_to_index.get(&set_union) {
                            Some(i) => { *i },
                            None => {
                                index_to_treeset.push(set_union.clone());
                                treeset_to_index.insert(set_union, index_to_treeset.len() - 1);
                                index_to_treeset.len() - 1
                            }
                        };
                        new_set_index
                    } else {
                        // only one remains, just copy it
                        best_sets[0]
                    };

                    if max_offset == node_length {
                        // we are at the end of this node, do different things depending on if this is the final node or not
                        if node_index == self.nodes.len() - 1 {
                            assert!(other_start + max_offset as isize >= 0);
                            if ((other_start + max_offset as isize) as usize) < other_sequence.len() {
                                // we are *not* at the end of other sequence, but we *are* at the end of the graph
                                // now we would normally split this into three waves on this node, but only the +1 is valid in this situation
                                let node_wf: &mut HashMap<isize, Vec<(usize, usize)>> = next_wavefronts.entry(node_index).or_insert_with(Default::default);
                                // +1 on diagonal - graph does not advance, other does (other has relative insertion); other_start is one more, but offset does not increase
                                let plus_diagonal: &mut Vec<(usize, usize)> = node_wf.entry(*other_start + 1).or_insert(vec![]);
                                plus_diagonal.push((max_offset, best_set));
                            } else {
                                // we are at the end of both final node and the other sequence
                                // we will handle anything below
                            }
                        } else {
                            assert!(other_start + max_offset as isize >= 0);

                            // we are not in the final node, so we need to push this to successor nodes for more extension
                            // the `new_offset` tells our algorithm which base we're comparing and orients us to a diagonal
                            let new_offset: isize = other_start + max_offset as isize;
                            for &successor_index in self.edges[node_index].iter() {
                                let node_wf: &mut HashMap<isize, Vec<(usize, usize)>> = active_wavefronts.entry(successor_index).or_insert_with(Default::default);
                                let copy_diagonal: &mut Vec<(usize, usize)> = node_wf.entry(new_offset).or_insert(vec![]);

                                // the successor set should include the best + the successors node index
                                let current_set: &BitVec = &index_to_treeset[best_set];
                                let mut new_set: BitVec = BitVec::from_elem(self.nodes.len(), false);
                                new_set.set(successor_index, true);
                                new_set.or(current_set);

                                // get the index of this entry (or create one if necessary)
                                let new_set_index: usize = match treeset_to_index.get(&new_set) {
                                    Some(i) => { *i },
                                    None => {
                                        index_to_treeset.push(new_set.clone());
                                        treeset_to_index.insert(new_set, index_to_treeset.len() - 1);
                                        index_to_treeset.len() - 1
                                    }
                                };
                                copy_diagonal.push((0, new_set_index));
                            }
                        }
                    } else {
                        // now we split this into three waves on this node
                        let node_wf: &mut HashMap<isize, Vec<(usize, usize)>> = next_wavefronts.entry(node_index).or_insert_with(Default::default);
                        // -1 on diagonal - graph advances, other does not (other has relative deletion); other_start is one less, but the offset increases still
                        let minus_diagonal: &mut Vec<(usize, usize)> = node_wf.entry(*other_start - 1).or_insert(vec![]);
                        minus_diagonal.push((max_offset + 1, best_set));

                        // these two can only happen if sequence remains in other
                        assert!(*other_start + max_offset as isize >= 0);
                        if ((*other_start + max_offset as isize) as usize) < other_sequence.len() {
                            // +0 on diagonal - both node and other advance with mismatch; other_start does not change, but offset increases +1
                            let zero_diagonal: &mut Vec<(usize, usize)> = node_wf.entry(*other_start).or_insert(vec![]);
                            zero_diagonal.push((max_offset + 1, best_set));

                            // +1 on diagonal - graph does not advance, other does (other has relative insertion); other_start is one more, but offset does not increase
                            let plus_diagonal: &mut Vec<(usize, usize)> = node_wf.entry(*other_start + 1).or_insert(vec![]);
                            plus_diagonal.push((max_offset, best_set));
                        }
                    }
                }

                if node_index == self.nodes.len() - 1 {
                    // we are at the last node, check if we reached the end
                    // this will store any final results
                    let mut final_hashsets: Vec<usize> = vec![];
                    for (other_start, vec_waves) in wavefront.iter() {
                        for &(offset, hashset_index) in vec_waves.iter() {
                            // if we are at the end of the node AND our sequence
                            assert!(other_start + offset as isize >= 0);
                            if offset == node_length && (other_start + offset as isize) as usize == other_sequence.len() {
                                final_hashsets.push(hashset_index);
                            }
                        }
                    }

                    if !final_hashsets.is_empty() {
                        // we've reached the end through one or more means, collapse and return
                        // remove duplicates
                        final_hashsets.sort();
                        final_hashsets.dedup();

                        // now collapse the non-duplicates if necessary
                        let best_set: usize = if final_hashsets.len() > 1 {
                            // we have more than one, merge them all together and then return that one
                            let mut set_union: BitVec = BitVec::from_elem(self.nodes.len(), false);
                            for &set_index in final_hashsets.iter() {
                                let other_set: &BitVec = &index_to_treeset[set_index];
                                set_union.or(other_set);
                            }

                            // get the index of this entry (or create one if necessary)
                            let new_set_index: usize = match treeset_to_index.get(&set_union) {
                                Some(i) => { *i },
                                None => {
                                    index_to_treeset.push(set_union.clone());
                                    treeset_to_index.insert(set_union, index_to_treeset.len() - 1);
                                    index_to_treeset.len() - 1
                                }
                            };
                            new_set_index
                        } else {
                            // only one exists, just copy it
                            final_hashsets[0]
                        };

                        let sorted_traversed_nodes: Vec<usize> = index_to_treeset[best_set].iter()
                            .enumerate()
                            .filter(|(_i, b)| *b)
                            .map(|(i, _b)| i)
                            .collect();
                        return Ok(WFAResult {
                            score: edit_distance,
                            traversed_nodes: sorted_traversed_nodes
                        });
                    }
                }
            }

            // end of loop - increase ED and update active wavefronts
            edit_distance += 1;
            active_wavefronts = next_wavefronts;
            next_wavefronts = Default::default();

            if farthest_progression > prune_distance {
                min_progression = farthest_progression - prune_distance;
            }

            trace!("edit_distance => {}, wave_fronts scanned => {}, active_indices={}..{}", edit_distance, wavefronts_scanned, min_active_wavefront, max_active_wavefront);
+ + // safety while debugging + if edit_distance > max_edit_distance { + bail!("Max_edit_distance ({}) reached during WFA solving", max_edit_distance); + } + } + } +} + +/// Container for POA results. +#[derive(Debug, Eq, PartialEq)] +pub struct WFAResult { + /// The score of the best match from the alignment + score: usize, + /// Nodes that were traversed to get this best match; conflicting node results indicate a tie in which branch should be traversed. + traversed_nodes: Vec +} + +impl WFAResult { + pub fn score(&self) -> usize { + self.score + } + + pub fn traversed_nodes(&self) -> &[usize] { + &self.traversed_nodes + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_single_node() { + // create a new graph and add a single node to it + let mut graph: WFAGraph = WFAGraph::new(); + let v1: Vec = vec![0, 1, 2, 4, 5]; + graph.add_node(v1.clone(), vec![]).unwrap(); + + // test sequences + let v2: Vec = vec![0, 1, 3, 4, 5]; + let v3: Vec = vec![1, 2, 3, 5]; + let v4: Vec = vec![]; + + // check the nodes in the first one, but just score for the rest + assert_eq!(graph.edit_distance(&v1).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0] }); + assert_eq!(graph.edit_distance(&v2).unwrap().score(), 1); + assert_eq!(graph.edit_distance(&v3).unwrap().score(), 2); + assert_eq!(graph.edit_distance(&v4).unwrap().score(), 5); + } + + #[test] + fn test_two_node_single_path() { + // this is the base sequence + let v1: Vec = vec![0, 1, 2, 4, 5]; + for split_point in 0..v1.len() { + // split the sequence at various points and verify everything is still correct + let mut graph: WFAGraph = WFAGraph::new(); + graph.add_node(v1[0..split_point].to_vec(), vec![]).unwrap(); + graph.add_node(v1[split_point..].to_vec(), vec![0]).unwrap(); + + // test sequences + let v2: Vec = vec![0, 1, 3, 4, 5]; + let v3: Vec = vec![1, 2, 3, 5]; + let v4: Vec = vec![]; + + // check the nodes in the first one, but just score for the rest + 
// every split of a single path must produce identical scores and the full {0, 1} traversal
            assert_eq!(graph.edit_distance(&v1).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1] });
            assert_eq!(graph.edit_distance(&v2).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1] });
            assert_eq!(graph.edit_distance(&v3).unwrap(), WFAResult { score: 2, traversed_nodes: vec![0, 1] });
            assert_eq!(graph.edit_distance(&v4).unwrap(), WFAResult { score: 5, traversed_nodes: vec![0, 1] });
        }
    }

    #[test]
    fn test_basic_variant() {
        // create a graph where index 2 is an SNV change to either 2 or 3
        let mut graph: WFAGraph = WFAGraph::new();
        let v1: Vec<u8> = vec![0, 1, 2, 4, 5];
        graph.add_node(v1[0..2].to_vec(), vec![]).unwrap();
        graph.add_node(vec![2], vec![0]).unwrap();
        graph.add_node(vec![3], vec![0]).unwrap();
        graph.add_node(v1[3..].to_vec(), vec![1, 2]).unwrap();

        // test sequences
        // v2 is in the graph
        let v2: Vec<u8> = vec![0, 1, 3, 4, 5];
        // v3 is 2 away from v1, 3 away from v2
        let v3: Vec<u8> = vec![1, 2, 3, 5];
        // 5 away from both
        let v4: Vec<u8> = vec![];
        // 1 away from both
        let v5: Vec<u8> = vec![0, 1, 4, 5];

        // check the nodes in the first one, but just score for the rest
        assert_eq!(graph.edit_distance(&v1).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 3] });
        assert_eq!(graph.edit_distance(&v2).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3] });
        assert_eq!(graph.edit_distance(&v3).unwrap(), WFAResult { score: 2, traversed_nodes: vec![0, 1, 3] });
        assert_eq!(graph.edit_distance(&v4).unwrap(), WFAResult { score: 5, traversed_nodes: vec![0, 1, 2, 3] });
        assert_eq!(graph.edit_distance(&v5).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 2, 3] });
    }

    #[test]
    fn test_triple_split() {
        let v1 = vec![0, 1, 2, 3, 4, 5];
        let v2 = vec![0, 1, 2, 4, 4, 5];
        let v3 = vec![0, 1, 4, 4, 5];

        // this construct splits the middle into 3 separate alleles (2, 3), (2, 4), and (-, 4)
        let mut graph: WFAGraph = WFAGraph::new();
        let root = graph.add_node(v1[0..2].to_vec(), vec![]).unwrap();
        let s1 = graph.add_node(v1[2..4].to_vec(), vec![root]).unwrap();
        let s2 = graph.add_node(v2[2..4].to_vec(), vec![root]).unwrap();
        let s3 = graph.add_node(v3[2..3].to_vec(), vec![root]).unwrap();
        let tail = graph.add_node(v1[4..].to_vec(), vec![s1, s2, s3]).unwrap();

        assert_eq!(graph.edit_distance(&v1).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s1, tail] });
        assert_eq!(graph.edit_distance(&v2).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s2, tail] });
        assert_eq!(graph.edit_distance(&v3).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s3, tail] });
    }

    #[test]
    fn test_nested_split() {
        let v1 = vec![0, 1, 2, 3, 4, 5];
        let v2 = vec![0, 1, 2, 4, 4, 5];
        let v3 = vec![0, 1, 4, 4, 5];

        // this construct pairs (2, 4) with (-, 4)
        let mut graph: WFAGraph = WFAGraph::new();
        let root = graph.add_node(v1[0..2].to_vec(), vec![]).unwrap();
        // (2, 3) still alone
        let s1 = graph.add_node(v1[2..4].to_vec(), vec![root]).unwrap();
        // s2 contains just (2, )
        let s2 = graph.add_node(v2[2..3].to_vec(), vec![root]).unwrap();
        // s3 contains just (4, ), but it allows you to come directly from root to enable the deletion in v3
        let s3 = graph.add_node(v2[3..4].to_vec(), vec![root, s2]).unwrap();
        // tail is the same
        let tail = graph.add_node(v1[4..].to_vec(), vec![s1, s3]).unwrap();

        assert_eq!(graph.edit_distance(&v1).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s1, tail] });
        assert_eq!(graph.edit_distance(&v2).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s2, s3, tail] });
        assert_eq!(graph.edit_distance(&v3).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s3, tail] });
    }

    #[test]
    fn test_double_split() {
        let v1 = vec![0, 1, 2, 3, 4, 5];
        let v2 = vec![0, 1, 2, 4, 4, 5];
        let v3 = vec![0, 1, 4, 4, 5];

        // this construct separate the deletion event from the SNV event with a "gap" in the middle
        let mut graph: WFAGraph = WFAGraph::new();
        let root = graph.add_node(v1[0..2].to_vec(), vec![]).unwrap();
        // s1 just contains (2, )
        let s1 = graph.add_node(v1[2..3].to_vec(), vec![root]).unwrap();
        // s2 is an empty join
        let s2 = graph.add_node(vec![], vec![root, s1]).unwrap();
        // s3 contains just (3, )
        let s3 = graph.add_node(v1[3..4].to_vec(), vec![s2]).unwrap();
        // s4 contains just (4, )
        let s4 = graph.add_node(v2[3..4].to_vec(), vec![s2]).unwrap();
        // tail picks up the last two bases
        let tail = graph.add_node(v1[4..].to_vec(), vec![s3, s4]).unwrap();

        assert_eq!(graph.edit_distance(&v1).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s1, s2, s3, tail] });
        assert_eq!(graph.edit_distance(&v2).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s1, s2, s4, tail] });
        assert_eq!(graph.edit_distance(&v3).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s2, s4, tail] });
    }

    #[test]
    fn test_overlapping_split() {
        let v1 = vec![0, 1, 2, 3, 4, 5];
        let v2 = vec![0, 3, 4, 5];
        let v3 = vec![0, 1, 4, 5];

        /*
        Graph structure of overlapping splits, both of which delete the "2":
             -> -> -v
        0 -> 1 -> 2 -> 3 -> 4,5
             -> -> -^
        */
        let mut graph: WFAGraph = WFAGraph::new();
        let root = graph.add_node(v1[0..1].to_vec(), vec![]).unwrap();
        // represents the (1, )
        let s1 = graph.add_node(v1[1..2].to_vec(), vec![root]).unwrap();
        // represents the (2, )
        let s2 = graph.add_node(v1[2..3].to_vec(), vec![s1]).unwrap();
        // represents the (3, )
        let s3 = graph.add_node(v1[3..4].to_vec(), vec![root, s2]).unwrap();
        // represent the tail
        let tail = graph.add_node(v1[4..].to_vec(), vec![s1, s3]).unwrap();

        assert_eq!(graph.edit_distance(&v1).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s1, s2, s3, tail] });
        assert_eq!(graph.edit_distance(&v2).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s3, tail] });
        assert_eq!(graph.edit_distance(&v3).unwrap(), WFAResult { score: 0, traversed_nodes: vec![root, s1, tail] });
    }

    #[test]
    fn test_simple_snv() {
        let reference = "AAA".as_bytes();
        let variants = vec![Variant::new_snv(0, 1, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1)];

        let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) =
            WFAGraph::from_reference_variants(&reference, &variants, 0, reference.len()).unwrap();

        // check the alignments first
        assert_eq!(graph.get_num_nodes(), 4);
        // remember ALT alleles get added before reference alleles
        assert_eq!(graph.edit_distance(&reference).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3] });
        assert_eq!(graph.edit_distance("ACA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 3] });
        assert_eq!(graph.edit_distance("AA".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 2, 3] });

        // now check our lookup tables
        assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]);
        assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![(0, 1)]);
        assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![(0, 0)]);
        assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![]);
    }

    #[test]
    fn test_multiple_variants() {
        // two A>C SNVs with reference in between
        let reference = "AAAAA".as_bytes();
        let variants = vec![
            // vcf_index, position, allele0, allele1, index_allele0, index_allele1
            Variant::new_snv(0, 1, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1),
            Variant::new_snv(0, 3, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1)
        ];

        let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) =
            WFAGraph::from_reference_variants(&reference, &variants, 0, reference.len()).unwrap();

        // check the alignments first
        assert_eq!(graph.get_num_nodes(), 7);
        // remember ALT alleles get added before reference alleles
        /*
        REF: 0 -> 2 -> 3 -> 5 -> 6
        ALT:   -> 1 -^   -> 4 -^
        */
        assert_eq!(graph.edit_distance(&reference).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3, 5, 6] });
        assert_eq!(graph.edit_distance("ACAAA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 3, 5, 6] });
        assert_eq!(graph.edit_distance("AAACA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3, 4, 6] });
        assert_eq!(graph.edit_distance("ACACA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 3, 4, 6] });
        assert_eq!(graph.edit_distance("AAA".as_bytes()).unwrap(), WFAResult { score: 2, traversed_nodes: vec![0, 1, 2, 3, 4, 5, 6] });
        assert_eq!(graph.edit_distance("AGAGA".as_bytes()).unwrap(), WFAResult { score: 2, traversed_nodes: vec![0, 1, 2, 3, 4, 5, 6] });

        // check some mismatches on reference real quick also
        assert_eq!(graph.edit_distance("GAAAA".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 2, 3, 5, 6] });
        assert_eq!(graph.edit_distance("ACAGAA".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 3, 5, 6] });

        // now check our lookup tables
        assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]);
        assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![(0, 1)]);
        assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![(0, 0)]);
        assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![]);
        assert_eq!(*node_to_alleles.get(&4).unwrap_or(&vec![]), vec![(1, 1)]);
        assert_eq!(*node_to_alleles.get(&5).unwrap_or(&vec![]), vec![(1, 0)]);
        assert_eq!(*node_to_alleles.get(&6).unwrap_or(&vec![]), vec![]);
    }

    #[test]
    fn test_overlapping_variants() {
        let reference = "ACGTA".as_bytes();
        let variants = vec![
            // vcf_index, position, ref_len, allele0, allele1, index_allele0, index_allele1
            Variant::new_deletion(0, 1, 2, "CG".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1),
            Variant::new_deletion(0, 2, 2, "GT".as_bytes().to_vec(), "G".as_bytes().to_vec(), 0, 1)
        ];

        let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) =
            WFAGraph::from_reference_variants(&reference, &variants, 0, reference.len()).unwrap();

        // check the alignments first
        assert_eq!(graph.get_num_nodes(), 7);
        // remember ALT alleles get added before reference alleles
        /*
        REF: 0 -> 2 -> 4 -> 5 -> 6
        ALT:   -> 1 -^
                  -> 3 -^
        */
        assert_eq!(graph.edit_distance(&reference).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 4, 5, 6] });
        assert_eq!(graph.edit_distance("ACTA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 5, 6] });
        assert_eq!(graph.edit_distance("ACGA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3, 6] });
        // ACGTA -> AGTA, 1 ed; ACTA -> AGTA, 1 ed; so first allele is ambiguous, other should be reference
        assert_eq!(graph.edit_distance("AGTA".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 2, 4, 5, 6] });
        // AA -> AGTA, 2 ed; AA -> ACTA, 2 ed; AA -> ACGTA, 3 ed; but this should still lead to fully ambiguous solution without node 4 (it's skipped in both 2-ed paths)
        assert_eq!(graph.edit_distance("AA".as_bytes()).unwrap(), WFAResult { score: 2, traversed_nodes: vec![0, 1, 2, 3, 5, 6] });

        // now check our lookup tables
        assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]);
        assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![(0, 1)]);
        assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![(0, 0)]);
        assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![(1, 1)]);
        assert_eq!(*node_to_alleles.get(&4).unwrap_or(&vec![]), vec![(1, 0)]);
        assert_eq!(*node_to_alleles.get(&5).unwrap_or(&vec![]), vec![]);
        assert_eq!(*node_to_alleles.get(&6).unwrap_or(&vec![]), vec![]);
    }

    #[test]
    fn test_identical_insertions() {
        // this can happen if we pair something like DV with pbmm2 and there is a "large" insertion
        let reference = "ACGTA".as_bytes();
        let variants = vec![
            // vcf_index, position, allele0, allele1, index_allele0, index_allele1
            Variant::new_insertion(0, 2, "G".as_bytes().to_vec(), "GT".as_bytes().to_vec(), 0, 1),
            Variant::new_insertion(1, 2, "G".as_bytes().to_vec(), "GT".as_bytes().to_vec(), 0, 1)
        ];

        let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) =
            WFAGraph::from_reference_variants(&reference, &variants, 0, reference.len()).unwrap();

        // check the alignments first
        assert_eq!(graph.get_num_nodes(), 5);
        /*
        REF: 0 -> 3 -> 4
        ALT:   -> 1 -^
               -> 2 -^
        */
        assert_eq!(graph.edit_distance(&reference).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 3, 4] });
        assert_eq!(graph.edit_distance("ACGTTA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 2, 4] });
        // tests an insertion of the wrong character, this should be fully ambiguous
        assert_eq!(graph.edit_distance("ACGATA".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 2, 3, 4] });

        // now check our lookup tables
        assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]);
        assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![(0, 1)]);
        assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![(1, 1)]);
        assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![(0, 0), (1, 0)]); // this one has BOTH reference alleles
        assert_eq!(*node_to_alleles.get(&4).unwrap_or(&vec![]), vec![]);
    }

    #[test]
    fn test_multiallelic_indel() {
        // tests GT -> G/GTT
        let reference = "ACGTA".as_bytes();
        let variants = vec![
            // vcf_index, position, ref_len, allele0, allele1, index_allele0, index_allele1
            Variant::new_indel(0, 2, 2, "G".as_bytes().to_vec(), "GTT".as_bytes().to_vec(), 1, 2)
        ];

        let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) =
            WFAGraph::from_reference_variants(&reference, &variants, 0, reference.len()).unwrap();

        // check the alignments first
        assert_eq!(graph.get_num_nodes(), 5);
        /*
        REF: 0 -> 3 -> 4
        ALT:   -> 1 -^
               -> 2 -^
        */
        assert_eq!(graph.edit_distance(&reference).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 3, 4] });
        assert_eq!(graph.edit_distance("ACGA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 4] });
        assert_eq!(graph.edit_distance("ACGTTA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 4] });
        // we can't really get an ambiguous allele assignment here, but we can do ambiguous branches
        assert_eq!(graph.edit_distance("ACGGA".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 3, 4] });
        assert_eq!(graph.edit_distance("ACGGTA".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 2, 3, 4] });

        // now check our lookup tables
        assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]);
        assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![(0, 0)]); // ALT0
        assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![(0, 1)]); // ALT1
        assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![]); // we still need to inject a reference allele, but it shouldn't have an assoc. with the variant
        assert_eq!(*node_to_alleles.get(&4).unwrap_or(&vec![]), vec![]);
    }

    #[test]
    fn test_partial_reference() {
        // we prepended and appended "AA" to the basic SNV test
        let reference = "AAAAAAA".as_bytes();
        // variant coordinate shifted +2
        let variants = vec![Variant::new_snv(0, 3, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1)];

        let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) =
            WFAGraph::from_reference_variants(&reference, &variants, 2, reference.len()-2).unwrap();

        // check the alignments first
        assert_eq!(graph.get_num_nodes(), 4);
        // remember ALT alleles get added before reference alleles
        assert_eq!(graph.edit_distance(&reference[2..5]).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3] });
        assert_eq!(graph.edit_distance("ACA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 3] });
        assert_eq!(graph.edit_distance("AA".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 2, 3] });

        // now check our lookup tables
        assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]);
        assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![(0, 1)]);
        assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![(0, 0)]);
        assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![]);
    }

    #[test]
    fn test_complex_problem() {
        let reference = "AACGTTGACGTCC".as_bytes(); // we skip 2 on the front, 1 on the back
        let variants = vec![
            // vcf_index, position, ref_len, allele0, allele1, index_allele0, index_allele1
            // 3: GTTG>G
            Variant::new_deletion(0, 3, 4, "GTTG".as_bytes().to_vec(), "G".as_bytes().to_vec(), 0, 1),
            // 4: TT>T
            Variant::new_deletion(0, 4, 2, "TT".as_bytes().to_vec(), "T".as_bytes().to_vec(), 0, 1),
            // vcf_index, position, allele0, allele1, index_allele0, index_allele1
            // 6: G>[A,C]
            Variant::new_snv(0, 6, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 1, 2)
        ];

        let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) =
            WFAGraph::from_reference_variants(&reference, &variants, 2, reference.len()-1).unwrap();

        // check the alignments first
        assert_eq!(graph.get_num_nodes(), 9);
        /*
        // this tests the scenario where one variant rejoins at the start of another
        // here, the TT>T joins right before the G>[A,C] so node 3 and 4 are *both* parents of 5, 6, and 7
        REF: 0 -> 2 -> 4 -> 7 -> 8
        ALT:   -> 1 -^
                  -> 3 -|
                       -> 5 -^
                       -> 6 -^
        */
        // remember ALT alleles get added before reference alleles
        assert_eq!(graph.edit_distance("CGTTGACGTC".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 4, 7, 8] }); //reference
        assert_eq!(graph.edit_distance("CGACGTC".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 8] }); //first variant only
        assert_eq!(graph.edit_distance("CGTGACGTC".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3, 7, 8] }); //second variant only
        assert_eq!(graph.edit_distance("CGTTAACGTC".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 4, 5, 8] }); //third-0 variant only
        assert_eq!(graph.edit_distance("CGTTCACGTC".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 4, 6, 8] }); //third-1 variant only
        assert_eq!(graph.edit_distance("CGTAACGTC".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3, 5, 8] }); //second and third-0 variants
        assert_eq!(graph.edit_distance("CGTCACGTC".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3, 6, 8] }); //second and third-1 variants

        // now try some inexacts
        assert_eq!(graph.edit_distance("CGGACGTC".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 2, 3, 7, 8] }); //delete both Ts, ambiguous deletion call
        assert_eq!(graph.edit_distance("CGTACGTC".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 2, 3, 5, 6, 7, 8] }); //delete TG, ambiguous in a lot of ways

        // now check our lookup tables
        assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]);
        assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![(0, 1)]);
        assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![(0, 0)]);
        assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![(1, 1)]);
        assert_eq!(*node_to_alleles.get(&4).unwrap_or(&vec![]), vec![(1, 0)]);
        assert_eq!(*node_to_alleles.get(&5).unwrap_or(&vec![]), vec![(2, 0)]);
        assert_eq!(*node_to_alleles.get(&6).unwrap_or(&vec![]), vec![(2, 1)]);
        assert_eq!(*node_to_alleles.get(&7).unwrap_or(&vec![]), vec![]);
        assert_eq!(*node_to_alleles.get(&8).unwrap_or(&vec![]), vec![]);
    }

    ////////////////////////////////////////////////////////////////////////////////
    // After here are mostly edge case bug tests
    ////////////////////////////////////////////////////////////////////////////////


    // tests when a variant goes past the provided reference, we should ignore it basically
#[test] + fn test_span_ref_end() { + // tests GT -> G/GTT + let reference = "ACGTA".as_bytes(); + let variants = vec![ + // vcf_index, position, ref_len, allele0, allele1, index_allele0, index_allele1 + Variant::new_deletion(0, 3, 3, "TAG".as_bytes().to_vec(), "T".as_bytes().to_vec(), 0, 1) + ]; + + let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = + WFAGraph::from_reference_variants(&reference, &variants, 0, reference.len()).unwrap(); + + // check the alignments first + assert_eq!(graph.get_num_nodes(), 1); + + // now check our lookup tables + assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]); + } + + #[test] + fn test_hom_variants() { + // add a hom variant at base 1; these can be traversed, but provide no index lookup because they are not a "variant" + let reference = "AAAAA".as_bytes(); + let variants = vec![Variant::new_snv(0, 3, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1)]; + let hom_variants = vec![Variant::new_snv(0, 1, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1)]; + + let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = + WFAGraph::from_reference_variants_with_hom(&reference, &variants, &hom_variants, 0, reference.len()).unwrap(); + + // check the alignments first + assert_eq!(graph.get_num_nodes(), 7); + // remember ALT alleles get added before reference alleles + assert_eq!(graph.edit_distance("AAAAA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3, 5, 6] }); + assert_eq!(graph.edit_distance("ACAAA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 3, 5, 6] }); + assert_eq!(graph.edit_distance("ACACA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 3, 4, 6] }); + assert_eq!(graph.edit_distance("ACAA".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 3, 4, 5, 6] }); + + // now check our lookup tables + assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]); + 
assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![]); + assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![]); + assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![]); + assert_eq!(*node_to_alleles.get(&4).unwrap_or(&vec![]), vec![(0, 1)]); + assert_eq!(*node_to_alleles.get(&5).unwrap_or(&vec![]), vec![(0, 0)]); + assert_eq!(*node_to_alleles.get(&6).unwrap_or(&vec![]), vec![]); + } + + #[test] + fn test_variant_at_start() { + let reference = "AAA".as_bytes(); + let variants = vec![Variant::new_snv(0, 0, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1)]; + + let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = + WFAGraph::from_reference_variants(&reference, &variants, 0, reference.len()).unwrap(); + + // check the alignments first + assert_eq!(graph.get_num_nodes(), 4); + // remember ALT alleles get added before reference alleles + assert_eq!(graph.edit_distance(&reference).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3] }); + assert_eq!(graph.edit_distance("CAA".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 3] }); + assert_eq!(graph.edit_distance("AA".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 2, 3] }); + + // now check our lookup tables + assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]); + assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![(0, 1)]); + assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![(0, 0)]); + assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![]); + } + + #[test] + fn test_variant_at_end() { + let reference = "AAA".as_bytes(); + let variants = vec![Variant::new_snv(0, 2, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1)]; + + let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = + WFAGraph::from_reference_variants(&reference, &variants, 0, reference.len()).unwrap(); + + // check the alignments first + assert_eq!(graph.get_num_nodes(), 4); + // remember ALT alleles get added 
before reference alleles + assert_eq!(graph.edit_distance(&reference).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 2, 3] }); + assert_eq!(graph.edit_distance("AAC".as_bytes()).unwrap(), WFAResult { score: 0, traversed_nodes: vec![0, 1, 3] }); + assert_eq!(graph.edit_distance("AA".as_bytes()).unwrap(), WFAResult { score: 1, traversed_nodes: vec![0, 1, 2, 3] }); + + // now check our lookup tables + assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]); + assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![(0, 1)]); + assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![(0, 0)]); + assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![]); + } +} \ No newline at end of file diff --git a/src/writers/block_stats.rs b/src/writers/block_stats.rs new file mode 100644 index 0000000..5293076 --- /dev/null +++ b/src/writers/block_stats.rs @@ -0,0 +1,370 @@ + +use log::debug; +use rustc_hash::FxHashMap as HashMap; +use serde::Serialize; +use std::fs::File; +use std::path::Path; + +use crate::block_gen::PhaseBlock; +use crate::data_types::reference_genome::ReferenceGenome; +use crate::data_types::variants::{VariantType, Zygosity}; +use crate::phaser::PhaseResult; + +/// This is a wrapper for writing out any stats to a file +#[derive(Default)] +pub struct BlockStatsCollector { + /// Blocks that will be written out eventually + blocks: Vec, + /// Tracks the number of phased SNVs, key is (sample_name, chromosome) + phased_snvs: HashMap<(String, String), usize> +} + +/// Contains all the data written to each row of our stats file +#[derive(Serialize)] +struct BlockRow { + /// The index of the block + source_block_index: usize, + /// the sample name tied to the block + sample_name: String, + /// Phase set ID - usually the position first variant in the block + phase_block_id: u64, + /// the chromosome of the block + chrom: String, + /// the position of the first allele + start: u64, + /// the position of the last allele + end: u64, + /// 
the number of variants in the block + num_variants: usize +} + +#[derive(Serialize)] +struct SummaryRow { + /// the sample name + sample_name: String, + /// The chromosome or "all" + chromosome: String, + /// The total number of variants + num_variants: usize, + /// The total number of heterozygous variants + num_heterozygous: usize, + /// The number of phased heterozygous variants + num_phased: usize, + /// The number of unphased heterozygous variants + num_unphased: usize, + /// The number of heterozygous SNVs + num_het_snv: usize, + /// The number of phased, heterozygous SNVs + num_phased_snv: usize, + /// The total number of blocks + num_blocks: usize, + /// The number of blocks with only 1 phased variant + num_singletons: usize, + /// variants per block stats + variants_per_block_median: usize, + variants_per_block_mean: usize, + variants_per_block_min: usize, + variants_per_block_max: usize, + variants_per_block_sum: usize, + /// basepairs per block stats + basepairs_per_block_median: u64, + basepairs_per_block_mean: u64, + basepairs_per_block_min: u64, + basepairs_per_block_max: u64, + basepairs_per_block_sum: u64, + /// block NG50 + block_ng50: Option +} + +impl BlockStatsCollector { + /// Creates a new writer for a given filename + /// # Arguments + /// * `csv_filename` - the path to write all stats to + pub fn new() -> BlockStatsCollector { + Self::default() + } + + /// Adds a block to our collection + /// # Arguments + /// * `block` - the block to add, no checks are performed on the input + pub fn add_block(&mut self, block: PhaseBlock) { + self.blocks.push(block); + } + + /// Adds a phase result to our statistics + /// # Arguments + /// * `chrom` - the chromosome for the result + /// * `result` + pub fn add_result(&mut self, result: &PhaseResult) { + if let Some(stats) = result.statistics.as_ref() { + if let Some(count) = stats.phased_snvs() { + let sample_name: String = result.phase_block.sample_name().to_string(); + let chrom: String = 
result.phase_block.get_chrom().to_string(); + *self.phased_snvs.entry((sample_name, chrom)).or_insert(0) += count as usize; + } + } + } + + /// Will write all blocks to a CSV filename in order + /// # Arguments + /// * `filename` - the filename for the output (tsv/csv) + pub fn write_blocks(&mut self, filename: &Path) -> csv::Result<()> { + // modify the delimiter to "," if it ends with .csv + let is_csv: bool = filename.extension().unwrap_or_default() == "csv"; + let delimiter: u8 = if is_csv { b',' } else { b'\t' }; + let mut csv_writer: csv::Writer = csv::WriterBuilder::new() + .delimiter(delimiter) + .from_path(filename)?; + + // make sure we go through the blocks in order + self.blocks.sort(); + for block in self.blocks.iter() { + let block_row = BlockRow { + source_block_index: block.get_block_index(), + sample_name: block.sample_name().to_string(), + phase_block_id: block.get_start()+1, + chrom: block.get_chrom().to_string(), + start: block.get_start()+1, + end: block.get_end()+1, + num_variants: block.get_num_variants() + }; + csv_writer.serialize(&block_row)?; + } + csv_writer.flush()?; + Ok(()) + } + + /// Will write out a file containing chromosome level block statistics and overall statistics. 
+ /// # Arguments + /// * `filename` - the filename for the output (tsv/csv) + /// * `reference_genome` - the reference genome, without this we can't determine lengths + /// * `variant_counts` - the variant count data from parsing; key is (sample, chromosome, variant type, zygosity), value is a count + pub fn write_block_stats( + &self, + sample_order: &[String], + filename: &Path, + reference_genome: &ReferenceGenome, + variant_counts: HashMap<(String, String, VariantType, Zygosity), usize> + ) -> csv::Result<()> { + // modify the delimiter to "," if it ends with .csv + let is_csv: bool = filename.extension().unwrap_or_default() == "csv"; + let delimiter: u8 = if is_csv { b',' } else { b'\t' }; + let mut csv_writer: csv::Writer = csv::WriterBuilder::new() + .delimiter(delimiter) + .from_path(filename)?; + + // calculate the total reference length assuming we have a reference + let total_contig_length: u64 = { + let mut contig_sum: u64 = 0; + for contig_name in reference_genome.contig_keys().iter() { + let contig_length = reference_genome.get_full_chromosome(contig_name).len(); + contig_sum += contig_length as u64; + } + contig_sum + }; + + for sample_name in sample_order.iter() { + // go through all the blocks and generate chromosome level and overall stats + let mut blocks_by_chrom: HashMap> = Default::default(); + let mut all_sample_blocks: Vec = Default::default(); + + for block in self.blocks.iter() { + if block.sample_name() == sample_name { + let chrom: String = block.get_chrom().to_string(); + blocks_by_chrom.entry(chrom).or_insert(vec![]).push(block.clone()); + all_sample_blocks.push(block.clone()); + } + } + + // these are from parsing the VCFs + let mut num_variants: HashMap = Default::default(); + let mut num_heterozygous: HashMap = Default::default(); + let mut num_het_snv: HashMap = Default::default(); + let mut ordered_iteration: Vec<_> = variant_counts.iter().collect(); + ordered_iteration.sort(); + for ((sample, chrom, variant_type, zygosity), 
&count) in ordered_iteration.iter() { + debug!("{} {} {:?} {:?} => {}", sample, chrom, variant_type, zygosity, count); + if sample == sample_name && *variant_type != VariantType::Unknown && *zygosity != Zygosity::HomozygousReference && *zygosity != Zygosity::Unknown { + // okay, we're working with something we can count now + *num_variants.entry(chrom.clone()).or_insert(0) += count; + + if *zygosity == Zygosity::Heterozygous { + *num_heterozygous.entry(chrom.clone()).or_insert(0) += count; + if *variant_type == VariantType::Snv { + *num_het_snv.entry(chrom.clone()).or_insert(0) += count; + } + } + } + } + + // best place to pull contig order from is our reference genome + for contig in reference_genome.contig_keys().iter() { + let contig_length: u64 = reference_genome.get_full_chromosome(contig).len() as u64; + let chrow_stats_row: SummaryRow = self.generate_summary_row( + sample_name, + contig, + blocks_by_chrom.get(contig).unwrap_or(&vec![]), + *num_variants.get(contig).unwrap_or(&0), + *num_heterozygous.get(contig).unwrap_or(&0), + *num_het_snv.get(contig).unwrap_or(&0), + *self.phased_snvs.get(&(sample_name.clone(), contig.clone())).unwrap_or(&0), + contig_length + ); + csv_writer.serialize(&chrow_stats_row)?; + } + + let all_stats_row: SummaryRow = self.generate_summary_row( + sample_name, + "all", + &all_sample_blocks, + num_variants.values().sum(), + num_heterozygous.values().sum(), + num_het_snv.values().sum(), + // phased_snvs is keyed by (sample_name, chromosome); restrict the "all" total to the current sample, + // otherwise multi-sample runs would report every sample's phased SNVs on each sample's summary row + self.phased_snvs.iter().filter(|((sample, _chrom), _count)| sample == sample_name).map(|(_key, &count)| count).sum(), + total_contig_length + ); + csv_writer.serialize(&all_stats_row)?; + } + + csv_writer.flush()?; + Ok(()) + } + + /// Utility function for building a summary row from given data. 
+ /// # Arguments + /// * `sample_name` - pass-through sample ID + /// * `chrom` - pass-through chromosome name + /// * `blocks` - the phase blocks that get parsed into a summary row for the sample/chromosome + /// * `num_variants` - total number of variants + /// * `num_heterozygous` - total number of heterozygous variants + /// * `num_het_snv` - total number of heterozygous SNVs + /// * `num_phased_snv` - total number of output phased SNVs + /// * `contig_length` - length of the chromosome + #[allow(clippy::too_many_arguments)] + fn generate_summary_row( + &self, sample_name: &str, chrom: &str, blocks: &[PhaseBlock], + num_variants: usize, num_heterozygous: usize, num_het_snv: usize, num_phased_snv: usize, + contig_length: u64 + ) -> SummaryRow { + // make sure every block has the correct sample name + assert!(blocks.iter().all(|b| b.sample_name() == sample_name)); + + // these are derivable from our results + let num_blocks = blocks.len(); + let num_singletons = blocks.iter() + .filter(|block| block.get_num_variants() == 1) + .count(); + + // collect the block variant stats, this is only counting heterozygous variants + let mut block_variants: Vec = blocks.iter() + .map(|b| b.get_num_variants()) + .collect(); + + // collect the block length stats + let mut block_lengths: Vec = blocks.iter() + .map(|b| b.bp_len()) + .collect(); + + // let total_heterozygous: usize = num_heterozygous.iter().map(|(_chrom, &count)| count).sum(); + let num_phased: usize = block_variants.iter().sum(); + let num_unphased = num_heterozygous - num_phased; + + block_variants.sort(); + let variants_per_block_median: usize = if block_variants.is_empty() { 0 } else { block_variants[block_variants.len() / 2] }; + let variants_per_block_mean: usize = if block_variants.is_empty() { 0 } else { block_variants.iter().sum::() / block_variants.len() }; + let variants_per_block_min: usize = *block_variants.iter().min().unwrap_or(&0); + let variants_per_block_max: usize = 
*block_variants.iter().max().unwrap_or(&0); + let variants_per_block_sum: usize = block_variants.iter().sum(); + + block_lengths.sort(); + let basepairs_per_block_median: u64 = if block_lengths.is_empty() { 0 } else { block_lengths[block_lengths.len() / 2] }; + let basepairs_per_block_mean: u64 = if block_lengths.is_empty() { 0 } else { block_lengths.iter().sum::() / block_lengths.len() as u64 }; + let basepairs_per_block_min: u64 = *block_lengths.iter().min().unwrap_or(&0); + let basepairs_per_block_max: u64 = *block_lengths.iter().max().unwrap_or(&0); + let basepairs_per_block_sum: u64 = block_lengths.iter().sum(); + + let block_ng50: Option = if contig_length != 0 { + Some(calculate_block_ng50(&block_lengths, contig_length)) + } else { + None + }; + + SummaryRow { + sample_name: sample_name.to_string(), + chromosome: chrom.to_string(), + num_variants, + num_heterozygous, + num_phased, + num_unphased, + num_het_snv, + num_phased_snv, + num_blocks, + num_singletons, + variants_per_block_median, + variants_per_block_mean, + variants_per_block_min, + variants_per_block_max, + variants_per_block_sum, + basepairs_per_block_median, + basepairs_per_block_mean, + basepairs_per_block_min, + basepairs_per_block_max, + basepairs_per_block_sum, + block_ng50 + } + } +} + +/// Helper subroutine for calculating block NG50 from a list of sorted blocks. +/// # Arguments +/// * `sorted_blocks` - block sizes sorted in ascending order +/// * `contig_length` - the total contig length, half is needed to reach NG50 +/// # Panics +/// * if while iterating it detects unsorted blocks +fn calculate_block_ng50(sorted_blocks: &[u64], contig_length: u64) -> u64 { + let mut last_block_size: u64 = u64::MAX; + let mut length_sum: u64 = 0; + + // add one to handle odd values (e.g. 
rounding up) + let target_length: u64 = (contig_length + 1) / 2; + + for &block_size in sorted_blocks.iter().rev() { + // we're going in reverse, so block sizes *should* be monotonically decreasing + assert!(block_size <= last_block_size); + last_block_size = block_size; + + // add in the block and check again half the total length + length_sum += block_size; + if length_sum >= target_length { + // we made it, return the size of this block + return block_size; + } + } + + // we didn't make it + 0 +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_calculate_block_ng50() { + // odd block length + let contig_length: u64 = 21; + let blocks: Vec = vec![1, 2, 3, 4, 10]; + let bad_blocks: Vec = vec![2]; + let good_blocks: Vec = vec![9, 10]; + + assert_eq!(calculate_block_ng50(&blocks, contig_length), 4); + assert_eq!(calculate_block_ng50(&bad_blocks, contig_length), 0); + assert_eq!(calculate_block_ng50(&good_blocks, contig_length), 9); + + // same but now even block length + let contig_length: u64 = 20; + assert_eq!(calculate_block_ng50(&blocks, contig_length), 10); + assert_eq!(calculate_block_ng50(&bad_blocks, contig_length), 0); + assert_eq!(calculate_block_ng50(&good_blocks, contig_length), 10); + } +} \ No newline at end of file diff --git a/src/writers/haplotag_writer.rs b/src/writers/haplotag_writer.rs new file mode 100644 index 0000000..877dc5b --- /dev/null +++ b/src/writers/haplotag_writer.rs @@ -0,0 +1,72 @@ + +use serde::Serialize; +use std::fs::File; +use std::path::Path; + +use crate::phaser::HaplotagResult; + +/// This is a wrapper for writing out any stats to a file +pub struct HaplotagWriter { + /// Handle for the CSV writer + csv_writer: csv::Writer +} + +/// Contains all the data written to each row of our stats file +#[derive(Serialize)] +struct HaplotagRow { + /// The index of the block + source_block_index: usize, + /// the sample name tied to the block + sample_name: String, + /// the chromosome of the block + chrom: String, + 
/// Phase set ID - usually the position first variant in the block + phase_block_id: u64, + /// the read name that is assigned + read_name: String, + /// the haplotype the read is assigned to + haplotag: u8 +} + +impl HaplotagWriter { + /// Creates a new writer for a given filename + /// # Arguments + /// * `filename` - the path to write all stats to + pub fn new(filename: &Path) -> csv::Result { + // modify the delimiter to "," if it ends with .csv + let is_csv: bool = filename.extension().unwrap_or_default() == "csv"; + let delimiter: u8 = if is_csv { b',' } else { b'\t' }; + let csv_writer: csv::Writer = csv::WriterBuilder::new() + .delimiter(delimiter) + .from_path(filename)?; + Ok(HaplotagWriter { + csv_writer + }) + } + + /// Writes the haplotag results from a given block. + /// # Arguments + /// * `haplotag_result` - the HaplotagResult from phasing, in-order is not required + /// # Errors + /// * if the csv_writer has any errors + pub fn write_block(&mut self, haplotag_result: &HaplotagResult) -> Result<(), Box> { + let source_block_index: usize = haplotag_result.phase_block.get_block_index(); + let sample_name: String = haplotag_result.phase_block.sample_name().to_string(); + let chrom: String = haplotag_result.phase_block.get_chrom().to_string(); + for (read_name, &(phase_block_id_0, haplotag_0)) in haplotag_result.reads.iter() { + // stored values are 0-based; the output convention is 1-based (haplotags 1/2) + let phase_block_id: u64 = (phase_block_id_0 + 1).try_into()?; + let haplotag: u8 = (haplotag_0 + 1).try_into()?; + let row: HaplotagRow = HaplotagRow { + source_block_index, + sample_name: sample_name.clone(), + chrom: chrom.clone(), + phase_block_id, + read_name: read_name.clone(), + haplotag + }; + self.csv_writer.serialize(&row)?; + } + // flush once per block instead of once per row; per-row flushing defeats the writer's buffering + self.csv_writer.flush()?; + Ok(()) + } +} \ No newline at end of file diff --git a/src/writers/mod.rs b/src/writers/mod.rs new file mode 100644 index 0000000..9b179a0 --- /dev/null +++ b/src/writers/mod.rs @@ -0,0 +1,13 @@ + +/// Contains writer for phase block stats, both the phase blocks 
themselves and the summary +pub mod block_stats; +/// Contains the writer for haplotag results +pub mod haplotag_writer; +/// Contains writer for BAM files +pub mod ordered_bam_writer; +/// Contains writer for VCF files +pub mod ordered_vcf_writer; +/// Contains writer for phasing statistics for underlying algorithms +pub mod phase_stats; +/// Contains additional VCF utilities +pub mod vcf_util; \ No newline at end of file diff --git a/src/writers/ordered_bam_writer.rs b/src/writers/ordered_bam_writer.rs new file mode 100644 index 0000000..14c3832 --- /dev/null +++ b/src/writers/ordered_bam_writer.rs @@ -0,0 +1,355 @@ + +use crate::phaser::HaplotagResult; + +use log::{debug, trace, warn}; +use rust_htslib::bam; +use rust_htslib::bam::Read; +use rustc_hash::{FxHashMap as HashMap, FxHashSet as HashSet}; +use simple_error::bail; +use std::cell::RefCell; +use std::path::{PathBuf, Path}; + +/// Structure that maintains order of phase problems while writing solutions. +pub struct OrderedBamWriter { + /// each bam writer is responsible for a specific sample + sample_name: String, + /// the template BAMs we are reading from and copying/modifying + ref_bam_readers: Vec>, + /// the BAMs we are writing to + ref_bam_writers: Vec>, + /// the data that may be cached because we are waiting on earlier results + map_store: HashMap, + /// set of blocks that are indicated to skip + skip_set: HashSet, + /// the index of data we are waiting for + current_index: usize, + /// the most recently written chromosome + current_chrom: String, + /// the most recently written POS field + current_pos: u64, + /// tracks chromosomes that have been finalized + finished_chroms: HashSet +} + +impl OrderedBamWriter { + /// Creates a new `OrderedBamWriter` using template BAMs. 
+ /// # Arguments + /// * `sample_name` - the sample this BAM writer corresponds to + /// * `reference_filename` - the path to the reference genome + /// * `input_bams` - the template VCF file containing unphased variants + /// * `output_bams` - the VCF file that will get created containing our phase solutions + /// * `thread_pool` - a shared thread pool for BAM I/O + pub fn new( + sample_name: String, reference_filename: &Path, + input_bams: &[PathBuf], output_bams: &[PathBuf], + thread_pool: &rust_htslib::tpool::ThreadPool + ) -> Result { + // log this + debug!("Creating BAM writer for {}:", sample_name); + debug!("\tInputs: {:?}", input_bams); + debug!("\tOutputs: {:?}", output_bams); + + // get all the stuff we need for reading the VCF setup + let mut ref_bam_readers: Vec> = vec![]; + let mut ref_bam_writers: Vec> = vec![]; + for (i, path) in input_bams.iter().enumerate() { + let mut bam_reader: bam::IndexedReader = bam::IndexedReader::from_path(path)?; + bam_reader.set_reference(reference_filename)?; + bam_reader.set_thread_pool(thread_pool)?; + let bam_header: bam::HeaderView = bam_reader.header().clone(); + let ref_bam_reader: RefCell = RefCell::new(bam_reader); + + //now setup the outputs, we want to do the header stuff here also + let mut output_header: bam::header::Header = bam::header::Header::from_template(&bam_header); + let cli_string: String = std::env::args().collect::>().join(" "); + let cli_version: &str = &crate::cli::FULL_VERSION; + + let mut cli_record = bam::header::HeaderRecord::new("PG".as_bytes()); + cli_record.push_tag("PN".as_bytes(), &"hiphase"); + cli_record.push_tag("ID".as_bytes(), &format!("hiphase-v{cli_version}")); + cli_record.push_tag("VN".as_bytes(), &cli_version); + cli_record.push_tag("CL".as_bytes(), &cli_string); + output_header.push_record(&cli_record); + + // TODO: do we need to add something to the BAM headers? 
I'm not sure yet, we'll come back to this + // output_header.push_record(r#"##FORMAT="#.as_bytes()); + let bam_format: bam::Format = if output_bams[i].extension().unwrap_or_default() == "cram" { + bam::Format::Cram + } else { + bam::Format::Bam + }; + let mut bam_writer: bam::Writer = bam::Writer::from_path( + &output_bams[i], + &output_header, + bam_format + )?; + bam_writer.set_reference(reference_filename)?; + bam_writer.set_thread_pool(thread_pool)?; + let ref_bam_writer: RefCell = RefCell::new(bam_writer); + + // push everything into the vecs for storage + ref_bam_readers.push(ref_bam_reader); + ref_bam_writers.push(ref_bam_writer); + } + + Ok(OrderedBamWriter { + sample_name, + ref_bam_readers, + ref_bam_writers, + map_store: Default::default(), + skip_set: Default::default(), + current_index: 0, + current_chrom: "".to_string(), + current_pos: 0, + finished_chroms: Default::default() + }) + } + + /// Returns the phase block result that the writer is currently waiting to receive. + pub fn get_wait_block(&self) -> usize { + self.current_index + } + + /// Adds a phase block result to our queue for writing. + /// # Arguments + /// * `phase_result` - a phasing result that will be written in the correct order with other blocks + pub fn write_phase_block(&mut self, haplotag_result: HaplotagResult) -> Result<(), Box> { + let block_index: usize = haplotag_result.phase_block.get_block_index(); + if block_index < self.current_index { + bail!("Block index is smaller than next expected index"); + } + if haplotag_result.phase_block.sample_name() != self.sample_name { + bail!("Received haplotag result for sample other than the one specified"); + } + match self.map_store.insert(block_index, haplotag_result) { + None => {}, + Some(_) => { + bail!("Block index was already present in the map_store"); + } + }; + self.drain_map_store()?; + Ok(()) + } + + /// Adds a dummy block result to our queue for writing. 
+ /// This is only necessary because we have block indices that are shared across samples, but this only cares about one sample. + /// # Arguments + /// * `phase_result` - a phasing result that will be written in the correct order with other blocks + pub fn write_dummy_block(&mut self, block_index: usize) -> Result<(), Box> { + if block_index < self.current_index { + bail!("Block index is smaller than next expected index"); + } + + // add to the skip list and drain away + self.skip_set.insert(block_index); + self.drain_map_store()?; + Ok(()) + } + + /// This will drain phase block solutions in the correct order if they have been received. + /// It drains as far as it can given the current results and then stops to wait for more data. + fn drain_map_store(&mut self) -> Result<(), Box> { + while !self.map_store.is_empty() { + match self.map_store.remove(&self.current_index) { + Some(haplotag_result) => { + trace!("Draining {}", self.current_index); + + let chrom_result: &str = haplotag_result.phase_block.get_chrom(); + if chrom_result != self.current_chrom { + if self.current_chrom.is_empty() { + // this is the first block, so lets set chrom and move on + self.current_chrom = chrom_result.to_string(); + } else { + // the next block is on a different chromosome, so we need to finalize this chromosome + self.finalize_chromosome()?; + + // now setup for the next chromosome + self.current_chrom = chrom_result.to_string(); + self.current_pos = 0; + } + } else { + // the chromosome matches previous, I don't think we actually have to do anything + // as long as our assumptions are correct + } + + /* + General plan: loop over the writers and write each BAM separately + - get all of the index-specific readers/writers ready + - adjust haplotype_index to only correspond to the current vcf_index + - we may need to adjust our errors to allow for empty blocks; we can make a dummy VCF to test with + - ^ we should make a dummy VCF anyways just to verify that nothing in our output 
changes by adding an empty VCF + */ + let start_pos = self.current_pos; + let end_pos = haplotag_result.phase_block.get_end(); + + for (bam_index, ref_bam_writer) in self.ref_bam_writers.iter().enumerate() { + // prep the writer + let mut bam_writer = ref_bam_writer.borrow_mut(); + + // we _should_ just have to iterate from the last written through the end of the block + let mut bam_reader = self.ref_bam_readers[bam_index].borrow_mut(); + + // key is a read name, value is (block_id, haplotype) + let read_block_lookup = &haplotag_result.reads; + + // annoyingly, the bam reader (here) uses normal rust conventions for 0..len (end is exclusive), whereas the VCF reader is inclusive + // so to fix, we need to add the +1 as shown below + match bam_reader.fetch(bam::FetchDefinition::RegionString(chrom_result.as_bytes(), start_pos as i64, end_pos as i64 + 1)) { + Ok(()) => { + for record_result in bam_reader.records() { + // set up the record for the new BAM file + let mut record = record_result?; + let record_pos = record.pos(); + if record_pos < start_pos as i64 { + // this can happen when you have reads that overlap the location but don't start *after* the position + // we have already written though, so don't write it again + continue; + } + + // this may need to be <=, hard to tell yet + assert!(record_pos <= end_pos as i64); + + // now check if the read name has a lookup + let read_name = std::str::from_utf8(record.qname()).unwrap(); + match read_block_lookup.get(read_name) { + Some((phase_block_id, haplotag)) => { + // we have a match, modify phase info + // phase_block_id is 0-based, so add 1 to it + record.push_aux("PS".as_bytes(), bam::record::Aux::I32((phase_block_id + 1).try_into()?))?; + // haplotag is 0/1 and we want 1/2 in the BAM, so add 1 to it + record.push_aux("HP".as_bytes(), bam::record::Aux::U8((haplotag + 1).try_into()?))?; + bam_writer.write(&record)?; + }, + None => { + // no match, so just copy the read over + bam_writer.write(&record)?; + } + }; + 
} + }, + Err(e) => { + if end_pos == 0 { + warn!("Empty problem block received, no read mappings on chromosome {}", chrom_result); + } else { + warn!("Received \'{}\', while seeking to {}:{}-{} in bam #{}, likely no reads in region", e, chrom_result, start_pos, end_pos, bam_index); + //return Err(e); + } + } + }; + } + + // set up for the next block we get + self.current_pos = end_pos+1; + self.current_index += 1; + }, + None => { + // it's not in the received blocks, check if it is in the skip set + if self.skip_set.remove(&self.current_index) { + // we did find it here + self.current_index += 1; + } else { + // it's not there either, time to end the looping for now + break; + } + } + }; + } + Ok(()) + } + + /// Writes all remaining variants for a chromosome. + /// It should be called after all phase blocks for a chromosome are received. + /// # Panics + /// * if the current chromosome has been previously finalized + pub fn finalize_chromosome(&mut self) -> Result<(), rust_htslib::errors::Error> { + // make sure we haven't done this chromosome before + assert!(!self.finished_chroms.contains(&self.current_chrom)); + + // finalize the area + let start_pos = self.current_pos; + for (bam_index, ref_bam_writer) in self.ref_bam_writers.iter().enumerate() { + // prep the writer + let mut bam_writer = ref_bam_writer.borrow_mut(); + let mut bam_reader = self.ref_bam_readers[bam_index].borrow_mut(); + + //match vcf_reader.fetch(chrom_index, start_pos, None) { + match bam_reader.fetch(bam::FetchDefinition::RegionString(self.current_chrom.as_bytes(), start_pos as i64, i64::MAX)) { + Ok(()) => { + for record_result in bam_reader.records() { + // set up the record for the new BAM file + let record = record_result?; + let record_pos = record.pos(); + if record_pos < start_pos as i64 { + // this can happen when you have reads that overlap the location but don't start *after* the position + // we have already written though, so don't write it again + continue; + } + + // nothing 
left should be tagged + bam_writer.write(&record)?; + + // adding this last bit should prevent double writes by accident from a user + self.current_pos = self.current_pos.max(record_pos as u64 + 1); + } + }, + Err(e) => { + warn!("Received \'{}\', likely caused by no trailing variants detected for {}:{}-END", e, self.current_chrom, start_pos); + } + }; + } + + self.finished_chroms.insert(self.current_chrom.clone()); + Ok(()) + } + + pub fn copy_remaining_chromosomes(&mut self) -> Result<(), rust_htslib::errors::Error> { + // go through each BAM and just copy any chromosomes we didn't do + for (bam_index, ref_bam_writer) in self.ref_bam_writers.iter().enumerate() { + // prep the writer + let mut bam_writer = ref_bam_writer.borrow_mut(); + let mut bam_reader = self.ref_bam_readers[bam_index].borrow_mut(); + debug!("Finalizing bam #{}...", bam_index); + + // first, find anything that didn't copy + let mut remaining_contigs: Vec = vec![]; + let mut remaining_contigs_copy: Vec = vec![]; // we have to do this because there is no clone for FetchDefs + let target_names: Vec = bam_reader.header().target_names().iter() + .map(|s| String::from_utf8(s.to_vec()).unwrap()) + .collect(); + for tn in target_names.iter() { + if !self.finished_chroms.contains(tn) { + // this target has not been completed yet + remaining_contigs.push(bam::FetchDefinition::String(tn.as_bytes())); + remaining_contigs_copy.push(bam::FetchDefinition::String(tn.as_bytes())); + } + } + + // add unmapped at the end + remaining_contigs.push(bam::FetchDefinition::Unmapped); + remaining_contigs_copy.push(bam::FetchDefinition::Unmapped); + + debug!("Remaining contigs: {:?}", remaining_contigs); + + // now go through each, and fully copy everything + for (fetch_index, fetch_definition) in remaining_contigs.into_iter().enumerate() { + match bam_reader.fetch(fetch_definition) { + Ok(()) => { + for record_result in bam_reader.records() { + // set up the record for the new BAM file + let record = record_result?; + + 
// nothing left should be tagged + bam_writer.write(&record)?; + } + }, + Err(e) => { + warn!("Received \'{}\', likely caused by no reads detected for {:?}", e, remaining_contigs_copy[fetch_index]); + } + }; + } + } + + // if we make it here, everything should be good + Ok(()) + } +} diff --git a/src/writers/ordered_vcf_writer.rs b/src/writers/ordered_vcf_writer.rs new file mode 100644 index 0000000..976130b --- /dev/null +++ b/src/writers/ordered_vcf_writer.rs @@ -0,0 +1,423 @@ + +use crate::block_gen::is_phasable_variant; +use crate::phaser::PhaseResult; + +use log::{warn,debug,trace}; +use rust_htslib::bcf; +use rust_htslib::bcf::Read; +use rust_htslib::bcf::record::GenotypeAllele; +use rustc_hash::FxHashMap as HashMap; +use simple_error::bail; +use std::cell::RefCell; +use std::collections::VecDeque; +use std::io; +use std::path::PathBuf; + +/// Mostly for keeping the necessary information for a phase together when we store the results in memory prior to writing +#[derive(Debug)] +struct SingleVariantPhase { + /// haplotype 1 + h1: u8, + /// haplotype 2 + h2: u8, + /// phase block ID + block_id: usize +} + +/// Structure that maintains order of phase problems while writing solutions. +pub struct OrderedVcfWriter { + /// the template VCF we are reading from and copying if no phases are given + ref_vcf_readers: Vec>, + /// A copy of the input VCF header, cached here for performance + vcf_headers: Vec, + /// the VCF we are writing to + ref_vcf_writers: Vec>, + /// the data that may be cached because we are waiting on earlier results + map_store: HashMap, + /// the index of data we are waiting for + current_index: usize, + /// the most recently written chromosome + current_chrom: String, + /// the most recently written POS field + current_pos: u64, + /// the distance you can write for each sample + current_positions: HashMap, + /// The minimum quality for variants that were phased. If this is different from what is generating blocks, there will likely be a panic. 
+ min_quality: i32, + /// Per VCF file, there is a hashmap of sample_name -> index in VCF file + sample_indices: Vec>, + /// Per VCF file, there is a hashmap of sample_name -> queue of variants + phase_queues: Vec>> +} + +impl OrderedVcfWriter { + /// Creates a new `OrderedVcfWriter` using template VCFs. + /// # Arguments + /// * `input_vcfs` - the template VCF files containing unphased variants + /// * `output_vcfs` - the VCF files that will get created containing our phase solutions + /// * `min_quality` - the minimum quality that indicates a variant to phase + /// * `sample_name` - the sample getting phased + pub fn new(input_vcfs: &[PathBuf], output_vcfs: &[PathBuf], min_quality: i32, sample_names: &[String]) -> Result> { + //get all the stuff we need for reading the VCF setup + let mut ref_vcf_readers: Vec> = vec![]; + let mut vcf_headers: Vec = vec![]; + let mut ref_vcf_writers: Vec> = vec![]; + let mut sample_indices: Vec> = vec![]; + let mut phase_queues: Vec>> = vec![]; + for (i, path) in input_vcfs.iter().enumerate() { + let vcf_reader: bcf::IndexedReader = bcf::IndexedReader::from_path(path)?; + let vcf_header: bcf::header::HeaderView = vcf_reader.header().clone(); + let ref_vcf_reader: RefCell = RefCell::new(vcf_reader); + + // first make sure we find the sample in this file + let mut vcf_sample_hash: HashMap = Default::default(); + let mut vcf_phase_queue: HashMap> = Default::default(); + for sample_name in sample_names.iter() { + let mut lookup_index: Option = None; + for (sample_index, &vcf_sample) in vcf_header.samples().iter().enumerate() { + let vcf_sample_string: String = std::str::from_utf8(vcf_sample).unwrap().to_string(); + if &vcf_sample_string == sample_name { + lookup_index = Some(sample_index); + break; + } + } + let lookup_index: usize = match lookup_index { + Some(index) => { + index + }, + None => { + bail!("Sample name {:?} was not found in VCF: {:?}", sample_name, path); + } + }; + + // add the sample lookup and initialize to an empty 
queue + vcf_sample_hash.insert(sample_name.to_string(), lookup_index); + vcf_phase_queue.insert(sample_name.to_string(), Default::default()); + } + + //now setup the outputs, we want to do the header stuff here also + let mut output_header: bcf::header::Header = bcf::header::Header::from_template(&vcf_header); + let cli_string: String = std::env::args().collect::>().join(" "); + let cli_version: &str = &crate::cli::FULL_VERSION; + output_header.push_record(format!(r#"##HiPhase_version="{cli_version}""#).as_bytes()); + output_header.push_record(format!(r#"##HiPhase_command="{cli_string}""#).as_bytes()); + output_header.push_record(r#"##FORMAT="#.as_bytes()); + output_header.push_record(r#"##FORMAT="#.as_bytes()); + let vcf_writer: bcf::Writer = bcf::Writer::from_path( + &output_vcfs[i], + &output_header, + false, + bcf::Format::Vcf + )?; + let ref_vcf_writer: RefCell = RefCell::new(vcf_writer); + + // push everything into the vecs for storage + ref_vcf_readers.push(ref_vcf_reader); + vcf_headers.push(vcf_header); + ref_vcf_writers.push(ref_vcf_writer); + sample_indices.push(vcf_sample_hash); + phase_queues.push(vcf_phase_queue); + } + + // default all of them to start at 0 + let mut current_positions: HashMap = Default::default(); + for sample in sample_names.iter() { + current_positions.insert(sample.clone(), 0); + } + + Ok(OrderedVcfWriter { + ref_vcf_readers, + vcf_headers, + ref_vcf_writers, + map_store: Default::default(), + current_index: 0, + current_chrom: "".to_string(), + current_pos: 0, + current_positions, + min_quality, + sample_indices, + phase_queues + }) + } + + /// Returns the phase block result that the writer is currently waiting to receive. + pub fn get_wait_block(&self) -> usize { + self.current_index + } + + /// Adds a phase block result to our queue for writing. 
+ /// # Arguments + /// * `phase_result` - a phasing result that will be written in the correct order with other blocks + pub fn write_phase_block(&mut self, phase_result: PhaseResult) -> Result<(), Box> { + let block_index: usize = phase_result.phase_block.get_block_index(); + if block_index < self.current_index { + return Err(Box::new(io::Error::new(io::ErrorKind::Other, "Block index is smaller than next expected index"))); + } + match self.map_store.insert(block_index, phase_result) { + None => {}, + Some(_) => { + return Err(Box::new(io::Error::new(io::ErrorKind::Other, "Block index was already present in the map_store"))); + } + }; + self.drain_map_store() + } + + /// This will drain phase block solutions in the correct order if they have been received. + /// It drains as far as it can given the current results and then stops to wait for more data. + fn drain_map_store(&mut self) -> Result<(), Box> { + + /* + * New plan for multi-sample VCFs: + * 1. we check if we have the next phase result like we already do + * 2. we append all of the relevant info from that phase block into the variant queue for a given sample + * - do we have one queue per sample? or one per sample-VCF combination? + * - probably sample-VCF combination + * 3. we iterate over each VCF from current position to the minimum end position of all sample blocks so far + * - this _should_ guarantee at least one sample queue is empty + * - for each variant, we check if it's getting phase for the sample and modify if so + * - this will pop an entry from the variant queue for the sample + * 4. 
at the end of a chromosome: + * - iterate as normal + * - check that all queues are empty + * - put the new info into the queue, rinse and repeat + */ + + while !self.map_store.is_empty() { + match self.map_store.remove(&self.current_index) { + Some(phase_result) => { + trace!("Draining {}", self.current_index); + + let chrom_result: &str = phase_result.phase_block.get_chrom(); + if chrom_result != self.current_chrom { + if self.current_index == 0 { + // this is the first block, so lets set chrom and move on + self.current_chrom = chrom_result.to_string(); + } else { + // the next block is on a different chromosome, so we need to finalize this chromosome + self.write_to_end_position()?; + + // now setup for the next chromosome + self.current_chrom = chrom_result.to_string(); + self.current_pos = 0; + for (_k, v) in self.current_positions.iter_mut() { + // set all current positions back to 0 + *v = 0; + } + } + } else { + // the chromosome matches previous, I don't think we actually have to do anything + // as long as our assumptions are correct + } + + let sample_name = phase_result.phase_block.sample_name(); + for (vcf_index, phase_queue) in self.phase_queues.iter_mut().enumerate() { + let sample_queue: &mut VecDeque = phase_queue.get_mut(sample_name).unwrap(); + let mut previous_block_id: usize = 0; + for (haplotype_index, &h1_index) in phase_result.haplotype_1.iter().enumerate() { + if vcf_index == phase_result.variants[haplotype_index].get_vcf_index() { + // h1 and h2 are just internal representations + let h2_index: u8 = phase_result.haplotype_2[haplotype_index]; + + // convert them to the file representations where possible + let h1 = phase_result.variants[haplotype_index].convert_index(h1_index); + let h2 = phase_result.variants[haplotype_index].convert_index(h2_index); + + // add one here because we need it to be 1-based + let block_id: usize = phase_result.block_ids[haplotype_index]+1; + if haplotype_index == 0 || block_id != previous_block_id { + 
debug!("New block ID found for {}: {}", self.current_index, block_id); + } + previous_block_id = block_id; + + sample_queue.push_back(SingleVariantPhase { h1, h2, block_id }); + } else { + // this variant is not a part of this VCF file + } + } + } + + // update the minimum position for this sample and then tell the writer to do what it can + *self.current_positions.get_mut(sample_name).unwrap() = phase_result.phase_block.get_end(); + self.write_to_min_position()?; + self.current_index += 1; + }, + None => { + break; + } + }; + } + Ok(()) + } + + /// This will trigger the writer to try to write everything that remains on this chromosome + /// # Errors + /// * if we have any issues writing the VCF file + /// * if when we finish writing, there are still blocks in the queue + pub fn write_to_end_position(&mut self) -> Result<(), Box> { + self.write_to_position(u64::MAX)?; + + // sanity check to make sure all queues are empty once we get to this point + for hm in self.phase_queues.iter() { + for (_sample_name, queue) in hm.iter() { + if !queue.is_empty() { + bail!("Finished writing chromosome, but variant queues are not empty"); + } + } + } + + Ok(()) + } + + /// This will trigger to writer to write everything up to the minimum position on this chromosome + /// # Errors + /// * if we have any issues writing the VCF file + fn write_to_min_position(&mut self) -> Result<(), Box> { + let min_position: u64 = *self.current_positions.values().min().unwrap(); + self.write_to_position(min_position) + } + + /// This should write out all VCF lines up to a certain position and then break. 
+ /// # Arguments + /// * `final_position` - the last position that is included in this write + /// # Errors + /// * if we have any issues writing the VCF file + fn write_to_position(&mut self, final_position: u64) -> Result<(), Box> { + if self.current_pos == final_position { + // we can't write anything new + return Ok(()); + } + + // + let start_pos = self.current_pos; + for (vcf_index, ref_vcf_writer) in self.ref_vcf_writers.iter().enumerate() { + // prep the writer + let mut vcf_writer = ref_vcf_writer.borrow_mut(); + let mut vcf_reader = self.ref_vcf_readers[vcf_index].borrow_mut(); + let chrom_index: u32 = self.vcf_headers[vcf_index].name2rid(self.current_chrom.as_bytes())?; + // let sample_index: usize = self.sample_indices[vcf_index]; + + match vcf_reader.fetch(chrom_index, start_pos, Some(final_position)) { + Ok(()) => { + for record_result in vcf_reader.records() { + // set up the record for the new VCF file + let mut record = record_result?; + vcf_writer.translate(&mut record); + let record_pos = record.pos(); + if record_pos < start_pos as i64 { + // this can happen when you have very very long indels that span one of our breaks + // we have already written though, so don't write it again + continue; + } + + // we now have to iterate over each sample and modify the entries as necessary + let vcf_sample_indices = &self.sample_indices[vcf_index]; + let mut changes_made: bool = false; + + // initialize the alleles array to be the same, these may change inside the loop + let mut alleles = vec![]; + let record_gt = record.genotypes().unwrap(); + for si in 0..record.sample_count() { + let genotype = record_gt.get(si as usize); + match genotype.len() { + 0 => bail!("Encountered empty genotype record at position {}", record.pos()), + 1 => { + // TRGT can make single-allele GT calls, just copy it over as normal + // it will not be modified below because it is a homozygous allele + alleles.push(genotype[0]); + }, + 2 => { + // this is 99.99999999% path + 
alleles.push(genotype[0]); + alleles.push(genotype[1]); + }, + gt_len => { + // we do not have 3+ GT entries implemented + bail!("Encountered GT of length {} at position {}", gt_len, record.pos()) + } + } + } + + // initially empty PS blocks also + let mut ps_blocks: Vec> = vec![".".as_bytes().to_vec(); record.sample_count() as usize]; + let mut phase_flags: Vec> = vec![".".as_bytes().to_vec(); record.sample_count() as usize]; + let mut flagged_variants: bool = false; + + for (sample_name, &sample_index) in vcf_sample_indices.iter() { + // now see how we handle this variant + let include_variant = is_phasable_variant(&record, sample_index, self.min_quality, false)?; + if include_variant { + // sanity checks + assert!(u64::try_from(record_pos).unwrap() >= start_pos); + assert!(u64::try_from(record_pos).unwrap() <= final_position); + + let variant_to_write = match self.phase_queues[vcf_index].get_mut(sample_name).unwrap().pop_front() { + Some(v) => v, + None => { + bail!("Variant requested from empty queue during VCF writing"); + } + }; + + // these are already converted to the VCF entries, so just compare to see if we need to overwrite + let h1 = variant_to_write.h1; + let h2 = variant_to_write.h2; + if h1 == h2 { + // algorithm decided it was better if these were homozygous allele + // for now, we will just write out the original record + + if h1 == u8::MAX { + // these were intentionally ignored by HiPhase, mark it as such + phase_flags[sample_index] = "TR_OVERLAP".as_bytes().to_vec(); + flagged_variants = true; + } + + } else { + // we need to alter the genotypes for this sample to phased + let sample_gt_offset: usize = 2 * sample_index; + alleles[sample_gt_offset] = GenotypeAllele::Unphased(h1 as i32); + alleles[sample_gt_offset + 1] = GenotypeAllele::Phased(h2 as i32); + + // the push_format_string expects &[u8] bytes so we have to: + // 1. convert the output to a String + // 2. interpret that to bytes + // 3. 
convert to a Vec for ownership + ps_blocks[sample_index] = variant_to_write.block_id + .to_string().as_bytes().to_vec(); + changes_made = true; + } + } else { + // this variant is not included in phasing, so we can just leave it as is + } + } + + if changes_made { + // if we altered something, then alter the record and add PS + record.push_genotypes(&alleles)?; + record.push_format_string("PS".as_bytes(), &ps_blocks).unwrap(); + } + if flagged_variants { + // we have at least one variant that was ignored, use PF = PhaseFlag + record.push_format_string("PF".as_bytes(), &phase_flags).unwrap(); + } + // all modifications have been made, write it out + vcf_writer.write(&record)?; + } + }, + Err(e) => { + if final_position == 0 { + warn!("Empty problem block received, no heterozygous variants on chromosome {}", self.current_chrom); + } else { + warn!("Received \'{}\', while seeking to {}:{}-{} in vcf #{}, likely no variants present", e, self.current_chrom, start_pos, final_position, vcf_index); + } + } + } + } + + // we wrote out things at final_position, so go one past it + if final_position == u64::MAX { + self.current_pos = final_position; + } else { + self.current_pos = final_position+1; + } + Ok(()) + } +} diff --git a/src/writers/phase_stats.rs b/src/writers/phase_stats.rs new file mode 100644 index 0000000..a9c909b --- /dev/null +++ b/src/writers/phase_stats.rs @@ -0,0 +1,313 @@ + +use crate::data_types::variants::VariantType; +use crate::phaser::PhaseResult; + +use serde::Serialize; +use std::fs::File; +use std::path::Path; + +/// Contains statistics on the loading and parsing of reads into alleles. 
+#[derive(Debug)] +pub struct ReadStats { + /// The number of reads loaded + num_reads: u64, + /// The number of reads without any determined alleles + skipped_reads: u64, + /// The number of alleles successfully loaded + num_alleles: u64, + /// Records the number of exact matches found for a type + exact_matches: [u64; VariantType::Unknown as usize + 1], + /// Records the number of inexact matches found for a type + inexact_matches: [u64; VariantType::Unknown as usize + 1], + /// Records the number of no-matches found for a type + failed_matches: [u64; VariantType::Unknown as usize + 1], + /// Records the number of matches to allele 0 + allele0_matches: [u64; VariantType::Unknown as usize + 1], + /// Records the number of matches to allele 1 + allele1_matches: [u64; VariantType::Unknown as usize + 1], + /// If true, then global realignment was used to pull out reads + is_global_realignment: bool +} + +impl ReadStats { + /// Creates a new `ReadStats` struct and does some sanity checks. + /// # Arguments + /// * `num_reads` - the number of reads loaded + /// * `skipped_reads` - the number of reads that were skipped during loading + /// * `num_alleles` - the number of alleles successfully (i.e. 
un-ambiguously) loaded + /// * `exact_matches` - the number of exact matches for each type + /// * `inexact_matches` - the number of inexact matches for each type + /// * `failed_matches` - the number of ambiguous or no-sequence alleles + /// * `allele0_matches` - the number of alleles assigned to allele 0 + /// * `allele1_matches` - the number of alleles assigned to allele 1 + /// * `is_global_realignment` - if True, then global realignment was used to transform reads into alleles + /// # Panics + /// * if `num_reads` > `num_alleles`, because that would imply some reads have no alleles + /// * if `num_alleles != exact_matches.sum() + inexact_matches.sum()`, because that would imply some alleles are not being counted correctly somewhere + /// * if `num_alleles != allele0_matches.sum() + allele1_matches.sum()`, because that would imply some alleles are not being counted correctly somewhere + #[allow(clippy::too_many_arguments)] + pub fn new( + num_reads: u64, skipped_reads: u64, num_alleles: u64, + exact_matches: [u64; VariantType::Unknown as usize + 1], + inexact_matches: [u64; VariantType::Unknown as usize + 1], + failed_matches: [u64; VariantType::Unknown as usize + 1], + allele0_matches: [u64; VariantType::Unknown as usize + 1], + allele1_matches: [u64; VariantType::Unknown as usize + 1], + is_global_realignment: bool + ) -> ReadStats { + assert!(num_alleles >= num_reads); + assert_eq!(num_alleles, exact_matches.iter().sum::() + inexact_matches.iter().sum::()); + assert_eq!(num_alleles, allele0_matches.iter().sum::() + allele1_matches.iter().sum::()); + ReadStats { + num_reads, + skipped_reads, + num_alleles, + exact_matches, + inexact_matches, + failed_matches, + allele0_matches, + allele1_matches, + is_global_realignment + } + } +} + +/// Contains any statistics from the phasing problem solver that may be relevant +pub struct PhaseStats { + /// The number of solutions that were pruned during calculation + pruned_solutions: Option, + /// For heuristic solvers, 
the estimate cost prior to computation + estimated_cost: Option, + /// The actual cost of the solution + actual_cost: Option, + /// The number of phased variants + phased_variants: Option, + /// The number of phased SNV variants + phased_snvs: Option, + /// The number of variants where the phasing solution turned them homozygous + homozygous_variants: Option, + /// The number of ignored variants + skipped_variants: Option +} + +impl PhaseStats { + /// Creates phase stats for an A* algorithm solution + /// # Arguments + /// * `pruned_solutions` - the number of solutions that were pruned from the solution space to reduce memory consumption and run-time; if this is 0, then we have a guaranteed best solution + /// * `estimated_cost` - the estimated cost of this phase block calculated from the heuristic + /// * `actual_cost` - the actual cost of the final solution + /// * `phased_variants` - the number of variants that were phased in the solution (e.g. not converted to homozygous) + /// * `homozygous_variants` - the number of variants that were converted to homozygous in the solution + /// * `skipped_variants` - the number of variants ignored in the solution + /// # Panics + /// * if `actual_cost < estimated_cost`, because that would imply something broken in our heuristics that makes A* no longer work + pub fn astar_new( + pruned_solutions: u64, estimated_cost: u64, actual_cost: u64, + phased_variants: u64, phased_snvs: u64, homozygous_variants: u64, skipped_variants: u64 + ) -> PhaseStats { + assert!(actual_cost >= estimated_cost); + PhaseStats { + pruned_solutions: Some(pruned_solutions), + estimated_cost: Some(estimated_cost), + actual_cost: Some(actual_cost), + phased_variants: Some(phased_variants), + phased_snvs: Some(phased_snvs), + homozygous_variants: Some(homozygous_variants), + skipped_variants: Some(skipped_variants) + } + } + + pub fn get_pruned_solutions(&self) -> Option { + self.pruned_solutions + } + + pub fn phased_snvs(&self) -> Option { + 
self.phased_snvs + } + + /// Returns the ratio of estimated_cost / actual_cost. In a perfect world, this is very near 1.0. + pub fn get_cost_ratio(&self) -> Option { + match self.estimated_cost { + Some(ec) => { + self.actual_cost.map(|ac| + if ac == 0 { + assert_eq!(ec, 0); + 1.0 + } else { + ec as f64 / ac as f64 + } + ) + }, + None => None + } + } +} + +/// This is a wrapper for writing out any stats to a file +pub struct StatsWriter { + /// Handle for the CSV writer + csv_writer: csv::Writer +} + +/// Contains all the data written to each row of our stats file +#[derive(Serialize)] +struct CsvRow { + /// The index of the block + block_index: usize, + /// the sample for the block + sample_name: String, + /// the chromosome of the block + chrom: String, + /// the position of the first allele + start: u64, + /// the position of the last allele + end: u64, + /// the number of variants in the block + num_variants: u64, + /// the number of reads in the block + num_reads: Option, + /// the number of skipped reads in the block + skipped_reads: Option, + /// The number of variants loaded + num_alleles: Option, + /// Records the number of exact matches found for a type + allele_matches: Option, + /// Records the number of inexact matches found for a type + allele_partials: Option, + /// Records the number of no-matches found for a type + allele_failures: Option, + /// Records the number of assignments to allele0 for a type + allele0_assigned: Option, + /// Records the number of assignments to allele1 for a type + allele1_assigned: Option, + /// if True, then global realignment was used to transform reads to alleles for this block + is_global_realignment: Option, + /// The number of solutions that were pruned during calculation + pruned_solutions: Option, + /// For heuristic solvers, the estimate cost prior to computation + estimated_cost: Option, + /// The actual cost of the solution + actual_cost: Option, + /// The estimated / actual cost ratio + cost_ratio: Option, + 
/// The number of phased variants + phased_variants: Option, + /// The number of variants where the phasing solution turned them homozygous + homozygous_variants: Option, + /// The number of ignored variants + skipped_variants: Option +} + +impl StatsWriter { + /// Creates a new writer for a given filename + /// # Arguments + /// * `filename` - the path to write all stats to + pub fn new(filename: &Path) -> csv::Result { + // modify the delimiter to "," if it ends with .csv + let is_csv: bool = filename.extension().unwrap_or_default() == "csv"; + let delimiter: u8 = if is_csv { b',' } else { b'\t' }; + let csv_writer: csv::Writer = csv::WriterBuilder::new() + .delimiter(delimiter) + .from_path(filename)?; + Ok(StatsWriter { + csv_writer + }) + } + + /// Will write stats to a CSV file for us + /// # Arguments + /// * `phase_result` - the phasing results, which wraps block metadata, the read statistics, and the phasing statistics + pub fn write_stats(&mut self, phase_result: &PhaseResult) -> csv::Result<()> { + let num_reads; + let skipped_reads; + let num_alleles; + let allele_matches; + let allele_partials; + let allele_failures; + let allele0_assigned; + let allele1_assigned; + let is_global_realignment; + match &phase_result.read_statistics { + Some(rs) => { + num_reads = Some(rs.num_reads); + skipped_reads = Some(rs.skipped_reads); + num_alleles = Some(rs.num_alleles); + allele_matches = Some(format!("{:?}", rs.exact_matches)); + allele_partials = Some(format!("{:?}", rs.inexact_matches)); + allele_failures = Some(format!("{:?}", rs.failed_matches)); + allele0_assigned = Some(format!("{:?}", rs.allele0_matches)); + allele1_assigned = Some(format!("{:?}", rs.allele1_matches)); + is_global_realignment = Some(rs.is_global_realignment); + }, + None => { + num_reads = None; + skipped_reads = None; + num_alleles = None; + allele_matches = None; + allele_partials = None; + allele_failures = None; + allele0_assigned = None; + allele1_assigned = None; + 
is_global_realignment = None; + } + }; + + let pruned_solutions; + let estimated_cost; + let actual_cost; + let cost_ratio; + let phased_variants; + let homozygous_variants; + let skipped_variants; + + match &phase_result.statistics { + Some(ps) => { + pruned_solutions = ps.pruned_solutions; + estimated_cost = ps.estimated_cost; + actual_cost = ps.actual_cost; + cost_ratio = ps.get_cost_ratio(); + phased_variants = ps.phased_variants; + homozygous_variants = ps.homozygous_variants; + skipped_variants = ps.skipped_variants; + }, + None => { + pruned_solutions = None; + estimated_cost = None; + actual_cost = None; + cost_ratio = None; + phased_variants = None; + homozygous_variants = None; + skipped_variants = None; + } + }; + + let row: CsvRow = CsvRow { + block_index: phase_result.phase_block.get_block_index(), + sample_name: phase_result.phase_block.sample_name().to_string(), + chrom: phase_result.phase_block.get_chrom().to_string(), + start: phase_result.phase_block.get_start(), + end: phase_result.phase_block.get_end(), + num_variants: phase_result.phase_block.get_num_variants() as u64, + num_reads, + skipped_reads, + num_alleles, + allele_matches, + allele_partials, + allele_failures, + allele0_assigned, + allele1_assigned, + is_global_realignment, + pruned_solutions, + estimated_cost, + actual_cost, + cost_ratio, + phased_variants, + homozygous_variants, + skipped_variants + }; + + self.csv_writer.serialize(&row)?; + self.csv_writer.flush()?; + Ok(()) + } +} \ No newline at end of file diff --git a/src/writers/vcf_util.rs b/src/writers/vcf_util.rs new file mode 100644 index 0000000..d57d3c4 --- /dev/null +++ b/src/writers/vcf_util.rs @@ -0,0 +1,54 @@ + +// This was all provided by Daniel Baker as a way to get VCF indexing since rust_htslib does not have a user-friendly method yet. +// Parts have been tweaked for readability. 
+ +/// The error type we can generate from trying to index +#[derive(Debug)] +pub struct BcfBuildError { + pub msg: String, +} + +impl BcfBuildError { + pub fn error_message(error: i32) -> &'static str { + match error { + -1 => "indexing failed", + -2 => "opening @fn failed", + -3 => "format not indexable", + -4 => "failed to create and/or save the index", + _ => "unknown error", + } + } +} +impl std::error::Error for BcfBuildError {} + +impl std::fmt::Display for BcfBuildError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "BcfBuildError{{msg: {}}}", self.msg) + } +} + +/// Build a bcf or vcf.gz index. +/// Builds tbi or csi depending on if build_tbi is set. +pub fn build_bcf_index>( + bcf_path: P, + idx_path: Option

, + n_threads: u32, + build_tbi: bool, +) -> Result<(), BcfBuildError> { + let min_shift = if build_tbi {0} else {14}; + let idx_path_cstr = idx_path.map(|p| rust_htslib::utils::path_to_cstring(&p).expect("path_to_cstring")); + let ret = unsafe { + rust_htslib::htslib::bcf_index_build3( + rust_htslib::utils::path_to_cstring(&bcf_path).unwrap().as_ptr(), + idx_path_cstr.map_or(std::ptr::null(), |p| p.as_ptr()), + min_shift, + n_threads as i32, + ) + }; + match ret { + 0 => Ok(()), + e => Err(BcfBuildError { + msg: format!("Failed to build bcf index. Error: {e:?}/{}", BcfBuildError::error_message(e)), + }), + } +} diff --git a/test_data/header_only.bam b/test_data/header_only.bam new file mode 100644 index 0000000000000000000000000000000000000000..00ada6557cd0c8da709d820dfdffcfaedb1c9dbe GIT binary patch literal 3019 zcmV;+3pDf}iwFb&00000{{{d;LjnM<3w@b;Y@Sse$4}p`m$wsS2si|F{$U8lmGe9I z^VWs6T^WV8D=UmmF}1LfO*UHF@eU-QMuHI&i8Ap5;kF4zLJTBIKop`0nu#%Z89|J~ zL=s~#Y64;~DCc?4dEV#ph5fbD&+l@6mvecaGd4QibM1Q=+j#D#=C+v@4V&iNmeys9 zi_1IrT)D7+S99somLBHAd}!m`1Z zK(=?PVz}bY7oG3O6s!~$}KOF zm@a!Yq%0x1N+q@!H&1cSg`kR@mcHm$b>ujAZBdXD+}qM3($QX}WL}uk>hNiL zwvrTX1QIJ-la6bJ#C)|Bu9V{>CwSSM=ekr`N^+H?a4t*hkMgYeN>-?xl_V%cxom=n zN_!xw)?9C?8a+Oi;*PC6)9*&V#J z8p@T+Vx=z~j60&3s;p-#RB?V6UD=V7T(moPB4HKeDsnO!a@pbJm4IAFt}Nst`6pF6 zak>J$BM{DW#3^zToFF%80tyC5I!^$$A1CO&gP4+gSvo|z0=6R{t?{S=Dmfss9`fix z7-b+&6;clC`)HpF%w1jqjd&`6kb)prxjv51Mqa)HboE7EAnZSOn0MHHYB1*%ks4d&C*2pJwPaLH(3M$vNctE@WU6>%57jBLSz7+DIE%1)Y|WBVi$rRWP_u zG>=u(QcI|xsw^mtnvI^ zItUalv9l>YgY|i`kMrkZ{X(}RZe#qCxWopcfKm!~h)*YtaT*#>k!!*ZVh0+{p?&8f zCk-p;V+JKx1bNJWwjQ2TsYA$UeAMO#0lo^z)BzH9n3QpT!r+rG?PJ6h)=wRz1mudp zJ(nig7dU6G^vN9#(!6I$Ao9g+s})VxMxF7oJbbLQK(hT++CO_mln zZ@Q&Ad_zgchxHyFFCuO4rusV~{-)2bdBRnbRToJ7(vWYEFp_8=J635MG^V8Zm{KLwsm@q}o6BLN^G)BQoomFYS z@XksNCDNrr;@<{VrJ870{v44@=zP<64tA#1dB(X_FjSpxkJc(UPaX4Vgoi%@txiWf zC!8!Dvnh=g5(>m@TCY70!$tuT)fE=oB0qkuC0z8hhMlq1#)&peyYnsI8uJ#-mo{lb zq>9JD#MriuCbM9rzO;%N4w);^~=(Yz4c 
zb1_d=J4N#=)Z8@J+Bz}1>B1ee6C1|I=SC)%cDBx0Uf8#5Wy7AOk=d*EuI##a&(8h3 z(;t@(jEqgrY?_?eeE#Hx8;FbbR{HhC`N-VF1s6@sj28%zTkkj@85^CO7$2RQn%KH+ zV(Z-GmYEHDn2(&Zb#!`S>*$3O=YDhpaX!qmf6dvc*6ir`*yNVx>~t%$IW@F#Hj8y- zVc-76#l0)d*_rPDvv(!Z!_*v~%Ky5qy=T{fb?YzPJ@iw?SRZ37d=6v%PaZu!4g|aA z;9?p`Ht@vLtK&ei+dj1|4K(}Balaml1H+y>Z#WGs)B8Ttj04A>I&|+~9C-He$Ic$? zk4`xozV$x?X+-ROW5?x8um|q=XU-)1&O4v(Pb0-{{MbT1((L5%lk$;aXYRkWFO4i) z*!}9NG;-{NmmkSTo}I9GeQz8I&IT@iCLa;|(_=qxq>*5U_YL&M5Vo_&-*AmMtEYy27k{{~h70c^>>sz+j2GTR z*s;TPEV}sq?=@;zcJY~a*0og^zj9BG!E*$p&oUp~~RVo@l`P2XT_ZEF6+3z7Zd3q3gp4-@R= zclB0z@jZh5#T9w9Unpwh)kamn`1-(J{#&E2U#R+@53)Kh=zsoOfEVUBbO_il6#lvI zcJ;xV0(E0y}Wa={W}v2<+d#Z4`LaFO+X!a#fAXLjCT0IIj<{^B42!TC$Q08pFQA_u^>iM4L20hO7$Z505))V3i2LJ}Wo0^r-k`j9Et zCblqG2U)Jx08=9PxCCfqalAeNG(sHT?gJo{_IJGigzR2|Ooccu0H##%aoy{EHDE#o zp9Ybe;N$Q80E9SZ-&qGTXxJ-y>L4WXnuqJa1n^AVl+GG%u~`Q^1b93ES)pzk1R4R{ zIRG?53 zFss!A0E9TsSPei(`;p@S2+lvdt`4kwivY7OIT?VgOHKppd z@L{^!LF8HcJb^SqN{;jbjbQ4($a(0ir$8Lp8a#?L0yrIM1m~Yd8lfJ)K(h+YZ$bfv zl>8BC1XBl20EG$xE};sBS$$#%fM9C5AAk_x{e1w0S)JDeAjI(sxJ+a_oCABi2pYeC zZxM9Ag0iI}jVj1KezG@ySO?iHW9VuCLgX+1sCE=(UpU!omjjKE#JgIo?mVRA=Fb6B z0UR^Ebr77d{VLE10UkQU>L57(L2pkTgoU$Z5cmk${o-Dr5ps3L-nw%gaA^kh8DFbYOPU^E0qLtr!nMneF52mk<~O#zVr literal 0 HcmV?d00001 diff --git a/test_data/header_only.vcf.gz b/test_data/header_only.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..6e74d72b7cccf23b44944a8216bd6e9d4aef0ac7 GIT binary patch literal 2039 zcmVs^k|+V20kXMG zuxXp7onYrFI5|hjG3wY^`y`!y`@y-IjulQ`+FX95DDqEHx4(aSe(~+(>fqn!#|M{(U%$RTPTjOE9~Wg)AN+DyRmELdEmGSRkCU!TvtNE4 z+FYf(qe)c_&CXKYEPg(g^?mWs%u-b>=4Ji8TmIJ-UAag_*%fWNOKqyB>BCE(7tN0^ zPm>oukDB_foGtmHtk<5NCv7>Y7sYK_uGFSUwe)+Ns^&*AomAEGr;B-;x_ML0Myq^k z#({rb#x|dH#VkE8<_D%|Q}r=M4TPCgyK|Mc{5o7w|-|Lx~v zIykQv={~hX*C+L?cx<|I`CajkyQVFkx>Woq7xSX~`Ec7*<+PYoRjN|=&vmGuuRb1r zTP<<+?ZpaT7j0SJXOo^r#$VnN-ceJ}R;*4dd7-Q~PZvwwBd zNtI@OQL6HLs>*rO%(|j&p6Xee6^mw-k7?C(saq7~+S@4aC&zEidzzJ#`tQ^JGJrZw 
zS`ol{-^VFEZZdQ8oZ4==0(_tk=H34Ow5b>6{VKZYyoL8ws_z%`1GFJJZxMeTG1xN* zksQV`Lt6%7!04RIFt%q5f_2V~8Mk8sI0~5a<97_3U|i%3!#l=BvMw@>nXqH50}7t} zn29@v0!7QfV+KqQ5l7A@5uk^h3EpQR6rPYBnatLp&ox-`mM~5hgdkaZbmm+nC5*jmUiVaFI^!jSW`OScOi*h8iwxXZS;)>`nKVRp&3o=uE_ zGR!X5&IAb7XA<?wd-3pWnSgJn=mGIe*vF?Y`1oLv}W5FumYC}sk)YZGI_GrllE6FGOpxXmgV ztwi*>QzCA+!kG*kkBY^34GP9t#UkbkA97{fnxGjk5{=8L3$`Yi3>!naBMEjXj+qog zHpN>T#Iarw0CE==*llbX=|`L2&t@HnHe&(PsAYd%ZRxgU|iYk8Cy63KYE4Q0h^L7B$of z)Ut-6j1_Ede&|`#v&*qiK|!E^1~k;mV6D|qH%P1+fY?`zK_fwiO1Y7sXSrvIeNnqGe)G zLzNv9g4Ix&R`QxxD?+7+Muoi}=#Vs2M$4>4>vFvw#zdey6u0jSjqLRrgpS5OvtB`~DA6|A*Z6?c@Qi4RV5 zYX#~+jX-plcSZ{!1c8Fnv=v3Y$XL)&ky*!48+fF+!8>*48w>{OC`(eYq_qr@*cN(q zjFLAE7^`}fePB^Tt$8y_UIu__URj`^%{$~}BL+hnDo3NBU5Vu75aGyZsMuB@)KCs@ zE&&haqQx2{it5-cI~FOh=2e!!k%mU8EU=`ZQUf7q$&$?ts6O~aQP{|;Q@K6|HCOTx z#cIXinv>qAKhlN7Ja1VtI?&07)zh0OU1PwySq4 zii1?CJvg|y9cZY0@+DT=RX$7xbugDh2FIw)BPj29|H|8k&1=xXvlNC`uC4W(yuq}~ z5neenHZO7qcN_$R8f*uiy*9XB%$QpZ#6tyVi6$(aeG#(6-KB%vG z*ovB12oW?^Y|neEJub18M{^8JPJ`vBOzOJ>wo(@^WM2pGU^lq=L(sa8-JsNlEo!iQ zaAUUG_ME7m$Edx@Vkd`C18O~-s2zDm(pXVDjxMT+V8H^ua{g>&JeUKSK~<(a0)yvQ zG{5oepZh<_v-f&8$&1(n*vu?&@}KM8AB0u9j()rP{BiH{^VhxejW#tf+k4H+32SQs`0x(d&a|F6SzW1G5ypohaxzIUJh&es7d Pl}FPf&A<#c0z?1+^TQC- literal 0 HcmV?d00001 diff --git a/test_data/multi_smrtcell.bam b/test_data/multi_smrtcell.bam new file mode 100644 index 0000000000000000000000000000000000000000..5ed6c692a2021ad748dd9103a05db0a9a21ecc64 GIT binary patch literal 3034 zcmV<03nla)iwFb&00000{{{d;LjnN33)Pu>Y@Sse$4}p`m$wsS2si@j{KF88E9ZCa zr*mO#S4N@jN(*CCOf76=laLqwd2N&s1t8i$ke6Ec0+0ca zcfxC}i$Jvj7o@b-1)wwFNJ}S58YTmlJFdMg0-FKtrPAKY0&p2nnxr$*6oAiwbkZwJ zwiqC96KTCIM@ZBrnmc34L_{57wdB4?+R>AeMoHm{q#ZuDTuDnMqT7VLCvqigSpqzl z(n+O@AWMLgLR+P%2wVoZv{JZ|MwS5W3|T|P#E2|@qm)u)Y!PJPD`UC$P8LBHz2=Ub zGQ~i)x2s~f;?5VH@5mIa5LQv~?~|E=(WVsi+=b)9a?4GN=9cHL-nqEYZUfCrU6ae% zlAIJ=3MA&r=7cp4%K4I)l}2K!EJo#) z7fDQ)y&6)MkX)q_T3TOP@pD@yIOjr8MNUg!^sCx(oV%7N$O-OkX%XpYuTnBEOlfua 
zG(B5M3O53Ym90s~wL)UPS_)Unagr0fY|e9Csw^eBN>VtNrS(U7R(vHZRL)8g6rx-< zK}4lJkW_MA=_;=(bBq?rtBN2gSlJGo6;_u_+SG-*R2OnudnyY?ljY74jyvg*pyXxWB(yHAJCOvd z3yRxP&vpdMl~$ycgmSWyZ*Q%Df|ZUiy4*S)!EyzaaD+pt>>-Z4zy@tukV_{W6s+tH zURn+1%4MC$-OKcA{_zS7Le9>Q~{M75Lpj- z^dO8fkf#bMhxL85&jsc#uYg886+lQqkgHrD$7dri-vK)MA}^?P^a|-el zzBEu@MfcJRjwOuAOAG@rX$BT9A!TgG1L893dr`iab_87N5TQBDk%gPS;4Af6E^qQc4N}=Q&@_ZLEu!(iv#L|&~Q%G&34XlDr%gB+ikjE+* z+$WmHDr%`E)K66wnCE@yJd9AJ6yqk)GL?{wgFM>z9;-+5p31-Qasd^WF$pdpPbJoP z{w^H^3YXZ~6raKRJlV(jbFqG*+Yz@heo0(n15rRJg*(Kjlg2m=4XDU9VF$4Tjpoq4 zbCHvV74$KKk}HBdW~2|G;6I6qsdrfv+My zKd7CB`YAtJLhe)j2p-hZr~Eqy<4gTb0^ihp%&)gtJ(;%-hnH?z3Lf+_-+-)!_I=`# zL;GoYlhN3}NYf{%o~GwvyQXRgbq~pVS504j-ws%wg9T9)i=hr;ps@q@thi!gEu%4<5Bz|egH%M|^Kna!G zFz_6;DwSDVC9w^YBshoDppZF%mT!%DljcjC zv>{T(V_4E*;n~GCs9=d}_;u;}>rvF4kM=*AwR(X2&kNWNd1*K#1IW$N7el;n}g#;faZ{ znQdb;v*TN*HtIpX;k=pQ$+4N?i^tCY&_?2XkZ1pz(-T9}!=oePTbt99Lz&Hq6zk6Y z3kwJTSDE{?~P_J-ZI9TYvfPfgdx* zdKhEjDwr zj=`G^>zT_jc&cIRWp~2hg@ Se`ZTGQ+OxWDUH?uzL>GaiJtXf3Q)-qEM0>zsA_w z)clDTBKzIvx^fQQTi8qQ=&thOGYk8ZEAnW+P}IgNjjDd}QH8zqmquN`Q1w3^Vs&27 z|LivaFU)V?FtA@J{IlQg=!2&c_Uf_ToP!qWDDbFXC|}?Bsv4Jt z`rZ3bULW-M!H!Qf>b#KuEqAjzFXZ=wKLq-P{62C$;D!8dxEI(jIzc0c&1v}mkfNBxjkAQ3uyQL2R-6HmQPaTNh z!Em{P&VwXgNmfC&|R3Pf&#kH7W;5aO7AdmYH2VXx|{gOJ3757mJQ;OV+4oi*HI zvkrI&@I(NzLfzO8Gy=Gz4`_tQPw4?5Jfl$qSq0yJ8qf%x_}F^@2)TM@ zEtqb|?w?Nq8UcLwZFP{<{Hi_xLge?Zu7j|LvupYR2&SIz1|XQae`i+>gdgba!Buq) z6*SK6tpg{6^Bw>~fL~t?K(^G7sgT_p&Zq+$;_wH7MgWi8Uk5(;m^>M1gaGea13;+x z?$ZGXk)H@k>qF$vuLdB*aircpGAq=+-a5!?e$pzS5vIGn2Y}%G3j+XzPF#yjg*XlY zQ;NR{2xlv6AXA~V^8lDItJQq~ggDMx4M0fy(Gvg&&Og1b4y=2V0JAPR6@aWuP6h#H zT{7DQ8X>^0UZ4@OdjR z0*#QwJBL`^c}U4kp8=)}+dYIzKRkdk8)0E8}?-^=PCME=~%0EEc@a~l95@|Bwa2$8>Ny6PZAK6_eM zy&l2GE2o3(hUu<61KQ_}&!OzO2LK33{N(-y>uZBIK7O)u?r5<80%Fw^U{fvt03VA8 c1ONa4009360763o02=@U00000000000Q7{kX#fBK literal 0 HcmV?d00001 diff --git a/test_data/multi_smrtcell.bam.bai b/test_data/multi_smrtcell.bam.bai new file mode 100644 index 
0000000000000000000000000000000000000000..3a17d3953a10ae362b40cf11d4ea85c8e8c1c40b GIT binary patch literal 1576 ecmZ>A^kh8DFbYOPU^E0qLtr!nMneF52mk<~O#zVr literal 0 HcmV?d00001 diff --git a/test_data/multisample.bam b/test_data/multisample.bam new file mode 100644 index 0000000000000000000000000000000000000000..6abe8c6ca7f3d3ff198f1dc78c0861cc5eae60b6 GIT binary patch literal 3038 zcmV<43nBC$iwFb&00000{{{d;LjnN73)Pu>Y@Sse$4}p`m$#G45O4(4`G+AGSI+O; zPv^qgu8cz4l@`XPm|ED#CM&J&_Ks;ljRYem5@q5A!fg|bgcwMafG9*0G!tX+GJ+U| zi6q8g)C9y}P|ow7^SsaH3;(;ncKZ2U&hK(A?{h|mC%b0OWNgy~o15FFhG@_7>#{kR`xLp{-I>1TF(yS}9yfBTImGhOD7tVni0dQA#N?wg|HDm9gA=CyOA9UUNrI znPMQ@+f^}Kap#N9cVr4y2&<_0_sLAbXj2M$?xJyFx#gxsbIbGB>|9)Ew}IxRuF2(W zNlpqb1rl>*bHW-2<$Oub7-u*TBVH0yoUB1&R2CD0pfyp6)o>+|oUC}YN+U5<7Nc^@ zizKGYUJWTrNUl-|Ev+xD__-|;oO2nj=K^z=S3o143LvB)$W^Y7M?*JWrkrxR2j~(V6cApx|IR$wN zUmB>dqI>BD#}dZmC58c*Gy@BlkTSO80dX1iy(nKyI|43sh|nD7$V3TMzEdWy(;^Y& z3v14~Pci23A3*W#mX$$YT`@ z?i0;p6}8k7>Zd9T%=12U9!4lqig6QYnMz2;K_2aUkJY1jPvu{Dxqynxm;@J)rxI&C zf0qsdg-h&giqBwup6uiNxmdr@?TFhLza%cPfheGq!X4t%Nn@Ob22|vlu!GovMssN2 zxyVVw3i_Bq$rV8!GoY=9CspbYG8!MX`9XlM0y1@ggdHYjoS!iGq)Yo4afS6$2Ppx$ z;&0ETN%jSf-$qXGppqtasN)jybdWi?iAs}B3N1P0aeht;$gO+({Mb%?pnsRRz*iBU zAJoo5{gfXqA@`|%1P^NIQ~sTU@umJIfp2O)=GR-Sp3GZ^!%H_U1rK_eZ$MT<`#y2W zq5ZVH$!P3fr0Ek>Pt)_bJ6t;DJ;u?x67YnUHf4vTfjl*@(U^-oI^3N3_N`%fHeHjY z#m$>;sSe*z((z%vhsTRZ+q?Y~5k3g%_ z(as4cOUGef#7J>8ed*AGQ=Di{ z2=2LxuIXvtt)uIyN<0AVhAx<9x%&@a)*=@WjN} z%(k(a+3~GY8}%UHaQ@8j$g_XV>4~A~;n9)tttT$AuY9cc$zvzRfnWy@EvA8FeNQ~SIt~=O?GxM5K(k+;@T-A1FzmSt2h+eZ zz3}N~960vW;d}ezz_X7&c3yvPbjsP_t^e*zBVz9vIU#3)J#fcAawgfgPkOpHjTF1_ zBlG!4v(rXT%}0ivv+s(YG_q`d_p7VY$g%fdbu=G&cJku&-Eky1>$~ikd_?SzkNvEX zMuHt#=+0IBgMa+b zt{V0o{K1|YETzzg}^a38Q=$nRZG0{udM z@AxW+uaMt!ZvecIzpnS?YYV(ruxr1}*qS!Ze_w=m3U<670M#P49|74Sc55F1x<%}X zo;nc0$BVN7@D{O6YwCakcyJW}rbX-s@Ik@(i~Rru=Pz{wKrLd6901!QHgro3sLa%D zs{jb5whaIflK4my0N*0kgG{*=vHAWw$a1v?m=eLqt64v0TU|tG>F^;AAjowAjC0!QXR;kVXyA0gOJ3757&VS;F-E9 
zoi*HIvkrI&@I(NzLfzC4Gy=G@4`_tQPwfF9I1|XPvU}skigdgba zp;dJa6*SK4tpg{6^Bw>~fZtdRK(^G7sgT_p&#VI*;_wH7MgWgKPzOHvm^=k&gaGee z13;+x?lS-gk)H@k>qF!(tOg*&aircpGAq=+-a5!?e)1}y5vIGn2Y}%Givs|JPF#yj zg*XlYQ;NR@2xlv6AXA~V^8lDItJQq~ggDM#4M0fy(Gvg&&OfuR4y=2N0JAPR4S=jm zP5}XCT{7DQ8X>^0UZ4@OdjR}+dYIzKRkdk8)0E8}?-^=PCME?9M0EEc@dpiIj@|Bwb2$8>Jy6PZA zK6`psy&l2Gt7m}hhUu<63)<(+&!O!32LT94{M7yi>uZBIKYp@v?`*LD0dA^kh8DFbYOPU^E0qLtr!nMneF52mk<~O#zVr literal 0 HcmV?d00001 diff --git a/test_data/test_reference.fa b/test_data/test_reference.fa new file mode 100644 index 0000000..c6f5c6d --- /dev/null +++ b/test_data/test_reference.fa @@ -0,0 +1,5 @@ +>chr1 +acgt +ACGT +>chr2 +AccATGTA diff --git a/test_data/test_reference.fa.gz b/test_data/test_reference.fa.gz new file mode 100644 index 0000000000000000000000000000000000000000..08107fba98f49920307503ff28ed7d637baf7a06 GIT binary patch literal 90 zcmb2|=3rp}f&Xj_PR>jWwhWB&3=jU>C1(^FawR6GmvA{cyN7Ur_(oig$;pl(?jep` V)0RrL$pbaXqv@7rU