From c27f07f6858b8bea944c1f2976889df070252776 Mon Sep 17 00:00:00 2001 From: Mitchell Robert Vollger Date: Thu, 14 Nov 2024 10:19:18 -0800 Subject: [PATCH] v0.6.0 * fix: do not report m6A predictions that happen within the first 7 or last 7 bp of a read. This is so the ML model only operates on real data. No changes to other calls. Will fix #65 * fix: report footprint codes even if there is no spanning msp, fixes #63 * feat: add a pyft utility that can take extract --all data and make it long format for plotting. * feat: Add a shuffle option to pileup to help with the FDR calculations in FIRE * feat: make nucleosomes and MSPs optional in pileup * chore: use vergen for cli version * feat: add phasing stats to QC * feat: allow strip-base mods to filter on base mod quality. --------- Co-authored-by: Mitchell R. Vollger --- .gitignore | 4 + CHANGELOG.md | 14 + Cargo.lock | 142 ++++++-- Cargo.toml | 2 + build.rs | 20 +- py-ft/Cargo.toml | 3 +- py-ft/docs/vignettes/centered-plot.ipynb | 101 +++--- py-ft/pyproject.toml | 5 +- py-ft/python/pyft/plot.py | 109 ++++++ py-ft/python/pyft/utils.py | 120 +++++++ src/cli.rs | 2 +- src/cli/extract_opts.rs | 2 +- src/cli/nucleosome_opts.rs | 7 +- src/cli/pileup_opts.rs | 20 ++ src/cli/strip_basemods_opts.rs | 6 + src/fiber.rs | 4 + src/lib.rs | 6 +- src/subcommands/center.rs | 27 +- src/subcommands/extract.rs | 102 +++--- src/subcommands/footprint.rs | 9 +- src/subcommands/pileup.rs | 440 ++++++++++++++++++++--- src/subcommands/predict_m6a.rs | 5 +- src/subcommands/qc.rs | 14 +- src/subcommands/strip_basemods.rs | 6 + src/utils/bamranges.rs | 18 + src/utils/basemods.rs | 16 + src/utils/nucleosome.rs | 7 +- tests/data/ctcf-footprints.bed.gz | Bin 13825 -> 14091 bytes tests/m6a_prediction_test.rs | 18 +- 29 files changed, 1008 insertions(+), 221 deletions(-) diff --git a/.gitignore b/.gitignore index d334aa3e..92a096dc 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,7 @@ tests/data/chr20*bam* hub.txt *test.bam* py-ft/Cargo.lock +py-ft/test.html +py-ft/test.py +py-ft/test.tbl +tests/data/shuffle.chr20.hifi.bed.gz diff --git a/CHANGELOG.md b/CHANGELOG.md index d33c521f..3eb960ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,19 @@ All notable changes to this project will be documented in this file. +## [0.6.0] - 2024-11-14 + +### 🚀 Features + +- Add ability to drop mods below an ML score +- Add shuffle option to pileup that produces a shuffled track in addition to the real one +- Nuc and msp optional + +### 🐛 Bug Fixes + +- Report footprint codes even if there is no spanning msp, will fix #63 +- Do not report m6A predictions that happen within the first 7 or last 7 bp of a read. This is so the ML model only operates on real data. No changes to other calls. + ## [0.5.4] - 2024-09-10 ### ⚙️ Miscellaneous Tasks @@ -10,6 +23,7 @@ All notable changes to this project will be documented in this file. - Readme - Readme - Readme +- Release fibertools-rs version 0.5.4 ## [0.5.3] - 2024-07-31 diff --git a/Cargo.lock b/Cargo.lock index 36c89404..9db71665 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -111,9 +111,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.79" +version = "1.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca" +checksum = "37bf3594c4c988a53154954629820791dde498571819ae4ca50ca811e060cc95" [[package]] name = "approx" @@ -747,7 +747,7 @@ dependencies = [ "anstream", "anstyle", "clap_lex", - "strsim", + "strsim 0.10.0", "terminal_size", ] @@ -1075,12 +1075,12 @@ dependencies = [ [[package]] name = "darling" -version = "0.20.8" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54e36fcd13ed84ffdfda6f5be89b31287cbb80c439841fe69e04841435464391" +checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" dependencies = [ - "darling_core 0.20.8", - "darling_macro 0.20.8", + "darling_core 0.20.10", + "darling_macro 0.20.10", ] [[package]] @@ -1093,21 +1093,21 @@ dependencies = [ "ident_case", "proc-macro2", "quote", - "strsim", + "strsim 0.10.0", "syn 1.0.109", ] [[package]] name = "darling_core" -version = "0.20.8" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c2cf1c23a687a1feeb728783b993c4e1ad83d99f351801977dd809b48d0a70f" +checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" dependencies = [ "fnv", "ident_case", "proc-macro2", "quote", - "strsim", + "strsim 0.11.1", "syn 2.0.77", ] @@ -1124,11 +1124,11 @@ dependencies = [ [[package]] name = "darling_macro" -version = "0.20.8" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a668eda54683121533a393014d8692171709ff57a7d61f187b6e782719f8933f" +checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ - "darling_core 0.20.8", + "darling_core 0.20.10", "quote", "syn 2.0.77", ] @@ -1183,7 +1183,16 @@ version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8d67778784b508018359cbc8696edb3db78160bab2c2a28ba7f56ef6932997f8" dependencies = [ - "derive_builder_macro", + "derive_builder_macro 0.12.0", +] + +[[package]] +name = "derive_builder" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro 0.20.2", ] [[package]] @@ -1198,16 +1207,38 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling 0.20.10", + "proc-macro2", + "quote", + "syn 2.0.77", +] + [[package]] name = "derive_builder_macro" version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e" dependencies = [ - "derive_builder_core", + "derive_builder_core 0.12.0", "syn 1.0.109", ] +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core 0.20.2", + "syn 2.0.77", +] + [[package]] name = "difflib" version = "0.4.0" @@ -1414,7 +1445,7 @@ dependencies = [ "clap_mangen", "colored", "console", - "derive_builder", + "derive_builder 0.12.0", "env_logger", "gbdt", "gzp", @@ -1436,6 +1467,7 @@ dependencies = [ "spin", "tch", "tempfile", + "vergen-git2", ] [[package]] @@ -1746,6 +1778,19 @@ dependencies = [ "weezl", ] +[[package]] +name = "git2" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b903b73e45dc0c6c596f2d37eccece7c1c8bb6e4407b001096387c63d0d93724" +dependencies = [ + "bitflags 2.6.0", + "libc", + "libgit2-sys", + "log", + "url", +] + [[package]] name = "gix-features" version = "0.36.1" @@ -2263,6 +2308,18 @@ dependencies = [ "libdeflate-sys", ] +[[package]] +name = "libgit2-sys" +version = "0.17.0+1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10472326a8a6477c3c20a64547b0059e4b0d086869eee31e6d7da728a8eb7224" +dependencies = [ + "cc", + "libc", + "libz-sys", + "pkg-config", +] + [[package]] name = "libloading" version = "0.7.4" @@ -3443,9 +3500,9 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.14" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" +checksum = "0e819f2bc632f285be6d7cd36e25940d45b2391dd6d9b939e79de557f7014248" [[package]] name = "ryu" @@ -3743,6 +3800,12 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "strum" version = "0.25.0" @@ -4203,6 +4266,45 @@ dependencies = [ "serde", ] +[[package]] +name = "vergen" +version = "9.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349ed9e45296a581f455bc18039878f409992999bc1d5da12a6800eb18c8752f" +dependencies = [ + "anyhow", + "derive_builder 0.20.2", + "rustversion", + "time", + "vergen-lib", +] + +[[package]] +name = "vergen-git2" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e771aff771c0d7c2f42e434e2766d304d917e29b40f0424e8faaaa936bbc3f29" +dependencies = [ + "anyhow", + "derive_builder 0.20.2", + "git2", + "rustversion", + "time", + "vergen", + "vergen-lib", +] + +[[package]] +name = "vergen-lib" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "229eaddb0050920816cf051e619affaf18caa3dd512de8de5839ccbc8e53abb0" +dependencies = [ + "anyhow", + "derive_builder 0.20.2", + "rustversion", +] + [[package]] name = "version_check" version = "0.9.4" @@ -4675,7 +4777,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a76ff259533532054cfbaefb115c613203c73707017459206380f03b3b3f266e" dependencies = [ - "darling 0.20.8", + "darling 0.20.10", "proc-macro2", "quote", "syn 2.0.77", diff --git a/Cargo.toml b/Cargo.toml index 85778413..eb7be219 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,6 +57,8 @@ rand = "0.8.5" [build-dependencies] burn-import = {version = "0.12"} +vergen-git2 = { version = "1.0.1", features = ["build"] } + [dev-dependencies] assert_cmd = "2.0.11" diff --git a/build.rs b/build.rs index 45bd0a6a..b21b9f22 100644 --- a/build.rs +++ b/build.rs @@ -1,6 +1,22 @@ -use std::process::Command; +use std::error::Error; +use vergen_git2::*; + +fn vergen() -> Result<(), Box> { + // NOTE: This will output everything, and requires all features enabled. + // NOTE: See the specific builder documentation for configuration options. + let build = BuildBuilder::all_build()?; + let git2 = Git2Builder::all_git()?; + + Emitter::default() + .add_instructions(&build)? + .add_instructions(&git2)? + .emit()?; + Ok(()) +} fn main() { + /* + use std::process::Command; let output = Command::new("git") .args(["rev-parse", "HEAD"]) .output() @@ -12,6 +28,8 @@ fn main() { cargo:rustc-env=CARGO_LONG_VERSION={} commit:{}", git_hash, version, git_hash ); + */ + vergen().expect("Unable to set version and git hash"); // Generate the model code and state file from the ONNX file. use burn_import::onnx::ModelGen; diff --git a/py-ft/Cargo.toml b/py-ft/Cargo.toml index af3771b7..2844fe0e 100644 --- a/py-ft/Cargo.toml +++ b/py-ft/Cargo.toml @@ -1,7 +1,8 @@ [package] edition = "2021" name = "pyft" -version = "0.2.3" +version = "0.2.6" +readme = "./README.md" [lib] crate-type = ["cdylib"] diff --git a/py-ft/docs/vignettes/centered-plot.ipynb b/py-ft/docs/vignettes/centered-plot.ipynb index 23795ef2..ff3ce641 100644 --- a/py-ft/docs/vignettes/centered-plot.ipynb +++ b/py-ft/docs/vignettes/centered-plot.ipynb @@ -9,9 +9,20 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "DataTransformerRegistry.enable('vegafusion')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import pyft\n", "import pandas as pd\n", @@ -29,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -57,6 +68,7 @@ " motif_start\n", " motif_end\n", " strand\n", + " footprint_codes\n", " fire_qual\n", " fiber_name\n", " has_spanning_msp\n", @@ -75,15 +87,16 @@ " 5204946\n", " 5204981\n", " +\n", + " 3\n", " 247\n", " m64076_211222_124721/148505307/ccs\n", " True\n", - " False\n", + " True\n", " 0\n", - " 8\n", + " 1\n", " 5204946\n", " +\n", - " not-footprinted\n", + " footprinted\n", " \n", " \n", " 1\n", @@ -91,12 +104,13 @@ " 5204946\n", " 5204981\n", " +\n", + " 2\n", " -1\n", " m64076_211222_124721/51053256/ccs\n", " False\n", - " False\n", + " True\n", " 0\n", - " 8\n", + " 1\n", " 5204946\n", " +\n", " not-footprinted\n", @@ -106,20 +120,20 @@ "" ], "text/plain": [ - " chrom motif_start motif_end strand fire_qual \\\n", - "0 chr11 5204946 5204981 + 247 \n", - "1 chr11 5204946 5204981 + -1 \n", + " chrom motif_start motif_end strand footprint_codes fire_qual \\\n", + "0 chr11 5204946 5204981 + 3 247 \n", + "1 chr11 5204946 5204981 + 2 -1 \n", "\n", " fiber_name has_spanning_msp footprinted start \\\n", - "0 m64076_211222_124721/148505307/ccs True False 0 \n", - "1 m64076_211222_124721/51053256/ccs False False 0 \n", + "0 m64076_211222_124721/148505307/ccs True True 0 \n", + "1 m64076_211222_124721/51053256/ccs False True 0 \n", "\n", " end centering_position centering_strand type \n", - "0 8 5204946 + not-footprinted \n", - "1 8 5204946 + not-footprinted " + "0 1 5204946 + footprinted \n", + "1 1 5204946 + not-footprinted " ] }, - "execution_count": 2, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -140,19 +154,19 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[2024-03-13T00:26:04Z INFO pyft::fiberdata] 181 records fetched in 0.01s\n", - "[2024-03-13T00:26:04Z INFO pyft::fiberdata] Fiberdata made for 181 records in 0.15s\n", - "[2024-03-13T00:26:04Z INFO pyft::fiberdata] Fiberdata centered for 181 records in 0.02s\n", - "[2024-03-13T00:26:04Z INFO pyft::fiberdata] 172 records fetched in 0.11s\n", - "[2024-03-13T00:26:04Z INFO pyft::fiberdata] Fiberdata made for 172 records in 0.09s\n", - "[2024-03-13T00:26:05Z INFO pyft::fiberdata] Fiberdata centered for 172 records in 0.03s\n" + "[2024-11-12T22:29:48Z INFO pyft::fiberdata] 181 records fetched in 0.01s\n", + "[2024-11-12T22:29:48Z INFO pyft::fiberdata] Fiberdata made for 181 records in 0.11s\n", + "[2024-11-12T22:29:48Z INFO pyft::fiberdata] Fiberdata centered for 181 records in 0.02s\n", + "[2024-11-12T22:29:48Z INFO pyft::fiberdata] 172 records fetched in 0.11s\n", + "[2024-11-12T22:29:48Z INFO pyft::fiberdata] Fiberdata made for 172 records in 0.10s\n", + "[2024-11-12T22:29:48Z INFO pyft::fiberdata] Fiberdata centered for 172 records in 0.07s\n" ] } ], @@ -179,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -216,6 +230,7 @@ " centering_strand\n", " motif_start\n", " motif_end\n", + " footprint_codes\n", " fire_qual\n", " has_spanning_msp\n", " footprinted\n", @@ -240,6 +255,7 @@ " NaN\n", " NaN\n", " NaN\n", + " NaN\n", " \n", " \n", " 1\n", @@ -259,6 +275,7 @@ " NaN\n", " NaN\n", " NaN\n", + " NaN\n", " \n", " \n", "\n", @@ -273,12 +290,12 @@ "0 msp -225 -160 0 5204946 + NaN \n", "1 msp -57 135 247 5204946 + NaN \n", "\n", - " motif_end fire_qual has_spanning_msp footprinted \n", - "0 NaN NaN NaN NaN \n", - "1 NaN NaN NaN NaN " + " motif_end footprint_codes fire_qual has_spanning_msp footprinted \n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN " ] }, - "execution_count": 4, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -297,7 +314,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -305,28 +322,28 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.LayerChart(...)" ] }, - "execution_count": 5, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -393,7 +410,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -403,7 +420,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": ".env", "language": "python", "name": "python3" }, @@ -417,7 +434,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.1" + "version": "3.13.0" } }, "nbformat": 4, diff --git a/py-ft/pyproject.toml b/py-ft/pyproject.toml index 12320718..11c73dea 100644 --- a/py-ft/pyproject.toml +++ b/py-ft/pyproject.toml @@ -10,11 +10,12 @@ classifiers = [ ] name = "pyft" requires-python = ">=3.7" -dependencies = ["pandas>=2.0", "altair>=5.0", "vegafusion[embed]", "tqdm"] +dependencies = ["pandas>=2.0", "altair==5.1", "vegafusion[embed]", "tqdm", "typing-extensions", "polars"] [tool.maturin] features = ["pyo3/extension-module"] python-source = "python" [tool.ruff] -extend-include = ["*.ipynb"] \ No newline at end of file +extend-include = ["*.ipynb"] + diff --git a/py-ft/python/pyft/plot.py b/py-ft/python/pyft/plot.py index 7707e668..bec820e7 100644 --- a/py-ft/python/pyft/plot.py +++ b/py-ft/python/pyft/plot.py @@ -1,4 +1,5 @@ import altair as alt +import pandas as pd alt.data_transformers.enable("vegafusion") @@ -213,3 +214,111 @@ def _add_footprint_lines_to_centered_chart(dfm): ) ) return footprint_lines + + + +def extract_chart( + in_df, + height=600, + width=800, + m6a_color=M6A_COLOR, + m5c_color=M5C_COLOR, + nuc_color=NUC_COLOR, + msp_color=MSP_COLOR, + max_fibers=None, + ref=True, +): + """Make an altair chart from a dataframe of centered positions + Args: + in_df (_type_): Pandas DataFrame from utils.read_extract_all with long=True. + m6a_color (_type_, optional): _description_. Defaults to M6A_COLOR. + m5c_color (_type_, optional): _description_. Defaults to M5C_COLOR. + nuc_color (_type_, optional): _description_. Defaults to NUC_COLOR. + msp_color (_type_, optional): _description_. Defaults to MSP_COLOR. + max_fibers (_type_, optional): maximum number of fibers to plot. Defaults to None. + ref (_type_, optional): Whether to use reference (ref) coordinates. Defaults to True. + Returns: + altair.Chart + """ + if ref: + start = "long_ref_start" + end = "long_ref_end" + else: + start = "long_start" + end = "long_end" + dfm = ( + in_df.copy()[ + [ + start, + end, + "type", + "fiber", + ] + ] + .dropna() + .reset_index(drop=True) + .infer_objects() + ) # .query("centered_position_type != '5mC'") + + # set the colors + color_m6a = alt.param(value=m6a_color, bind=alt.binding(input="color", name="m6a")) + color_5mc = alt.param(value=m5c_color, bind=alt.binding(input="color", name="5mC")) + color_nuc = alt.param(value=nuc_color, bind=alt.binding(input="color", name="nuc")) + color_msp = alt.param(value=msp_color, bind=alt.binding(input="color", name="msp")) + + domain = ["5mC", "m6a", "nuc", "msp"] + range_ = [color_5mc, color_m6a, color_nuc, color_msp] + opacity = dict(zip(domain, [1.0, 1.0, 0.5, 0.20])) + + # add opacity column to the dataframe + dfm = dfm.assign(opacity=dfm["type"].map(opacity)) + + dfm["long_end"] = dfm["long_end"] + + # convert fiber column to integer + dfm["height"] = 0.4 + dfm.loc[dfm.type=="nuc", "height"] = 0.2 + dfm.loc[dfm.type=="msp", "height"] = 0.3 + dfm["y"] = pd.factorize(dfm["fiber"])[0] - dfm["height"] + dfm["y2"] = pd.factorize(dfm["fiber"])[0] + dfm["height"] + if max_fibers is not None: + dfm = dfm.query("y < @max_fibers") + + + bind_range_w = alt.binding_range(min=200, max=1600, name="Chart width: ") + param_width = alt.param("width", bind=bind_range_w) + bind_range_h = alt.binding_range(min=200, max=1600, name="Chart height: ") + param_height = alt.param("height", bind=bind_range_h) + + base = ( + alt.Chart(dfm) + .encode( + x=alt.X(f"{start}:Q", scale=alt.Scale(domain=(500, 2500))), + x2=f"{end}:Q", + color=alt.Color("type:O").scale(domain=domain, range=range_), + y="y:Q", + y2="y2:Q", + opacity=alt.Opacity("opacity"), + # opacity=alt.condition(type_selection, "opacity", alt.value(0)) + ) + ) + + + chart = base.mark_rect() + chart = ( + chart.properties(width=width, height=height) + .add_params( + param_width, + param_height, + color_m6a, + color_5mc, + color_nuc, + color_msp, + ) + .interactive( + bind_y = False + ) + ) + + return chart + diff --git a/py-ft/python/pyft/utils.py b/py-ft/python/pyft/utils.py index 4f4a2df5..9ed7f577 100644 --- a/py-ft/python/pyft/utils.py +++ b/py-ft/python/pyft/utils.py @@ -1,5 +1,8 @@ import pandas as pd import altair as alt +import polars as pl +import numpy as np +import polars.selectors as cs alt.data_transformers.enable("vegafusion") @@ -22,6 +25,123 @@ def empty_data_dict(): "end": [], "qual": [], } + + +def split_to_ints(df, col, sep=",", trim=True): + """Split a columns with list of ints separated by + "sep" into a numpy array quickly. + + Args: + df (dataframe): dataframe that is like a bed12 file. + col (str): column name within the dataframe to split up. + sep (str, optional): Defaults to ",". + trim (bool, optional): Remove the first and last call from the bed12 (removes bookendings). Defaults to True. + + Returns: + column: New column that is a list of numpy array of ints. + """ + if trim: + return df[col].map_elements( + lambda x: np.fromstring(x, sep=sep, dtype=np.int32)[1:-1], + return_dtype=pl.datatypes.List(pl.datatypes.Int32), + ) + return df[col].map_elements( + lambda x: np.fromstring(x, sep=sep, dtype=np.int32), + return_dtype=pl.datatypes.List(pl.datatypes.Int32), + ) + + + +def read_extract_all(f: str, pandas=False, n_rows=None, long=False): + """Read a table made with fibertools-rs. Specifically `ft extract --all`. + Args: + f (str): File path to the table. Can be compressed. + Returns: + pl.DataFrame: Dataframe of the table. + """ + cols_with_lists = [ + "nuc_starts", + "nuc_lengths", + "ref_nuc_starts", + "ref_nuc_lengths", + "msp_starts", + "msp_lengths", + "fire", + "ref_msp_starts", + "ref_msp_lengths", + "m6a", + "ref_m6a", + "m6a_qual", + "5mC", + "ref_5mC", + "5mC_qual", + ] + df = pl.read_csv( + f, + separator="\t", + n_rows=n_rows, + null_values=["."], + comment_prefix=None, + ) + # clean up comment char + df.columns = list(map(lambda x: x.strip("#"), df.columns)) + if df.shape[0] > 0: + for col in cols_with_lists: + col_index = df.columns.index(col) + df.replace_column(col_index, split_to_ints(df, col, trim=False)) + + if long: + explode_sets = { + "m6a":["m6a", "ref_m6a", "m6a_qual"], + "5mC": ["5mC", "ref_5mC", "5mC_qual"], + "msp": ["msp_starts", "msp_lengths", "ref_msp_starts", "ref_msp_lengths", "fire"], + "nuc": ["nuc_starts", "nuc_lengths", "ref_nuc_starts", "ref_nuc_lengths"], + } + def my_rename(col): + if col == "m6a" or col == "5mC" or col == "msp_starts" or col == "nuc_starts": + return "long_start" + if col == "ref_m6a" or col == "ref_5mC" or col == "ref_msp_starts" or col == "ref_nuc_starts": + return "long_ref_start" + if col == "m6a_qual" or col == "5mC_qual" or col == "fire": + return "long_qual" + if col == "msp_lengths" or col == "nuc_lengths": + return "zlength" + if col == "ref_msp_lengths" or col == "ref_nuc_lengths": + return "ref_zlength" + else: + return col + + + all_explode_cols = [s for _t, sublist in explode_sets.items() for s in sublist] + dfs = [] + for cur_type, s in explode_sets.items(): + not_in_s = [x for x in all_explode_cols if x not in s] + tdf = df.drop(not_in_s).explode(s).with_columns( + pl.lit(cur_type).alias("type") + ).rename(my_rename) + if cur_type == "m6a" or cur_type == "5mC": + tdf = tdf.with_columns( + zlength=1, + ref_zlength=1, + ) + if cur_type == "nuc": + tdf = tdf.with_columns( + long_qual=0, + ) + tdf = tdf.with_columns( + long_end=pl.col("long_start") + pl.col("zlength"), + long_ref_end=pl.col("long_ref_start") + pl.col("ref_zlength"), + ).drop( + cs.contains("zlength") + ) + tdf=tdf.select(sorted(tdf.columns)) + dfs.append(tdf) + # concat all of the dataframes + df = pl.concat(dfs) + if pandas: + df = pd.DataFrame(df.to_dicts()) + return df + def footprint_code_to_vec(footprint_code, n_modules): diff --git a/src/cli.rs b/src/cli.rs index 7fd77b7c..f24208b1 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -44,7 +44,7 @@ pub use strip_basemods_opts::*; infer_subcommands = true, arg_required_else_help = true )] -#[command(version = super::LONG_VERSION)] +#[command(version = &**crate::FULL_VERSION)] #[command(styles=get_styles())] pub struct Cli { #[clap(flatten)] diff --git a/src/cli/extract_opts.rs b/src/cli/extract_opts.rs index a0d12108..771ff689 100644 --- a/src/cli/extract_opts.rs +++ b/src/cli/extract_opts.rs @@ -6,7 +6,7 @@ use std::fmt::Debug; pub struct ExtractOptions { #[clap(flatten)] pub input: InputBam, - /// Report in reference sequence coordinates + /// Report positions in reference sequence coordinates #[clap(short, long, default_value = "true", default_value_ifs([ ("molecular", "true", "false"), diff --git a/src/cli/nucleosome_opts.rs b/src/cli/nucleosome_opts.rs index 5ca2cfb0..8cb9e2fc 100644 --- a/src/cli/nucleosome_opts.rs +++ b/src/cli/nucleosome_opts.rs @@ -1,8 +1,13 @@ use crate::utils::input_bam::InputBam; -use crate::utils::nucleosome::*; use clap::Args; use std::fmt::Debug; +pub static NUC_LEN: &str = "75"; +pub static COMBO_NUC_LEN: &str = "100"; +pub static MIN_DIST_ADDED: &str = "25"; +pub static DIST_FROM_END: &str = "45"; +pub static ALLOWED_SKIPS: &str = "-1"; + #[derive(Args, Debug, Clone)] pub struct NucleosomeParameters { /// Minium nucleosome length diff --git a/src/cli/pileup_opts.rs b/src/cli/pileup_opts.rs index 79c68fbb..7a6e1174 100644 --- a/src/cli/pileup_opts.rs +++ b/src/cli/pileup_opts.rs @@ -28,4 +28,24 @@ pub struct PileupOptions { /// Write output one base at a time even if the values do not change #[clap(short, long)] pub per_base: bool, + /// Calculate coverage starting from the first MSP/NUC to the last MSP/NUC + /// position instead of the complete span of the read alignment. + #[clap(long)] + pub fiber_coverage: bool, + /// Shuffle the fiber-seq data according to a bed file of + /// the shuffled positions of the fiber-seq data + /// + /// The bed file should have the following format: + /// #chrom shuffled_start shuffled_end read_name original_start + #[clap(long)] + pub shuffle: Option, + /// Output a rolling max of the score column over X bases + #[clap(long)] + pub rolling_max: Option, + /// No MSP columns + #[clap(long)] + pub no_msp: bool, + /// No NUC columns + #[clap(long)] + pub no_nuc: bool, } diff --git a/src/cli/strip_basemods_opts.rs b/src/cli/strip_basemods_opts.rs index a627990e..768d6466 100644 --- a/src/cli/strip_basemods_opts.rs +++ b/src/cli/strip_basemods_opts.rs @@ -12,6 +12,12 @@ pub struct StripBasemodsOptions { #[clap(short, long, value_parser(["m6A","6mA", "5mC","CpG"]))] /// base modification to strip out of the bam file pub basemod: Option, + /// filter out m6A modifications with less than this ML value + #[clap(long, default_value = "0")] + pub ml_m6a: u8, + /// filter out 5mC modifications with less than this ML value + #[clap(long, default_value = "0")] + pub ml_5mc: u8, /// Drop forward strand of base modifications #[clap(long)] pub drop_forward: bool, diff --git a/src/fiber.rs b/src/fiber.rs index 8990e9d0..2ee21bb6 100644 --- a/src/fiber.rs +++ b/src/fiber.rs @@ -125,6 +125,10 @@ impl FiberseqData { // GET FUNCTIONS // + pub fn get_qname(&self) -> String { + String::from_utf8_lossy(self.record.qname()).to_string() + } + pub fn get_rq(&self) -> Option { if let Ok(Aux::Float(f)) = self.record.aux(b"rq") { Some(f) diff --git a/src/lib.rs b/src/lib.rs index cfa3cf07..58ac1700 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,14 +12,16 @@ pub mod cli; use crate::utils::bio_io::*; use anyhow::Result; use itertools::Itertools; +use lazy_static::lazy_static; use rust_htslib::bam::FetchDefinition; use rust_htslib::{bam, bam::Read}; use std::env; use std::io::Write; pub const VERSION: &str = env!("CARGO_PKG_VERSION"); -pub const GIT_HASH: &str = env!("CARGO_GIT_HASH"); -pub const LONG_VERSION: &str = env!("CARGO_LONG_VERSION"); +lazy_static! { + pub static ref FULL_VERSION: String = format!("{}", env!("VERGEN_GIT_DESCRIBE")); +} // if this string (bar)gets too long it displays weird when writing to stdout const PROGRESS_STYLE: &str = "[{elapsed_precise:.yellow}] {bar:>35.cyan/blue} {human_pos:>5.cyan}/{human_len:.blue} {percent:>3.green}% {per_sec:<10.cyan}"; diff --git a/src/subcommands/center.rs b/src/subcommands/center.rs index f4c43a24..b7110cfe 100644 --- a/src/subcommands/center.rs +++ b/src/subcommands/center.rs @@ -266,7 +266,7 @@ impl CenteredFiberData { Some(quals) => quals.to_vec(), None => vec![0; starts.len()], }; - + let mut write_count = 0; for ((&st, &en), &qual) in starts.iter().zip(ends.iter()).zip(quals.iter()) { let Some(st) = st else { continue; @@ -291,8 +291,10 @@ impl CenteredFiberData { format_args!("{}\t{}\t{}\t{}\n", t, st, en, qual) ) .unwrap(); + write_count += 1; } } + log::debug!("{}: {}", t, write_count); } rtn @@ -311,7 +313,7 @@ pub fn center( let total = fiber_data.len(); let mut seen = 0; - fiber_data + let to_write: Vec = fiber_data .into_par_iter() .map(|fiber| { match CenteredFiberData::new( @@ -331,13 +333,22 @@ pub fn center( None => "".to_string(), } }) - .collect::>() - .iter() .filter(|x| !x.is_empty()) - .for_each(|x| { - seen += 1; - write_to_file(x, buffer); - }); + .collect::>(); + + for line in to_write { + seen += 1; + write_to_file(&line, buffer); + } + + log::debug!( + "centering {} records of {} on {}:{}:{}", + seen, + total, + center_position.chrom, + center_position.position, + center_position.strand + ); if total - seen > 1 { log::warn!( diff --git a/src/subcommands/extract.rs b/src/subcommands/extract.rs index 16a96928..bb0b3497 100644 --- a/src/subcommands/extract.rs +++ b/src/subcommands/extract.rs @@ -61,65 +61,50 @@ impl FiberOut { } pub fn process_bam_chunk(fiber_data: Vec, out_files: &mut FiberOut) { - match &mut out_files.m6a { - Some(m6a) => { - let out: Vec = fiber_data - .par_iter() - .map(|r| r.write_m6a(out_files.reference)) - .collect(); - for line in out { - write_to_file(&line, m6a); - } + if let Some(m6a) = &mut out_files.m6a { + let out: Vec = fiber_data + .par_iter() + .map(|r| r.write_m6a(out_files.reference)) + .collect(); + for line in out { + write_to_file(&line, m6a); } - None => {} } - match &mut out_files.cpg { - Some(cpg) => { - let out: Vec = fiber_data - .par_iter() - .map(|r| r.write_cpg(out_files.reference)) - .collect(); - for line in out { - write_to_file(&line, cpg); - } + if let Some(cpg) = &mut out_files.cpg { + let out: Vec = fiber_data + .par_iter() + .map(|r| r.write_cpg(out_files.reference)) + .collect(); + for line in out { + write_to_file(&line, cpg); } - None => {} } - match &mut out_files.msp { - Some(msp) => { - let out: Vec = fiber_data - .par_iter() - .map(|r| r.write_msp(out_files.reference)) - .collect(); - for line in out { - write_to_file(&line, msp); - } + if let Some(msp) = &mut out_files.msp { + let out: Vec = fiber_data + .par_iter() + .map(|r| r.write_msp(out_files.reference)) + .collect(); + for line in out { + write_to_file(&line, msp); } - None => {} } - match &mut out_files.nuc { - Some(nuc) => { - let out: Vec = fiber_data - .par_iter() - .map(|r| r.write_nuc(out_files.reference)) - .collect(); - for line in out { - write_to_file(&line, nuc); - } + if let Some(nuc) = &mut out_files.nuc { + let out: Vec = fiber_data + .par_iter() + .map(|r| r.write_nuc(out_files.reference)) + .collect(); + for line in out { + write_to_file(&line, nuc); } - None => {} } - match &mut out_files.all { - Some(all) => { - let out: Vec = fiber_data - .par_iter() - .map(|r| r.write_all(out_files.simplify, out_files.quality)) - .collect(); - for line in out { - write_to_file(&line, all); - } + if let Some(all) = &mut out_files.all { + let out: Vec = fiber_data + .par_iter() + .map(|r| r.write_all(out_files.simplify, out_files.quality)) + .collect(); + for line in out { + write_to_file(&line, all); } - None => {} } } @@ -138,16 +123,13 @@ pub fn extract_contained(extract_opts: &mut ExtractOptions) { .unwrap(); // print the header if in all mode - match &mut out_files.all { - Some(all) => { - write!( - all, - "{}", - FiberseqData::all_header(out_files.simplify, out_files.quality) - ) - .unwrap(); - } - None => {} + if let Some(all) = &mut out_files.all { + write!( + all, + "{}", + FiberseqData::all_header(out_files.simplify, out_files.quality) + ) + .unwrap(); } // process bam in chunks diff --git a/src/subcommands/footprint.rs b/src/subcommands/footprint.rs index 25a6bbc1..1444ffc7 100644 --- a/src/subcommands/footprint.rs +++ b/src/subcommands/footprint.rs @@ -197,17 +197,14 @@ impl<'a> Footprint<'a> { fn establish_footprints(&mut self) { for (fiber, has_msp) in self.fibers.iter().zip(self.has_spanning_msp.iter()) { - let mut footprint_code = 0; - if *has_msp { - footprint_code = self.footprint_code(fiber); - } + let footprint_code = self.footprint_code(fiber, has_msp); self.footprint_codes.push(footprint_code); } } - fn footprint_code(&self, fiber: &FiberseqData) -> u16 { + fn footprint_code(&self, fiber: &FiberseqData, has_spanning_msp: &bool) -> u16 { + let mut footprint_code = if *has_spanning_msp { 1 << 0 } else { 0 }; // denote that we have a spanning msp - let mut footprint_code = 1 << 0; let motif_end = self.motif.footprint.max_pos(); // make a binary vector over the motif indicating the presence of an m6a let mut m6a_vec = vec![false; motif_end]; diff --git a/src/subcommands/pileup.rs b/src/subcommands/pileup.rs index 7c3e8457..9127ca84 100644 --- a/src/subcommands/pileup.rs +++ b/src/subcommands/pileup.rs @@ -3,9 +3,11 @@ use crate::fiber::FiberseqData; use crate::utils::bamranges; use crate::utils::bio_io; use crate::*; -use anyhow::{self, Ok}; -use rayon::prelude::*; +use anyhow::{anyhow, Ok}; +use std::collections::HashMap; +use std::io::BufRead; //use polars::prelude::*; +use ordered_float::NotNan; use rust_htslib::bam::ext::BamRecordExtensions; use rust_htslib::bam::{FetchDefinition, IndexedReader}; @@ -55,9 +57,15 @@ impl PartialEq for FireRow<'_> { impl std::fmt::Display for FireRow<'_> { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { let mut rtn = format!( - "\t{}\t{}\t{}\t{}\t{}", - self.coverage, self.fire_coverage, self.score, self.nuc_coverage, self.msp_coverage + "\t{}\t{}\t{}", + self.coverage, self.fire_coverage, self.score ); + if !self.pileup_opts.no_nuc { + rtn += &format!("\t{}", self.nuc_coverage); + } + if !self.pileup_opts.no_msp { + rtn += &format!("\t{}", self.msp_coverage); + } if self.pileup_opts.m6a { rtn += &format!("\t{}", self.m6a_coverage); } @@ -68,6 +76,73 @@ impl std::fmt::Display for FireRow<'_> { } } +pub struct ShuffledFibers { + pub shuffled_fiber_starts: HashMap<(String, String, i64), i64>, +} + +impl ShuffledFibers { + pub fn new(file_path: &str) -> Result { + let buffer = bio_io::buffer_from(file_path)?; + let mut shuffled_fiber_starts = HashMap::new(); + for line in buffer.lines() { + let line = line?; + if line.starts_with('#') { + continue; + } + let mut parts = line.split('\t'); + // error if there are not at least 4 parts + let chrom = parts.next().ok_or(anyhow!("missing chrom"))?; + let start = parts + .next() + .ok_or(anyhow!("missing fiber start"))? + .parse::()?; + let _end = parts + .next() + .ok_or(anyhow!("missing fiber end"))? + .parse::()?; + let fiber_name = parts.next().ok_or(anyhow!("missing fiber name"))?; + let original_start = parts + .next() + .ok_or(anyhow!("missing original start"))? + .parse::()?; + shuffled_fiber_starts.insert( + (chrom.to_string(), fiber_name.to_string(), original_start), + start, + ); + } + log::info!("Read {} shuffled fibers", shuffled_fiber_starts.len()); + Ok(Self { + shuffled_fiber_starts, + }) + } + + pub fn get_shuffled_start(&self, fiber: &FiberseqData) -> Option { + let target_name = fiber.target_name.clone(); + let fiber_name = fiber.get_qname(); + self.shuffled_fiber_starts + .get(&(target_name, fiber_name, fiber.record.reference_start())) + .copied() + } + + pub fn has_fiber(&self, fiber: &FiberseqData) -> bool { + let target_name = fiber.target_name.clone(); + let fiber_name = fiber.get_qname(); + self.shuffled_fiber_starts.contains_key(&( + target_name, + fiber_name, + fiber.record.reference_start(), + )) + } + + /// to get the shuffle offset which we ADD(+) to features of the fiber + /// to create a shuffled version of the fiber + pub fn get_shuffle_offset(&self, fiber: &FiberseqData) -> Option { + let start = fiber.record.reference_start(); + let shuffled_start = self.get_shuffled_start(fiber)?; + Some(shuffled_start - start) + } +} + pub struct FireTrack<'a> { pub chrom_start: usize, pub chrom_end: usize, @@ -81,10 +156,17 @@ pub struct FireTrack<'a> { pub cpg_coverage: Vec, pub m6a_coverage: Vec, pileup_opts: &'a PileupOptions, + shuffled_fibers: &'a Option, + cur_offset: i64, } impl<'a> FireTrack<'a> { - pub fn new(chrom_start: usize, chrom_end: usize, pileup_opts: &'a PileupOptions) -> Self { + pub fn new( + chrom_start: usize, + chrom_end: usize, + pileup_opts: &'a PileupOptions, + shuffled_fibers: &'a Option, + ) -> Self { let track_len = chrom_end - chrom_start + 1; let raw_scores = vec![-1.0; track_len]; let scores = vec![-1.0; track_len]; @@ -101,18 +183,24 @@ impl<'a> FireTrack<'a> { cpg_coverage: vec![0; track_len], m6a_coverage: vec![0; track_len], pileup_opts, + shuffled_fibers, + cur_offset: 0, } } #[inline] - fn add_range_set(array: &mut [i32], ranges: &bamranges::Ranges, offset: usize) { - let offset = offset as i64; + fn add_range_set( + array: &mut [i32], + ranges: &bamranges::Ranges, + cur_offset: i64, + chrom_start: usize, + ) { for (_, _, _, _, r) in ranges { match r { Some((rs, re, _)) => { let re = if rs == re { re + 1 } else { re }; for i in rs..re { - let pos = i - offset; + let pos = i + cur_offset - chrom_start as i64; // skip if pos is before the start of the track if pos < 0 || pos >= array.len() as i64 { continue; @@ -125,11 +213,70 @@ impl<'a> FireTrack<'a> { } } + fn fiber_start_and_end(&self, fiber: &FiberseqData) -> (i64, i64) { + if !self.pileup_opts.fiber_coverage { + return ( + fiber.record.reference_start() + self.cur_offset, + fiber.record.reference_end() + self.cur_offset, + ); + } + let mut start = i64::MAX; + let mut end = i64::MIN; + for (_, _, _, _, r) in &fiber.msp { + match r { + Some((rs, re, _)) => { + start = std::cmp::min(start, rs); + end = std::cmp::max(end, re); + } + _ => continue, + } + } + for (_, _, _, _, r) in &fiber.nuc { + match r { + Some((rs, re, _)) => { + start = std::cmp::min(start, rs); + end = std::cmp::max(end, re); + } + _ => continue, + } + } + if start == i64::MAX { + start = fiber.record.reference_start(); + } + if end == i64::MIN { + end = fiber.record.reference_end(); + } + (start + self.cur_offset, end + self.cur_offset) + } + /// inline this function #[inline] fn update_with_fiber(&mut self, fiber: &FiberseqData) { + // skip this fiber if it has no MSP/NUC information + // and we are looking at fiber_coverage + if self.pileup_opts.fiber_coverage + && fiber.msp.reference_starts.is_empty() + && fiber.nuc.reference_starts.is_empty() + { + return; + } + + // find the offset if we are shuffling data + self.cur_offset = match self.shuffled_fibers { + Some(shuffled_fibers) => match shuffled_fibers.get_shuffle_offset(fiber) { + Some(offset) => offset, + None => return, // skip missing fiber if it is not in the shuffle + }, + None => 0, + }; + + if self.cur_offset != 0 && self.chrom_start != 0 { + panic!("Cannot apply shuffling unless the entire chromosome is being read at once."); + } + + let (start, end) = self.fiber_start_and_end(fiber); // calculate the coverage - for i in fiber.record.reference_start()..fiber.record.reference_end() { + for i in start..end { let pos = i - self.chrom_start as i64; if pos < 0 || pos >= self.track_len as i64 { continue; @@ -146,7 +293,7 @@ impl<'a> FireTrack<'a> { } let score_update = (1.0 - q as f32 / 255.0).log10() * -50.0; for i in rs..re { - let pos = i - self.chrom_start as i64; + let pos = i + self.cur_offset - self.chrom_start as i64; if pos < 0 || pos >= self.track_len as i64 { continue; } @@ -158,19 +305,34 @@ impl<'a> FireTrack<'a> { } } - Self::add_range_set(&mut self.nuc_coverage, &fiber.nuc, self.chrom_start); - Self::add_range_set(&mut self.msp_coverage, &fiber.msp, self.chrom_start); + // add other sets of data to the FireTrack depending on CLI opts + let mut pairs = vec![]; + if !self.pileup_opts.no_nuc { + pairs.push((&mut self.nuc_coverage, &fiber.nuc)); + } + if !self.pileup_opts.no_msp { + pairs.push((&mut self.msp_coverage, &fiber.msp)); + } if self.pileup_opts.m6a { - Self::add_range_set(&mut self.m6a_coverage, &fiber.m6a, self.chrom_start); + pairs.push((&mut self.m6a_coverage, &fiber.m6a)); } if self.pileup_opts.cpg { - Self::add_range_set(&mut self.cpg_coverage, &fiber.cpg, self.chrom_start); + pairs.push((&mut self.cpg_coverage, &fiber.cpg)); + } + + for (array, ranges) in pairs { + Self::add_range_set(array, ranges, self.cur_offset, self.chrom_start); } } pub fn calculate_scores(&mut self) { for i in 0..self.track_len { - if self.fire_coverage[i] < MIN_FIRE_COVERAGE { + if self.fire_coverage[i] <= 0 { + self.scores[i] = -1.0; + } else if self.fire_coverage[i] < MIN_FIRE_COVERAGE + && self.pileup_opts.shuffle.is_none() + { + // there is no minimum fire coverage if we are shuffling self.scores[i] = -1.0; } else { self.scores[i] = self.raw_scores[i] / self.coverage[i] as f32; @@ -178,6 +340,25 @@ impl<'a> FireTrack<'a> { } } + pub fn calculate_rolling_max_score(&mut self) -> Vec { + let mut rolling_max = vec![-1.0; self.track_len]; + let window_size = self.pileup_opts.rolling_max.unwrap(); + let look_back = window_size / 2; + for (i, cur_roll_max) in rolling_max.iter_mut().enumerate().take(self.track_len) { + let start = if i < look_back { 0 } else { i - look_back }; + let mut end = i + look_back; + if end > self.track_len { + end = self.track_len; + } + let relevant_scores = &self.scores[start..end]; + *cur_roll_max = *relevant_scores + .iter() + .max_by_key(|x| NotNan::new(**x).unwrap()) + .unwrap_or(&-1.0); + } + rolling_max + } + pub fn row(&self, i: usize) -> FireRow { FireRow { score: &self.scores[i], @@ -194,14 +375,17 @@ impl<'a> FireTrack<'a> { pub struct FiberseqPileup<'a> { pub all_data: FireTrack<'a>, - pub hap1_data: FireTrack<'a>, - pub hap2_data: FireTrack<'a>, + pub hap1_data: Option>, + pub hap2_data: Option>, + pub shuffled_data: Option>, pub chrom: String, pub chrom_start: usize, pub chrom_end: usize, pub track_len: usize, has_data: bool, pileup_opts: &'a PileupOptions, + shuffled_fibers: &'a Option, + rolling_max: Option>, } impl<'a> FiberseqPileup<'a> { @@ -210,21 +394,43 @@ impl<'a> FiberseqPileup<'a> { chrom_start: usize, chrom_end: usize, pileup_opts: &'a PileupOptions, + shuffled_fibers: &'a Option, ) -> Self { let track_len = chrom_end - chrom_start + 1; - let all_data = FireTrack::new(chrom_start, chrom_end, pileup_opts); - let hap1_data = FireTrack::new(chrom_start, chrom_end, pileup_opts); - let hap2_data = FireTrack::new(chrom_start, chrom_end, pileup_opts); + let all_data = FireTrack::new(chrom_start, chrom_end, pileup_opts, &None); + let (hap1_data, hap2_data) = if pileup_opts.haps { + ( + Some(FireTrack::new(chrom_start, chrom_end, pileup_opts, &None)), + Some(FireTrack::new(chrom_start, chrom_end, pileup_opts, &None)), + ) + } else { + (None, None) + }; + + let shuffled_data = if shuffled_fibers.is_some() { + Some(FireTrack::new( + chrom_start, + chrom_end, + pileup_opts, + shuffled_fibers, + )) + } else { + None + }; + Self { all_data, hap1_data, hap2_data, + shuffled_data, chrom: chrom.to_string(), chrom_start, chrom_end, track_len, has_data: false, pileup_opts, + shuffled_fibers, + rolling_max: None, } } @@ -244,19 +450,38 @@ impl<'a> FiberseqPileup<'a> { .into_iter() .map(|r| r.collect::>()) .for_each(|r| { - let fibers: Vec = r - .par_iter() - .map(|r| FiberseqData::new(r.clone(), None, &self.pileup_opts.input.filters)) - .collect(); + let fibers: Vec = FiberseqData::from_records( + r, + &self.pileup_opts.input.header_view(), + &self.pileup_opts.input.filters, + ); if !fibers.is_empty() { self.has_data = true; } for fiber in fibers { + // skip if the fiber was unable to be shuffled + if self.shuffled_fibers.is_some() + && !self.shuffled_fibers.as_ref().unwrap().has_fiber(&fiber) + { + continue; + } + self.all_data.update_with_fiber(&fiber); - if fiber.get_hp() == "H1" && self.pileup_opts.haps { - self.hap1_data.update_with_fiber(&fiber); - } else if fiber.get_hp() == "H2" && self.pileup_opts.haps { - self.hap2_data.update_with_fiber(&fiber); + // add hap1 data + if let Some(hap1_data) = &mut self.hap1_data { + if fiber.get_hp() == "H1" { + hap1_data.update_with_fiber(&fiber); + } + } + // add hap2 data + if let Some(hap2_data) = &mut self.hap2_data { + if fiber.get_hp() == "H2" { + hap2_data.update_with_fiber(&fiber); + } + } + // add shuffled data + if let Some(shuffled_data) = &mut self.shuffled_data { + shuffled_data.update_with_fiber(&fiber); } } }); @@ -267,31 +492,56 @@ impl<'a> FiberseqPileup<'a> { pub fn header(pileup_opts: &PileupOptions) -> String { let mut header = format!("{}\t{}\t{}", "#chrom", "start", "end"); - let haps = if pileup_opts.haps { - vec!["", "_H1", "_H2"] - } else { - vec![""] - }; - for hap in haps { + let mut suffixes = vec![""]; + if pileup_opts.haps { + suffixes.push("_H1"); + suffixes.push("_H2"); + } + if pileup_opts.shuffle.is_some() { + suffixes.push("_shuffled"); + } + + for suffix in suffixes { header += &format!( - "\t{}{hap}\t{}{hap}\t{}{hap}\t{}{hap}\t{}{hap}", - "coverage", "fire_coverage", "score", "nuc_coverage", "msp_coverage", + "\t{}{suffix}\t{}{suffix}\t{}{suffix}", + "coverage", "fire_coverage", "score", ); + if !pileup_opts.no_nuc { + header += &format!("\t{}{suffix}", "nuc_coverage"); + } + if !pileup_opts.no_msp { + header += &format!("\t{}{suffix}", "msp_coverage"); + } if pileup_opts.m6a { - header += &format!("\t{}{hap}", "m6a_coverage"); + header += &format!("\t{}{suffix}", "m6a_coverage"); } if pileup_opts.cpg { - header += &format!("\t{}{hap}", "cpg_coverage"); + header += &format!("\t{}{suffix}", "cpg_coverage"); } } + if pileup_opts.rolling_max.is_some() { + header += "\trolling_max"; + } header += "\n"; header } fn calculate_scores(&mut self) { self.all_data.calculate_scores(); - self.hap1_data.calculate_scores(); - self.hap2_data.calculate_scores(); + // calculate rolling max + if self.pileup_opts.rolling_max.is_some() { + self.rolling_max = Some(self.all_data.calculate_rolling_max_score()); + } + // scores for other tracks + if let Some(hap1_data) = &mut self.hap1_data { + hap1_data.calculate_scores(); + } + if let Some(hap2_data) = &mut self.hap2_data { + hap2_data.calculate_scores(); + } + if let Some(shuffled_data) = &mut self.shuffled_data { + shuffled_data.calculate_scores(); + } } /// check if the ith row has the same data as the previous row @@ -302,13 +552,22 @@ impl<'a> FiberseqPileup<'a> { true } else { let total_same = self.all_data.row(i) == self.all_data.row(i - 1); - let hap1_same = self.hap1_data.row(i) == self.hap1_data.row(i - 1); - let hap2_same = self.hap2_data.row(i) == self.hap2_data.row(i - 1); - if self.pileup_opts.haps { - total_same && hap1_same && hap2_same + let haps_same = if self.pileup_opts.haps { + let h1 = self.hap1_data.as_ref().unwrap(); + let h2 = self.hap2_data.as_ref().unwrap(); + let hap1_same = h1.row(i) == h1.row(i - 1); + let hap2_same = h2.row(i) == h2.row(i - 1); + hap1_same && hap2_same } else { - total_same - } + true + }; + let shuffled_same = if self.shuffled_data.is_some() { + self.shuffled_data.as_ref().unwrap().row(i) + == self.shuffled_data.as_ref().unwrap().row(i - 1) + } else { + true + }; + total_same && haps_same && shuffled_same } } @@ -320,10 +579,35 @@ impl<'a> FiberseqPileup<'a> { self.is_same_as_previous(i) && i != self.track_len - 1 } + pub fn log_stats(&self) { + let mut data_tracks = vec![&self.all_data]; + if self.pileup_opts.haps { + data_tracks.push(self.hap1_data.as_ref().unwrap()); + data_tracks.push(self.hap2_data.as_ref().unwrap()); + } + if self.shuffled_data.is_some() { + data_tracks.push(self.shuffled_data.as_ref().unwrap()); + } + for data in data_tracks { + let total_coverage: i64 = data.coverage.iter().map(|x| *x as i64).sum(); + let total_fire_coverage: i64 = data.fire_coverage.iter().map(|x| *x as i64).sum(); + let total_score: f64 = data.scores.iter().map(|x| *x as f64).sum(); + log::info!( + "Total coverage: {}, Total fire coverage: {}, Total score: {}", + total_coverage, + total_fire_coverage, + total_score + ); + } + } + pub fn write(&self, out: &mut Box) -> Result<(), anyhow::Error> { if !self.has_data { return Ok(()); } + if self.shuffled_data.is_some() { + self.log_stats(); + } let mut write_start_index = 0; let mut write_end_index = 1; for i in 1..self.track_len { @@ -337,17 +621,31 @@ impl<'a> FiberseqPileup<'a> { write_start_index + self.chrom_start, write_end_index + self.chrom_start ); - // add in the data - let hap_data = if self.pileup_opts.haps { - vec![&self.all_data, &self.hap1_data, &self.hap2_data] - } else { - vec![&self.all_data] - }; - for data in hap_data { + + let mut data_tracks = vec![&self.all_data]; + if self.pileup_opts.haps { + data_tracks.push(self.hap1_data.as_ref().unwrap()); + data_tracks.push(self.hap2_data.as_ref().unwrap()); + } + if self.shuffled_data.is_some() { + data_tracks.push(self.shuffled_data.as_ref().unwrap()); + } + + for data in data_tracks { line += data.row(write_start_index).to_string().as_str(); } + if self.pileup_opts.rolling_max.is_some() { + line += &format!( + "\t{}", + self.rolling_max.as_ref().unwrap()[write_start_index] + ); + } // don't write empty lines unless keep_zeros is set - if self.pileup_opts.keep_zeros || self.all_data.coverage[write_start_index] > 0 { + let mut cov = self.all_data.coverage[write_start_index]; + if let Some(shuffled_data) = &self.shuffled_data { + cov += shuffled_data.coverage[write_start_index]; + } + if self.pileup_opts.keep_zeros || cov > 0 { line += "\n"; bio_io::write_to_file(&line, out); } @@ -388,11 +686,19 @@ fn run_rgn( bam: &mut IndexedReader, out: &mut Box, pileup_opts: &PileupOptions, + shuffled_fibers: &Option, ) -> Result<(), anyhow::Error> { let tid = bam.header().tid(chrom.as_bytes()).unwrap(); let chrom_len = bam.header().target_len(tid).unwrap() as i64; - let windows = split_fetch_definition(&rgn, chrom_len as usize, WINDOW_SIZE); + let window_size = if shuffled_fibers.is_some() { + (chrom_len + 1) as usize + } else { + WINDOW_SIZE + }; + log::info!("Window size on {}: {}", chrom, window_size); + + let windows = split_fetch_definition(&rgn, chrom_len as usize, window_size); log::debug!("Splitting {} into {} windows", chrom, windows.len()); for (chrom_start, mut chrom_end) in windows { if chrom_start >= chrom_len { @@ -417,8 +723,13 @@ fn run_rgn( chrom_start, chrom_end ); - let mut pileup = - FiberseqPileup::new(chrom, chrom_start as usize, chrom_end as usize, pileup_opts); + let mut pileup = FiberseqPileup::new( + chrom, + chrom_start as usize, + chrom_end as usize, + pileup_opts, + shuffled_fibers, + ); pileup.add_records(records)?; pileup.write(out)?; } @@ -436,11 +747,23 @@ pub fn pileup_track(pileup_opts: &mut PileupOptions) -> Result<(), anyhow::Error // add the header out.write_all(FiberseqPileup::header(pileup_opts).as_bytes())?; + let shuffled_fibers = match &pileup_opts.shuffle { + Some(file_path) => Some(ShuffledFibers::new(file_path)?), + None => None, + }; + match &pileup_opts.rgn { // if a region is specified, only process that region Some(rgn) => { let (rgn, chrom) = region_parser(rgn); - run_rgn(&chrom, rgn, &mut bam, &mut out, pileup_opts)?; + run_rgn( + &chrom, + rgn, + &mut bam, + &mut out, + pileup_opts, + &shuffled_fibers, + )?; } // if no region is specified, process all regions None => { @@ -452,6 +775,7 @@ pub fn pileup_track(pileup_opts: &mut PileupOptions) -> Result<(), anyhow::Error &mut bam, &mut out, pileup_opts, + &shuffled_fibers, )?; } } diff --git a/src/subcommands/predict_m6a.rs b/src/subcommands/predict_m6a.rs index fb98cce5..ab50657a 100644 --- a/src/subcommands/predict_m6a.rs +++ b/src/subcommands/predict_m6a.rs @@ -265,6 +265,9 @@ where positions: &[usize], base_mod: &str, ) -> basemods::BaseMod { + // do not report predictions for the first and last 7 bases + let min_pos = (WINDOW / 2) as i64; + let max_pos = (record.seq_len() - WINDOW / 2) as i64; let (modified_probabilities_forward, full_probabilities_forward, modified_bases_forward): ( Vec, Vec, @@ -273,7 +276,7 @@ where .iter() .zip(positions.iter()) .map(|(&x, &pos)| (self.float_to_u8(x), x, pos as i64)) - .filter(|(ml, _, _)| *ml >= self.min_ml_value()) + .filter(|(ml, _, pos)| *ml >= self.min_ml_value() && *pos >= min_pos && *pos < max_pos) .multiunzip(); log::debug!( diff --git a/src/subcommands/qc.rs b/src/subcommands/qc.rs index aa7c796a..6f43994c 100644 --- a/src/subcommands/qc.rs +++ b/src/subcommands/qc.rs @@ -295,6 +295,13 @@ impl<'a> QcStats<'a> { pub fn write(&self, out: &mut Box) -> Result<(), anyhow::Error> { // write the header out.write_all(b"statistic\tvalue\tcount\n")?; + // write the phasing information + for f in &[ + (&self.phased_reads, "phased_reads"), + (&self.phased_bp, "phased_bp"), + ] { + out.write_all(Self::hashmap_to_string(f.0, f.1).as_bytes())?; + } // write the integers for x in &[ (&self.fiber_lengths, "fiber_length"), @@ -316,13 +323,6 @@ impl<'a> QcStats<'a> { ] { out.write_all(Self::hashmap_to_string(f.0, f.1).as_bytes())?; } - // write the phasing information - for f in &[ - (&self.phased_reads, "phased_reads"), - (&self.phased_bp, "phased_bp"), - ] { - out.write_all(Self::hashmap_to_string(f.0, f.1).as_bytes())?; - } // write the m6a per msp size out.write_all( Self::hashmap_to_string(&self.m6a_per_msp_size, "m6a_per_msp_size").as_bytes(), diff --git a/src/subcommands/strip_basemods.rs b/src/subcommands/strip_basemods.rs index a6525480..7138f564 100644 --- a/src/subcommands/strip_basemods.rs +++ b/src/subcommands/strip_basemods.rs @@ -35,6 +35,12 @@ pub fn strip_base_mods(opts: &mut StripBasemodsOptions) { if opts.drop_reverse { data.drop_reverse(); } + if opts.ml_m6a > 0 { + data.filter_m6a(opts.ml_m6a); + } + if opts.ml_5mc > 0 { + data.filter_5mc(opts.ml_5mc); + } data.add_mm_and_ml_tags(record); record diff --git a/src/utils/bamranges.rs b/src/utils/bamranges.rs index 174fba84..e522b9d4 100644 --- a/src/utils/bamranges.rs +++ b/src/utils/bamranges.rs @@ -144,6 +144,24 @@ impl Ranges { forward } + // filter out ranges that are less than the passed quality score + pub fn filter_by_qual(&mut self, min_qual: u8) { + let to_keep = self + .qual + .iter() + .enumerate() + .filter_map(|(i, &q)| if q >= min_qual { Some(i) } else { None }) + .collect::>(); + + self.starts = to_keep.iter().map(|&i| self.starts[i]).collect(); + self.ends = to_keep.iter().map(|&i| self.ends[i]).collect(); + self.lengths = to_keep.iter().map(|&i| self.lengths[i]).collect(); + self.qual = to_keep.iter().map(|&i| self.qual[i]).collect(); + self.reference_starts = to_keep.iter().map(|&i| self.reference_starts[i]).collect(); + self.reference_ends = to_keep.iter().map(|&i| self.reference_ends[i]).collect(); + self.reference_lengths = to_keep.iter().map(|&i| self.reference_lengths[i]).collect(); + } + pub fn to_strings(&self, reference: bool, skip_none: bool) -> Vec { let (s, e, l, q) = if reference { ( diff --git a/src/utils/basemods.rs b/src/utils/basemods.rs index c9e96afd..770eac9f 100644 --- a/src/utils/basemods.rs +++ b/src/utils/basemods.rs @@ -241,6 +241,22 @@ impl BaseMods { self.base_mods.retain(|bm| bm.strand == '+'); } + /// drop m6A modifications with a qual less than the min_ml_score + pub fn filter_m6a(&mut self, min_ml_score: u8) { + self.base_mods + .iter_mut() + .filter(|bm| bm.is_m6a()) + .for_each(|bm| bm.ranges.filter_by_qual(min_ml_score)); + } + + /// drop 5mC modifications with a qual less than the min_ml_score + pub fn filter_5mc(&mut self, min_ml_score: u8) { + self.base_mods + .iter_mut() + .filter(|bm| bm.is_cpg()) + .for_each(|bm| bm.ranges.filter_by_qual(min_ml_score)); + } + /// combine the forward and reverse m6a data pub fn m6a(&self) -> Ranges { let ranges = self diff --git a/src/utils/nucleosome.rs b/src/utils/nucleosome.rs index 945e8aa2..d0cefafb 100644 --- a/src/utils/nucleosome.rs +++ b/src/utils/nucleosome.rs @@ -3,11 +3,6 @@ use rust_htslib::{ bam, bam::record::{Aux, AuxArray}, }; -pub static NUC_LEN: &str = "75"; -pub static COMBO_NUC_LEN: &str = "100"; -pub static MIN_DIST_ADDED: &str = "25"; -pub static DIST_FROM_END: &str = "45"; -pub static ALLOWED_SKIPS: &str = "-1"; pub fn find_nucleosomes(m6a: &[i64], options: &NucleosomeParameters) -> Vec<(i64, i64)> { let mut nucs = vec![]; @@ -107,7 +102,7 @@ pub fn d_segment_nucleosomes(m6a: &[i64], options: &NucleosomeParameters) -> Vec let mut pre_m6a = -1; let mut scores = vec![]; for &cur_m6a in m6a { - let m6a_clear_stretch = cur_m6a - pre_m6a - 1; + let m6a_clear_stretch: i64 = cur_m6a - pre_m6a - 1; scores.push((m6a_clear_stretch, pre_m6a + 1, cur_m6a)); scores.push((-1, cur_m6a, cur_m6a + 1)); pre_m6a = cur_m6a; diff --git a/tests/data/ctcf-footprints.bed.gz b/tests/data/ctcf-footprints.bed.gz index 4665c34ba9b206ba291908cbfe3ce528ed560b45..86ca9c38d8e81bcb51d756334b03ebc2c24f3ebc 100644 GIT binary patch literal 14091 zcmV+mH}uFKiwFb&00000{{{d;LjnMl08NfT4#FT1Mb~%@*CcJJY3gMN!k{ED;0$%oIjAjQWT1H z&bGKnJRp%+MJObfSCYHajYjg{|L4Dfqug?n*XQ8)*MY&YeoP&RII0iwAdj(Q=7`hJ z+;it#JK;RzWS?;&PuLk}oN)YHcwHpVIIqMwH&32%_MA~?oPGj0e||hp;L7OdZv5os z=gtYM$L%fs!B6tEXO*uC`jyXBbzHTm{%qp2 z*(dX=w*PK096!&D*BSF?_u+ZB|2(~)p0Mj-oN?mk(VlxN4_A}pQA_gh?DH?2o;?4; z?%7uc$y+I&aWCMy$8#P@=Uth1t;)`O zjcym7M~`Zj-M-|0=5l?1Wp}_*`EFWKK+yEp3S{GuTt*K+{Y_mk1pw3-JdVU?XeyI z7t@;m+W~j~-|%e5|3Xe#h4ugX_rL$|^BoU{6QZt| z$}aWz+N^fNM0*2t8&!&@SDPHgv1Zs+T~ZcZ?W*T{yhpqO9qrpI>Pi^OsGeT8p~6K~ z6IJG(>e0kbmYOLncaf<^1rlbtsVd7EuMoMk6P0Sdaf}yLgPk^sRucuf05!j6{q&ls zY1nwki7^)hLX;9*dYPJyI%W@o|BAr%kE()nZ*oJ;X zHm5^n0;UgDkGL1pMTbP#cBxmjnq#h?E+?7d*-eEn<(MfiOF8$pVKysdCT-|c&#smm zavt%T?SplTr<= z)_P4PN$Ym+66LffRt_!Suw2z%e>d{!(Osjn|(|uYMlg0#E^{Yn=)Iz^qm~6EUG~JsDI$+w?j+}jm z*(+CA%h;uQ(-2ils`q5FGW710GwxKKf@qI~#0EB*Erl#Jo#m>Stt~kNhoNZdjYX{! z-FoOmrFc5+9IV`DStqikTmnqv%a*}sTJ?N3X4S+vp2<>swZXcF+EWb5r`5RG*s;-j zDA>^Cz)8b|+$C2{3|*BIY3`<$zNIyhvY6^DC901l(v&FYW>uzE3LvUkm8vyppMru1 zwPOr>ha=!@4sbIUbS=JXIq)#JG=OV+ePxkTG0nP!hBkkuf?1(uMNAgHJ;@mu6sxH54 z@}6h!Kee|+_hdiH#XtN!uRQPEA0;Fn`r*5J zbH?e%Q=j#tO#D<%y|j?_P%K?)t$uy`(9yjWcYoKrU7zvL4C0l^K6F|0ar=*IYtDaC z7rsyqR+*n-nz>YnJ*NWPVpp7E%sy%nQB*Zi*WdsTiv&c52R zZ=K@)vjX>_jeI?Un?kRy-sk<1yxY3(9Oikh-xcbc(y(93!@u4w{m_W!yNsuXFv&zW#%bk?ve6@g!>rQ!{_2r%5bllN>qoXI5vZ{neIxrpm-w8L5P(Y9E7IKG0_7 zFu6(yrE zEe|nf3d{+x=#+0;GOg5HlJ3Sa5I%u8((P!*%!w#N93Nu3KjXsKFbVH`O# zGGRfJsvH#3jNwHa^r%9;1+Xg1zBp)W?jIeIYI0ZED8R0mo2h3| zrWe(8vUUklVzP*+9iDzj$=Z?|8?7GL8Qjd@J%vLMckKo^~01rpgGF2voxnre{ugs$4-ff*}s#i7| zZ1pb1q*3Zuvxp4YnwkzvlN>k8ZYhj$==79ppck_xg<>XV{L8Dv{lfa{V5#Bd0hn9@ zvv)91OW>khuA`p7wtNLTvou`MTa1wU9dI$4cM_4?Pz&kOdZhMr2c0RmG03Wx1jJ@% zYHCVtU`#6ets^JZ7=|;4a>}7CHA0gdgJeo^5<8OFS!uSCm$2;yNs-71-;e^?cDm$_ z1+tcs2NyGKs~(?6uUpA?fISVDyn5QxM&rN!{eS*Dk1*RLuFvK1uLF%sT~$Aa;Xl85 z#2zunP0uoSUC~_nlDg?#{z*H;-v(uO4NiVqn!U37(dm5ZlU`ZhRa@?7;I8hO6F+*U z-KVPPNh{t!{eu$e(q7$lO}{l${r6h@ zz3Y%4C+~VS?7dVTFB_`3sZs6*Y?so4cLS`)hrLXu-nyBWTIZ$q`kQUoZ~L&P=Bm!O zHQiImbF=Y!nbO(&OFnLX8HK%+WAifax?SkEmgl!Q-a~8kOnEicUg_-BLH47mx{9@% z8Q$~yOP6%-!&@`rw_fh43yiy#?rp4gE9FxIc1s(-9>Ud!_EusIllv|^;Y0Uwp2?^MpBlu z&D@cOVAwINT3aKsAh$r6SWO6vB<1!7R?Li&$XzwdWNQL$Aw?sUTc);WX`2eYd= zRg69dy+_ew+-*j(of8CCvE*{uBkiJUjO|L$W95&IMZ9U9>)L|L(%MamoK}Sj^>BQa zBS~TGvXEMh&b8@Qqr$zrDBng}^l13!M{gwwY}V|C=D1ZyuLjo47$&EaB->1H*J@i^ z@qq1RE*~LdDAm(+mta<;O{-zk9qCdrvb{5Mp1>OAz?-s>=67K>piB`QGPN?dHzY8h z8*pjV*tU@p7qVuof_D?{E)A2@liD0HJ=+Yb6yJ@|dtU-nxO`beL`g-(x}{u&NWIRf>*`k|}mM>dA4UGSrgT-bqP0>4W{f*p3+z)>8ia|79!c3ptYKwc!1%Fakn(1BYKZg4#;D|i=T7vtTm_hJ_Qi-K zrlGaPTZWV^3UhaZ1Z~IBNC~4Xq99w8=*p3Lgp4Vav8^h(3&h4^w$sk7! zza)vyS^LHw>1?U=M_YA%A$SpAa`73>F!?cJk5Gez?kxc|`nohcjSqcjm%w{X>Ap4D2h;*FoVRHaCw6c@vn?yb~#krg91= zhHWX$Nv8*sipnv=Mo9ZDRF$hSw>2n}214EAU!2%1F|BnNIVNRj0NJCh+X31@l1NcD zp*^ZjMfC)fE6XY z5Ruzj+KMvEmNe~P!_sAvWo=5k(%kfHjjB~vxMYtIn@Ej$dV9CzR?=(RG>blBLL0l2 zy^xAa^tK^JwP-BeMjEk>xtI4BCmwps0!uTl+8JdR(y!_GsU3^X;Sm;Hj^nQbUIz?^ z`w??^9^QxV;dMBVhx5dzcgFnXG!ZAxczJ(3fw$vGdwM{8T|99<9ai@H%*lEm_Q^PL zVo#j@)TO=eeca~v+aCqIPthMm`upmu64$Ny$b)fqJ#);n>V*5*C+k)#=dBOwv*ddk zJj?Ur<}|SErtSEV4x=Ba@HJ>!!oGGOSkx-ufoH>%;0( zgBCZ}hwmc!QtrHUGIRGx`F0$ckJB$dlXcU0-PB`cH&xKSntijoy0!JykK42DpQ*X7$`kyT%IRV?w`6g^!9pcCY5H@vpZLN z$E<8h#}a9hyfJ-ksTBG;`}MASTaeIN%@+zTO^c3)L{7J7pEUSa_0e`g+a5H zJn~Czi($E6%rCr4vWz?1rC$i$FMr>{>yr|NdQg;i^xJi}ht~oW7 zW0hsn;|@4mE_tQH%#eDfj`a#k6xlr`H5JXOYRRx;c$u^%TCeDka4ItcqnE9$?U2r! zY9~t0CA||asUAHszH%q+W^`@D9I-VzNsrf3y-Qh})@nhyR(nZ1jIhx=r*go2_5KUd z(uv#bj=S8%#Y}t2#U2d338_M<%H%TT^J)FuGk2(P@;lGQSW) zCG>k6`wIZ#ZNSQvnrqbVT0tjF+rC6^uax>acQAS}Z=aSrwk>x;n^8?=N^`wv+bn5D zZrMI1kz})4?N?{gjdHqYj+aX$%;5FKBL#CjLR!oA#usN&gogx0%ekRxduBr{3;f~0{?Q`bmK+e(XMB@Cs_CC=qTgOwSLpa>u*6*9zfpR^U^%-xM{pN(|>Hp}NyztJlp7rv7IPO~}*IoW@#VeQr)4-t2J?J$-eR1 zJB;}8huisI0PjaTp5657$#*7y;eIFrxqRKm{s&vo?;F4Gdz|mvhW|iT|0>-6C!*_j zf$=|(!oG_&-({BX0?B_MZv4+I^8Y_tTK~UT7XMI8MtDnz9!*Y{4g`{$rD*dxxgAoP zZ|mt%TE-gI7iY*udPD8$oKezIIUUiIZ)1&HwQ8?B;SwU+2`lL?h*i;4#*rB{DMzq# zqhW9n4s{ zpfFqNiyB3?UQfb@)!H@Oj&}6YM3tV@GBHS#yZ-L~=_w^}i%iDAolYVA?3>AN;a zXbqVs4KgxOBpq_-WGsei+sJ05shO`0p#fZvik227?v*J+BLonxeKOF=aFcFPDofJe zo6`jy_l*UYRM%=BbE?+VBB?`)EYnA>1zA<4bJ z*JmXxzS(BdU|J`2}-VAhe5-;*<7DRvZ_A-%`76D*|( zpJI)DaZ(rCzL8RBe9|5BhHmD~hep4=ob=;L+}v{Tv(7hAw{8^D zyw%e_oAW_PGYIgIcSd?WJWA&=^K(D}s&mlZjFp^pS(VkA*zd6u z+PV?7+Dkjq$3p56H=|Qmr97-gEoi+S*DlFu%;#DF1N;1gE^>?lg)ahc9w>wva1%bDz3t9|$vr!?oU znJKMk;jv8h7J1it$!+=sDFe+0%TkZc3U?iSFmhNjRAMA8HDTSy>IC>be}J1B|X1JJRZ$c0gl;@w>7X*^~UG82M5+O>Ln zSLs}=tV;WVu+%}%@K6~kJEj? z`h;CEG_B9w&!e5sQ_m0734fh|^#xwqMbCWP`?xXg$zF#KV&@-*&>tXN#{g|EatFxt^chImZh$E5D(b{lP2!>q&>a@hI;c z%O}?MLi_3w%<~ghtXt~&y!%Q0d}5KWYxRMO+RFyNZ};>q?ZY4Qn>{SVoFjoc6k=A!eF!s@rWJREmD*YDgr!`c3IR&6(^d<~gccBj_fb z@pyp)?YSnpvt(?S<%W)Ci?NY*lSPJy4rL19ODsOiTDC-3+;c{<17H(@FCL<{35i@q z40>fIu-ug42WQKjrCzq&DacoPV!YJTshue8Pm+#c=!t-{kf1OO1wu`+ZC;gfn8Pe8 zCFR%IIW6a;O|w>GblqFiymq4!5?1s;=4!tW9X)bqWNvBFh?C)|4M7(c)^e_>?QJ!> zylXvI^9kFv79*`2#V>TTv2VV*6D7A+*bSFXd3(1i6*QIASrTcD$ZRz{y z)4o_!R9B27%v4b#ph%yU09mi^hL+UWrIlqwM`?V_ZO0@yomO1ww|fx#g^FG6qbdF9 z0z#rR%o&#mT6zaY4RCso_RAd;X)>VIAObQ3?d|~5%w^k)kZ7R!_$WYQ#t|dAzC@I{ zntZEmT)S^HG(wW`udstMa(&0HOL?$o+qJXLs+uhIrg3sO+Pq{Ipd4CeNPEP3|F3`l zpZ~@lSsaja9kj*q*8$4|W=9;bJTebh9$g3fada+jG`IdZ?mfHCuE&MzZ*lwjzUNn} z&%ZdgUa9w%Wt?5-=5-<-rFH$P5wB09a052_mT&*v#P@F5 z%zprR-psb=NB<&Z?JpmIck|=Bm%kefb2lxvcQ$oLh5UxN+^w8{|MjgO@7%TedtbfK zn)%?G*|R}^ppjK|Qo3ERj&2R_rdya40MZKhqMXiL>FKO145q^vm>a(YZ7ENa_&m~W z!bf|n=>39`IJ`_P0K0=|>3}LH>`O46G1R8a1yP+L8Btp{(xzl4J7t5ta!dNLb)w+P zU?gGAUhR5l(XIuWV9Ut0^enEl8}r`H)N55?igJHtev{XwH!NGZ%T^o__mdfSC$z_O;p$}=`wa_!gNTXnlWf643o2Z8_nT2_fu%kqTFX~ z4@;p>8I24%h=~emn@g6Pt1j`PQ+Ip?$0c#9=@VbLDsCLAw?5M0ruuBL5?j1971GdZ zyi!9xR<_cGyUzqeHa2S}bQtVLP=>+w8uyTcrqc;66qad=uQZQclk!59g@#GnLz_9K zSGsGzGBq`>MQnm;mY{D&I5J!S-9?eCOGRt9qOW#)V>OQUX8QxINx}m zz!`NGEpU$9KkAbw*vKTqcTFZjV|&3irMOxyKog#_lu6$9J5C<0BBl zEoye(7$SKKL0z{%-Y*d0!WQg_DA?WWm)#g3^$c@uH|oe8w0K(CLT*2=_niUr z3g_^SBf)nIZakJZmh;R#Uks6VAar~Eya1AV6u^5s^HnQ;;2*E(Go z{}c0z*G#vT+ZvN!986?!WgN7$N+`WVb8JIews~e%LxMQ~OZvNEOr*7xhnp#b4OeHE zbYD8VXzAI*Eyl~gs{N%}aorHDop^S}=a!*uE%jRTG>8K$KYCkl^ z8R}@2or@ajk0H?G;)H}CZ&U@ht`w}&NLlWOAdrkJeSJi5(vnq$&p%*>eB`KqW4b`hPk<`W{ z=*^Of!_M+l@UstA^2lPmL;83h_ncOG!@!P@+Rm;uThjL!v)Z+m*SJe(%Z*I?5=XXY zCHlEOtWq^~+aB$wJlb-lRmH}P%lR7v>P3%UVXGm!2zW@#sAlTzDx#*og4gDydVXrk zC1(Q-#^Ot8i}oqCFRr$64U!vbx3no(he7sQdWPQe&~jHZV^Kb--Sg9Dv>(4F-Jm1f z1WCz=i^y3S~$0gQ`fWtqDxIZf}=o_=T^$5Q#=R^Kr0y{B=wo?Z+IGM;?7-9+L-;sI%-WynLP) z&d783x!cdjhQI2)`G5TG<7<3gdf$R4pim#!!IdE6j5?zpU*UYDc)%iY`+etOXW?<{ zz2sL;uX^2U`uF3VlYT}XFLJ_u_|^H~D?Cy>cj2#qi}l`(e|VVraNM3ZoZ%yyVZZ1U z?`-~hb1R;7hlAcXNJ2BVII(PaNV^NoAmd2w=aM z$)Zxr;kkn@`Qlecxpc@1Rw8vk+XNS`EiGXyJ(gV#phM|WeAYrz$$<7uEhqWHMmx1J zGh0Zxk5ON&hhbHNZ+3^<(5!#!4XyQNrLpwJZ*qfjvum`_RyQ%d^Tjd%rHSXugw)_? zOhw6!ZiRd4&@&srXp}l5SGuFFMs6vESbL(gBpXB6p}g2`pw?%l0S?w6CplU&$-Npfppu;JGc$ zvKv+={TrdvnA%w_Fa}R;FQz79MlTh;C7I*S&CuR`Ymf%H|0F*3Il`zZq++IygFA>^3I$sMYtAj9=L0wY)3M>i~ErNXEqV&4uEuAN)h z79R(T<6j5NkLe>W#|e9Q9L~e%0FHQs_m8u1`q`(Saq_7m?vq!vtG;PzYcJ*P^~1j% z!25h(f1l+u8RJpL`={`_bp84nw@eCGz50qb2Jh2Pct4KgS8eL!W&`HK(!-V4S}hkIv`-=g*Rjd1UGmF;^| zc=PC5ek=F3BaKhj3$Ft6wUc}APH)@RUrt;9?yB|1;s0=^@Y+niV{VD9-}jaC`&#oK z%~$^e)5P~V07ef=XNl=%`$bkaTSzr|1J-(D;SN8-IK48oqw(rW+bj3G+)O!Ms&CoT*iE%qb_YkxsE@xQ(9@b<7rY}M53spf)p)EN2sYGLmHG1%O<3C zMqwL@U=p1k8mN!%8cIGMZPxY6h6>8STRv?K=`mo;eH1j{U8}?IC7s%_#EzwV>WzPN zYtJCIS)={tGS;a*_;!9F2RG18nIf!XrTw(EPngzk8BMBDBE2a+MFWWB=y#sc2!&S$ z!zPr*KDtWNvXox+`kMj zrW;j~DPvRn+enX)6W0Q?4mstRtx?9v3~zWUta!`4q)|)f+DeOk&h(7-%CZ$RG60mp z%Vpe`@R-t(0gOSGq&}LDGv$mDVQHf+2PYYlJL3_%GS*Eys$4!}i$qCB%N0~(Ki9RQAsqv`-~WF5sL`$!(K#}%yR8Fl*w z2V92syaI1uVX|NU0XykebnMsV&c}WJy07+()6W;afPC%z-^qB!d4(i@eWkx~13$g5 zfaWKh*Cn`LHujI|<*l|n<3znfi9ZmOd$XSZucKEwz+S~Xppks0d=&m{0p?0~{DnJN z`|kurL{jU=483B`o>=>lyNT8s%UG1vMrh0U!#m>-vls#Mfnt}D$ zP{~^Zv(PrvaA!!~AZoL0rABBhXTH$KV=Xj^Ik@vB&H%#GM)eyF?pkF&nW3WVU@H2rnIeyMZ;6>QY0cwjb;4rF0!_^eC5xY~I-^^>r4oQ8F8MH#4;? zyd#I-_{6p{Mqkn#(m@7bBL`^ofO0AUitS$lDb7^MFfW(2y31!1MY?uz7K~VGQ^ro| z!mxr}DaU7_rbZh9m4|8aC}Yw_!U@r^pzM85BP7c>*QO=5#mHD!ost&T5{bPWf9n_y ztSE-IL`6u)2R(%xB|DCxTysAy5tQDzGMK|J@u;m~C13kyQ*SZ|B4l8buz57bxMd>xxtZSsI%P|oxy|pC9 zAvuA}sG(d8+-ka3BFhskS8DIrq--%b1sT~q+I&p&<`aye6t@Za3S@?bUP0Yx(ywkU z5H08VjK!|%k%_J^qzJPiS}oL7Z6h)N&a7z-u&q_-NbboE-!Ea^yo_>rEO%qe4VPt6 zdDNPmHYv}+JR@XeWx{);C#&|&)Es}tBvwv)Ua-w;+WW8n2fEM&1&j0m03VA81ONa4 z009360763o0K5$CTuE{y%Z*)!uA(RtX5gJhi@)hjZ@TYq)@B|<2Fa?I)l%(5Ba;~e z+yS59zyq9r{n!5}w#aUAweuo+`}@|mWp5f=+vdJSZCP9C*0!nju2y^JuA1v|eM{}t z`|-AqNB3*&HBgkH7x<-~Hz#JyPaozb46WWGk!0OVUuJ5-r!M zP$47Hn!$e2v*qB;GjIwFc79Et2XNZu-phAN8dv=Nh9lUW96(&fwnW5N}mT7Bo zHozuQRBw>IU={88V6;*s{ZLUs) z&Gv{eCW#23l3ZBvZZJ)c8uXh=(rk#V;T5Q0Gs#=4kzgcKU82oMDq>7X&q6zcEiTtc z1Zi$p#yD%P3EG=&pW2Y9p;Ov=Oy~yMDFI_jO-o~34%PX$ziw+Eo^oDHZa+4a+-f$j zb!FJ1x75wCg>Syw?z`{RyWjTnafR4#Z?pM$Y)6qftNUoq*N)tMPrct= zKZ^!8e?16q8o5YLU*meQiXdK9+yeST;8#=5imt}m*#g`J*VB559d}<;?>f87=?k33 zvOf&v%hJDX_UFUH$Ue*5{sUfS|5dPWBmIL%P$!$qmamiaw&CY#;=jnFZwW4+!C!X% zMSjaxk0#ivqpuRz4Q>K|ISseh^Jv3aW^O(w+ec&Ta#hyv_6xo8){=c|=lq5d^Cxz> z-`KxHy~8je<8= za||k_P1k2fxH_A!gHUNYJT6Y1d=oX{UPz*QytqT;uA@xWoWv+~@KP&H*mOe*!l}0s zt{6B&3QSZ@$uwC0Of_LMBBtG}!J1RkYPGlqK5$qa&u6tEesNmO@8)0xC_Wbyh)~ zIXztn9lkX3olRib*{)6~S}{shwU=8^g;FsCcr!hm6X{f4bPrf#rgXH5HMb*C##~86 zm<;LF832(C97!KBrawC-%t59I)Zh-*A2ROl40lwk>CvW=w7SvBDkfnL7Gqk#T55)wNNr|X*;Gxq4+kwWa{|STLYc5+mb8^=fz6Vs zmQ11AC?es?CbUXHg;035w$TmL8rEQ&(z0J^pSi?XQ@|JLW^6uBrO+<(8aD%Li?o;y z+dj)28g%KUqQO_Ej+O?C*@Tq44V`E;E`d*|2B4VZ!rZqX8?cRR8XK^AZr@f&@R4{*SV5yVIQ;x156pTFBg-jga!%cn$ zNzX4ifr2BX_yMfwGZqApt>d+PFp<4WNNxG7EL?WV`PSzSuj)K(axA~7~O_dT>{MAl5mfrfy&HQr?E`9 z57M1g(gHQQ&!(th-o^MOm}`N_8qn;Ca@R^$iDpmF&|1>Prr2D{fesI8M{2nl(>dNF zluf8Wh1wor(e2F2waKnvAy4x}DtAC^M$&lZN(+L*g(j`jO+AoCLFlwLAUSfWk`+;G zsn@cFn+;Ev2HoGvW`(s`b21}1k{j9u)+e4AG3u8~>ZJ4NYb9ia+RK_+Rm{7e79@*n zyJ=5wL^!)b_&RR1KslOQGju|>XP7Z40^V9h@!qL!P~7Y2SQ-@2OnZYhb?GoRJ&+Qn zW1}ZLn(d<5Ko!R%D~qC|>V-DC&Jz_P6n&FHQPrn8WljZAk4R@L2ih%c1*1IprlV4U zw&4EXj4LP-uMtMW29s$~;7MOSl}y*i=*pC%GYu}(e{Vy$C0)X`V05M_+&#HZ90Sj! zs?owIDpES47^)g&N`qwKrh7X!L(iD8<1$*8Z21goid?=ZC zgXsR;HQM2VFM;M+v)`gGmU=O%S2A+ghct;NlPeMV&YgbpJ}S8$KS`vn&N^=UNVs^k za?yG|`yz3R34CSu>GNq6E)no}EFZ?|VtVz{olSivH$Tyc3p2P8iC41pCk#VQ7$DB8 ztL!ta;i^quh2(j23h{|$Jjgy3cvSMr|L`x;e=2qGc>TN(nfOVOk0N{&=jbkcq45Wj zvhKH^ugmC}W;s{|4ds?ZNQF?BPQ9`*RP*Xc1WlGuuXllf3@^~kH5;FzF&ZtOd6QyZ z5uR^Mr%9thd&w?iJ#Vd`P)B4n!h~_rQXggol+?8Gz@#8Fc&1*6Su;!Crdw#Piz=qd zXaLk|G(6u2)4?MfB~lGqOS{0D&U=e)o}^r}@;aGKGL}xw@P(U&@EKbn{J4UeiPM!4 zl{!I#*cmXIqq>eO46_G)O3azRRtPaL_`oMxxq&PO=^WNxiy3)CQsZw7sFs-wiZHm^ zFwdh^!i%1sIhG*x#k|E^sm{@M)dKY`!#s^GjVwch&e+sM&~==p0vfVKdR9<0M7$&E zIRZ=;gzjECWyQRc?uJp=v&ynv_&EWjD|Ig0kfv{PJG`1giD}3LosJpEVA~49yBvg| z)(I=pt6}PBp-Z^Y!CE_G-4t(Gxf1kj6k082==myu`lgOa2h({fu$16OJS6;bfPQ5% zq*>84vDTIHgm)Vb*NjmzCe(pV0l1N5oi^4L&B7?Cp4UL9a`>luLWCY# zN$;UqmSeN_bJ}sDMl(WvY;E~WX*F8e_zV=0^jxS^OtR+MM5GL#OVLV^^HfPIs&Y~~ zSUQvGK*d=WyoxvvNVIvC(tJeE3VNW^#d=UYxC~W>W%g8f^E?!j7#6PWEEkVA`InPT zts`J&XrH-?z}c=fKK2gpuA1Ue;SnuqwxLuHvgo=>MGsH=FSoQddJ@9Qt+_Q27MW~> zC(K#k*v&g>E~$1*swYYy;+hnWe}0hR8c zn0d`ufXd;BGYj;I+a8C>M%yQE+%~q^vNw}$u&K;NP6=Fp4v zB0c~0L!R*I&GyDRkCvVPJ7RI{eXwWmfvc^@)sk#A_PWs*xI3>#`I8lP6`ZYH!ko-v zcVE>3r=%AUD`)ko7v%c7s%PC-7n}{f+jYvhfLh-(cQx6rqhG%P(cVDrZy~gIcGqt} zwEsVR_EQCa44!=)^^>ns{)_PJFH_CF5!#<_O5X_W$I<&nXn#KMzY*Fm!hR#PUnl;5 z8bbSxzy8eI{l;HE_;ug->*pErjlX{J55Mu(pX!Nk{PnBS`o>>BwDuc+{o zOiF%!1J&ys^2G2$fMI;g^mJF(*40&6k&&7Irems z{)d13>BnETKmYaTAHVT zW3#O{)6Htr-t;9eiGQwsr%`9pBX!h~DEvJed6aQyC6Dq}G4+1O zUi3P#Cs=j;Mjek@ofUqs^4pDf@sC30l~)yxoUfjFcEo#)zpBBi!?R`b$~~#Y`oQen zQ!m}G$V)auy}sznkgqR#{Mt%sA?NpymxaCzeWl3Px_TvnUq`)PIIbIN&j9%-#o*5Z z_InI{Bt6S{{#wZES)<2=^XRB%$m0vw)fiAo%uX#h}D(x-rmQ3eRtxl~!c{XHR--^Z~mj*yGb`H>i+)ofzbJvC5`os5@(Nw`7CllAYRt{E$gWf>gHKI+ay6 z#WT~$hz-jzWU#)EY6;r+t|3#T@2!xM(jo3L6>O+RWsC8rn~el{b{%e$^qmO{ z0iAnGvbRMzj$qr>nGD7K+6*E~uMY+jzqheTBh0?LwRdA!yk&Zg( zvZpm!3I5u&41|qWgQQVOE!|)d*qgqd|0K zu;!L(nX&n*5wKw!OU1O7HkH7-x{^L!$RRPSns9R^T(qMw)uWWU_J62sq!JX17_gPEjv&D8&W_ ziK0T5R$7L!VkRc&4D6zj!bYsdSZWokh?WOHQdqVQ-DgQbQB*Eb?a~ih2J0@?^C<0M z6Jr~Y!S-scW9Dp6(Hhpp-kb4hX}*V2=@k0nK(w9ik|WwXjg{jNGhs{Lq*N+sF;NI6 zs<%KPOq4T`2w^LQsz+r6SZlPbLm50c)u4-oh9UwKkbwX(!{9P#MYB2D0};mFELATl zD)M`0w(+gDsJn6DCJU`p9Ed69!#-WrouYr$2+$cRWkQXKbD4-VI=v=5k@G=F27~P& zJsBT>kg`rTclHwaS(jB$HIw(Jy`< zYOz5!-*lU8wq2vByFTq!uIaAux_=JM+HdL9e*JT6+nzbzS1;Y3zVzkyb@$^2_qTk$ z{<-QVkI(&h-mbi<65Zb#@T}1Ncc%G9;v+k)M2((_iJ|7_EGtHJA1WdZVlkPbz9!*R6h@J*Rc;p-`(!J zl&i!mhd$5MU7>oY`07&ieYsouroY5p#;FhVtE**owmkd#8ydkh%c1fKGA32nJPjCx zA}@o?rt>rq1!udV2{V9YVi|BS5hu!8luhLP0{jN=d&OT%C#yYAVQN zf`7_@LI`E5pzRH-%w!-DN{uTOtIAraG$z*5w9Cv*TG+PR3*AB&;xaF3k_RNcTv--1 zQ)(1N@@ZElZ98NJERQtxY@vHLl@_`)@DTZc&q_4Z>3VNg80Vd0;x1rX|2FTSn22Os zNOEvh1IpC7Hl9W`J*pajvC3*l_(*Y-FS1;zI$%7VA;F#6A`E(Q%}7f++K`Z7a|z5O zi@n3z1C?O!aJX17plRiKWO)i$Ys4j+&Ws5aPoeLZs6;c&veBi(RyIXS zYl5vg>Y2gz=3Qy7Ewq?`Qp8a;TD(|uqSY0Y?Go)Z6e~m_k}Aq0+zb?{2YM(WW)udY zz;>0i3LQz-7U4<9$#e z&Qgd-P__O5&Oz8hBIyL1=(c8y=+fvR=_K9A9I7gBs}KgkElAosQKZ+Ww7}LzZ-*$ITFuZc^u3UXY7W1J58!kzNkywO&EaC_X$7yXsZ%YAm6auX z12L%vHfOo|c(83<7zV*!(VpsXiUh)X>4sU7iZ0q~Y5{NFZ3>&>Q+-ZL+J{|tOM+_M zwG1e>axKBXi~$~|l>)F#QpHS%YUD!1gYMnN62f|AyfllAC2vSlZ73oYPFoXQE)dE{ z6I4-QEITa?bPcpf6eT=#d$)ZUN;w}mTICt+^)go#x&)%8wUsS_qhWL%HKSP}RJYqru)`wL~YK za#AKaLW68k%4^DoPE#IcKx8P3V9G=(p#knTP6e_l7-f37A|@$o&AUs6JwEdqlb~ZD zt4kR?6P|KfRQ%zGzx<5NMFy$+=j8U?rlq@bDKDLh zc-n{UT86xILAO1VToy^UGUd|4TzaZsnO=Ques!vv;?knzrImS@P+gX4a(}^b^JQ~& zX|oQE)@6})Yj_^EU2ht!c&6N&DpxwWI!KmkX)kt9eK+td_BO+NRQc}acSYK%yz{48 z?Y0Gbl=7~!dZhKQ9>U&-a%-UW%kfZl#npSK%I)lY>{ri9#&yHh<7YAFaehSc zRVjFV zN%tcvk$?w<+M+Uqw#L=OsVt*m0&sJYk`Cuv+IUu&Eq=@?g|HiISOpg@jP`Y=2eYG%~# z7l!24elV1XvQliuoo`lCH5rf|;_e<)voI#0i%}#j7~~Y$UCwBZsiH|uwiDT)RxcUL zs>N7}a(K`bGuiCPWqZKcl2}$q%IGV&atofS84>neIukN%Gu$N(VX{~EID|d95mgsv zq&F~VGcxT&b~u|qW>V8B;4cP8(|xRXss-JpildEIMWq7KMi-fuoaSm<>xkf}iMLl% zM!fon4y}u(FEas_WKiIk4o8IT)wb02b|Z!=`|a(j5*WBq;-bOx#$98KSr* z#7fs(%2FUT7i!|ehr7EdpQdk1^^|g2=u{MawrwRvl{K9upw^mQtqsP#=+087 z=203E#@@M!)(~ocQ`PptN6e%)wXl*x3zeZ&71i8$Ts5VYvOYDWojV9H3FCelb|kDg zSQe4Z)v{J1lYCAr2skuk74A*vO)F#-7a1nqxJ{8wr59(+x=JL47nZh+OIscyD9Wq$ zEEO0{G8qGEwuynEII0#1H%j+N9Vj^FX=PQE44D>F!fEgB?cA0;vqq_=QEN)no3*F~ zCQWTbTg<{J#W$?@mPG|3Ic#Ty3We>{@u|-8Lg)Gn)9uSrWvS?9w)NJV>Q+mi^|g}b zO;J-1-IBiKutxe?yEBjDhpx%}<>%#h^-sL^RQax$l1EPO`m(FcT|>3*bA5U3naIoP zB`;q^KJAtG&sizi9^Um19m~Ul=FPB34lP9AN|eif=J@tz5EJ)}r)~vD=JC+R>>0*Y zfm?I5-&4OTrytffbLCr?vgKAC9o4;EH|}4LTh{5zA!9srR}V#3$U~_!SF;}`Pmi`< zO+J-gck#F{<=K_-=+d+8*G+Qu!>QqUT8h;-bv=>Y;S76?u5P2^DZF(k%MBIcYzkfm zL)4BuJ3^BQD3Ru3N~efvh*Q@z(PPe$X4R_XHl|V&)KF*Hlyrh>@H^7k3hC%9Dm;$L zL?r9ADCLYjWI&Aa{z4l~t){J2v)KA+C@HXK)8k{*Fk)FO1Slw?A)3_X#O|VjTFzU= zINvCDbc5+g4OJNKMlw)IJupC2dYCClsJALZGEm;GAG{xtHsdXO=81fUa z1l7VW_b5V@n=@j`K^X~zP}elNeB@9{XD8rPV^lsCo)ix@17(G3WkeJux20PMHRBlz zFQ>q00lFGtJrfzBm0?M6u()!sIAPi!J-+qog)H#yeuc@Y`3O76Pni?wg)uM0l#k{PG zYRe|fQn>|F2S6?5ykrp6f_wR&35p4e#Bc{R4U?qPeWERlVp^gWADlrX+T5w3YK?#J zx|GWU5lIo2{qB*joAE83Z2 z5S0j}+v-BNP}9!pXm`qLvq?%gP&2b^r<0#*1XV4Lgj9ZTG2dQ0VtK-MsDSUE)!TQQ zx@{`CX>7H{7O`Qr@NI?vIR727K(HcvoZt0@dM%>(>1F70H+07TIM3|2oJVuOW9!uU z56iKV9)laE&RY+#O)lK-Vo=8e*75@wv*Stq{FuGP|SMaD77z87!qcP=F z;x4J>(VUEv&iLTjS2C@X?Yk^JsMqba^-ek(qfMe%d(Bdf!Z_0|LA~i@gjX`#L3lLi z8*DjVG?s*hc`0QRX*fEmR}sw^FB*-hmxp2MXWVs+g4b$N+Da+t>FN|*$*IX1`fMfz z6v-=8rxcyDEHWt_NRf*8z*tD@aW>X6%AGriGFMn68%iB%p!B&Oz$q|2KEr)kW3q!9 zo0}$l=z@|cs?#Zr%;M{bTW3(81Rm4e4@#v*@JS8ORtVjhSHYSh01BHZ##c`5EE*ha z1tH^PS!soGEtVlgX_y^Hn$lMXKHAB)c3J5VHuIINkT7(}hZnWqFbjbC`FB+}?y?{) z(~Y@Ch~>JOJ-o`UP#(lAV?}Zx#gw&Jvev{XlE=rGgf(3Y5D~$H*)2vjbr&&N(?LU6 z!7Qigj3qw$!3_XflG+NoyGXG?UjetGP_ApZD)pT=m=?D96{?zR4|U2zKX@U^RJw#N zvO-eor$aQ@<7@e(SeUUM3SAFMgnHyb0mJt}>IqHCY%3S#5G=~VsNeLOHgLftvx5hr zn+Xi3WJ_ZQ+XU?I@}0q z+2ByUkC|y7{;m#ibnT&Q{~X)C+tg-L%}r(N4ZTeqzfrGT=wRyGue=@*>7M9;q1qhh z&cypSi#^eG<9S)$zUzJXICsFU4_tN!KIJ^Sm+5b}U9jg-#d-M!e?Icw@48YwLEJn@ zmUmT`w~q%nDo48>je19GpS9kzyHe%uv!gH6@d;->2z&2%V?HR9kIQpY3@?Z>Z*cL3 z4KG6C(>upIF%S=o5l3;iHKu0ze85+snrRH%j+&X`4(Cn5E$x>3SWkv|bWRqH zALFj6K}AP(uc7qdWhVk^eG0TSH>Jb!bxU`|ElyeH8kU~Iih)lh4T!&DA4WBMZE_iT z(|5cxx+pmf6D_R4*&0nJn=uFX>gtl^;N!JAq`8h{3e<~(rbN24^&)4iDLKE@GKRf= zE-4ANQLP?HP&HXqYg0sWvDqu=%hzZTIw2*}GT6-MP^3|7c9RSP+caA$ri8zr5TRa4 z+JI7O$ZD$GwhG;DO%;P#k-~!ux0@K>#keW8`;Jl!x9#-QWiL&Jt4TUlKboP{*oN6` zBDjDAN#P@H7V}=%0B}@NZ!i&i-kiig?V9)EBpT)Sv;dv5s7rvjo6YRg|ajXOQ(HeQ(75eJns`vW(=p5ND+-sp}@xZ(04Ru_+ z!UrMoNme}JvJ0?_bu3>&gYih+?{}XZ$`ek!uEiU$DX%l1dfZcww1*qvy(*{IU%cax z4|DRnVkb_TSM$!SuK?gUQBF+D{F7S(AQi=JV3d#D>Qu3IcXqL>>VAm;(VzY4=6E=dEAk<

Ils)ZKKe&j(I(lr`tZYlW&)pk;;Sq#ziqR{SNBhgEZT(kt#C4V={Kt~Au?Osi6R zktvH*3h0ToHNzyjm&YVF&Qd+Z*yI}}ZQSdiXsJ_H`ySqoMo2H8F)9yMPZ{@784-m<5}PY`y_oVJq7*_A~WDoJ6n% zKkNs7A{Kf6BF^KJj`h5H|1M+PJq_wiqZgR5AMU?bv2y*(>%#f}ne;l41Mv8&aKCD* zuh{Dn%U1Z2;HblO{>*r9pI;=kJ=ui<&DE6c-u35$?{$L=zeWb;w-Ju(*uMfW^Z`cH zBj3~Z^T9WJn)T*a_{Te;?EE-+PhVB;y6b6sd$ZQATk}!JhabD^_-v;=P?D#K@7I6z z=*K5zEPmZr7sAqS6fZd&^bPh_L?NkP04cBHs47fc=%Nh?)ekyhV5lpu3KtdbF<(sb zLwt}h2!+KJChbNYv}V?Pz%@)QgiYErtzOha658cM+?E!N0nJ5y#YAL8qzP$iP|=M7 zZLkF1Y@wDXcc|}noXO)R3UH%I*T>EJ1Qk3L3 z5QI?|Se~OC-CBBC4i?rk&AD(TLuT*L6A1ERw z*~vz|!_x8F#7@9c@f5Stj&FiLa3Xr6TYWTqwwG$k-)F)@0w6`RXlxc(I(10wQyR%+qnu9X zN;`OX4jo03>iHpHdq?nSQj?OLH3yU&5iW1sV;qzFLHLRYAihm)v6*i^Tjo~2>1@lj zelG9-hY^?_+kTsuUjL$ARxe+6amxY!x^J12?Vl^%9<|zH<+h)Vdil#@zvaC8vIVZ# zykcH9S1ISUm1;llai3WE+QMRvWPW@e=5qU2*HzkS#wSNkdsX*hcZ@|}y0>i3jP|eb zve@2S@joIk*MqikTAMvT&rSHlQcF(A;EswMx7Yst<|CCn_#97E;w2IZb+s$6BRuTh_VW5TI)5$%B#r{|*-l)w$~S=HJlorEzjPl3gpW(e z@0f=E6BCFCCm?cYOf5)2nX;g=Wt>`!EuZJ56s=Lspx#_^0qjKu-b&EG zwNa>`4m5j4i~6lFSt{_4x_GMVTTJreE~?yC1gCE&fwYY>p()c1MMZOHL>Fxi(81$Y zj3)Q864{+R@?l{#ntB_6RyKV(&1aHFo|F|vnW8zsh*7g7)x`3cMfzUE)ws(vZxIR@ ztk^;pOhGaREglss(HT5=r4^=%&UhK?MQBIDdJlITwf1M! z-ES!%A7YYL4}~|!IwM$3O{SWA%JvcrwN@DKHadT6Yw|@eEnJ3UaPDeOEsiR|_oHj0 z_z|5J3)b@;9Y%>1TADukA-F1&+CO;r_g143Wvg3-zT+t_HjGr6 zkphabRV+M^QE3+Kl1Ex9RnX0iMmVTC#+UZ#1MSD)wgV^^JQP__ez3 zx{@3@JyOYCz_W(GK0aNJ_v_73&2EhG#y&VwoLlZIzF*v1@0%Y`595zLPq_=|#|?Fn zBkpW2+l`ytaMa|i&~H&6E-uE+U^*@ z)c_Z|F|8GJlt+0O^)0Fy%MA+Mz-f*eH&;{{>U?YWW$a8dVm;!N%(WCj;6jm&N^{RJ zA|j(y$V-n8Cb%jQt)Gndrj+yUrN+4qI|GuzcX^HRq&r(lac=kGUDF@9`5K)1Jqe}y zczA~g-PMMh1&?$xuG$=PhGVE_<>lcZn0l)WS_t~|q*I0A1*IXD)B(w)^SHf$uGo+4 z#fZ+K@`$4|o6Do>a(b@&R)aM$%BLj{gLA9@%<1Ij_@h0=1$hl^FzE7D<9Zioy~ z>Y$*tBPm6wM!cxgm1y^FzEso%jp<}d0o?c)QOO)`pUupcLxUSZq)NcJAY{ZukvzpMEBa_lF34|R32=D&(FzZYM7U9Upr_`k=3 zKck`Vip*or^4u3acTDxzC(WhHz4_Yl*e{*Cx_na)-+RF)m;2uN`9nSXWPd+G!JifI zWn%x(#@<`N)9C-;pI_xZ_Md*!=`Tb6x%P=i{HJ?wc5YI?@^ku5RiAhM{k1PVtJANy z*z;tg_j#TCyt4eR+37zpDSUq8Rnk1Ee?geYe2~mVMyUq8RHgVf!rD8BIO z19VPLaZjpqSh15MUL< zZffd2&FAB$oG;UYZfXl0Ga^IRl!y$W!pYc*rbZjHKm+RIU*xTgFCvF8W=J>VHVUN zr7$woLO#2zn|nc-@NOEpNYuhU1l9Kr?o40OGlQ}k#o7{dMsXKNbF05bI;F*!`wR`z zQl=3`pRqVJsPE#jacPEuPNX*F`7LdTIvw^dszx0ZRc9Z5!fsR{JWBV2l-$1CRCPtH zj?K5;a>HhuyyA!feQfnlKC#|A^0&`B&0=}WjTgvcFaNwt-e3fIsXsE;m%I+j@}0V_ zaxP3kA60s<#(Tj>P2_pg_T{e1arosC&{(d2kMD1n-pw3Ruuzh3j*Z2sJDUu_l$wsbdwDF)KyJ;|R%GlGG#rMd9tF3T%`%ma;?-Ycv$CB9bwzN6@3rf zG^-ZhjNy!fBi74QaaD z_fqksy>Cm@VN6wwbW!Lj9X`f71>#Z^DBWFI^QZ(F9?1rZx2RK}0@a~tCfc=V7-wAy z)RQ`=f+spZ#5tBy2|Da6EBPK^g$tmeI$ThtG!vVgN@m6x&>5xE$&4l#{(*p28GSJt zSdi85{XqK;6YZAuAx36D;?7??y?3}mGK1kYP%?_mzv%O>m7h#z|(>tlSol1uQ;NpA*HA$gvW@A*n=sXV}onbxFUh#pU zusCtW7V3zSrjUFA(Z%tvk#HU%964_DAyAoxBwZfEOtR>POO|sc(rf4vD$l7H+-bz# zRI8>=qRIx~?E2-ccIdQcp)%N~y&D`Uakb+BwSBjhZBd(I{m(6G!#cA_3|PJgW&elM zU0=*~_BwZd!+AN+<2(U?>x^8fucT*|dk)u?M@dJPb*DyM**~(r3O(-KPU=L+%{xEB zyW%Udg1;_W1D@!^ZL|KQUf_))ekD0i?ce-3FBf6qjUk^8T1LNDtS`%a+{}x^2>;Ex zJ)ZaMcl5Urj?W93R?9M-ByT=^g1c<>l(3=hPtvs$pCWM^*h*CuuGGD(x`pNk*A{>x zdGtB2DQeF@%iA3#>4%jlHGYJLay@5Rt5RC$^6WNDE#C-m&#y319&1fa4WRqqKp6_O z91jR1fJ{Qb`#mZ|xNH4(6Y9P|qs)w}rq&)!IH5%(Et9`pIxdtfHxS#TM-E;5iC}qSqcb<%qR~4nh>uKw8QMF51E7+b1(%@ z%qOOaW({|P&{;|8fV+o%Y8NqD0jlv(1a*f~8EEQj)MUzwDjsVDpPYbKn{bQo5R6ei zOD-?F`_%c6(8oiLgO24cE4J*-bF)o*(^|f=VzHtRO0ga;q;~zE@4mS8_56RFlIus$ zlk$=m>h_tH`czsTKXJ0Bk79Ip_ASr!D|PLe*M1@OP9o2GoL`@1$X%c72H7qF*B`>| zL$CL+9+jtHeDaDofVRbWD$u3GLd+k zOaG_%y8a&S5%G5qbok$p^!DDr{qe^?j!!9i zn~kyGQ_Mtf4t`JJzTs9If(aRn7Tv^VdAaHa7TC4=J;^qRg3#9sv1^fniLs+x$jVLt z;i4O}T{jAn*qUucfh=r6i0V1jW!{RUfC&@otT$1DARSx?NxO*25i=9Eot+ed_8MxW z1w>C+sp)MRHcX9KIVR0SKWzhS2w|tzB34Oa?Tm)fIC9fc8FX)L_9}H^-O5|8&j!*{ z+3PIaT24<|CPPiJhe3_2bGs)?kyK^+xRRB_9j{R&UJO&_wzNf#S~*DH*%Z-mMgNPED)jCdJGi(>l+3YE z{J7WyVNDt4YJtQT*kP|NVd82iUv5^Sp`7$6p`2kFkg25qtz5k&p04KPnG9{QOHl z{HQ$qe9q@RugB~4O}mF5`TWv8F8gVYZ{_}>2g$BzDJk$3NTQ9pm``sxz(zJF)e zNA&&b?%5u8SJ_uVZC7n>?^XQsMPQzFxP3IJ*6JHU{8$U)~kL*H3YukWQOag z7BMYT{w%grCDCbPpsVt-&BA3QVUivtv5<*`D(Wy3Bvft-wug#WvqKBWl6^CMWz`49 z=Bj}&ju|8t(w-h(a?}FZt$e3vrE?X2P-YpUg!RlNOIL=;o*gkN<b3N}x8vtz<5bL2k$Z>z;;Hg_I!^Wr~Ds&ymPm<}5Q6leYDc z5tQb!vW{Z38C21*!dYxHrtLb4m zWPGa_TDuaYNI+w>@UB&3lH9S76}QocDbsB)u8qk|*y&ZJ>h@_{hKl*&Jp8Fz?sqc2K#{t9R5v|qWy;5*2Z>ful)YP} z(Z*s)+|#qRL#}gf!A#<6He1s5Ij16}Af5nIt(^64Q!98Yp@WWFVLPBJyP5Qr7`p=@ zHw1)pbpqtdQ)DJ}wTsy#rE3g?#5>Bzd`G%Q zn{r?T6PtlcS7PwgvF$_w^NBUS3iI&}-ZzWK|2#bF;Zcuz*n|5RJlyiJ_9#E>U?0pQ z^N2m_tcBSJ_Baba_K|tk$LDw6&szB;vx~g^h&|X>)91%oEWbXUR9BO`5BBrfj@V}_ z`(X5>IJ*vB0z4%-TU-}@FvngzpY30oWPXTtcly+~{9qoz8vuBd{iRi$Tz7Zd_tOM7H~VO0 z?+=#{KXr*WvY#5fF}e9Zub=hzCMbVk^#PaQlO{ik@S`}-;o4tl@(t!VpC6wu8uUz4 z?Xs=Ib(n`0Ox9bT#om!_(Z>LJM!~bb!rzIkRhfXt8+8eI*V7RxiR-FJ7LbLUdV8hu#mgg+p11CoG~>njO<^@`#E1PF2W#$Eb0noZ;sl z_Ua9VGOJ$6rPDU#Cbk8-4PS^Gt?b~$y;A1LC#VHE%Q*#;3otywQmzaXeV|A{Y`EGb zT~GF?WG$yGQWaCRuvk^w=b&9@uNG^~QThgE>;H1mRF(yX2#SZw4l=K`BQYQ>y zq&!j@nJ^jp++dwiau+3xy(SMx)~su9a{>s`&pWp18k?yen5o4W~p}z zI51o>hCpm|1rPH1-s-_3@7|jM__WR$AhVV~@j@+0pLI$T3Sd+f-3& za^fp6x@t;q*^YTxtAJ8H%aZGgzCHT1ePu8@)cR#qM(b<<0aDHIAX4q>W@J+y;q=DP zQ_T)4vs&$KCD)<-FKi82(i)qs0$aqgRBf5g+5MuAh%n^?ikT|`?V1U!lp`0{;mdbW zHEGw)@CB;kF^_q4J$4^%^B^C=*O{XJz*H|VaG#Lp4ShaepV#!)+ZU{O1DkfdF}rV{ zz~bHJdb&SPXf$}e&L^|riBjf^(My6GpS=!1UBD-=Zk{yHo9li*?r%u;lA7B-Rf#e%i{c-lbQPt1w{u@>OMc8jt^;g6HZ$nkT zG1Fg3zA@7uDZVk&Uo7MsGyU0$zA@9!{lqtB`m59W#!P?E_8T+(S($Il^ph3-jhX(` z&3`Lq`WrdSHVRLV_m_VHP}yySKuG`qABzYC000000RIL6LPG)o8vp|U0000000000 D34jUC diff --git a/tests/m6a_prediction_test.rs b/tests/m6a_prediction_test.rs index 1d259d68..7a0ba37c 100644 --- a/tests/m6a_prediction_test.rs +++ b/tests/m6a_prediction_test.rs @@ -1,3 +1,4 @@ +use fibertools_rs::subcommands::predict_m6a::WINDOW; use fibertools_rs::utils::bio_io; use fibertools_rs::utils::input_bam::FiberFilters; use rust_htslib::bam::Reader; @@ -7,15 +8,24 @@ fn sum_qual(bam: &mut Reader) -> usize { let fibers = fibertools_rs::fiber::FiberseqRecords::new(bam, FiberFilters::default()); // sum up all quality scores across all fibers let mut sum = 0; + let mut count = 0; for fiber in fibers { - sum += fiber + let min_pos = (WINDOW / 2) as i64; + let max_pos = (fiber.record.seq_len() - WINDOW / 2) as i64; + let mut this_count = 0; + let this_qual_sum = fiber .m6a - .qual .into_iter() - .map(|x| x as usize) + .filter(|(st, en, _, _, _)| *st >= min_pos && *en < max_pos) + .map(|(_, _, _, qual, _)| { + this_count += 1; + qual as usize + }) .sum::(); + sum += this_qual_sum; + count += this_count; } - eprintln!("sum {:?}", sum); + eprintln!("sum {:?}; count {:?}", sum, count); // now count for the input bam file as well sum }