diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..9138a7f --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,13 @@ +version: "2" + +build: + os: "ubuntu-22.04" + tools: + python: "3.10" + +python: + install: + - requirements: docs/requirements.txt + +sphinx: + configuration: docs/source/conf.py diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..6247f7e --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..5401e49 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,2 @@ +sphinx==7.1.2 +furo diff --git a/docs/source/api.rst b/docs/source/api.rst new file mode 100644 index 0000000..ec94338 --- /dev/null +++ b/docs/source/api.rst @@ -0,0 +1,7 @@ +API +=== + +.. autosummary:: + :toctree: generated + + lumache diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..e1da635 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,58 @@ +# Configuration file for the Sphinx documentation builder. + +# -- Project information + +project = 'piscem-infer' +copyright = '2022-, Rob Patro' +author = 'Rob Patro' +master_doc = 'index' + +release = '0.1' +version = '0.1.1' + +# -- General configuration + +extensions = [ + 'sphinx.ext.duration', + 'sphinx.ext.doctest', + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.intersphinx', +] + +intersphinx_mapping = { + 'python': ('https://docs.python.org/3/', None), + 'sphinx': ('https://www.sphinx-doc.org/en/master/', None), +} +intersphinx_disabled_domains = ['std'] + +templates_path = ['_templates'] + +# -- Options for HTML output + +html_theme = 'furo' +pygments_style = "sphinx" +pygments_dark_style = "monokai" + +# -- Options for EPUB output +epub_show_urls = 'footnote' + + + +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. 
For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + diff --git a/docs/source/formats.rst b/docs/source/formats.rst new file mode 100644 index 0000000..8153ce6 --- /dev/null +++ b/docs/source/formats.rst @@ -0,0 +1,14 @@ +Formats +======= + ++----------------------------------------+-----------------------------------------------+--------------------------------------------------------------+---------------------------------------------------------+ +| target_name | eeln | tpm | ecount | ++========================================+===============================================+==============================================================+=========================================================+ +| The identifer of the quantified target | The expected *effective length* of the target | The abundance of the target in transcripts per million (TPM) | The expected number of fragments assigned to the target | ++----------------------------------------+-----------------------------------------------+--------------------------------------------------------------+---------------------------------------------------------+ + + +.. autosummary:: + :toctree: generated + + piscem-infer diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..7c3523d --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,32 @@ +Welcome to piscem-infers's documentation! +========================================= + +What is `piscem-infer`? 
+----------------------- + +``piscem-infer`` is a tool to consume bulk-RAD files (produced by `piscem <https://github.com/COMBINE-lab/piscem>`_ or `piscem-cpp <https://github.com/COMBINE-lab/piscem-cpp>`_) +and to produce from them abundance estimates of the targets against which the reads were mapped. For example, a basic RNA-seq pipeline could consist of mapping the reads +against the transcriptome using `piscem`, and then quantifying transcript abundance using `piscem-infer`. Likewise, one could use the pair of tools on metagenomic index and +metagenomic sequencing reads to perform taxonomic abundance estimation. The main goal of `piscem-infer` is to separate the statistical inference algorithms and code from the +code that performs indexing and mapping. This allows faster and more agile development of new improvements to the inference method, as well as eases the maintenance burden. + +The `piscem-infer` program is written in `rust`, which makes it easy for end-users to compile, and which also makes it easy for us to deploy without the need for end-users +to compile it (it's easy to create statically-linked, pre-compiled executables, and to put the package on `bioconda`). At the same time, this gives us access to the safety +guarantees of `rust`, making the code easier to develop and maintain with confidence while retaining the efficiency of a high-performance, statically-typed, compiled language. + +While `piscem-infer` is in *active development*, it is already very usable! We encourage folks who are curious to try it out, to open GitHub issues for any questions or feature +requests, and to provide feedback on how you'd like to see this next "evolution" of (bulk sequencing) abundance estimation tools evolve! + +Check out the :doc:`usage` section for further information. + +.. note:: + + This project is under active development. + +Contents +-------- + +.. 
toctree:: + + usage + formats diff --git a/docs/source/usage.rst b/docs/source/usage.rst new file mode 100644 index 0000000..cfccc13 --- /dev/null +++ b/docs/source/usage.rst @@ -0,0 +1,101 @@ +Usage +===== + +.. _installation: + +Installation +------------ + +To install piscem-infer, use `cargo <https://doc.rust-lang.org/cargo/>`_: + +.. code-block:: console + + $ cargo install piscem-infer + +or install it from source + +.. code-block:: console + + $ git clone https://github.com/COMBINE-lab/piscem-infer + $ cd piscem-infer + $ cargo build --release + + +An example run +-------------- + +Obtaining the reference +~~~~~~~~~~~~~~~~~~~~~~~ + +First, we'll grab the reference transcriptome we will use for quantification - in this case Gencode's v44 annotation +of the human transcriptome. + +.. code-block:: console + + $ wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/gencode.v44.transcripts.fa.gz + +Building the index +~~~~~~~~~~~~~~~~~~ + + +Next, we'll build the index. You'll only have to do this once (or whenever you want to update the annotation you're using). To +build the index and map the reads, we'll need ``piscem``. You can either build it from source according to the instructions +on the `GitHub page <https://github.com/COMBINE-lab/piscem>`_, or you can install it from ``bioconda`` using ``conda install piscem``. +Once you have it installed, you can build the index with: + +.. code-block:: console + + $ piscem build -s gencode.v44.transcripts.fa.gz -k 31 -m 19 -t 16 -o gencode_v44_idx + +Obtaining the reads +~~~~~~~~~~~~~~~~~~~ + +To obtain some sample read data, we'll use the excellent |fastqdl|_ tool that you can install +via either ``pip`` or bioconda (through ``conda`` or ``mamba``). + +.. code-block:: console + + $ fastq-dl -a SRR1039508 + +Mapping the reads +~~~~~~~~~~~~~~~~~ + +Next, we'll use ``piscem`` again to map the reads. The following command will do it for us (you can check out ``piscem map-bulk -h`` for +a description of all the options): + +.. 
code-block:: console + + $ piscem map-bulk -t 16 -i gencode_v44_idx -1 SRR1039508_1.fastq.gz -2 SRR1039508_2.fastq.gz -o SRR1039508_mapped + + +Quantification with ``piscem-infer`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Now that we've mapped the reads to produce a bulk-oriented ``RAD`` file, we're ready to quantify with ``piscem-infer``! +There exist other options we can pass (e.g. we can perform bootstrap sampling to produce inferential replicates, in which +case we can also request the use of multiple threads, but here we just invoke the most basic quantification process). + +.. code-block:: console + + $ piscem-infer quant -i SRR1039508_mapped -l IU -o quant/SRR1039508 + +Note that we pass to the ``-o`` flag a file *stem* prefixed with a path (in this case ``quant``). This is because ``piscem-infer`` +will produce several output files. All of them will share the same *stem*. If we pass a stem that is prefixed with some path +(e.g. a directory) then this directory will be created if it doesn't exist. We also let ``piscem-infer`` know the library type +(i.e. how we expect the reads to map), where ``piscem-infer`` uses `salmon's library type specification <https://salmon.readthedocs.io/en/latest/library_type.html>`_. +Here we expect the library to be unstranded and the paired-end reads to map "inward" (i.e. facing each other). + +.. code-block:: console + + $ ls -la quant/ + .rw-rw-r--@ 3.1k rob 30 Sep 12:33 SRR1039508.fld.pq + .rw-rw-r--@ 628 rob 30 Sep 12:33 SRR1039508.meta_info.json + .rw-rw-r--@ 33M rob 30 Sep 12:33 SRR1039508.quant + +The file ``SRR1039508.quant`` contains the quantification estimates, and is of a very similar format to e.g. a ``salmon`` ("quant.sf") format file. + + + +.. |fastqdl| replace:: ``fastq-dl`` +.. 
_fastqdl: https://github.com/rpetit3/fastq-dl + diff --git a/src/utils/io.rs b/src/utils/io.rs index dce708b..9d55ed9 100644 --- a/src/utils/io.rs +++ b/src/utils/io.rs @@ -11,6 +11,8 @@ use std::fs::File; use std::io::Write; use std::path::Path; +use tracing::warn; + pub(crate) fn write_results( output: &Path, hdr: &rad_types::RadHeader, @@ -19,10 +21,37 @@ pub(crate) fn write_results( ) -> anyhow::Result<()> { let mut ofile = File::create(output)?; - ofile.write_all("target_name\teelen\tecount\n".to_string().as_bytes())?; + ofile.write_all("target_name\teelen\ttpm\tecount\n".to_string().as_bytes())?; + + const ONE_MILLION: f64 = 1_000_000.0; + let denom: f64 = e_counts + .iter() + .zip(eff_lengths.iter()) + .map(|(c, l)| if *l > 0.0 { c / l } else { 0.0_f64 }) + .sum::(); + let inv_denom: f64 = if denom > 0.0 { + ONE_MILLION / denom + } else { + warn!("The sum of ecount / eeln for all transcripts was 0. It seems likely that no fragments were quantified. Please check the input sample!"); + 0.0 + }; + let tpms: Vec = e_counts + .iter() + .zip(eff_lengths.iter()) + .map(|(c, l)| { + if *l > 0.0 { + inv_denom * (c / l) + } else { + 0.0_f64 + } + }) + .collect(); for (i, name) in hdr.ref_names.iter().enumerate() { - let l = format!("{}\t{:.3}\t{:.3}\n", name, eff_lengths[i], e_counts[i]); + let l = format!( + "{}\t{:.3}\t{:.3}\t{:.3}\n", + name, eff_lengths[i], tpms[i], e_counts[i] + ); ofile.write_all(l.as_bytes())?; }