Merge pull request #17 from COMBINE-lab/dev
Dev
rob-p authored Oct 1, 2023
2 parents ad0330f + 648ed87 commit 21daddd
Showing 10 changed files with 313 additions and 2 deletions.
13 changes: 13 additions & 0 deletions .readthedocs.yaml
@@ -0,0 +1,13 @@
version: "2"

build:
os: "ubuntu-22.04"
tools:
python: "3.10"

python:
install:
- requirements: docs/requirements.txt

sphinx:
configuration: docs/source/conf.py
20 changes: 20 additions & 0 deletions docs/Makefile
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
35 changes: 35 additions & 0 deletions docs/make.bat
@@ -0,0 +1,35 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
2 changes: 2 additions & 0 deletions docs/requirements.txt
@@ -0,0 +1,2 @@
sphinx==7.1.2
furo
7 changes: 7 additions & 0 deletions docs/source/api.rst
@@ -0,0 +1,7 @@
API
===

.. autosummary::
:toctree: generated

lumache
58 changes: 58 additions & 0 deletions docs/source/conf.py
@@ -0,0 +1,58 @@
# Configuration file for the Sphinx documentation builder.

# -- Project information

project = 'piscem-infer'
copyright = '2022-, Rob Patro'
author = 'Rob Patro'
master_doc = 'index'

release = '0.1'
version = '0.1.1'

# -- General configuration

extensions = [
    'sphinx.ext.duration',
    'sphinx.ext.doctest',
    'sphinx.ext.autodoc',
    'sphinx.ext.autosummary',
    'sphinx.ext.intersphinx',
]

intersphinx_mapping = {
    'python': ('https://docs.python.org/3/', None),
    'sphinx': ('https://www.sphinx-doc.org/en/master/', None),
}
intersphinx_disabled_domains = ['std']

templates_path = ['_templates']

# -- Options for HTML output

html_theme = 'furo'
pygments_style = "sphinx"
pygments_dark_style = "monokai"

# -- Options for EPUB output
epub_show_urls = 'footnote'



# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))


14 changes: 14 additions & 0 deletions docs/source/formats.rst
@@ -0,0 +1,14 @@
Formats
=======

+-----------------------------------------+-----------------------------------------------+--------------------------------------------------------------+---------------------------------------------------------+
| target_name                             | eelen                                         | tpm                                                          | ecount                                                  |
+=========================================+===============================================+==============================================================+=========================================================+
| The identifier of the quantified target | The expected *effective length* of the target | The abundance of the target in transcripts per million (TPM) | The expected number of fragments assigned to the target |
+-----------------------------------------+-----------------------------------------------+--------------------------------------------------------------+---------------------------------------------------------+


.. autosummary::
:toctree: generated

piscem-infer
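The ``tpm`` column is derived from ``ecount`` and ``eelen`` using the standard TPM normalization, mirroring the logic added to ``write_results`` in this PR. A minimal standalone sketch (the function name ``tpms`` is ours; the real code also logs a warning via ``tracing`` when the denominator is zero):

```rust
// Compute TPM values from expected counts and effective lengths,
// following the computation added to write_results in src/utils/io.rs.
fn tpms(e_counts: &[f64], eff_lengths: &[f64]) -> Vec<f64> {
    const ONE_MILLION: f64 = 1_000_000.0;
    // Sum of per-target read rates (count / effective length).
    let denom: f64 = e_counts
        .iter()
        .zip(eff_lengths.iter())
        .map(|(c, l)| if *l > 0.0 { c / l } else { 0.0 })
        .sum();
    // Guard against an empty sample: all TPMs become 0 rather than NaN.
    // (The real code emits a warning here via tracing::warn!.)
    let inv_denom = if denom > 0.0 { ONE_MILLION / denom } else { 0.0 };
    e_counts
        .iter()
        .zip(eff_lengths.iter())
        .map(|(c, l)| if *l > 0.0 { inv_denom * (c / l) } else { 0.0 })
        .collect()
}

fn main() {
    // Targets with zero effective length get a TPM of 0.
    let tpm = tpms(&[100.0, 300.0, 5.0], &[100.0, 100.0, 0.0]);
    println!("{:?}", tpm); // [250000.0, 750000.0, 0.0]
}
```

Note that the TPM column always sums to one million (when any fragments were quantified), so values are comparable across samples with different sequencing depths.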
32 changes: 32 additions & 0 deletions docs/source/index.rst
@@ -0,0 +1,32 @@
Welcome to piscem-infer's documentation!
========================================

What is `piscem-infer`?
-----------------------

``piscem-infer`` is a tool to consume bulk-RAD files (produced by `piscem <https://github.com/COMBINE-lab/piscem>`_ or `piscem-cpp <https://github.com/COMBINE-lab/piscem-cpp>`_)
and to produce from them abundance estimates of the targets against which the reads were mapped. For example, a basic RNA-seq pipeline could consist of mapping the reads
against the transcriptome using `piscem`, and then quantifying transcript abundance using `piscem-infer`. Likewise, one could use the pair of tools on a metagenomic index and
metagenomic sequencing reads to perform taxonomic abundance estimation. The main goal of `piscem-infer` is to separate the statistical inference algorithms and code from the
code that performs indexing and mapping. This allows faster and more agile development of improvements to the inference method, and it eases the maintenance burden.

The `piscem-infer` program is written in `rust`, which makes it easy for end-users to compile, and also easy for us to deploy without requiring end-users
to compile it (it's straightforward to create statically-linked, pre-compiled executables and to publish the package on `bioconda`). At the same time, this gives us access to the safety
guarantees of `rust`, making the code easier to develop and maintain with confidence while retaining the efficiency of a high-performance, statically-typed, compiled language.

While `piscem-infer` is in *active development*, it is already very usable! We encourage anyone who is curious to try it out, to open GitHub issues with any questions or feature
requests, and to give us feedback on how you'd like to see this next "evolution" of (bulk sequencing) abundance estimation tools develop!

Check out the :doc:`usage` section for further information.

.. note::

This project is under active development.

Contents
--------

.. toctree::

usage
formats
101 changes: 101 additions & 0 deletions docs/source/usage.rst
@@ -0,0 +1,101 @@
Usage
=====

.. _installation:

Installation
------------

To install piscem-infer, use `cargo <https://github.com/rust-lang/cargo>`_:

.. code-block:: console

   $ cargo install piscem-infer

or install it from source:

.. code-block:: console

   $ git clone https://github.com/COMBINE-lab/piscem-infer
   $ cd piscem-infer
   $ cargo build --release

An example run
--------------

Obtaining the reference
~~~~~~~~~~~~~~~~~~~~~~~

First, we'll grab the reference transcriptome we will use for quantification - in this case Gencode's v44 annotation
of the human transcriptome.

.. code-block:: console

   $ wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/gencode.v44.transcripts.fa.gz

Building the index
~~~~~~~~~~~~~~~~~~


Next, we'll build the index. You'll only have to do this once (or whenever you want to update the annotation you're using). To
build the index and map the reads, we'll need ``piscem``. You can either build it from source according to the instructions
on the `GitHub page <https://github.com/COMBINE-lab/piscem>`_, or you can install it from ``bioconda`` using ``conda install piscem``.
Once you have it installed, you can build the index with:

.. code-block:: console

   $ piscem build -s gencode.v44.transcripts.fa.gz -k 31 -m 19 -t 16 -o gencode_v44_idx

Obtaining the reads
~~~~~~~~~~~~~~~~~~~

To obtain some sample read data, we'll use the excellent |fastqdl|_ tool that you can install
via either ``pip`` or bioconda (through ``conda`` or ``mamba``).

.. code-block:: console

   $ fastq-dl -a SRR1039508

Mapping the reads
~~~~~~~~~~~~~~~~~

Next, we'll use ``piscem`` again to map the reads. The following command will do it for us (you can check out ``piscem map-bulk -h`` for
a description of all the options):

.. code-block:: console

   $ piscem map-bulk -t 16 -i gencode_v44_idx -1 SRR1039508_1.fastq.gz -2 SRR1039508_2.fastq.gz -o SRR1039508_mapped

Quantification with ``piscem-infer``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Now that we've mapped the reads to produce a bulk-oriented ``RAD`` file, we're ready to quantify with ``piscem-infer``!
There are other options we can pass (e.g. we can perform bootstrap sampling to produce inferential replicates, in which
case we can also request multiple threads), but here we invoke the most basic quantification process.

.. code-block:: console

   $ piscem-infer quant -i SRR1039508_mapped -l IU -o quant/SRR1039508

Note that we pass to the ``-o`` flag a file *stem* prefixed with a path (in this case ``quant``). This is because ``piscem-infer``
will produce several output files. All of them will share the same *stem*. If we pass a stem that is prefixed with some path
(e.g. a directory) then this directory will be created if it doesn't exist. We also let ``piscem-infer`` know the library type
(i.e. how we expect the reads to map), where ``piscem-infer`` uses `salmon's library type specification <https://salmon.readthedocs.io/en/latest/salmon.html#what-s-this-libtype>`_.
Here we expect the library to be unstranded and the paired-end reads to map "inward" (i.e. facing each other).

.. code-block:: console

   $ ls -la quant/
   .rw-rw-r--@ 3.1k rob 30 Sep 12:33 SRR1039508.fld.pq
   .rw-rw-r--@ 628 rob 30 Sep 12:33 SRR1039508.meta_info.json
   .rw-rw-r--@ 33M rob 30 Sep 12:33 SRR1039508.quant

The file ``SRR1039508.quant`` contains the quantification estimates, in a format very similar to a ``salmon`` ``quant.sf`` file.
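The ``.quant`` file is a plain tab-separated table whose columns (``target_name``, ``eelen``, ``tpm``, ``ecount``) match the header written by ``write_results`` in this PR. A hypothetical sketch of reading it downstream (the ``QuantRecord`` type and ``parse_quant`` helper are illustrative, not part of ``piscem-infer``):

```rust
// Illustrative parser for the tab-separated .quant output.
// The column layout comes from the header piscem-infer writes:
// target_name \t eelen \t tpm \t ecount
#[derive(Debug, PartialEq)]
struct QuantRecord {
    target_name: String,
    eelen: f64,
    tpm: f64,
    ecount: f64,
}

fn parse_quant(contents: &str) -> Vec<QuantRecord> {
    contents
        .lines()
        .skip(1) // skip the header line
        .filter(|l| !l.is_empty())
        .map(|line| {
            let mut f = line.split('\t');
            QuantRecord {
                target_name: f.next().unwrap().to_string(),
                eelen: f.next().unwrap().parse().unwrap(),
                tpm: f.next().unwrap().parse().unwrap(),
                ecount: f.next().unwrap().parse().unwrap(),
            }
        })
        .collect()
}

fn main() {
    // A tiny made-up example in the .quant layout.
    let example = "target_name\teelen\ttpm\tecount\nENST00000456328.2\t1543.210\t12.345\t100.000\n";
    let recs = parse_quant(example);
    println!("{} record(s); first target: {}", recs.len(), recs[0].target_name);
}
```

In practice you would read the file with ``std::fs::read_to_string`` (or a CSV reader configured for tabs) rather than an in-memory string.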



.. |fastqdl| replace:: ``fastq-dl``
.. _fastqdl: https://github.com/rpetit3/fastq-dl

33 changes: 31 additions & 2 deletions src/utils/io.rs
@@ -11,6 +11,8 @@ use std::fs::File;
 use std::io::Write;
 use std::path::Path;
 
+use tracing::warn;
+
 pub(crate) fn write_results(
     output: &Path,
     hdr: &rad_types::RadHeader,
@@ -19,10 +21,37 @@ pub(crate) fn write_results(
 ) -> anyhow::Result<()> {
     let mut ofile = File::create(output)?;
 
-    ofile.write_all("target_name\teelen\tecount\n".to_string().as_bytes())?;
+    ofile.write_all("target_name\teelen\ttpm\tecount\n".to_string().as_bytes())?;
 
+    const ONE_MILLION: f64 = 1_000_000.0;
+    let denom: f64 = e_counts
+        .iter()
+        .zip(eff_lengths.iter())
+        .map(|(c, l)| if *l > 0.0 { c / l } else { 0.0_f64 })
+        .sum::<f64>();
+    let inv_denom: f64 = if denom > 0.0 {
+        ONE_MILLION / denom
+    } else {
+        warn!("The sum of ecount / eeln for all transcripts was 0. It seems likely that no fragments were quantified. Please check the input sample!");
+        0.0
+    };
+    let tpms: Vec<f64> = e_counts
+        .iter()
+        .zip(eff_lengths.iter())
+        .map(|(c, l)| {
+            if *l > 0.0 {
+                inv_denom * (c / l)
+            } else {
+                0.0_f64
+            }
+        })
+        .collect();
+
     for (i, name) in hdr.ref_names.iter().enumerate() {
-        let l = format!("{}\t{:.3}\t{:.3}\n", name, eff_lengths[i], e_counts[i]);
+        let l = format!(
+            "{}\t{:.3}\t{:.3}\t{:.3}\n",
+            name, eff_lengths[i], tpms[i], e_counts[i]
+        );
         ofile.write_all(l.as_bytes())?;
     }
 
