Merge pull request #5 from mshakya/master

merging
LANL-Bioinformatics · Jul 10, 2019 · 56e1b67 · 56e1b67
2 parents 51af384 + ca30292
commit 56e1b67
Show file tree

Hide file tree

Showing 5 changed files with 225 additions and 28 deletions.
diff --git a/docs/conf.py b/docs/conf.py
@@ -19,14 +19,14 @@
 
 # -- Project information -----------------------------------------------------
 
-project = 'phame'
-copyright = '2018, migun'
-author = 'migun'
+project = 'PhaME'
+copyright = '2018, LANL'
+author = 'Migun Shakya, Chien-Chi Lo, Sanaa Ahmed, Mark Flynn, Patrick S.G Chain'
 
 # The short X.Y version
-version = ''
+version = '1'
 # The full version, including alpha/beta/rc tags
-release = ''
+release = '1.0.2'
 
 
 # -- General configuration ---------------------------------------------------
@@ -66,7 +66,7 @@
 exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
 
 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = None
+pygments_style = 'sphinx'
 
 
 # -- Options for HTML output -------------------------------------------------
@@ -106,7 +106,7 @@
 # -- Options for HTMLHelp output ---------------------------------------------
 
 # Output file base name for HTML help builder.
-htmlhelp_basename = 'phame'
+htmlhelp_basename = 'PhaME'
 
 
 # -- Options for LaTeX output ------------------------------------------------
@@ -133,8 +133,8 @@
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-    (master_doc, 'phame.tex', 'phame Documentation',
-     'migun', 'manual'),
+    (master_doc, 'phame.tex', 'PhaME',
+     'PhaME Development Team', 'manual'),
 ]
 
 
@@ -143,7 +143,7 @@
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
 man_pages = [
-    (master_doc, 'phame', 'phame Documentation',
+    (master_doc, 'PhaME', 'PhaME',
      [author], 1)
 ]
 
@@ -154,8 +154,8 @@
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-    (master_doc, 'phame', 'phame Documentation',
-     author, 'phame', 'One line description of project.',
+    (master_doc, 'PhaME',
+     author, 'PhaME', 'One line description of project.',
      'Miscellaneous'),
 ]
 

diff --git a/docs/introduction/introduction.rst b/docs/introduction/introduction.rst
@@ -4,17 +4,199 @@ Introduction
 What is PhaME?
 ==============
 
+PhaME or Phylogenetic and Molecular Evolution (PhaME) is an analysis tool that performs phylogenetic and molecular evolutionary analysis.
 
+Given a reference, PhaME extracts SNPs from complete genomes, draft genomes and/or reads, uses SNP multiple sequence alignment to construct a phylogenetic tree, and provides evolutionary analyses (genes under positive selection) using CDS SNPs.
 
-PhaME or Phylogenetic and Molecular Evolution (PhaME) analysis tool allows suite of analysis pertaining to phylogeny and moleuclar evolution.
 
-Given a reference, PhaME extracts SNPs from complete genomes, draft genomes and/or reads, uses SNP multiple sequence alignment to construct a phylogenetic tree, and provides evolutionary analyses (genes under positive selection) using CDS SNPs.
+PhaME: Under the hood.
+======================
+
+Here, We have explained in detail how PhaME works in detail.
+
+Selecting Reference genome:
+-----------------------------
+PhaME is a reference genome based tool where all input genomes and metagenomes are first aligned against a reference genome. As a first step in the PhaME pipeline, given a set of reference genomes in path set in parameter `refdir` `phame.ctl` file, one genome is picked as the reference genome. PhaME provides multiple options to pick the most appropriate reference genome. First, reference genome can be picked randomly from the set. Second, users can select reference genome from the list. Lastly, users can use MinHash option, which will calculate the MinHash distance between all reference genomes with each other and with input contigs and raw reads to pick a genome that has the shortest average distance with each other. MinHash distances are calculated using BBMap [Bushnell]_.
+
+Self-nucmerization to remove repeats from reference genomes:
+---------------------------------------------------------------
+PhaME is built on alignment tool nucmer [Kurtz 2004]_ for genome alignment. All genomes in fasta format that are complete and included as set of reference genomes are first aligned with self, called `self-nucmerization`, using `nucmer` to remove repeats within a genome. Following `nucmer` command is used for the self-nucmerization step:
+
+::
+
+    nucmer --maxmatch --nosimplify --prefix=seq_seq$$ ref_genomeA.fasta ref_genomeA.fasta
+
+::
+
+The option `--maxmatch`  reports all matches is used to ensure all possible alignments are reported for maximal removal of repeats. The identified repeat regions are then removed from downstream analyses.
+
+Genome Alignments
+--------------------------------
+All genomes that are in `refdir` are first aligned against the reference genome (see section 1) that have had its repeats removed (section 2). Likewise, incomplete genomes or contigs, the ones that are listed in the `workdir` with extension `.contig` are also aligned against the reference genome using `nucmer`. For aligning genomes in fasta format against each other, following command, same as the previous step for nucmer alignment is used:
+
+::
+
+    nucmer --maxmatch refgenome.fasta genome.fasta
+
+::
+
+All other options in nucmer alignments are kept at default, some of the important ones are listed below:
+
+::
+
+   -b|breaklen     Set the distance an alignment extension will attempt to
+                    extend poor scoring regions before giving up (default 200)
+    -c|mincluster   Sets the minimum length of a cluster of matches (default 65)
+    -D|diagdiff     Set the maximum diagonal difference between two adjacent
+                    anchors in a cluster (default 5)
+    -d|diagfactor   Set the maximum diagonal difference between two adjacent
+                    anchors in a cluster as a differential fraction of the gap
+                    length (default 0.12)
+    --[no]extend    Toggle the cluster extension step (default --extend)
+    -g|maxgap       Set the maximum gap between two adjacent matches in a
+                    cluster (default 90)
+    -l|minmatch     Set the minimum length of a single match (default 20)
+
+::
+
+Also, `nucmer` only aligns `"A"` `"T"` `"C"` `"G"`, all other characters are ignored. So, if there are `"N"`s in the provided genomes, thse positions are not included in the alignment.
+
+*Note*: If an analysis requires running multiple iterations of PhaME on a same set of dataset or a subset of dataset, one doesn't need to perform the alignment over and over again. PhaME provides an option where it can keep all possible pairwise alignment of genomes from `refdir` for future analyses. All the steps mentioned in this section are the same, except that all vs. all alignment is performed compared to just one reference. By doing all vs. all alignment one can also test the effect on their analyses with different reference genomes.
+
+Mapping of raw reads to reference genome
+-------------------------------------------
+PhaME as of now only processes short raw reads from Illumina. If raw reads, single or paired end, are included in the analyses, they are mapped to the reference genome using either `bowtie2` or `BWA`. For reads mapping of reference genome, following commands are used:
+
+First, it builds database from the reference genome.
+::
+
+    bowtie2-build refgenome refgenome
+
+::
+or, if BWA was chosen as the preferred aligner:
+
+::
+
+    bwa index refgenome
+
+::
+
+The raw reads are then mapped to the reference genomne using one of the following commands:
+
+For bowtie2 and paired reads:
+
+::
+
+    bowtie2 -a -x $refgenome -1 read1 -2 read2 -S paired.sam`;
+
+::
+The option `-a` reports all possible alignments.
+
+For bowtie2 and single end reads:
+
+::
+
+    bowtie2 -a -x $refgenome -U read -S single.sam`;
+
+::
+
+For BWA and paired reads:
+
+::
+
+    bwa mem refgenome read1 read2 | samtools view -ubS -| samtools sort -T tmp_folder -O BAM -o paired.bam
+
+::
+
+For BWA and single end reads:
+
+::
+
+    bwa mem refgenome read |samtools view -ubS - | samtools sort -T tmp_folder -O BAM -o single.bam
+
+::
+
+
+Filtering genome alignments
+------------------------------
+Genome alignment produced using `nucmer` are filtered using `delta-filter` to only keep 1 to 1 alignments allowing for rearrangements. This filtering step is produced for all `nucmer` alignments.
+
+::
+
+    delta-filter -1 genome.delta > genome.snpfilter
+
+::
+
+
+Calling SNPs from genome alignments
+--------------------------------------
+The pairwise `nucmer` alignments are then parsed to produce a SNP table using `show-snps`.
+
+::
+
+    show-snps -CT genome.snpfilter > genome.snps
+
+::
+
+Here, option C and T specifies not to report SNPs from ambiguous alignments and report the output in tab delimited file respectively.
+
+Reporting nucmer alignments
+------------------------------
+
+Each alignments are further parse to produce a tab delimited file that has information on regions and %ID of their alignments.
+::
+
+    show-coords -clTr genome.snpfilter > genome.coords
+
+::
+
+The parameter flag -clTr implies different headers to be reported in the report.
+
+::
+
+-c          Include percent coverage information in the output
+-l          Include the sequence length information in the output
+-r          Sort output lines by reference IDs and coordinates
+-T          Switch output to tab-delimited format
+
+::
+
+Calling SNPs from read mapping
+---------------------------------
+`bcftools mpileup` is used for calling SNPs from read mapping results (bam file) of every genomes represented by raw reads. Maximum depth is set to 1000000 for both SNP and indel calling and minimum gaps for calling an indel is set to 3. The output vcf file is then used to call SNPs using `bcftools call` where ploidy is specified as `1` if its a haploid or bacterial genome, else it is called using default parameter. Furthermore, based on the user specified parameter in the control file, SNPs are further filtered based on percentage of SNPs. Here are the snippets of commmand that are run as part of this. All of them result in a vcf file.
+
+::
+
+    bcftools mpileup -d 1000000 -L 1000000 -m 3 -Ov -f $refgenome $bam_output | bcftools call --ploidy 1 -cO b > $bcf_output;
+    bcftools view -v snps,indels,mnps,ref,bnd,other -Ov $bcf_output | vcfutils.pl varFilter -a$min_alt_bases -d$min_depth -D$max_depth > $vcf_output`;
+    bcftools filter -i '(DP4[0]+DP4[1])==0 || (DP4[2]+DP4[3])/(DP4[0]+DP4[1]+DP4[2]+DP4[3]) > $snp_filter' $vcf_output > $vcf_filtered`
+
+::
+
+
+Calculating core genome alignments
+----------------------------------
+As a first step in calculating the core genome, all alignments to reference are checked for linear coverage to assure the proportion of reference genome that was used in the alignment. If its lower than the threshold cutoff (default = 0.6) set in control file, that genome will be removed from further analyses. Then rest of the pairwise alignments that are either in vcf format or nucmer formats are then collated to calculate a core genome. Only the alignment position that are 100% conserved are kept, all other positions are removed from the final core genome alignment. PhaME produces multiple alignment files corresponding to core genome such as the one that has only the variant sites (`_all_snp_alignment.fna`), has variant and invariant sites (`all_alignment.fna`), and the ones that have SNPs from only the coding region (`_cds_snp_alignment.fna`). The coding region SNP alignment requires a GFF formatted annotation file.
+
 
+Reconstructing core genome phylogeny
+-------------------------------------
+PhaME provides multiple tools (RAxML [Stamatakis 2014]_, FastTree [Price 2010]_, and IQ-Tree [Nguyen 2015]_) to reconstruct phylogeny from one core genome alignments that have invariant sites. If RAxML or FastTree option is chosen, users cannot modify the models as they are pre-selected. RAxML trees are reconstructed using GTRGAMMAI models that "GTR + Optimization of substitution rates + GAMMA model of rate heterogeneity (alpha parameter will be estimated)" with `I` but with estimate for invariable sites. FastTree uses GTR model only. IQ-TREE is run using option `-m TEST` that searches for the best model that fits the data before reconstructing the phylogeny. RAxML is the only option that is currently available that can also calculate the bootstraps.
 
-Quick usage
-===========
-To quickly get started with PhaME, you can install using conda.
+Selecting genes for molecular evolutionary analyses
+-------------------------------------------------------
+To perform selection analyses using PAML or HyPhy, codon alignments of genes are required. Based on the position of SNPs in the reference genome, if a SNP is within a coding region and if that coding region does not have a gap, they are extracted from the core genome alignment. The nucleotide sequences of the genes are then translated to protein sequences, aligned using the program `mafft`, and then reverse translated back to nucleotide using the perl code `pal2nal.pl` from http://www.bork.embl.de/pal2nal/.
 
-.. code-block:: console
+Molecular Evoluationary analyses
+------------------------------------
+The set of gene alignments are then used for molecular evolutionary analyses using either PAML [Yang 2007]_ or HyPhy [Pond 2005]_. PAML is run twice for the same gene using two differnt models (`model=0` and `model=2`), first that sets one omega ratio for all branches and another that sets different omega ratios for all lineages. For the first model, additional parameter variation model that specifies, neutral (1), selection (2), beta and omega ratio between 0 and 1 (7), and beta, omega and an additional omega is run. For the Model 2 with variable omega ratios across all branches, the model with one omega across all sites are used. If HyPhy is selected, it uses the aBSREL (adaptive Branch-Site Random Effects Likelihood) model.
 
-    conda install phame
+References
+--------------
+.. [Yang 2007] Yang Z: PAML 4: phylogenetic analysis by maximum likelihood. Mol Biol Evol 2007, 24:1586-1591.
+.. [Pond 2005] Pond SL, Frost SD, Muse SV: HyPhy: hypothesis testing using phylogenies. Bioinformatics 2005, 21:676-679.
+.. [Kurtz 2004] Kurtz S, Phillippy A, Delcher AL, Smoot M, Shumway M, Antonescu C, Salzberg SL: Versatile and open software for comparing large genomes. Genome Biol 2004, 5:R12.
+.. [Bushnell] Bushnell B: BBMap. 37.66 edition. sourceforge.net/projects/bbmap/.
+.. [Stamatakis 2014] Stamatakis A: RAxML version 8: a tool for phylogenetic analysis and post- analysis of large phylogenies. Bioinformatics 2014, 30:1312-1313.
+.. [Price 2010] Price MN, Dehal PS, Arkin AP: FastTree 2--approximately maximum- likelihood trees for large alignments. PLoS One 2010, 5:e9490.
+.. [Nguyen 2015] Nguyen LT, Schmidt HA, von Haeseler A, Minh BQ: IQ-TREE: a fast and effective stochastic algorithm for estimating maximum-likelihood phylogenies. Mol Biol Evol 2015, 32:268-274.
diff --git a/lib/PhaME.pm b/lib/PhaME.pm
@@ -602,7 +602,8 @@ sub buildTree {
     if ( $tree == 1 || $tree == 4 ) {
         print OUT "Reconstructing phylogeny using FastTree\n";
         my $fasttree
-            = "export OMP_NUM_THREADS=$thread; FastTree -quiet -nt -gtr < $outdir/$name\_snp_alignment.fna > $outdir/$name\.fasttree 2>>$error \n\n";
+            = "export OMP_NUM_THREADS=$thread; FastTree -quiet -nt -gtr < $outdir/$name\_all_alignment.fna > $outdir/$name\.fasttree 2>>$error \n\n";
+            # = "export OMP_NUM_THREADS=$thread; FastTree -quiet -nt -gtr < $outdir/$name\_snp_alignment.fna > $outdir/$name\.fasttree 2>>$error \n\n";
         print OUT $fasttree;
         if ( system($fasttree) ) { die "Error running $fasttree.\n"; }
 
@@ -611,7 +612,8 @@ sub buildTree {
         print OUT "Reconstructing phylogeny using RaxML\n";
         print OUT "\n";
         my $raxml
-            = "raxmlHPC-PTHREADS -p 10 -T $thread -m GTRGAMMAI -s $outdir/$name\_snp_alignment.fna -w $outdir -n $name 2>>$error >> $log\n\n";
+            = "raxmlHPC-PTHREADS -p 10 -T $thread -m GTRGAMMAI -s $outdir/$name\_all_alignment.fna -w $outdir -n $name 2>>$error >> $log\n\n";
+            # = "raxmlHPC-PTHREADS -p 10 -T $thread -m GTRGAMMAI -s $outdir/$name\_snp_alignment.fna -w $outdir -n $name 2>>$error >> $log\n\n";
         print OUT $raxml;
         if ( system($raxml) ) { die "Error running $raxml.\n"; }
         # dont need a rooted tree, removing it for now
@@ -628,7 +630,7 @@ sub buildTree {
         print OUT "Also bootstraping IQ-Trees trees\n";
         print OUT "\n";
         my $iqtree
-            = "iqtree -m TEST -s $outdir/$name\_snp_alignment.fna -nt $thread 2>>$error >> $log\n\n";
+            = "iqtree -m TEST -s $outdir/$name\_all_alignment.fna -nt $thread 2>>$error >> $log\n\n";
         print OUT $iqtree;
         if ( system($iqtree) ) { die "Error running $iqtree.\n"; }
     }
@@ -674,6 +676,15 @@ sub bootstrap {
         if ( system($bestTree) ) { die "Error running $bestTree.\n"; }
 
     }
+    if ( $tree == 3 || $tree == 4 ) {
+        print OUT "Reconstructing phylogeny using IQ-tree after finding the best model\n";
+        print OUT "Also bootstraping IQ-Trees trees\n";
+        print OUT "\n";
+        my $iqtree
+            = "iqtree -m TEST -b $bootstrap -s $outdir/$name\_all_alignment.fna -nt $thread 2>>$error >> $log\n\n";
+        print OUT $iqtree;
+        if ( system($iqtree) ) { die "Error running $iqtree.\n"; }
+    }
 
     return "Bootstrap complete";
     close OUT;

diff --git a/src/phame b/src/phame
@@ -91,7 +91,7 @@ my $code       = 0;
 my $type;
 my $clean   = 1;
 my $threads = 1;
-my $cutoff  = 0;
+my $cutoff  = 0.6;
 my $annotation;
 my $genefile;
 my $runtime       = time;

diff --git a/test/ctl_files/t1_ebola_preads.ctl b/test/ctl_files/t1_ebola_preads.ctl
@@ -16,20 +16,24 @@
          data = 4  # *See below 0:only complete(F); 1:only contig(C); 2:only reads(R); 
                    # 3:combination F+C; 4:combination F+R; 5:combination C+R; 
                    # 6:combination F+C+R; 7:realignment  *See below 
+
         reads = 2  # 1: single reads; 2: paired reads; 3: both types present;
+
+        SNPsfilter = 0.6 #
 
-         tree = 1  # 0:no tree; 1:use FastTree; 2:use RAxML; 3:use both;
+         tree = 4  # 0:no tree; 1:use FastTree; 2:use RAxML; 3:use IQ tree 4:all;
     bootstrap = 0  # 0:no; 1:yes;  # Run bootstrapping  *See below
-            N = 10  # Number of bootstraps to run *See below    
+            N = 0  # Number of bootstraps to run *See below    
 
-    PosSelect = 1  # 0:No; 1:use PAML; 2:use HyPhy; 3:use both
+    PosSelect = 0  # 0:No; 1:use PAML; 2:use HyPhy; 3:use both
 
          code = 1  # 0:Bacteria; 1:Virus
 
-        clean = 0  # 0:no clean; 1:clean
+        clean = 1  # 0:no clean; 1:clean
 
       threads = 2  # Number of threads to use
 
-       cutoff = 0.0  # Linear alignment (LA) coverage against reference - ignores SNPs from organism that have lower cutoff.
+       cutoff = 0.5  # Linear alignment (LA) coverage against reference - ignores SNPs from organism that have lower cutoff.
+