galaxy-genome-annotation · bgruening · Nov 2, 2023 · Jul 27, 2023 · Nov 1, 2023 · Nov 1, 2023
diff --git a/tools/repeatexplorer2/.shed.yml b/tools/repeatexplorer2/.shed.yml
@@ -0,0 +1,11 @@
+---
+# Tool Shed metadata for the RepeatExplorer2 wrapper.
+
+# Repository identity
+name: repeatexplorer2
+owner: gga
+type: unrestricted
+categories: [Genome annotation]
+
+# Short and long descriptions shown for the repository
+description: Tool for annotation of repeats from unassembled shotgun reads.
+long_description: |
+  Tool for annotation of repeats from unassembled shotgun reads.
+
+# Upstream project and the location of this wrapper's sources
+homepage_url: https://github.com/repeatexplorer/repex_tarean
+remote_repository_url: https://github.com/galaxy-genome-annotation/galaxy-tools/tree/master/tools/repeatexplorer2
diff --git a/tools/repeatexplorer2/macros.xml b/tools/repeatexplorer2/macros.xml
@@ -0,0 +1,21 @@
+<macros>
+    <!-- Version of the wrapped tool; used in the tool version string and as the Docker image tag below. -->
+    <token name="@TOOL_VERSION@">2.3.8</token>
+    <!-- Wrapper revision, appended to the tool version as +galaxy<suffix>. -->
+    <token name="@VERSION_SUFFIX@">0</token>
+    <!-- Galaxy profile the tool declares. -->
+    <token name="@PROFILE@">23.0</token>
+    <!-- Runtime: the tool runs inside the upstream Docker image tagged with @TOOL_VERSION@. -->
+    <xml name="requirements">
+        <requirements>
+            <container type="docker">kavonrtep/repeatexplorer:@TOOL_VERSION@</container>
+        </requirements>
+    </xml>
+    <!--
+        Citations: the software repository plus the publications that the
+        generated report lists under "How to cite".
+    -->
+    <xml name="citations">
+        <citations>
+            <citation type="bibtex">@software{repeatexplorer2,
+                author = {repeatexplorer},
+                year = {2023},
+                title = {repeatexplorer2},
+                publisher = {GitHub},
+                url = {https://github.com/repeatexplorer/repex_tarean}
+                      }</citation>
+            <!-- RepeatExplorer web server paper (Bioinformatics 29:792-793). NOTE(review): confirm DOI. -->
+            <citation type="doi">10.1093/bioinformatics/btt054</citation>
+            <!-- TAREAN: satellite repeat detection (Nucleic Acids Research 45:e111). -->
+            <citation type="doi">10.1093/nar/gkx257</citation>
+            <!-- REXdb: classification of transposable element protein domains (Mobile DNA 10:1). -->
+            <citation type="doi">10.1186/s13100-018-0144-1</citation>
+        </citations>
+    </xml>
+</macros>
diff --git a/tools/repeatexplorer2/repex_full_clustering.xml b/tools/repeatexplorer2/repex_full_clustering.xml
@@ -0,0 +1,259 @@
+<tool id="repeatexplorer2" name="RepeatExplorer2 clustering:" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+  <description>Improved version of repeat discovery and characterization using graph-based sequence clustering</description>
+  <macros>
+    <import>macros.xml</import>
+  </macros>
+  <expand macro="requirements"/>
+  <command><![CDATA[
+
+      ## Convert the memory allotment from MB to kB for the max_memory option
+      ## (falls back to 8192 MB when GALAXY_MEMORY_MB is unset)
+      export GALAXY_MEMORY_KB=\$((\${GALAXY_MEMORY_MB:-8192}*1024))
+      &&
+
+      ## Pin the Python hash seed (presumably for reproducible runs; confirm with upstream)
+      export PYTHONHASHSEED=0
+      &&
+
+      ## output will go here
+      mkdir -p '${ReportFile.extra_files_path}'
+      &&
+
+      /repex_tarean/seqclust
+      --cpu \${GALAXY_SLOTS:-1}
+      --max_memory \${GALAXY_MEMORY_KB}
+      ## The boolean renders as the paired flag or as an empty string. It is left
+      ## unquoted so that the unchecked case adds no empty positional argument.
+      ${paired}
+      #if $sample:
+        --sample '${sample}'
+      #end if
+      --taxon '${taxon}'
+      --output_dir='${ReportFile.extra_files_path}'
+      #if $advanced.mincl:
+        --mincl '${advanced.mincl}'
+      #end if
+      --assembly_min '${advanced.assembly_min}'
+      #if $advanced.keep_names:
+        --keep_names
+      #end if
+      '${FastaFile}'
+      &&
+
+      ## archive the output
+      tar -cvf '${ReportArchive}' --directory='${ReportFile.extra_files_path}' .
+      &&
+
+      ## pick up the html index (collected as the ReportFile output via from_work_dir)
+      cp '${ReportFile.extra_files_path}/index.html' ./index.html
+
+      ]]></command>
+  <inputs>
+    <!-- Reads to cluster; passed to seqclust as the final positional argument. -->
+    <!-- NOTE(review): the tests below upload this input as ftype fasta.gz while format only lists fasta; confirm whether gzipped input is meant to be accepted. -->
+    <param name="FastaFile" label="NGS reads" type="data" format="fasta" help="Input file must contain FASTA-formatted NGS reads. Illumina paired-end reads are recommended."/>
+    <!-- Renders as the flag itself when checked and as an empty string when unchecked. -->
+    <param argument="--paired" type="boolean" truevalue="--paired" falsevalue="" checked="True" label="Paired-end reads" help="If paired-end reads are used, they must be interleaved and all pairs must be complete. Example of the correct format is provided in the help below."/>
+    <!-- Optional: the command only adds the option when a value is given. -->
+    <param argument="--sample" type="integer" optional="true" label="Subsample reads (number)" help="Use an integer &gt; 1 to select a specific number of reads to use. Leave this field blank to use the entire dataset."/>
+    <!-- Single-choice select: exactly one option carries selected="true" (the 3.0 Viridiplantae database). -->
+    <param argument="--taxon" label="Select taxon and protein domain database version (REXdb)" type="select" help="Reference database of transposable element protein domains - REXdb - is used for annotation of repeats">
+      <option value="VIRIDIPLANTAE3.0" selected="true">Viridiplantae version 3.0 </option>
+      <option value="VIRIDIPLANTAE2.2">Viridiplantae version 2.2</option>
+      <option value="METAZOA3.0">Metazoa version 3.0</option>
+      <option value="METAZOA2.0">Metazoa version 2.0</option>
+    </param>
+    <section name="advanced" title="Advanced options" expanded="false">
+      <!-- Optional: the command only adds the option when a value is given. -->
+      <param argument="--mincl" label="Cluster size threshold  for detailed analysis" type="float" value="" min="0.0001" max="100" optional="true" help="Minimal size (as percentage of input reads) of the smallest cluster which is analyzed; clusters with less than 20 reads are not considered."/>
+      <!-- Always passed on the command line. -->
+      <param argument="--assembly_min" type="integer" label="Minimal cluster size for assembly" value="5" min="2" max="100"/>
+      <!-- The command adds the flag only when this is checked. -->
+      <param argument="--keep_names" label="Keep original read names" type="boolean" checked="false" help="By default, reads are renamed using integers. Use this option to keep original names."/>
+    </section>
+  </inputs>
+  <outputs>
+    <!-- Tar archive of the whole report directory, written by the tar step of the command. -->
+    <data name="ReportArchive"
+          format="tar"
+          label="RepeatExplorer2 - Archive with HTML report from data ${FastaFile.hid}"/>
+    <!-- Report entry page, collected from index.html in the job working directory. -->
+    <data name="ReportFile"
+          format="html"
+          from_work_dir="index.html"
+          label="RepeatExplorer2 - HTML report from data ${FastaFile.hid}"/>
+  </outputs>
+  <tests>
+    <!--
+        All tests cluster the same 10k paired-read sample. The archive is checked
+        by approximate size and the HTML report by its title text.
+        NOTE(review): each output also names a reference file via file=; confirm
+        that this comparison is intended alongside the assertions.
+    -->
+    <!-- test1: basic function -->
+    <test expect_num_outputs="2">
+      <param name="FastaFile" value="LAS_paired_10k.fa.gz" ftype="fasta.gz"/>
+      <param name="paired" value="True"/>
+      <param name="taxon" value="VIRIDIPLANTAE3.0"/>
+      <output name="ReportArchive" file="test1_out.tar">
+        <assert_contents>
+          <has_size value="33873920" delta="3000000"/>
+        </assert_contents>
+      </output>
+      <!-- The assertion belongs inside the output element it checks. -->
+      <output name="ReportFile" file="test1_out.html">
+        <assert_contents>
+          <has_text text="Clustering summary"/>
+        </assert_contents>
+      </output>
+    </test>
+    <!-- test2: read subsample -->
+    <test expect_num_outputs="2">
+      <param name="FastaFile" value="LAS_paired_10k.fa.gz" ftype="fasta.gz"/>
+      <param name="paired" value="True"/>
+      <param name="sample" value="5000"/>
+      <param name="taxon" value="VIRIDIPLANTAE3.0"/>
+      <output name="ReportArchive" file="test2_out.tar">
+        <assert_contents>
+          <has_size value="17981440" delta="3000000"/>
+        </assert_contents>
+      </output>
+      <output name="ReportFile" file="test2_out.html">
+        <assert_contents>
+          <has_text text="Clustering summary"/>
+        </assert_contents>
+      </output>
+    </test>
+    <!-- test3: advanced params -->
+    <test expect_num_outputs="2">
+      <param name="FastaFile" value="LAS_paired_10k.fa.gz" ftype="fasta.gz"/>
+      <param name="paired" value="True"/>
+      <param name="taxon" value="VIRIDIPLANTAE3.0"/>
+      <!-- These parameters live in the "advanced" section of the inputs, so they are addressed through it. -->
+      <section name="advanced">
+        <param name="mincl" value="0.01"/>
+        <param name="keep_names" value="True"/>
+      </section>
+      <output name="ReportArchive" file="test3_out.tar">
+        <assert_contents>
+          <has_size value="33873920" delta="3000000"/>
+        </assert_contents>
+      </output>
+      <output name="ReportFile" file="test3_out.html">
+        <assert_contents>
+          <has_text text="Clustering summary"/>
+        </assert_contents>
+      </output>
+    </test>
+  </tests>
+  <help><![CDATA[
+      **HELP**
+
+      RepeatExplorer2 clustering is a computational pipeline for unsupervised
+      identification of repeats from unassembled sequence reads. The
+      pipeline uses low-pass whole genome sequence reads and performs graph-based
+      clustering. The resulting clusters, representing all types of repeats, are then
+      examined to identify the repeats and classify them into repeat groups.
+
+      **Input data**
+
+      The analysis requires either **single** or **paired-end reads** generated
+      by whole genome shotgun sequencing provided as a single fasta-formatted file.
+      Generally, paired-end reads provide significantly better results than single
+      reads. Reads should be of uniform length (optimal size range is 100-200 nt) and
+      the number of analyzed reads should represent less than 1x genome equivalent
+      (genome coverage of 0.01 - 0.50 x is recommended). Reads should be
+      quality-filtered (recommended filtering : quality score >=10 over 95% of bases
+      and no Ns allowed) and only **complete read pairs** should be submitted for
+      analysis. When paired reads are used, the input data must be in **interlaced** format
+      as a fasta file:
+
+      example of interlaced input format::
+
+        >0001_f
+        CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG
+        >0001_r
+        GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT
+        >0002_f
+        ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG
+        >0002_r
+        TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
+        >0003_f
+        TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
+        >0003_r
+        TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
+        ...
+
+
+      **Comparative analysis**
+
+      For comparative analysis, sequence names must contain a code (prefix) for each group.
+      The prefix in sequence names must be of fixed length.
+
+      Example of labeling two groups, where the **group code length** is 2 and the codes are used to distinguish the groups (AA and BB) ::
+
+        >AA0001_f
+        CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG
+        >AA0001_r
+        GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT
+        >AA0002_f
+        ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG
+        >AA0002_r
+        TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
+        >BB0001_f
+        TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
+        >BB0001_r
+        TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
+        >BB0002_f
+        TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
+        >BB0002_r
+        TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
+
+
+      To prepare a quality-filtered and interlaced input fasta file from fastq
+      files, use the `Preprocessing of paired-reads`__ tool.
+
+      .. __: tool_runner?tool_id=paired_fastq_filtering
+
+
+      **Additional parameters**
+
+      **Sample size** defines how many reads should be used in calculation.
+      Default setting with 500,000 reads will enable detection of high copy
+      repeats within several hours of computation time. For higher
+      sensitivity the sample size can be set higher. Since sample size affects
+      the memory usage, this parameter may be automatically adjusted to lower
+      value during the run. Maximum sample size which can be processed depends on
+      the repetitiveness of analyzed genome.
+
+
+      **Select taxon and protein domain database version (REXdb)**. Classification
+      of transposable elements is based on the similarity to our reference database
+      of transposable element protein domains (**REXdb**). Standalone database for Viridiplantae species
+      can be obtained on `repeatexplorer.org`__. Classification
+      system used in REXdb is described in article `Systematic survey of plant
+      LTR-retrotransposons elucidates phylogenetic relationships of their
+      polyprotein domains and provides a reference for element classification`__.
+      The database for Metazoa species is still under development, so use it with caution.
+
+      .. __: http://repeatexplorer.org
+      .. __: https://doi.org/10.1186/s13100-018-0144-1
+
+      **Select parameters for protein domain search** REXdb is compared with
+      sequence clusters using either the blastx or the diamond aligner. The diamond program
+      is about three times faster than blastx with word size 3.
+
+      **Similarity search options** By default sequence reads are compared using
+      mgblast program. Default threshold is explicitly set to 90% sequence
+      similarity spanning at least 55% of the read length (in the case of reads
+      differing in length it applies to the longer one). Additionally, sequence
+      overlap must be at least 55 nt. If you select the option for reads shorter
+      than 100 nt, the minimum overlap of 55 nt is not required.
+
+      By default,
+      the mgblast search uses the DUST program to filter out
+      low-complexity sequences. If you want
+      to increase the sensitivity of detection of satellites with shorter monomers,
+      use the option '*no masking of low complexity repeats*'. Note that omitting
+      DUST filtering will significantly increase running times.
+
+
+      **Automatic filtering of abundant satellite repeats** performs clustering on
+      a smaller dataset of sequence reads to detect abundant high-confidence
+      satellite repeats. If such satellites are detected, sequence reads derived
+      from these satellites are depleted from the input dataset. This step enables more
+      sensitive detection of less abundant repeats, as more reads can be used
+      in the clustering step.
+
+      **Use custom repeat database**. This option allows users to perform similarity
+      comparison of identified repeats to their custom databases. The repeat class must
+      be encoded in FASTA headers of database entries in order to allow correct 
+      parsing of similarity hits. Required format for custom database sequence name is: ::
+
+        >repeatname#class/subclass
+
+
+      **Output**
+
+      List of clusters identified as putative satellite repeats, their genomic
+      abundance and various cluster characteristics. 
+
+      Output includes an **HTML summary** with a table listing all analyzed
+      clusters. More detailed information about clusters is provided in
+      additional files and directories. All results are also provided as a
+      downloadable **tar archive**. Additionally, a **log file** reporting
+      the progress of the computational pipeline is provided.
+
+      ]]></help>
+  <expand macro="citations"/>
+</tool>
diff --git a/tools/repeatexplorer2/test-data/LAS_paired_10k.fa.gz b/tools/repeatexplorer2/test-data/LAS_paired_10k.fa.gz
diff --git a/tools/repeatexplorer2/test-data/test1_out.html b/tools/repeatexplorer2/test-data/test1_out.html
@@ -0,0 +1,64 @@
+
+<html xmlns:mml="http://www.w3.org/1998/Math/MathML">
+  <head>
+    <meta charset="utf-8"/>	
+    <title> Clustering summary </title>
+    <link rel="stylesheet" href="style1.css">
+  </head>
+
+ <h1 > Clustering Summary</h1>
+<a href="summary_histogram.png"> <img src="summary_histogram.png" width="700" border="1" > </a><p> <b> Graphical summary of the clustering results. </b> Bars represent superclusters, with their heights and widths corresponding to the numbers of reads in the superclusters (y-axis) and to their proportions in all analyzed reads (x-axis), respectively. Rectangles inside the supercluster bars represent individual clusters. If the filtering of abundant satellites was performed, the affected clusters are shown in green, and their sizes correspond to the adjusted values. Blue and pink background panels show proportions of reads that were clustered and remained single, respectively. Top clusters are on the left of the dotted line. </p><hr><br><br>
+ <h2 > Run information:</h2>
+
+<p class='character'>Number of input reads: 10000</p>
+
+<p class='character'>Number of analyzed reads: 10000</p>
+
+<p class='character'>Proportion of reads in top clusters : 14 %</p>
+
+<p class='character'>Cluster merging: No</p>
+
+<p class='character'>Paired-end reads: Yes</p>
+
+ <h2 > Available analyses:</h2>
+<p> <a href="tarean_report.html">Tandem repeat analysis</a> </p><p> <a href="cluster_report.html">Cluster annotation</a> </p><p> <a href="supercluster_report.html">Supercluster annotation</a> </p><p> <a href="summarized_annotation.html">Repeat annotation summary</a> </p>
+ <h2 > Supplementary files:</h2>
+<p> <a href="CLUSTER_TABLE.csv">CLUSTER_TABLE.csv</a> </p><p> <a href="SUPERCLUSTER_TABLE.csv">SUPERCLUSTER_TABLE.csv</a> </p><p> <a href="contigs.fasta">contigs.fasta</a> </p><hr>
+
+<h3> How to cite </h3>
+<p>
+	Novak, P., Neumann, P., Pech, J., Steinhaisl, J., Macas, J. (2013) -
+	  <a href="http://bioinformatics.oxfordjournals.org/content/29/6/792">RepeatExplorer: a Galaxy-based web server for genome-wide characterization of eukaryotic repetitive elements from next generation sequence reads.</a> <i> Bioinformatics</i> <b>29</b>:792-793.
+</p>
+
+<p><i> Classification of repetitive elements using REXdb:</i></p>
+<p>Neumann, P., Novak, P., Hostakova, N., Macas, J. (2019) &#8211; <a href="https://mobilednajournal.biomedcentral.com/articles/10.1186/s13100-018-0144-1" target="_blank">Systematic survey of plant LTR-retrotransposons elucidates phylogenetic relationships of their polyprotein domains and provides a reference for element classification</a>. <em>Mobile DNA</em> <b>10</b>:1.</p>
+
+</p>
+<i>The principle of repeat identification implemented in the RepeatExplorer:</i>
+<p>
+	  Novak, P., Neumann, P., Macas, J. (2010) - <a href="http://www.biomedcentral.com/1471-2105/11/378">Graph-based clustering and characterization of repetitive sequences in next-generation sequencing data.</a> <i>BMC Bioinformatics</i> <b>11</b>:378.
+</p>
+<i>Using TAREAN for satellite repeat detection and characterization:</i>
+<p>
+  Novak, P., Robledillo, L.A.,Koblizkova, A., Vrbova, I., Neumann, P., Macas, J. (2017) -
+    <a href="https://doi.org/10.1093/nar/gkx257"> TAREAN: a computational tool for identification and characterization of satellite DNA from unassembled short reads.</a> <i> Nucleic Acid Research </i> <b>45</b>:e111
+</p>
+<br><hr>
+ <h3 > Details:</h3>
+<pre>
+--------------------------------------------------------------------------
+PIPELINE VERSION         : devel-0.3.8-2917(e753f81)
+
+PROTEIN DATABASE VERSION : protein_database_viridiplantae_v3.0.fasta
+            md5 checksum : a36362f4e8b024f1ce97589aac1e6f1a
+
+DNA DATABASE VERSION     : dna_database_masked.fasta
+            md5 checksum : 86bab7cdd3e70374cd756de13680240d
+--------------------------------------------------------------------------
+</pre>
+<p class='character'>Minimal number of reads in cluster to be considered top cluster : 20</p>
+
+<p class='character'>Reserved Memory : 23G</p>
+
+<p class='character'>Maximum number of processable reads with the reserved memory : 1353221</p>