-
Notifications
You must be signed in to change notification settings - Fork 6
API Tutorial
My goal in developing Transposome was to allow users to develop custom analysis pipelines, thereby making it easy to repeat certain procedures or combine Transposome methods with other code in creative ways. What follows are a number of computational recipes that are intended you give you an idea of what can be done. To keep the code simple, each example builds on the results from the previous example. Note that these examples, and others, can be found in the transposome-scripts repository. If you write your own scripts, consider adding them to that repository for others to use.
#!/usr/bin/env perl
use 5.012;
use strict;
use warnings;
use autodie qw(open);
use File::Basename;
use Transposome::SeqUtil;
my $usage = "$0 seqfile.fas\n";
my $infile = shift or die $usage;
my ($iname, $ipath, $isuffix) = fileparse($infile, qr/\.[^.]*/);
for my $sample_size (qw(100_000 200_000 300_000 400_000 500_000)) {
say STDERR "Starting sampling for sample size: ", $sample_size;
my $sequtil = Transposome::SeqUtil->new( file => $infile, sample_size => $sample_size, no_store => 1 );
my $out = $iname."_$sample_size".".fasta";
{ local *STDOUT; open STDOUT, '>', $out; $sequtil->sample_seq; }
}
See also the page for choosing the appropriate genome coverage for more discussion of sampling reads. Note the numbers may be given as "100_000" for readability, or simply as "100000."
2. Store sequences for easy retrieval#!/usr/bin/env perl
use 5.012;
use strict;
use warnings;
use Transposome::SeqUtil;
my $usage = "$0 seqfile.fas\n";
my $infile = shift or die $usage;
my $seq_obj = Transposome::SeqUtil->new( file => $infile, in_memory => 0 );
my ($seqs, $seqct) = $seq_obj->store_seq;
## now you can do a number of things, e.g., get a count
say STDERR "There are $seqct sequences in $infile.";
## retrieve a specific record, here the 'some_id_1' sequence
if (exists $seqs->{'some_id_1'}) {
say join "\n", ">some_id_1", $seqs->{'some_id_1'};
}
#!/usr/bin/env perl
use 5.012;
use strict;
use warnings;
use autodie qw(open);
use Getopt::Long;
use Transposome::SeqUtil;
my $usage = "$0 -f seqfile_1.fastq -r seqfile_2.fastq -s sample_size\n";
my $pair1;
my $pair2;
my $sample_size;
GetOptions(
'f|pair1=s' => \$pair1,
'r|pair2=s' => \$pair2,
's|sample_size=i' => \$sample_size,
);
die $usage if !$pair1 or !$pair2 or !$sample_size;
my $sequtil_pair1 = Transposome::SeqUtil->new( file => $pair1, sample_size => $sample_size, seed => 100, in_memory => 1 );
my $sequtil_pair2 = Transposome::SeqUtil->new( file => $pair2, sample_size => $sample_size, seed => 100, in_memory => 1 );
my ($seqs1, $seqs1_ct) = $sequtil_pair1->sample_seq;
my ($seqs2, $seqs2_ct) = $sequtil_pair2->sample_seq;
for my $seqs1_k (sort keys %$seqs1) {
my $seqs2_k = $seqs1_k; $seqs2_k =~ s/1$/2/;
if (exists $seqs2->{$seqs2_k}) {
say join "\n", ">".$seqs1_k, $seqs1->{$seqs1_k};
say join "\n", ">".$seqs2_k, $seqs2->{$seqs2_k};
}
}
NOTE: For a faster solution, I recommend sampling your data (with seqtk or Transposome) and then joining the files with Pairfq.
#!/usr/bin/env perl
use 5.012;
use strict;
use warnings;
use File::Basename;
use Transposome::Run::Blast;
use Transposome::PairFinder;
my $usage = "$0 seqfile.fas output_directory\n";
my $infile = shift or die $usage;
my $out_dir = shift or die $usage;
my ($iname, $ipath, $isuffix) = fileparse($infile, qr/\.[^.]*/);
my $rep = $iname."_blast_report.txt";
my $blast = Transposome::Run::Blast->new( file => $infile,
dir => $out_dir,
threads => 4,
cpus => 2,
seq_num => 25_000,
report => $rep );
my $blastdb = $blast->run_allvall_blast;
for my $pid (qw(90.0 85.0 80.0)) {
for my $fcov (qw(0.60 0.55 0.50)) {
my $samp_out = $out_dir."/".$iname."_PID$pid"."_COV$fcov"."_out";
my $blast_res = Transposome::PairFinder->new( file => $blastdb,
dir => $samp_out,
in_memory => 1,
percent_identity => $pid,
fraction_coverage => $fcov );
my ($idx_file, $int_file, $hs_file) = $blast_res->parse_blast;
}
}
#!/usr/bin/env perl
use 5.012;
use strict;
use warnings;
use Getopt::Long;
use Transposome::Cluster;
use Transposome::SeqUtil;
my $usage = "$0 -s seqfile.fas -int int_file -idx idx_file -o output_directory\n";
my $report = 'cluster_merge_permute_report.txt';
my $int_file;
my $idx_file;
my $seq_file;
my $out_dir;
GetOptions( 's|seqfile=s' => \$seq_file,
'int|intfile=s' => \$int_file,
'idx|idxfile=s' => \$idx_file,
'o|outdir=s' => \$out_dir,
);
die $usage if !$int_file or !$idx_file or !$seq_file or !$out_dir;
for my $merge_thresh (qw(100 500 1000)) { # the exact threshold will depend on the size of the data set
my $cluster = Transposome::Cluster->new( file => $int_file,
dir => $out_dir,
merge_threshold => $merge_thresh,
cluster_size => 10 );
my $comm = $cluster->louvain_method;
my $cluster_file = $cluster->make_clusters($comm, $idx_file);
my ($read_pairs, $vertex, $uf) = $cluster->find_pairs($cluster_file, $report);
my $memstore = Transposome::SeqUtil->new( file => $seq_file, in_memory => 1 );
my ($seqs, $seqct) = $memstore->store_seq;
my ($cls_dir_path, $cls_with_merges_path, $cls_tot) = $cluster->merge_clusters($vertex, $seqs, $read_pairs, $report, $uf);
}
#!/usr/bin/env perl
## pragmas and library imports
use 5.012;
use strict;
use warnings;
use Getopt::Long;
use File::Basename;
use Transposome::PairFinder;
use Transposome::Cluster;
use Transposome::SeqUtil;
use Transposome::Annotation;
use Transposome::Run::Blast;
## lexical vars
my $seq_file;
my $out_dir;
my $rep_db;
my $cpu;
my $thread;
my $seq_num;
my $cls_size;
my $bls_pid;
my $bls_fcov;
my $help;
GetOptions( 's|sequence_file=s' => \$seq_file,
'o|output_dir=s' => \$out_dir,
'repdb|repeat_database=s' => \$rep_db,
'c|cpus=i' => \$cpu,
't|threads=i' => \$thread,
'n|sequence_num=i' => \$seq_num,
'pid|percent_identity=i' => \$bls_pid,
'fcov|fraction_coverage=f' => \$bls_fcov,
'cls|cluster_size=i' => \$cls_size,
'h|help' => \$help,
);
## check input
usage() and exit(0) if $help;
if (!$seq_file || !$out_dir || !$cpu || !$thread) {
say "\n[ERROR]: Command line not parsed correctly. Check input and refer to the usage. Exiting.\n";
usage();
exit(1);
}
$seq_num //= 25000;
$cls_size //= 100;
$bls_pid //= 90;
$bls_fcov //= 0.55;
my $merge_thresh = 100;
my ($iname, $ipath, $isuffix) = fileparse($seq_file, qr/\.[^.]*/);
for my $sample_size (qw(100_000 200_000 300_000 400_000 500_000)) {
my $sequtil = Transposome::SeqUtil->new( file => $seq_file, sample_size => $sample_size , no_store => 1);
my $samp_seq = $iname."_$sample_size".".fasta";
{ local *STDOUT; open STDOUT, '>', $samp_seq; $sequtil->sample_seq; }
my $samp_rep = $iname."_$sample_size"."_report.txt";
my $samp_dir = $out_dir."_$sample_size";
#
# perform the all vs. all blast
#
my $blast = Transposome::Run::Blast->new( file => $samp_seq,
dir => $samp_dir,
threads => $thread,
cpus => $cpu,
seq_num => $seq_num,
report => $samp_rep );
my $blastdb = $blast->run_allvall_blast;
#
# parse mglblast results to find best scoring pairwise matches
#
my $blast_res = Transposome::PairFinder->new( file => $blastdb,
dir => $samp_dir,
in_memory => 1,
percent_identity => $bls_pid,
fraction_coverage => $bls_fcov );
my ($idx_file, $int_file, $hs_file) = $blast_res->parse_blast;
#
# cluster sequences and analyze groupings
#
my $cluster = Transposome::Cluster->new( file => $int_file,
dir => $samp_dir,
merge_threshold => $merge_thresh,
cluster_size => $cls_size );
my $comm = $cluster->louvain_method;
my $cluster_file = $cluster->make_clusters($comm, $idx_file);
my ($read_pairs, $vertex, $uf) = $cluster->find_pairs($cluster_file, $samp_rep);
my $memstore = Transposome::SeqUtil->new( file => $samp_seq, in_memory => 1 );
my ($seqs, $seqct) = $memstore->store_seq;
my ($cls_dir_path, $cls_with_merges_path, $cls_tot) = $cluster->merge_clusters($vertex, $seqs,
$read_pairs, $samp_rep, $uf);
#
# annotate clusters and generate whole-genome summary of results
#
my $annotation = Transposome::Annotation->new( database => $rep_db,
dir => $samp_dir,
file => $samp_rep );
my ($anno_rp_path, $anno_sum_rep_path, $total_readct,
$rep_frac, $blasts, $superfams) = $annotation->annotate_clusters($cls_dir_path, $seqct, $cls_tot);
$annotation->clusters_annotation_to_summary($anno_rp_path, $anno_sum_rep_path, $total_readct,
$seqct, $rep_frac, $blasts, $superfams, $samp_rep);
$merge_thresh += 100;
}
exit;
## subs
sub usage {
my $script = basename($0);
print STDERR <<END
USAGE: $script -s seqs.fastq -o my_cluster_report.txt -n 25000 -c 2 -t 12 [-h]
Required:
-s|sequence_file : A Fasta/q file to analyze for repeats
-o|output_dir : A name for the output to be written.
-c|cpus : The number of CPUs to use for each blast process.
-t|threads : The number of parallel blast processes to run.
(NB: threads X cpus should be less than or equal to the number
of CPUs on your machine.)
-repdb|repeat_database : A sequence file of repeats to be used for annotation.
Options:
-pid|percent_identity : Percent identity between pairwise matches in all vs. all blast (Default: 90).
-fcov|fraction_coverage : The fraction coverage between pairwise matches in all vs. all blast (Default: 0.55).
-cls|cluster_size : The minimum size of a cluster to be used for annotation (Default: 100).
-n|sequence_num : The number of sequences to submit to each blast process (Default: 25000).
-h|help : Print a usage statement.
END
}