From e8f5ef0664667d8ecdd77399df82183d0a327427 Mon Sep 17 00:00:00 2001 From: nylander Date: Mon, 25 Jan 2016 15:55:47 +0100 Subject: [PATCH] first commit --- README.md | 11 +++ bin/README.md | 10 +++ bin/fasta_unwrap.pl | 46 ++++++++++++ bin/fasta_wrap.pl | 47 ++++++++++++ bin/get_fasta_info.pl | 165 ++++++++++++++++++++++++++++++++++++++++++ bin/init.sh | 99 +++++++++++++++++++++++++ data/README.md | 10 +++ doc/README.md | 41 +++++++++++ old/.gitignore | 0 results/README.md | 10 +++ src/README.md | 10 +++ src/fastaparser.pl | 36 +++++++++ tmp/.gitignore | 0 13 files changed, 485 insertions(+) create mode 100644 README.md create mode 100644 bin/README.md create mode 100755 bin/fasta_unwrap.pl create mode 100755 bin/fasta_wrap.pl create mode 100755 bin/get_fasta_info.pl create mode 100755 bin/init.sh create mode 100644 data/README.md create mode 100644 doc/README.md create mode 100644 old/.gitignore create mode 100644 results/README.md create mode 100644 src/README.md create mode 100755 src/fastaparser.pl create mode 100644 tmp/.gitignore diff --git a/README.md b/README.md new file mode 100644 index 0000000..89add39 --- /dev/null +++ b/README.md @@ -0,0 +1,11 @@ +# ptemplate -- A project template + +Inspired by [A Quick Guide to Organizing Computational Biology Projects](http://dx.doi.org/10.1371/journal.pcbi.1000424) + +To initialize a new `project`, run these three steps: + + git clone https://github.com/nylander/ptemplate.git + mv ptemplate project && cd project + sh bin/init.sh + + diff --git a/bin/README.md b/bin/README.md new file mode 100644 index 0000000..afdfc0e --- /dev/null +++ b/bin/README.md @@ -0,0 +1,10 @@ +# ptemplate -- bin + +**Version:** 2016-01-25 + +**Sign:** nylander + +## Description + +Text here. + diff --git a/bin/fasta_unwrap.pl b/bin/fasta_unwrap.pl new file mode 100755 index 0000000..561424e --- /dev/null +++ b/bin/fasta_unwrap.pl @@ -0,0 +1,46 @@ +#!/usr/bin/perl +#=============================================================================== +=pod + +=head1 + + FILE: fasta_unwrap.pl + + USAGE: ./fasta_unwrap.pl + + DESCRIPTION: Un-wrap sequence lines in fasta + + OPTIONS: --- + + REQUIREMENTS: --- + + BUGS: --- + + NOTES: --- + + AUTHOR: Johan Nylander (JN), Johan.Nylander@bils.se + + COMPANY: BILS/NRM + + VERSION: 1.0 + + CREATED: 01/12/2016 04:25:36 PM + + REVISION: --- + +=cut + +#=============================================================================== + +$/ = '>'; +while(<>) { + chomp; + next if ($_ eq ''); + my ($id, @seqlines) = split /\n/; + print '>', $id, "\n"; + my $seq = ''; + foreach my $line (@seqlines) { + $seq .= $line; + } + print $seq, "\n"; +} diff --git a/bin/fasta_wrap.pl b/bin/fasta_wrap.pl new file mode 100755 index 0000000..fe29df5 --- /dev/null +++ b/bin/fasta_wrap.pl @@ -0,0 +1,47 @@ +#!/usr/bin/perl +#=============================================================================== +=pod + +=head1 + + FILE: fasta_wrap.pl + + USAGE: ./fasta_wrap.pl + + DESCRIPTION: Wrap sequence lines in fasta + + OPTIONS: --- + + REQUIREMENTS: --- + + BUGS: --- + + NOTES: --- + + AUTHOR: Johan Nylander (JN), Johan.Nylander@bils.se + + COMPANY: BILS/NRM + + VERSION: 1.0 + + CREATED: 01/12/2016 04:25:36 PM + + REVISION: --- + +=cut + +#=============================================================================== + +my $length = 60; + +while(<>) { + my $line = $_; + chomp($line); + if ($line =~ /^\s*>/) { + print $line, "\n"; + } + else { + $line =~ s/\S{$length}/$&\n/g; + print $line, "\n"; + } +} diff --git a/bin/get_fasta_info.pl b/bin/get_fasta_info.pl new file mode 100755 index 0000000..a3c5ffe --- /dev/null +++ b/bin/get_fasta_info.pl @@ -0,0 +1,165 @@ +#!/usr/bin/perl + +# Usage: get_fasta_info.pl infile.fa +# + +use strict; +use Data::Dumper; + +my ( + $my_name, $Total_Contigs, $Total_XContigs, $Total_Bases, + $Total_XBases, $Total_Min, $Total_Max, $Unique_Counts +); + +#=== FUNCTION ================================================================ +# NAME: read_file +# VERSION: 02/09/2012 12:11:04 PM +# DESCRIPTION: ??? +# PARAMETERS: ??? +# RETURNS: ??? +# TODO: ??? +#=============================================================================== +sub read_file { + my ($file_name) = @_; + + ########## open file ########## + ## check if compressed. Warning, does not handle tar archives (*.tar.gz, *.tar.bz2, *.tgz, etc) + if ($file_name =~ /\.gz$/) { + $file_name =~ s/(.*\.gz)\s*$/gzip -dc < $1|/; + } + elsif ($file_name =~ /\.zip$/) { + $file_name =~ s/(.*\.zip)\s*$/gzip -dc < $1|/; + } + elsif ($file_name =~ /\.Z$/) { + $file_name =~ s/(.*\.Z)\s*$/gzip -dc < $1|/; + } + elsif ($file_name =~ /\.bz2$/) { + $file_name =~ s/(.*\.bz2)\s*$/bzip2 -dc < $1|/; + } + open(FASTAIN, $file_name) or die "Can't open $file_name : $!"; + + ########## read contigs in file ########## + my ( $header, $contig, $sequence, $line ) = ('') x 4; + my ( $contigs, $xcontigs, $bases, $xbases, $max, $ave, $line_num ) = + (0) x 7; + my $min = undef; + while ( defined( $line = ) ) { + chomp $line; + $line_num++; + if ( $line =~ /^>/ ) { + if ($contig) # after first input line? + { + ( $contigs, $xcontigs, $bases, $xbases, $min, $max ) = + process_contig( $contig, $sequence, $contigs, $xcontigs, + $bases, $xbases, $min, $max ); + } + $header = $line; + if ( $header =~ m/^>(\S+)/ ) { + $contig = $1; + } + else { + die "$my_name: Invalid fasta file header at line $line_num\n" + . "of file: '$file_name'\n"; + } + $sequence = ''; + next; + } # end if ($line =~ /^>/) + if ( !$contig ) { + die +"$my_name: File '$file_name' does not begin with a fasta header line\n"; + } + $line =~ s/\s//g; + $sequence .= $line; + } # end while (defined($line = )) + close(FASTAIN); + if ($contig) { + ( $contigs, $xcontigs, $bases, $xbases, $min, $max ) = + process_contig( $contig, $sequence, $contigs, $xcontigs, $bases, + $xbases, $min, $max ); + } + + # print stats and accumulate totals + $ave = $contigs ? ( sprintf "%.1f", $bases / $contigs ) : 0; + print_counts( $contigs, $xcontigs, $bases, $xbases, $min, $max, $ave, + $file_name ); + $Total_Contigs += $contigs; + $Total_XContigs += $xcontigs; + $Total_Bases += $bases; + $Total_XBases += $xbases; + $Total_Min = $min + if ( ( !defined $Total_Min ) || ( $Total_Min > $min ) ); + $Total_Max = $max if ( $Total_Max < $max ); + + #print STDERR "Bases: ", $Total_Bases, "\n"; + + return; +} # end read_file + +#=== FUNCTION ================================================================ +# NAME: process_contig +# VERSION: 03/11/2009 03:03:13 PM PST +# DESCRIPTION: ??? +# PARAMETERS: ??? +# RETURNS: ??? +# TODO: ??? +#=============================================================================== +sub process_contig { + my ( $contig, $sequence, $contigs, $xcontigs, $bases, $xbases, $min, $max ) + = @_; + my $len = length $sequence; + $contigs++; + $bases += $len; + if ($Unique_Counts) { + $xcontigs++; + $xbases += $len; + if ( $contig =~ /__(\d+)$/ ) { + my $extra = $1; + $xcontigs += $extra; + $xbases += $len * $extra; + } + } + $min = $len if ( ( !defined $min ) || ( $min > $len ) ); + $max = $len if ( $max < $len ); + return ( $contigs, $xcontigs, $bases, $xbases, $min, $max ); +} # end process_contig + +#=== FUNCTION ================================================================ +# NAME: print_counts +# VERSION: 03/11/2009 03:03:30 PM PST +# DESCRIPTION: ??? +# PARAMETERS: ??? +# RETURNS: ??? +# TODO: ??? +#=============================================================================== +sub print_counts { + my ( $contigs, $xcontigs, $bases, $xbases, $min, $max, $ave, $file_name ) = + @_; + + if ($Unique_Counts) { + printf STDOUT "%7d %7d %13d %13d %7d %7d %9.1f %s\n", + $contigs, $xcontigs, $bases, $xbases, $min, $max, $ave, + $file_name; + } + else { + printf STDOUT +"\n# File: %s\nNseqs: %9d\nMin. length: %d\nMax. length: %d\nAvg. length: %d\n", + $file_name, $contigs, $min, $max, $ave; + } + + return; +} # end print_counts + +#=== FUNCTION ================================================================ +# NAME: "MAIN" +# VERSION: 03/11/2009 03:08:14 PM PST +# DESCRIPTION: ??? +# PARAMETERS: ??? +# RETURNS: ??? +# TODO: ??? +#=============================================================================== +MAIN: +while ( my $infile = shift(@ARGV) ) { + read_file($infile); +} +exit(0); +__END__ diff --git a/bin/init.sh b/bin/init.sh new file mode 100755 index 0000000..51142be --- /dev/null +++ b/bin/init.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +TIMESTAMP=$(date '+%Y-%m-%d') + +## Set wd +SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" +PROJPATH=$(dirname $SCRIPTPATH) +PROJNAME=$(basename $PROJPATH) +echo "* Project name:" $PROJNAME + +## Create doc/README.md +if true ; then + cat << EOF > doc/README.md +# Project $PROJNAME + +**Version:** $TIMESTAMP + +**Sign:** $USER + +## Description + +Text here. + +More documentation in the \`doc\` folder. + +## Tools + +$(find bin -type f -executable ! -iname "init.sh" -printf "* %p\n") + +## Data + +Data in the \`data\` folder. + +## Analyses + +Text here. + +--- + +## Results + +Results in \`results\` folder. + +--- + +EOF +fi + +## Reinit git, create .gitignore, and append to doc/README.md +if command -v git >/dev/null 2>&1; then + cd $PROJPATH + if [ -e ".git" ] ; then + rm -rf .git + fi + echo -n "* " + git init + cat << EOF > .gitignore +old +EOF + cat << EOF >> doc/README.md +Version Control + +To track changes (after creating and editing files) + + git add * + git commit -m "first commit" + +EOF +fi + +## Create other README.md files +if true ; then + for f in bin data results src ; do + cat << EOF > "$f"/README.md +# $PROJNAME -- $f + +**Version:** $TIMESTAMP + +**Sign:** $USER + +## Description + +Text here. + +EOF + done +fi + +## List files and folders +echo "* Path, files and folders:" +if command -v tree >/dev/null 2>&1; then + tree -I init.sh "$PROJPATH" +else + ls -I init.sh -F "$PROJPATH" +fi +echo "* Start with file doc/README.md" + +## Move init.sh to old/ +mv "$0" $PROJPATH/old/. diff --git a/data/README.md b/data/README.md new file mode 100644 index 0000000..289b754 --- /dev/null +++ b/data/README.md @@ -0,0 +1,10 @@ +# ptemplate -- data + +**Version:** 2016-01-25 + +**Sign:** nylander + +## Description + +Text here. + diff --git a/doc/README.md b/doc/README.md new file mode 100644 index 0000000..f40d25e --- /dev/null +++ b/doc/README.md @@ -0,0 +1,41 @@ +# Project ptemplate + +**Version:** 2016-01-25 + +**Sign:** nylander + +## Description + +Text here. + +More documentation in the `doc` folder. + +## Tools + +* bin/fasta_unwrap.pl +* bin/fasta_wrap.pl +* bin/get_fasta_info.pl + +## Data + +Data in the `data` folder. + +## Analyses + +Text here. + +--- + +## Results + +Results in `results` folder. + +--- + +Version Control + +To track changes (after creating and editing files) + + git add * + git commit -m "first commit" + diff --git a/old/.gitignore b/old/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/results/README.md b/results/README.md new file mode 100644 index 0000000..92673cd --- /dev/null +++ b/results/README.md @@ -0,0 +1,10 @@ +# ptemplate -- results + +**Version:** 2016-01-25 + +**Sign:** nylander + +## Description + +Text here. + diff --git a/src/README.md b/src/README.md new file mode 100644 index 0000000..b0e5d70 --- /dev/null +++ b/src/README.md @@ -0,0 +1,10 @@ +# ptemplate -- src + +**Version:** 2016-01-25 + +**Sign:** nylander + +## Description + +Text here. + diff --git a/src/fastaparser.pl b/src/fastaparser.pl new file mode 100755 index 0000000..0a34ea2 --- /dev/null +++ b/src/fastaparser.pl @@ -0,0 +1,36 @@ +#!/usr/bin/perl + +# perlmonks.org/?node_id=308170 + +use strict; +use Data::Dumper; + +my @seq; # array of sequences +my %seq; # hash lookup of sequences based on id's +{ + local $/ = '>'; + while () { + s/^>//g; # strip out '>' from beginning + s/>$//g; # and end of line + next if !length($_); # ignore empty lines + my ($header) = /^(.*)\n/; # capture the header + s/^(.*)\n//; # and strip it out + my @rec = split /\|/, $header; # assuming genbank header format + s/\n//mg; # join the sequence strings + push @rec, $_; # make sequence the last element + push @seq, \@rec; # push into array of sequences + $seq{$rec[1]} = \@rec; # or store in a hash lookup + } +} +print Dumper(\@seq); +print Dumper(\%seq); + +__DATA__ +>gi|2695845|emb|Y13255.1|ABY13255 Acipenser baeri mRNA for immunoglobulin heavy chain, clone ScH 3.3 +TGGTTACAACACTTTCTTCTTTCAATAACCACAATACTGCAGTACAATGGGGATTTTAACAGCTCTCTGTATAATAATGA +TACCCCCGCGACCTTCTCGTGGACTGATCAATCTGGAAAAGCTTTT +>gi|2695846|emb|Y13255.1|ABY13255 Acipenser baeri mRNA for immunoglobulin heavy chain, clone ScH 3.3 +TGGTTACAACACTTTCTTCTTTCAATAACCACAATACTGCAGTACAATGGGGATTTTAACAGCTCTCTGTATAATAATGA +TACCCCCGCGACCTTCTCGTGGACTGATCAATCTGGAAAAGCTTTT + + diff --git a/tmp/.gitignore b/tmp/.gitignore new file mode 100644 index 0000000..e69de29