From e8f5ef0664667d8ecdd77399df82183d0a327427 Mon Sep 17 00:00:00 2001
From: nylander <Johan.Nylander@bils.se>
Date: Mon, 25 Jan 2016 15:55:47 +0100
Subject: [PATCH] first commit

---
 README.md             |  11 +++
 bin/README.md         |  10 +++
 bin/fasta_unwrap.pl   |  46 ++++++++++++
 bin/fasta_wrap.pl     |  47 ++++++++++++
 bin/get_fasta_info.pl | 165 ++++++++++++++++++++++++++++++++++++++++++
 bin/init.sh           |  99 +++++++++++++++++++++++++
 data/README.md        |  10 +++
 doc/README.md         |  41 +++++++++++
 old/.gitignore        |   0
 results/README.md     |  10 +++
 src/README.md         |  10 +++
 src/fastaparser.pl    |  36 +++++++++
 tmp/.gitignore        |   0
 13 files changed, 485 insertions(+)
 create mode 100644 README.md
 create mode 100644 bin/README.md
 create mode 100755 bin/fasta_unwrap.pl
 create mode 100755 bin/fasta_wrap.pl
 create mode 100755 bin/get_fasta_info.pl
 create mode 100755 bin/init.sh
 create mode 100644 data/README.md
 create mode 100644 doc/README.md
 create mode 100644 old/.gitignore
 create mode 100644 results/README.md
 create mode 100644 src/README.md
 create mode 100755 src/fastaparser.pl
 create mode 100644 tmp/.gitignore

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..89add39
--- /dev/null
+++ b/README.md
@@ -0,0 +1,11 @@
+# ptemplate -- A project template
+
+Inspired by [A Quick Guide to Organizing Computational Biology Projects](http://dx.doi.org/10.1371/journal.pcbi.1000424)
+
+To initialize a new `project`, run these three steps:
+
+    git clone https://github.com/nylander/ptemplate.git
+    mv ptemplate project && cd project
+    sh bin/init.sh
+
+
diff --git a/bin/README.md b/bin/README.md
new file mode 100644
index 0000000..afdfc0e
--- /dev/null
+++ b/bin/README.md
@@ -0,0 +1,10 @@
+# ptemplate -- bin
+
+**Version:** 2016-01-25
+
+**Sign:** nylander
+
+## Description
+
+Text here.
+
diff --git a/bin/fasta_unwrap.pl b/bin/fasta_unwrap.pl
new file mode 100755
index 0000000..561424e
--- /dev/null
+++ b/bin/fasta_unwrap.pl
@@ -0,0 +1,46 @@
+#!/usr/bin/perl 
+#===============================================================================
+=pod
+
+=head1
+
+         FILE: fasta_unwrap.pl
+
+        USAGE: ./fasta_unwrap.pl  
+
+  DESCRIPTION: Un-wrap sequence lines in fasta
+
+      OPTIONS: ---
+
+ REQUIREMENTS: ---
+
+         BUGS: ---
+
+        NOTES: ---
+
+       AUTHOR: Johan Nylander (JN), Johan.Nylander@bils.se
+
+      COMPANY: BILS/NRM
+
+      VERSION: 1.0
+
+      CREATED: 01/12/2016 04:25:36 PM
+
+     REVISION: ---
+
+=cut
+
+#===============================================================================
+
+$/ = '>';
+while(<>) {
+    chomp;
+    next if ($_ eq '');
+    my ($id, @seqlines) = split /\n/;
+    print '>', $id, "\n";
+    my $seq = '';
+    foreach my $line (@seqlines) {
+        $seq .= $line;
+    }
+    print $seq, "\n";
+}
diff --git a/bin/fasta_wrap.pl b/bin/fasta_wrap.pl
new file mode 100755
index 0000000..fe29df5
--- /dev/null
+++ b/bin/fasta_wrap.pl
@@ -0,0 +1,47 @@
+#!/usr/bin/perl 
+#===============================================================================
+=pod
+
+=head1
+
+         FILE: fasta_wrap.pl
+
+        USAGE: ./fasta_wrap.pl  
+
+  DESCRIPTION: Wrap sequence lines in fasta
+
+      OPTIONS: ---
+
+ REQUIREMENTS: ---
+
+         BUGS: ---
+
+        NOTES: ---
+
+       AUTHOR: Johan Nylander (JN), Johan.Nylander@bils.se
+
+      COMPANY: BILS/NRM
+
+      VERSION: 1.0
+
+      CREATED: 01/12/2016 04:25:36 PM
+
+     REVISION: ---
+
+=cut
+
+#===============================================================================
+
+my $length = 60;
+
+while(<>) {
+    my $line = $_;
+    chomp($line);
+    if ($line =~ /^\s*>/) {
+        print $line, "\n";
+    }
+    else {
+        $line =~ s/\S{$length}/$&\n/g;
+        print $line, "\n";
+    }
+}
diff --git a/bin/get_fasta_info.pl b/bin/get_fasta_info.pl
new file mode 100755
index 0000000..a3c5ffe
--- /dev/null
+++ b/bin/get_fasta_info.pl
@@ -0,0 +1,165 @@
+#!/usr/bin/perl
+
+# Usage: get_fasta_info.pl infile.fa
+#
+
+use strict;
+use Data::Dumper;
+
+my (
+    $my_name,      $Total_Contigs, $Total_XContigs, $Total_Bases,
+    $Total_XBases, $Total_Min,     $Total_Max,      $Unique_Counts
+);
+
+#===  FUNCTION  ================================================================
+#         NAME:  read_file
+#      VERSION:  02/09/2012 12:11:04 PM
+#  DESCRIPTION:  ???
+#   PARAMETERS:  ???
+#      RETURNS:  ???
+#         TODO:  ???
+#===============================================================================
+sub read_file {
+    my ($file_name) = @_;
+
+    ########## open file ##########
+    ## check if compressed. Warning, does not handle tar archives (*.tar.gz, *.tar.bz2, *.tgz, etc) 
+    if ($file_name =~ /\.gz$/) {
+        $file_name =~ s/(.*\.gz)\s*$/gzip -dc < $1|/;
+    }
+    elsif ($file_name =~ /\.zip$/) {
+        $file_name =~ s/(.*\.zip)\s*$/gzip -dc < $1|/;
+    }
+    elsif ($file_name =~ /\.Z$/) {
+        $file_name =~ s/(.*\.Z)\s*$/gzip -dc < $1|/;
+    }
+    elsif ($file_name =~ /\.bz2$/) {
+        $file_name =~ s/(.*\.bz2)\s*$/bzip2 -dc < $1|/;
+    }
+    open(FASTAIN, $file_name) or die "Can't open $file_name : $!";
+
+    ########## read contigs in file ##########
+    my ( $header, $contig, $sequence, $line ) = ('') x 4;
+    my ( $contigs, $xcontigs, $bases, $xbases, $max, $ave, $line_num ) =
+      (0) x 7;
+    my $min = undef;
+    while ( defined( $line = <FASTAIN> ) ) {
+        chomp $line;
+        $line_num++;
+        if ( $line =~ /^>/ ) {
+            if ($contig)    # after first input line?
+            {
+                ( $contigs, $xcontigs, $bases, $xbases, $min, $max ) =
+                  process_contig( $contig, $sequence, $contigs, $xcontigs,
+                    $bases, $xbases, $min, $max );
+            }
+            $header = $line;
+            if ( $header =~ m/^>(\S+)/ ) {
+                $contig = $1;
+            }
+            else {
+                die "$my_name: Invalid fasta file header at line $line_num\n"
+                  . "of file: '$file_name'\n";
+            }
+            $sequence = '';
+            next;
+        }    # end if ($line =~ /^>/)
+        if ( !$contig ) {
+            die
+"$my_name: File '$file_name' does not begin with a fasta header line\n";
+        }
+        $line =~ s/\s//g;
+        $sequence .= $line;
+    }    # end while (defined($line = <FASTAIN>))
+    close(FASTAIN);
+    if ($contig) {
+        ( $contigs, $xcontigs, $bases, $xbases, $min, $max ) =
+          process_contig( $contig, $sequence, $contigs, $xcontigs, $bases,
+            $xbases, $min, $max );
+    }
+
+    # print stats and accumulate totals
+    $ave = $contigs ? ( sprintf "%.1f", $bases / $contigs ) : 0;
+    print_counts( $contigs, $xcontigs, $bases, $xbases, $min, $max, $ave,
+        $file_name );
+    $Total_Contigs  += $contigs;
+    $Total_XContigs += $xcontigs;
+    $Total_Bases    += $bases;
+    $Total_XBases   += $xbases;
+    $Total_Min = $min
+      if ( ( !defined $Total_Min ) || ( $Total_Min > $min ) );
+    $Total_Max = $max if ( $Total_Max < $max );
+
+    #print STDERR "Bases: ", $Total_Bases, "\n";
+
+    return;
+}    # end read_file
+
+#===  FUNCTION  ================================================================
+#         NAME:  process_contig
+#      VERSION:  03/11/2009 03:03:13 PM PST
+#  DESCRIPTION:  ???
+#   PARAMETERS:  ???
+#      RETURNS:  ???
+#         TODO:  ???
+#===============================================================================
+sub process_contig {
+    my ( $contig, $sequence, $contigs, $xcontigs, $bases, $xbases, $min, $max )
+      = @_;
+    my $len = length $sequence;
+    $contigs++;
+    $bases += $len;
+    if ($Unique_Counts) {
+        $xcontigs++;
+        $xbases += $len;
+        if ( $contig =~ /__(\d+)$/ ) {
+            my $extra = $1;
+            $xcontigs += $extra;
+            $xbases   += $len * $extra;
+        }
+    }
+    $min = $len if ( ( !defined $min ) || ( $min > $len ) );
+    $max = $len if ( $max < $len );
+    return ( $contigs, $xcontigs, $bases, $xbases, $min, $max );
+}    # end process_contig
+
+#===  FUNCTION  ================================================================
+#         NAME:  print_counts
+#      VERSION:  03/11/2009 03:03:30 PM PST
+#  DESCRIPTION:  ???
+#   PARAMETERS:  ???
+#      RETURNS:  ???
+#         TODO:  ???
+#===============================================================================
+sub print_counts {
+    my ( $contigs, $xcontigs, $bases, $xbases, $min, $max, $ave, $file_name ) =
+      @_;
+
+    if ($Unique_Counts) {
+        printf STDOUT "%7d %7d %13d %13d %7d %7d %9.1f  %s\n",
+          $contigs, $xcontigs, $bases, $xbases, $min, $max, $ave,
+          $file_name;
+    }
+    else {
+        printf STDOUT
+"\n# File: %s\nNseqs: %9d\nMin. length: %d\nMax. length: %d\nAvg. length: %d\n",
+          $file_name, $contigs, $min, $max, $ave;
+    }
+
+    return;
+}    # end print_counts
+
+#===  FUNCTION  ================================================================
+#         NAME:  "MAIN"
+#      VERSION:  03/11/2009 03:08:14 PM PST
+#  DESCRIPTION:  ???
+#   PARAMETERS:  ???
+#      RETURNS:  ???
+#         TODO:  ???
+#===============================================================================
+MAIN:
+while ( my $infile = shift(@ARGV) ) {
+    read_file($infile);
+}
+exit(0);
+__END__
diff --git a/bin/init.sh b/bin/init.sh
new file mode 100755
index 0000000..51142be
--- /dev/null
+++ b/bin/init.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+
+TIMESTAMP=$(date '+%Y-%m-%d')
+
+## Set wd
+SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
+PROJPATH=$(dirname $SCRIPTPATH)
+PROJNAME=$(basename $PROJPATH)
+echo "* Project name:" $PROJNAME
+
+## Create doc/README.md
+if true ; then
+  cat << EOF > doc/README.md
+# Project $PROJNAME
+
+**Version:** $TIMESTAMP
+
+**Sign:** $USER
+
+## Description
+
+Text here.
+
+More documentation in the \`doc\` folder.
+
+## Tools
+
+$(find bin -type f -executable ! -iname "init.sh" -printf "* %p\n")
+
+## Data
+
+Data in the \`data\` folder.
+
+## Analyses
+
+Text here.
+
+---
+
+## Results
+
+Results in \`results\` folder.
+
+---
+
+EOF
+fi
+
+## Reinit git, create .gitignore, and append to doc/README.md
+if command -v git >/dev/null 2>&1; then
+  cd $PROJPATH
+  if [ -e ".git" ] ; then
+      rm -rf .git
+  fi
+  echo -n "* "
+  git init
+  cat << EOF > .gitignore
+old
+EOF
+  cat << EOF >> doc/README.md
+Version Control
+
+To track changes (after creating and editing files)
+
+    git add *
+    git commit -m "first commit"
+
+EOF
+fi
+
+## Create other README.md files
+if true ; then
+  for f in bin data results src ; do
+  cat << EOF > "$f"/README.md
+# $PROJNAME -- $f
+
+**Version:** $TIMESTAMP
+
+**Sign:** $USER
+
+## Description
+
+Text here.
+
+EOF
+  done
+fi
+
+## List files and folders
+echo "* Path, files and folders:"
+if command -v tree >/dev/null 2>&1; then
+  tree -I init.sh "$PROJPATH"
+else
+  ls -I init.sh -F "$PROJPATH"
+fi
+echo "* Start with file doc/README.md"
+
+## Move init.sh to old/
+mv  "$0" $PROJPATH/old/.
diff --git a/data/README.md b/data/README.md
new file mode 100644
index 0000000..289b754
--- /dev/null
+++ b/data/README.md
@@ -0,0 +1,10 @@
+# ptemplate -- data
+
+**Version:** 2016-01-25
+
+**Sign:** nylander
+
+## Description
+
+Text here.
+
diff --git a/doc/README.md b/doc/README.md
new file mode 100644
index 0000000..f40d25e
--- /dev/null
+++ b/doc/README.md
@@ -0,0 +1,41 @@
+# Project ptemplate
+
+**Version:** 2016-01-25
+
+**Sign:** nylander
+
+## Description
+
+Text here.
+
+More documentation in the `doc` folder.
+
+## Tools
+
+* bin/fasta_unwrap.pl
+* bin/fasta_wrap.pl
+* bin/get_fasta_info.pl
+
+## Data
+
+Data in the `data` folder.
+
+## Analyses
+
+Text here.
+
+---
+
+## Results
+
+Results in `results` folder.
+
+---
+
+Version Control
+
+To track changes (after creating and editing files)
+
+    git add *
+    git commit -m "first commit"
+
diff --git a/old/.gitignore b/old/.gitignore
new file mode 100644
index 0000000..e69de29
diff --git a/results/README.md b/results/README.md
new file mode 100644
index 0000000..92673cd
--- /dev/null
+++ b/results/README.md
@@ -0,0 +1,10 @@
+# ptemplate -- results
+
+**Version:** 2016-01-25
+
+**Sign:** nylander
+
+## Description
+
+Text here.
+
diff --git a/src/README.md b/src/README.md
new file mode 100644
index 0000000..b0e5d70
--- /dev/null
+++ b/src/README.md
@@ -0,0 +1,10 @@
+# ptemplate -- src
+
+**Version:** 2016-01-25
+
+**Sign:** nylander
+
+## Description
+
+Text here.
+
diff --git a/src/fastaparser.pl b/src/fastaparser.pl
new file mode 100755
index 0000000..0a34ea2
--- /dev/null
+++ b/src/fastaparser.pl
@@ -0,0 +1,36 @@
+#!/usr/bin/perl
+
+# perlmonks.org/?node_id=308170
+
+use strict;
+use Data::Dumper;
+
+my @seq;  # array of sequences
+my %seq;  # hash lookup of sequences based on id's
+{
+    local $/ = '>';
+    while (<DATA>) {
+        s/^>//g;                       # strip out '>' from beginning
+        s/>$//g;                       # and end of line
+        next if !length($_);           # ignore empty lines
+        my ($header) = /^(.*)\n/;      # capture the header
+        s/^(.*)\n//;                   # and strip it out
+        my @rec = split /\|/, $header; # assuming genbank header format
+        s/\n//mg;                      # join the sequence strings
+        push @rec, $_;                 # make sequence the last element
+        push @seq, \@rec;              # push into array of sequences
+        $seq{$rec[1]} = \@rec;         # or store in a hash lookup
+    }
+}
+print Dumper(\@seq);
+print Dumper(\%seq);
+
+__DATA__
+>gi|2695845|emb|Y13255.1|ABY13255 Acipenser baeri mRNA for immunoglobulin heavy chain, clone ScH 3.3
+TGGTTACAACACTTTCTTCTTTCAATAACCACAATACTGCAGTACAATGGGGATTTTAACAGCTCTCTGTATAATAATGA
+TACCCCCGCGACCTTCTCGTGGACTGATCAATCTGGAAAAGCTTTT
+>gi|2695846|emb|Y13255.1|ABY13255 Acipenser baeri mRNA for immunoglobulin heavy chain, clone ScH 3.3
+TGGTTACAACACTTTCTTCTTTCAATAACCACAATACTGCAGTACAATGGGGATTTTAACAGCTCTCTGTATAATAATGA
+TACCCCCGCGACCTTCTCGTGGACTGATCAATCTGGAAAAGCTTTT
+
+
diff --git a/tmp/.gitignore b/tmp/.gitignore
new file mode 100644
index 0000000..e69de29