-
Notifications
You must be signed in to change notification settings - Fork 0
/
uniref2panda_reduce.pl
executable file
·108 lines (67 loc) · 2.89 KB
/
uniref2panda_reduce.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/perl
###########################################################################
# $Id: uniref2panda.pl 1 2010-07-08 10:39:00Z rsanka $
#
# Description: Converts input fasta file with UniRef headers to fasta file
# with PANDA headers.
#
# Original version by Ravi
# Hadoop-fied by Ntino
###########################################################################
## Historical note: the original version of this reducer was reported to lose
## one record at the beginning and one at the end. The end-of-input loss came
## from printing a finished record only at the top of the *next* loop
## iteration; records are now emitted as soon as both halves have been read.
use strict;
use warnings;

###########################################################################
# Reducer overview
#
# Input (one record half per line):   <key> TAB <value>
#   * UniRef entry half :  <uniref fasta header>@@@<sequence>
#   * ID mapping half   :  <gi id>---<embl id>
#
# Output (one FASTA record per completed pair):
#   >GI|EMBL[|ISOFORM] CLUSTER taxon: {TAXON;} (exp=..; wgp=..; ...)
#   SEQUENCE
#
# A record is written as soon as one entry half and one mapping half are
# both pending; the pending state is then cleared so nothing from a
# finished record can leak into the next one.
#
# NOTE(review): as in the original, the key column is ignored and halves
# are paired purely by arrival order. If the shuffle guarantees that both
# halves of a record share the same key, discarding pending halves on a
# key change would make this robust against unpaired input - confirm
# against the mapper before relying on it.
###########################################################################

# Parse a UniRef FASTA header line.
#   $header - header text, e.g. ">UniRef100_P12345-2 Name n=3 Tax=X RepID=Y"
# Returns (cluster_name, taxon_name, isoform) on success; isoform is "" when
# the accession carries no "-<isoform>" suffix. Returns an empty list when
# the header does not match the expected layout, so callers never see stale
# regex captures from an earlier match.
sub parse_uniref_header {
    my ($header) = @_;
    return unless defined $header;
    return
        unless $header
        =~ m/>([A-Za-z0-9]+)_([A-Za-z0-9-]+) (.+) n=\d+ Tax=(.+) RepID.*/;

    # Copy the captures immediately; any later match would clobber them.
    my ( $accession, $cluster, $taxon ) = ( $2, $3, $4 );

    # An accession such as "P12345-2" carries the isoform after the dash.
    my @accession_parts = split( '-', $accession );
    my $isoform = scalar(@accession_parts) > 1 ? $accession_parts[1] : "";

    return ( $cluster, $taxon, $isoform );
}

# Build the trailing tag group of a PANDA header.
#   $embl_id - EMBL identifier (may be the empty string)
# wgp, cg and closed are set to 1 when the EMBL ID starts with one of the
# prefixes AL, BX, CR, CT or CU; otherwise they stay 0.
sub misc_tags_for {
    my ($embl_id) = @_;
    return "(exp=0; wgp=1; cg=1; closed=1; pub=1; rf_status =;)"
        if $embl_id =~ m/^(AL|BX|CR|CT|CU).*/;
    return "(exp=0; wgp=0; cg=0; closed=0; pub=1; rf_status =;)";
}

# Turn the value of a UniRef entry line ("<header>@@@<sequence>") into a
# hash ref with keys cluster, taxon, isoform and sequence.
# Returns undef (after a warning on STDERR) when the header cannot be parsed.
sub parse_entry_value {
    my ($value) = @_;
    my ( $header, $sequence ) = split( /@@@/, $value );

    my ( $cluster, $taxon, $isoform ) = parse_uniref_header($header);
    unless ( defined $cluster ) {
        my $shown = defined $header ? $header : '';
        warn "Skipping unparseable UniRef header: $shown\n";
        return;
    }

    return {
        cluster  => $cluster,
        taxon    => $taxon,
        isoform  => $isoform,
        sequence => defined $sequence ? $sequence : '',
    };
}

# Turn the value of an ID mapping line ("<gi>---<embl>") into a hash ref
# with keys gi, embl and misc.
# Returns undef when the GI is missing or not longer than one character,
# which is the same acceptance threshold the original script applied.
sub parse_mapping_value {
    my ($value) = @_;
    my ( $gi_id, $embl_id ) = split( /---/, $value );
    return unless defined $gi_id and length($gi_id) > 1;

    # split() drops a trailing empty field, so "<gi>---" yields no EMBL ID.
    $embl_id = '' unless defined $embl_id;

    return {
        gi   => $gi_id,
        embl => $embl_id,
        misc => misc_tags_for($embl_id),
    };
}

# Render one finished record (header line plus sequence line).
#   $entry   - hash ref from parse_entry_value
#   $mapping - hash ref from parse_mapping_value
# TODO: taxon:TAXID is still needed here from the GI - NCBI TAXID mapping.
sub format_panda_record {
    my ( $entry, $mapping ) = @_;

    my $ids = "$mapping->{gi}|$mapping->{embl}";
    $ids .= "|$entry->{isoform}" if $entry->{isoform} ne "";

    return sprintf(
        ">%s %s taxon: {%s;} %s\n%s\n",
        $ids, $entry->{cluster}, $entry->{taxon},
        $mapping->{misc}, $entry->{sequence}
    );
}

# Read tab-separated record halves from $in and print PANDA FASTA records
# to $out.
#   $in  - readable filehandle
#   $out - writable filehandle
# Lines without a tab-separated value, and values that are neither an entry
# half nor a mapping half, are ignored.
sub reduce_stream {
    my ( $in, $out ) = @_;

    my $entry;      # pending UniRef entry half, undef when none
    my $mapping;    # pending ID mapping half, undef when none

    while ( defined( my $line = <$in> ) ) {
        chomp $line;
        my ( undef, $value ) = split( /\t/, $line );
        next unless defined $value;

        # The entry marker is tested first because a UniRef header may
        # itself contain "---".
        if ( index( $value, '@@@' ) >= 0 ) {
            $entry = parse_entry_value($value);
        }
        elsif ( index( $value, '---' ) >= 0 ) {
            $mapping = parse_mapping_value($value);
        }

        next unless $entry and $mapping;

        # Emit immediately so the last record is not lost at end of input,
        # then clear both halves so they cannot be reused.
        print {$out} format_panda_record( $entry, $mapping );
        ( $entry, $mapping ) = ( undef, undef );
    }
    return;
}

# Run as a filter when executed directly; stay silent when loaded by a test.
reduce_stream( \*STDIN, \*STDOUT ) unless caller;