-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgff2gene.cu
141 lines (121 loc) · 4.29 KB
/
gff2gene.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#include "gff2gene.cuh"
#include "functions.cuh"
gff2gene::gff2gene(string input_File, string output_Path)
{
    /**
     * * Constructor Function
     * Stores the GFF input file location and the output folder location in the
     * object's member variables, and announces the start of the module on the console.
     *
     * @param input_File  location of the GFF file to be read.
     * @param output_Path folder location the gene list will be written to.
     **/

    // Keep copies of the caller's arguments for use by ingress().
    this->output_Path = output_Path;
    this->input_File = input_File;

    // Start-up banner followed by a blank line.
    cout << "Starting up GFF to Gene list" << endl;
    cout << endl;
}
void gff2gene::ingress()
{
/**
* Execution function.
**/
/**
* Call the "functions" class. Bespoke functions commonly used by CATE.
**/
functions function = functions();
cout << "Processing GFF file: " << this->input_File << endl
<< endl;
/**
* Open the GFF file from the location.
**/
fstream gff_File;
gff_File.open(input_File, ios::in);
if (gff_File.is_open())
{
/**
* @param line captures the sequences line read from the FASTA file.
**/
string line;
while (getline(gff_File, line))
{
/**
* Skip the header lines beginning with a "#" or "##" at the start of a GFF file.
**/
if (line.at(0) != '#')
{
break;
}
}
/**
* @param file_Name defines the output gene file, it will be a derivation of the GFF file's name.
**/
string file_Name = this->output_Path + "/" + filesystem::path(input_File).stem().string() + ".txt";
fstream gene_List;
gene_List.open(file_Name, ios::out);
while (getline(gff_File, line))
{
// cout << line << endl;
/**
* @param split_Data vector captures split function's outputs.
* [0] = sequence ID
* [1] = source
* [2] = feature type
* [3] = start
* [4] = end
* [5] = score
* [6] = strand
* [7] = phase
* [8] = atributes
**/
vector<string> split_Data;
function.split(split_Data, line, '\t');
/**
* Ensures that it is a target data row. They all have 9 columns as standard.
**/
if (split_Data.size() == 9)
{
// cout << line << endl;
/**
* @param feature captures the feature type which is the third column.
**/
string feature = split_Data[2];
/**
* Capitalizes the string to normalize the data and prevent mismatch errors.
**/
transform(feature.begin(), feature.end(), feature.begin(), ::toupper);
/**
* If the feature is equal to being a GENE it's data will be extracted.
**/
if (feature == "GENE")
{
/**
* @param attribute Get attribute information to get the gene's NAME if available else it will be "NA".
**/
string attribute = split_Data[8];
vector<string> split_attributes;
function.split(split_attributes, attribute, ';');
string gene_Name = "NA";
for (string split_attribute : split_attributes)
{
// string ID_check;
vector<string> ID_check_split;
function.split(ID_check_split, split_attribute, '=');
transform(ID_check_split[0].begin(), ID_check_split[0].end(), ID_check_split[0].begin(), ::toupper);
/**
* Gene names are followed by the "ID" tag.
**/
if (ID_check_split[0] == "ID")
{
gene_Name = ID_check_split[1];
break;
}
}
string write_Line = gene_Name + "\t" + split_Data[0] + ":" + split_Data[3] + ":" + split_Data[4] + "\n";
cout << write_Line;
gene_List << write_Line;
}
// REMOVE AFTER TESTING
// break;
}
}
gff_File.close();
gene_List.close();
}
}