-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathannotate_protein_list.py
42 lines (35 loc) · 1.59 KB
/
annotate_protein_list.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import pandas as pd
from go_protein_annotation.default_use import DefaultAnnotation
from tqdm import tqdm
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--infile", help="name of the input file")
parser.add_argument("-o", "--outfile", help="name of the output file")
parser.add_argument("-c", "--column", help="header of smiles column.")
parser.add_argument("-s", "--separator", help="symbol for delimitation. Write 'tab' for tab-delimited files")
args = parser.parse_args()
# Checking parsed arguments for validity
if not args.infile:
raise ValueError("Please specify an input-file via the argument '-i'!")
if args.outfile:
out_file = args.outfile
else:
out_file = "go_function_annotation.tsv"
if not args.column:
raise ValueError("Please specify the name of the column containing the smiles '-c'!")
if not args.separator or args.separator == "tab":
sep_symbol = "\t"
else:
sep_symbol = str(args.separator)
print(sep_symbol)
protein_df = pd.read_csv(args.infile, sep=sep_symbol, low_memory=False)
protein_df = protein_df[args.column].unique().tolist()
default_annotation = DefaultAnnotation()
out_dict_list = []
for uniprot_id in tqdm(protein_df):
out = default_annotation.get_protein_functions(uniprot_id, as_dataframe=False)
out_dict_list.extend(out)
protein_class_df = pd.DataFrame(out_dict_list)
protein_class_df.set_index("uniprot_id", inplace=True)
protein_class_df.to_csv(out_file, sep="\t")