This repository has been archived by the owner on Jul 30, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkmer2tsv.py
51 lines (39 loc) · 2.02 KB
/
kmer2tsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import argparse
import glob
import os
import pandas
from Bio import SeqIO
import joblib
arg_parser = argparse.ArgumentParser(description='Convert kmer counts to tsv')
arg_parser.add_argument('--label_path', type=str, help='Path to labels file', default='../dataset/source/simulation_abundance_viral10%.CSV')
arg_parser.add_argument('--fasta_path', type=str, help='Path to fasta file', default='../dataset/simulation_abundance')
arg_parser.add_argument('--verbose', type=bool, help='Verbose', default=True)
args = arg_parser.parse_args()
labels = pandas.read_csv(args.label_path)
print(f'Processing files in {args.fasta_path}')
fasta_files = [*sorted(glob.glob(os.path.join(args.fasta_path, "*.fasta"))),
*sorted(glob.glob(os.path.join(args.fasta_path, "*.fna")))]
print(f'Found {len(fasta_files)} files')
def get_label(ncbi_id):
return int(labels[labels['NCBI Accession'] == ncbi_id]['type'].values[0] == 'virus')
def concat_with_label(kmer, ncbi_id):
real_ncbi_id = ncbi_id.split('|')[3]
if real_ncbi_id not in labels['NCBI Accession'].values:
raise Exception(f'NCBI ID {real_ncbi_id} not found in labels')
if args.verbose:
print(f'Label for {real_ncbi_id} is {get_label(real_ncbi_id)}')
return f'{kmer.upper()}\t{get_label(real_ncbi_id)}'
def process_one_file(filename):
records = list(SeqIO.parse(filename, "fasta"))
if args.verbose:
print(f'Found {len(records)} records in {filename}')
with open(f"{filename}_new.txt", "r") as kmers_file:
kmers = kmers_file.read().split('\n')
if args.verbose:
print(f'Found {len(kmers)} kmers in {filename}')
with open(f"{filename}_new_labeled.txt", "w") as output:
output_records = joblib.Parallel(n_jobs=-1, backend="threading")(
joblib.delayed(concat_with_label)(kmer, record.id) for kmer, record in zip(kmers, records))
output.write("\n".join(output_records))
print(f'File {filename} done')
joblib.Parallel(n_jobs=-1)(joblib.delayed(process_one_file)(filename) for filename in fasta_files)