Skip to content

Commit

Permalink
test for virsorter2
Browse files Browse the repository at this point in the history
  • Loading branch information
KateSakharova committed Oct 18, 2024
1 parent 7e19f7f commit 995808c
Show file tree
Hide file tree
Showing 11 changed files with 1,585 additions and 22 deletions.
24 changes: 13 additions & 11 deletions bin/parse_viral_pred.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def parse_virus_sorter(sorter_files):
return high_confidence, low_confidence, prophages


def parse_virus_sorter2(sorter_files):
def parse_virus_sorter2(sorter_files, vs_cutoff):
"""Extract high, low and prophages confidence Records from virus sorter results.
High confidence are contigs with confidence score >= confidence cutoff (0.9 per default).
Low confidence are contigs with confidence score < confidence cutoff and > 0.5.
Expand Down Expand Up @@ -222,6 +222,7 @@ def parse_virus_sorter2(sorter_files):
prophages = dict()

final_boundary_file, final_score_file, final_combined_fa_file = "", "", ""
print('SORTER',sorter_files)
for i in sorter_files:
if "final-viral-boundary.tsv" in i:
final_boundary_file = i
Expand Down Expand Up @@ -250,15 +251,15 @@ def parse_virus_sorter2(sorter_files):
if not circular or not max_score:
continue
circular = circular.startswith('circular')

if 'partial' in record.id:
# add the prophage position within the contig
record.id = clean_name
prophages.setdefault(clean_name, []).append(
Record(record, "prophage", circular, prange)
)

record.id = clean_name

if float(max_score) >= float(args.vs_cutoff):
if float(max_score) >= float(vs_cutoff):
high_confidence[record.id] = Record(record, "high_confidence", circular)
else:
low_confidence[record.id] = Record(record, "low_confidence", circular)
Expand All @@ -270,7 +271,7 @@ def parse_virus_sorter2(sorter_files):
return high_confidence, low_confidence, prophages


def merge_annotations(pprmeta, finder, sorter, sorter2, assembly):
def merge_annotations(pprmeta, finder, sorter, sorter2, assembly, vs_cutoff):
"""Parse VirSorter, VirFinder and PPR-Meta outputs and merge the results.
High confidence viral contigs:
- VirSorter reported as categories 1 and 2
Expand All @@ -291,19 +292,20 @@ def merge_annotations(pprmeta, finder, sorter, sorter2, assembly):
pprmeta_lc = parse_pprmeta(pprmeta)
finder_lc, finder_lowestc = parse_virus_finder(finder)
sorter_hc, sorter_lc, sorter_prophages = parse_virus_sorter(sorter) if sorter is not None else parse_virus_sorter2(
sorter2)
sorter2, vs_cutoff)

for seq_record in SeqIO.parse(assembly, "fasta"):
# HC
if seq_record.id in sorter_hc:
hc_predictions_contigs.append(sorter_hc.get(seq_record.id).get_seq_record())
# Pro
elif seq_record.id in sorter_prophages:
# TODO: check this decision because for VirSorter2 sequence can be in 2 categories
if seq_record.id in sorter_prophages:
# a contig may have several prophages
# for prophages write the record as it holds the
# sliced fasta
for record in sorter_prophages.get(seq_record.id):
prophage_predictions_contigs.append(record.get_seq_record())
# HC
if seq_record.id in sorter_hc:
hc_predictions_contigs.append(sorter_hc.get(seq_record.id).get_seq_record())
# LC
elif seq_record.id in finder_lc:
lc_predictions_contigs.append(seq_record)
Expand Down Expand Up @@ -331,7 +333,7 @@ def main(pprmeta, finder, sorter, sorter2, assembly, outdir, vs_cutoff, prefix=F
sorter_hc,
sorter_lc,
sorter_prophages,
) = merge_annotations(pprmeta, finder, sorter, sorter2, assembly)
) = merge_annotations(pprmeta, finder, sorter, sorter2, assembly, vs_cutoff)

at_least_one = False
name_prefix = ""
Expand Down
3 changes: 2 additions & 1 deletion pytest.ini
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
[pytest]
pythonpath = . bin
norecursedirs = cwl/*
addopts= --git-aware --symlink --color=no

##addopts= --git-aware --symlink --color=no
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
seqname trim_orf_index_start trim_orf_index_end trim_bp_start trim_bp_end trim_pr trim_pr_max prox_orf_index_start prox_orf_index_end prox_bp_start prox_bp_end prox_pr prox_pr_max partial full_orf_index_start full_orf_index_end full_bp_start full_bp_end pr_full arc bac euk vir mix unaligned hallmark_cnt group shape seqname_new final_max_score final_max_score_group
phage_1 1 13 3 23076 1.0 1.0 1 13 3 23076 NA NA 0 1 13 3 23076 1.0 0.0 0.0 0.0 84.6 0.0 15.4 7 dsDNAphage linear seq91||full 1.0 dsDNAphage
phage_2 1 30 1 23146 1.0 1.0 1 30 1 23146 NA NA 0 1 30 1 23146 1.0 0.0 0.0 0.0 93.3 0.0 6.7 11 dsDNAphage linear seq90||full 1.0 dsDNAphage
hc_1 1 65 92 47657 1.0 1.0 1 65 92 47657 NA NA 0 1 65 92 47657 1.0 0.0 0.0 0.0 87.7 0.0 12.3 3 dsDNAphage circular seq9||full 1.0 dsDNAphage
hc_2 1 33 61 20731 0.727 0.76 1 34 61 23816 NA NA 0 1 34 61 23816 0.72 0.0 0.0 2.9 88.2 5.9 2.9 0 dsDNAphage linear seq78||full 0.727 dsDNAphage
lc_1 1 25 2 24532 0.993 0.993 1 25 2 24532 NA NA 0 1 25 2 24532 0.993 0.0 0.0 0.0 60.0 0.0 40.0 2 dsDNAphage linear seq72||full 0.993 dsDNAphage
lc_2 1 36 2 25058 0.947 0.96 1 36 2 25058 NA NA 0 1 36 2 25058 0.947 0.0 0.0 0.0 86.1 2.8 11.1 0 dsDNAphage circular seq70||full 0.947 dsDNAphage
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
>phage_1||partial
AAA
>phage_2||1_partial
GGG
>hc_1||full
TTT
>hc_2||lt2gene
T
>lc_1||full
C
>lc_2||lt2gene
AA
>lc_3||full
GG
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
seqname dsDNAphage ssDNA max_score max_score_group length hallmark viral cellular
phage_1||partial 0.913 0.053 0.7 dsDNAphage 105648 0 67.800 1.300
phage_2||1_partial 1.000 0.087 1.000 dsDNAphage 62420 2 66.000 0.000
hc_1||full 0.987 0.027 0.987 dsDNAphage 57882 15 83.000 1.900
hc_2||lt2gene 1.000 0.413 1.000 dsDNAphage 47566 3 87.700 0.000
lc_1||full 1.000 0.173 0.8 dsDNAphage 45193 1 85.500 0.000
lc_2||lt2gene 1.000 0.513 0.5 dsDNAphage 45227 4 74.000 0.000
seq10||lt2gene nan nan nan dsDNAphage 5899 1 100.000 0.000
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,5 @@ TTT
G
>seq6
C
>seq7|phage-circular
T
>seq8
AA
>seq9
GG
Loading

0 comments on commit 995808c

Please sign in to comment.