Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: check pairwise vs sourmash compare output #366

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions test_check_pairwise/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
*.csv
*~
*.txt
.snakemake
*.cmp
Binary file added test_check_pairwise/64sketches.sig.zip
Binary file not shown.
9 changes: 9 additions & 0 deletions test_check_pairwise/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
.PHONY: all clean cleanall

all:
snakemake -c 1 -p

clean:
snakemake -c 1 --delete-all-output

cleanall: clean all
51 changes: 51 additions & 0 deletions test_check_pairwise/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
COLNAME='average_containment_ani'
COMPARE_ANI_ARGS='--ani --avg-containment'

rule all:
input:
"pairwise-vs-mat.txt",
"mat-vs-pairwise.txt"

rule compare_txt:
input:
pairwise="64sketches.pairwise.csv",
mat="64sketches.mat.csv",
script="compare_pairwise_csv.py",
output:
txt="pairwise-vs-mat.txt",
txt2="mat-vs-pairwise.txt"
shell: """
./compare_pairwise_csv.py {input.mat} {input.pairwise} -u {COLNAME} -R 8 | tee {output.txt}
./compare_pairwise_csv.py {input.pairwise} {input.mat} -u {COLNAME} -R 8 | tee {output.txt2}
"""

rule make_pairwise:
input:
"64sketches.sig.zip"
output:
"64sketches.pairwise.csv"
shell: """
sourmash scripts pairwise --write-all {input} -o {output} --ani -t 0
"""

rule make_mat:
input:
sig="64sketches.sig.zip",
output:
cmp="64sketches.cmp",
txt="64sketches.cmp.labels.txt",
labels="64sketches.labels.csv",
shell: """
sourmash compare {input.sig} -o {output.cmp} {COMPARE_ANI_ARGS} --labels-to {output.labels}
"""

rule mat_to_pair:
input:
cmp="64sketches.cmp",
labels="64sketches.labels.csv",
script="matrix_to_pairwise.py",
output:
csv="64sketches.mat.csv"
shell: """
./matrix_to_pairwise.py {input.cmp} {input.labels} -o {output.csv} -u {COLNAME}
"""
86 changes: 86 additions & 0 deletions test_check_pairwise/compare_pairwise_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#! /usr/bin/env python
import sys
import argparse
import csv
import numpy
from collections import defaultdict

from sourmash.logging import debug_literal, error, notify, print_results
from sourmash import sourmash_args

def main():
p = argparse.ArgumentParser()
p.add_argument('csv1')
p.add_argument('csv2')
p.add_argument('-u', '--use-column', default='jaccard')
p.add_argument('-R', '--round-to', default=10, type=int)
args = p.parse_args()

notify(f"loading '{args.csv1}'")
with sourmash_args.FileInputCSV(args.csv1) as r:
rows1 = list(r)
notify(f"loaded {len(rows1)} rows.")

notify(f"loading '{args.csv2}'")
with sourmash_args.FileInputCSV(args.csv2) as r:
rows2 = list(r)
notify(f"loaded {len(rows2)} rows.")

queries1 = set(( row["query_name"] for row in rows1 ))
matches1 = set(( row["match_name"] for row in rows1 ))

queries2 = set(( row["query_name"] for row in rows2 ))
matches2 = set(( row["match_name"] for row in rows2 ))

assert queries1 == queries2, queries1 ^ queries2
assert matches1 == matches2, matches1 ^ matches2

colname = args.use_column

d1 = defaultdict(dict)
for row in rows1:
q = row["query_name"]
m = row["match_name"]
value = float(row[colname])

d1[q][m] = value

#import pprint
#pprint.pprint(d1)

fail = False
num_matching = 0
num_missing = 0
num_ident = 0
num_nomatch = 0
for row in rows2:
q = row["query_name"]
m = row["match_name"]
val2 = float(row[colname])

val1 = d1[q].get(m, -1)

if val1 == -1:
num_missing += 1
fail = True
elif round(val1, args.round_to) != round(val2, args.round_to):
notify(f"MISMATCH: '{q}' vs '{m}', {val1} != {val2}")
fail = True
num_nomatch += 1
else:
if val1 == 1.0: # and val2 == 1.0
num_ident += 1
else:
num_matching += 1

print(f"values at 100%: {num_ident}")
print(f"matching: {num_matching}")
print(f"non-matching: {num_nomatch}")
print(f"num missing: {num_missing}")

if fail:
notify("differences detected! failing!")
sys.exit(-1)

if __name__ == '__main__':
sys.exit(main())
51 changes: 51 additions & 0 deletions test_check_pairwise/matrix_to_pairwise.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#! /usr/bin/env python
import sys
import argparse
import csv
import numpy

from sourmash.logging import debug_literal, error, notify, print_results
from sourmash import sourmash_args

def load_labelinfo_csv(filename):
"Load file output by 'sourmash compare --labels-to'"
with sourmash_args.FileInputCSV(filename) as r:
labelinfo = list(r)

labelinfo.sort(key=lambda row: int(row["sort_order"]))
return labelinfo


def main():
p = argparse.ArgumentParser()
p.add_argument('mat', help='numpy similarity matrix')
p.add_argument('labels_from', help='CSV file output by --labels-to')
p.add_argument('-o', '--output-pairwise-csv', required=True,
help="save CSV to this file")
p.add_argument('-u', '--use-column', default='jaccard')
args = p.parse_args()

with open(args.mat, "rb") as fp:
mat = numpy.load(fp)
notify(f"...got {mat.shape[0]} x {mat.shape[1]} matrix.", *mat.shape)

labelinfo = load_labelinfo_csv(args.labels_from)

n_rows = 0
with open(args.output_pairwise_csv, "w", newline="") as outfp:
w = csv.writer(outfp)
w.writerow(["query_name", "match_name", args.use_column])
for i in range(mat.shape[0]):
for j in range(i + 1):
val = mat[i][j]
if val > 0.0:
w.writerow([labelinfo[j]["label"],
labelinfo[i]["label"],
val])
n_rows += 1

notify(f"wrote {n_rows} rows to '{args.output_pairwise_csv}'")


if __name__ == '__main__':
sys.exit(main())