Skip to content

Commit

Permalink
ENH: first-pass LJA+Flye DOT parser; custom errors
Browse files Browse the repository at this point in the history
Needs testing, and this will crash downstream because the layout code (etc.)
expects nodes to have lengths. But overall, this is a good start.

From there -- need to allow other parsers to create multidigraphs (not
just this one) -- marbl#202. And, for DOT graphs, draw nodes as circles with
uniform sizes (marbl#211). Once these (and the requisite other hurdles I
haven't thought of right now) are passed, we've got something going.
  • Loading branch information
fedarko committed Mar 22, 2023
1 parent c0e74a6 commit 12824c9
Show file tree
Hide file tree
Showing 7 changed files with 302 additions and 79 deletions.
286 changes: 249 additions & 37 deletions metagenomescope/assembly_graph_parser.py

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions metagenomescope/errors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
class GraphParsingError(Exception):
    """Raised when an input assembly graph cannot be parsed or validated.

    The assembly graph parsers raise this (rather than a bare ValueError)
    when a graph file is malformed or uses unsupported features, e.g. a
    node without a specified length or a node block that ends too early.
    The error message describes the specific problem that was found.
    """
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,11 @@ def test_sniff_filetype():
assert sniff_filetype("aSdF.FaStG") == "fastg"
assert sniff_filetype("LastGraphfastg") == "fastg"

assert sniff_filetype("asdf.dot") == "dot"
assert sniff_filetype("ASDF.DOT") == "dot"
assert sniff_filetype("asdf.gv") == "gv"
assert sniff_filetype("ASDF.GV") == "gv"

with pytest.raises(NotImplementedError):
sniff_filetype("asdf.asdf")
with pytest.raises(NotImplementedError):
Expand Down
5 changes: 3 additions & 2 deletions metagenomescope/tests/assembly_graph_parser/test_parse_gfa.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# from .utils import run_tempfile_test
from metagenomescope.input_node_utils import negate_node_id
from metagenomescope.assembly_graph_parser import parse_gfa
from metagenomescope.errors import GraphParsingError
from .utils import run_tempfile_test
from gfapy.error import InconsistencyError

Expand Down Expand Up @@ -134,7 +135,7 @@ def test_parse_no_length_node():
s1.pop(1)
s1.insert(1, "S\t1\t*")
run_tempfile_test(
"gfa", s1, ValueError, "Found a node without a specified length: 1"
"gfa", s1, GraphParsingError, "Found a node without a specified length: 1"
)

# Manually assigning node 1 a sequence should fix the problem
Expand Down Expand Up @@ -177,7 +178,7 @@ def test_parse_invalid_id_node():
run_tempfile_test(
"gfa",
s1,
ValueError,
GraphParsingError,
"Node IDs in the input assembly graph cannot "
'start with the "-" character.',
)
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .utils import run_tempfile_test
from metagenomescope.errors import GraphParsingError
from metagenomescope.assembly_graph_parser import parse_lastgraph
from metagenomescope.tests.assembly_graph_parser.test_validate_lastgraph import (
reset_glines,
Expand Down Expand Up @@ -64,13 +65,13 @@ def test_parse_lastgraph_node_interrupted():
glines = reset_glines()
glines.pop(3)
run_tempfile_test(
"LastGraph", glines, ValueError, "Line 4: Node block ends too early."
"LastGraph", glines, GraphParsingError, "Line 4: Node block ends too early."
)

glines = reset_glines()
glines[2] = "ARC\t1\t1\t5"
run_tempfile_test(
"LastGraph", glines, ValueError, "Line 3: Node block ends too early."
"LastGraph", glines, GraphParsingError, "Line 3: Node block ends too early."
)


Expand All @@ -79,22 +80,22 @@ def test_parse_lastgraph_invalid_node_count():
exp_msg = "Line 1: $NUMBER_OF_NODES must be a positive integer"

glines[0] = "3.5\t10\t1\t1"
run_tempfile_test("LastGraph", glines, ValueError, exp_msg)
run_tempfile_test("LastGraph", glines, GraphParsingError, exp_msg)

glines[0] = "-3.5\t10\t1\t1"
run_tempfile_test("LastGraph", glines, ValueError, exp_msg)
run_tempfile_test("LastGraph", glines, GraphParsingError, exp_msg)

glines[0] = "-2\t10\t1\t1"
run_tempfile_test("LastGraph", glines, ValueError, exp_msg)
run_tempfile_test("LastGraph", glines, GraphParsingError, exp_msg)

glines[0] = "2.0\t10\t1\t1"
run_tempfile_test("LastGraph", glines, ValueError, exp_msg)
run_tempfile_test("LastGraph", glines, GraphParsingError, exp_msg)

glines[0] = "ABC\t10\t1\t1"
run_tempfile_test("LastGraph", glines, ValueError, exp_msg)
run_tempfile_test("LastGraph", glines, GraphParsingError, exp_msg)

glines[0] = "0x123\t10\t1\t1"
run_tempfile_test("LastGraph", glines, ValueError, exp_msg)
run_tempfile_test("LastGraph", glines, GraphParsingError, exp_msg)

glines[0] = "0\t10\t1\t1"
run_tempfile_test("LastGraph", glines, ValueError, exp_msg)
run_tempfile_test("LastGraph", glines, GraphParsingError, exp_msg)
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from networkx import NetworkXError
from .utils import run_tempfile_test
from metagenomescope.errors import GraphParsingError
from metagenomescope.assembly_graph_parser import parse_metacarvel_gml


Expand Down Expand Up @@ -92,34 +93,34 @@ def test_parse_metacarvel_gml_insufficient_node_metadata():
# Remove orientation from node 10
mg.pop(5)
exp_msg = 'Only 11 / 12 nodes have "orientation" given.'
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")
# Remove length from node 10 (it's the line after the previous line we
# removed)
mg.pop(5)
# due to "precedence" (just the order of iteration in the for loops), we
# expect orientation to take priority over length in these error messages
# -- but this doesn't really matter
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")

# Restore mg
mg = get_marygold_gml()
# Now, just remove the length line. We should see an error message about
# length, not orientation, now.
mg.pop(6)
exp_msg = 'Only 11 / 12 nodes have "length" given.'
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")
# For fun, let's remove all of the lines with length and make sure this
# updates the error msg accordingly
mg = [line for line in mg if "length" not in line]
exp_msg = 'Only 0 / 12 nodes have "length" given.'
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")

# ... And let's try that same thing with orientation, which as we've
# established takes priority in error messages (again, doesn't actually
# matter, but we might as well test that this behavior remains consistent)
mg = [line for line in mg if "orientation" not in line]
exp_msg = 'Only 0 / 12 nodes have "orientation" given.'
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")


def test_parse_metacarvel_gml_insufficient_edge_metadata():
Expand All @@ -128,35 +129,35 @@ def test_parse_metacarvel_gml_insufficient_edge_metadata():
# Remove orientation from edge 8 -> 9 (line 190 in the file)
mg.pop(189)
exp_msg = 'Only 15 / 16 edges have "orientation" given.'
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")
# Remove orientation from all edges in the file
mg = [
line
for line in mg
if 'orientation "E' not in line and 'orientation "B' not in line
]
exp_msg = 'Only 0 / 16 edges have "orientation" given.'
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")

# Restore mg
mg = get_marygold_gml()
# Remove mean from edge 12 -> 8
mg.pop(182)
exp_msg = 'Only 15 / 16 edges have "mean" given.'
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")
# Also remove mean from edge 8 -> 9
# This is actually line 191 of the file, but remember 0-indexing + already
# popped one line above in the file
mg.pop(189)
exp_msg = 'Only 14 / 16 edges have "mean" given.'
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")

# Restore mg
mg = get_marygold_gml()
# Remove bsize from edge 7 -> 12
mg.pop(168)
exp_msg = 'Only 15 / 16 edges have "bsize" given.'
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")

# Remove stdev from edge 7 -> 9
# Note that we haven't restored mg from above yet! This tests the whole
Expand All @@ -167,11 +168,11 @@ def test_parse_metacarvel_gml_insufficient_edge_metadata():
# breaking this function.
mg.pop(174)
exp_msg = 'Only 15 / 16 edges have "stdev" given.'
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")

# Remove bsize from *all* lines -- stdev error should still show up
mg = [line for line in mg if "bsize" not in line]
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")


def test_parse_metacarvel_gml_undirected_graph():
Expand All @@ -182,10 +183,10 @@ def test_parse_metacarvel_gml_undirected_graph():
# Try two things: 1) the choice of directed/undirected isn't specified
# (defaults to undirected), 2) explicitly specified as not directed
mg.pop(1)
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")

mg.insert(1, " directed 0\n")
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")


def test_parse_metacarvel_gml_duplicate_edges():
Expand Down Expand Up @@ -221,7 +222,7 @@ def test_parse_metacarvel_gml_duplicate_edges():
# detect that the input graph is a multigraph and be all like "nuh uh
# you didn't get that from MetaCarvel, now did you" (something like that)
run_tempfile_test(
"gml", mg, ValueError, "Multigraphs are unsupported", join_char=""
"gml", mg, GraphParsingError, "Multigraphs are unsupported", join_char=""
)


Expand Down Expand Up @@ -294,7 +295,7 @@ def test_parse_metacarvel_gml_invalid_node_metadata():
# angry if you put a string like FOW outside of quotes in a GML file.)
p = "".join([c for c in o if c != '"'])
exp_msg = 'Node NODE_10 has unsupported orientation "{}".'.format(p)
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")

# Test invalid lengths
lengths = [
Expand All @@ -315,7 +316,7 @@ def test_parse_metacarvel_gml_invalid_node_metadata():
exp_msg = 'Node NODE_10 has non-positive-integer length "{}".'.format(
length_to_test
)
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")


def test_parse_metacarvel_gml_invalid_edge_metadata():
Expand Down Expand Up @@ -344,12 +345,12 @@ def test_parse_metacarvel_gml_invalid_edge_metadata():
# (See test_parse_metacarvel_gml_invalid_node_metadata() above)
p = "".join([c for c in val if c != '"'])
exp_msg = 'has unsupported orientation "{}".'.format(p)
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")

mg.pop(189)
mg.insert(189, ' orientation "REV"\n')
exp_msg = 'has unsupported orientation "REV".'
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")

# Restoring the orientation to something normal (say, BB) should work
mg.pop(189)
Expand All @@ -376,21 +377,21 @@ def test_parse_metacarvel_gml_invalid_edge_metadata():
mg.insert(192, " bsize {}\n".format(val))
p = "".join([c for c in val if c != '"'])
exp_msg = 'has non-positive-integer bsize "{}".'.format(p)
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")

# 3. Test mean
mg = get_marygold_gml()
mg.pop(198)
mg.insert(198, ' mean "ABC"\n')
exp_msg = 'has non-numeric mean "ABC".'
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")

# 4. Test stdev
mg = get_marygold_gml()
mg.pop(199)
mg.insert(199, ' stdev "ABC"\n')
exp_msg = 'has non-numeric stdev "ABC".'
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")
# TODO test bsize, mean, stdev more thoroughly

# Test that NaN/infinity/zero/negative values work ok (but only with
Expand Down Expand Up @@ -461,14 +462,14 @@ def test_parse_metacarvel_gml_repeated_node_attrs():
mg = get_marygold_gml()
mg.insert(5, ' orientation "REV"\n')
exp_msg = "Node NODE_10 has unsupported orientation \"['REV', 'FOW']\"."
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")

mg = get_marygold_gml()
mg.insert(5, ' length "200"\n')
exp_msg = (
"Node NODE_10 has non-positive-integer length \"['200', '100']\"."
)
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")


def test_parse_metacarvel_gml_repeated_edge_attrs():
Expand All @@ -478,31 +479,31 @@ def test_parse_metacarvel_gml_repeated_edge_attrs():
"Edge ('NODE_7', 'NODE_12') has unsupported orientation "
"\"['EB', 'EB']\"."
)
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")

mg = get_marygold_gml()
mg.insert(166, ' mean "123.45"\n')
exp_msg = (
"Edge ('NODE_7', 'NODE_12') has non-numeric mean "
"\"['123.45', '-200.00']\"."
)
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")

mg = get_marygold_gml()
mg.insert(166, ' stdev "123.45"\n')
exp_msg = (
"Edge ('NODE_7', 'NODE_12') has non-numeric stdev "
"\"['123.45', 25.1234]\"."
)
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")

mg = get_marygold_gml()
mg.insert(167, " bsize 15\n")
exp_msg = (
"Edge ('NODE_7', 'NODE_12') has non-positive-integer bsize "
'"[15, 30]".'
)
run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="")
run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="")


def test_parse_metacarvel_gml_repeated_edge_source_or_target():
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pytest
from io import StringIO
from metagenomescope.errors import GraphParsingError
from metagenomescope.assembly_graph_parser import validate_lastgraph_file


Expand All @@ -9,9 +10,9 @@ def get_validate_err(glines):
bad_lg = StringIO("\n".join(glines))
# Assume that the LastGraph file represented by bad_lg will fail
# validation, and return the accompanying error message.
# (If a ValueError *isn't* raised, this'll throw an error saying DID NOT
# RAISE or something.)
with pytest.raises(ValueError) as ei:
# (If a GraphParsingError *isn't* raised, this'll throw an error saying
# DID NOT RAISE or something.)
with pytest.raises(GraphParsingError) as ei:
validate_lastgraph_file(bad_lg)
return str(ei.value)

Expand Down

0 comments on commit 12824c9

Please sign in to comment.