Skip to content

Commit

Permalink
Make removing taxa easy; closes #12
Browse files Browse the repository at this point in the history
  • Loading branch information
xrotwang committed Aug 9, 2023
1 parent 665b11a commit 4e2287e
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 2 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Changes

## [Unreleased]

- `tools.normalise.normalise` now accepts a `remove_taxa` argument, making it easy to remove taxa from a NEXUS file in a consistent way.


## [v1.5.0] - 2023-07-20

- Make NEXUS content created by `commonnexus` simpler to parse and thus
Expand Down
19 changes: 17 additions & 2 deletions src/commonnexus/tools/normalise.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,24 @@
- The ";" terminating MATRIX commands is on a separate line, allowing more simplistic parsing
of matrix rows.
"""
import typing
import collections

from commonnexus import Nexus
from commonnexus.blocks.characters import Data
from commonnexus.blocks import Taxa, Distances, Characters, Trees


def normalise(nexus: Nexus,
data_to_characters: bool = False,
strip_comments: bool = False) -> Nexus:
strip_comments: bool = False,
remove_taxa: typing.Optional[typing.Container[str]] = None) -> Nexus:
"""
:param nexus: A `Nexus` object to be normalised in-place.
:param data_to_characters: Flag signaling whether DATA blocks should be converted to CHARACTERS \
blocks.
:param strip_comments: Flag signaling whether to remove all non-command comments.
:param remove_taxa: Container of taxon labels specifying taxa to remove from relevant blocks.
:return: The modified `Nexus` object.
.. code-block:: python
Expand Down Expand Up @@ -87,6 +92,8 @@ def normalise(nexus: Nexus,
TREE 1 = (t1,t2,t3);
END;
"""
remove_taxa = remove_taxa or []

if strip_comments:
nexus = Nexus([cmd.without_comments() for cmd in nexus], config=nexus.cfg)
nexus = Nexus([cmd.with_normalised_whitespace() for cmd in nexus], config=nexus.cfg)
Expand All @@ -95,6 +102,7 @@ def normalise(nexus: Nexus,
if nexus.characters:
matrix = nexus.characters.get_matrix()
taxlabels = list(matrix.keys())
matrix = collections.OrderedDict((k, v) for k, v in matrix.items() if k not in remove_taxa)
characters = nexus.DATA or nexus.CHARACTERS
cls = Data if characters.name == 'DATA' and not data_to_characters else Characters
nexus.replace_block(
Expand All @@ -107,19 +115,26 @@ def normalise(nexus: Nexus,
assert set(matrix.keys()).issubset(taxlabels)
else:
taxlabels = list(matrix.keys())
matrix = collections.OrderedDict(
(k, collections.OrderedDict((kk, vv) for kk, vv in v.items() if kk not in remove_taxa))
for k, v in matrix.items() if k not in remove_taxa)
nexus.replace_block(nexus.DISTANCES, Distances.from_data(matrix))

if nexus.TREES:
trees = []
for tree in nexus.TREES.trees:
nwk = nexus.TREES.translate(tree) if nexus.TREES.TRANSLATE else tree.newick
if remove_taxa:
nwk.prune_by_names(remove_taxa)
trees.append((tree.name, nwk, tree.rooted))
nexus.replace_block(nexus.TREES, Trees.from_data(*trees))

if taxlabels:
taxa = Taxa.from_data([t for t in taxlabels if t not in remove_taxa])
if nexus.TAXA:
assert nexus.TAXA.DIMENSIONS.ntax == len(taxlabels)
assert set(nexus.TAXA.TAXLABELS.labels.values()) == set(taxlabels)
nexus.replace_block(nexus.TAXA, taxa)
else:
nexus.prepend_block(Taxa.from_data(taxlabels))
nexus.prepend_block(taxa)
return nexus
32 changes: 32 additions & 0 deletions tests/test_tools_normalise.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,38 @@ def test_normalise(nexus):
assert res.TAXA


def test_normalise_remove_taxon(nexus):
    """Check that `normalise(..., remove_taxa=...)` drops a taxon from every block.

    The input names three taxa ('t 1', t2, t3) in a CHARACTERS matrix, a
    DISTANCES matrix and a TREES block that uses a TRANSLATE table. After
    normalising with ``remove_taxa={'t2'}`` the serialized NEXUS text is
    compared verbatim with the expected output, in which t2 is absent from the
    TAXA labels, both matrices and the tree.
    """
    # `nexus` is a fixture that builds a Nexus object from block-name keyword
    # arguments; its definition is not visible here.
    # One label contains a space ('t 1'), so quoting must survive normalisation.
    # The tree refers to taxa through TRANSLATE keys (a, b, c), while
    # `remove_taxa` is given as a taxon label ('t2'), not as a key.
    nex = nexus(
        CHARACTERS="DIMENSIONS NCHAR=3; MATRIX 't 1' 100 t2 010 t3 001;",
        DISTANCES="FORMAT NODIAGONAL; MATRIX 't 1' t2 1.0 t3 2.0 3.0;",
        TREES="TRANSLATE a 't 1', b t2, c t3; TREE 1 = (a,b\n,c);")
    res = str(normalise(nex, remove_taxa={'t2'}))
    # The comparison is an exact string match, so whitespace in the literal
    # below is significant.
    # NOTE(review): confirm the literal's column alignment against the
    # serializer's real output before relying on this copy.
    assert res == """#NEXUS
BEGIN TAXA;
DIMENSIONS NTAX=2;
TAXLABELS 't 1' t3;
END;
BEGIN CHARACTERS;
DIMENSIONS NCHAR=3;
FORMAT DATATYPE=STANDARD MISSING=? GAP=- SYMBOLS="01";
MATRIX
't 1' 100
t3 001
;
END;
BEGIN DISTANCES;
DIMENSIONS NTAX=2;
FORMAT TRIANGLE=BOTH MISSING=?;
MATRIX
't 1' 0 2.0
t3 2.0 0
;
END;
BEGIN TREES;
TREE 1 = ('t 1',t3);
END;"""


def test_normalise_stripcomments():
res = normalise(Nexus('#nexus beg[c]in bl[&c]ock; cmd; end[c];'), strip_comments=True)
assert '[c]' not in str(res)
Expand Down

0 comments on commit 4e2287e

Please sign in to comment.