From 9f240ea4b8e0371131e0ea0e87d6e3a866f13a0b Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Mon, 11 Mar 2024 19:59:46 +0100 Subject: [PATCH] ENH: add sample_dict method to MultiFASTADirectoryFormat (#311) * ENH: add sample_dict method to MultiFASTADirectoryFormat * Add a MultiMAGSequencesDirFmt -> MultiFASTADirectoryFormat transformer * ENH: add FeatureData[Contig] type * Lint * Add missing files * Revert "ENH: add FeatureData[Contig] type" This reverts commit edbd24a8beb40318dd749055cfadf5379aebe1ec. This commit was moved to a separate branch/PR. --- q2_types/per_sample_sequences/_format.py | 49 +++++++++++++- q2_types/per_sample_sequences/_transformer.py | 14 ++++ .../data/mags/mags-fa-with-manifest/MANIFEST | 6 ++ .../mags-fa-with-manifest/sample1/mag1.fa | 51 +++++++++++++++ .../mags-fa-with-manifest/sample1/mag2.fa | 64 +++++++++++++++++++ .../mags-fa-with-manifest/sample1/mag3.fa | 46 +++++++++++++ .../mags-fa-with-manifest/sample2/mag1.fa | 41 ++++++++++++ .../mags-fa-with-manifest/sample2/mag2.fa | 42 ++++++++++++ .../per_sample_sequences/tests/test_format.py | 37 ++++++++++- .../tests/test_transformer.py | 8 +++ 10 files changed, 355 insertions(+), 3 deletions(-) create mode 100644 q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/MANIFEST create mode 100644 q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/sample1/mag1.fa create mode 100644 q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/sample1/mag2.fa create mode 100644 q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/sample1/mag3.fa create mode 100644 q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/sample2/mag1.fa create mode 100644 q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/sample2/mag2.fa diff --git a/q2_types/per_sample_sequences/_format.py b/q2_types/per_sample_sequences/_format.py index 25240bc9..d20ac7f0 100644 --- a/q2_types/per_sample_sequences/_format.py +++ 
b/q2_types/per_sample_sequences/_format.py
@@ -597,13 +597,60 @@ def _validate_(self, level):
 
 class MultiFASTADirectoryFormat(MultiDirValidationMixin,
                                 model.DirectoryFormat):
-    sequences = model.FileCollection(r'.+\.(fa|fasta)$', format=DNAFASTAFormat)
+    pathspec = r'.+\.(fa|fasta)$'
+    sequences = model.FileCollection(pathspec, format=DNAFASTAFormat)
 
     @sequences.set_path_maker
     def sequences_path_maker(self, sample_id, mag_id):
         # write out with fasta extension, regardless if input was fa or fasta
         return '%s/%s.fasta' % (sample_id, mag_id)
 
+    def sample_dict(self, relative=False):
+        """
+        Returns a mapping of sample id to another dictionary where keys
+        represent the MAG ID and values correspond to the filepath for
+        each MAG.
+
+        Parameters
+        ----------
+        relative : bool
+            Whether to return filepaths relative to the directory's location.
+            Returns absolute filepaths by default.
+
+        Returns
+        -------
+        dict
+            Mapping of sample id -> dict {mag_id: mag_filepath} as
+            described above. Both levels of the dictionary are
+            sorted alphabetically by key. A sample directory that
+            holds no MAG files maps to an empty dict.
+        """
+        mags_pattern = re.compile(self.pathspec)
+        # resolved once: the base of every relative path returned
+        root = self.path.absolute()
+        ids = {}
+        for d in self.path.iterdir():
+            # top-level files (e.g. a MANIFEST) are not samples
+            if not d.is_dir():
+                continue
+
+            mags = {}
+            for path in d.iterdir():
+                if not mags_pattern.match(path.name):
+                    continue
+
+                # stem drops only the final .fa/.fasta suffix, so MAG
+                # IDs that themselves contain dots are kept intact
+                mag_fp = path.absolute()
+                if relative:
+                    mag_fp = mag_fp.relative_to(root)
+                mags[path.stem] = str(mag_fp)
+
+            # directory names are unique, so one entry per sample
+            ids[d.name] = dict(sorted(mags.items()))
+
+        return dict(sorted(ids.items()))
+
 
 class MultiMAGSequencesDirFmt(MultiFASTADirectoryFormat):
     manifest = model.File('MANIFEST', format=MultiMAGManifestFormat)
diff --git a/q2_types/per_sample_sequences/_transformer.py b/q2_types/per_sample_sequences/_transformer.py
index 1a3e1581..7c33431e 100644
--- a/q2_types/per_sample_sequences/_transformer.py
+++ b/q2_types/per_sample_sequences/_transformer.py
@@ -7,6 +7,7 @@
 # ----------------------------------------------------------------------------
 
 import os
+import shutil
 import functools
 import re
 import warnings
@@ -276,3 +277,16 @@ def _29(ff: MultiMAGManifestFormat) -> pd.DataFrame:
         lambda f: os.path.join(ff.path.parent, f))
     df.set_index(['sample-id', 'mag-id'], inplace=True)
     return df
+
+
+@plugin.register_transformer
+def _30(dirfmt: MultiMAGSequencesDirFmt) \
+        -> MultiFASTADirectoryFormat:
+    result = MultiFASTADirectoryFormat()
+    for sample_id, mag in dirfmt.sample_dict().items():
+        os.makedirs(os.path.join(result.path, sample_id))
+        for mag_id, mag_fp in mag.items():
+            shutil.copy(
+                mag_fp, os.path.join(result.path, sample_id, f"{mag_id}.fa")
+            )
+    return result
diff --git a/q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/MANIFEST b/q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/MANIFEST
new file mode 100644
index 00000000..e21594a3
--- /dev/null
+++ 
b/q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/MANIFEST @@ -0,0 +1,6 @@ +sample-id,mag-id,filename +sample1,mag1,sample1/mag1.fasta +sample1,mag2,sample1/mag2.fasta +sample1,mag3,sample1/mag3.fasta +sample2,mag1,sample2/mag1.fasta +sample2,mag2,sample2/mag2.fasta diff --git a/q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/sample1/mag1.fa b/q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/sample1/mag1.fa new file mode 100644 index 00000000..484d44ec --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/sample1/mag1.fa @@ -0,0 +1,51 @@ +>k129_5480 +TTATTTTCAAGATAATGAGCCAATTTAAGCGGTGTCTGGCCGCCAAGCTGCACGATCACA +CCTTTAACTTTCCCATGCTCATTTTCTGCTTCAATCAATGACAATACATCTTCGCCTGTG +AGCGGCTCGAAATATAATCTGTCAGAGGTATCATAATCCGTTGAAACGGTTTCAGGATTA +CAATTAACCATGATTGTTTCATAACCCGCCTCTTTTAGTGCATAGGCGGCATGGACACAG +CAATAATCAAATTCAATACCTTGCCCGATACGGTTTGGCCCGCCACCTAGAATAACGATT +TTGTCTTTTTTAGTTGCTGTAATTTCAGAAGTAGAATTAAGTGTTTCATAGGTGCCGTAC +ATATACGATGTTAATGACGGTATTTCTGCCGCACAGCTATCCACCCGCTTATAAACAGGT +TTTACTTTGTGCATCAAACGTGTTTTACGGATCGTTGCTTCTGCAACCCCTACCAATTCA +GCCAGACGCGCATCCGAAAATCCTGCGCGCTTCAATGCCATCCATCCCTGAGGATCTTTC +GGCAGGCCGTTTTTCTTAATGGAGGCCTCAGTATCAATAAGAGATTTTATACGCTCTAAA +TACCACATATCAAATTTTGTTAATTGATAGATAGTTTCTAAATCCATACCGTGTCGCATC +GCTTCGGCTGCATAGAGTAAGCGGGCTGGCGTTGGACGTGAAAGTGCTGCCCGAATATCG +TCCATATCAGGCTCAGACTTACCAGCAATCGGAATGGAGCTAAGCCCCTCTAAGCCCTTT +TCTAAAGAGCGCAAAGCTTTTTGCAGAGACTCTTCGAAGCTACGCCCTATAGCCATGGCT +TCACCGACTGACTTCATTGCTGTGGTTAAGGTGTTATCAGAGCCTTTAAATTTCTCGAAA +GCAAAACGAGGCACTTTTGTCACGACATAATCAATGGATGGCTCAAAGGCTGCGGGTGTT +TTGCCGCCTGTAATATCATTGCCTAATTCATCAAGTGTATACCCTACCGCCAATTTCGCT +GCCACTTTAGCAATCGGAAAACCTGTAGCTTTTGAGGCTAAAGCAGAAGAACGAGACACA +CGAGGGTTCATCTCAATCACCACCATACGGCCTGTCTCTGGATCCATTCCAAATTGGACA +TTCGATCCACCTGTTTCAACACCAATCACACGAAGTACGGCCAATGAGGCATTGCGCATG +ATTTGATACTCTTTATCTGTCAGTGTTAAGGCTGGAGCAACGGTAATAGAATCACCTGTA 
+TGCACGCCCATAGGGTCAATGTTTTCAATCGAACAAATAATGATAGCGTTGTCCTTTGTA +TCACGAACAACCTCCATCTCGTATTCTTTCCAACCCAATAAACTCTCATCAATCAACACT +TCATTGGTTGGTGACGCATCCAAGCCTTCACGAATGATTTGTTCAAACTCATCTTTGTTA +TAAGCAACCCCGCCACCAGAACCACCCATGGTAAAGGATGGACGAATAATCGCTGGTAAG +CCTGTATGTTTCAGAGCCTCTCTAGCCTCTTCCATAGAATGCACCACCGCACTTTTAGGA +CTTTCAAGACCAATCTTCTCCATACAATCTTTAAATAATTGGCGGTCTTCAGCCTTTTCA +ATGGCTTCTTTATTGGCACCGATCAGTTCAATATTGAGTCTTTTTAATACACCCATTTTA +TCAAGAGCCAGTGCAGCATTCAGTGCCGTCTGACCACCCATGGTTGGAAGCAACGCATCG +GGGCGTTCTTTTTCTAAAATCTTTGCGACAATTTCTGGGGTGATTGGCTCAATATAAGTC +GCATCAGCCATATTCGGATCAGTCATAATTGTGGCTGGATTAGAATTAATCAGGACAACG +CGGTACCCCTCTTCTTTCAGCGCTTTACAGGCTTGTGCACCTGAATAGTCAAATTCACAG +GCTTGACCTATCACGATAGGACCAGCGCCAATAATACAAATGGAGGAAATGTCGGTGCGT +TTAGGCATGTGAATCTCGGTTTCTTTTTTTTATACTTACCGAGAGTTAGTTTATGCACTT +ATCAGGGTGTGCAGACAAGCTCTTTCTTGACCTTACCCGCAAGTTTAGCTATATTCTATC +AACAGCCCGCCCTTGATGGCGGGTTATTTTATTGAAAAGGTGCAAGGCTATGCAAAAAAT +ACCCTTAACAAAACAAGGCCACACAGACCTTGAAGCAGAATTAAAAGATTTAAAACACCG +CCAACGTCCAGCGGTTATTGCTGCGATATCTGAAGCCAGAGAACATGGCGATTTATCAGA +AAACGCTGAATATCACGCCGCCCGTGAGCAGCAAAGCTTTATCGAAGGTCGTATCGAGCA +AGTCGAAGCTATTTTATCGCTCGCTGAGATTATTGACCCGGCCAAAATTTCTGGTGACAC +GGTAAAATTTGCAGCAACTGTTAAAGTCGTTGATTGTGACACAGATGATGAACATATCTA +CCAAATCGTCGGTGATGAAGAATCAGACATTGAAACAGGAAAACTGGCTATCTCGTCACC +TGTTGCCCGCGCTTTAATCGGCAAAAAAGTTGAGGACTCAGTCGAAGTCCGCACACCAAA +AGGCACAAGAGAATACGAAATTTTAGAAATTCTGTATAAGTAATTTCTATTCTTCGATCG +GTACGCCAGGCTTCTTGAAATTACGTTTCATAATAAGTGATGACTTAACAGAGCGAACAT +TTTTTAGCGCTGTCAGTTCTTCTGTAATAAAACGCTGATAAGCATCCCAATCTTTGGCCA +CAATACGGAGTGTGAAATCCATATCACCCGCAATCATGTAACAATCACGAACGAGATCCA +TTTTCTCAACGGCTTTGATAAAGGCCTGAAGGTCTTTTTCTGAAGTGTCTTCTAAAGCTA +CATTGGCAAAAACCGCCACACCATAGCCTAACATTGAAGCACTTAAATCCGCATGATAAC +TTTGGATATAACCATAATCTTCCAAT diff --git a/q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/sample1/mag2.fa b/q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/sample1/mag2.fa new file mode 100644 index 00000000..524cddb5 --- 
/dev/null +++ b/q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/sample1/mag2.fa @@ -0,0 +1,64 @@ +>k129_5112 +CCCCGGAAAGGGCTGGCGACCGACGATGACCTCGGGAAGCCCCAACTCGCGGCCGATGGC +GCGTACCTCGTCTTTGAAGAGGGTGCGAAGGGGCTCGACGAGGTCGAACTGGAGGTCTTC +GGGCAGCCCACCGACGTTGTGGTGGCTCTTGATGTTGGCGGTTCCAGCCCCGCCACCGGA +CTCGACGACATCCGGATACAGGGTGCCCTGCACGAGGAAGCGGATGGGTTCGCCGTCGGC +CTTGGCCTCATCCACGAGCTCGCGCTGCACCCGCTCGAACGCACGGATGAACTCGCGACC +GATGATCTTGCGCTTCTCTTCGGGATCGCTGACGCCGGCGAGGGCCTCGAGGAACGTCTC +GCGGGCGTCGACGGTCACGAGGCGCACACCGGTCGAGGCTACGTAATCCTGCTCGACCTG +TTCGCGTTCGCCCTTGCGCAGCAGGCCGTGGTCGACGAACACGGCAACGAGCTGGTCGCC +GACTGCCTTGTGAACGAGGGCCGTCGAGACAGCCGAGTCGACTCCGCCCGACAGCGCCGA +GAGCACACGACCCGAGCCGACCTGCGCGCGGATCCGATCGACCTGCTCGGCGATGACGTT +GCCGCTGTTCCAGTCTGCGGGGAGGCCCGCAGCCTTGTGCAGGAAGTTCTCGATGATGCG +CTGCCCGTGGTCGGAGTGCTTGACCTCGGGATGCCACTGCACACCGTACATGCGGCGAGC +GTCGTTGCCGAAAGCGGCGACCGGGGTGGCACCGGTGCGGGCGAGCACCTCGAACCCGGC +GGGGGCTTCGGACACCTGGTCACCATGGCTCATCCAGACGTTCTGCTCCGCGGGCTGGCC +ATCGAACAGTACGCTCTCGTCACGGATGATGCTGGCGTCAGTCGCCCCGTACTCGCGCAG +CCCCGTGTTCGCAACGACGCCACCGAGCGCCTGCGCCATGACCTGGAATCCGTAGCAGAT +GCCAAGGGTCGGAACGCCCAGGTCGAACACCGCCGGGTCGAGCGTCGGCGCGCCAGGCTC +GTACACCGATGACGGTCCGCCCGACAGGATGATGCCGATCGGATCTTTTGCGGCAATCTC +TTCAGCTGTCGCGGTGTGCGGAACCAGCTCGCTGTAGACGCCCGCTTCGCGCACGCGACG +GGCAATGAGCTGGGCGTACTGCGCGCCGAAGTCGACGACGAGGACGGGTCGCTGCGAGGT +CTCGGTCTGTTCTGTCACCGGATGCTTTCGGTCGGCGCCCCTGGAACCCAGGAGCGAAGG +TCAGGACACTGTGGGGTTCTGGCGGGTCACGCTGGAGTGTTCGGCGAGATCGTGGTTCTC +GGACTCGCGCGCAGCAAGGTACGTCTTGACCTCACGGGCGACCCGTGCCTCCATGAAGAA +CGACAGGAACGGGACGATTCCGCCCAGCGCGAGGGCGATGAACCGACCGAACCGCCACCG +CATCAGGCTCCAGATGCGGAAGCACGCGAAGAGGTACACGACGTAGAACCAGCCGTGGCC +GACGAGGATCGACAGCGACACATTGACGCCGTCGCCCGCCGACTCGAGGTCGCAGCCCAG +ACCCCCGGGCACGAAGAGCGAGTACCACTCGCATCCGGGCCCGACCAGCACCGGTGCGAA +CCAGAGGAAGCCACCGGACCCGCCGGCGAACAGTTCGACGTGCAGCGGCGAGTACTTGAG +GATCATCTCGGCCAGCAGCAGGAGCAGCATGACACCGGTGATGATCGAGGCGACCTGGTA +GAAGGTCAAGGCTCCGCGAATGGCCGGGAAAGACGACGGTTTCGGCTCACGGGGCATGGG 
+CCCATTCTAGTCGCCGGTTGCGGTCGCGCTTCCCGACGAGGATGCCGCGGCTGCGGCATC +CTCGAGCTCTTCGACTTCCTTCTCCCACGCATCCTTGGCGAGGCGGTACCAGAAATAGAA +GGCGAAGCCGGCGAAGACCACCCACTCGGCGGCGTAGAAGATGTTCAGCCAGTTGACTGT +GGACCCGGCATCCGGCGCGGGCGAGGCGATGTCCACAAGGCCCGCCGGCGCAGACTGCGA +GGCGATGTAGCTGCGATAGACGTCCAGGCCCGCGGTGTCGTGCCACTGCGACAGGAGCGC +CGCCGGCGACATCCGTGTCATCGTGAACGGCGGCTCGCCGCGCGGCGGCGGCACCGGACC +CTCGTCCGAGATCAACCGACCGACGACCGTCACGGACTCCCCCGCGACCGCAGTCTGCTC +GAGCGCCTCGGCGGCGGATTCGGCGACGGTGAGCGTCGGCGCCCAGCCGACGGCGACGGC +CACGGATGTCGGCGTCGCGGTGTCGGCGATACGCAGCTGACCGGTGACCCAGAAGCCTTC +GACGCCGTCGTTGAAGCGCGACGAAACGACAAGGAAATCCTCGGGAACCCACGTGCCCGT +CACCTCGACGCGCTGGCCCACGAGCGGCTCGGGAAGGTACTCGCCGGGCCCGGCGATCTC +GGCGAGCGGCCTGACCTCTTCGGTGGTCCCGGGCGGGAGTGGGTCGGTGTCGATAGCGCG +CGAGAGCTGCCACTGCCCGAGCCACGCGAACACCCCCGCTACGACGAGCGCGAGCAGCAG +GACGCCGATCCAGCGGGGTCGGAGCATGACCTCCCGCAGGGTCGGGGGAAAGACTGTCTG +GTCTGTCATCCGCCCGTATACGGCGCGACGACCACCTCGACGCGCTGGAACTCCTTGAGA +TCGGAGTACCCGGTCGTGGCCATCGACTTCTTCAGCGCCCCGATCAGGTTCGCGGTTCCG +TCAGCCACCGGAGCCGGACCGTAGAGCACGGATTCGAGGTTCGTCACCTGATCCACCTTC +ACGCGGCGACCGCGCGGGAGCTTCGAGTGGTGAGCCTCCGGCCCCCAGTGGTATCCACGA +CCGGGGGCGTCGGTTGCCCGCGAGAGAGCGACGCCGAGCATGACGGCATCCGCTCCCATC +GCGAGCGCCTTGACGATGTCGCCTGACGTTCCCACACCGCCATCGGCGATGACGTGGACG +TAGCGCCCGCCCGACTCGTCGAGGTAGTCGCGGCGCGCGCCGGCGACGTCGGCTACCGCC +GTGGCCATCGGGGCGTGGATGCCGAGAACCCCGCGCGTCGTCGAGGCTGCGCCCCCGCCG +AAGCCGACGAGCACGCCCGCGGCGCCCGTGCGCATGAGGTGCAGGGCTGCCGTGTAGGTC +GCAGCACCGCCGACGATGACAGGCACGTCGAGGTCGTAGATGAACTTCTTGAGGTTGAGG +GGCTCGTCGACGCTCGAGACGTGCTCGGCCGAAACCGTCGTGCCACGGATGACGAACAGG +TCCACACCCGCGGCGACCACGGTTTCGTACAGCTGCTGGGTGCGCTGCGGAGTCAAAGCA +CCGGCCACCGTGACTCCGGCGTCACGGATCTGCTGCAGTCGCTCACGGATGAGCTCGGGC +TTGATCGGCTCGGAGTAGAGCTCCTGCATCCGGCAGGTTGCCGTCGCCTCGTCGAGAGAC +GCGATCTCAGCCAGCAGCGGCTCGGGGTCGTCGTACCGGGTCCAGAGCCCCTCGAGATCG +AGGACACCGAGTCCGCCGAGCTGACCGAGCATGATCGCCGTCTGCGGGCTCACAACCGAA +TCCATCGGGGCGCCGAGCACCGGGATGTCGAACTGGAACGCGTCGATCGACCATGCGGTC +GAGACATCCTCGGGATTGCGGGTGCGGCGCGAGGGAACGACGGCGATGTCGTCGAACGAG 
+TACGCGCGGCGAGCCCGCTTGGCGCGGCCGATCTCGATC diff --git a/q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/sample1/mag3.fa b/q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/sample1/mag3.fa new file mode 100644 index 00000000..489e644b --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/sample1/mag3.fa @@ -0,0 +1,46 @@ +>k129_6525 +AAACTCTATCAAGCGTATACCAAAGTGAGTGGTGTATTGATCAGTCAGCTCATTATTGAA +TCGGACTTCTGTCTCCAATCGATATATTGATGGAGAAGAAGGGCTCCATAAAAGAGGATT +ATTAAGTTCTAAATTAACCTCTGCTGTCTCTTTATTTTCCGCTTCTACATTAAGCGAATG +ATCCATCTGCAGTAATGGTTTATCGCCTTCATTAGAAAATATTTTAAATACTACAGTAAC +TGCGCTCCAATGATTATACGTATTAAAGACCTCTGCGCGTAACAATAATTCTACTTTATT +TAGGTCTTGCTGATCATGGAAATGGCAGCGCACTTGAGTTCCGCAGTAGGGCACATAGAC +GCAATTAGTAGCTATTAGTCGAACATCGCGGTAGATGCCGCCGCCCTCATACGACCATAG +TTCAAATTCTCTGGCATCGCAGCGAACCGCGACCACGTTCGGCACATTCGCATCGCAAAG +CTCAGTGATATCGAGAGTGAAACTGGTGTAGCCTGATAGATGCCGCCCAGCTAAATGGCC +ATTTACCCATATTGTTGCATCTCGGTAAATGCCATCAAATTCAAGGTGAATACGCTGCTT +GCTAGCTTCTTTGGGAATTTCAAACGTTTTGCGATACCAGCCTACATCAGTCGGCAGTGA +GCCATGCACAGCATTCGCGGATGCCCGAAATTCGCCTTCAATTACGAAATCATGAGGTAG +GTTTATGTCGCGCCATGCTTCATCTGGATAGCCCAAGCGCGCGACCCCATGGTTTCCTGC +CTTTAACCACTCGGCTCGCTTAAAACGATTAGCATGAATGGCTTGATGGTTAGTGCTGTC +TAGCTCACCTCGATGAAACTTCCAACCTTGGTTAAATTCATAAGTGGTACGCATAGAATT +ACTGATGTCTTTTAAAAGATTCTACAAGTGGAGTCTATTAATTATTTGATAAGTTACTCT +GATTATTTTTAGAGATTTCTAATACAACTCCGCTGCACGTGCCGTAACGTCCGCCTTGGT +ATGCGCAAAACAGATGGGTGGGGACGCCTTCAGAGTTAATCAGTAACTGCGGTCGTTCGA +ATCGCCCTTCACGATCCAGTCCAGGCAATGTCTCGTCGAAGTAAGTTCCAGCATCTTTGT +AAGCAACCTGCGGATTTTGCCAAATAAGGCCATCGCTGGATTCCATATACAGCCCATACT +CGTGGTTATAAAAACCCATGTCACGCATGATTATTTTATATGGTGCGCAATCCTTCGGTT +CGTACCATGTGTACGCGTC +>k129_6531 +TCGGATTTGCCGAATGCTTTTTGTAAGGGCCTTCAATTGATTTGGCGATAGCGAGCCCGT +ATTTACGGTTGCCGTTAAATCGCCGCCCAGTGTCCCAATCCCAACCTTTATAGTAGAGCC +AATATTCGCCATTAGGATTTTGTAATAGCGATGGGTTGCTAACGACGGCATCGTCCCAAT +CGCCATCGCTGCCAACATTAATAACAGGTTCATCTCCAACGCGTCGCCAAGGTCCATTTA 
+TATTATCGGCAATGGCTAAACCGATGCGTTTGGTGTAGACTAATTGATTGAAGTATTTTT +CGTACTCTGCAGTAGATAAATTGGGTAGCTCATTTTGCTCGATATCTAGTTTCGAGCCAT +CTGCTCCCATGTAGAAAAGAGCATACTTGTCGCCGACCTTTTGCACAGTCGGATTGTGGA +TTGCCCATGAGTCCCAAGCATTTGCACCGCTGCCTTTTAGAACGACTCCTAAATCTTCGT +AGGGGCCTTCCGGAAGATCAGCAACCGCATGGGCCACTTCGCAGGCACTTACCCAACCAG +AAAATGTATACTCGTTTCGCCAACGTGAGTAAAAAACATGAATGCGCCCGTCGGGTCCGT +AGATAGGCGAACAGCACCAAACATGATAGCCTTCTACTTCAAGAATTCGCCCCAGTGGTT +TGAGTTTTTGCTCGAAGTTCGAAGTGCTTACTTCAGAGGTGATGGGACGTAGCTTCTGTA +AATTAATGAGCGACTTATTGCTAACTGTAGAGTCCATGAAAAAAAGGTAAACTTTATACG +AGTAATGTTATGCTCCTTAAAACTGTCAAGGTTTAGGCATTTTGCTGAGCATTATGGTGT +TTAATGGGCTTGAATCATAACAGGATTAAGCGACATTTAAATATTAATGATAAGAATTAG +TGATATAGCTAAAGAGTTAGGGCTTTCGAGGGTTACAGTCTCGGCTATTTTAAATGGACG +ACACCAGAAAATAGGTATTTCCGAAAAGACCGCGCAAAGGGTTCGTTCGAGTGCAAAGGC +TATGGGTTATCTACCCAATCAGAATGCATTGAGTATGAAGAGAGGTCGAAGCATGACTAT +TGGTATGCTGAGTAGTGCGCTATCGGAGGAGTGGGGTGCTAAAATTCTTGTTGGTGCATT +AAGTGCGATAAAGAACACGCCTTATTCACTGCGCGTTGAGTCAGTACAGGGAGCAGCAGA +AGAGCGCGGTGCCCTAGAGCGCCTCTTGGGGTCACGAATTGAAGGGTTGTTGTGCTGCAA +TATAAAT diff --git a/q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/sample2/mag1.fa b/q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/sample2/mag1.fa new file mode 100644 index 00000000..84f04b0b --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/sample2/mag1.fa @@ -0,0 +1,41 @@ +>k129_4684 +TGATACCGACGCGGCACTTGAGTGCGCGCTATCCTTCAAGGAAGCCACATGCGTTATTGT +TAAACATGCAAACCCCTGTGGTGTTGCAACTGGGTCATCATTACTTGAGGCATATGAGGG +AGCTTACGCTACAGACCCAACATCTGCTTTTGGTGGCATTATTGCTTTTAATCGAGAATT +AGATTCAAAAACAGCAAAGGCAGTTATTGATAGACAGTTTGTTGAGGTAATCATTGCCCC +ATCTATATCACCCGACAGCATTAAAATTATTGCAAAAAAAGATGGTATACGTTTATTAGA +AGCTGGTTCACGACAAGAAGACATCAAAACTCTTAACATGAAGCGAGTCAGCGGAGGTTT +ATTACTGCAAGACAATGATATTGGGATTATTGATCGTGGTGATATAAAAATTGTTTCAAA +CGAGGTAATTG +>k129_5618 +GTGCTAATCGCACCCTCATGAGCGACACCATTATTCTTTATTTTTGAGTCTTCAGCAAAA 
+ATAAATACAGTCAAGTCACAACTCTTAGCGTATTCAAATGCGCGTCTTAATACTTCTGTA +TTTTCAATAGCAACATCACCATTACTTACGCCGATGCAGCCTGCAGCTTTTAATAAAAAC +ATCTCTGTCAGTTCTTTTCCATTTAGTTCTTGTGTTAAAGCGCCAAGTGGAAAAATATTT +GCGCGGTTAGATTCACTAGCACGCCGATTTATAAATTCCACAATGGCCGGCGTATCAATA +GTTGGTTGTATATCAGGTGGGACACAAATTGATGTTATACCACTACGGTTTGCAGCTTTA +AGTTCATTTTTGATAGCTATGTTTTTTTCTGAACCAATTTCGCCAAGCCTACCGCAAATA +TCAACTAATCCAGGTAAAATTATTTTATCTTTTGCGTTTATATCCAGATCCGATTTAAAA +>k129_5631 +TCATGATGATCCAAAAGCAGTTGCGGAAGCATCTGGGATAATTACGCGGAGTGGATGTCG +CCGAATCGCAAGATTTACTTTTGATTATGCTATTAAAACAGGAAGAAAAAAAATTACAAT +AGTTCATAAAGCAAATATCCTAAAAGCTCTAACAGGTCTGTTTCTAGAAACAGCAATGGA +AATCGGCAAAGAGTATGGAAATAAACTGGAAATTGAAGAGCGAATTGTCGACAACACAGC +AATGCAATTAGTAATCGATCCAGCGCAATTTAATATAATACTAACAACAAATATGTTCGG +TGATATTCTCTCAGATGAGATTGCGGGTCTAATAGGCGGACTCGGGTTGGCGCCAGGGGG +GAATATTGGTGATGATATAGCAATTTTTGAAGCGGTACACGGAACGGCTCCTGATATTGC +TGGAAAAGGGATTGCGAATCCAACAGCACTTTACCTAGCTTCAGCAATGATGTTGGAACA +TATAAATCAAAATAATATGGCCAATAACCTAAGGAAAGCAATTAGAGAAACATTGAAGAA +TAAAAAAAATCGCACAATCGATCTAGGTGGCGAAGCATCCACAAAAGATTATATGTCATA +TGTTATCGATAATTTAAACTAGAAAACAAATGAATGCACTTATACTCTTAGCACATGGAA +GTCGAAGAAGTGAATCTAACCTCGAAGTAGAGAGTTTATCAAATGAAATTTATGCGCTGA +TTAGCAACAAATT +>k129_2817 +GTCGCCAATTAGCAACTATGATGTCTTCTGGAGTACCTTTGGTCCAATCATTTGAAATCA +CAGGTCGTGGCCATGACAACCCAGGAATGCAGAGCCTAATTTTAGCCATCAAGGCTGATG +TTGAATCTGGAAATAGTTTGGTTGATGCCCTTAGAAAACATCCATTACATTTTAACTCGC +TTTATTGCAATTTAATTGAAGCTGGTGAACACGCCGGTATTTTAGAGGCAATTTTACACA +AATTAGCAACATACTTAGAAAAGACAGAAGCTCTGAAATCAAAAATAAAATCGGCTTTAT +TTTATCCAATGGCAGTTATTGTCGCAGCAATTATTGTGGTAACAATTCTGATGATATTTG +TAATACCTCAATTTTCTGAATTATTTGGAAGTTTTGGTGCTGACTTACCGGGTTTGACAC +AATTTTTAATAGATGCATCAGATTTCTTTGTTAGCCACTGGTGGAAATTATTTGGGTTAT diff --git a/q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/sample2/mag2.fa b/q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/sample2/mag2.fa new file mode 100644 index 00000000..4a6f71b8 --- /dev/null +++ 
b/q2_types/per_sample_sequences/tests/data/mags/mags-fa-with-manifest/sample2/mag2.fa @@ -0,0 +1,42 @@ +>k129_5401 +CCATTGTATGTCTTTAGGTAGCTCCTCATGTTTGAGGTTCATGTCTTGGATTTTGTTTTC +TCCAAAAATCCTTTGTCCTGCATCATAAGCTTGCATTACTTCCTCATTGGATTTAGTTTT +AGAAACAGCCACCAGTTGGACTGATGGGGGTATCTCCTTGATTATCCTAGAAATATTTTC +TGCTATATTCATAATACAAACTTACAATTTTCACAGAGTATTTTTTAAAGAATGAATTGA +AATTGAAGTTGAATTAAAGCATTTAAAATTTACAACATTCCATGATTTGATGTGCAAGTT +TCAAAGCACGGGTTCCGCTTTCCAGAGGAACAATAATTGGATCATTATTTTCGATGGCAT +CAGCAAAAACCTCTAACTCTTCCAGTATGGCGTTGGAATCTTCAATTCGTGGATTTTCAA +AATAGATTTGTTTTTTTTCTCCCTGTGCATTTTCAAGAATCATGGCAAAGTCTTTCGGTT +TTTTTGGAGCTTTTTTCATTTTTACTACCTCGACTTTTTTCTCTAGAAAATCTACTGCGA +TATAAGTGTTTTTCTGGAAAAAGCGCGTTTTACGCATTTTTTTTAGTGAAATCCTGCTTG +CGGTAAGGTTGGCAACACAGCCGTTTTCAAACTCAATTCTTGCGTTAGTTATATCGGGGG +TGGAGCTTATTACCGCCACCCCAGAAGCGGATACGGATTTTACTTTGGCATCGACAACAC +TCAACAATACATCAATATCGTGAATCATCAAATCTAAAACCACTGGAACATCGGTTCCTC +TTGGATTGAATTCTGCCAAACGATGGGACTCTATAAATTTTGGGTGGGTAATCGATGATT +TTACCGCCTTAAAAGCCGGATTAAAACGTTCTACGTGTCCCACCTGACCAAGAACCATTT +TTTTTGTAGCCATTTCGGTAATTTCAAGGGCTTCGGTAACATTATTGGCAATAGGCTTTT +CTATAAATATATGCTTTCCTTTTTGGATTGCATTTATGGCATTTTTATGGTGGAAAAATG +TAGGAGTGACAATATCTACCATATCGCAAGCTTGGATCAAATCGGCTTCACTTTTAAAAG +CGGTATAGCCATTTTCTTTAGCTAAGGCCTTGGTGTTTTTTTTATCCTGATCATAAAACC +CCACTAATTGGTATCGTTTAGAGGCCTCTAGTAAACGCAGATGAATTTTACCCAAATGTC +CTGCACCCAAGACGCCGACTTTTATCATAGCACTTAATTTTTAATCAAAAATACCATCTT +TTTCTGATTTTTTTTTGGAAGTAATTACATTTGTCCTCATGATTGATTCAACCAAGCATC +AAGGACAAAGAAGACAGTTGGTAAAATTGTTAGAGGAGAAAGGAGTCTACGACAAAAGGG +TTTTGAATGCTGTTGGAAGTGTTCCCCGTCATCTGTTTATGGATTCGGGTTTGGAGGAGT +ATTCCTATATTGACAAAGCCTATCCCATTGCGGCTAATCAGACCATATCACAGCCTTACA +CCGTAGCTTTTCAGACCCAATTGCTGGAACTTCAGAAAGGGGATCGAGTTTTGGAAATCG +GAACGGGTTCGGGCTATCAAACAGCTATTTTAATCGCCCTTGAAGGTCTAAAAGTGTATA +CCATTGAACGCCAACTGGAGTTATATAAAACAACTGTTTTGTTATTTAAAAAGTTGGGGT +TAAATCCCAAAAAAGTGATATTTGGTGATGGTTACCAAGGTTTACCAGATCAAGCACCTT +TTGATGCCATCATCGTTACTGCAGGTGCGCCTCAAGTACCCAAACCTTTGTTGGAACAAT 
+TGACCATTGGAGGGAGACTCGTAATCCCTGTGGGAGAGAAAGACCAAGTCATGACCCGAT
+ATATGCGAACAGGGGAAAAGACCTTTGATCGACAAACCTTTGGGAATTTCAGATTTGTCC
+CTTTGCTAAAGGATGAGAGATAGAGCTTGTTAAGTACTTCGTGAATATCGGATTTTCCTT
+ACTGAATTTATAGCTCTTGACAATATCAATTGTTTGAAGATGGAAGGTGAAGTATACTTC
+AGGCTTCGTAGCTGATAAGAATATTCACTCTTCGATTTTATAAATTTTGTTAAAAAATTG
+CTCTACGTCGGTAGTTTTTTGGATTGATTAGAGCAGGTTTACTGTGTGTTGTAAAATTTT
+TATAATCTTTAATTTGAGGTTGTTCACTATAATTTGGTGAGAAAAACTATTTATTGAAAT
+TTTTTTTAATCCTATCTAAATCCCTTTTATTGTCTAAATCTTTAAGGGCTTCTCTTTTAT
+CGTAAAGTTTTTTCCCCCGAGCTAAAGCGATCAACATTTTGGCAAAACCTTTTTCGTTGA
+TGAAAAGCTTAAGTGGTACAATGGTCAAGCCGGAATTTTTCACCTGTTTGAAAAGTTTGT
+TTAATTCTCTTTTTTTAA
diff --git a/q2_types/per_sample_sequences/tests/test_format.py b/q2_types/per_sample_sequences/tests/test_format.py
index 01ebe0ca..d8e2f23d 100644
--- a/q2_types/per_sample_sequences/tests/test_format.py
+++ b/q2_types/per_sample_sequences/tests/test_format.py
@@ -678,6 +678,39 @@ def test_multifasta_dirfmt_unorganized(self):
                 ValidationError, 'should be .* per-sample directories'):
             format.validate()
 
+    def test_multifasta_dirfmt_sample_dict(self):
+        filepath = self.get_data_path('mags/mags-fasta')
+        shutil.copytree(filepath, self.temp_dir.name, dirs_exist_ok=True)
+        multifasta = MultiFASTADirectoryFormat(self.temp_dir.name, mode='r')
+
+        obs = multifasta.sample_dict()
+        exp = {
+            'sample1': {
+                'mag1': str(Path(multifasta.path / 'sample1/mag1.fasta')),
+                'mag2': str(Path(multifasta.path / 'sample1/mag2.fasta')),
+                'mag3': str(Path(multifasta.path / 'sample1/mag3.fasta'))
+            },
+            'sample2': {
+                'mag1': str(Path(multifasta.path / 'sample2/mag1.fasta')),
+                'mag2': str(Path(multifasta.path / 'sample2/mag2.fasta'))
+            },
+        }
+        self.assertDictEqual(obs, exp)
+
+        obs = multifasta.sample_dict(relative=True)
+        exp = {
+            'sample1': {
+                'mag1': 'sample1/mag1.fasta',
+                'mag2': 'sample1/mag2.fasta',
+                'mag3': 'sample1/mag3.fasta'
+            },
+            'sample2': {
+                'mag1': 'sample2/mag1.fasta',
+                'mag2': 'sample2/mag2.fasta'
+            },
+        }
+        self.assertDictEqual(obs, exp)
+
     def test_multibowtie_index_dirfmt(self):
         dirpath = self.get_data_path('bowtie/index-valid')
         format = MultiBowtie2IndexDirFmt(dirpath, mode='r')
@@ -708,7 +741,7 @@ def test_contig_seqs_dirfmt_sample_dict(self):
             'sample2': str(Path(contigs.path / 'sample2_contigs.fa')),
             'sample3': str(Path(contigs.path / 'sample3_contigs.fa'))
         }
-        self.assertEqual(obs, exp)
+        self.assertDictEqual(obs, exp)
 
         obs = contigs.sample_dict(relative=True)
         exp = {
@@ -716,7 +749,7 @@ def test_contig_seqs_dirfmt_sample_dict(self):
             'sample2': 'sample2_contigs.fa',
             'sample3': 'sample3_contigs.fa'
         }
-        self.assertEqual(obs, exp)
+        self.assertDictEqual(obs, exp)
 
     @patch('subprocess.run', return_value=Mock(returncode=0))
     def test_bam_dirmt(self, p):
diff --git a/q2_types/per_sample_sequences/tests/test_transformer.py b/q2_types/per_sample_sequences/tests/test_transformer.py
index e4e146a7..fe335fe5 100644
--- a/q2_types/per_sample_sequences/tests/test_transformer.py
+++ b/q2_types/per_sample_sequences/tests/test_transformer.py
@@ -1259,6 +1259,23 @@ def test_mag_manifest_to_df(self):
 
         assert_frame_equal(exp, obs)
 
+    def test_mag_seqs_dirfmt_to_multifile_dirfmt(self):
+        obs = self.apply_transformation(
+            MultiMAGSequencesDirFmt,
+            MultiFASTADirectoryFormat,
+            'mags/mags-fa-with-manifest'
+        )
+        obs.validate()
+
+        # a valid layout alone does not prove that every MAG was carried
+        # over, so compare the sample -> MAG ids (extension-agnostic)
+        obs_ids = {s: list(mags) for s, mags in obs.sample_dict().items()}
+        exp_ids = {
+            'sample1': ['mag1', 'mag2', 'mag3'],
+            'sample2': ['mag1', 'mag2']
+        }
+        self.assertDictEqual(obs_ids, exp_ids)
+
 
 if __name__ == '__main__':
     unittest.main()