From 5848c5959e1b99c81af5b5153549456757389e27 Mon Sep 17 00:00:00 2001 From: David Edwards Date: Mon, 4 May 2020 10:08:41 +1000 Subject: [PATCH 1/2] Fastml fix --- README.md | 4 ++-- scripts/snppar.py | 10 +++++----- setup.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 45addc4..9cd8166 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ SNPPar is designed to find homoplasic SNPs based on a user-defined phylogenetic By default, SNPPar uses TreeTime for ancestral state reconstruction (ASR), but using FastML for ASR is also available if FastML is installed (though much, much slower) -Current Version: V0.4dev +Current Version: V0.4.1dev # Home: @@ -88,7 +88,7 @@ Note: If any gene is split in the reference (including across the origin of the [-t TREE] [-g GENBANK] [-E SORTING] [-M MUTATION_EVENTS] [-d DIRECTORY] [-p PREFIX] [-P] [-S] [-C] [-R] [-A] [-a] [-n] [-e] [-u] [-f] [-x FASTML_EXECUTE] - SNPPar: Parallel/homoplasic SNP Finder V0.4dev + SNPPar: Parallel/homoplasic SNP Finder V0.4.1dev optional arguments: -h, --help show this help message and exit -s SNPTABLE, --snptable SNPTABLE diff --git a/scripts/snppar.py b/scripts/snppar.py index 0479dbb..b448b85 100644 --- a/scripts/snppar.py +++ b/scripts/snppar.py @@ -23,8 +23,8 @@ # added parsing for previous results # fix for fastml_execute # simplified and intermediate and complex sorting for TreeTime -# To add: missingness report - highest SNP, isolate, overall missingness -# mapping using tree and snp table only (i.e. no reference) +# further fixing (and testing) of fastml_execute +# To add: mapping using tree and snp table only (i.e. no reference) # import os,sys,subprocess,string,re,random,collections,operator,argparse @@ -42,7 +42,7 @@ from datetime import datetime # Constants declaration -version = 'V0.4dev' +version = 'V0.4.1dev' genefeatures = 'CDS' excludefeatures = 'gene,misc_feature,repeat_region,mobile_element' nt = ['A','C','G','T'] @@ -86,9 +86,9 @@ def executeCommand(command, log): message = 'Failed to run command: ' + command logPrint(log, message, "CRITICAL") message = 'stdout: ' + result.stdout - log(log, message, "CRITICAL") + logPrint(log, message, "CRITICAL") message = 'stderr: ' + result.stderr - log(log, message, "CRITICAL") + logPrint(log, message, "CRITICAL") sys.exit(1) else: message = 'stdout: ' + result.stdout diff --git a/setup.py b/setup.py index 4d31af6..01874af 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ setup( name='snppar', - version='0.4dev', + version='0.4.1dev', author='David Edwards', author_email='David.Edwards@monash.edu', packages=['snppar'], From 13c5b42572208177d8cc6c2437df2324a76b0d3f Mon Sep 17 00:00:00 2001 From: David Edwards Date: Tue, 5 May 2020 15:06:59 +1000 Subject: [PATCH 2/2] rerun tests --- .../2019-10-22_11-03-25.938748_log.txt | 80 ------------------ .../2020-05-05_15-05-54.966462_log.txt | 81 +++++++++++++++++++ 2 files changed, 81 insertions(+), 80 deletions(-) delete mode 100644 test_data/test_outputs/2019-10-22_11-03-25.938748_log.txt create mode 100644 test_data/test_outputs/2020-05-05_15-05-54.966462_log.txt diff --git a/test_data/test_outputs/2019-10-22_11-03-25.938748_log.txt b/test_data/test_outputs/2019-10-22_11-03-25.938748_log.txt deleted file mode 100644 index 099a206..0000000 --- a/test_data/test_outputs/2019-10-22_11-03-25.938748_log.txt +++ /dev/null @@ -1,80 +0,0 @@ -2019-10-22_11-03-25.938748: INFO : Log started: (/Users/david/Desktop/simulated_data/git_test_outputs/2019-10-22_11-03-25.938748_log.txt) -2019-10-22_11:03:25.940101: INFO : SNPPar: Parallel SNP Finder V0.1.3dev - (utilising TreeTime for ASR) -2019-10-22_11:03:25.940946: INFO : User command: snppar -s MTB_Global_L2_alleles.csv -t MTB_Global_L2.tre -g NC_00962_3_1.gbk -d /Users/david/Desktop/simulated_data/git_test_outputs -2019-10-22_11:03:25.941473: INFO : Reading SNP table from MTB_Global_L2_alleles.csv -2019-10-22_11:03:26.128678: INFO : Finished reading 4401 SNPs in total -...keeping 4401 variable SNPs and ignoring 0 SNPs -that are non-variable among the 94 isolates -2019-10-22_11:03:26.129186: INFO : Reading Genbank file from NC_00962_3_1.gbk -2019-10-22_11:03:26.825814: INFO : Found 3906 genes -2019-10-22_11:03:26.826247: INFO : Reading tree from MTB_Global_L2.tre... -2019-10-22_11:03:26.832211: INFO : Tree and SNP table have same isolates -2019-10-22_11:03:26.832556: INFO : Parsing SNPs to find bi-, tri- and quadallelic SNPs... -Also testing if biallelic SNPs are homoplasic -2019-10-22_11:03:28.055164: INFO : Biallelic SNPs (>1 one isolate): 1440 -Biallelic SNP patterns tested: 51 -Paraphyletic SNPs found: 2 -Triallelic SNPs found: 0 -Quadallelic SNPs found: 0 - -Total SNPs for mapping: 2 -2019-10-22_11:03:28.056322: INFO : Assigning monophyletic SNPs to genes -2019-10-22_11:03:28.075707: INFO : Of the 4399 monophyletic SNPs for testing, 489 are intergenic, and 3922 are intragenic -2019-10-22_11:03:28.076184: INFO : 12 SNPs occur in overlapping genes -2019-10-22_11:03:28.092666: INFO : Assigning SNPs to map to genes -2019-10-22_11:03:28.093103: INFO : Of the 2 SNPs to map for testing, 0 are intergenic, and 2 are intragenic -2019-10-22_11:03:28.094766: INFO : Running TreeTime: treetime ancestral --aln /Users/david/Desktop/simulated_data/git_test_outputs/snps_to_map.mfasta --tree MTB_Global_L2.tre --outdir /Users/david/Desktop/simulated_data/git_test_outputs/treetime_out/ --report-ambiguous --verbose 2 -2019-10-22_11:03:30.736273: INFO : stdout: -0.00 -TreeAnc: set-up - -Inferred sequence evolution model (saved as /Users/david/Desktop/simulated_data/git_test_outputs/treetime_out//sequence_evolution_model.txt): -Substitution rate (mu): 1.0 - -Equilibrium frequencies (pi_i): - A: 0.2525 - C: 0.2549 - G: 0.2413 - T: 0.2392 - -: 0.0121 - -Symmetrized rates from j->i (W_ij): - A C G T - - A 0 1.2405 1.4918 1.2434 1.2436 - C 1.2405 0 1.2429 1.4918 1.2434 - G 1.4918 1.2429 0 1.2455 1.2456 - T 1.2434 1.4918 1.2455 0 1.2458 - - 1.2436 1.2434 1.2456 1.2458 0 - -Actual rates from j->i (Q_ij): - A C G T - - A 0 0.3132 0.3767 0.3139 0.314 - C 0.3162 0 0.3168 0.3803 0.317 - G 0.3599 0.2999 0 0.3005 0.3005 - T 0.2974 0.3568 0.2979 0 0.298 - - 0.015 0.015 0.0151 0.0151 0 - - ---- alignment including ancestral nodes saved as - /Users/david/Desktop/simulated_data/git_test_outputs/treetime_out/ancestral_sequences.fasta - ---- tree saved in nexus format as - /Users/david/Desktop/simulated_data/git_test_outputs/treetime_out/annotated_tree.nexus - -2019-10-22_11:03:30.736937: INFO : Extracting mutation events from ASR results... -2019-10-22_11:03:30.740533: INFO : Extracting internal node sequences from ASR results... -2019-10-22_11:03:30.742402: INFO : Processing mapped mutation events... -2019-10-22_11:03:30.742766: INFO : Processing monophyletic mutation events... -2019-10-22_11:03:32.574716: INFO : Combining monophyletic and mapped mutation events -Also combining monophyletic and mapped node sequences... -2019-10-22_11:03:32.580576: INFO : Writing node sequences to /Users/david/Desktop/simulated_data/git_test_outputs/node_sequences.fasta -2019-10-22_11:03:32.718423: INFO : Assigning coding consequenses to the mutation events... -2019-10-22_11:03:33.979677: INFO : 2855 non-synonymous changes, 1071 synonymous changes, and 489 SNP events in non-coding regions -2019-10-22_11:03:33.980294: INFO : Parsing all mutation events... -2019-10-22_11:03:33.983855: INFO : Found 4 mutation events that are homoplasic -2019-10-22_11:03:33.984414: INFO : Writing all calls at homoplasic event positions to /Users/david/Desktop/simulated_data/git_test_outputs/homoplasic_events_all_calls.tsv -2019-10-22_11:03:33.990348: INFO : Writing all mutation events to /Users/david/Desktop/simulated_data/git_test_outputs/all_mutation_events.tsv -2019-10-22_11:03:34.242071: INFO : Mapping mutation events to NEXUS tree: /Users/david/Desktop/simulated_data/git_test_outputs/node_labelled_nexus.tre -2019-10-22_11:03:34.248015: INFO : Also mapping mutation events to Newick NHX tree: /Users/david/Desktop/simulated_data/git_test_outputs/node_labelled_newick.tre -2019-10-22_11:03:34.252754: INFO : Cleaning up intermediate files... -2019-10-22_11:03:34.269320: INFO : ...Finished diff --git a/test_data/test_outputs/2020-05-05_15-05-54.966462_log.txt b/test_data/test_outputs/2020-05-05_15-05-54.966462_log.txt new file mode 100644 index 0000000..ef24cc4 --- /dev/null +++ b/test_data/test_outputs/2020-05-05_15-05-54.966462_log.txt @@ -0,0 +1,81 @@ +2020-05-05_15-05-54.966462: INFO : Log started: (test_outputs/2020-05-05_15-05-54.966462_log.txt) +2020-05-05_15:05:54.967995: INFO : SNPPar: Parallel SNP Finder V0.4.1dev + (utilising TreeTime for ASR and intermediate sorting) +2020-05-05_15:05:54.968403: INFO : User command: snppar -s MTB_Global_L2_alleles.csv -t MTB_Global_L2.tre -g NC_00962_3_1.gbk -d test_outputs +2020-05-05_15:05:54.968667: INFO : Reading SNP table from MTB_Global_L2_alleles.csv +2020-05-05_15:05:55.132750: INFO : Finished reading 4401 SNPs in total +...keeping 4401 variable SNPs and ignoring 0 SNPs +that are non-variable among the 94 isolates +2020-05-05_15:05:55.133277: INFO : Reading Genbank file from NC_00962_3_1.gbk +2020-05-05_15:05:56.023196: INFO : Found 3906 genes +2020-05-05_15:05:56.023646: INFO : Reading tree from MTB_Global_L2.tre... +2020-05-05_15:05:56.032251: INFO : Tree and SNP table have same isolates +2020-05-05_15:05:56.032629: INFO : Parsing SNPs to find bi-, tri- and quadallelic SNPs... +Also testing if biallelic SNPs are homoplasic +2020-05-05_15:05:57.227237: INFO : Biallelic SNPs (>1 one isolate): 1440 +Biallelic SNPs (>1 one isolate, no missing calls): 1440 +Biallelic SNP patterns tested (no missing calls): 51 +Paraphyletic SNPs found: 2 +Triallelic SNPs found: 0 +Quadallelic SNPs found: 0 + +Total SNPs for mapping: 2 +2020-05-05_15:05:57.228455: INFO : Assigning monophyletic SNPs to genes +2020-05-05_15:05:57.247711: INFO : Of the 4399 monophyletic SNPs for testing, 489 are intergenic, and 3922 are intragenic +2020-05-05_15:05:57.248047: INFO : 12 SNPs occur in overlapping genes +2020-05-05_15:05:57.266262: INFO : Assigning SNPs to map to genes +2020-05-05_15:05:57.266846: INFO : Of the 2 SNPs to map for testing, 0 are intergenic, and 2 are intragenic +2020-05-05_15:05:57.268311: INFO : Running TreeTime: treetime ancestral --aln test_outputs/snps_to_map.mfasta --tree MTB_Global_L2.tre --outdir test_outputs/treetime_out/ --report-ambiguous --verbose 2 +2020-05-05_15:06:00.089242: INFO : stdout: +0.00 -TreeAnc: set-up + +Inferred sequence evolution model (saved as test_outputs/treetime_out//sequence_evolution_model.txt): +Substitution rate (mu): 1.0 + +Equilibrium frequencies (pi_i): + A: 0.2525 + C: 0.2549 + G: 0.2413 + T: 0.2392 + -: 0.0121 + +Symmetrized rates from j->i (W_ij): + A C G T - + A 0 1.2405 1.4918 1.2434 1.2436 + C 1.2405 0 1.2429 1.4918 1.2434 + G 1.4918 1.2429 0 1.2455 1.2456 + T 1.2434 1.4918 1.2455 0 1.2458 + - 1.2436 1.2434 1.2456 1.2458 0 + +Actual rates from j->i (Q_ij): + A C G T - + A 0 0.3132 0.3767 0.3139 0.314 + C 0.3162 0 0.3168 0.3803 0.317 + G 0.3599 0.2999 0 0.3005 0.3005 + T 0.2974 0.3568 0.2979 0 0.298 + - 0.015 0.015 0.0151 0.0151 0 + + +--- alignment including ancestral nodes saved as + test_outputs/treetime_out/ancestral_sequences.fasta + +--- tree saved in nexus format as + test_outputs/treetime_out/annotated_tree.nexus + +2020-05-05_15:06:00.090033: INFO : Extracting mutation events from ASR results... +2020-05-05_15:06:00.093463: INFO : Extracting internal node sequences from ASR results... +2020-05-05_15:06:00.095357: INFO : Processing mapped mutation events... +2020-05-05_15:06:00.095687: INFO : Processing monophyletic mutation events... +2020-05-05_15:06:01.889402: INFO : Combining monophyletic and mapped mutation events +Also combining monophyletic and mapped node sequences... +2020-05-05_15:06:01.897686: INFO : Writing node sequences to test_outputs/node_sequences.fasta +2020-05-05_15:06:02.036351: INFO : Assigning coding consequenses to the mutation events... +2020-05-05_15:06:03.249957: INFO : 2855 non-synonymous changes, 1071 synonymous changes, and 489 SNP events in non-coding regions +2020-05-05_15:06:03.250475: INFO : Parsing all mutation events... +2020-05-05_15:06:03.253026: INFO : Found 4 mutation events that are homoplasic +2020-05-05_15:06:03.253422: INFO : Writing all calls at homoplasic event positions to test_outputs/homoplasic_events_all_calls.tsv +2020-05-05_15:06:03.258145: INFO : Writing all mutation events to test_outputs/all_mutation_events.tsv +2020-05-05_15:06:03.479074: INFO : Mapping mutation events to NEXUS tree: test_outputs/node_labelled_nexus.tre +2020-05-05_15:06:03.484283: INFO : Also mapping mutation events to Newick NHX tree: test_outputs/node_labelled_newick.tre +2020-05-05_15:06:03.487913: INFO : Cleaning up intermediate files... +2020-05-05_15:06:03.502712: INFO : ...Finished