From b2135ab97ea7cf55a0fa65f4476ca22d11b0babe Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Mon, 17 Jun 2024 11:21:05 +0100 Subject: [PATCH] Calculate recombination length including gaps --- src/branch_sequences.c | 42 +++++++++++++++++++++++++----------------- src/parse_phylip.c | 25 ++++++++++++++++++------- src/parse_phylip.h | 6 ++++-- src/tree_statistics.c | 22 ++++++++++++---------- 4 files changed, 59 insertions(+), 36 deletions(-) diff --git a/src/branch_sequences.c b/src/branch_sequences.c index 74864e7a..b89a6552 100644 --- a/src/branch_sequences.c +++ b/src/branch_sequences.c @@ -123,8 +123,10 @@ void fill_in_recombinations_with_gaps(newick_node ** nodeArray, int node_index, // Set root-specific values if (parent_node_index == -1) { - set_number_of_bases_in_recombinations(sequence_index,0); - set_number_of_branch_bases_in_recombinations(sequence_index,0); + for (int include_gaps = 0; include_gaps <= 1; ++include_gaps) { + set_number_of_bases_in_recombinations(sequence_index,0,include_gaps); + set_number_of_branch_bases_in_recombinations(sequence_index,0,include_gaps); + } set_internal_node(1,sequence_index); set_genome_length_excluding_blocks_and_gaps_for_sample(sequence_index, length_of_original_genome); @@ -172,25 +174,31 @@ void fill_in_recombinations_with_gaps(newick_node ** nodeArray, int node_index, // Set number of branch bases in recombination by iterating through // the first part of merged blocks (i.e. only blocks on the branch to this node) - set_number_of_branch_bases_in_recombinations(sequence_index, - calculate_number_of_bases_in_recombinations(merged_block_coordinates, - node->number_of_blocks, - node_sequence, - snp_locations, - number_of_snps, - 0) - ); + for (int include_gaps = 0; include_gaps <= 1; ++include_gaps) { + set_number_of_branch_bases_in_recombinations(sequence_index, + calculate_number_of_bases_in_recombinations(merged_block_coordinates, + node->number_of_blocks, + node_sequence, + snp_locations, + number_of_snps, + include_gaps), + include_gaps + ); + } // Set number of total bases in recombination by iterating through // all merged blocks leading to this node - set_number_of_bases_in_recombinations(sequence_index, - calculate_number_of_bases_in_recombinations(merged_block_coordinates, - (num_blocks[parent_node_index] + node->number_of_blocks), - node_sequence, - snp_locations, - number_of_snps, - 0) + for (int include_gaps = 0; include_gaps <= 1; ++include_gaps) { + set_number_of_bases_in_recombinations(sequence_index, + calculate_number_of_bases_in_recombinations(merged_block_coordinates, + (num_blocks[parent_node_index] + node->number_of_blocks), + node_sequence, + snp_locations, + number_of_snps, + include_gaps), + include_gaps ); + } free(node_sequence); for(i = 0; i < num_recombinations[node_index]; i++) diff --git a/src/parse_phylip.c b/src/parse_phylip.c index af283197..651f3960 100644 --- a/src/parse_phylip.c +++ b/src/parse_phylip.c @@ -295,23 +295,34 @@ void set_genome_length_excluding_blocks_and_gaps_for_sample(int sample_index, in ((sample_statistics *) statistics_for_samples[sample_index])->genome_length_excluding_blocks_and_gaps = genome_length_excluding_blocks_and_gaps; } -void set_number_of_branch_bases_in_recombinations(int sample_index, int bases_in_recombinations) +void set_number_of_branch_bases_in_recombinations(int sample_index, int bases_in_recombinations, int include_gaps) { if( sample_index == -1) { return; } - ((sample_statistics *) statistics_for_samples[sample_index])->branch_bases_in_recombinations = bases_in_recombinations; + + if (include_gaps > 0) { + ((sample_statistics *) statistics_for_samples[sample_index])->branch_bases_in_recombinations_including_gaps = bases_in_recombinations; + } else { + ((sample_statistics *) statistics_for_samples[sample_index])->branch_bases_in_recombinations = bases_in_recombinations; + } } -void set_number_of_bases_in_recombinations(int sample_index, int bases_in_recombinations) +void set_number_of_bases_in_recombinations(int sample_index, int bases_in_recombinations, int include_gaps) { - if( sample_index == -1) - { - return; - } + if( sample_index == -1) + { + return; + } + + if (include_gaps > 0) { + ((sample_statistics *) statistics_for_samples[sample_index])->bases_in_recombinations_including_gaps = bases_in_recombinations; + } else { ((sample_statistics *) statistics_for_samples[sample_index])->bases_in_recombinations = bases_in_recombinations; + } + } diff --git a/src/parse_phylip.h b/src/parse_phylip.h index c5f6ed1b..f1d489a0 100644 --- a/src/parse_phylip.h +++ b/src/parse_phylip.h @@ -27,6 +27,8 @@ int number_of_snps; int genome_length_without_gaps; int number_of_blocks; + int bases_in_recombinations_including_gaps; + int branch_bases_in_recombinations_including_gaps; int bases_in_recombinations; int branch_bases_in_recombinations; int genome_length_excluding_blocks_and_gaps; @@ -54,8 +56,8 @@ int get_internal_node(int sequence_index); void fill_in_unambiguous_bases_in_parent_from_children_where_parent_has_a_gap(int parent_sequence_index, int * child_sequence_indices, int num_children); void fill_in_unambiguous_gaps_in_parent_from_children(int parent_sequence_index, int * child_sequence_indices, int num_children); void freeup_memory(void); -void set_number_of_branch_bases_in_recombinations(int sample_index, int bases_in_recombinations); -void set_number_of_bases_in_recombinations(int sample_index, int bases_in_recombinations); +void set_number_of_branch_bases_in_recombinations(int sample_index, int bases_in_recombinations, int include_gaps); +void set_number_of_bases_in_recombinations(int sample_index, int bases_in_recombinations, int include_gaps); void filter_sequence_bases_and_rotate(char * reference_bases, char ** filtered_bases_for_snps, int number_of_filtered_snps); void set_genome_length_excluding_blocks_and_gaps_for_sample(int sample_index, int genome_length_excluding_blocks_and_gaps); diff --git a/src/tree_statistics.c b/src/tree_statistics.c index bf331e33..792b8018 100644 --- a/src/tree_statistics.c +++ b/src/tree_statistics.c @@ -35,25 +35,27 @@ void create_tree_statistics_file(char filename[], sample_statistics ** statistic char extension[7] = {".stats"}; concat_strings_created_with_malloc(base_filename,extension); file_pointer = fopen(base_filename, "w"); - fprintf( file_pointer, "Node\tTotal SNPs\tNumber of SNPs Inside Recombinations\tNumber of SNPs Outside Recombinations\tNumber of Recombination Blocks\tBases in Recombinations\tCumulative Bases in Recombinations\tr/m\trho/theta\tGenome Length\tBases in Clonal Frame\n"); + fprintf( file_pointer, "Node\tTotal SNPs\tNumber of SNPs Inside Recombinations\tNumber of SNPs Outside Recombinations\tNumber of Recombination Blocks\tBases in Recombinations Including Gaps\tCumulative Bases in Recombinations Including Gaps\tBases in Recombinations Excluding Gaps\tCumulative Bases in Recombinations Excluding Gaps\tr/m\trho/theta\tGenome Length\tBases in Clonal Frame\n"); for(sample_counter=0; sample_counter< number_of_samples; sample_counter++) { sample_statistics * sample_details = ((sample_statistics *) statistics_for_samples[sample_counter]); fprintf( file_pointer, "%s\t", sample_details->sample_name); - fprintf( file_pointer, "%i\t", (sample_details->number_of_recombinations + sample_details->number_of_snps)); - fprintf( file_pointer, "%i\t", sample_details->number_of_recombinations); - fprintf( file_pointer, "%i\t", (sample_details->number_of_snps)); - fprintf( file_pointer, "%i\t", sample_details->number_of_blocks); - fprintf( file_pointer, "%i\t", sample_details->branch_bases_in_recombinations); - fprintf( file_pointer, "%i\t", sample_details->bases_in_recombinations); - fprintf( file_pointer, "%f\t", recombination_to_mutation_ratio(sample_details->number_of_recombinations, (sample_details->number_of_snps))); + fprintf( file_pointer, "%i\t", (sample_details->number_of_recombinations + sample_details->number_of_snps)); + fprintf( file_pointer, "%i\t", sample_details->number_of_recombinations); + fprintf( file_pointer, "%i\t", (sample_details->number_of_snps)); + fprintf( file_pointer, "%i\t", sample_details->number_of_blocks); + fprintf( file_pointer, "%i\t", sample_details->branch_bases_in_recombinations_including_gaps); + fprintf( file_pointer, "%i\t", sample_details->bases_in_recombinations_including_gaps); + fprintf( file_pointer, "%i\t", sample_details->branch_bases_in_recombinations); + fprintf( file_pointer, "%i\t", sample_details->bases_in_recombinations); + fprintf( file_pointer, "%f\t", recombination_to_mutation_ratio(sample_details->number_of_recombinations, (sample_details->number_of_snps))); fprintf( file_pointer, "%f\t", rho_theta(sample_details->number_of_blocks,sample_details->number_of_snps)); - fprintf( file_pointer, "%i\t", sample_details->genome_length_without_gaps); + fprintf( file_pointer, "%i\t", sample_details->genome_length_without_gaps); fprintf( file_pointer, "%i", sample_details->genome_length_excluding_blocks_and_gaps); fprintf( file_pointer, "\n"); - free(sample_details->sample_name); + free(sample_details->sample_name); free(sample_details); }