diff --git a/algorithms.py b/algorithms.py index f39d7012e..42d4b20a8 100644 --- a/algorithms.py +++ b/algorithms.py @@ -931,7 +931,12 @@ def __init__( # implemented using `ParametricAncestryModel()` self.hull_offset = hull_offset - self.initialise(tables.tree_sequence()) + if model == "fixed_pedigree": + self.t = 0 + self.S[0] = 0 + self.S[self.L] = -1 + else: + self.initialise(tables.tree_sequence()) self.num_ca_events = 0 self.num_re_events = 0 @@ -981,6 +986,7 @@ def initialise(self, ts): root_segments_tail = [None for _ in range(ts.num_nodes)] root_lineages = [None for _ in range(ts.num_nodes)] last_S = -1 + start_time = np.inf for tree in ts.trees(): left, right = tree.interval S = 0 if tree.num_roots == 1 else tree.num_roots @@ -991,6 +997,7 @@ def initialise(self, ts): # any ancestral segments to the state. if tree.num_roots > 1: for root in tree.roots: + start_time = min(start_time, tree.time(root)) population = ts.node(root).population if root_segments_head[root] is None: seg = self.alloc_segment(left, right, root) @@ -1014,7 +1021,7 @@ def initialise(self, ts): # Insert the segment chains into the algorithm state. for node in range(ts.num_nodes): lineage = root_lineages[node] - if lineage is not None: + if lineage is not None and ts.nodes_time[node] == start_time: seg = lineage.head left_end = seg.left while seg is not None: @@ -1616,6 +1623,16 @@ def process_pedigree_common_ancestors(self, ind, ploid): individual, then recombine and distribute the remaining ancestral material among its parent ploids. """ + node = ind.nodes[ploid] + is_sample = (self.tables.nodes.flags[node] & tskit.NODE_IS_SAMPLE) > 0 + if is_sample: + segment = self.alloc_segment(0, self.L, node) + lineage = self.alloc_lineage(segment, population=0) + self.add_lineage(lineage) + ind.add_common_ancestor(lineage.head, ploid=ploid) + for k in list(self.S.keys())[:-1]: + self.S[k] += 1 + common_ancestors = ind.common_ancestors[ploid] if len(common_ancestors) == 0: # No ancestral material inherited on this ploid of this individual @@ -1632,7 +1649,6 @@ def process_pedigree_common_ancestors(self, ind, ploid): # monoploid genome for this ploid of this individual. # If any coalescences occur, they use the corresponding node ID. # FIXME update the population/label here - node = ind.nodes[ploid] genome = self.merge_ancestors(common_ancestors, 0, 0, node) if ind.parents[ploid] == tskit.NULL: # If this individual is a founder we need to make sure that all @@ -1676,18 +1692,6 @@ def dtwf_climb_pedigree(self): Simulates transmission of ancestral material through a pre-specified pedigree """ - assert self.num_populations == 1 # Single pop/pedigree for now - pop = self.P[0] - - # Go through the extant lineages and gather the ancestral material - # into the corresponding pedigree individuals. - for lineage in pop.iter_ancestors(): - u = lineage.head.node - node = self.tables.nodes[u] - assert node.individual != tskit.NULL - ind = self.pedigree.individuals[node.individual] - ind.add_common_ancestor(lineage.head, ploid=ind.nodes.index(u)) - # Visit pedigree individuals in time order. visit_order = sorted(self.pedigree.individuals, key=lambda x: (x.time, x.id)) for ind in visit_order: diff --git a/lib/msprime.c b/lib/msprime.c index 2316fb00b..567e5a066 100644 --- a/lib/msprime.c +++ b/lib/msprime.c @@ -1859,11 +1859,13 @@ msp_verify_overlaps(msp_t *self) int ok = overlap_counter_alloc(&counter, self->sequence_length, 0); tsk_bug_assert(ok == 0); - /* add in the overlaps for ancient samples */ - for (j = self->next_sampling_event; j < self->num_sampling_events; j++) { - se = self->sampling_events[j]; - for (u = self->root_segments[se.sample]; u != NULL; u = u->next) { - overlap_counter_increment_interval(&counter, u->left, u->right); + if (self->model.type != MSP_MODEL_WF_PED) { + /* add in the overlaps for ancient samples */ + for (j = self->next_sampling_event; j < self->num_sampling_events; j++) { + se = self->sampling_events[j]; + for (u = self->root_segments[se.sample]; u != NULL; u = u->next) { + overlap_counter_increment_interval(&counter, u->left, u->right); + } } } @@ -2495,12 +2497,10 @@ msp_store_coalescence_edge( int ret = 0; if (self->model.type == MSP_MODEL_DTWF || self->model.type == MSP_MODEL_WF_PED) { - if (self->additional_nodes & MSP_NODE_IS_RE_EVENT) { - // parent and child can be equal - // don't store edges - if (parent == child) { - return ret; - } + // parent and child can be equal if we're storing RE events, or if we've + // got internal samples. Don't store edges + if (parent == child) { + return ret; } } @@ -2914,13 +2914,30 @@ msp_pedigree_add_individual_common_ancestor( } static int MSP_WARN_UNUSED -msp_pedigree_add_sample_ancestry(msp_t *self, segment_t *segment) +msp_pedigree_add_sample_ancestry(msp_t *self, tsk_id_t node_id) { int ret = 0; tsk_size_t ploid; - tsk_id_t node_id = segment->value; tsk_id_t individual_id; individual_t *ind; + avl_node_t *a; + node_mapping_t *counter; + segment_t *seg + = msp_alloc_segment(self, 0, self->sequence_length, node_id, 0, 0, NULL, NULL); + lineage_t *lin = msp_alloc_lineage(self, seg, seg, 0, 0); + + if (seg == NULL || lin == NULL) { + ret = MSP_ERR_NO_MEMORY; + goto out; + } + ret = msp_insert_individual(self, lin); + if (ret != 0) { + goto out; + } + for (a = self->overlap_counts.head; a != self->overlap_counts.tail; a = a->next) { + counter = (node_mapping_t *) a->item; + counter->value += 1; + } tsk_bug_assert(node_id < (tsk_id_t) self->tables->nodes.num_rows); individual_id = self->tables->nodes.individual[node_id]; @@ -2934,14 +2951,7 @@ msp_pedigree_add_sample_ancestry(msp_t *self, segment_t *segment) } } tsk_bug_assert(ploid < ind->ploidy); - if (avl_count(&ind->common_ancestors[ploid]) > 0) { - /* This is where we'd deal with the internal samples. What we'll probably - * need to do is to go through the ancestry after we've processed the - * individual and then pad out any gaps in the ancestry segments. */ - ret = MSP_ERR_PEDIGREE_INTERNAL_SAMPLE; - goto out; - } - ret = msp_pedigree_add_individual_common_ancestor(self, ind->id, segment, ploid); + ret = msp_pedigree_add_individual_common_ancestor(self, ind->id, seg, ploid); if (ret != 0) { goto out; } @@ -2955,9 +2965,11 @@ msp_pedigree_initialise(msp_t *self) int ret = 0; population_t *pop; lineage_t *lin; + segment_t *seg; avl_node_t *a; label_id_t label = 0; tsk_size_t j; + node_mapping_t *counter; if (self->next_demographic_event != NULL) { ret = MSP_ERR_UNSUPPORTED_OPERATION; @@ -2981,14 +2993,21 @@ msp_pedigree_initialise(msp_t *self) for (j = 0; j < self->num_populations; j++) { pop = &self->populations[j]; + /* Rather than messing about with how we initialise from trees, it's + * easier to just remove the lineages here, before we add them + * back later when dealing with samples in the pedigree. */ for (a = pop->ancestors[label].head; a != NULL; a = a->next) { lin = (lineage_t *) a->item; - ret = msp_pedigree_add_sample_ancestry(self, lin->head); - if (ret != 0) { - goto out; + for (seg = lin->head; seg != NULL; seg = seg->next) { + msp_free_segment(self, seg); } + msp_remove_individual(self, lin); } } + tsk_bug_assert(avl_count(&self->overlap_counts) == 2); + counter = (node_mapping_t *) self->overlap_counts.head->item; + counter->value = 0; + self->pedigree.next_individual = 0; out: return ret; @@ -4589,31 +4608,6 @@ msp_apply_sampling_events(msp_t *self, double time) return ret; } -static int MSP_WARN_UNUSED -msp_pedigree_insert_ancient_samples(msp_t *self) -{ - int ret = 0; - sampling_event_t *se; - segment_t *root_seg, *new_head; - - while (self->next_sampling_event < self->num_sampling_events - && self->sampling_events[self->next_sampling_event].time <= self->time) { - se = self->sampling_events + self->next_sampling_event; - root_seg = self->root_segments[se->sample]; - ret = msp_insert_root_segments(self, root_seg, &new_head); - if (ret != 0) { - goto out; - } - ret = msp_pedigree_add_sample_ancestry(self, new_head); - if (ret != 0) { - goto out; - } - self->next_sampling_event++; - } -out: - return ret; -} - static double msp_get_next_fixed_event_time(msp_t *self) { @@ -5483,6 +5477,7 @@ msp_pedigree_process_common_ancestors(msp_t *self, individual_t *ind, tsk_size_t { int ret = 0; tsk_id_t node = ind->nodes[ploid]; + bool is_sample = (self->tables->nodes.flags[node] & TSK_NODE_IS_SAMPLE) != 0; tsk_id_t parent = ind->parents[ploid]; avl_tree_t *common_ancestors = &ind->common_ancestors[ploid]; segment_t *genome, *parent_ancestry[MSP_MAX_PED_PLOIDY], *seg; @@ -5490,6 +5485,13 @@ msp_pedigree_process_common_ancestors(msp_t *self, individual_t *ind, tsk_size_t tsk_size_t j; tsk_id_t *parent_nodes; + if (is_sample) { + ret = msp_pedigree_add_sample_ancestry(self, node); + if (ret != 0) { + goto out; + } + } + /* FIXME - assuming 1 label here */ ret = msp_merge_n_ancestors( self, common_ancestors, ind->population, 0, node, &genome); @@ -5607,10 +5609,6 @@ msp_run_pedigree(msp_t *self, double max_time, unsigned long max_events) } tsk_bug_assert(ind->time >= self->time); self->time = ind->time; - ret = msp_pedigree_insert_ancient_samples(self); - if (ret != 0) { - goto out; - } for (ploid = 0; ploid < ind->ploidy; ploid++) { ret = msp_pedigree_process_common_ancestors(self, ind, ploid); if (ret != 0) { diff --git a/lib/tests/test_pedigrees.c b/lib/tests/test_pedigrees.c index de9cd957e..2cde9a139 100644 --- a/lib/tests/test_pedigrees.c +++ b/lib/tests/test_pedigrees.c @@ -553,8 +553,8 @@ test_replicates_ancient_samples(void) CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < 20; j++) { - msp_verify(&msp, 0); ret = msp_run(&msp, DBL_MAX, UINT32_MAX); + CU_ASSERT_EQUAL_FATAL(ret, MSP_EXIT_MODEL_COMPLETE); msp_verify(&msp, 0); ret = msp_finalise_tables(&msp); @@ -844,24 +844,12 @@ test_errors(void) static void test_internal_samples(void) { - int ret; tsk_id_t parents[] = { -1, -1, -1, -1, 0, 1 }; double time[] = { 1.0, 1.0, 0.0 }; tsk_flags_t is_sample[] = { true, true, true }; - msp_t msp; - tsk_table_collection_t tables; - gsl_rng *rng = safe_rng_alloc(); - - ret = build_pedigree_sim( - &msp, &tables, rng, 100, 2, 3, parents, time, is_sample, NULL); - ret = msp_initialise(&msp); - CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = msp_run(&msp, DBL_MAX, UINT32_MAX); - CU_ASSERT_EQUAL_FATAL(ret, MSP_ERR_PEDIGREE_INTERNAL_SAMPLE); - tsk_table_collection_free(&tables); - msp_free(&msp); - gsl_rng_free(rng); + verify_pedigree(0, 1, 3, parents, time, is_sample, NULL, 0); + verify_pedigree(0.1, 1, 3, parents, time, is_sample, NULL, 0); } static void diff --git a/lib/util.c b/lib/util.c index 6b8342d13..270201fea 100644 --- a/lib/util.c +++ b/lib/util.c @@ -183,12 +183,6 @@ msp_strerror_internal(int err) ret = "All individuals in the input pedigree must be associated with " "exactly two parents (can be TSK_NULL, if not known)"; break; - case MSP_ERR_PEDIGREE_INTERNAL_SAMPLE: - ret = "Samples that are internal nodes in the pedigree are not " - "currently supported. Please comment on this GitHub issue if you " - "would like to see this feature implemented: " - "https://github.com/tskit-dev/msprime/issues/1855 "; - break; case MSP_ERR_BAD_PROPORTION: ret = "Proportion values must have 0 <= x <= 1"; diff --git a/lib/util.h b/lib/util.h index 619407268..ced714448 100644 --- a/lib/util.h +++ b/lib/util.h @@ -134,7 +134,6 @@ #define MSP_ERR_PEDIGREE_TIME_TRAVEL -88 #define MSP_ERR_PEDIGREE_IND_NOT_DIPLOID -89 #define MSP_ERR_PEDIGREE_IND_NOT_TWO_PARENTS -90 -#define MSP_ERR_PEDIGREE_INTERNAL_SAMPLE -91 /* clang-format on */ /* This bit is 0 for any errors originating from tskit */ diff --git a/tests/test_algorithms.py b/tests/test_algorithms.py index 0abd6e579..ce89bf3d8 100644 --- a/tests/test_algorithms.py +++ b/tests/test_algorithms.py @@ -423,6 +423,21 @@ def test_pedigree_trio(self, r): input_tables.nodes.assert_equals(output_tables.nodes[: len(input_tables.nodes)]) assert len(output_tables.edges) >= 2 + @pytest.mark.parametrize("r", [0, 0.1, 1]) + def test_pedigree_internal_sample(self, r): + input_tables = simulate_pedigree(num_founders=4, num_generations=4) + flags = input_tables.nodes.flags + flags[8:12] = tskit.NODE_IS_SAMPLE # Make inds 4 and 5 samples + input_tables.nodes.flags = flags + with tempfile.TemporaryDirectory() as tmpdir: + ts_path = pathlib.Path(tmpdir) / "pedigree.trees" + input_tables.dump(ts_path) + ts = self.run_script(f"0 --from-ts {ts_path} -r {r} --model=fixed_pedigree") + output_tables = ts.dump_tables() + input_tables.individuals.assert_equals(output_tables.individuals) + input_tables.nodes.assert_equals(output_tables.nodes[: len(input_tables.nodes)]) + assert len(output_tables.edges) >= 2 + @pytest.mark.parametrize("num_founders", [1, 2, 20]) def test_one_gen_pedigree(self, num_founders): tables = simulate_pedigree(num_founders=num_founders, num_generations=1) diff --git a/tests/test_pedigree.py b/tests/test_pedigree.py index 6db662cbd..765ec41b1 100644 --- a/tests/test_pedigree.py +++ b/tests/test_pedigree.py @@ -750,6 +750,19 @@ def test_just_samples(self): parent_nodes = {0, 1, 2, 3} assert set(ts.first().roots) == parent_nodes + @pytest.mark.parametrize("num_founders", [2, 3, 10, 20]) + def test_all_samples(self, num_founders): + tables = simulate_pedigree( + num_founders=num_founders, + num_children_prob=[0, 0, 1], + num_generations=6, + sequence_length=100, + ) + flags = tables.nodes.flags + flags[:] = tskit.NODE_IS_SAMPLE + tables.nodes.flags = flags + self.verify(tables) + class TestSimulateThroughPedigreeEventByEvent(TestSimulateThroughPedigree): def verify(self, input_tables, recombination_rate=0): @@ -792,11 +805,14 @@ def verify(self, input_tables, recombination_rate=0): for node_id in tree.nodes(): if tree.num_children(node_id) == 1: # Any unary nodes should be associated with a pedigree - # founder. + # founder or a sapmle node = ts2.node(node_id) assert node.individual != tskit.NULL individual = ts2.individual(node.individual) - assert list(individual.parents) == [tskit.NULL, tskit.NULL] + assert node.is_sample() or list(individual.parents) == [ + tskit.NULL, + tskit.NULL, + ] tables1 = ts1.tables tables2 = ts2.tables tables1.individuals.assert_equals( @@ -912,19 +928,6 @@ def test_10_percent_parents_removed(self): class TestSimulateThroughPedigreeErrors: - def test_parents_are_samples(self): - tables = get_base_tables(100) - parents = [ - add_pedigree_individual(tables, time=1, is_sample=True) for _ in range(2) - ] - add_pedigree_individual(tables, parents=parents, time=0) - - with pytest.raises(_msprime.LibraryError, match="1855"): - msprime.sim_ancestry( - initial_state=tables, - model="fixed_pedigree", - ) - @pytest.mark.parametrize("num_parents", [0, 1, 3]) def test_not_two_parents(self, num_parents): tables = get_base_tables(100) @@ -984,20 +987,6 @@ def test_pedigree_time_travel(self): with pytest.raises(_msprime.InputError, match="time for a parent must be"): msprime.sim_ancestry(initial_state=tables, model="fixed_pedigree") - @pytest.mark.parametrize("num_founders", [2, 3, 10, 20]) - def test_all_samples(self, num_founders): - tables = simulate_pedigree( - num_founders=num_founders, - num_children_prob=[0, 0, 1], - num_generations=6, - sequence_length=100, - ) - flags = tables.nodes.flags - flags[:] = tskit.NODE_IS_SAMPLE - tables.nodes.flags = flags - with pytest.raises(_msprime.LibraryError, match="1855"): - msprime.sim_ancestry(initial_state=tables, model="fixed_pedigree") - class TestSimulateThroughPedigreeMultiplePops: def test_demography_raises_error(self):