From bb547ccf2534ba7dec245eb151619582830edda3 Mon Sep 17 00:00:00 2001 From: Manuel Lera-Ramirez Date: Fri, 19 Jan 2024 18:00:16 +0000 Subject: [PATCH] Cutsite pairs bugs (#184) * fix Dseq.apply_cut and minor error in Dseqrecord.apply_cut * WIP * return error on overlapping cuts, add tests for it * #180 solved for features on + strand, but not all * fixes #180 --- scripts/cutsite_pairs.ipynb | 51 +++++++++++++++++++ src/pydna/dseq.py | 44 ++++++++++++++-- src/pydna/dseqrecord.py | 57 ++++++++++++++++++++- src/pydna/utils.py | 40 +++++++++++++++ tests/test_module_dseq.py | 90 +++++++++++++++++++++++++++++++++ tests/test_module_dseqrecord.py | 24 +++++++++ tests/test_module_utils.py | 6 +++ 7 files changed, 305 insertions(+), 7 deletions(-) create mode 100644 scripts/cutsite_pairs.ipynb diff --git a/scripts/cutsite_pairs.ipynb b/scripts/cutsite_pairs.ipynb new file mode 100644 index 00000000..9b2705c8 --- /dev/null +++ b/scripts/cutsite_pairs.ipynb @@ -0,0 +1,51 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# New cut implementation\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Coming soon\n", + "# seq = Dseq('aaGAATTCaa', circular=True)\n", + "\n", + "# print('EcORI', EcoRI.ovhg, len(seq))\n", + "# for shift in range(len(seq)):\n", + "# seq_shifted = seq.shifted(shift)\n", + "# cut_site = seq_shifted.get_cutsites(EcoRI)[0][0]\n", + "# print(shift, seq_shifted, cut_site, cut_site[0] - cut_site[1])\n", + "\n", + "# seq = Dseq('ccTTAATTAAcc', circular=True)\n", + "# print('PacI', PacI.ovhg, len(seq))\n", + "# for shift in range(len(seq)):\n", + "# seq_shifted = seq.shifted(shift)\n", + "# cut_site = seq_shifted.get_cutsites(PacI)[0][0]\n", + "# print(shift, seq_shifted, cut_site, cut_site[0] - cut_site[1])\n", + "\n", + "\n", + "# seq = Dseq('TTAAccccTTAA', circular=True)\n", + "# custom_cut = ((1, 11), type('DynamicClass', (), {'ovhg': 2})())\n", + "# print(seq.apply_cut(custom_cut, custom_cut).__repr__())\n", + "\n", + "# print()\n", + "\n", + "# custom_cut = ((1, 11), type('DynamicClass', (), {'ovhg': -10})())\n", + "# print(seq.apply_cut(custom_cut, custom_cut).__repr__())" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/pydna/dseq.py b/src/pydna/dseq.py index 034f34b0..78e95503 100644 --- a/src/pydna/dseq.py +++ b/src/pydna/dseq.py @@ -30,8 +30,9 @@ from pydna.utils import rc as _rc from pydna.utils import flatten as _flatten -from pydna.common_sub_strings import common_sub_strings as _common_sub_strings +from pydna.utils import cuts_overlap as _cuts_overlap +from pydna.common_sub_strings import common_sub_strings as _common_sub_strings from Bio.Restriction import RestrictionBatch as _RestrictionBatch from Bio.Restriction import CommOnly @@ -1481,11 +1482,44 @@ def get_cutsites(self, *enzymes): return sorted(out) - def apply_cut(self, left_cut, right_cut): + def left_end_position(self) -> tuple[int, int]: + """The index in the global sequence of the watson and crick start positions. + + Global sequence (str(self)) for all three cases is AAA + + ``` + AAA AA AAT + TT TTT TTT + Returns (0, 1) Returns (1, 0) Returns (0, 0) + ``` + + """ + if self.ovhg > 0: + return self.ovhg, 0 + return 0, -self.ovhg - left_watson, left_crick = left_cut[0] if left_cut is not None else ((self.ovhg, 0) if self.ovhg > 0 else (0, -self.ovhg)) - ovhg = self.ovhg if left_cut is None else left_cut[1].ovhg - right_watson, right_crick = right_cut[0] if right_cut is not None else (len(self.watson), len(self.crick)) + def right_end_position(self) -> tuple[int, int]: + """The index in the global sequence of the watson and crick end positions. + + Global sequence (str(self)) for all three cases is AAA + + ``` + AAA AA AAA + TT TTT TTT + Returns (3, 2) Returns (2, 3) Returns (3, 3) + ``` + + """ + if self.watson_ovhg() < 0: + return len(self) + self.watson_ovhg(), len(self) + return len(self), len(self) - self.watson_ovhg() + + def apply_cut(self, left_cut, right_cut): + if _cuts_overlap(left_cut, right_cut, len(self)): + raise ValueError("Cuts overlap") + left_watson, left_crick = left_cut[0] if left_cut is not None else self.left_end_position() + ovhg = left_cut[1].ovhg if left_cut is not None else self.ovhg + right_watson, right_crick = right_cut[0] if right_cut is not None else self.right_end_position() return Dseq( str(self[left_watson:right_watson]), # The line below could be easier to understand as _rc(str(self[left_crick:right_crick])), but it does not preserve the case diff --git a/src/pydna/dseqrecord.py b/src/pydna/dseqrecord.py index 08ffc955..a8729fc4 100644 --- a/src/pydna/dseqrecord.py +++ b/src/pydna/dseqrecord.py @@ -15,11 +15,12 @@ from Bio.Restriction import CommOnly from pydna.dseq import Dseq as _Dseq from pydna._pretty import pretty_str as _pretty_str -from pydna.utils import flatten as _flatten +from pydna.utils import flatten as _flatten, location_boundaries as _location_boundaries # from pydna.utils import memorize as _memorize from pydna.utils import rc as _rc from pydna.utils import shift_location as _shift_location +from pydna.utils import shift_feature as _shift_feature from pydna.common_sub_strings import common_sub_strings as _common_sub_strings from Bio.SeqFeature import SeqFeature as _SeqFeature from Bio import SeqIO @@ -837,6 +838,7 @@ def __getitem__(self, sl): # origin-spanning features should only be included after shifting # in cases where the slice comprises the entire sequence, but then # sl_start == sl_stop and the second condition is not met + # TODO: _location_boundaries answer.features = [f for f in answer.features if ( f.location.parts[-1].end <= answer.seq.length and f.location.parts[0].start <= f.location.parts[-1].end)] @@ -1280,7 +1282,58 @@ def apply_cut(self, left_cut, right_cut): # Not really a cut, but to handle the general case if left_cut is None: features = self.features - features = self.shifted(min(left_cut[0])).features + else: + # The features that span the origin if shifting with left_cut, but that do not cross + # the cut site should be included, and if there is a feature within the cut site, it should + # be duplicated. See https://github.com/BjornFJohansson/pydna/issues/180 for a practical example. + # + # Let's say we are going to open a circular plasmid like below (| inidicate cuts, numbers indicate + # features) + # + # 3333|3 + # 1111 + # 000 + # XXXXatg|YYY + # XXX|tacYYYY + # 000 + # 2222 + # + features = self.shifted(min(left_cut[0])).features + # for f in features: + # print(f.id, f.location, _location_boundaries(f.location)) + # Here, we have done what's shown below (* indicates the origin). + # The features 0 and 2 have the right location for the final product: + # + # 3*3333 + # 1*111 + # XXXX*atgYYY + # XXXX*tacYYY + # 000 + # 2222 + + features_need_transfer = [f for f in features if (_location_boundaries(f.location)[1] <= abs(left_cut[1].ovhg))] + features_need_transfer = [_shift_feature(f, -abs(left_cut[1].ovhg), len(self)) for f in features_need_transfer] + # ^ ^^^^^^^^^ + # Now we have shifted the features that end before the cut (0 and 1, but not 3), as if + # they referred to the below sequence (* indicates the origin): + # + # 1111 + # 000 + # XXXXatg*YYY + # XXXXtac*YYY + # + # The features 0 and 1 would have the right location if the final sequence had the same length + # as the original one. However, the final product is longer because of the overhang. + + features += [_shift_feature(f, abs(left_cut[1].ovhg), len(dseq)) for f in features_need_transfer] + # ^ ^^^^^^^^^ + # So we shift back by the same amount in the opposite direction, but this time we pass the + # length of the final product. + # print(*features, sep='\n') + # Features like 3 are removed here + features = [f for f in features if ( + _location_boundaries(f.location)[1] <= len(dseq) and + _location_boundaries(f.location)[0] <= _location_boundaries(f.location)[1])] else: left_watson, left_crick = left_cut[0] if left_cut is not None else (0, 0) right_watson, right_crick = right_cut[0] if right_cut is not None else (None, None) diff --git a/src/pydna/utils.py b/src/pydna/utils.py index e7c8e221..0c9e3f90 100644 --- a/src/pydna/utils.py +++ b/src/pydna/utils.py @@ -18,6 +18,7 @@ import keyword as _keyword import collections as _collections import itertools as _itertools +from copy import deepcopy as _deepcopy import sys as _sys import re @@ -63,6 +64,14 @@ def shift_location(original_location, shift, lim): assert len(newloc) == len(original_location) return newloc +def shift_feature(feature, shift, lim): + """Return a new feature with shifted location.""" + # TODO: Missing tests + new_location = shift_location(feature.location, shift, lim) + new_feature = _deepcopy(feature) + new_feature.location = new_location + return new_feature + # def smallest_rotation(s): # """Smallest rotation of a string. @@ -570,6 +579,37 @@ def eq(*args, **kwargs): same = False return same +def cuts_overlap(left_cut, right_cut, seq_len): + # Special cases: + if left_cut is None or right_cut is None or left_cut == right_cut: + return False + + # This block of code would not be necessary if the cuts were + # initially represented like this + (left_watson, _), enz1 = left_cut + (right_watson, _), enz2 = right_cut + left_crick = left_watson - enz1.ovhg + right_crick = right_watson - enz2.ovhg + if left_crick >= seq_len: + left_crick -= seq_len + left_watson -= seq_len + if right_crick >= seq_len: + right_crick -= seq_len + right_watson -= seq_len + + # Convert into ranges x and y and see if ranges overlap + x = sorted([left_watson, left_crick]) + y = sorted([right_watson, right_crick]) + return (x[1] > y[0]) != (y[1] < x[0]) + +def location_boundaries(loc: _sl|_cl): + + #TODO: pending on https://github.com/BjornFJohansson/pydna/pull/179 + if loc.strand != 1: + return loc.parts[-1].start, loc.parts[0].end + else: + return loc.parts[0].start, loc.parts[-1].end + if __name__ == "__main__": cached = _os.getenv("pydna_cached_funcs", "") diff --git a/tests/test_module_dseq.py b/tests/test_module_dseq.py index 4e1c50ac..8c2d025b 100644 --- a/tests/test_module_dseq.py +++ b/tests/test_module_dseq.py @@ -788,5 +788,95 @@ def test_from_full_sequence_and_overhangs(): assert dseq_2.watson_ovhg() == watson_ovhg +def test_right_end_position(): + + from pydna.dseq import Dseq + + test_cases = [ + ("AAA", "TT", (3, 2)), + ("AA", "TTT", (2, 3)), + ("AAA", "TTT", (3, 3)), + ] + for watson, crick, expected in test_cases: + dseq = Dseq(watson, crick, ovhg=0, circular=False) + assert dseq.right_end_position() == expected + +def test_left_end_position(): + + from pydna.dseq import Dseq + + test_cases = [ + ("AAA", "TT", (0, 1), -1), + ("AA", "TTT", (1, 0), 1), + ("AAT", "TTT", (0, 0), 0), + ] + for watson, crick, expected, ovhg in test_cases: + dseq = Dseq(watson, crick, ovhg=ovhg, circular=False) + assert dseq.left_end_position() == expected + + +def test_apply_cut(): + from pydna.dseq import Dseq + + seq = Dseq('aaGAATTCaa', circular=False) + + # A cut where both sides are None returns the same sequence + assert seq.apply_cut(None, None) == seq + + # A cut where one side is None leaves that side intact + EcoRI_cut = ((3, 7), type('DynamicClass', (), {'ovhg': -4})()) + assert seq.apply_cut(None, EcoRI_cut) == Dseq.from_full_sequence_and_overhangs('aaGAATT', watson_ovhg=-4, crick_ovhg=0) + assert seq.apply_cut(EcoRI_cut, None) == Dseq.from_full_sequence_and_overhangs('AATTCaa', watson_ovhg=0, crick_ovhg=-4) + + # It respects the original overhang + seq = Dseq.from_full_sequence_and_overhangs('aaGAATTCaa', watson_ovhg=1, crick_ovhg=1) + assert seq.apply_cut(None, EcoRI_cut) == Dseq.from_full_sequence_and_overhangs('aaGAATT', watson_ovhg=-4, crick_ovhg=1) + assert seq.apply_cut(EcoRI_cut, None) == Dseq.from_full_sequence_and_overhangs('AATTCaa', watson_ovhg=1, crick_ovhg=-4) + + seq = Dseq.from_full_sequence_and_overhangs('aaGAATTCaa', watson_ovhg=-1, crick_ovhg=-1) + assert seq.apply_cut(None, EcoRI_cut) == Dseq.from_full_sequence_and_overhangs('aaGAATT', watson_ovhg=-4, crick_ovhg=-1) + assert seq.apply_cut(EcoRI_cut, None) == Dseq.from_full_sequence_and_overhangs('AATTCaa', watson_ovhg=-1, crick_ovhg=-4) + + # A repeated cut in a circular molecule opens it up + seq = Dseq('aaGAATTCaa', circular=True) + assert seq.apply_cut(EcoRI_cut, EcoRI_cut) == Dseq.from_full_sequence_and_overhangs('AATTCaaaaGAATT', watson_ovhg=-4, crick_ovhg=-4) + + # Two cuts extract a subsequence + seq = Dseq('aaGAATTCaaGAATTCaa', circular=True) + EcoRI_cut_2 = ((11, 15), type('DynamicClass', (), {'ovhg': -4})()) + assert seq.apply_cut(EcoRI_cut, EcoRI_cut_2) == Dseq.from_full_sequence_and_overhangs('AATTCaaGAATT', watson_ovhg=-4, crick_ovhg=-4) + + # Overlapping cuts should return an error + seq = Dseq('aaGAATTCaa', circular=True) + first_cuts = [ + ((3, 7), type('DynamicClass', (), {'ovhg': -4})()), + ((7, 3), type('DynamicClass', (), {'ovhg': 4})()), + # Spanning the origin + ((9, 8), type('DynamicClass', (), {'ovhg': -8})()), + ((8, 9), type('DynamicClass', (), {'ovhg': 8})()), + ] + overlapping_cuts = [ + ((4, 8), type('DynamicClass', (), {'ovhg': -4})()), + ((2, 6), type('DynamicClass', (), {'ovhg': -4})()), + ((2, 8), type('DynamicClass', (), {'ovhg': -4})()), + ((8, 4), type('DynamicClass', (), {'ovhg': 4})()), + ((6, 2), type('DynamicClass', (), {'ovhg': 4})()), + ((8, 2), type('DynamicClass', (), {'ovhg': 4})()), + # Spanning the origin + ((7, 6), type('DynamicClass', (), {'ovhg': -8})()), + ((6, 7), type('DynamicClass', (), {'ovhg': 8})()), + ] + + for first_cut in first_cuts: + for second_cut in overlapping_cuts: + try: + seq.apply_cut(first_cut, second_cut) + except ValueError as e: + assert e.args[0] == 'Cuts overlap' + else: + print(first_cut, second_cut) + assert False, 'Expected ValueError' + + if __name__ == "__main__": pytest.main([__file__, "-vv", "-s"]) diff --git a/tests/test_module_dseqrecord.py b/tests/test_module_dseqrecord.py index 4ab11365..a41f5a77 100644 --- a/tests/test_module_dseqrecord.py +++ b/tests/test_module_dseqrecord.py @@ -2240,6 +2240,30 @@ def test_assemble_YEp24PGK_XK(): assert YEp24PGK_XK_correct.seguid() == "cdseguid-hEldsrUV0mBpISw8_xpvnpfYi0g" assert eq(YEp24PGK_XK, YEp24PGK_XK_correct) +def test_apply_cut(): + from pydna.dseqrecord import Dseqrecord + from Bio.SeqFeature import SeqFeature, SimpleLocation + + # Single cut case + for strand in [1, -1, None]: + for cut_coords, cut_ovhg in (((4, 7), -3), ((7, 4), 3)): + dummy_cut = (cut_coords, type('DynamicClass', (), {'ovhg': cut_ovhg})()) + seq = Dseqrecord("acgtATGaatt", circular=True) + seq.features.append(SeqFeature(SimpleLocation(4, 7, strand), id='full_overlap')) + seq.features.append(SeqFeature(SimpleLocation(3, 7, strand), id='left_side')) + seq.features.append(SeqFeature(SimpleLocation(4, 8, strand), id='right_side')) + seq.features.append(SeqFeature(SimpleLocation(3, 10, strand), id='throughout')) + open_seq = seq.apply_cut(dummy_cut, dummy_cut) + assert len(open_seq.features) == 4 + new_locs = [str(f.location) for f in open_seq.features] + if strand == 1: + assert new_locs == ['[0:3](+)', '[0:4](+)', '[11:14](+)', '[10:14](+)'] + elif strand == -1: + # TODO: change the join{[11:14](-), [10:11](-)} case? + assert new_locs == ['[0:3](-)', '[0:4](-)', '[11:14](-)', 'join{[11:14](-), [10:11](-)}'] + if strand == None: + # TODO: pending on https://github.com/BjornFJohansson/pydna/pull/179 + assert new_locs == ['[0:3]', '[0:4]', '[11:14]', 'join{[11:14], [10:11]}'] if __name__ == "__main__": args = [ diff --git a/tests/test_module_utils.py b/tests/test_module_utils.py index 2351f368..b0d0da2c 100644 --- a/tests/test_module_utils.py +++ b/tests/test_module_utils.py @@ -463,5 +463,11 @@ def close(self): assert mf(1, kw=1) == ((1,), {"kw": 1}) +def test_shift_location(): + from pydna.utils import shift_location + + # Shifting of locations should be reversible + + if __name__ == "__main__": pytest.main([__file__, "-vv", "-s"])