Cutsite pairs bugs (#184)

* fix Dseq.apply_cut and minor error in Dseqrecord.apply_cut * WIP * return error on overlapping cuts, add tests for it * #180 solved for features on + strand, but not all * fixes #180
pydna-group · Feb 2, 2024 · bb547cc · bb547cc
1 parent 11a5668
commit bb547cc
Show file tree

Hide file tree

Showing 7 changed files with 305 additions and 7 deletions.
diff --git a/scripts/cutsite_pairs.ipynb b/scripts/cutsite_pairs.ipynb
@@ -0,0 +1,51 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# New cut implementation\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Coming soon\n",
+    "# seq = Dseq('aaGAATTCaa', circular=True)\n",
+    "\n",
+    "# print('EcORI', EcoRI.ovhg, len(seq))\n",
+    "# for shift in range(len(seq)):\n",
+    "#     seq_shifted = seq.shifted(shift)\n",
+    "#     cut_site = seq_shifted.get_cutsites(EcoRI)[0][0]\n",
+    "#     print(shift, seq_shifted, cut_site, cut_site[0] - cut_site[1])\n",
+    "\n",
+    "# seq = Dseq('ccTTAATTAAcc', circular=True)\n",
+    "# print('PacI', PacI.ovhg, len(seq))\n",
+    "# for shift in range(len(seq)):\n",
+    "#     seq_shifted = seq.shifted(shift)\n",
+    "#     cut_site = seq_shifted.get_cutsites(PacI)[0][0]\n",
+    "#     print(shift, seq_shifted, cut_site, cut_site[0] - cut_site[1])\n",
+    "\n",
+    "\n",
+    "# seq = Dseq('TTAAccccTTAA', circular=True)\n",
+    "# custom_cut = ((1, 11), type('DynamicClass', (), {'ovhg': 2})())\n",
+    "# print(seq.apply_cut(custom_cut, custom_cut).__repr__())\n",
+    "\n",
+    "# print()\n",
+    "\n",
+    "# custom_cut = ((1, 11), type('DynamicClass', (), {'ovhg': -10})())\n",
+    "# print(seq.apply_cut(custom_cut, custom_cut).__repr__())"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/src/pydna/dseq.py b/src/pydna/dseq.py
@@ -30,8 +30,9 @@
 
 from pydna.utils import rc as _rc
 from pydna.utils import flatten as _flatten
-from pydna.common_sub_strings import common_sub_strings as _common_sub_strings
+from pydna.utils import cuts_overlap as _cuts_overlap
 
+from pydna.common_sub_strings import common_sub_strings as _common_sub_strings
 from Bio.Restriction import RestrictionBatch as _RestrictionBatch
 from Bio.Restriction import CommOnly
 
@@ -1481,11 +1482,44 @@ def get_cutsites(self, *enzymes):
 
         return sorted(out)
 
-    def apply_cut(self, left_cut, right_cut):
+    def left_end_position(self) -> tuple[int, int]:
+        """The index in the global sequence of the watson and crick start positions.
+
+        Global sequence (str(self)) for all three cases is AAA
+
+        ```
+        AAA              AA               AAT
+         TT             TTT               TTT
+        Returns (0, 1)  Returns (1, 0)    Returns (0, 0)
+        ```
+
+        """
+        if self.ovhg > 0:
+            return self.ovhg, 0
+        return 0, -self.ovhg
 
-        left_watson, left_crick = left_cut[0] if left_cut is not None else ((self.ovhg, 0) if self.ovhg > 0 else (0, -self.ovhg))
-        ovhg = self.ovhg if left_cut is None else left_cut[1].ovhg
-        right_watson, right_crick = right_cut[0] if right_cut is not None else (len(self.watson), len(self.crick))
+    def right_end_position(self) -> tuple[int, int]:
+        """The index in the global sequence of the watson and crick end positions.
+
+        Global sequence (str(self)) for all three cases is AAA
+
+        ```
+        AAA               AA                   AAA
+        TT                TTT                  TTT
+        Returns (3, 2)    Returns (2, 3)       Returns (3, 3)
+        ```
+
+        """
+        if self.watson_ovhg() < 0:
+            return len(self) + self.watson_ovhg(), len(self)
+        return len(self), len(self) - self.watson_ovhg()
+
+    def apply_cut(self, left_cut, right_cut):
+        if _cuts_overlap(left_cut, right_cut, len(self)):
+            raise ValueError("Cuts overlap")
+        left_watson, left_crick = left_cut[0] if left_cut is not None else self.left_end_position()
+        ovhg = left_cut[1].ovhg if left_cut is not None else self.ovhg
+        right_watson, right_crick = right_cut[0] if right_cut is not None else self.right_end_position()
         return Dseq(
                     str(self[left_watson:right_watson]),
                     # The line below could be easier to understand as _rc(str(self[left_crick:right_crick])), but it does not preserve the case

diff --git a/src/pydna/dseqrecord.py b/src/pydna/dseqrecord.py
@@ -15,11 +15,12 @@
 from Bio.Restriction import CommOnly
 from pydna.dseq import Dseq as _Dseq
 from pydna._pretty import pretty_str as _pretty_str
-from pydna.utils import flatten as _flatten
+from pydna.utils import flatten as _flatten, location_boundaries as _location_boundaries
 
 # from pydna.utils import memorize as _memorize
 from pydna.utils import rc as _rc
 from pydna.utils import shift_location as _shift_location
+from pydna.utils import shift_feature as _shift_feature
 from pydna.common_sub_strings import common_sub_strings as _common_sub_strings
 from Bio.SeqFeature import SeqFeature as _SeqFeature
 from Bio import SeqIO
@@ -837,6 +838,7 @@ def __getitem__(self, sl):
             # origin-spanning features should only be included after shifting
             # in cases where the slice comprises the entire sequence, but then
             # sl_start == sl_stop and the second condition is not met
+            # TODO: _location_boundaries
             answer.features = [f for f in answer.features if (
                                f.location.parts[-1].end <= answer.seq.length and
                                f.location.parts[0].start <= f.location.parts[-1].end)]
@@ -1280,7 +1282,58 @@ def apply_cut(self, left_cut, right_cut):
             # Not really a cut, but to handle the general case
             if left_cut is None:
                 features = self.features
-            features = self.shifted(min(left_cut[0])).features
+            else:
+                # The features that span the origin if shifting with left_cut, but that do not cross
+                # the cut site should be included, and if there is a feature within the cut site, it should
+                # be duplicated. See https://github.com/BjornFJohansson/pydna/issues/180 for a practical example.
+                #
+                # Let's say we are going to open a circular plasmid like below (| inidicate cuts, numbers indicate
+                # features)
+                #
+                #    3333|3
+                #    1111
+                #     000
+                # XXXXatg|YYY
+                # XXX|tacYYYY
+                #     000
+                #     2222
+                #
+                features = self.shifted(min(left_cut[0])).features
+                # for f in features:
+                #     print(f.id, f.location, _location_boundaries(f.location))
+                # Here, we have done what's shown below (* indicates the origin).
+                # The features 0 and 2 have the right location for the final product:
+                #
+                #    3*3333
+                #    1*111
+                # XXXX*atgYYY
+                # XXXX*tacYYY
+                #      000
+                #      2222
+
+                features_need_transfer = [f for f in features if (_location_boundaries(f.location)[1] <= abs(left_cut[1].ovhg))]
+                features_need_transfer = [_shift_feature(f, -abs(left_cut[1].ovhg), len(self)) for f in features_need_transfer]
+                #                                           ^                       ^^^^^^^^^
+                # Now we have shifted the features that end before the cut (0 and 1, but not 3), as if
+                # they referred to the below sequence (* indicates the origin):
+                #
+                #    1111
+                #     000
+                # XXXXatg*YYY
+                # XXXXtac*YYY
+                #
+                # The features 0 and 1 would have the right location if the final sequence had the same length
+                # as the original one. However, the final product is longer because of the overhang.
+
+                features += [_shift_feature(f, abs(left_cut[1].ovhg), len(dseq)) for f in features_need_transfer]
+                #                             ^                       ^^^^^^^^^
+                # So we shift back by the same amount in the opposite direction, but this time we pass the 
+                # length of the final product.
+                # print(*features, sep='\n')
+                # Features like 3 are removed here
+                features = [f for f in features if (
+                               _location_boundaries(f.location)[1] <= len(dseq) and
+                               _location_boundaries(f.location)[0] <= _location_boundaries(f.location)[1])]
         else:
             left_watson, left_crick = left_cut[0] if left_cut is not None else (0, 0)
             right_watson, right_crick = right_cut[0] if right_cut is not None else (None, None)

diff --git a/src/pydna/utils.py b/src/pydna/utils.py
@@ -18,6 +18,7 @@
 import keyword as _keyword
 import collections as _collections
 import itertools as _itertools
+from copy import deepcopy as _deepcopy
 
 import sys as _sys
 import re
@@ -63,6 +64,14 @@ def shift_location(original_location, shift, lim):
     assert len(newloc) == len(original_location)
     return newloc
 
+def shift_feature(feature, shift, lim):
+    """Return a new feature with shifted location."""
+    # TODO: Missing tests
+    new_location = shift_location(feature.location, shift, lim)
+    new_feature = _deepcopy(feature)
+    new_feature.location = new_location
+    return new_feature
+
 
 # def smallest_rotation(s):
 #     """Smallest rotation of a string.
@@ -570,6 +579,37 @@ def eq(*args, **kwargs):
                 same = False
     return same
 
+def cuts_overlap(left_cut, right_cut, seq_len):
+    # Special cases:
+    if left_cut is None or right_cut is None or left_cut == right_cut:
+        return False
+
+    # This block of code would not be necessary if the cuts were
+    # initially represented like this
+    (left_watson, _), enz1 = left_cut
+    (right_watson, _), enz2 = right_cut
+    left_crick = left_watson - enz1.ovhg
+    right_crick = right_watson - enz2.ovhg
+    if left_crick >= seq_len:
+        left_crick -= seq_len
+        left_watson -= seq_len
+    if right_crick >= seq_len:
+        right_crick -= seq_len
+        right_watson -= seq_len
+
+    # Convert into ranges x and y and see if ranges overlap
+    x = sorted([left_watson, left_crick])
+    y = sorted([right_watson, right_crick])
+    return (x[1] > y[0]) != (y[1] < x[0])
+
+def location_boundaries(loc: _sl|_cl):
+
+    #TODO: pending on https://github.com/BjornFJohansson/pydna/pull/179
+    if loc.strand != 1:
+        return loc.parts[-1].start, loc.parts[0].end
+    else:
+        return loc.parts[0].start, loc.parts[-1].end
+
 
 if __name__ == "__main__":
     cached = _os.getenv("pydna_cached_funcs", "")

diff --git a/tests/test_module_dseq.py b/tests/test_module_dseq.py
@@ -788,5 +788,95 @@ def test_from_full_sequence_and_overhangs():
         assert dseq_2.watson_ovhg() == watson_ovhg
 
 
+def test_right_end_position():
+
+    from pydna.dseq import Dseq
+
+    test_cases = [
+        ("AAA", "TT", (3, 2)),
+        ("AA", "TTT", (2, 3)),
+        ("AAA", "TTT", (3, 3)),
+    ]
+    for watson, crick, expected in test_cases:
+        dseq = Dseq(watson, crick, ovhg=0, circular=False)
+        assert dseq.right_end_position() == expected
+
+def test_left_end_position():
+
+    from pydna.dseq import Dseq
+
+    test_cases = [
+        ("AAA", "TT", (0, 1), -1),
+        ("AA", "TTT", (1, 0), 1),
+        ("AAT", "TTT", (0, 0), 0),
+    ]
+    for watson, crick, expected, ovhg in test_cases:
+        dseq = Dseq(watson, crick, ovhg=ovhg, circular=False)
+        assert dseq.left_end_position() == expected
+
+
+def test_apply_cut():
+    from pydna.dseq import Dseq
+
+    seq = Dseq('aaGAATTCaa', circular=False)
+
+    # A cut where both sides are None returns the same sequence
+    assert seq.apply_cut(None, None) == seq
+
+    # A cut where one side is None leaves that side intact
+    EcoRI_cut = ((3, 7), type('DynamicClass', (), {'ovhg': -4})())
+    assert seq.apply_cut(None, EcoRI_cut) == Dseq.from_full_sequence_and_overhangs('aaGAATT', watson_ovhg=-4, crick_ovhg=0)
+    assert seq.apply_cut(EcoRI_cut, None) == Dseq.from_full_sequence_and_overhangs('AATTCaa', watson_ovhg=0, crick_ovhg=-4)
+
+    # It respects the original overhang
+    seq = Dseq.from_full_sequence_and_overhangs('aaGAATTCaa', watson_ovhg=1, crick_ovhg=1)
+    assert seq.apply_cut(None, EcoRI_cut) == Dseq.from_full_sequence_and_overhangs('aaGAATT', watson_ovhg=-4, crick_ovhg=1)
+    assert seq.apply_cut(EcoRI_cut, None) == Dseq.from_full_sequence_and_overhangs('AATTCaa', watson_ovhg=1, crick_ovhg=-4)
+
+    seq = Dseq.from_full_sequence_and_overhangs('aaGAATTCaa', watson_ovhg=-1, crick_ovhg=-1)
+    assert seq.apply_cut(None, EcoRI_cut) == Dseq.from_full_sequence_and_overhangs('aaGAATT', watson_ovhg=-4, crick_ovhg=-1)
+    assert seq.apply_cut(EcoRI_cut, None) == Dseq.from_full_sequence_and_overhangs('AATTCaa', watson_ovhg=-1, crick_ovhg=-4)
+
+    # A repeated cut in a circular molecule opens it up
+    seq = Dseq('aaGAATTCaa', circular=True)
+    assert seq.apply_cut(EcoRI_cut, EcoRI_cut) == Dseq.from_full_sequence_and_overhangs('AATTCaaaaGAATT', watson_ovhg=-4, crick_ovhg=-4)
+
+    # Two cuts extract a subsequence
+    seq = Dseq('aaGAATTCaaGAATTCaa', circular=True)
+    EcoRI_cut_2 = ((11, 15), type('DynamicClass', (), {'ovhg': -4})())
+    assert seq.apply_cut(EcoRI_cut, EcoRI_cut_2) == Dseq.from_full_sequence_and_overhangs('AATTCaaGAATT', watson_ovhg=-4, crick_ovhg=-4)
+
+    # Overlapping cuts should return an error
+    seq = Dseq('aaGAATTCaa', circular=True)
+    first_cuts = [
+        ((3, 7), type('DynamicClass', (), {'ovhg': -4})()),
+        ((7, 3), type('DynamicClass', (), {'ovhg': 4})()),
+        # Spanning the origin
+        ((9, 8), type('DynamicClass', (), {'ovhg': -8})()),
+        ((8, 9), type('DynamicClass', (), {'ovhg': 8})()),
+        ]
+    overlapping_cuts = [
+        ((4, 8), type('DynamicClass', (), {'ovhg': -4})()),
+        ((2, 6), type('DynamicClass', (), {'ovhg': -4})()),
+        ((2, 8), type('DynamicClass', (), {'ovhg': -4})()),
+        ((8, 4), type('DynamicClass', (), {'ovhg': 4})()),
+        ((6, 2), type('DynamicClass', (), {'ovhg': 4})()),
+        ((8, 2), type('DynamicClass', (), {'ovhg': 4})()),
+        # Spanning the origin
+        ((7, 6), type('DynamicClass', (), {'ovhg': -8})()),
+        ((6, 7), type('DynamicClass', (), {'ovhg': 8})()),
+    ]
+
+    for first_cut in first_cuts:
+        for second_cut in overlapping_cuts:
+            try:
+                seq.apply_cut(first_cut, second_cut)
+            except ValueError as e:
+                assert e.args[0] == 'Cuts overlap'
+            else:
+                print(first_cut, second_cut)
+                assert False, 'Expected ValueError'
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-vv", "-s"])
diff --git a/tests/test_module_dseqrecord.py b/tests/test_module_dseqrecord.py
@@ -2240,6 +2240,30 @@ def test_assemble_YEp24PGK_XK():
     assert YEp24PGK_XK_correct.seguid() == "cdseguid-hEldsrUV0mBpISw8_xpvnpfYi0g"
     assert eq(YEp24PGK_XK, YEp24PGK_XK_correct)
 
+def test_apply_cut():
+    from pydna.dseqrecord import Dseqrecord
+    from Bio.SeqFeature import SeqFeature, SimpleLocation
+
+    # Single cut case
+    for strand in [1, -1, None]:
+        for cut_coords, cut_ovhg in (((4, 7), -3), ((7, 4), 3)):
+            dummy_cut = (cut_coords, type('DynamicClass', (), {'ovhg': cut_ovhg})())
+            seq = Dseqrecord("acgtATGaatt", circular=True)
+            seq.features.append(SeqFeature(SimpleLocation(4, 7,  strand), id='full_overlap'))
+            seq.features.append(SeqFeature(SimpleLocation(3, 7,  strand), id='left_side'))
+            seq.features.append(SeqFeature(SimpleLocation(4, 8,  strand), id='right_side'))
+            seq.features.append(SeqFeature(SimpleLocation(3, 10, strand), id='throughout'))
+            open_seq = seq.apply_cut(dummy_cut, dummy_cut)
+            assert len(open_seq.features) == 4
+            new_locs = [str(f.location) for f in open_seq.features]
+            if strand == 1:
+                assert new_locs == ['[0:3](+)', '[0:4](+)', '[11:14](+)', '[10:14](+)']
+            elif strand == -1:
+                # TODO: change the join{[11:14](-), [10:11](-)} case?
+                assert new_locs == ['[0:3](-)', '[0:4](-)', '[11:14](-)', 'join{[11:14](-), [10:11](-)}']
+            if strand == None:
+                # TODO: pending on https://github.com/BjornFJohansson/pydna/pull/179
+                assert new_locs == ['[0:3]', '[0:4]', '[11:14]', 'join{[11:14], [10:11]}']
 
 if __name__ == "__main__":
     args = [

diff --git a/tests/test_module_utils.py b/tests/test_module_utils.py
@@ -463,5 +463,11 @@ def close(self):
     assert mf(1, kw=1) == ((1,), {"kw": 1})
 
 
+def test_shift_location():
+    from pydna.utils import shift_location
+
+    # Shifting of locations should be reversible
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-vv", "-s"])