Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cutsite pairs bugs #183

Merged
merged 5 commits into from
Jan 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions scripts/cutsite_pairs.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# New cut implementation\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Coming soon\n",
"# seq = Dseq('aaGAATTCaa', circular=True)\n",
"\n",
"# print('EcORI', EcoRI.ovhg, len(seq))\n",
"# for shift in range(len(seq)):\n",
"# seq_shifted = seq.shifted(shift)\n",
"# cut_site = seq_shifted.get_cutsites(EcoRI)[0][0]\n",
"# print(shift, seq_shifted, cut_site, cut_site[0] - cut_site[1])\n",
"\n",
"# seq = Dseq('ccTTAATTAAcc', circular=True)\n",
"# print('PacI', PacI.ovhg, len(seq))\n",
"# for shift in range(len(seq)):\n",
"# seq_shifted = seq.shifted(shift)\n",
"# cut_site = seq_shifted.get_cutsites(PacI)[0][0]\n",
"# print(shift, seq_shifted, cut_site, cut_site[0] - cut_site[1])\n",
"\n",
"\n",
"# seq = Dseq('TTAAccccTTAA', circular=True)\n",
"# custom_cut = ((1, 11), type('DynamicClass', (), {'ovhg': 2})())\n",
"# print(seq.apply_cut(custom_cut, custom_cut).__repr__())\n",
"\n",
"# print()\n",
"\n",
"# custom_cut = ((1, 11), type('DynamicClass', (), {'ovhg': -10})())\n",
"# print(seq.apply_cut(custom_cut, custom_cut).__repr__())"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
44 changes: 39 additions & 5 deletions src/pydna/dseq.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@
from pydna.utils import cseguid as _cseg
from pydna.utils import rc as _rc
from pydna.utils import flatten as _flatten
from pydna.common_sub_strings import common_sub_strings as _common_sub_strings
from pydna.utils import cuts_overlap as _cuts_overlap

from pydna.common_sub_strings import common_sub_strings as _common_sub_strings
from Bio.Restriction import RestrictionBatch as _RestrictionBatch
from Bio.Restriction import CommOnly

Expand Down Expand Up @@ -1484,11 +1485,44 @@ def get_cutsites(self, *enzymes):

return sorted(out)

def apply_cut(self, left_cut, right_cut):
def left_end_position(self) -> tuple[int, int]:
"""The index in the global sequence of the watson and crick start positions.

Global sequence (str(self)) for all three cases is AAA

```
AAA AA AAT
TT TTT TTT
Returns (0, 1) Returns (1, 0) Returns (0, 0)
```

"""
if self.ovhg > 0:
return self.ovhg, 0
return 0, -self.ovhg

left_watson, left_crick = left_cut[0] if left_cut is not None else ((self.ovhg, 0) if self.ovhg > 0 else (0, -self.ovhg))
ovhg = self.ovhg if left_cut is None else left_cut[1].ovhg
right_watson, right_crick = right_cut[0] if right_cut is not None else (len(self.watson), len(self.crick))
def right_end_position(self) -> tuple[int, int]:
"""The index in the global sequence of the watson and crick end positions.

Global sequence (str(self)) for all three cases is AAA

```
AAA AA AAA
TT TTT TTT
Returns (3, 2) Returns (2, 3) Returns (3, 3)
```

"""
if self.watson_ovhg() < 0:
return len(self) + self.watson_ovhg(), len(self)
return len(self), len(self) - self.watson_ovhg()

def apply_cut(self, left_cut, right_cut):
if _cuts_overlap(left_cut, right_cut, len(self)):
raise ValueError("Cuts overlap")
left_watson, left_crick = left_cut[0] if left_cut is not None else self.left_end_position()
ovhg = left_cut[1].ovhg if left_cut is not None else self.ovhg
right_watson, right_crick = right_cut[0] if right_cut is not None else self.right_end_position()
return Dseq(
str(self[left_watson:right_watson]),
# The line below could be easier to understand as _rc(str(self[left_crick:right_crick])), but it does not preserve the case
Expand Down
57 changes: 55 additions & 2 deletions src/pydna/dseqrecord.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,12 @@
from Bio.Restriction import CommOnly
from pydna.dseq import Dseq as _Dseq
from pydna._pretty import pretty_str as _pretty_str
from pydna.utils import flatten as _flatten
from pydna.utils import flatten as _flatten, location_boundaries as _location_boundaries

# from pydna.utils import memorize as _memorize
from pydna.utils import rc as _rc
from pydna.utils import shift_location as _shift_location
from pydna.utils import shift_feature as _shift_feature
from pydna.common_sub_strings import common_sub_strings as _common_sub_strings
from Bio.SeqFeature import SeqFeature as _SeqFeature
from Bio import SeqIO
Expand Down Expand Up @@ -900,6 +901,7 @@ def __getitem__(self, sl):
# origin-spanning features should only be included after shifting
# in cases where the slice comprises the entire sequence, but then
# sl_start == sl_stop and the second condition is not met
# TODO: _location_boundaries
answer.features = [f for f in answer.features if (
f.location.parts[-1].end <= answer.seq.length and
f.location.parts[0].start <= f.location.parts[-1].end)]
Expand Down Expand Up @@ -1343,7 +1345,58 @@ def apply_cut(self, left_cut, right_cut):
# Not really a cut, but to handle the general case
if left_cut is None:
features = self.features
features = self.shifted(min(left_cut[0])).features
else:
# The features that span the origin if shifting with left_cut, but that do not cross
# the cut site should be included, and if there is a feature within the cut site, it should
# be duplicated. See https://github.com/BjornFJohansson/pydna/issues/180 for a practical example.
#
# Let's say we are going to open a circular plasmid like below (| inidicate cuts, numbers indicate
# features)
#
# 3333|3
# 1111
# 000
# XXXXatg|YYY
# XXX|tacYYYY
# 000
# 2222
#
features = self.shifted(min(left_cut[0])).features
# for f in features:
# print(f.id, f.location, _location_boundaries(f.location))
# Here, we have done what's shown below (* indicates the origin).
# The features 0 and 2 have the right location for the final product:
#
# 3*3333
# 1*111
# XXXX*atgYYY
# XXXX*tacYYY
# 000
# 2222

features_need_transfer = [f for f in features if (_location_boundaries(f.location)[1] <= abs(left_cut[1].ovhg))]
features_need_transfer = [_shift_feature(f, -abs(left_cut[1].ovhg), len(self)) for f in features_need_transfer]
# ^ ^^^^^^^^^
# Now we have shifted the features that end before the cut (0 and 1, but not 3), as if
# they referred to the below sequence (* indicates the origin):
#
# 1111
# 000
# XXXXatg*YYY
# XXXXtac*YYY
#
# The features 0 and 1 would have the right location if the final sequence had the same length
# as the original one. However, the final product is longer because of the overhang.

features += [_shift_feature(f, abs(left_cut[1].ovhg), len(dseq)) for f in features_need_transfer]
# ^ ^^^^^^^^^
# So we shift back by the same amount in the opposite direction, but this time we pass the
# length of the final product.
# print(*features, sep='\n')
# Features like 3 are removed here
features = [f for f in features if (
_location_boundaries(f.location)[1] <= len(dseq) and
_location_boundaries(f.location)[0] <= _location_boundaries(f.location)[1])]
else:
left_watson, left_crick = left_cut[0] if left_cut is not None else (0, 0)
right_watson, right_crick = right_cut[0] if right_cut is not None else (None, None)
Expand Down
40 changes: 40 additions & 0 deletions src/pydna/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import keyword as _keyword
import collections as _collections
import itertools as _itertools
from copy import deepcopy as _deepcopy

import sys as _sys
import re
Expand Down Expand Up @@ -71,6 +72,14 @@ def shift_location(original_location, shift, lim):
assert len(newloc) == len(original_location)
return newloc

def shift_feature(feature, shift, lim):
"""Return a new feature with shifted location."""
# TODO: Missing tests
new_location = shift_location(feature.location, shift, lim)
new_feature = _deepcopy(feature)
new_feature.location = new_location
return new_feature


# def smallest_rotation(s):
# """Smallest rotation of a string.
Expand Down Expand Up @@ -800,6 +809,37 @@ def eq(*args, **kwargs):
same = False
return same

def cuts_overlap(left_cut, right_cut, seq_len):
# Special cases:
if left_cut is None or right_cut is None or left_cut == right_cut:
return False

# This block of code would not be necessary if the cuts were
# initially represented like this
(left_watson, _), enz1 = left_cut
(right_watson, _), enz2 = right_cut
left_crick = left_watson - enz1.ovhg
right_crick = right_watson - enz2.ovhg
if left_crick >= seq_len:
left_crick -= seq_len
left_watson -= seq_len
if right_crick >= seq_len:
right_crick -= seq_len
right_watson -= seq_len

# Convert into ranges x and y and see if ranges overlap
x = sorted([left_watson, left_crick])
y = sorted([right_watson, right_crick])
return (x[1] > y[0]) != (y[1] < x[0])

def location_boundaries(loc: _sl|_cl):

#TODO: pending on https://github.com/BjornFJohansson/pydna/pull/179
if loc.strand != 1:
return loc.parts[-1].start, loc.parts[0].end
else:
return loc.parts[0].start, loc.parts[-1].end


if __name__ == "__main__":
cached = _os.getenv("pydna_cached_funcs", "")
Expand Down
90 changes: 90 additions & 0 deletions tests/test_module_dseq.py
Original file line number Diff line number Diff line change
Expand Up @@ -789,5 +789,95 @@ def test_from_full_sequence_and_overhangs():
assert dseq_2.watson_ovhg() == watson_ovhg


def test_right_end_position():

from pydna.dseq import Dseq

test_cases = [
("AAA", "TT", (3, 2)),
("AA", "TTT", (2, 3)),
("AAA", "TTT", (3, 3)),
]
for watson, crick, expected in test_cases:
dseq = Dseq(watson, crick, ovhg=0, circular=False)
assert dseq.right_end_position() == expected

def test_left_end_position():

from pydna.dseq import Dseq

test_cases = [
("AAA", "TT", (0, 1), -1),
("AA", "TTT", (1, 0), 1),
("AAT", "TTT", (0, 0), 0),
]
for watson, crick, expected, ovhg in test_cases:
dseq = Dseq(watson, crick, ovhg=ovhg, circular=False)
assert dseq.left_end_position() == expected


def test_apply_cut():
from pydna.dseq import Dseq

seq = Dseq('aaGAATTCaa', circular=False)

# A cut where both sides are None returns the same sequence
assert seq.apply_cut(None, None) == seq

# A cut where one side is None leaves that side intact
EcoRI_cut = ((3, 7), type('DynamicClass', (), {'ovhg': -4})())
assert seq.apply_cut(None, EcoRI_cut) == Dseq.from_full_sequence_and_overhangs('aaGAATT', watson_ovhg=-4, crick_ovhg=0)
assert seq.apply_cut(EcoRI_cut, None) == Dseq.from_full_sequence_and_overhangs('AATTCaa', watson_ovhg=0, crick_ovhg=-4)

# It respects the original overhang
seq = Dseq.from_full_sequence_and_overhangs('aaGAATTCaa', watson_ovhg=1, crick_ovhg=1)
assert seq.apply_cut(None, EcoRI_cut) == Dseq.from_full_sequence_and_overhangs('aaGAATT', watson_ovhg=-4, crick_ovhg=1)
assert seq.apply_cut(EcoRI_cut, None) == Dseq.from_full_sequence_and_overhangs('AATTCaa', watson_ovhg=1, crick_ovhg=-4)

seq = Dseq.from_full_sequence_and_overhangs('aaGAATTCaa', watson_ovhg=-1, crick_ovhg=-1)
assert seq.apply_cut(None, EcoRI_cut) == Dseq.from_full_sequence_and_overhangs('aaGAATT', watson_ovhg=-4, crick_ovhg=-1)
assert seq.apply_cut(EcoRI_cut, None) == Dseq.from_full_sequence_and_overhangs('AATTCaa', watson_ovhg=-1, crick_ovhg=-4)

# A repeated cut in a circular molecule opens it up
seq = Dseq('aaGAATTCaa', circular=True)
assert seq.apply_cut(EcoRI_cut, EcoRI_cut) == Dseq.from_full_sequence_and_overhangs('AATTCaaaaGAATT', watson_ovhg=-4, crick_ovhg=-4)

# Two cuts extract a subsequence
seq = Dseq('aaGAATTCaaGAATTCaa', circular=True)
EcoRI_cut_2 = ((11, 15), type('DynamicClass', (), {'ovhg': -4})())
assert seq.apply_cut(EcoRI_cut, EcoRI_cut_2) == Dseq.from_full_sequence_and_overhangs('AATTCaaGAATT', watson_ovhg=-4, crick_ovhg=-4)

# Overlapping cuts should return an error
seq = Dseq('aaGAATTCaa', circular=True)
first_cuts = [
((3, 7), type('DynamicClass', (), {'ovhg': -4})()),
((7, 3), type('DynamicClass', (), {'ovhg': 4})()),
# Spanning the origin
((9, 8), type('DynamicClass', (), {'ovhg': -8})()),
((8, 9), type('DynamicClass', (), {'ovhg': 8})()),
]
overlapping_cuts = [
((4, 8), type('DynamicClass', (), {'ovhg': -4})()),
((2, 6), type('DynamicClass', (), {'ovhg': -4})()),
((2, 8), type('DynamicClass', (), {'ovhg': -4})()),
((8, 4), type('DynamicClass', (), {'ovhg': 4})()),
((6, 2), type('DynamicClass', (), {'ovhg': 4})()),
((8, 2), type('DynamicClass', (), {'ovhg': 4})()),
# Spanning the origin
((7, 6), type('DynamicClass', (), {'ovhg': -8})()),
((6, 7), type('DynamicClass', (), {'ovhg': 8})()),
]

for first_cut in first_cuts:
for second_cut in overlapping_cuts:
try:
seq.apply_cut(first_cut, second_cut)
except ValueError as e:
assert e.args[0] == 'Cuts overlap'
else:
print(first_cut, second_cut)
assert False, 'Expected ValueError'


if __name__ == "__main__":
pytest.main([__file__, "-vv", "-s"])
24 changes: 24 additions & 0 deletions tests/test_module_dseqrecord.py
Original file line number Diff line number Diff line change
Expand Up @@ -2237,6 +2237,30 @@ def test_assemble_YEp24PGK_XK():
assert YEp24PGK_XK_correct.cseguid() == "t9fs_9UvEuD-Ankyy8XEr1hD5DQ"
assert eq(YEp24PGK_XK, YEp24PGK_XK_correct)

def test_apply_cut():
from pydna.dseqrecord import Dseqrecord
from Bio.SeqFeature import SeqFeature, SimpleLocation

# Single cut case
for strand in [1, -1, None]:
for cut_coords, cut_ovhg in (((4, 7), -3), ((7, 4), 3)):
dummy_cut = (cut_coords, type('DynamicClass', (), {'ovhg': cut_ovhg})())
seq = Dseqrecord("acgtATGaatt", circular=True)
seq.features.append(SeqFeature(SimpleLocation(4, 7, strand), id='full_overlap'))
seq.features.append(SeqFeature(SimpleLocation(3, 7, strand), id='left_side'))
seq.features.append(SeqFeature(SimpleLocation(4, 8, strand), id='right_side'))
seq.features.append(SeqFeature(SimpleLocation(3, 10, strand), id='throughout'))
open_seq = seq.apply_cut(dummy_cut, dummy_cut)
assert len(open_seq.features) == 4
new_locs = [str(f.location) for f in open_seq.features]
if strand == 1:
assert new_locs == ['[0:3](+)', '[0:4](+)', '[11:14](+)', '[10:14](+)']
elif strand == -1:
# TODO: change the join{[11:14](-), [10:11](-)} case?
assert new_locs == ['[0:3](-)', '[0:4](-)', '[11:14](-)', 'join{[11:14](-), [10:11](-)}']
if strand == None:
# TODO: pending on https://github.com/BjornFJohansson/pydna/pull/179
assert new_locs == ['[0:3]', '[0:4]', '[11:14]', 'join{[11:14], [10:11]}']

if __name__ == "__main__":
args = [
Expand Down
6 changes: 6 additions & 0 deletions tests/test_module_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,5 +457,11 @@ def close(self):
assert mf(1, kw=1) == ((1,), {"kw": 1})


def test_shift_location():
from pydna.utils import shift_location

# Shifting of locations should be reversible


if __name__ == "__main__":
pytest.main([__file__, "-vv", "-s"])
Loading