From 485ae193a6acf11a225bd545cad79e3082d53bfd Mon Sep 17 00:00:00 2001
From: pentamorfico <amarrod96@gmail.com>
Date: Tue, 20 Feb 2024 13:48:11 +0100
Subject: [PATCH 1/3] modify blastn parameters

---
 srufinder/blast.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/srufinder/blast.py b/srufinder/blast.py
index bb76c17..de585f5 100644
--- a/srufinder/blast.py
+++ b/srufinder/blast.py
@@ -56,7 +56,12 @@ def run_spacer(self):
 
             # BLASTn
             subprocess.run(['blastn', 
-                            '-task', 'blastn-short', 
+                            '-reward', '3',
+                            '-penalty', '-2',
+                            '-gapopen', '5',
+                            '-gapextend', '5',
+                            '-dust', 'no',
+                            '-evalue', '100',
                             '-word_size', str(self.master.word_size), 
                             '-query', self.master.out+'spacers.fa',
                             '-db', self.master.out+'genome',
@@ -65,3 +70,5 @@ def run_spacer(self):
                             '-qcov_hsp_perc', str(self.master.spacer_coverage),
                             '-out', self.master.out+'blast_spacers.tab',
                             '-num_threads', str(self.master.threads)])
+
+

From 68f920606683b761041234a24c1704204b0b2c56 Mon Sep 17 00:00:00 2001
From: pentamorfico <amarrod96@gmail.com>
Date: Thu, 22 Feb 2024 19:18:22 +0100
Subject: [PATCH 2/3] modify adj cluster to only retain repeats from the same
 type

---
 srufinder/blast.py   |  8 ++++++--
 srufinder/cluster.py | 48 +++++++++++++-------------------------------
 2 files changed, 20 insertions(+), 36 deletions(-)

diff --git a/srufinder/blast.py b/srufinder/blast.py
index de585f5..7bcd5ba 100644
--- a/srufinder/blast.py
+++ b/srufinder/blast.py
@@ -33,7 +33,12 @@ def run(self):
 
         # BLASTn
         subprocess.run(['blastn', 
-                        '-task', 'blastn-short', 
+                        '-reward', '3',
+                        '-penalty', '-2',
+                        '-gapopen', '5',
+                        '-gapextend', '5',
+                        '-dust', 'no',
+                        '-evalue', '100',
                         '-word_size', str(self.master.word_size), 
                         '-query', self.master.repeatdb,
                         '-db', self.master.out+'masked',
@@ -61,7 +66,6 @@ def run_spacer(self):
                             '-gapopen', '5',
                             '-gapextend', '5',
                             '-dust', 'no',
-                            '-evalue', '100',
                             '-word_size', str(self.master.word_size), 
                             '-query', self.master.out+'spacers.fa',
                             '-db', self.master.out+'genome',
diff --git a/srufinder/cluster.py b/srufinder/cluster.py
index 868b6af..789c6dd 100644
--- a/srufinder/cluster.py
+++ b/srufinder/cluster.py
@@ -52,7 +52,7 @@ def run(self):
         # Cluster matches in arrays
         self.cluster_adj()
 
-        # Append partial matches
+        # Append par#
         self.append_partial()
 
         # Round
@@ -168,8 +168,7 @@ def cluster_adj(self):
         '''
         Cluster adjacent matches into arrays
         '''
-
-        logging.info('Clustering matches')
+        logging.info('Clustering repeats')
 
         # Sort by position
         self.df_overlap = self.df_overlap.sort_values('Min')
@@ -183,40 +182,21 @@ def cluster_adj(self):
             self.master.clean()
             sys.exit()
 
-        cluster_df_lst = []
-        cluster = 0
-        # For each contig
-        for i in set(self.df_overlap_compl['Acc']):
-            tmp = self.df_overlap_compl[self.df_overlap_compl['Acc'] == i]
-
-            pos = tmp[['Min','Max']].values
-            cluster_list = []
-            # Loop over complete matches
-            for ind, k in enumerate(pos):
-                # Keep first match
-                if ind == 0:
-                    cluster_list.append(cluster)
-                    arrays_cluster = [k]
-                else:
-                    # If match within Xbp of any previous, add match to current cluster
-                    if min(self.dist_all(k, arrays_cluster)) <= self.master.max_dist:
-                        cluster_list.append(cluster)
-                        arrays_cluster.append(k)
-                    # If match > Xbp from previous, initiate new cluster
-                    else:
-                        cluster += 1
-                        arrays_cluster = [k]
-                        cluster_list.append(cluster)
-            
-            tmp.insert(len(tmp.columns), 'Cluster', cluster_list)
-            cluster_df_lst.append(tmp)
-            
-            # Increment cluster ID for next acc
-            cluster += 1
+        self.df_overlap_compl["Repeat_type"] = self.df_overlap_compl["Repeat"].str.split(':').str[0]
+        #cluster all repeats with the same type, and then group those from the same type less than 100bp apart
+        self.df_overlap_compl = self.df_overlap_compl.sort_values(['Acc', 'Repeat_type', 'Min'])
+        #cluster those that are less than 100bp apart from the previous one (that is, min(n+1) - max(n) < 100)
+        #create a column with the difference between the Min value of the current row and the Max value of the previous row
+        self.df_overlap_compl['Min_diff'] = self.df_overlap_compl['Min'] - self.df_overlap_compl['Max'].shift(1)
+        #cluster those contiguous repeats that are less than 100bp apart (absolute value)
+        self.df_overlap_compl['Cluster'] = (self.df_overlap_compl['Min_diff'].abs() > 100).cumsum()
+        #get clusters with more than 3 repeats
 
+        self.df_overlap_compl = self.df_overlap_compl.sort_values(['Acc','Min'])
+        self.df_overlap_compl = self.df_overlap_compl.reset_index(drop=True)
 
         # If several contigs, concatenate
-        self.df_cluster = pd.concat(cluster_df_lst)
+        self.df_cluster = self.df_overlap_compl
 
     def append_partial(self):
         '''

From a4fcef01fda3d4cf56c0a1542ab39f2fa003852e Mon Sep 17 00:00:00 2001
From: pentamorfico <amarrod96@gmail.com>
Date: Fri, 23 Feb 2024 01:01:39 +0100
Subject: [PATCH 3/3] keep repeats >40%seq id, similar type

---
 srufinder/cluster.py    | 51 ++++++++++++++++++++++++++++++++++++-----
 srufinder/controller.py |  2 ++
 2 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/srufinder/cluster.py b/srufinder/cluster.py
index 789c6dd..9f3a968 100644
--- a/srufinder/cluster.py
+++ b/srufinder/cluster.py
@@ -6,9 +6,10 @@
 
 import pandas as pd
 
-from Bio import pairwise2
+from Bio import Align
 from Bio import SeqIO
 from Bio.Seq import Seq
+from Bio import pairwise2
 
 class Cluster(object):
     
@@ -116,8 +117,29 @@ def identity(self,x,y):
         '''
         Calculate identity between two sequences
         '''
-        align = pairwise2.align.globalxs(x, y, -1, -1, penalize_end_gaps=False)
-        return(align[0][2]/min(len(x), len(y))*100)
+        aligner = Align.PairwiseAligner()
+
+        aligner.match_score = 1.0 
+        aligner.mismatch_score = -2.0
+        aligner.gap_score = -10
+        aligner.extend_gap_score = -10
+
+        alignments = aligner.align(x, y)
+
+        best_alignment = alignments[0]
+
+        # Extract aligned sequences from the best alignment
+        aligned_seq1, aligned_seq2 = best_alignment
+
+        matches = sum(1 for a, b in zip(aligned_seq1, aligned_seq2) if a == b)
+        #compute lengths of aligned sequences without gaps
+        len1 = len(aligned_seq1.replace('-', ''))
+        len2 = len(aligned_seq2.replace('-', ''))
+        alignment_length = min(len1, len2)
+
+        # Calculate pidentity
+        pidentity = (matches / alignment_length) * 100 if alignment_length > 0 else 0
+        return pidentity
 
     def identity_all(self,x,ll):
         '''
@@ -168,6 +190,9 @@ def cluster_adj(self):
         '''
         Cluster adjacent matches into arrays
         '''
+        pd.set_option('display.max_rows', None)  # None means unlimited
+        pd.set_option('display.max_columns', None)  # Adjust as per your DataFrame's number of columns
+
         logging.info('Clustering repeats')
 
         # Sort by position
@@ -189,12 +214,26 @@ def cluster_adj(self):
         #create a column with the difference between the Min value of the current row and the Max value of the previous row
         self.df_overlap_compl['Min_diff'] = self.df_overlap_compl['Min'] - self.df_overlap_compl['Max'].shift(1)
         #cluster those contiguous repeats that are less than 100bp apart (absolute value)
-        self.df_overlap_compl['Cluster'] = (self.df_overlap_compl['Min_diff'].abs() > 100).cumsum()
-        #get clusters with more than 3 repeats
+        self.df_overlap_compl['Cluster'] = (self.df_overlap_compl['Min_diff'].abs() > self.master.max_dist).cumsum()
+
+        # iterate over the clusters, and compute similarity between the sequences of the cluster. remove rows with similarity <40% to any other sequence in the cluster
+        to_remove = []
+        clusters = self.df_overlap_compl.groupby('Cluster')
+        for name, group in clusters:
+            if len(group) > 1:
+                sequences = group["Sequence"].tolist()
+                
+                for i in range(len(group)):
 
+                    similarities = self.identity_all(sequences[i], [x for j, x in enumerate(sequences) if j != i])
+                    if not any([similarity >= 40 for similarity in similarities]):
+                        to_remove.append(group.index[i])
+        
+
+
+        self.df_overlap_compl = self.df_overlap_compl.drop(to_remove)
         self.df_overlap_compl = self.df_overlap_compl.sort_values(['Acc','Min'])
         self.df_overlap_compl = self.df_overlap_compl.reset_index(drop=True)
-
         # If several contigs, concatenate
         self.df_cluster = self.df_overlap_compl
 
diff --git a/srufinder/controller.py b/srufinder/controller.py
index 4967e26..4251209 100644
--- a/srufinder/controller.py
+++ b/srufinder/controller.py
@@ -7,6 +7,8 @@
 import pandas as pd
 
 from Bio import SeqIO
+from Bio import Align
+
 
 class Controller(object):