From cb3437fb2b62e9e1836dc26cca058fc78556f690 Mon Sep 17 00:00:00 2001 From: guolidong Date: Sun, 19 Dec 2021 15:02:39 +0800 Subject: [PATCH 1/4] fixed bugs by gld --- circlemap/realigner.py | 59 ++++++++++++++++++++------------------- circlemap/utils.py | 63 +++++++++++++++++++++++++----------------- 2 files changed, 67 insertions(+), 55 deletions(-) diff --git a/circlemap/realigner.py b/circlemap/realigner.py index 9624d19..a9dedb6 100644 --- a/circlemap/realigner.py +++ b/circlemap/realigner.py @@ -172,7 +172,7 @@ def realign(self,peaks): for index,interval in peaks_pd.iterrows(): - + iteration += 1 if check_size_and_write(results,only_discordants,self.output,self.lock,self.directory,self.overlap_fraction,self.pid) == True: results = [] @@ -207,14 +207,13 @@ def realign(self,peaks): disorcordants_per_it = 0 for index,mate_interval in realignment_interval_extended.iterrows(): - iteration += 1 + #sample realignment intervals - #fasta file fetch is 1 based that why I do +1 - plus_coding_interval = genome_fa.fetch(str(mate_interval['chrom']),int(int(mate_interval['start'])+1),int(int(mate_interval['end'])+1)).upper() + plus_coding_interval = genome_fa.fetch(str(mate_interval['chrom']),int(int(mate_interval['start'])),int(int(mate_interval['end']))).upper() interval_length = len(plus_coding_interval) minus_coding_interval = str(Seq(plus_coding_interval).complement()) @@ -282,11 +281,11 @@ def realign(self,peaks): else: #sc length sc_len = len(get_longest_soft_clipped_bases(read)['seq']) + if non_colinearity(int(read.cigar[0][0]),int(read.cigar[-1][0]), + int(read.cigar[0][1]),int(read.cigar[-1][1]), + int(read.pos),int(mate_interval.start),int(mate_interval.end)) == True: - if non_colinearity(int(read.cigar[0][0]),int(read.cigar[-1][0]),int(read.pos), - int(mate_interval.start),int(mate_interval.end)) == True: - if sc_len >= self.min_sc_length: edits_allowed = adaptative_myers_k(sc_len, self.edit_distance_frac) @@ -324,7 +323,7 @@ def realign(self,peaks): iteration_results.append([interval['chrom'], read.reference_start, soft_clip_end+1, read.qname,iteration,float(round(score,2))]) - elif read.reference_start + int(mate_interval['start']) + int( + elif read.reference_start > int(mate_interval['start']) + int( realignment_dict['alignments'][1][0][0]): iteration_results.append([interval['chrom'], soft_clip_start, read_end, read.qname,iteration,float(round(score,2))]) @@ -346,40 +345,42 @@ def realign(self,peaks): else: #discordant reads #R2F1 oriented when iterating trough R2 - if read.is_reverse == True and read.mate_is_reverse == False: - if read.is_read2: - if read.reference_start < read.next_reference_start: - # discordant read - disorcordants_per_it +=1 - iteration_discordants.append([interval['chrom'],read.reference_start,read.next_reference_start + read.infer_query_length(),read.qname]) - + if read.mapq >= self.mapq_cutoff: + if read.is_reverse == True and read.mate_is_reverse == False: + if read.is_read2: + if read.reference_id == read.next_reference_id: + if read.reference_start < read.next_reference_start: + # discordant read + disorcordants_per_it +=1 + iteration_discordants.append([interval['chrom'],read.reference_start,read.next_reference_start + read.infer_query_length(),read.qname]) - #R2F1 when iterating trough F1 - elif read.is_reverse == False and read.mate_is_reverse == True: - if read.is_read2 == False: - if read.next_reference_start < read.reference_start: - disorcordants_per_it +=1 - iteration_discordants.append([interval['chrom'], read.next_reference_start,read.reference_start+read.infer_query_length(), read.qname]) + #R2F1 when iterating trough F1 + elif read.is_reverse == False and read.mate_is_reverse == True: + if read.is_read2 == False: + if read.reference_id == read.next_reference_id: + if read.next_reference_start < read.reference_start: + disorcordants_per_it +=1 + iteration_discordants.append([interval['chrom'], read.next_reference_start,read.reference_start+read.infer_query_length(), read.qname]) #second pass to add discordant read info if len(iteration_results) > 0: - results = results + assign_discordants(iteration_results,iteration_discordants,insert_metrics[0],insert_metrics[1]) - + results = results + assign_discordants(iteration_results,iteration_discordants,insert_metrics[0],insert_metrics[1],False) elif len(iteration_discordants) > 0: - discordant_bed = pd.DataFrame.from_records(iteration_discordants,columns=['chrom','start','end','read']).sort_values(['chrom','start','end']) - discordant_bed = discordant_bed.groupby(merge_bed(discordant_bed)).agg( - {'chrom': 'first', 'start': 'first', 'end': 'last', 'read': 'count'}) + discordant_bed = pd.DataFrame.from_records(iteration_discordants,columns=['chrom','start','end','read']).sort_values(['chrom','start','end']) + + discordant_bed = discordant_bed.groupby(merge_bed(discordant_bed)).agg( + {'chrom': 'first', 'start': 'first', 'end': 'max', 'read': 'nunique'}) - for index,disc_interval in discordant_bed.iterrows(): - only_discordants.append([disc_interval['chrom'],disc_interval['start'],disc_interval['end'],disc_interval['read'],0]) + for index,disc_interval in discordant_bed.iterrows(): + only_discordants.append([disc_interval['chrom'],disc_interval['start'],disc_interval['end'],disc_interval['read'],0]) @@ -414,4 +415,4 @@ def realign(self,peaks): ecc_dna.close() - return([0,0]) \ No newline at end of file + return([0,0]) diff --git a/circlemap/utils.py b/circlemap/utils.py index b165f3a..64e277a 100644 --- a/circlemap/utils.py +++ b/circlemap/utils.py @@ -607,7 +607,7 @@ def get_realignment_intervals(bed_prior,interval_extension,interval_p_cutoff,ver candidate_mates_dataframe = candidate_mates_dataframe.sort_values(by=['chrom', 'start','end'],ascending=[True,True,True]) candidate_mates_dataframe['probability'] = candidate_mates_dataframe.probability.astype(float) - candidate_mates = candidate_mates_dataframe.groupby((candidate_mates_dataframe.end.shift()-candidate_mates_dataframe.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'last','probability':'sum'}) + candidate_mates = candidate_mates_dataframe.groupby((candidate_mates_dataframe.end.shift()-candidate_mates_dataframe.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'max','probability':'sum'}) sum = np.sum(float(x[3]) for index, x in candidate_mates.iterrows()) candidate_mates['probability'] = candidate_mates['probability'] / sum @@ -621,7 +621,7 @@ def get_realignment_intervals(bed_prior,interval_extension,interval_p_cutoff,ver candidate_mates_dataframe['probability'] = candidate_mates_dataframe.probability.astype(float) - candidate_mates = candidate_mates_dataframe.groupby((candidate_mates_dataframe.end.shift()-candidate_mates_dataframe.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'last','probability':'sum'}) + candidate_mates = candidate_mates_dataframe.groupby((candidate_mates_dataframe.end.shift()-candidate_mates_dataframe.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'max','probability':'sum'}) sum = np.sum(float(x[3]) for index,x in candidate_mates.iterrows()) @@ -647,7 +647,7 @@ def get_realignment_intervals(bed_prior,interval_extension,interval_p_cutoff,ver for item,row in candidate_mates.iterrows(): - if ('LR' in orientation) or ('L' and 'R' in orientation): + if ('LR' in orientation) or (('L' in orientation) and ('R' in orientation)): start = row['start'] - interval_extension @@ -675,8 +675,10 @@ def get_realignment_intervals(bed_prior,interval_extension,interval_p_cutoff,ver end = row['end'] + interval_extension extended.append([row['chrom'], row['start'], int(round(end)),float(row['probability'])]) - - return (pd.DataFrame.from_records(extended, columns=['chrom', 'start', 'end','probability'])) + ret_data = pd.DataFrame.from_records(extended, columns=['chrom', 'start', 'end','probability']) + ret_data = ret_data.sort_values(by=['chrom', 'start','end'],ascending=[True,True,True]) + ret_data = ret_data.groupby((ret_data.end.shift()-ret_data.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'max','probability':'first'}) + return (ret_data) else: @@ -687,7 +689,7 @@ def get_realignment_intervals(bed_prior,interval_extension,interval_p_cutoff,ver if interval['probability'] >= interval_p_cutoff: - if ('LR' in orientation) or ('L' and 'R' in orientation): + if ('LR' in orientation) or (('L' in orientation) and ('R' in orientation)): start = interval['start'] - interval_extension @@ -716,9 +718,10 @@ def get_realignment_intervals(bed_prior,interval_extension,interval_p_cutoff,ver extended.append([interval['chrom'], interval['start'], int(round(end)),float(interval['probability'])]) - - return(pd.DataFrame.from_records(extended,columns=['chrom','start','end','probability']).sort_values(by=['probability'],ascending=[False])) - + ret_data = pd.DataFrame.from_records(extended, columns=['chrom', 'start', 'end','probability']) + ret_data = ret_data.sort_values(by=['chrom', 'start','end'],ascending=[True,True,True]) + ret_data = ret_data.groupby((ret_data.end.shift()-ret_data.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'max','probability':'first'}) + return (ret_data) except BaseException as e: @@ -889,8 +892,8 @@ def realign(read,n_hits,plus_strand,minus_strand,plus_base_freqs,minus_base_freq #get soft-clipped read soft_clipped_read = get_longest_soft_clipped_bases(read) - #encoding of DNA and operations A,T,C,G,=,X,DI. THis is done for Numba - nuc_and_ops = np.array([1,2,3,4,5,6,7]) + #encoding of DNA and operations A,T,C,G,=,X,D,I. THis is done for Numba + nuc_and_ops = np.array([1,2,3,4,5,6,7,8]) encoded_nucs = number_encoding(soft_clipped_read['seq']) hits = 0 @@ -905,7 +908,7 @@ def realign(read,n_hits,plus_strand,minus_strand,plus_base_freqs,minus_base_freq while hits < n_hits and min_score >= -10: - alignment = edlib.align(soft_clipped_read['seq'], minus_strand, mode='HW', task='path') + alignment = edlib.align(soft_clipped_read['seq'], plus_strand , mode='HW', task='path') if hits ==0: if alignment['editDistance'] > max_edit: return(None) @@ -919,13 +922,13 @@ def realign(read,n_hits,plus_strand,minus_strand,plus_base_freqs,minus_base_freq mask_bases = 'X' * ( location[1] - location[0]) - minus_strand = minus_strand[:location[0]] + mask_bases + minus_strand[location[1]:] + plus_strand = plus_strand[:location[0]] + mask_bases + plus_strand[location[1]:] hits += 1 score = pssm(phred_to_prob(np.array(soft_clipped_read['qual'],dtype=np.float64)), encoded_nucs, - edlib_cigar_to_iterable(alignment['cigar']),minus_base_freqs,gap_open,gap_extend,nuc_and_ops,verbose) + edlib_cigar_to_iterable(alignment['cigar']),plus_base_freqs,gap_open,gap_extend,nuc_and_ops,verbose) if score < min_score: min_score = score @@ -986,13 +989,15 @@ def edlib_cigar_to_iterable(edlib_cigar): operations = [] for i in re.findall(r'\d+[IDX=]',edlib_cigar): - length.append(int(i[0])) - if i[1] == '=': + length.append(int(i[:-1])) + if i[-1] == '=': operations.append(5) - elif i[1] == 'X': + elif i[-1] == 'X': operations.append(6) - elif i[1] == 'I' or 'D': + elif i[-1] == 'D': operations.append(7) + elif i[-1] == 'I': + operations.append(8) return(np.array(length),np.array(operations)) @@ -1091,10 +1096,12 @@ def pssm(seq_prob,seq_nucl,iterable_cigar,base_freqs,gap_open,gap_extend,nuc_and seq_pos += operation_length - elif operation == nuc_and_ops[6]: + elif operation == nuc_and_ops[6] or operation == nuc_and_ops[7]: #affine gap scoring model indel_penalty += gap_open + gap_extend*(operation_length-1) + if operation == nuc_and_ops[7]: # insertion cost query + seq_pos += operation_length return(np.sum(seq_prob)-indel_penalty) @@ -1491,10 +1498,12 @@ def assign_discordants(split_bed,discordant_bed,insert_mean,insert_std): start_filt = chrom_filt[ (chrom_filt['start'] > row['start']) & ((chrom_filt['start'] - row['start']) < max_dist)] end_filt = start_filt[(start_filt['end'] < row['end']) & ((row['end'] - start_filt['end']) < max_dist)] - - assigned_splits.append( - [row['chrom'], row['start'], row['end'], row['read'], row['iteration'], float(row['score']), len(end_filt)]) - + if len(end_filt)>1: + assigned_splits.append( + [row['chrom'], row['start'], row['end'], row['read'], row['iteration'], float(row['score']), len(np.unique(end_filt['read']))]) + else: + assigned_splits.append( + [row['chrom'], row['start'], row['end'], row['read'], row['iteration'], float(row['score']), len(end_filt)]) return (assigned_splits) else: @@ -1510,14 +1519,15 @@ def adaptative_myers_k(sc_len,edit_frac): """Calculate the edit distance allowed as a function of the read length""" return(float(sc_len*edit_frac)) @jit(nopython=True) -def non_colinearity(read_start_cigar,read_end_cigar,aln_start,mate_interval_start,mate_interval_end): +def non_colinearity(read_start_cigar,read_end_cigar,read_start_cigar_l,read_end_cigar_l, + aln_start,mate_interval_start,mate_interval_end): """Input a read and the mate interval in the graph. The function checks whether the alignment would be linear (splicing) or colinear. Will return false, in order to not attemp realignment. This is mainly thought for skipping deletions and RNA splicing""" #check left soft-clipped - if read_start_cigar == 4: + if (read_start_cigar == 4 and read_end_cigar != 4 )or (read_start_cigar == 4 and read_end_cigar ==4 and (read_start_cigar_l>read_end_cigar_l)): #graph needs to be upstream or looping to itself if int(mate_interval_start) > aln_start: return (True) @@ -1527,7 +1537,7 @@ def non_colinearity(read_start_cigar,read_end_cigar,aln_start,mate_interval_star else: return (False) #check right softclipped - if read_end_cigar == 4: + if (read_end_cigar == 4 and read_start_cigar != 4 ) or (read_start_cigar == 4 and read_end_cigar ==4 and read_start_cigar_l Date: Sun, 19 Dec 2021 15:08:54 +0800 Subject: [PATCH 2/4] Update README.md --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index dc7bd40..9292cc3 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ -# Welcome to Circle-Map official repository! +# This is NOT the Circle-Map official repository! +# I fixed some detail bugs of Circle-Map in this gld_debug branch, enjoy this. + [![PyPI](https://img.shields.io/pypi/v/Circle-Map.svg)](https://pypi.python.org/pypi/Circle-Map) [![Anaconda-Server Badge](https://anaconda.org/bioconda/circle-map/badges/version.svg)](https://anaconda.org/bioconda/circle-map) [![Bioconda Downloads](https://anaconda.org/bioconda/circle-map/badges/downloads.svg)](https://anaconda.org/bioconda/circle-map) @@ -7,7 +9,7 @@ Circle-Map is an easy to install, python package that implements all the steps required to detect extrachromosomal DNA circles. The package contains easy to run algorithms to accurately detect circular DNA formed from mappable and non mappable regions of a genome. - + ## Why should I use Circle-Map? Circle-Map takes as input an alignment of reads to a reference genome (e.g. a *BWA-MEM* generated *BAM* file) and like other methods, it will use those alignments to detect cases were the read has been split into two segments (e.g. split reads) to detect genomic rearrangements supporting a circular DNA structure. From 522e2feaf1b8ef1c59e23223c9662be223f664c2 Mon Sep 17 00:00:00 2001 From: Lidong Date: Mon, 20 Dec 2021 09:34:36 +0800 Subject: [PATCH 3/4] fixed bugs : forget to delete debug parameter --- circlemap/realigner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/circlemap/realigner.py b/circlemap/realigner.py index a9dedb6..cd42e1d 100644 --- a/circlemap/realigner.py +++ b/circlemap/realigner.py @@ -369,7 +369,7 @@ def realign(self,peaks): if len(iteration_results) > 0: - results = results + assign_discordants(iteration_results,iteration_discordants,insert_metrics[0],insert_metrics[1],False) + results = results + assign_discordants(iteration_results,iteration_discordants,insert_metrics[0],insert_metrics[1]) elif len(iteration_discordants) > 0: From 7b8193942779d0855e65006d3eda278102c01d33 Mon Sep 17 00:00:00 2001 From: guolidong Date: Tue, 28 Dec 2021 12:09:18 +0800 Subject: [PATCH 4/4] fixed bugs --- circlemap/utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/circlemap/utils.py b/circlemap/utils.py index 64e277a..f0bdb14 100644 --- a/circlemap/utils.py +++ b/circlemap/utils.py @@ -676,6 +676,8 @@ def get_realignment_intervals(bed_prior,interval_extension,interval_p_cutoff,ver extended.append([row['chrom'], row['start'], int(round(end)),float(row['probability'])]) ret_data = pd.DataFrame.from_records(extended, columns=['chrom', 'start', 'end','probability']) + ret_data['start'] = ret_data.start.astype(int) + ret_data['end'] = ret_data.end.astype(int) ret_data = ret_data.sort_values(by=['chrom', 'start','end'],ascending=[True,True,True]) ret_data = ret_data.groupby((ret_data.end.shift()-ret_data.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'max','probability':'first'}) return (ret_data) @@ -720,6 +722,8 @@ def get_realignment_intervals(bed_prior,interval_extension,interval_p_cutoff,ver ret_data = pd.DataFrame.from_records(extended, columns=['chrom', 'start', 'end','probability']) ret_data = ret_data.sort_values(by=['chrom', 'start','end'],ascending=[True,True,True]) + ret_data['start'] = ret_data.start.astype(int) + ret_data['end'] = ret_data.end.astype(int) ret_data = ret_data.groupby((ret_data.end.shift()-ret_data.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'max','probability':'first'}) return (ret_data) @@ -1289,8 +1293,8 @@ def merge_final_output(bam,results,begin,splits,dir,fraction,pid): for interval in unfiltered_output: if (int(interval[4])+int(interval[3])) >= splits: - if int(interval[1]) != 0: - interval[1] = int(interval[1])+1 + #if int(interval[1]) != 0: + # interval[1] = int(interval[1])+1 filtered.append(interval) filtered_output = bt.BedTool(filtered)