diff --git a/README.md b/README.md index dc7bd40..9292cc3 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ -# Welcome to Circle-Map official repository! +# This is NOT the official Circle-Map repository! +# This gld_debug branch fixes several small bugs in Circle-Map — enjoy. + [![PyPI](https://img.shields.io/pypi/v/Circle-Map.svg)](https://pypi.python.org/pypi/Circle-Map) [![Anaconda-Server Badge](https://anaconda.org/bioconda/circle-map/badges/version.svg)](https://anaconda.org/bioconda/circle-map) [![Bioconda Downloads](https://anaconda.org/bioconda/circle-map/badges/downloads.svg)](https://anaconda.org/bioconda/circle-map) @@ -7,7 +9,7 @@ Circle-Map is an easy to install, python package that implements all the steps required to detect extrachromosomal DNA circles. The package contains easy to run algorithms to accurately detect circular DNA formed from mappable and non mappable regions of a genome. - + ## Why should I use Circle-Map? Circle-Map takes as input an alignment of reads to a reference genome (e.g. a *BWA-MEM* generated *BAM* file) and like other methods, it will use those alignments to detect cases were the read has been split into two segments (e.g. split reads) to detect genomic rearrangements supporting a circular DNA structure. 
diff --git a/circlemap/realigner.py b/circlemap/realigner.py index 9624d19..cd42e1d 100644 --- a/circlemap/realigner.py +++ b/circlemap/realigner.py @@ -172,7 +172,7 @@ def realign(self,peaks): for index,interval in peaks_pd.iterrows(): - + iteration += 1 if check_size_and_write(results,only_discordants,self.output,self.lock,self.directory,self.overlap_fraction,self.pid) == True: results = [] @@ -207,14 +207,13 @@ def realign(self,peaks): disorcordants_per_it = 0 for index,mate_interval in realignment_interval_extended.iterrows(): - iteration += 1 + #sample realignment intervals - #fasta file fetch is 1 based that why I do +1 - plus_coding_interval = genome_fa.fetch(str(mate_interval['chrom']),int(int(mate_interval['start'])+1),int(int(mate_interval['end'])+1)).upper() + plus_coding_interval = genome_fa.fetch(str(mate_interval['chrom']),int(int(mate_interval['start'])),int(int(mate_interval['end']))).upper() interval_length = len(plus_coding_interval) minus_coding_interval = str(Seq(plus_coding_interval).complement()) @@ -282,11 +281,11 @@ def realign(self,peaks): else: #sc length sc_len = len(get_longest_soft_clipped_bases(read)['seq']) + if non_colinearity(int(read.cigar[0][0]),int(read.cigar[-1][0]), + int(read.cigar[0][1]),int(read.cigar[-1][1]), + int(read.pos),int(mate_interval.start),int(mate_interval.end)) == True: - if non_colinearity(int(read.cigar[0][0]),int(read.cigar[-1][0]),int(read.pos), - int(mate_interval.start),int(mate_interval.end)) == True: - if sc_len >= self.min_sc_length: edits_allowed = adaptative_myers_k(sc_len, self.edit_distance_frac) @@ -324,7 +323,7 @@ def realign(self,peaks): iteration_results.append([interval['chrom'], read.reference_start, soft_clip_end+1, read.qname,iteration,float(round(score,2))]) - elif read.reference_start + int(mate_interval['start']) + int( + elif read.reference_start > int(mate_interval['start']) + int( realignment_dict['alignments'][1][0][0]): iteration_results.append([interval['chrom'], soft_clip_start, 
read_end, read.qname,iteration,float(round(score,2))]) @@ -346,23 +345,25 @@ def realign(self,peaks): else: #discordant reads #R2F1 oriented when iterating trough R2 - if read.is_reverse == True and read.mate_is_reverse == False: - if read.is_read2: - if read.reference_start < read.next_reference_start: - # discordant read - disorcordants_per_it +=1 - iteration_discordants.append([interval['chrom'],read.reference_start,read.next_reference_start + read.infer_query_length(),read.qname]) - + if read.mapq >= self.mapq_cutoff: + if read.is_reverse == True and read.mate_is_reverse == False: + if read.is_read2: + if read.reference_id == read.next_reference_id: + if read.reference_start < read.next_reference_start: + # discordant read + disorcordants_per_it +=1 + iteration_discordants.append([interval['chrom'],read.reference_start,read.next_reference_start + read.infer_query_length(),read.qname]) - #R2F1 when iterating trough F1 - elif read.is_reverse == False and read.mate_is_reverse == True: - if read.is_read2 == False: - if read.next_reference_start < read.reference_start: - disorcordants_per_it +=1 - iteration_discordants.append([interval['chrom'], read.next_reference_start,read.reference_start+read.infer_query_length(), read.qname]) + #R2F1 when iterating trough F1 + elif read.is_reverse == False and read.mate_is_reverse == True: + if read.is_read2 == False: + if read.reference_id == read.next_reference_id: + if read.next_reference_start < read.reference_start: + disorcordants_per_it +=1 + iteration_discordants.append([interval['chrom'], read.next_reference_start,read.reference_start+read.infer_query_length(), read.qname]) #second pass to add discordant read info if len(iteration_results) > 0: @@ -370,16 +371,16 @@ def realign(self,peaks): results = results + assign_discordants(iteration_results,iteration_discordants,insert_metrics[0],insert_metrics[1]) - elif len(iteration_discordants) > 0: - discordant_bed = 
pd.DataFrame.from_records(iteration_discordants,columns=['chrom','start','end','read']).sort_values(['chrom','start','end']) - discordant_bed = discordant_bed.groupby(merge_bed(discordant_bed)).agg( - {'chrom': 'first', 'start': 'first', 'end': 'last', 'read': 'count'}) + discordant_bed = pd.DataFrame.from_records(iteration_discordants,columns=['chrom','start','end','read']).sort_values(['chrom','start','end']) + + discordant_bed = discordant_bed.groupby(merge_bed(discordant_bed)).agg( + {'chrom': 'first', 'start': 'first', 'end': 'max', 'read': 'nunique'}) - for index,disc_interval in discordant_bed.iterrows(): - only_discordants.append([disc_interval['chrom'],disc_interval['start'],disc_interval['end'],disc_interval['read'],0]) + for index,disc_interval in discordant_bed.iterrows(): + only_discordants.append([disc_interval['chrom'],disc_interval['start'],disc_interval['end'],disc_interval['read'],0]) @@ -414,4 +415,4 @@ def realign(self,peaks): ecc_dna.close() - return([0,0]) \ No newline at end of file + return([0,0]) diff --git a/circlemap/utils.py b/circlemap/utils.py index b165f3a..f0bdb14 100644 --- a/circlemap/utils.py +++ b/circlemap/utils.py @@ -607,7 +607,7 @@ def get_realignment_intervals(bed_prior,interval_extension,interval_p_cutoff,ver candidate_mates_dataframe = candidate_mates_dataframe.sort_values(by=['chrom', 'start','end'],ascending=[True,True,True]) candidate_mates_dataframe['probability'] = candidate_mates_dataframe.probability.astype(float) - candidate_mates = candidate_mates_dataframe.groupby((candidate_mates_dataframe.end.shift()-candidate_mates_dataframe.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'last','probability':'sum'}) + candidate_mates = candidate_mates_dataframe.groupby((candidate_mates_dataframe.end.shift()-candidate_mates_dataframe.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'max','probability':'sum'}) sum = np.sum(float(x[3]) for index, x in candidate_mates.iterrows()) 
candidate_mates['probability'] = candidate_mates['probability'] / sum @@ -621,7 +621,7 @@ def get_realignment_intervals(bed_prior,interval_extension,interval_p_cutoff,ver candidate_mates_dataframe['probability'] = candidate_mates_dataframe.probability.astype(float) - candidate_mates = candidate_mates_dataframe.groupby((candidate_mates_dataframe.end.shift()-candidate_mates_dataframe.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'last','probability':'sum'}) + candidate_mates = candidate_mates_dataframe.groupby((candidate_mates_dataframe.end.shift()-candidate_mates_dataframe.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'max','probability':'sum'}) sum = np.sum(float(x[3]) for index,x in candidate_mates.iterrows()) @@ -647,7 +647,7 @@ def get_realignment_intervals(bed_prior,interval_extension,interval_p_cutoff,ver for item,row in candidate_mates.iterrows(): - if ('LR' in orientation) or ('L' and 'R' in orientation): + if ('LR' in orientation) or (('L' in orientation) and ('R' in orientation)): start = row['start'] - interval_extension @@ -675,8 +675,12 @@ def get_realignment_intervals(bed_prior,interval_extension,interval_p_cutoff,ver end = row['end'] + interval_extension extended.append([row['chrom'], row['start'], int(round(end)),float(row['probability'])]) - - return (pd.DataFrame.from_records(extended, columns=['chrom', 'start', 'end','probability'])) + ret_data = pd.DataFrame.from_records(extended, columns=['chrom', 'start', 'end','probability']) + ret_data['start'] = ret_data.start.astype(int) + ret_data['end'] = ret_data.end.astype(int) + ret_data = ret_data.sort_values(by=['chrom', 'start','end'],ascending=[True,True,True]) + ret_data = ret_data.groupby((ret_data.end.shift()-ret_data.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'max','probability':'first'}) + return (ret_data) else: @@ -687,7 +691,7 @@ def get_realignment_intervals(bed_prior,interval_extension,interval_p_cutoff,ver if 
interval['probability'] >= interval_p_cutoff: - if ('LR' in orientation) or ('L' and 'R' in orientation): + if ('LR' in orientation) or (('L' in orientation) and ('R' in orientation)): start = interval['start'] - interval_extension @@ -716,9 +720,12 @@ def get_realignment_intervals(bed_prior,interval_extension,interval_p_cutoff,ver extended.append([interval['chrom'], interval['start'], int(round(end)),float(interval['probability'])]) - - return(pd.DataFrame.from_records(extended,columns=['chrom','start','end','probability']).sort_values(by=['probability'],ascending=[False])) - + ret_data = pd.DataFrame.from_records(extended, columns=['chrom', 'start', 'end','probability']) + ret_data = ret_data.sort_values(by=['chrom', 'start','end'],ascending=[True,True,True]) + ret_data['start'] = ret_data.start.astype(int) + ret_data['end'] = ret_data.end.astype(int) + ret_data = ret_data.groupby((ret_data.end.shift()-ret_data.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'max','probability':'first'}) + return (ret_data) except BaseException as e: @@ -889,8 +896,8 @@ def realign(read,n_hits,plus_strand,minus_strand,plus_base_freqs,minus_base_freq #get soft-clipped read soft_clipped_read = get_longest_soft_clipped_bases(read) - #encoding of DNA and operations A,T,C,G,=,X,DI. THis is done for Numba - nuc_and_ops = np.array([1,2,3,4,5,6,7]) + #encoding of DNA and operations A,T,C,G,=,X,D,I. 
THis is done for Numba + nuc_and_ops = np.array([1,2,3,4,5,6,7,8]) encoded_nucs = number_encoding(soft_clipped_read['seq']) hits = 0 @@ -905,7 +912,7 @@ def realign(read,n_hits,plus_strand,minus_strand,plus_base_freqs,minus_base_freq while hits < n_hits and min_score >= -10: - alignment = edlib.align(soft_clipped_read['seq'], minus_strand, mode='HW', task='path') + alignment = edlib.align(soft_clipped_read['seq'], plus_strand , mode='HW', task='path') if hits ==0: if alignment['editDistance'] > max_edit: return(None) @@ -919,13 +926,13 @@ def realign(read,n_hits,plus_strand,minus_strand,plus_base_freqs,minus_base_freq mask_bases = 'X' * ( location[1] - location[0]) - minus_strand = minus_strand[:location[0]] + mask_bases + minus_strand[location[1]:] + plus_strand = plus_strand[:location[0]] + mask_bases + plus_strand[location[1]:] hits += 1 score = pssm(phred_to_prob(np.array(soft_clipped_read['qual'],dtype=np.float64)), encoded_nucs, - edlib_cigar_to_iterable(alignment['cigar']),minus_base_freqs,gap_open,gap_extend,nuc_and_ops,verbose) + edlib_cigar_to_iterable(alignment['cigar']),plus_base_freqs,gap_open,gap_extend,nuc_and_ops,verbose) if score < min_score: min_score = score @@ -986,13 +993,15 @@ def edlib_cigar_to_iterable(edlib_cigar): operations = [] for i in re.findall(r'\d+[IDX=]',edlib_cigar): - length.append(int(i[0])) - if i[1] == '=': + length.append(int(i[:-1])) + if i[-1] == '=': operations.append(5) - elif i[1] == 'X': + elif i[-1] == 'X': operations.append(6) - elif i[1] == 'I' or 'D': + elif i[-1] == 'D': operations.append(7) + elif i[-1] == 'I': + operations.append(8) return(np.array(length),np.array(operations)) @@ -1091,10 +1100,12 @@ def pssm(seq_prob,seq_nucl,iterable_cigar,base_freqs,gap_open,gap_extend,nuc_and seq_pos += operation_length - elif operation == nuc_and_ops[6]: + elif operation == nuc_and_ops[6] or operation == nuc_and_ops[7]: #affine gap scoring model indel_penalty += gap_open + gap_extend*(operation_length-1) + if operation == 
nuc_and_ops[7]: # insertion cost query + seq_pos += operation_length return(np.sum(seq_prob)-indel_penalty) @@ -1282,8 +1293,8 @@ def merge_final_output(bam,results,begin,splits,dir,fraction,pid): for interval in unfiltered_output: if (int(interval[4])+int(interval[3])) >= splits: - if int(interval[1]) != 0: - interval[1] = int(interval[1])+1 + #if int(interval[1]) != 0: + # interval[1] = int(interval[1])+1 filtered.append(interval) filtered_output = bt.BedTool(filtered) @@ -1491,10 +1502,12 @@ def assign_discordants(split_bed,discordant_bed,insert_mean,insert_std): start_filt = chrom_filt[ (chrom_filt['start'] > row['start']) & ((chrom_filt['start'] - row['start']) < max_dist)] end_filt = start_filt[(start_filt['end'] < row['end']) & ((row['end'] - start_filt['end']) < max_dist)] - - assigned_splits.append( - [row['chrom'], row['start'], row['end'], row['read'], row['iteration'], float(row['score']), len(end_filt)]) - + if len(end_filt)>1: + assigned_splits.append( + [row['chrom'], row['start'], row['end'], row['read'], row['iteration'], float(row['score']), len(np.unique(end_filt['read']))]) + else: + assigned_splits.append( + [row['chrom'], row['start'], row['end'], row['read'], row['iteration'], float(row['score']), len(end_filt)]) return (assigned_splits) else: @@ -1510,14 +1523,15 @@ def adaptative_myers_k(sc_len,edit_frac): """Calculate the edit distance allowed as a function of the read length""" return(float(sc_len*edit_frac)) @jit(nopython=True) -def non_colinearity(read_start_cigar,read_end_cigar,aln_start,mate_interval_start,mate_interval_end): +def non_colinearity(read_start_cigar,read_end_cigar,read_start_cigar_l,read_end_cigar_l, + aln_start,mate_interval_start,mate_interval_end): """Input a read and the mate interval in the graph. The function checks whether the alignment would be linear (splicing) or colinear. Will return false, in order to not attemp realignment. 
This is mainly thought for skipping deletions and RNA splicing""" #check left soft-clipped - if read_start_cigar == 4: + if (read_start_cigar == 4 and read_end_cigar != 4 )or (read_start_cigar == 4 and read_end_cigar ==4 and (read_start_cigar_l>read_end_cigar_l)): #graph needs to be upstream or looping to itself if int(mate_interval_start) > aln_start: return (True) @@ -1527,7 +1541,7 @@ def non_colinearity(read_start_cigar,read_end_cigar,aln_start,mate_interval_star else: return (False) #check right softclipped - if read_end_cigar == 4: + if (read_end_cigar == 4 and read_start_cigar != 4 ) or (read_start_cigar == 4 and read_end_cigar ==4 and read_start_cigar_l