diff --git a/src/AlleleDropOut_germline.R b/src/AlleleDropOut_germline.R
new file mode 100644
index 0000000..89bf0da
--- /dev/null
+++ b/src/AlleleDropOut_germline.R
@@ -0,0 +1,187 @@
+library(reshape2)
+calcP <- function(geno_vec, dis_vec, dis_limit=20000){
+
+  dis_bin <- c(100, 1000, 5000, 10000, 20000, 500000, 1000000000000000000)
+  prob_bin <- c(1, 0.93, 0.78, 0.71, 0.69, 0.64, 0)
+
+  prob_bin[dis_bin>dis_limit] <- 0
+
+  pos_match <- which(geno_vec%in%c("000","111"))
+  dis_match <- dis_vec[pos_match]
+
+  match_p <- 0
+  for(pos in dis_match){
+    if(pos<=dis_bin[1]){p <-1}
+    else{
+      p <- prob_bin[max(which(pos>=dis_bin))+1]
+    }
+    match_p <- match_p + p/sqrt(pos)
+  }
+
+  pos_mismatch <- which(geno_vec%in%c("010","101"))
+  dis_mismatch <- dis_vec[pos_mismatch]
+
+
+  mismatch_p <- 0
+  for(pos in dis_mismatch){
+    if(pos<=dis_bin[1]){p <-1}
+    else{
+      p <- prob_bin[max(which(pos>=dis_bin))+1]
+    }
+    mismatch_p <-mismatch_p + p/sqrt(pos)
+  }
+
+  if(is.na(match_p) | is.na(mismatch_p)){p<-(-1)}
+  else{
+    if((match_p + mismatch_p) ==0){
+      p <- (-1)
+    }else{
+      p <- match_p/(match_p+mismatch_p)
+    }
+  }
+
+  return(p)
+}
+
+
+
+
+
+args = commandArgs(trailingOnly=TRUE)
+
+chr <- args[1]
+out <- args[2]
+len <- as.numeric(args[4])
+bin <- as.numeric(args[5])
+
+load(paste0(args[3],chr,".input.image.RDS"))
+
+
+
+germline <- mat_qc[!mat_qc$V4=="0|0" & !mat_qc$V4==".|.",]
+
+print(dim(germline))
+
+y <- germline
+dis <- colsplit(y$V1,":", c("id1","id2"))[,2]
+check <- germline[,c(1,2,3,4)]
+check$prob <- (-1000)
+check$tol <- 0
+check$dis <- (-1000)
+
+check$prob_phy <- (-1000)
+check$tol_phy <- 0
+
+pos_lst <-list()
+sta <- 0
+for(i in seq(5,ncol(y),1)){
+  vec <- y[,i]
+  pos_lst[[i]] <- which(!vec=="0|0" & !y$V4==".|.")
+  sta <- sta + length(which(!vec=="0|0" & !y$V4==".|."))
+}
+
+
+dis_rd <-c()
+trio_hap_rd <-c()
+cnt <- 0
+dis_limit <- 20000
+
+lst <- germline$V2
+dis_rd2 <-c()
+hap_rd2 <-c()
+
+for(snv in lst[seq(1,length(lst),bin)]){
+  cnt=cnt+1
+  #print(paste0("scanning ",cnt," now..."))
+  snv_index <- which(y$V2==snv)
+  snv_index_noMisCol <- which(!y[snv_index,]=="0|0" & !y[snv_index,]=="0/0",)
+
+  # remove the first column since they are not genotypes
+  snv_index_noMisCol <- snv_index_noMisCol[snv_index_noMisCol>4]
+
+  geno_vec <-c()
+  dis_vec <-c()
+  dis_phy1 <- c()
+  dis_phy2 <- c()
+  # identify haplotypes for each element
+  index <-0
+  for(pos in snv_index_noMisCol){
+    # element index (snv_index, pos)
+    pos_noMisCol <- pos_lst[[pos]]
+    lower_vec <- which(pos_noMisCol<snv_index)
+    upper_vec <- which(pos_noMisCol>snv_index)
+
+    if(length(lower_vec)>=1 & length(upper_vec)>=1){
+      lower <- pos_noMisCol[max(lower_vec)]
+      upper <- pos_noMisCol[min(upper_vec)]
+      tp <- y[snv_index, pos]
+      tp <- gsub("/", "|", tp)
+      geno <- c(y[lower,pos], tp, y[upper, pos])
+
+      part <-colsplit(geno,"\\|",c("i1","i2"))[,2]
+      part[part>1] <- 1
+      flag <- paste0(part[1],part[2],part[3])
+      hap_rd2 <-c(hap_rd2, abs(part[2]-part[1]), abs(part[3]-part[2]))
+      dis_rd2 <-c(dis_rd2, dis[snv_index]-dis[lower], dis[upper]-dis[snv_index])
+
+      dis_vec <- c(dis_vec, dis[upper]-dis[lower])
+      dis_phy1 <- c(dis_phy1, dis[snv_index]-dis[lower])
+      dis_phy2 <- c(dis_phy2, dis[upper]-dis[snv_index])
+      geno_vec <-c(geno_vec,flag)
+    }
+  }
+
+  sum <- 0
+  sum_equal <- 0
+
+  if(!is.null(dis_phy1) & sum(dis_phy10){
+    p <- sum_equal/sum
+    check$prob_phy[cnt] <- p
+    check$tol_phy[cnt] <- sum
+    print(paste0("Scan ", cnt,":", snv, " ", sum, " ", p, " ",len))
+  }
+
+  if(length(geno_vec)>0 ){
+    dis_rd <-c(dis_rd, dis_vec)
+    trio_hap_rd <-c(trio_hap_rd, geno_vec)
+    #print(table(trio_hap_rd))
+    sel
<-list() + sel <- table(geno_vec[dis_vec<1000000]) + + if(length(sel)>0){ + check$prob[cnt] <- 0 + check$tol[cnt] <- sum(dis_vec=2] + filter <- names(region_cnt)[region_cnt>=4] neg <- x[(x$region%in% filter & x$genotype==".|."),] pos <- x[!x$genotype==".|.",] svm$neg <- neg @@ -49,7 +48,6 @@ SVM_train <- function(label=NULL, dir=NULL, region=NULL){ label$pos[label$pos=="None"] <- NA # using median values to replace the missing values train_x_pos <- impute(as.matrix(data.matrix(label$pos[,features])), what="median") - train_x_pos[is.na(train_x_pos)] <- 0 # using the minior value of QS vec <- train_x_pos[,colnames(train_x_pos)=="QS"] @@ -59,7 +57,6 @@ SVM_train <- function(label=NULL, dir=NULL, region=NULL){ label$neg <- as.data.frame(label$neg) label$neg[label$neg=="None"] <- NA train_x_neg <- impute(as.matrix(data.matrix(label$neg[,features])), what="median") - train_x_neg[is.na(train_x_neg)] <- 0 vec <- train_x_neg[,colnames(train_x_neg)=="QS"] vec[vec>0.5] <- 1- vec[vec>0.5] train_x_neg[,1] <- vec @@ -67,7 +64,6 @@ SVM_train <- function(label=NULL, dir=NULL, region=NULL){ label$test <- as.data.frame(label$test) label$test[label$test=="None"] <- NA test_x <- impute(as.matrix(data.matrix(label$test[,features])), what="median") - test_x[is.na(test_x)] <- 0 vec <- test_x[,colnames(test_x)=="QS"] vec[vec>0.5] <- 1- vec[vec>0.5] test_x[,1] <- vec @@ -83,10 +79,12 @@ SVM_train <- function(label=NULL, dir=NULL, region=NULL){ print(p) } dev.off() - + + model <- svm(as.matrix(rbind(train_x_pos, train_x_neg)), as.factor(c(train_y_pos,train_y_neg)), probability=TRUE) + print("Done") pred <- predict(model, as.matrix(test_x), probability=TRUE) prob <- attr(pred, "probabilities") prob <-as.data.frame(prob) @@ -410,10 +408,11 @@ somaticLD <- function(mat=NULL, svm=NULL, dir=NULL, region=NULL, min_size=50){ dt <- fread(mat_gz) dt <- data.frame(dt) -dt$V10[is.na(dt$V10)] <- ".|." +dt$V10[dt$V10=="nan"] <- ".|." 
rownames(dt) <- paste0(dt$V1,":",dt$V2,":",dt$V3,":",dt$V4) -cellName <- read.table(file=paste0(outdir,region,".cell.txt"),header=F) +cellName <- read.csv(file=paste0(outdir,region,".cell_snv.cellID.filter.csv"),header=T) +cellName <- cellName[,2] for(j in seq(11,18,1)){ dt[,j] <- as.numeric(dt[,j]) @@ -422,7 +421,6 @@ meta <- dt[,1:18] colnames(meta) <-c("chr","pos","ref","alt","Dep","dep1","dep2","dep3","dep4", "genotype","QS","VDB","RPB","MQB","BQB","MQSB","SGB","MQ0F") - mutation_block <- SNV_block(summary=meta) svm_in <- SVM_prepare(mutation_block) svm_out <- SVM_train(label =svm_in,dir=outdir, region=region) @@ -431,7 +429,25 @@ final <- somaticLD(mat=dt, svm=svm_out, dir=outdir, region=region) mut_mat <- dt[rownames(final$out),] -colnames(mut_mat) <-c(colnames(meta),cellName[,1]) +colnames(mut_mat) <-c(colnames(meta),cellName) + + +#### re-assign the sequencing depth for each allele + +for(i in seq(1, nrow(mut_mat),1)){ + N_wild <- 0 + N_mut <- 0 + vec <- mut_mat[i,seq(19, ncol(mut_mat),1)] + sta <- unname(unlist(vec)) + sta <- table(c(sta,"0/1","1/0","1/1")) + sta <- sta - 1 + N_alt <- sta["0/1"] + sta["1/1"] + N_ref <- sta["1/0"] + sta["1/1"] + LDrefine_somatic[i,6] <- N_ref + LDrefine_somatic[i,7] <- N_alt + LDrefine_somatic[i,8] <- N_ref + N_alt + LDrefine_somatic[i,12] <- N_alt/(N_ref+N_alt) +} write.csv(final$LDrefine_somatic, paste0(outdir,region,".putativeSNVs.csv"),quote=FALSE,row.names = FALSE) write.csv(final$LDrefine_germline2, paste0(outdir,region,".germlineTwoLoci_model.csv"),quote=FALSE, row.names = FALSE) diff --git a/src/Monopogen.py b/src/Monopogen.py index 1836746..d656463 100644 --- a/src/Monopogen.py +++ b/src/Monopogen.py @@ -17,6 +17,7 @@ import numpy as np import gzip from pysam import VariantFile +from bamProcess import * from germline import * from somatic import * import multiprocessing as mp @@ -46,6 +47,18 @@ +def error_check(all, output, step): + job_fail = 0 + for id in all: + if id not in output: + logger.error("In "+ step + " step " + id + " failed!") + job_fail = job_fail + 1 + + if job_fail > 0: + logger.error("Failed! 
See instructions above.") + exit(1) + + def germline(args): logger.info("Performing germline variant calling...") @@ -74,13 +87,13 @@ def germline(args): jobid = record[0] + ":" + record[1] + "-" + record[2] bam_filter = args.out + "/Bam/" + record[0] + ".filter.bam.lst" imputation_vcf = args.imputation_panel + "CCDG_14151_B01_GRM_WGS_2020-08-05_" + record[0] + ".filtered.shapeit2-duohmm-phased.vcf.gz" - cmd1 = samtools + " mpileup -b " + bam_filter + " -f " + args.reference + " -r " + jobid + " -q 20 -Q 20 -t DP -d 10000000 -v " + cmd1 = samtools + " mpileup -b" + bam_filter + " -f " + args.reference + " -r " + jobid + " -q 20 -Q 20 -t DP -d 10000000 -v " cmd1 = cmd1 + " | " + bcftools + " view " + " | " + bcftools + " norm -m-both -f " + args.reference cmd1 = cmd1 + " | grep -v \"\" | grep -v INDEL |" + bgzip + " -c > " + args.out + "/germline/" + jobid + ".gl.vcf.gz" #cmd2 = bcftools + " view " + out + "/germline/" + jobid + ".gl.vcf.gz" + " -i 'FORMAT/DP>1' | " + bcftools + " call -cv | " + bgzip + " -c > " + args.out + "/SCvarCall/" + jobid + ".gt.vcf.gz" cmd3 = java + " -Xmx20g -jar " + beagle + " gl=" + out + "/germline/" + jobid + ".gl.vcf.gz" + " ref=" + imputation_vcf + " chrom=" + record[0] + " out=" + out + "/germline/" + jobid + ".gp " + "impute=false modelscale=2 nthreads=24 gprobs=true niterations=0" - cmd4 = "zless -S " + out + "/germline/" + jobid + ".gp.vcf.gz > " + out + "/germline/" + jobid + ".germline.vcf" + cmd4 = "zless -S " + out + "/germline/" + jobid + ".gp.vcf.gz | grep -v 0/0 > " + out + "/germline/" + jobid + ".germline.vcf" cmd5 = java + " -Xmx20g -jar " + beagle + " gt=" + out + "/germline/" + jobid + ".germline.vcf" + " ref=" + imputation_vcf + " chrom=" + record[0] + " out=" + out + "/germline/" + jobid+ ".phased " + "impute=false modelscale=2 nthreads=24 gprobs=true niterations=0" f_out = open(out + "/Script/runGermline_" + jobid + ".sh","w") if args.step == "varScan" or args.step == "all": @@ -106,19 +119,6 @@ def germline(args): -def sort_chr(chr_lst): - # sort chr IDs from 1...22 - chr_lst_sort = [] - for i in range(1, 23): - i = str(i) - if i in chr_lst: - chr_lst_sort.append(i) - i_chr = "chr"+i - if i_chr in chr_lst: - chr_lst_sort.append(i_chr) - chr_lst = chr_lst_sort - return chr_lst - def somatic(args): @@ -138,120 +138,28 @@ def somatic(args): region_lst.append(region) chr_lst = list(set(chr_lst)) - if args.step=="featureInfo" or args.step=="all": - logger.info("Get feature information from sequencing data...") - + + logger.info("Get feature information from sequencing data...") joblst = [] for id in region_lst: - joblst.append(id+">"+args.out) + joblst.append(id+">"+args.out+">"+args.app_path) + with Pool(processes=args.nthreads) as pool: result = pool.map(featureInfo, joblst) + error_check(all = chr_lst, output = result, step = "LDrefinement") if args.step=="cellScan" or args.step=="all": - logger.info("Get single cell level information from sequencing data...") - + logger.info("Collect single cell level information from sequencing data...") chr_lst = sort_chr(chr_lst) - - run = 1 - if run: - joblst = [] - for id in chr_lst: - joblst.append(id+">"+args.out+">"+args.app_path) - with Pool(processes=args.nthreads) as pool: - result = pool.map(bamExtract, joblst) - - ####### merge bams from different chromosomes - bamlst = [] - print(chr_lst) - for chr in chr_lst: - bam_filter = out + "/Bam/" + chr + ".filter.targeted.bam" - bamlst.append(bam_filter) - print(bamlst) - output_bam = out + "/Bam/merge.filter.targeted.bam" - 
pysam.merge("-f","-o",output_bam,*bamlst) - os.system(samtools + " index " + output_bam) - - if run: - os.system("mkdir -p " + out + "/Bam/split_bam/") - cell_clst = pd.read_csv(args.barcode) - df = pd.DataFrame(cell_clst, columns= ['cell','id']) - df = df.sort_values(by=['id']) - args.keep = float(args.keep) - if args.keep < 1: - dis = np.cumsum(df['id'])/np.sum(df['id']) - N = sum(dis>(1-args.keep)) - df = df.iloc[-(N):] - cell_lst = df['cell'].unique() - joblst = [] - - for cell in cell_lst: - para = "merge" + ":" + cell + ":" + args.out + ":" + args.app_path - joblst.append(para) - - with Pool(processes=args.nthreads) as pool: - result = pool.map(bamSplit, joblst) - - if sum(result)==0: - logger.error("No reads detected for cells in " + args.barcode + ". Please check 1) the input cell barcode file is matched with bam file; 2) the cell barcode name has the same format shown in bam file. For example XX-1!") - logger.error("Failed! See instructions above.") - exit(1) - - # generate the bam file list - cell_bam = open(out + "/Bam/split_bam/cell.bam.lst","w") - for cell in cell_lst: - cell_bam.write(out + "/Bam/split_bam/" + cell + ".bam\n") - cell_bam.close() - - - region_lst = [] - if args.winSize=="10MB": - region_file = args.app_path + "/../resource/GRCh38.region.10MB.lst" - if args.winSize=="50MB": - region_file = args.app_path + "/../resource/GRCh38.region.50MB.lst" - with open(region_file) as f_in: - for line in f_in: - record = line.strip().split(",") - if(len(record)==1): - region = record[0] - if(len(record)==3): - region = record[0] + ":" + record[1] + "-" + record[2] - if record[0] in chr_lst: - region_lst.append(region) - - joblst = [] - for id in region_lst: - record = id.strip().split(":") - chr = record[0] - joblst.append(id+">"+chr+">"+args.out+">"+args.app_path+">"+args.reference) - - - with Pool(processes=args.nthreads) as pool: - result = pool.map(jointCall, joblst) - error_check(all = region_lst, output = result, step = "cellScan:joint calling") - - - ##### merge vcfs from multiple regions - - for id in chr_lst: - tp = "" - for reg in region_lst: - if re.search(id+":", reg): - tp = tp + " " + args.out+"/somatic/" + reg + ".cell.gl.vcf.gz" - cmd = args.app_path + "/bcftools concat -o " + args.out+"/somatic/" + id + ".cell.gl.vcf.gz " + tp + " -O z" - print(cmd) - output = os.system(cmd) - joblst = [] for id in chr_lst: - joblst.append(id+">"+args.out) + joblst.append(id+">"+args.out+">"+args.reference+">"+args.barcode) with Pool(processes=args.nthreads) as pool: - result = pool.map(vcf2mat, joblst) - error_check(all = chr_lst, output = result, step = "cellScan:vcf2mat") - - + result = pool.map(bam2mat, joblst) + error_check(all = chr_lst, output = result, step = "cellScan") if args.step=="LDrefinement" or args.step=="all": logger.info("Run LD refinement ...") @@ -265,18 +173,6 @@ def somatic(args): -def error_check(all, output, step): - job_fail = 0 - for id in all: - if id not in output: - logger.error("In "+ step + " step " + id + " failed!") - job_fail = job_fail + 1 - - if job_fail > 0: - logger.error("Failed! 
See instructions above.") - exit(1) - - def preProcess(args): logger.info("Performing data preprocess before variant calling...") @@ -392,9 +288,6 @@ def main(): help="The app library paths used in the tool") parser_somatic.add_argument('-t', '--nthreads', required=False, type=int, default=22, help="Number of jobs used for SNV calling") - parser_somatic.add_argument('-w', '--winSize', required=False, default="50MB", - choices=['10MB','50MB'], - help="Split the chromosome into small segments for cell level sequencing information collection. Setting 10MB will generate more jobs but be faster") parser_somatic.add_argument('-s', '--step', required=True, choices=['featureInfo', 'cellScan' , 'LDrefinement', 'monovar', 'all'], help="Run germline variant calling step by step") diff --git a/src/__pycache__/Single_Cell_Ftrs_Pos.cpython-37.pyc b/src/__pycache__/Single_Cell_Ftrs_Pos.cpython-37.pyc index 8fec6aa..a5a64d4 100644 Binary files a/src/__pycache__/Single_Cell_Ftrs_Pos.cpython-37.pyc and b/src/__pycache__/Single_Cell_Ftrs_Pos.cpython-37.pyc differ diff --git a/src/__pycache__/alleles_prior.cpython-37.pyc b/src/__pycache__/alleles_prior.cpython-37.pyc index c8603e8..c049589 100644 Binary files a/src/__pycache__/alleles_prior.cpython-37.pyc and b/src/__pycache__/alleles_prior.cpython-37.pyc differ diff --git a/src/__pycache__/bamProcess.cpython-37.pyc b/src/__pycache__/bamProcess.cpython-37.pyc new file mode 100644 index 0000000..38ff890 Binary files /dev/null and b/src/__pycache__/bamProcess.cpython-37.pyc differ diff --git a/src/__pycache__/base_q_ascii.cpython-37.pyc b/src/__pycache__/base_q_ascii.cpython-37.pyc index cab1805..00a9332 100644 Binary files a/src/__pycache__/base_q_ascii.cpython-37.pyc and b/src/__pycache__/base_q_ascii.cpython-37.pyc differ diff --git a/src/__pycache__/calc_variant_prob.cpython-37.pyc b/src/__pycache__/calc_variant_prob.cpython-37.pyc index e2f2b85..de57418 100644 Binary files a/src/__pycache__/calc_variant_prob.cpython-37.pyc and b/src/__pycache__/calc_variant_prob.cpython-37.pyc differ diff --git a/src/__pycache__/genotype_prob_mat.cpython-37.pyc b/src/__pycache__/genotype_prob_mat.cpython-37.pyc index 729b60b..64f2a24 100644 Binary files a/src/__pycache__/genotype_prob_mat.cpython-37.pyc and b/src/__pycache__/genotype_prob_mat.cpython-37.pyc differ diff --git a/src/__pycache__/germline.cpython-37.pyc b/src/__pycache__/germline.cpython-37.pyc new file mode 100644 index 0000000..8758408 Binary files /dev/null and b/src/__pycache__/germline.cpython-37.pyc differ diff --git a/src/__pycache__/hzvcf.cpython-37.pyc b/src/__pycache__/hzvcf.cpython-37.pyc index 3555def..1ec69ed 100644 Binary files a/src/__pycache__/hzvcf.cpython-37.pyc and b/src/__pycache__/hzvcf.cpython-37.pyc differ diff --git a/src/__pycache__/mp_genotype.cpython-37.pyc b/src/__pycache__/mp_genotype.cpython-37.pyc index 6c5ee75..4c1476b 100644 Binary files a/src/__pycache__/mp_genotype.cpython-37.pyc and b/src/__pycache__/mp_genotype.cpython-37.pyc differ diff --git a/src/__pycache__/nu_genotype_single_cell.cpython-37.pyc b/src/__pycache__/nu_genotype_single_cell.cpython-37.pyc index a932092..958cc01 100644 Binary files a/src/__pycache__/nu_genotype_single_cell.cpython-37.pyc and b/src/__pycache__/nu_genotype_single_cell.cpython-37.pyc differ diff --git a/src/__pycache__/nu_prob_mat.cpython-37.pyc b/src/__pycache__/nu_prob_mat.cpython-37.pyc index 0e0d861..abafae6 100644 Binary files a/src/__pycache__/nu_prob_mat.cpython-37.pyc and b/src/__pycache__/nu_prob_mat.cpython-37.pyc differ diff --git 
a/src/__pycache__/somatic.cpython-37.pyc b/src/__pycache__/somatic.cpython-37.pyc new file mode 100644 index 0000000..45d75dd Binary files /dev/null and b/src/__pycache__/somatic.cpython-37.pyc differ diff --git a/src/__pycache__/utils.cpython-37.pyc b/src/__pycache__/utils.cpython-37.pyc index abcf3b2..39b2b76 100644 Binary files a/src/__pycache__/utils.cpython-37.pyc and b/src/__pycache__/utils.cpython-37.pyc differ diff --git a/src/bamProcess.py b/src/bamProcess.py new file mode 100644 index 0000000..d6c3e9f --- /dev/null +++ b/src/bamProcess.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +""" +The main interface of scPopGene +""" + +import argparse +import sys +import os +import logging +import shutil +import glob +import re +import pysam +import time +import subprocess +import pandas as pd +from pysam import VariantFile + +LIB_PATH = os.path.abspath( + os.path.join(os.path.dirname(os.path.realpath(__file__)), "pipelines/lib")) + +if LIB_PATH not in sys.path: + sys.path.insert(0, LIB_PATH) + +PIPELINE_BASEDIR = os.path.dirname(os.path.realpath(sys.argv[0])) +CFG_DIR = os.path.join(PIPELINE_BASEDIR, "cfg") + +#import pipelines +#from pipelines import get_cluster_cfgfile +#from pipelines import PipelineHandler + +# global logger +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) +handler = logging.StreamHandler() +handler.setFormatter(logging.Formatter( + '[{asctime}] {levelname:8s} {filename} {message}', style='{')) +logger.addHandler(handler) + + + +def addChr(args): + # edit the sequence names for your output header + in_bam = args.bamFile + prefix = 'chr' + out_bam=in_bam+"tmp.bam" + print(in_bam) + input_bam = pysam.AlignmentFile(in_bam,"rb") + new_head = input_bam.header.to_dict() + for seq in new_head['SQ']: + seq['SN'] = prefix + seq['SN'] + # create output BAM with newly defined header + with pysam.AlignmentFile(out_bam, "wb", header=new_head) as outf: + for read in input_bam.fetch(): + prefixed_chrom = prefix + read.reference_name + a = pysam.AlignedSegment(outf.header) + a.query_name = read.query_name + a.query_sequence = read.query_sequence + a.reference_name = prefixed_chrom + a.flag = read.flag + a.reference_start = read.reference_start + a.mapping_quality = read.mapping_quality + a.cigar = read.cigar + a.next_reference_id = read.next_reference_id + a.next_reference_start = read.next_reference_start + a.template_length = read.template_length + a.query_qualities = read.query_qualities + a.tags = read.tags + outf.write(a) + input_bam.close() + outf.close() + os.system("samtools index " + out_bam) + os.system(" mv " + out_bam + " " + in_bam) + os.system(" mv " + out_bam + ".bai " + in_bam + ".bai") + + + + +def sort_chr(chr_lst): + # sort chr IDs from 1...22 + chr_lst_sort = [] + for i in range(1, 23): + i = str(i) + if i in chr_lst: + chr_lst_sort.append(i) + i_chr = "chr"+i + if i_chr in chr_lst: + chr_lst_sort.append(i_chr) + chr_lst = chr_lst_sort + return chr_lst + + + + +def bamSplit(para): + para_lst = para.strip().split(":") + chr = para_lst[0] + cell = para_lst[1] + out = para_lst[2] + app_path = para_lst[3] + + #assert os.path.isfile(bam_filter), "*.fiter.targeted.bam file {} cannot be found!".format(bam_filter) + samtools = app_path + "/samtools" + output_bam = out + "/Bam/merge.filter.targeted.bam" + infile = pysam.AlignmentFile(output_bam,"rb") + # Note to change the read groups + tp =infile.header.to_dict() + if len(tp['RG'])>1: + tp['RG']= [tp['RG'][0]] + tp['RG'][0]['SM'] = cell + tp['RG'][0]['ID'] = cell + cnt = 0 + outfile = 
pysam.AlignmentFile( out + "/Bam/split_bam/" + cell + ".bam", "wb", header=tp) + for s in infile: + t = robust_get_tag(s,"CB") + if not t=="NotFound": + if t==cell: + outfile.write(s) + cnt = cnt +1 + else: + if re.search(cell, s.query_name): + outfile.write(s) + cnt = cnt + 1 + + outfile.close() + infile.close() + cmd=samtools + " index " + out + "/Bam/split_bam/" + cell + ".bam" + + os.system(samtools + " index " + out + "/Bam/split_bam/" + cell + ".bam") + + return(cnt) + + + + +def jointCall(para): + + para_lst = para.strip().split(">") + jobid = para_lst[0] + chr=para_lst[1] + out = para_lst[2] + app_path = para_lst[3] + reference = para_lst[4] + samtools = app_path + "/samtools" + bcftools = app_path + "/bcftools" + bgzip = app_path + "/bgzip" + bam_filter = out + "/Bam/split_bam/cell.bam.lst" + cmd1 = samtools + " mpileup -b " + bam_filter + " -f " + reference + " -r " + jobid + " -q 20 -Q 20 -t DP4 -d 10000 -v " + cmd1 = cmd1 + " | " + bcftools + " view " + " | " + bcftools + " norm -m-both -f " + reference + cmd1 = cmd1 + " | grep -v \"\" | grep -v INDEL |" + bgzip + " -c > " + out + "/somatic/" + jobid + ".cell.gl.vcf.gz" + print(cmd1) + output = os.system(cmd1) + + # delete the bam files once snv calling was finished in specfic regions + f = open(bam_filter, "r") + for x in f: + x = x.strip() + #os.system("rm " + x) + #os.system("rm " + x + ".bai") + f.close() + + if output == 0: + return(jobid) diff --git a/src/haplotype_somatic.R b/src/haplotype_somatic.R new file mode 100644 index 0000000..33094ec --- /dev/null +++ b/src/haplotype_somatic.R @@ -0,0 +1,101 @@ + + +library(e1071) +library(reshape2) + +source("/rsrch3/scratch/bcb/jdou1/scAncestry/Monopogen/src/svm_phasing_sub.R") + +args = commandArgs(trailingOnly=TRUE) + +chr <- args[1] +loaddata <- args[2] +dis_limit_physical <- as.numeric(args[3]) +dis_limit_genetic <- as.numeric(args[4]) + + + + + +print(chr) + +if(loaddata==1){ + print("Loading data now...") + mat <- read.table(paste0(file="../germline/chr",chr,".gl.filter.hc.cell.mat.gz")) + #normal <- read.table(file=paste0("normal.",chr,".snv")) + #tumor <- read.table(file=paste0("tumor.",chr,".snv")) + #somatic <- read.table(file="somatic.snv") + feature_info <- read.csv(paste0(file="../debug/chr",chr,".gl.feature.info"),header=F) + + + mat$V2 <- paste0(mat$V1,":",mat$V2) + meta <- mat[,c("V1","V2","V3","V4")] + meta <- as.data.frame(meta) + meta$in_normal <- 0 + meta$in_tumor <- 0 + meta$somatic <- 0 + #meta$in_normal[meta$V2%in%normal$V1] <- 1 + #meta$in_tumor[meta$V2%in%tumor$V1] <- 1 + #meta$somatic[meta$V2%in%somatic$V1] <- 1 + meta$baf <- frq(mat) + + #meta$baf <-0.5 + # SVM module on remove low quality mutations + + + svm_info <- feature_info[feature_info$V1%in%meta$V2,] + svm_dt <- getFeatureInfo(svm_info=svm_info) + + overlap <- intersect(meta$V2, svm_dt$id) + + meta_qc <- meta[meta$V2%in%overlap,] + mat_qc <- mat[meta$V2%in%overlap,] + svm_dt_qc <- svm_dt[svm_dt$id%in%overlap,] + svm_dt_qc$BAF <- meta_qc$baf + save.image(file=paste0(chr,".input.image.RDS")) +} + +if(loaddata==2){ + load(file=paste0(chr,".input.image.RDS")) +} + + +source("/rsrch3/scratch/bcb/jdou1/scAncestry/Monopogen/src/svm_phasing_sub.R") +print(args) + + +print("start to run SVM and haplotype filtering") +print("new version ") + +# get putative mutation block +#meta_qc$flag <- paste0(meta_qc$in_normal, ":", meta_qc$in_tumor, ":", meta_qc$somatic) + + + + + +mutation_block <- SNV_block(summary=meta_qc) + +svm_in <- SVM_prepare(mutation_block) + +svm_out <- SVM_train(label =svm_in, 
data=svm_dt_qc, downsampling=2) + +p_lower <- sort(svm_out$test$POS)[floor(nrow(svm_out$test)*0.1)] +p_upper <- sort(svm_out$test$POS)[floor(nrow(svm_out$test)*0.95)] +filter1 <- (svm_out$test$POS>p_lower) +filter2 <- (svm_out$test$baf>0.1 ) +svm_pass <- svm_out$test[filter1 & filter2,] + + +svm_phasing <- Phasing(x=mat_qc, snv_meta=svm_pass, dis_limit=dis_limit_genetic, readlength=dis_limit_physical) + +svm_out$phasing <- svm_phasing +svm_out$feature <- svm_dt_qc + +saveRDS(svm_out,file=paste0("svm",chr,".out.RDS")) + + + +x=mat_qc +snv_meta=svm_pass +dis_limit=dis_limit_genetic +readlength=dis_limit_physical \ No newline at end of file diff --git a/src/index_motif.py b/src/index_motif.py new file mode 100644 index 0000000..e69de29 diff --git a/src/somatic.py b/src/somatic.py index ce40f21..8c49357 100644 --- a/src/somatic.py +++ b/src/somatic.py @@ -14,6 +14,7 @@ import numpy as np import gzip from pysam import VariantFile +from bamProcess import * import multiprocessing as mp from multiprocessing import Pool @@ -31,9 +32,9 @@ def withSNVs(invcf, path): return(cnt) def runCMD(cmd): - + print(cmd) os.system(cmd) - #process = subprocess.run(cmd, shell=True, stdout=open(args.logfile, 'w'), stderr=open(args.logfile,'w')) + #process = subprocess.run(cmd, shell=True) def robust_get_tag(read, tag_name): @@ -59,14 +60,10 @@ def validate_user_setting_somatic(args): exit(1) bam_filter = args.out + "/Bam/" + record[0] + ".filter.bam.lst" gl_vcf = args.out + "/germline/" + jobid + ".gl.vcf.gz" - gp_vcf = args.out + "/germline/" + jobid + ".germline.vcf.gz" phased_vcf = args.out + "/germline/" + jobid + ".phased.vcf.gz" assert os.path.isfile(bam_filter), "The bam list file {} cannot be found! Please run germline module".format(bam_filter) assert os.path.isfile(gl_vcf), "The *.gl.vcf.gz file {} cannot be found! Please run germline module".format(gl_vcf) - #if withSNVs(gl_vcf, args.app_path)==0: - # print("The *.gl.vcf.gz file {} cannot be found! Maybe there are no markers detected in ".format(gl_vcf)) - #if withSNVs(phased_vcf, args.app_path)==0: - # print("The *.phased.vcf.gz file {} cannot be found! Maybe there are no markers detected in this region?".format(phased_vcf)) + assert os.path.isfile(phased_vcf), "The *.phased.vcf.gz file {} cannot be found! 
Please run germline module".format(phased_vcf) assert os.path.isfile(args.barcode), "The cell barcode file {} cannot be found!".format(args.barcode) @@ -80,11 +77,36 @@ def getInfo_robust(rec, info): return(info_dt) -def featureInfo(para): +def bamExtract(para): + para_lst = para.strip().split(">") + chr = para_lst[0] out = para_lst[1] - region = para_lst[0] + app_path = para_lst[2] + samtools = os.path.abspath(app_path) + "/samtools" + out = os.path.abspath(out) + inbam = getBamName(chr, out) + outbam = out + "/Bam/" + chr + ".filter.targeted.bam" + # remember to update bed file ls -lrt + + #os.system("cat " + out + "/somatic/" + chr + "*.gl.vcf.filter.hc.bed > " + out + "/somatic/" + chr + ".bed") + chr_bed = out + "/somatic/" + chr + ".gl.vcf.filter.hc.bed" + cmd1 = samtools + " view " + " -b -L " + chr_bed + " " + inbam + " -o " + outbam + with open(out+"/Script/bamExtract_" + chr + ".sh","w") as f_out: + f_out.write(cmd1 + "\n") + f_out.write(samtools + " index " + outbam + "\n") + cmd="bash " + out+"/Script/bamExtract_" + chr + ".sh" + print(cmd) + os.system(cmd) +def featureInfo(para): + + para_lst = para.strip().split(">") + region = para_lst[0] + out = para_lst[1] + app = para_lst[2] + pysam.tabix_index(out + "/germline/" + region + ".phased.vcf.gz", preset="vcf", force=True) + pysam.tabix_index(out + "/germline/" + region + ".gl.vcf.gz", preset="vcf", force=True) vcf_in = VariantFile(out + "/germline/" + region + ".phased.vcf.gz") info_GT = {} for rec in vcf_in.fetch(): @@ -102,13 +124,13 @@ def featureInfo(para): for rec in gl_vcf_in.fetch(): info_I16 = rec.info.get('I16') info_QS = getInfo_robust(rec,'QS') - info_VDB = getInfo_robust(rec,'VDB') + info_VDB = getInfo_robust(rec,'VDB') info_RPB = getInfo_robust(rec,'RPB') info_MQB = getInfo_robust(rec,'MQB') info_BQB = getInfo_robust(rec,'BQB') - info_MQSB = getInfo_robust(rec,'MQSB') + info_MQSB =getInfo_robust(rec,'MQSB') info_SGB = getInfo_robust(rec,'SGB') - info_MQ0F = getInfo_robust(rec,'MQ0F') + info_MQ0F =getInfo_robust(rec,'MQ0F') id = str(rec.chrom)+":"+str(rec.pos) + ":" + rec.ref + ":" + rec.alts[0] gt_info_var = "NA" if (id in info_GT): @@ -126,6 +148,14 @@ def featureInfo(para): b = "{}\t{}\n".format(rec.chrom, rec.pos) gl_vcf_filter_txt.write(b) gl_vcf_filter_dp4.write(a) + gl_vcf_filter_dp4.close() + gl_vcf_filter_txt.close() + gl_vcf_filter_bed.close() + gl_vcf_dp4.close() + + + bamExtract(para) + def getBamName(chr, out): #chr = region.split(":")[0] @@ -135,97 +165,6 @@ def getBamName(chr, out): bamName = line.strip() return(bamName) -def bamExtract(para): - - para_lst = para.strip().split(">") - - chr = para_lst[0] - out = para_lst[1] - app_path = para_lst[2] - samtools = os.path.abspath(app_path) + "/samtools" - out = os.path.abspath(out) - inbam = getBamName(chr, out) - outbam = out + "/Bam/" + chr + ".filter.targeted.bam" - # remember to update bed file ls -lrt - - os.system("cat " + out + "/somatic/" + chr + "*.gl.vcf.filter.hc.bed > " + out + "/somatic/" + chr + ".bed") - cmd1 = samtools + " view " + " -b -L " + out + "/somatic/" + chr +".bed " + inbam + " -o " + outbam - with open(out+"/Script/bamExtract_" + chr + ".sh","w") as f_out: - f_out.write(cmd1 + "\n") - f_out.write(samtools + " index " + outbam + "\n") - cmd="bash " + out+"/Script/bamExtract_" + chr + ".sh" - runCMD(cmd) - - -def bamSplit(para): - para_lst = para.strip().split(":") - chr = para_lst[0] - cell = para_lst[1] - out = para_lst[2] - app_path = para_lst[3] - - #assert os.path.isfile(bam_filter), "*.fiter.targeted.bam file {} cannot be 
found!".format(bam_filter) - samtools = app_path + "/samtools" - output_bam = out + "/Bam/merge.filter.targeted.bam" - infile = pysam.AlignmentFile(output_bam,"rb") - # Note to change the read groups - tp =infile.header.to_dict() - if len(tp['RG'])>1: - tp['RG']= [tp['RG'][0]] - tp['RG'][0]['SM'] = cell - tp['RG'][0]['ID'] = cell - cnt = 0 - outfile = pysam.AlignmentFile( out + "/Bam/split_bam/" + cell + ".bam", "wb", header=tp) - for s in infile: - t = robust_get_tag(s,"CB") - if not t=="NotFound": - if t==cell: - outfile.write(s) - cnt = cnt +1 - else: - if re.search(cell, s.query_name): - outfile.write(s) - cnt = cnt + 1 - - outfile.close() - infile.close() - cmd=samtools + " index " + out + "/Bam/split_bam/" + cell + ".bam" - #runCMD(cmd,args) - os.system(samtools + " index " + out + "/Bam/split_bam/" + cell + ".bam") - - return(cnt) - - - - -def jointCall(para): - - para_lst = para.strip().split(">") - jobid = para_lst[0] - chr=para_lst[1] - out = para_lst[2] - app_path = para_lst[3] - reference = para_lst[4] - samtools = app_path + "/samtools" - bcftools = app_path + "/bcftools" - bgzip = app_path + "/bgzip" - bam_filter = out + "/Bam/split_bam/cell.bam.lst" - cmd1 = samtools + " mpileup -b " + bam_filter + " -f " + reference + " -r " + jobid + " -q 20 -Q 20 -t DP4 -d 10000 -v " - cmd1 = cmd1 + " | " + bcftools + " view " + " | " + bcftools + " norm -m-both -f " + reference - cmd1 = cmd1 + " | grep -v \"\" | grep -v INDEL |" + bgzip + " -c > " + out + "/somatic/" + jobid + ".cell.gl.vcf.gz" - print(cmd1) - output = os.system(cmd1) - - # delete the bam files once snv calling was finished in specfic regions - f = open(bam_filter, "r") - for x in f: - x = x.strip() - #os.system("rm " + x) - #os.system("rm " + x + ".bai") - f.close() - - if output == 0: - return(jobid) def less1(num): @@ -314,4 +253,283 @@ def LDrefinement(para): print(cmd) output = os.system(cmd) if output == 0: - return(region) \ No newline at end of file + return(region) + + + + + + +def robust_get_tag(read, tag_name): + try: + return read.get_tag(tag_name) + except KeyError: + return "NotFound" + +def rev_compl(st): + nn = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} + return "".join(nn[n] for n in reversed(st)) + + + +def bam2mat(para): + + para_lst = para.strip().split(">") + print(para_lst) + + # bam2gz(para); + + #### gz to matrix ### + mat_infile = para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.mat.gz" + snv_infile = para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.snvID.csv" + cell_infile = para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.cellID.csv" + meta_info = para_lst[1] + "/somatic/" + para_lst[0] + ".gl.vcf.filter.DP4" + + cell_clst = pd.read_csv(cell_infile) + snv_clst = pd.read_csv(snv_infile) + mat = pd.read_csv(mat_infile, sep="\t", header=None) + mat_out = gzip.open(para_lst[1] + "/somatic/" + para_lst[0] + ".gl.filter.hc.cell.mat.gz","wt") + + + mat.columns = ['snvIndex','cellIndex','allele'] + mat=mat.groupby(by=['snvIndex','cellIndex'], as_index=False).first() + mat=mat.pivot(index='snvIndex', columns='cellIndex', values='allele') + meta_info = pd.read_csv(meta_info, sep="\t", header=None) + meta_info.columns=["chr","pos","ref_allele","alt_allele","Dep","dep1","dep2","dep3","dep4","genotype","QS","VDB","RPB","MQB","BQB","MQSB","SGB","MQ0F"] + + overlapSNV_index=set(snv_clst["index"]).intersection(set(mat.index)) + + cell_clst.loc[list(mat.columns)]['cell'].to_csv(para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.cellID.filter.csv") + + ## meta_info_id is similar with snv_clst + 
n_cell = mat.shape[1] + for index in overlapSNV_index: + info = meta_info.iloc[index].astype(str) + geno = info["genotype"] + info = "\t".join(info) + + flag = 0 + if geno=="0|1": + phase_info = ["0|0"]*n_cell + tp = mat.loc[index] + pos = tp.dropna().index + for value in pos: + allele = tp.loc[value] + index_vec = tp.index.get_loc(value) + if allele==1: + phase_info[index_vec]="0|1" + elif allele==0: + phase_info[index_vec]="1|0" + flag = 1 + elif geno=="1|0": + phase_info = ["0|0"]*n_cell + tp = mat.loc[index] + pos = tp.dropna().index + for value in pos: + allele = tp.loc[value] + index_vec = tp.index.get_loc(value) + if allele==1: + phase_info[index_vec]="1|0" + elif allele==0: + phase_info[index_vec]="0|1" + flag = 1 + elif geno=="nan": + phase_info = ["0/0"]*n_cell + tp = mat.loc[index] + pos = tp.dropna().index + for value in pos: + allele = tp.loc[value] + index_vec = tp.index.get_loc(value) + if allele==1: + phase_info[index_vec]="0/1" + elif allele==0: + phase_info[index_vec]="1/0" + flag = 1 + if flag: + mat_out.write(info+"\t" +'\t'.join(phase_info)) + mat_out.write('\n') + + + + +def bam2gz(para): + + para_lst = para.strip().split(">") + + #assert os.path.isfile(bam_filter), "*.fiter.targeted.bam file {} cannot be found!".format(bam_filter) + #in_fasta = "/rsrch3/scratch/bcb/jdou1/scAncestry/ref/fasta/genome.fa" + #in_vcf = "/rsrch3/scratch/bcb/jdou1/bam_process/chr1.gp.vcf" + #in_bam = "/rsrch3/scratch/bcb/jdou1/bam_process/chr1.filter.targeted.bam" + + # joblst.append(id+">"+args.out+">"+args.reference) + + in_snv = para_lst[1] + "/somatic/" + para_lst[0] + ".gl.vcf.filter.DP4" + in_bam = para_lst[1] + "/Bam/" + para_lst[0] + ".filter.targeted.bam" + in_fasta = para_lst[2] + in_cell_barcode = para_lst[3] + + # read cell barcode file + + cell_clst = pd.read_csv(in_cell_barcode) + df = pd.DataFrame(cell_clst, columns= ['cell','id']) + df = df.sort_values(by=['id'], ascending=False) + df['index']=range(len(df.index)) + cell_set = list(df["cell"]) + + + ref_fa = pysam.FastaFile(in_fasta) + snv_info = {} + tp = list() + index = 0 + motif_len = 3 + snv_tol = 0 + + # index 0-based + with open(in_snv,'r') as fp: + for line in fp: + line = line.strip() + line = line.split("\t") + chrom = line[0] + pos = int(line[1]) + ref = line[2] + alts = line[3] + seq = ref_fa.fetch(chrom, pos-4, pos+3) + seq_rev_compl = rev_compl(seq) + id = str(chrom)+":"+str(pos) + ":" + ref + ":" + alts[0] + mydict = dict(id=index, chr = chrom, pos = pos, motif_pos= seq, ref_allele = ref, alt_allele=alts[0]) + snv_info[index]=mydict + index = index + 1 + snv_tol = snv_tol + 1 + tp.append(id) + + + + #print(snv_info_out) + ## output cell snv meta information + df.to_csv(para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.cellID.csv", index = False) + snv_info_out = pd.DataFrame() + snv_info_out['snvID'] = tp + snv_info_out['index'] = range(len(tp)) + snv_info_out.to_csv(para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.snvID.csv",index = False) + + read_tol = 0 + read_cover_tol = 0 + read_wild_tol = 0 + read_mutated_tol = 0 + read_noAllele_tol = 0 + overlap = 0 + index = 0 + lower_index = 0 + + infile = pysam.AlignmentFile(in_bam,"rb") + fp = gzip.open(para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.mat.gz", "wt") + # fp = open(para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.mat", "wt") + for s in infile: + + cell_barcode = robust_get_tag(s,"CB") + if cell_barcode=="NotFound": + ### no cell barcode file detected. 
+ ### check whether the barcode added in the query name + cell_barcode = s.query_name.strip().split("_")[1] + + if cell_barcode in cell_set: + cell_barcode_index = cell_set.index(cell_barcode) + else: + ### cell barcode is not detected, should move to next read + continue + + read_name = s.query_name + align_chr = s.reference_name + align_start = s.reference_start + align_seq = s.query_sequence + mystart = str(snv_info[lower_index]["pos"]) + + + ### flags used to see whether SNVs covered in different scenarios + read_tol = read_tol + 1 + read_cover = 0 + read_wild = 0 + read_mutated = 0 + read_noAllele = 0 + read_len = len(align_seq) + + + ### get the SNVs locating in [align_start, align_start + read_len] ### + if read_tol%1000000 == 0: + print("scanning read " + str(read_tol)) + + lock = 0 + snv_cover = "" + + + for i in range(lower_index,snv_tol-1,1): + + snv_pos = snv_info[i]["pos"] + wild_allele = snv_info[i]["ref_allele"] + alt_allele = snv_info[i]["alt_allele"] + + if snv_pos >= align_start: + if lock==0: + lower_index = i + lock = lock + 1 + if snv_pos <= align_start + read_len: + read_cover = read_cover + 1 + snv_cover = str(snv_cover) + ":" + str(snv_pos) + motif_pos = snv_info[i]["motif_pos"] + motif_neg = motif_pos[:motif_len] + snv_info[i]["alt_allele"] + motif_pos[motif_len + 1:] + + if re.search(motif_pos, align_seq): + read_wild = read_wild + 1 + fp.write(str(i)+"\t"+str(cell_barcode_index)+"\t0\n") + elif re.search(motif_neg, align_seq): + read_mutated = read_mutated + 1 + fp.write(str(i)+"\t"+str(cell_barcode_index)+"\t1\n") + else: + delta = snv_pos - align_start + if read_len - delta < motif_len: + motif_pos_part = motif_pos[0:motif_len+1] + motif_neg_part = motif_neg[0:motif_len+1] + seq_part = align_seq[read_len-2*motif_len-1:read_len] + + if re.search(motif_pos_part, seq_part): + read_wild = read_wild + 1 + fp.write(str(i)+"\t"+str(cell_barcode_index)+"\t0\n") + elif re.search(motif_neg_part, seq_part): + read_mutated = read_mutated + 1 + fp.write(str(i)+"\t"+str(cell_barcode_index)+"\t1\n") + + elif delta <= motif_len: + motif_pos_part = motif_pos[motif_len:len(motif_pos)] + motif_neg_part = motif_neg[motif_len:len(motif_neg)] + seq_part = align_seq[0:2*motif_len+1] + + if re.search(motif_pos_part, seq_part): + read_wild = read_wild + 1 + fp.write(str(i)+"\t"+str(cell_barcode_index)+"\t0\n") + elif re.search(motif_neg_part, seq_part): + read_mutated = read_mutated + 1 + fp.write(str(i)+"\t"+str(cell_barcode_index)+"\t1\n") + else: + #fp.write(str(snv_info[i])+"\n") + #fp.write(str(align_chr) + ":" + str(align_start) + ":" + align_seq+"\n") + read_noAllele = read_noAllele + 1 + else: + break + + if read_cover > 0: + read_cover_tol = read_cover_tol + 1 + if read_wild > 0: + read_wild_tol = read_wild_tol + 1 + if read_mutated > 0 and read_wild>0: + overlap = overlap + 1 + + if read_mutated > 0: + read_mutated_tol = read_mutated_tol + 1 + if read_noAllele > 0: + read_noAllele_tol = read_noAllele_tol + 1 + + + infile.close() + fp.close() + print(str(read_tol) + ":" + str(read_cover_tol) + ":" + str(read_wild_tol) + ":" + str(read_mutated_tol) + ":" + str(overlap) + ":" + str(read_noAllele_tol)) + return(para_lst[0]) \ No newline at end of file diff --git a/src/svm_phasing_sub.R b/src/svm_phasing_sub.R new file mode 100644 index 0000000..317d520 --- /dev/null +++ b/src/svm_phasing_sub.R @@ -0,0 +1,268 @@ + +frq <- function(x=mat) { + res<-matrix(0,nrow(mat),1) + for(i in seq(1,nrow(mat),1)){ + geno <- mat[i, seq(5,ncol(mat),1)] + geno <- gsub("/","|", geno) + geno <- 
geno[!geno=="0|0"]
+    s <- strsplit(geno, "|")
+    result <- sapply(s, "[[", 1)
+    if(length(result)>10){
+
+      myfrq <- length( which(result=="0"))/length(result)
+    }else{
+      myfrq <- 0
+    }
+    if(myfrq>0.5){myfrq <- 1- myfrq}
+    res[i,1] <- myfrq
+  }
+  return(res)
+}
+
+
+getFeatureInfo <-function(svm_info=NULL){
+  n_min <-(-1000)
+  dt <- data.frame("id"=svm_info$V1, "DP"=n_min, "QS"=n_min,
+                   "VDB"=n_min,"SGB"=n_min,"RPB"=n_min, "MQB"=n_min,"MQSB"=n_min, "BQB"=n_min)
+
+  svm_info[is.na(svm_info)] <-"-1000"
+  for(i in seq(1,nrow(svm_info),1)){
+    for(j in seq(1,ncol(svm_info),1)){
+      if(svm_info[i,j]=="DP"){dt$DP[i] <- svm_info[i,j+1]}
+      if(svm_info[i,j]=="QS"){dt$QS[i] <- svm_info[i,j+1]}
+      if(svm_info[i,j]=="VDB"){dt$VDB[i] <- svm_info[i,j+1]}
+      if(svm_info[i,j]=="SGB"){dt$SGB[i] <- svm_info[i,j+1]}
+      if(svm_info[i,j]=="RPB"){dt$RPB[i] <- svm_info[i,j+1]}
+      if(svm_info[i,j]=="MQB"){dt$MQB[i] <- svm_info[i,j+1]}
+      if(svm_info[i,j]=="MQSB"){dt$MQSB[i] <- svm_info[i,j+1]}
+      if(svm_info[i,j]=="BQB"){dt$BQB[i] <- svm_info[i,j+1]}
+    }
+  }
+  return(dt)
+}
+
+SNV_block <-function(summary=NULL){
+  block <- 1
+  last <- ".|."
+  summary$region <- 0
+  for(i in seq(1,nrow(summary),1)){
+    if(summary[i,4]==".|."){
+      if(last==".|." | last=="0|0"){;}
+      else{
+        block <- block + 1
+      }
+      summary$region[i] <- block
+    }
+    last <- summary[i,4]
+  }
+  return(summary)
+}
+
+
+SVM_prepare <-function(x=NULL){
+  svm<-list()
+  a<- table(x$region)
+  neg <- x[(x$region%in% names(a)[a>=4] & x$V4==".|.") | (x$V4=="0|0" & x$baf>0),]
+  #neg <- x[(x$region%in% names(a)[a>=3] & x$V4==".|.") | x$V4=="0|0",]
+  #neg <- x[x$V4=="0|0" | ,]
+  pos <- x[!x$V4==".|." & !x$V4=="0|0" &x$baf>0.4,]
+  svm$neg <- neg
+  svm$pos <- pos
+  svm$test <- x[x$region%in% names(a)[a<4] & x$V4==".|.",]
+  return(svm)
+}
+
+SVM_train <- function(label=NULL, data=NULL, downsampling=20){
+
+
+  print("Run SVM training")
+
+  feature <-c("QS", "VDB", "SGB", "RPB", "MQB", "MQSB", "BQB", "BAF")
+  data$BQB <- as.numeric(data$BQB)
+
+  data$VDB[data$VDB==(-1000)] <-0
+  data$MQSB[data$MQSB==(-1000)] <-1
+  data$QS <- abs(data$QS-0.5)
+  x_pos <- data[data$id%in%label$pos$V2,feature]
+  #x_pos <- x_pos[seq(1,nrow(x_pos),downsampling),]
+  x_neg <- data[data$id%in%label$neg$V2,feature]
+
+  y_pos <- rep("POS", nrow(x_pos))
+  y_neg <- rep("NEG", nrow(x_neg))
+
+  if(nrow(x_pos) > nrow(x_neg)){
+    # randomly select matched variants between pos and neg
+    sel <- sample(nrow(x_pos))[seq(1,nrow(x_neg),1)]
+    x_pos <- x_pos[sel,]
+    y_pos <- y_pos[sel]
+  }
+  if(nrow(x_pos) < nrow(x_neg)){
+    # randomly select matched variants between pos and neg
+    sel <- sample(nrow(x_neg))[seq(1,nrow(x_pos),1)]
+    x_neg <- x_neg[sel,]
+    y_neg <- y_neg[sel]
+  }
+
+  print(paste0("pos->:", length(y_pos)))
+  print(paste0("neg->:", length(y_neg)))
+
+  model <- svm(as.matrix(rbind(x_pos, x_neg)), as.factor(c(y_pos,y_neg)), probability=TRUE)
+  test <- data[data$id%in%label$test$V2,feature]
+  pred <- predict(model, as.matrix(test), probability=TRUE)
+  prob <- attr(pred, "probabilities")
+  prob <-as.data.frame(prob)
+  label$test$POS <- prob$POS
+  label$test$NEG <- prob$NEG
+  return(label)
+}
+
+calcP <- function(geno_vec, dis_vec, dis_limit=20000){
+
+  dis_bin <- c(100, 1000, 5000, 10000, 20000, 500000, 1000000000000000000)
+  prob_bin <- c(1, 0.93, 0.78, 0.71, 0.69, 0.64, 0)
+
+  prob_bin[dis_bin>dis_limit] <- 0
+
+  pos_match <- which(geno_vec%in%c("000","111"))
+  dis_match <- dis_vec[pos_match]
+
+  match_p <- 0
+  for(pos in dis_match){
+    if(pos<=dis_bin[1]){p <-1}
+    else{
+      p <- prob_bin[max(which(pos>=dis_bin))+1]
+    }
+    match_p <- match_p + p/sqrt(pos)
+  }
+
+  pos_mismatch <- which(geno_vec%in%c("010","101"))
+  dis_mismatch <- dis_vec[pos_mismatch]
+
+
+  mismatch_p <- 0
+  for(pos in dis_mismatch){
+    if(pos<=dis_bin[1]){p <-1}
+    else{
+      p <- prob_bin[max(which(pos>=dis_bin))+1]
+    }
+    mismatch_p <-mismatch_p + p/sqrt(pos)
+  }
+
+  if((match_p + mismatch_p)==0){p <- (-1)}
+  else{
+    p <- match_p/(match_p+mismatch_p)
+  }
+
+  return(p)
+}
+
+Phasing <- function(x = NULL, snv_meta = NULL, dis_limit = 20000, readlength=NULL){
+
+  start <- 1
+  if(start){
+    y <- x
+    dis <- colsplit(y$V1,":", c("id1","id2"))[,2]
+    dis_rd_study <-c()
+    trio_hap_rd_study <-c()
+    hap_rd2 <-c()
+    dis_rd2 <-c()
+    cnt <- 0
+
+    check_study <- snv_meta
+    check_study$prob <- (-1000)
+    check_study$tol <- 0
+    check_study$dis <- (-1000)
+
+    check_study$prob_phy <- (-1000)
+    check_study$tol_phy <- 0
+    res<-c()
+
+    pos_lst_study <-list()
+    for(i in seq(5,ncol(y),1)){
+      vec <- y[,i]
+      pos_lst_study[[i]] <- which(!vec=="0|0" & !y$V4==".|." & !vec=="0/0")
+    }
+  }
+
+  for(snv in snv_meta$V2){
+    cnt=cnt+1
+    #print(paste0("scanning ",cnt," now..."))
+    snv_index <- which(y$V2==snv)
+    snv_index_noMisCol <- which(!y[snv_index,]=="0|0" & !y[snv_index,]=="0/0",)
+
+    # remove the first column since they are not genotypes
+    snv_index_noMisCol <- snv_index_noMisCol[snv_index_noMisCol>4]
+    geno_vec <-c()
+    dis_vec <-c()
+    dis_phy1 <- c()
+    dis_phy2 <- c()
+    # identify haplotypes for each element
+    index <-0
+    for(pos in snv_index_noMisCol){
+      # element index (snv_index, pos)
+      pos_noMisCol <- pos_lst_study[[pos]]
+      lower_vec <- which(pos_noMisCol<snv_index)
+      upper_vec <- which(pos_noMisCol>snv_index)
+      if(length(lower_vec)>=1 & length(upper_vec)>=1){
+        lower <- pos_noMisCol[max(lower_vec)]
+        upper <- pos_noMisCol[min(upper_vec)]
+        tp <- y[snv_index, pos]
+        tp <- gsub("/", "|", tp)
+        geno <- c(y[lower,pos], tp, y[upper, pos])
+
+        part <-colsplit(geno,"\\|",c("i1","i2"))[,2]
+        part[part>1] <- 1
+        flag <- paste0(part[1],part[2],part[3])
+        hap_rd2 <-c(hap_rd2, abs(part[2]-part[1]), abs(part[3]-part[2]))
+        dis_rd2 <-c(dis_rd2, dis[snv_index]-dis[lower], dis[upper]-dis[snv_index])
+
+        dis_vec <- c(dis_vec, dis[upper]-dis[lower])
+        dis_phy1 <- c(dis_phy1, dis[snv_index]-dis[lower])
+        dis_phy2 <- c(dis_phy2, dis[upper]-dis[snv_index])
+        geno_vec <-c(geno_vec,flag)
+      }
+    }
+
+    sum <- 0
+    sum_equal <- 0
+    if(sum(dis_phy10){
+      p <- sum_equal/sum
+      check_study$prob_phy[cnt] <- p
+      check_study$tol_phy[cnt] <- sum
+      print(paste0("Scan ", cnt,":", snv, " ", sum, " ", p))
+    }
+
+    if(length(geno_vec)>0 ){
+      dis_rd_study <-c(dis_rd_study, dis_vec)
+      trio_hap_rd_study <-c(trio_hap_rd_study, geno_vec)
+      #print(table(trio_hap_rd))
+      check_study$prob[cnt] <- calcP(geno_vec,dis_vec,dis_limit)
+      check_study$tol[cnt] <- sum(dis_vec