diff --git a/src/AlleleDropOut_germline.R b/src/AlleleDropOut_germline.R
new file mode 100644
index 0000000..89bf0da
--- /dev/null
+++ b/src/AlleleDropOut_germline.R
@@ -0,0 +1,187 @@
+library(reshape2)
+calcP <- function(geno_vec, dis_vec, dis_limit=20000){
+
+  dis_bin <- c(100, 1000, 5000, 10000, 20000, 500000, 1000000000000000000)
+  prob_bin <- c(1, 0.93, 0.78, 0.71, 0.69, 0.64, 0)
+
+  prob_bin[dis_bin>dis_limit] <- 0
+
+  pos_match <- which(geno_vec%in%c("000","111"))
+  dis_match <- dis_vec[pos_match]
+
+  match_p <- 0
+  for(pos in dis_match){
+    if(pos<=dis_bin[1]){p <-1}
+    else{
+      p <- prob_bin[max(which(pos>=dis_bin))+1]
+    }
+    match_p <- match_p + p/sqrt(pos)
+  }
+
+  pos_mismatch <- which(geno_vec%in%c("010","101"))
+  dis_mismatch <- dis_vec[pos_mismatch]
+
+
+  mismatch_p <- 0
+  for(pos in dis_mismatch){
+    if(pos<=dis_bin[1]){p <-1}
+    else{
+      p <- prob_bin[max(which(pos>=dis_bin))+1]
+    }
+    mismatch_p <-mismatch_p + p/sqrt(pos)
+  }
+
+  if(is.na(match_p) | is.na(mismatch_p)){p<-(-1)}
+  else{
+    if((match_p + mismatch_p) ==0){
+      p <- (-1)
+    }else{
+      p <- match_p/(match_p+mismatch_p)
+    }
+  }
+
+  return(p)
+}
+
+
+
+
+
+args = commandArgs(trailingOnly=TRUE)
+
+chr <- args[1]
+out <- args[2]
+len <- as.numeric(args[4])
+bin <- as.numeric(args[5])
+
+load(paste0(args[3],chr,".input.image.RDS"))
+
+
+
+germline <- mat_qc[!mat_qc$V4=="0|0" & !mat_qc$V4==".|.",]
+
+print(dim(germline))
+
+y <- germline
+dis <- colsplit(y$V1,":", c("id1","id2"))[,2]
+check <- germline[,c(1,2,3,4)]
+check$prob <- (-1000)
+check$tol <- 0
+check$dis <- (-1000)
+
+check$prob_phy <- (-1000)
+check$tol_phy <- 0
+
+pos_lst <-list()
+sta <- 0
+for(i in seq(5,ncol(y),1)){
+  vec <- y[,i]
+  pos_lst[[i]] <- which(!vec=="0|0" & !y$V4==".|.")
+  sta <- sta + length(which(!vec=="0|0" & !y$V4==".|."))
+}
+
+
+dis_rd <-c()
+trio_hap_rd <-c()
+cnt <- 0
+dis_limit <- 20000
+
+lst <- germline$V2
+dis_rd2 <-c()
+hap_rd2 <-c()
+
+for(snv in lst[seq(1,length(lst),bin)]){
+  cnt=cnt+1
+  #print(paste0("scanning ",cnt," now..."))
+  snv_index <- which(y$V2==snv)
+  snv_index_noMisCol <- which(!y[snv_index,]=="0|0" & !y[snv_index,]=="0/0",)
+
+  # remove the first column since they are not genotypes
+  snv_index_noMisCol <- snv_index_noMisCol[snv_index_noMisCol>4]
+
+  geno_vec <-c()
+  dis_vec <-c()
+  dis_phy1 <- c()
+  dis_phy2 <- c()
+  # identify haplotypes for each element
+  index <-0
+  for(pos in snv_index_noMisCol){
+    # element index (snv_index, pos)
+    pos_noMisCol <- pos_lst[[pos]]
+    lower_vec <- which(pos_noMisCol<snv_index)
+    upper_vec <- which(pos_noMisCol>snv_index)
+
+    if(length(lower_vec)>=1 & length(upper_vec)>=1){
+      lower <- pos_noMisCol[max(lower_vec)]
+      upper <- pos_noMisCol[min(upper_vec)]
+      tp <- y[snv_index, pos]
+      tp <- gsub("/", "|", tp)
+      geno <- c(y[lower,pos], tp, y[upper, pos])
+
+      part <-colsplit(geno,"\\|",c("i1","i2"))[,2]
+      part[part>1] <- 1
+      flag <- paste0(part[1],part[2],part[3])
+      hap_rd2 <-c(hap_rd2, abs(part[2]-part[1]), abs(part[3]-part[2]))
+      dis_rd2 <-c(dis_rd2, dis[snv_index]-dis[lower], dis[upper]-dis[snv_index])
+
+      dis_vec <- c(dis_vec, dis[upper]-dis[lower])
+      dis_phy1 <- c(dis_phy1, dis[snv_index]-dis[lower])
+      dis_phy2 <- c(dis_phy2, dis[upper]-dis[snv_index])
+      geno_vec <-c(geno_vec,flag)
+    }
+  }
+
+  sum <- 0
+  sum_equal <- 0
+
+  if(!is.null(dis_phy1) & sum(dis_phy10){
+    p <- sum_equal/sum
+    check$prob_phy[cnt] <- p
+    check$tol_phy[cnt] <- sum
+    print(paste0("Scan ", cnt,":", snv, " ", sum, " ", p, " ",len))
+  }
+
+  if(length(geno_vec)>0 ){
+    dis_rd <-c(dis_rd, dis_vec)
+    trio_hap_rd <-c(trio_hap_rd, geno_vec)
+    #print(table(trio_hap_rd))
+    sel
<-list() + sel <- table(geno_vec[dis_vec<1000000]) + + if(length(sel)>0){ + check$prob[cnt] <- 0 + check$tol[cnt] <- sum(dis_vec=2] + filter <- names(region_cnt)[region_cnt>=4] neg <- x[(x$region%in% filter & x$genotype==".|."),] pos <- x[!x$genotype==".|.",] svm$neg <- neg @@ -49,7 +48,6 @@ SVM_train <- function(label=NULL, dir=NULL, region=NULL){ label$pos[label$pos=="None"] <- NA # using median values to replace the missing values train_x_pos <- impute(as.matrix(data.matrix(label$pos[,features])), what="median") - train_x_pos[is.na(train_x_pos)] <- 0 # using the minior value of QS vec <- train_x_pos[,colnames(train_x_pos)=="QS"] @@ -59,7 +57,6 @@ SVM_train <- function(label=NULL, dir=NULL, region=NULL){ label$neg <- as.data.frame(label$neg) label$neg[label$neg=="None"] <- NA train_x_neg <- impute(as.matrix(data.matrix(label$neg[,features])), what="median") - train_x_neg[is.na(train_x_neg)] <- 0 vec <- train_x_neg[,colnames(train_x_neg)=="QS"] vec[vec>0.5] <- 1- vec[vec>0.5] train_x_neg[,1] <- vec @@ -67,7 +64,6 @@ SVM_train <- function(label=NULL, dir=NULL, region=NULL){ label$test <- as.data.frame(label$test) label$test[label$test=="None"] <- NA test_x <- impute(as.matrix(data.matrix(label$test[,features])), what="median") - test_x[is.na(test_x)] <- 0 vec <- test_x[,colnames(test_x)=="QS"] vec[vec>0.5] <- 1- vec[vec>0.5] test_x[,1] <- vec @@ -83,10 +79,12 @@ SVM_train <- function(label=NULL, dir=NULL, region=NULL){ print(p) } dev.off() - + + model <- svm(as.matrix(rbind(train_x_pos, train_x_neg)), as.factor(c(train_y_pos,train_y_neg)), probability=TRUE) + print("Done") pred <- predict(model, as.matrix(test_x), probability=TRUE) prob <- attr(pred, "probabilities") prob <-as.data.frame(prob) @@ -410,10 +408,11 @@ somaticLD <- function(mat=NULL, svm=NULL, dir=NULL, region=NULL, min_size=50){ dt <- fread(mat_gz) dt <- data.frame(dt) -dt$V10[is.na(dt$V10)] <- ".|." +dt$V10[dt$V10=="nan"] <- ".|." 
rownames(dt) <- paste0(dt$V1,":",dt$V2,":",dt$V3,":",dt$V4) -cellName <- read.table(file=paste0(outdir,region,".cell.txt"),header=F) +cellName <- read.csv(file=paste0(outdir,region,".cell_snv.cellID.filter.csv"),header=T) +cellName <- cellName[,2] for(j in seq(11,18,1)){ dt[,j] <- as.numeric(dt[,j]) @@ -422,7 +421,6 @@ meta <- dt[,1:18] colnames(meta) <-c("chr","pos","ref","alt","Dep","dep1","dep2","dep3","dep4", "genotype","QS","VDB","RPB","MQB","BQB","MQSB","SGB","MQ0F") - mutation_block <- SNV_block(summary=meta) svm_in <- SVM_prepare(mutation_block) svm_out <- SVM_train(label =svm_in,dir=outdir, region=region) @@ -431,7 +429,25 @@ final <- somaticLD(mat=dt, svm=svm_out, dir=outdir, region=region) mut_mat <- dt[rownames(final$out),] -colnames(mut_mat) <-c(colnames(meta),cellName[,1]) +colnames(mut_mat) <-c(colnames(meta),cellName) + + +#### re-assign the sequencing depth for each allele + +for(i in seq(1, nrow(mut_mat),1)){ + N_wild <- 0 + N_mut <- 0 + vec <- mut_mat[i,seq(19, ncol(mut_mat),1)] + sta <- unname(unlist(vec)) + sta <- table(c(sta,"0/1","1/0","1/1")) + sta <- sta - 1 + N_alt <- sta["0/1"] + sta["1/1"] + N_ref <- sta["1/0"] + sta["1/1"] + LDrefine_somatic[i,6] <- N_ref + LDrefine_somatic[i,7] <- N_alt + LDrefine_somatic[i,8] <- N_ref + N_alt + LDrefine_somatic[i,12] <- N_alt/(N_ref+N_alt) +} write.csv(final$LDrefine_somatic, paste0(outdir,region,".putativeSNVs.csv"),quote=FALSE,row.names = FALSE) write.csv(final$LDrefine_germline2, paste0(outdir,region,".germlineTwoLoci_model.csv"),quote=FALSE, row.names = FALSE) diff --git a/src/Monopogen.py b/src/Monopogen.py index 1836746..d656463 100644 --- a/src/Monopogen.py +++ b/src/Monopogen.py @@ -17,6 +17,7 @@ import numpy as np import gzip from pysam import VariantFile +from bamProcess import * from germline import * from somatic import * import multiprocessing as mp @@ -46,6 +47,18 @@ +def error_check(all, output, step): + job_fail = 0 + for id in all: + if id not in output: + logger.error("In "+ step + " step " + id + " failed!") + job_fail = job_fail + 1 + + if job_fail > 0: + logger.error("Failed! 
See instructions above.") + exit(1) + + def germline(args): logger.info("Performing germline variant calling...") @@ -74,13 +87,13 @@ def germline(args): jobid = record[0] + ":" + record[1] + "-" + record[2] bam_filter = args.out + "/Bam/" + record[0] + ".filter.bam.lst" imputation_vcf = args.imputation_panel + "CCDG_14151_B01_GRM_WGS_2020-08-05_" + record[0] + ".filtered.shapeit2-duohmm-phased.vcf.gz" - cmd1 = samtools + " mpileup -b " + bam_filter + " -f " + args.reference + " -r " + jobid + " -q 20 -Q 20 -t DP -d 10000000 -v " + cmd1 = samtools + " mpileup -b" + bam_filter + " -f " + args.reference + " -r " + jobid + " -q 20 -Q 20 -t DP -d 10000000 -v " cmd1 = cmd1 + " | " + bcftools + " view " + " | " + bcftools + " norm -m-both -f " + args.reference cmd1 = cmd1 + " | grep -v \"\" | grep -v INDEL |" + bgzip + " -c > " + args.out + "/germline/" + jobid + ".gl.vcf.gz" #cmd2 = bcftools + " view " + out + "/germline/" + jobid + ".gl.vcf.gz" + " -i 'FORMAT/DP>1' | " + bcftools + " call -cv | " + bgzip + " -c > " + args.out + "/SCvarCall/" + jobid + ".gt.vcf.gz" cmd3 = java + " -Xmx20g -jar " + beagle + " gl=" + out + "/germline/" + jobid + ".gl.vcf.gz" + " ref=" + imputation_vcf + " chrom=" + record[0] + " out=" + out + "/germline/" + jobid + ".gp " + "impute=false modelscale=2 nthreads=24 gprobs=true niterations=0" - cmd4 = "zless -S " + out + "/germline/" + jobid + ".gp.vcf.gz > " + out + "/germline/" + jobid + ".germline.vcf" + cmd4 = "zless -S " + out + "/germline/" + jobid + ".gp.vcf.gz | grep -v 0/0 > " + out + "/germline/" + jobid + ".germline.vcf" cmd5 = java + " -Xmx20g -jar " + beagle + " gt=" + out + "/germline/" + jobid + ".germline.vcf" + " ref=" + imputation_vcf + " chrom=" + record[0] + " out=" + out + "/germline/" + jobid+ ".phased " + "impute=false modelscale=2 nthreads=24 gprobs=true niterations=0" f_out = open(out + "/Script/runGermline_" + jobid + ".sh","w") if args.step == "varScan" or args.step == "all": @@ -106,19 +119,6 @@ def germline(args): -def sort_chr(chr_lst): - # sort chr IDs from 1...22 - chr_lst_sort = [] - for i in range(1, 23): - i = str(i) - if i in chr_lst: - chr_lst_sort.append(i) - i_chr = "chr"+i - if i_chr in chr_lst: - chr_lst_sort.append(i_chr) - chr_lst = chr_lst_sort - return chr_lst - def somatic(args): @@ -138,120 +138,28 @@ def somatic(args): region_lst.append(region) chr_lst = list(set(chr_lst)) - if args.step=="featureInfo" or args.step=="all": - logger.info("Get feature information from sequencing data...") - + + logger.info("Get feature information from sequencing data...") joblst = [] for id in region_lst: - joblst.append(id+">"+args.out) + joblst.append(id+">"+args.out+">"+args.app_path) + with Pool(processes=args.nthreads) as pool: result = pool.map(featureInfo, joblst) + error_check(all = chr_lst, output = result, step = "LDrefinement") if args.step=="cellScan" or args.step=="all": - logger.info("Get single cell level information from sequencing data...") - + logger.info("Collect single cell level information from sequencing data...") chr_lst = sort_chr(chr_lst) - - run = 1 - if run: - joblst = [] - for id in chr_lst: - joblst.append(id+">"+args.out+">"+args.app_path) - with Pool(processes=args.nthreads) as pool: - result = pool.map(bamExtract, joblst) - - ####### merge bams from different chromosomes - bamlst = [] - print(chr_lst) - for chr in chr_lst: - bam_filter = out + "/Bam/" + chr + ".filter.targeted.bam" - bamlst.append(bam_filter) - print(bamlst) - output_bam = out + "/Bam/merge.filter.targeted.bam" - 
pysam.merge("-f","-o",output_bam,*bamlst) - os.system(samtools + " index " + output_bam) - - if run: - os.system("mkdir -p " + out + "/Bam/split_bam/") - cell_clst = pd.read_csv(args.barcode) - df = pd.DataFrame(cell_clst, columns= ['cell','id']) - df = df.sort_values(by=['id']) - args.keep = float(args.keep) - if args.keep < 1: - dis = np.cumsum(df['id'])/np.sum(df['id']) - N = sum(dis>(1-args.keep)) - df = df.iloc[-(N):] - cell_lst = df['cell'].unique() - joblst = [] - - for cell in cell_lst: - para = "merge" + ":" + cell + ":" + args.out + ":" + args.app_path - joblst.append(para) - - with Pool(processes=args.nthreads) as pool: - result = pool.map(bamSplit, joblst) - - if sum(result)==0: - logger.error("No reads detected for cells in " + args.barcode + ". Please check 1) the input cell barcode file is matched with bam file; 2) the cell barcode name has the same format shown in bam file. For example XX-1!") - logger.error("Failed! See instructions above.") - exit(1) - - # generate the bam file list - cell_bam = open(out + "/Bam/split_bam/cell.bam.lst","w") - for cell in cell_lst: - cell_bam.write(out + "/Bam/split_bam/" + cell + ".bam\n") - cell_bam.close() - - - region_lst = [] - if args.winSize=="10MB": - region_file = args.app_path + "/../resource/GRCh38.region.10MB.lst" - if args.winSize=="50MB": - region_file = args.app_path + "/../resource/GRCh38.region.50MB.lst" - with open(region_file) as f_in: - for line in f_in: - record = line.strip().split(",") - if(len(record)==1): - region = record[0] - if(len(record)==3): - region = record[0] + ":" + record[1] + "-" + record[2] - if record[0] in chr_lst: - region_lst.append(region) - - joblst = [] - for id in region_lst: - record = id.strip().split(":") - chr = record[0] - joblst.append(id+">"+chr+">"+args.out+">"+args.app_path+">"+args.reference) - - - with Pool(processes=args.nthreads) as pool: - result = pool.map(jointCall, joblst) - error_check(all = region_lst, output = result, step = "cellScan:joint calling") - - - ##### merge vcfs from multiple regions - - for id in chr_lst: - tp = "" - for reg in region_lst: - if re.search(id+":", reg): - tp = tp + " " + args.out+"/somatic/" + reg + ".cell.gl.vcf.gz" - cmd = args.app_path + "/bcftools concat -o " + args.out+"/somatic/" + id + ".cell.gl.vcf.gz " + tp + " -O z" - print(cmd) - output = os.system(cmd) - joblst = [] for id in chr_lst: - joblst.append(id+">"+args.out) + joblst.append(id+">"+args.out+">"+args.reference+">"+args.barcode) with Pool(processes=args.nthreads) as pool: - result = pool.map(vcf2mat, joblst) - error_check(all = chr_lst, output = result, step = "cellScan:vcf2mat") - - + result = pool.map(bam2mat, joblst) + error_check(all = chr_lst, output = result, step = "cellScan") if args.step=="LDrefinement" or args.step=="all": logger.info("Run LD refinement ...") @@ -265,18 +173,6 @@ def somatic(args): -def error_check(all, output, step): - job_fail = 0 - for id in all: - if id not in output: - logger.error("In "+ step + " step " + id + " failed!") - job_fail = job_fail + 1 - - if job_fail > 0: - logger.error("Failed! 
See instructions above.") - exit(1) - - def preProcess(args): logger.info("Performing data preprocess before variant calling...") @@ -392,9 +288,6 @@ def main(): help="The app library paths used in the tool") parser_somatic.add_argument('-t', '--nthreads', required=False, type=int, default=22, help="Number of jobs used for SNV calling") - parser_somatic.add_argument('-w', '--winSize', required=False, default="50MB", - choices=['10MB','50MB'], - help="Split the chromosome into small segments for cell level sequencing information collection. Setting 10MB will generate more jobs but be faster") parser_somatic.add_argument('-s', '--step', required=True, choices=['featureInfo', 'cellScan' , 'LDrefinement', 'monovar', 'all'], help="Run germline variant calling step by step") diff --git a/src/__pycache__/Single_Cell_Ftrs_Pos.cpython-37.pyc b/src/__pycache__/Single_Cell_Ftrs_Pos.cpython-37.pyc index 8fec6aa..a5a64d4 100644 Binary files a/src/__pycache__/Single_Cell_Ftrs_Pos.cpython-37.pyc and b/src/__pycache__/Single_Cell_Ftrs_Pos.cpython-37.pyc differ diff --git a/src/__pycache__/alleles_prior.cpython-37.pyc b/src/__pycache__/alleles_prior.cpython-37.pyc index c8603e8..c049589 100644 Binary files a/src/__pycache__/alleles_prior.cpython-37.pyc and b/src/__pycache__/alleles_prior.cpython-37.pyc differ diff --git a/src/__pycache__/bamProcess.cpython-37.pyc b/src/__pycache__/bamProcess.cpython-37.pyc new file mode 100644 index 0000000..38ff890 Binary files /dev/null and b/src/__pycache__/bamProcess.cpython-37.pyc differ diff --git a/src/__pycache__/base_q_ascii.cpython-37.pyc b/src/__pycache__/base_q_ascii.cpython-37.pyc index cab1805..00a9332 100644 Binary files a/src/__pycache__/base_q_ascii.cpython-37.pyc and b/src/__pycache__/base_q_ascii.cpython-37.pyc differ diff --git a/src/__pycache__/calc_variant_prob.cpython-37.pyc b/src/__pycache__/calc_variant_prob.cpython-37.pyc index e2f2b85..de57418 100644 Binary files a/src/__pycache__/calc_variant_prob.cpython-37.pyc and b/src/__pycache__/calc_variant_prob.cpython-37.pyc differ diff --git a/src/__pycache__/genotype_prob_mat.cpython-37.pyc b/src/__pycache__/genotype_prob_mat.cpython-37.pyc index 729b60b..64f2a24 100644 Binary files a/src/__pycache__/genotype_prob_mat.cpython-37.pyc and b/src/__pycache__/genotype_prob_mat.cpython-37.pyc differ diff --git a/src/__pycache__/germline.cpython-37.pyc b/src/__pycache__/germline.cpython-37.pyc new file mode 100644 index 0000000..8758408 Binary files /dev/null and b/src/__pycache__/germline.cpython-37.pyc differ diff --git a/src/__pycache__/hzvcf.cpython-37.pyc b/src/__pycache__/hzvcf.cpython-37.pyc index 3555def..1ec69ed 100644 Binary files a/src/__pycache__/hzvcf.cpython-37.pyc and b/src/__pycache__/hzvcf.cpython-37.pyc differ diff --git a/src/__pycache__/mp_genotype.cpython-37.pyc b/src/__pycache__/mp_genotype.cpython-37.pyc index 6c5ee75..4c1476b 100644 Binary files a/src/__pycache__/mp_genotype.cpython-37.pyc and b/src/__pycache__/mp_genotype.cpython-37.pyc differ diff --git a/src/__pycache__/nu_genotype_single_cell.cpython-37.pyc b/src/__pycache__/nu_genotype_single_cell.cpython-37.pyc index a932092..958cc01 100644 Binary files a/src/__pycache__/nu_genotype_single_cell.cpython-37.pyc and b/src/__pycache__/nu_genotype_single_cell.cpython-37.pyc differ diff --git a/src/__pycache__/nu_prob_mat.cpython-37.pyc b/src/__pycache__/nu_prob_mat.cpython-37.pyc index 0e0d861..abafae6 100644 Binary files a/src/__pycache__/nu_prob_mat.cpython-37.pyc and b/src/__pycache__/nu_prob_mat.cpython-37.pyc differ diff --git 
a/src/__pycache__/somatic.cpython-37.pyc b/src/__pycache__/somatic.cpython-37.pyc new file mode 100644 index 0000000..45d75dd Binary files /dev/null and b/src/__pycache__/somatic.cpython-37.pyc differ diff --git a/src/__pycache__/utils.cpython-37.pyc b/src/__pycache__/utils.cpython-37.pyc index abcf3b2..39b2b76 100644 Binary files a/src/__pycache__/utils.cpython-37.pyc and b/src/__pycache__/utils.cpython-37.pyc differ diff --git a/src/bamProcess.py b/src/bamProcess.py new file mode 100644 index 0000000..d6c3e9f --- /dev/null +++ b/src/bamProcess.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +""" +The main interface of scPopGene +""" + +import argparse +import sys +import os +import logging +import shutil +import glob +import re +import pysam +import time +import subprocess +import pandas as pd +from pysam import VariantFile + +LIB_PATH = os.path.abspath( + os.path.join(os.path.dirname(os.path.realpath(__file__)), "pipelines/lib")) + +if LIB_PATH not in sys.path: + sys.path.insert(0, LIB_PATH) + +PIPELINE_BASEDIR = os.path.dirname(os.path.realpath(sys.argv[0])) +CFG_DIR = os.path.join(PIPELINE_BASEDIR, "cfg") + +#import pipelines +#from pipelines import get_cluster_cfgfile +#from pipelines import PipelineHandler + +# global logger +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) +handler = logging.StreamHandler() +handler.setFormatter(logging.Formatter( + '[{asctime}] {levelname:8s} {filename} {message}', style='{')) +logger.addHandler(handler) + + + +def addChr(args): + # edit the sequence names for your output header + in_bam = args.bamFile + prefix = 'chr' + out_bam=in_bam+"tmp.bam" + print(in_bam) + input_bam = pysam.AlignmentFile(in_bam,"rb") + new_head = input_bam.header.to_dict() + for seq in new_head['SQ']: + seq['SN'] = prefix + seq['SN'] + # create output BAM with newly defined header + with pysam.AlignmentFile(out_bam, "wb", header=new_head) as outf: + for read in input_bam.fetch(): + prefixed_chrom = prefix + read.reference_name + a = pysam.AlignedSegment(outf.header) + a.query_name = read.query_name + a.query_sequence = read.query_sequence + a.reference_name = prefixed_chrom + a.flag = read.flag + a.reference_start = read.reference_start + a.mapping_quality = read.mapping_quality + a.cigar = read.cigar + a.next_reference_id = read.next_reference_id + a.next_reference_start = read.next_reference_start + a.template_length = read.template_length + a.query_qualities = read.query_qualities + a.tags = read.tags + outf.write(a) + input_bam.close() + outf.close() + os.system("samtools index " + out_bam) + os.system(" mv " + out_bam + " " + in_bam) + os.system(" mv " + out_bam + ".bai " + in_bam + ".bai") + + + + +def sort_chr(chr_lst): + # sort chr IDs from 1...22 + chr_lst_sort = [] + for i in range(1, 23): + i = str(i) + if i in chr_lst: + chr_lst_sort.append(i) + i_chr = "chr"+i + if i_chr in chr_lst: + chr_lst_sort.append(i_chr) + chr_lst = chr_lst_sort + return chr_lst + + + + +def bamSplit(para): + para_lst = para.strip().split(":") + chr = para_lst[0] + cell = para_lst[1] + out = para_lst[2] + app_path = para_lst[3] + + #assert os.path.isfile(bam_filter), "*.fiter.targeted.bam file {} cannot be found!".format(bam_filter) + samtools = app_path + "/samtools" + output_bam = out + "/Bam/merge.filter.targeted.bam" + infile = pysam.AlignmentFile(output_bam,"rb") + # Note to change the read groups + tp =infile.header.to_dict() + if len(tp['RG'])>1: + tp['RG']= [tp['RG'][0]] + tp['RG'][0]['SM'] = cell + tp['RG'][0]['ID'] = cell + cnt = 0 + outfile = 
pysam.AlignmentFile( out + "/Bam/split_bam/" + cell + ".bam", "wb", header=tp) + for s in infile: + t = robust_get_tag(s,"CB") + if not t=="NotFound": + if t==cell: + outfile.write(s) + cnt = cnt +1 + else: + if re.search(cell, s.query_name): + outfile.write(s) + cnt = cnt + 1 + + outfile.close() + infile.close() + cmd=samtools + " index " + out + "/Bam/split_bam/" + cell + ".bam" + + os.system(samtools + " index " + out + "/Bam/split_bam/" + cell + ".bam") + + return(cnt) + + + + +def jointCall(para): + + para_lst = para.strip().split(">") + jobid = para_lst[0] + chr=para_lst[1] + out = para_lst[2] + app_path = para_lst[3] + reference = para_lst[4] + samtools = app_path + "/samtools" + bcftools = app_path + "/bcftools" + bgzip = app_path + "/bgzip" + bam_filter = out + "/Bam/split_bam/cell.bam.lst" + cmd1 = samtools + " mpileup -b " + bam_filter + " -f " + reference + " -r " + jobid + " -q 20 -Q 20 -t DP4 -d 10000 -v " + cmd1 = cmd1 + " | " + bcftools + " view " + " | " + bcftools + " norm -m-both -f " + reference + cmd1 = cmd1 + " | grep -v \"\" | grep -v INDEL |" + bgzip + " -c > " + out + "/somatic/" + jobid + ".cell.gl.vcf.gz" + print(cmd1) + output = os.system(cmd1) + + # delete the bam files once snv calling was finished in specfic regions + f = open(bam_filter, "r") + for x in f: + x = x.strip() + #os.system("rm " + x) + #os.system("rm " + x + ".bai") + f.close() + + if output == 0: + return(jobid) diff --git a/src/haplotype_somatic.R b/src/haplotype_somatic.R new file mode 100644 index 0000000..33094ec --- /dev/null +++ b/src/haplotype_somatic.R @@ -0,0 +1,101 @@ + + +library(e1071) +library(reshape2) + +source("/rsrch3/scratch/bcb/jdou1/scAncestry/Monopogen/src/svm_phasing_sub.R") + +args = commandArgs(trailingOnly=TRUE) + +chr <- args[1] +loaddata <- args[2] +dis_limit_physical <- as.numeric(args[3]) +dis_limit_genetic <- as.numeric(args[4]) + + + + + +print(chr) + +if(loaddata==1){ + print("Loading data now...") + mat <- read.table(paste0(file="../germline/chr",chr,".gl.filter.hc.cell.mat.gz")) + #normal <- read.table(file=paste0("normal.",chr,".snv")) + #tumor <- read.table(file=paste0("tumor.",chr,".snv")) + #somatic <- read.table(file="somatic.snv") + feature_info <- read.csv(paste0(file="../debug/chr",chr,".gl.feature.info"),header=F) + + + mat$V2 <- paste0(mat$V1,":",mat$V2) + meta <- mat[,c("V1","V2","V3","V4")] + meta <- as.data.frame(meta) + meta$in_normal <- 0 + meta$in_tumor <- 0 + meta$somatic <- 0 + #meta$in_normal[meta$V2%in%normal$V1] <- 1 + #meta$in_tumor[meta$V2%in%tumor$V1] <- 1 + #meta$somatic[meta$V2%in%somatic$V1] <- 1 + meta$baf <- frq(mat) + + #meta$baf <-0.5 + # SVM module on remove low quality mutations + + + svm_info <- feature_info[feature_info$V1%in%meta$V2,] + svm_dt <- getFeatureInfo(svm_info=svm_info) + + overlap <- intersect(meta$V2, svm_dt$id) + + meta_qc <- meta[meta$V2%in%overlap,] + mat_qc <- mat[meta$V2%in%overlap,] + svm_dt_qc <- svm_dt[svm_dt$id%in%overlap,] + svm_dt_qc$BAF <- meta_qc$baf + save.image(file=paste0(chr,".input.image.RDS")) +} + +if(loaddata==2){ + load(file=paste0(chr,".input.image.RDS")) +} + + +source("/rsrch3/scratch/bcb/jdou1/scAncestry/Monopogen/src/svm_phasing_sub.R") +print(args) + + +print("start to run SVM and haplotype filtering") +print("new version ") + +# get putative mutation block +#meta_qc$flag <- paste0(meta_qc$in_normal, ":", meta_qc$in_tumor, ":", meta_qc$somatic) + + + + + +mutation_block <- SNV_block(summary=meta_qc) + +svm_in <- SVM_prepare(mutation_block) + +svm_out <- SVM_train(label =svm_in, 
data=svm_dt_qc, downsampling=2) + +p_lower <- sort(svm_out$test$POS)[floor(nrow(svm_out$test)*0.1)] +p_upper <- sort(svm_out$test$POS)[floor(nrow(svm_out$test)*0.95)] +filter1 <- (svm_out$test$POS>p_lower) +filter2 <- (svm_out$test$baf>0.1 ) +svm_pass <- svm_out$test[filter1 & filter2,] + + +svm_phasing <- Phasing(x=mat_qc, snv_meta=svm_pass, dis_limit=dis_limit_genetic, readlength=dis_limit_physical) + +svm_out$phasing <- svm_phasing +svm_out$feature <- svm_dt_qc + +saveRDS(svm_out,file=paste0("svm",chr,".out.RDS")) + + + +x=mat_qc +snv_meta=svm_pass +dis_limit=dis_limit_genetic +readlength=dis_limit_physical \ No newline at end of file diff --git a/src/index_motif.py b/src/index_motif.py new file mode 100644 index 0000000..e69de29 diff --git a/src/somatic.py b/src/somatic.py index ce40f21..8c49357 100644 --- a/src/somatic.py +++ b/src/somatic.py @@ -14,6 +14,7 @@ import numpy as np import gzip from pysam import VariantFile +from bamProcess import * import multiprocessing as mp from multiprocessing import Pool @@ -31,9 +32,9 @@ def withSNVs(invcf, path): return(cnt) def runCMD(cmd): - + print(cmd) os.system(cmd) - #process = subprocess.run(cmd, shell=True, stdout=open(args.logfile, 'w'), stderr=open(args.logfile,'w')) + #process = subprocess.run(cmd, shell=True) def robust_get_tag(read, tag_name): @@ -59,14 +60,10 @@ def validate_user_setting_somatic(args): exit(1) bam_filter = args.out + "/Bam/" + record[0] + ".filter.bam.lst" gl_vcf = args.out + "/germline/" + jobid + ".gl.vcf.gz" - gp_vcf = args.out + "/germline/" + jobid + ".germline.vcf.gz" phased_vcf = args.out + "/germline/" + jobid + ".phased.vcf.gz" assert os.path.isfile(bam_filter), "The bam list file {} cannot be found! Please run germline module".format(bam_filter) assert os.path.isfile(gl_vcf), "The *.gl.vcf.gz file {} cannot be found! Please run germline module".format(gl_vcf) - #if withSNVs(gl_vcf, args.app_path)==0: - # print("The *.gl.vcf.gz file {} cannot be found! Maybe there are no markers detected in ".format(gl_vcf)) - #if withSNVs(phased_vcf, args.app_path)==0: - # print("The *.phased.vcf.gz file {} cannot be found! Maybe there are no markers detected in this region?".format(phased_vcf)) + assert os.path.isfile(phased_vcf), "The *.phased.vcf.gz file {} cannot be found! 
Please run germline module".format(phased_vcf) assert os.path.isfile(args.barcode), "The cell barcode file {} cannot be found!".format(args.barcode) @@ -80,11 +77,36 @@ def getInfo_robust(rec, info): return(info_dt) -def featureInfo(para): +def bamExtract(para): + para_lst = para.strip().split(">") + chr = para_lst[0] out = para_lst[1] - region = para_lst[0] + app_path = para_lst[2] + samtools = os.path.abspath(app_path) + "/samtools" + out = os.path.abspath(out) + inbam = getBamName(chr, out) + outbam = out + "/Bam/" + chr + ".filter.targeted.bam" + # remember to update bed file ls -lrt + + #os.system("cat " + out + "/somatic/" + chr + "*.gl.vcf.filter.hc.bed > " + out + "/somatic/" + chr + ".bed") + chr_bed = out + "/somatic/" + chr + ".gl.vcf.filter.hc.bed" + cmd1 = samtools + " view " + " -b -L " + chr_bed + " " + inbam + " -o " + outbam + with open(out+"/Script/bamExtract_" + chr + ".sh","w") as f_out: + f_out.write(cmd1 + "\n") + f_out.write(samtools + " index " + outbam + "\n") + cmd="bash " + out+"/Script/bamExtract_" + chr + ".sh" + print(cmd) + os.system(cmd) +def featureInfo(para): + + para_lst = para.strip().split(">") + region = para_lst[0] + out = para_lst[1] + app = para_lst[2] + pysam.tabix_index(out + "/germline/" + region + ".phased.vcf.gz", preset="vcf", force=True) + pysam.tabix_index(out + "/germline/" + region + ".gl.vcf.gz", preset="vcf", force=True) vcf_in = VariantFile(out + "/germline/" + region + ".phased.vcf.gz") info_GT = {} for rec in vcf_in.fetch(): @@ -102,13 +124,13 @@ def featureInfo(para): for rec in gl_vcf_in.fetch(): info_I16 = rec.info.get('I16') info_QS = getInfo_robust(rec,'QS') - info_VDB = getInfo_robust(rec,'VDB') + info_VDB = getInfo_robust(rec,'VDB') info_RPB = getInfo_robust(rec,'RPB') info_MQB = getInfo_robust(rec,'MQB') info_BQB = getInfo_robust(rec,'BQB') - info_MQSB = getInfo_robust(rec,'MQSB') + info_MQSB =getInfo_robust(rec,'MQSB') info_SGB = getInfo_robust(rec,'SGB') - info_MQ0F = getInfo_robust(rec,'MQ0F') + info_MQ0F =getInfo_robust(rec,'MQ0F') id = str(rec.chrom)+":"+str(rec.pos) + ":" + rec.ref + ":" + rec.alts[0] gt_info_var = "NA" if (id in info_GT): @@ -126,6 +148,14 @@ def featureInfo(para): b = "{}\t{}\n".format(rec.chrom, rec.pos) gl_vcf_filter_txt.write(b) gl_vcf_filter_dp4.write(a) + gl_vcf_filter_dp4.close() + gl_vcf_filter_txt.close() + gl_vcf_filter_bed.close() + gl_vcf_dp4.close() + + + bamExtract(para) + def getBamName(chr, out): #chr = region.split(":")[0] @@ -135,97 +165,6 @@ def getBamName(chr, out): bamName = line.strip() return(bamName) -def bamExtract(para): - - para_lst = para.strip().split(">") - - chr = para_lst[0] - out = para_lst[1] - app_path = para_lst[2] - samtools = os.path.abspath(app_path) + "/samtools" - out = os.path.abspath(out) - inbam = getBamName(chr, out) - outbam = out + "/Bam/" + chr + ".filter.targeted.bam" - # remember to update bed file ls -lrt - - os.system("cat " + out + "/somatic/" + chr + "*.gl.vcf.filter.hc.bed > " + out + "/somatic/" + chr + ".bed") - cmd1 = samtools + " view " + " -b -L " + out + "/somatic/" + chr +".bed " + inbam + " -o " + outbam - with open(out+"/Script/bamExtract_" + chr + ".sh","w") as f_out: - f_out.write(cmd1 + "\n") - f_out.write(samtools + " index " + outbam + "\n") - cmd="bash " + out+"/Script/bamExtract_" + chr + ".sh" - runCMD(cmd) - - -def bamSplit(para): - para_lst = para.strip().split(":") - chr = para_lst[0] - cell = para_lst[1] - out = para_lst[2] - app_path = para_lst[3] - - #assert os.path.isfile(bam_filter), "*.fiter.targeted.bam file {} cannot be 
found!".format(bam_filter) - samtools = app_path + "/samtools" - output_bam = out + "/Bam/merge.filter.targeted.bam" - infile = pysam.AlignmentFile(output_bam,"rb") - # Note to change the read groups - tp =infile.header.to_dict() - if len(tp['RG'])>1: - tp['RG']= [tp['RG'][0]] - tp['RG'][0]['SM'] = cell - tp['RG'][0]['ID'] = cell - cnt = 0 - outfile = pysam.AlignmentFile( out + "/Bam/split_bam/" + cell + ".bam", "wb", header=tp) - for s in infile: - t = robust_get_tag(s,"CB") - if not t=="NotFound": - if t==cell: - outfile.write(s) - cnt = cnt +1 - else: - if re.search(cell, s.query_name): - outfile.write(s) - cnt = cnt + 1 - - outfile.close() - infile.close() - cmd=samtools + " index " + out + "/Bam/split_bam/" + cell + ".bam" - #runCMD(cmd,args) - os.system(samtools + " index " + out + "/Bam/split_bam/" + cell + ".bam") - - return(cnt) - - - - -def jointCall(para): - - para_lst = para.strip().split(">") - jobid = para_lst[0] - chr=para_lst[1] - out = para_lst[2] - app_path = para_lst[3] - reference = para_lst[4] - samtools = app_path + "/samtools" - bcftools = app_path + "/bcftools" - bgzip = app_path + "/bgzip" - bam_filter = out + "/Bam/split_bam/cell.bam.lst" - cmd1 = samtools + " mpileup -b " + bam_filter + " -f " + reference + " -r " + jobid + " -q 20 -Q 20 -t DP4 -d 10000 -v " - cmd1 = cmd1 + " | " + bcftools + " view " + " | " + bcftools + " norm -m-both -f " + reference - cmd1 = cmd1 + " | grep -v \"\" | grep -v INDEL |" + bgzip + " -c > " + out + "/somatic/" + jobid + ".cell.gl.vcf.gz" - print(cmd1) - output = os.system(cmd1) - - # delete the bam files once snv calling was finished in specfic regions - f = open(bam_filter, "r") - for x in f: - x = x.strip() - #os.system("rm " + x) - #os.system("rm " + x + ".bai") - f.close() - - if output == 0: - return(jobid) def less1(num): @@ -314,4 +253,283 @@ def LDrefinement(para): print(cmd) output = os.system(cmd) if output == 0: - return(region) \ No newline at end of file + return(region) + + + + + + +def robust_get_tag(read, tag_name): + try: + return read.get_tag(tag_name) + except KeyError: + return "NotFound" + +def rev_compl(st): + nn = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} + return "".join(nn[n] for n in reversed(st)) + + + +def bam2mat(para): + + para_lst = para.strip().split(">") + print(para_lst) + + # bam2gz(para); + + #### gz to matrix ### + mat_infile = para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.mat.gz" + snv_infile = para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.snvID.csv" + cell_infile = para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.cellID.csv" + meta_info = para_lst[1] + "/somatic/" + para_lst[0] + ".gl.vcf.filter.DP4" + + cell_clst = pd.read_csv(cell_infile) + snv_clst = pd.read_csv(snv_infile) + mat = pd.read_csv(mat_infile, sep="\t", header=None) + mat_out = gzip.open(para_lst[1] + "/somatic/" + para_lst[0] + ".gl.filter.hc.cell.mat.gz","wt") + + + mat.columns = ['snvIndex','cellIndex','allele'] + mat=mat.groupby(by=['snvIndex','cellIndex'], as_index=False).first() + mat=mat.pivot(index='snvIndex', columns='cellIndex', values='allele') + meta_info = pd.read_csv(meta_info, sep="\t", header=None) + meta_info.columns=["chr","pos","ref_allele","alt_allele","Dep","dep1","dep2","dep3","dep4","genotype","QS","VDB","RPB","MQB","BQB","MQSB","SGB","MQ0F"] + + overlapSNV_index=set(snv_clst["index"]).intersection(set(mat.index)) + + cell_clst.loc[list(mat.columns)]['cell'].to_csv(para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.cellID.filter.csv") + + ## meta_info_id is similar with snv_clst + 
n_cell = mat.shape[1] + for index in overlapSNV_index: + info = meta_info.iloc[index].astype(str) + geno = info["genotype"] + info = "\t".join(info) + + flag = 0 + if geno=="0|1": + phase_info = ["0|0"]*n_cell + tp = mat.loc[index] + pos = tp.dropna().index + for value in pos: + allele = tp.loc[value] + index_vec = tp.index.get_loc(value) + if allele==1: + phase_info[index_vec]="0|1" + elif allele==0: + phase_info[index_vec]="1|0" + flag = 1 + elif geno=="1|0": + phase_info = ["0|0"]*n_cell + tp = mat.loc[index] + pos = tp.dropna().index + for value in pos: + allele = tp.loc[value] + index_vec = tp.index.get_loc(value) + if allele==1: + phase_info[index_vec]="1|0" + elif allele==0: + phase_info[index_vec]="0|1" + flag = 1 + elif geno=="nan": + phase_info = ["0/0"]*n_cell + tp = mat.loc[index] + pos = tp.dropna().index + for value in pos: + allele = tp.loc[value] + index_vec = tp.index.get_loc(value) + if allele==1: + phase_info[index_vec]="0/1" + elif allele==0: + phase_info[index_vec]="1/0" + flag = 1 + if flag: + mat_out.write(info+"\t" +'\t'.join(phase_info)) + mat_out.write('\n') + + + + +def bam2gz(para): + + para_lst = para.strip().split(">") + + #assert os.path.isfile(bam_filter), "*.fiter.targeted.bam file {} cannot be found!".format(bam_filter) + #in_fasta = "/rsrch3/scratch/bcb/jdou1/scAncestry/ref/fasta/genome.fa" + #in_vcf = "/rsrch3/scratch/bcb/jdou1/bam_process/chr1.gp.vcf" + #in_bam = "/rsrch3/scratch/bcb/jdou1/bam_process/chr1.filter.targeted.bam" + + # joblst.append(id+">"+args.out+">"+args.reference) + + in_snv = para_lst[1] + "/somatic/" + para_lst[0] + ".gl.vcf.filter.DP4" + in_bam = para_lst[1] + "/Bam/" + para_lst[0] + ".filter.targeted.bam" + in_fasta = para_lst[2] + in_cell_barcode = para_lst[3] + + # read cell barcode file + + cell_clst = pd.read_csv(in_cell_barcode) + df = pd.DataFrame(cell_clst, columns= ['cell','id']) + df = df.sort_values(by=['id'], ascending=False) + df['index']=range(len(df.index)) + cell_set = list(df["cell"]) + + + ref_fa = pysam.FastaFile(in_fasta) + snv_info = {} + tp = list() + index = 0 + motif_len = 3 + snv_tol = 0 + + # index 0-based + with open(in_snv,'r') as fp: + for line in fp: + line = line.strip() + line = line.split("\t") + chrom = line[0] + pos = int(line[1]) + ref = line[2] + alts = line[3] + seq = ref_fa.fetch(chrom, pos-4, pos+3) + seq_rev_compl = rev_compl(seq) + id = str(chrom)+":"+str(pos) + ":" + ref + ":" + alts[0] + mydict = dict(id=index, chr = chrom, pos = pos, motif_pos= seq, ref_allele = ref, alt_allele=alts[0]) + snv_info[index]=mydict + index = index + 1 + snv_tol = snv_tol + 1 + tp.append(id) + + + + #print(snv_info_out) + ## output cell snv meta information + df.to_csv(para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.cellID.csv", index = False) + snv_info_out = pd.DataFrame() + snv_info_out['snvID'] = tp + snv_info_out['index'] = range(len(tp)) + snv_info_out.to_csv(para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.snvID.csv",index = False) + + read_tol = 0 + read_cover_tol = 0 + read_wild_tol = 0 + read_mutated_tol = 0 + read_noAllele_tol = 0 + overlap = 0 + index = 0 + lower_index = 0 + + infile = pysam.AlignmentFile(in_bam,"rb") + fp = gzip.open(para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.mat.gz", "wt") + # fp = open(para_lst[1] + "/somatic/" + para_lst[0] + ".cell_snv.mat", "wt") + for s in infile: + + cell_barcode = robust_get_tag(s,"CB") + if cell_barcode=="NotFound": + ### no cell barcode file detected. 
+ ### check whether the barcode added in the query name + cell_barcode = s.query_name.strip().split("_")[1] + + if cell_barcode in cell_set: + cell_barcode_index = cell_set.index(cell_barcode) + else: + ### cell barcode is not detected, should move to next read + continue + + read_name = s.query_name + align_chr = s.reference_name + align_start = s.reference_start + align_seq = s.query_sequence + mystart = str(snv_info[lower_index]["pos"]) + + + ### flags used to see whether SNVs covered in different scenarios + read_tol = read_tol + 1 + read_cover = 0 + read_wild = 0 + read_mutated = 0 + read_noAllele = 0 + read_len = len(align_seq) + + + ### get the SNVs locating in [align_start, align_start + read_len] ### + if read_tol%1000000 == 0: + print("scanning read " + str(read_tol)) + + lock = 0 + snv_cover = "" + + + for i in range(lower_index,snv_tol-1,1): + + snv_pos = snv_info[i]["pos"] + wild_allele = snv_info[i]["ref_allele"] + alt_allele = snv_info[i]["alt_allele"] + + if snv_pos >= align_start: + if lock==0: + lower_index = i + lock = lock + 1 + if snv_pos <= align_start + read_len: + read_cover = read_cover + 1 + snv_cover = str(snv_cover) + ":" + str(snv_pos) + motif_pos = snv_info[i]["motif_pos"] + motif_neg = motif_pos[:motif_len] + snv_info[i]["alt_allele"] + motif_pos[motif_len + 1:] + + if re.search(motif_pos, align_seq): + read_wild = read_wild + 1 + fp.write(str(i)+"\t"+str(cell_barcode_index)+"\t0\n") + elif re.search(motif_neg, align_seq): + read_mutated = read_mutated + 1 + fp.write(str(i)+"\t"+str(cell_barcode_index)+"\t1\n") + else: + delta = snv_pos - align_start + if read_len - delta < motif_len: + motif_pos_part = motif_pos[0:motif_len+1] + motif_neg_part = motif_neg[0:motif_len+1] + seq_part = align_seq[read_len-2*motif_len-1:read_len] + + if re.search(motif_pos_part, seq_part): + read_wild = read_wild + 1 + fp.write(str(i)+"\t"+str(cell_barcode_index)+"\t0\n") + elif re.search(motif_neg_part, seq_part): + read_mutated = read_mutated + 1 + fp.write(str(i)+"\t"+str(cell_barcode_index)+"\t1\n") + + elif delta <= motif_len: + motif_pos_part = motif_pos[motif_len:len(motif_pos)] + motif_neg_part = motif_neg[motif_len:len(motif_neg)] + seq_part = align_seq[0:2*motif_len+1] + + if re.search(motif_pos_part, seq_part): + read_wild = read_wild + 1 + fp.write(str(i)+"\t"+str(cell_barcode_index)+"\t0\n") + elif re.search(motif_neg_part, seq_part): + read_mutated = read_mutated + 1 + fp.write(str(i)+"\t"+str(cell_barcode_index)+"\t1\n") + else: + #fp.write(str(snv_info[i])+"\n") + #fp.write(str(align_chr) + ":" + str(align_start) + ":" + align_seq+"\n") + read_noAllele = read_noAllele + 1 + else: + break + + if read_cover > 0: + read_cover_tol = read_cover_tol + 1 + if read_wild > 0: + read_wild_tol = read_wild_tol + 1 + if read_mutated > 0 and read_wild>0: + overlap = overlap + 1 + + if read_mutated > 0: + read_mutated_tol = read_mutated_tol + 1 + if read_noAllele > 0: + read_noAllele_tol = read_noAllele_tol + 1 + + + infile.close() + fp.close() + print(str(read_tol) + ":" + str(read_cover_tol) + ":" + str(read_wild_tol) + ":" + str(read_mutated_tol) + ":" + str(overlap) + ":" + str(read_noAllele_tol)) + return(para_lst[0]) \ No newline at end of file diff --git a/src/svm_phasing_sub.R b/src/svm_phasing_sub.R new file mode 100644 index 0000000..317d520 --- /dev/null +++ b/src/svm_phasing_sub.R @@ -0,0 +1,268 @@ + +frq <- function(x=mat) { + res<-matrix(0,nrow(mat),1) + for(i in seq(1,nrow(mat),1)){ + geno <- mat[i, seq(5,ncol(mat),1)] + geno <- gsub("/","|", geno) + geno <- 
geno[!geno=="0|0"]
+    s <- strsplit(geno, "|")
+    result <- sapply(s, "[[", 1)
+    if(length(result)>10){
+
+      myfrq <- length( which(result=="0"))/length(result)
+    }else{
+      myfrq <- 0
+    }
+    if(myfrq>0.5){myfrq <- 1- myfrq}
+    res[i,1] <- myfrq
+  }
+  return(res)
+}
+
+
+getFeatureInfo <-function(svm_info=NULL){
+  n_min <-(-1000)
+  dt <- data.frame("id"=svm_info$V1, "DP"=n_min, "QS"=n_min,
+                   "VDB"=n_min,"SGB"=n_min,"RPB"=n_min, "MQB"=n_min,"MQSB"=n_min, "BQB"=n_min)
+
+  svm_info[is.na(svm_info)] <-"-1000"
+  for(i in seq(1,nrow(svm_info),1)){
+    for(j in seq(1,ncol(svm_info),1)){
+      if(svm_info[i,j]=="DP"){dt$DP[i] <- svm_info[i,j+1]}
+      if(svm_info[i,j]=="QS"){dt$QS[i] <- svm_info[i,j+1]}
+      if(svm_info[i,j]=="VDB"){dt$VDB[i] <- svm_info[i,j+1]}
+      if(svm_info[i,j]=="SGB"){dt$SGB[i] <- svm_info[i,j+1]}
+      if(svm_info[i,j]=="RPB"){dt$RPB[i] <- svm_info[i,j+1]}
+      if(svm_info[i,j]=="MQB"){dt$MQB[i] <- svm_info[i,j+1]}
+      if(svm_info[i,j]=="MQSB"){dt$MQSB[i] <- svm_info[i,j+1]}
+      if(svm_info[i,j]=="BQB"){dt$BQB[i] <- svm_info[i,j+1]}
+    }
+  }
+  return(dt)
+}
+
+SNV_block <-function(summary=NULL){
+  block <- 1
+  last <- ".|."
+  summary$region <- 0
+  for(i in seq(1,nrow(summary),1)){
+    if(summary[i,4]==".|."){
+      if(last==".|." | last=="0|0"){;}
+      else{
+        block <- block + 1
+      }
+      summary$region[i] <- block
+    }
+    last <- summary[i,4]
+  }
+  return(summary)
+}
+
+
+SVM_prepare <-function(x=NULL){
+  svm<-list()
+  a<- table(x$region)
+  neg <- x[(x$region%in% names(a)[a>=4] & x$V4==".|.") | (x$V4=="0|0" & x$baf>0),]
+  #neg <- x[(x$region%in% names(a)[a>=3] & x$V4==".|.") | x$V4=="0|0",]
+  #neg <- x[x$V4=="0|0" | ,]
+  pos <- x[!x$V4==".|." & !x$V4=="0|0" &x$baf>0.4,]
+  svm$neg <- neg
+  svm$pos <- pos
+  svm$test <- x[x$region%in% names(a)[a<4] & x$V4==".|.",]
+  return(svm)
+}
+
+SVM_train <- function(label=NULL, data=NULL, downsampling=20){
+
+
+  print("Run SVM training")
+
+  feature <-c("QS", "VDB", "SGB", "RPB", "MQB", "MQSB", "BQB", "BAF")
+  data$BQB <- as.numeric(data$BQB)
+
+  data$VDB[data$VDB==(-1000)] <-0
+  data$MQSB[data$MQSB==(-1000)] <-1
+  data$QS <- abs(data$QS-0.5)
+  x_pos <- data[data$id%in%label$pos$V2,feature]
+  #x_pos <- x_pos[seq(1,nrow(x_pos),downsampling),]
+  x_neg <- data[data$id%in%label$neg$V2,feature]
+
+  y_pos <- rep("POS", nrow(x_pos))
+  y_neg <- rep("NEG", nrow(x_neg))
+
+  if(nrow(x_pos) > nrow(x_neg)){
+    # randomly select matched variants between pos and neg
+    sel <- sample(nrow(x_pos))[seq(1,nrow(x_neg),1)]
+    x_pos <- x_pos[sel,]
+    y_pos <- y_pos[sel]
+  }
+  if(nrow(x_pos) < nrow(x_neg)){
+    # randomly select matched variants between pos and neg
+    sel <- sample(nrow(x_neg))[seq(1,nrow(x_pos),1)]
+    x_neg <- x_neg[sel,]
+    y_neg <- y_neg[sel]
+  }
+
+  print(paste0("pos->:", length(y_pos)))
+  print(paste0("neg->:", length(y_neg)))
+
+  model <- svm(as.matrix(rbind(x_pos, x_neg)), as.factor(c(y_pos,y_neg)), probability=TRUE)
+  test <- data[data$id%in%label$test$V2,feature]
+  pred <- predict(model, as.matrix(test), probability=TRUE)
+  prob <- attr(pred, "probabilities")
+  prob <-as.data.frame(prob)
+  label$test$POS <- prob$POS
+  label$test$NEG <- prob$NEG
+  return(label)
+}
+
+calcP <- function(geno_vec, dis_vec, dis_limit=20000){
+
+  dis_bin <- c(100, 1000, 5000, 10000, 20000, 500000, 1000000000000000000)
+  prob_bin <- c(1, 0.93, 0.78, 0.71, 0.69, 0.64, 0)
+
+  prob_bin[dis_bin>dis_limit] <- 0
+
+  pos_match <- which(geno_vec%in%c("000","111"))
+  dis_match <- dis_vec[pos_match]
+
+  match_p <- 0
+  for(pos in dis_match){
+    if(pos<=dis_bin[1]){p <-1}
+    else{
+      p <- prob_bin[max(which(pos>=dis_bin))+1]
+    }
+    match_p <- match_p + p/sqrt(pos)
+  }
+
+  pos_mismatch <- which(geno_vec%in%c("010","101"))
+  dis_mismatch <- dis_vec[pos_mismatch]
+
+
+  mismatch_p <- 0
+  for(pos in dis_mismatch){
+    if(pos<=dis_bin[1]){p <-1}
+    else{
+      p <- prob_bin[max(which(pos>=dis_bin))+1]
+    }
+    mismatch_p <-mismatch_p + p/sqrt(pos)
+  }
+
+  if((match_p + mismatch_p)==0){p <- (-1)}
+  else{
+    p <- match_p/(match_p+mismatch_p)
+  }
+
+  return(p)
+}
+
+Phasing <- function(x = NULL, snv_meta = NULL, dis_limit = 20000, readlength=NULL){
+
+  start <- 1
+  if(start){
+    y <- x
+    dis <- colsplit(y$V1,":", c("id1","id2"))[,2]
+    dis_rd_study <-c()
+    trio_hap_rd_study <-c()
+    hap_rd2 <-c()
+    dis_rd2 <-c()
+    cnt <- 0
+
+    check_study <- snv_meta
+    check_study$prob <- (-1000)
+    check_study$tol <- 0
+    check_study$dis <- (-1000)
+
+    check_study$prob_phy <- (-1000)
+    check_study$tol_phy <- 0
+    res<-c()
+
+    pos_lst_study <-list()
+    for(i in seq(5,ncol(y),1)){
+      vec <- y[,i]
+      pos_lst_study[[i]] <- which(!vec=="0|0" & !y$V4==".|." & !vec=="0/0")
+    }
+  }
+
+  for(snv in snv_meta$V2){
+    cnt=cnt+1
+    #print(paste0("scanning ",cnt," now..."))
+    snv_index <- which(y$V2==snv)
+    snv_index_noMisCol <- which(!y[snv_index,]=="0|0" & !y[snv_index,]=="0/0",)
+
+    # remove the first column since they are not genotypes
+    snv_index_noMisCol <- snv_index_noMisCol[snv_index_noMisCol>4]
+    geno_vec <-c()
+    dis_vec <-c()
+    dis_phy1 <- c()
+    dis_phy2 <- c()
+    # identify haplotypes for each element
+    index <-0
+    for(pos in snv_index_noMisCol){
+      # element index (snv_index, pos)
+      pos_noMisCol <- pos_lst_study[[pos]]
+      lower_vec <- which(pos_noMisCol<snv_index)
+      upper_vec <- which(pos_noMisCol>snv_index)
+      if(length(lower_vec)>=1 & length(upper_vec)>=1){
+        lower <- pos_noMisCol[max(lower_vec)]
+        upper <- pos_noMisCol[min(upper_vec)]
+        tp <- y[snv_index, pos]
+        tp <- gsub("/", "|", tp)
+        geno <- c(y[lower,pos], tp, y[upper, pos])
+
+        part <-colsplit(geno,"\\|",c("i1","i2"))[,2]
+        part[part>1] <- 1
+        flag <- paste0(part[1],part[2],part[3])
+        hap_rd2 <-c(hap_rd2, abs(part[2]-part[1]), abs(part[3]-part[2]))
+        dis_rd2 <-c(dis_rd2, dis[snv_index]-dis[lower], dis[upper]-dis[snv_index])
+
+        dis_vec <- c(dis_vec, dis[upper]-dis[lower])
+        dis_phy1 <- c(dis_phy1, dis[snv_index]-dis[lower])
+        dis_phy2 <- c(dis_phy2, dis[upper]-dis[snv_index])
+        geno_vec <-c(geno_vec,flag)
+      }
+    }
+
+    sum <- 0
+    sum_equal <- 0
+    if(sum(dis_phy10){
+      p <- sum_equal/sum
+      check_study$prob_phy[cnt] <- p
+      check_study$tol_phy[cnt] <- sum
+      print(paste0("Scan ", cnt,":", snv, " ", sum, " ", p))
+    }
+
+    if(length(geno_vec)>0 ){
+      dis_rd_study <-c(dis_rd_study, dis_vec)
+      trio_hap_rd_study <-c(trio_hap_rd_study, geno_vec)
+      #print(table(trio_hap_rd))
+      check_study$prob[cnt] <- calcP(geno_vec,dis_vec,dis_limit)
+      check_study$tol[cnt] <- sum(dis_vec