-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathdetect_recombsite.cpp
executable file
·2759 lines (2709 loc) · 115 KB
/
detect_recombsite.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* this function detects recombination sites in meiosis, based on pooled linked read sequencing
input:
1. known markers from both parents
2. variations called in pool
output:
prediction on recombination sites.
update: 2018-07-29 - check number of reads
*/
#include "./gzlib/gzstream.h"
#include <zlib.h>
#include <stdio.h>
#include <string.h>
#include <string>
#include <map>
#include <vector>
#include <stdlib.h>
#include <fstream>
#include <iostream>
#include <sstream>
#include <time.h>
#include <assert.h>
#include <algorithm>
#include <iomanip>
#include <math.h>
#include <sys/stat.h>
#include <dirent.h>
#include <errno.h>
#include "split_string.h"
#include "separate_chr_v2.h"
#include "insert_raw_bp.h"
#include "check_allele_order.h"
#include "postprocess_bpPrediction.h"
#include "output_co.h"
#include "globals.h"
//
struct iMARKER
{
string chr;
unsigned long pos;
string from;
double af; // observed alternative allele frequency: under high coverage, expect ~0.5, e.g., [0.3, 0.7]
string cov;
bool isGiven; // if the marker is in given list, set as true.
};
struct READINFO
{
string readname;
string mateid;
unsigned long spansta;
unsigned long spanend;
string mallele; // marker1:allele1,\nmarker2:allele2
};
// --ivc 50 --iaf 0.2
unsigned long badBarcodeCov = 0;
int minVarCov = 50; // minimum base coverage of variant site (refcov+altcov) in vcf to be used as markers
int maxVarCov = 10000; // maximum base coverage of variant site (refcov+altcov) in vcf to be used as markers
double minVarAF = 0.2; // minimum allele frequency of variant site (refcov+altcov) in vcf to be used as markers
double maxVarAF = 1.0; // maximum allele frequency of variant site (refcov+altcov) in vcf to be used as markers
int minRead = 2; // minimum number of uniq read ids on each side of CO, excluding the one on CO
bool rcm = false;
multimap<string, unsigned long> HeteroBarcode;
string tmpflagstr("tmptmp");
unsigned long total_check = 0;
//
bool collect_marker_info(const char* file,
string parent_id,
map<string, string>* mkr);
bool get_var_ref_barcode(string formatInfo,
string barcodeString,
vector<string>* vbc,
vector<string>* rbc,
string this_chr,
string this_pos,
multimap<string, unsigned long>* HeteroBarcode);
bool get_break_pos_of_artSeq(string artSeq,
map<unsigned long, string> artSeqPos,
int min_marker,
unsigned long* leftp,
unsigned long* rightp,
int* bpPos,
double* score);
bool get_moleStat_distribution(const char* file,
double maxPercent,
int* maxStat,
string lenORreadnum);
bool get_bad_molecule(const char* file,
int maxReadNum,
int maxMoleLen,
multimap<string, std::pair<unsigned long, unsigned long> >* badMolecule); // to remove 2018-01-01
bool read_chr_var(string vcffilename,
map<string, string> knownMarker,
multimap<string, iMARKER>* moleculeSet);
bool check_candidate_cosite(multimap<string, std::pair<unsigned long, unsigned long> >* badMolecule,
string mkey,
unsigned long mstart,
unsigned long mend);
bool build_artSeq(map<unsigned long, string> alleleinfo,
unsigned long regionBegin,
unsigned long regionEnd,
string* artSeq,
string* artSeqCheckingStr);
string check_allele_order(string leftAlleles,
string rightAlleles);
bool get_recombis_options(int argc,
char* argv[],
string* filevcf,
string* fileMoleTable,
string* fileReadNumDistr,
string* fileMoleLenDistr,
string* filemark1,
string* filemark2,
int* min_marker,
int* min_span,
double* min_score,
string* outPrefix,
int* max_moleculeLen,
int* max_readnum,
int* max_CORDist,
int* min_CORDist,
int* max_COSize,
map<string, string>* visitedVarChr,
map<string, string>* visitedMolChr);
bool check_co_read_density(unsigned long co_start,
unsigned long co_end,
string read_align_info,
string read_order_info,
string read_id_info,
int* bp_per_read_in_co,
bool* bp_boundary_same_fragment,
int* min_RR_dist,
int* max_RR_dist,
int* n_readLeft,
int* n_readRight,
int* max_CORR_dist);
bool check_mole_local_base_cov(unsigned long mole_start,
unsigned long mole_end,
string read_align_info,
unsigned long win_size,
unsigned long win_step,
double* min_win_base_cov,
double* max_win_base_cov,
double* mean_win_base_cov,
double* sd_win_base_cov);
bool output_real_molecule_len_distribution(
map<int, unsigned long> molecule_len_control,
string outPrefix);
int get_chr_num_ii(string marker_file);
bool verbose = true; // TODO: get from option
// main
bool detect_recombsite(int argc, char* argv[])
{
std::stringstream usage;
usage.str("");
if(argc < 14)
{
usage << endl;
usage << " Given necessary info, this function predicts meiotic recombination breakpoints." << endl;
const char *buildString = __DATE__", " __TIME__;
usage << " (special version compiled on " << buildString << ")" << endl << endl;
usage << " Usage: DrLink recombis options [default] " << endl << endl;
usage << "\t# Mandatory options: " << endl;
usage << "\t--var STRING snp variants in vcf.gz from longranger wgs [NULL]" << endl;
usage << "\t--mol STRING molecule meta info in txt.gz from DrLink molecule [NULL]" << endl;
usage << "\t--str STRING stat of molecule read numbers in txt from DrLink molecule [NULL]" << endl;
usage << "\t--stl STRING stat of molecule lengths in txt from DrLink molecule [NULL]" << endl;
usage << "\t--spa STRING snp markers in txt from paternal line [NULL]" << endl;
usage << "\t--sma STRING snp markers in txt from maternal line [NULL]" << endl;
usage << "\t# Optional: " << endl;
usage << "\t--ivc INT min number of read coverage for collecting a marker [ 250]" << endl;
usage << "\t--avc INT max number of read coverage for collecting a marker [ 400]" << endl;
usage << "\t--iaf DOUBLE min alternative AF for collecting a marker [0.35]" << endl;
usage << "\t--aaf DOUBLE max alternative AF for collecting a marker [0.65]" << endl;
usage << "\t--nur INT max molecule read number for selecting good molecules [NULL]" << endl;
usage << "\t--nul INT max molecule length for selecting good molecules [NULL]" << endl;
usage << "\t--imn INT min number of allelic markers for reporting a CO [ 3]" << endl;
usage << "\t--imr INT(bp) min range of allelic markers for reporting a CO [ 250]" << endl;
usage << "\t--ims DOUBLE min score for reporting a CO [ 0.8]" << endl;
usage << "\t--icd INT min inter-read distance for reporting a CO [ 500]" << endl;
usage << "\t--acd INT max inter-read distance for reporting a CO [5000]" << endl;
usage << "\t--ird INT min number of side-reads for reporting a CO [ 2]" << endl;
//usage << "\t--aco INT max interval size for reporting a CO [2000]" << endl;
usage << "\t--split STRING a flag refering to splitted --var and --mol [ off]" << endl;
usage << "\t--rcm bool check if a marker is covered by a read [ off]" << endl;
usage << "\t--out STRING prefix for labeling output files [enjo]" << endl;
usage << endl;
usage << "\t*Note 1: one of the marker files can be null. " << endl;
usage << "\t*Note 2: if given --nur (--nul), --str (--strl) becomes invalid and ignored." << endl;
usage << "\t*Note 3: if given --split tmptmp and files found, then --var and --mol can be ignored. "<< endl;
usage << "\t*Note 4: option --aco should be determined according to marker density\n\t(e.g., Athal: 300bp/marker), "
<< "and, read distribution shows a rough range of 0~5000 bp. " << endl;
cout << usage.str() << endl;
// --icd/acd is related to statistics of read coverage of 10x molecules
// --ird: number of uniq read-ids beyond CO start and end, excluding the pair on CO.
return false;
}
double startT= clock();
// initialize variables
string filevcf("");
string fileMoleTable("");
string fileReadNumDistr("");
string fileMoleLenDistr("");
string filemark1("");
string filemark2("");
int min_marker = 3;
int min_span = 250; // bp
double min_score = 0.8;
string outPrefix("enjo_");
int maxReadNum = 0; //22; // if 0.1x per molecule: round(18*2*0.60)
int maxMoleLen = 0; //54000; // if 45kb per molecule: round(45000*2*0.60)
int maxCORDist = 5000; //maximum inter-read distance in predicted CO intervals
int minCORDist = 500;
int maxCOSize = 500000;//maximum size of interval to be predicted as a CO - no control now!!!
map<string, string> visitedVarChr;
map<string, string> visitedMolChr;
// set variables from options
if( !get_recombis_options(argc,
argv,
&filevcf,
&fileMoleTable,
&fileReadNumDistr,
&fileMoleLenDistr,
&filemark1,
&filemark2,
&min_marker,
&min_span,
&min_score,
&outPrefix,
&maxMoleLen,
&maxReadNum,
&maxCORDist,
&minCORDist,
&maxCOSize,
&visitedVarChr,
&visitedMolChr) )
{
cout << " Error: incorrect parameter settings. " << endl;
return false;
}
multimap<string, std::pair<unsigned long, unsigned long> > badMolecule;
// step m. read bad (longer or with larger read number) molecule info: decreasing fp, but increasing fn.
// find max read number: set ratio of valley/peak*0.5 in "out_1000BC_90M_molecule_stat.pdf" after read peak
if(maxReadNum == 0)
if(!get_moleStat_distribution(fileReadNumDistr.c_str(), 0.20, &maxReadNum, "readNum"))
{
cout << " Error: failed in reading file " << fileReadNumDistr << ". Exited." << endl;
return false;
}
// find max molecule length: set ratio of valley/peak*0.5 in "out_1000BC_90M_molecule_stat.pdf" after molecule peak
if(maxMoleLen == 0)
if(!get_moleStat_distribution(fileMoleLenDistr.c_str(), 0.20, &maxMoleLen, "moleLen"))
{
cout << " Error: failed in reading file " << fileMoleLenDistr << ". Exited." << endl;
return false;
}
/* this is now done during reading of raw molecules
if(!get_bad_molecule(fileMoleTable.c_str(), maxReadNum, maxMoleLen, &badMolecule))
{
cout << " Error: failed in reading file " << fileMoleTable << ". Exited." << endl;
return false;
}
*/
// step 0. initialized a variable for recording known marker info
map<string, string> knownMarker;
// step 1. read known marker info from parent 1
if(!collect_marker_info(filemark1.c_str(), "1", &knownMarker))
{
cout << " Error: cannot open marker file " << filemark1 << "; exited." << endl;
exit(1);
}
// step 2. read known marker info from parent 2
if(!collect_marker_info(filemark2.c_str(), "2", &knownMarker))
{
cout << " Error: cannot open marker file " << filemark2 << "; exited." << endl;
exit(1);
}
// step 3. separte molecule_table_trashme.txt.gz and phased_variants.vcf.gz into chromosome-specific files
// seprate vcf-variant file: map<string, string> visitedVarChr
if(visitedVarChr.size()==0 &&
!separate_chr_v2(filevcf, &visitedVarChr, &tmpflagstr))
{
return false;
}
else
{
map<string, string>::iterator vchritr;
map<string, string>::iterator vchritr_end;
vchritr = visitedVarChr.begin();
vchritr_end = visitedVarChr.end();
cout << " Info: variants in vcf separated into chr-specific files: " << endl;
while(vchritr != vchritr_end)
{
cout << " " << (*vchritr).second << endl;
vchritr ++;
}
}
// separate molecule file: map<string, string> visitedMolChr
if(visitedMolChr.size()==0 &&
!separate_chr_v2(fileMoleTable, &visitedMolChr, &tmpflagstr))
{
return false;
}
else
{
map<string, string>::iterator mchritr;
map<string, string>::iterator mchritr_end;
mchritr = visitedMolChr.begin();
mchritr_end = visitedMolChr.end();
cout << " Info: raw molecules separated into chr-specific files: " << endl;
while(mchritr != mchritr_end)
{
cout << " " << (*mchritr).second << endl;
mchritr ++;
}
}
// step 5. prepare a file and a variable for collecting candidate break points
std::stringstream ipre;
ipre.str("");
ipre << "_aML" << maxMoleLen
<< "_aRN" << maxReadNum
<< "_iSC" << min_score
<< "_iMK" << min_marker
<< "_iSP" << min_span
<< "_iVC" << minVarCov
<< "_aVC" << maxVarCov
<< "_iAF" << minVarAF
<< "_aAF" << maxVarAF
<< "_ird" << minRead;
// create an intermediate folder for collecting details about a CO-molecule
DIR* dir = opendir((outPrefix+ipre.str()).c_str());
if (dir)
{
/* Directory exists. */
closedir(dir);
}
else if (ENOENT == errno)
{
/* Directory does not exist. */
const int dir_err = mkdir((outPrefix+ipre.str()).c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
if (dir_err == -1)
{
cout << " Error: cannot create directory " << outPrefix+ipre.str() << endl;
return false;
}
}
else ;
// raw breakpoints file
string rawbpfile = outPrefix + ipre.str() +"_raw_BP.txt";
fstream rawfp;
rawfp.open(rawbpfile.c_str(), ios::out);
if(!rawfp.good())
{
cout << " Error: cannot open file " << rawbpfile << " for collecting raw break point predictions. " << endl;
return false;
}
rawfp << "#chr\t"
<< "left_pos\t"
<< "right_pos\t"
<< "score\t"
<< "left_alleles\t"
<< "right_alleles\t"
<< "barcode\t"
<< "vcf_mole_len\t"
<< "left_span\t"
<< "right_span\t"
<< "read_density\t"
<< "co_boundary_same_fragment\t"
<< "min_mole_RR_dist\t"
<< "max_mole_RR_dist\t"
<< "bam_mole_len\t"
<< "bam_mole_readNum\t"
<< "bam_mole_base_cov\t"
<< "af_at_markers\t"
<< "cov_at_markers\t"
<< "co_size\t"
<< "n_readLeft\t"
<< "n_readRight\t"
<< "max_CORR_dist\n";
// files for collecting length of pure Col-/Ler-molecules: format: chr#barcode first-aligned-read mole_len artSeq
string colMole = outPrefix + ipre.str() +"_col_mol_length.txt";
string lerMole = outPrefix + ipre.str() +"_ler_mol_length.txt";
fstream colMolefp;
fstream lerMolefp;
bool outLerCol = false;
if(outLerCol)
{
colMolefp.open(colMole.c_str(), ios::out);
lerMolefp.open(lerMole.c_str(), ios::out);
if(!colMolefp.good())
{
cout << " Error: cannot open file " << colMole << " for collecting molecule length info of Col. " << endl;
return false;
}
if(!lerMolefp.good())
{
cout << " Error: cannot open file " << lerMole << " for collecting molecule length info of Ler. " << endl;
return false;
}
}
// file for collecting molecules passed all checkings on RN, ML and HET
// for chi-square checking number of normal molecules in a window
string goodmolefile = outPrefix + ipre.str() +"_pRNpMLpHET.txt.gz";
ogzstream goodmolefp;
goodmolefp.open(goodmolefile.c_str(), ios::out);
if(!goodmolefp)
{
cout << " Error: cannot open file " << goodmolefile << " for writing good molecule info. " << endl;
return false;
}
goodmolefp << "#chr\tstart\tend\tRN" << endl;
// a variable for collecting candidate break points
multimap<string, multimap<unsigned long, BPINFO> > rawbpmap;
//
unsigned long numbad = 0;
unsigned long numbadML = 0;
unsigned long numbadRN = 0;
unsigned long numbadHET = 0;
unsigned long numbTotalMol = 0;
unsigned long numbTotalMolRead = 0; // number of reads related to numbTotalMol
unsigned long numbGoodTotalMol = 0; // number of molecules passing checking on RN, ML and HET
unsigned long numbGoodTotalMolRead = 0; // number of reads related to numbGoodTotalMol
map<int, unsigned long> molecule_len_control; // length distribution of molecules passing all control
//
// step 6. analyze molecule for each chr, and allele info at markers according to vcf - 2018-01-01
map<string, string>::iterator vchritr;
map<string, string>::iterator vchritr_end;
vchritr = visitedVarChr.begin();
vchritr_end = visitedVarChr.end();
while(vchritr != vchritr_end)
{
string chr = (*vchritr).first;
cout << " Info: finding breakpoints on chr " << chr << "..." << endl;
//
cout << " Extracting parental allele info at markers for molecules/barcodes... " << endl;
string vcffile = (*vchritr).second;
multimap<string, iMARKER> moleculeSet; // <chr#bc, {marker-pos, from-parent} >
if(!read_chr_var(vcffile, knownMarker, &moleculeSet))
{
cout << " Error: read information at markers from " << vcffile << " failed. " << endl;
return false;
}
cout << " Extracted parental allele info: " << moleculeSet.size() << "." << endl;
// find file of raw molecules for chr
cout << " Check: current chr " << chr << endl;
assert(visitedMolChr.find(chr) != visitedMolChr.end());
string molfile = visitedMolChr[chr];
//
igzstream molfp;
molfp.open(molfile.c_str());
if(!molfp.good())
{
cout << " Error: cannot file of raw molecule for chr " << chr << ": " << molfile << endl;
return false;
}
cout << " Info: traversing the molecule table of chr " << chr << " to find breakpoints..." << endl;
while(molfp.good())
{
string line("");
getline(molfp, line);
if(line.size()==0 || line[0]=='#') continue;
/* get info for current molecule - 11 columns:
0.chr#barcode 1#AAACACCAGAACTCGG
1.first_aligned 15095089
2.last_aligned 15102519
3.molecule_len 7580
4.molecule_cov 0.04
5.read_num 2
6.Uni_flag U
7.last_aligned_end 15102668
8.all_reads_aligned_at 15095089,15095230;15102519,15102668
9.R1R2 1,2
10.read_id rid1,rid2
11.read_cigar cig1,cig2
*/
vector<string> lineinfo = split_string(line, '\t');
if(lineinfo.size() < 12) continue;
// filter away current molecule if with unexpected length or number of reads
int readnum = atoi(lineinfo[5].c_str());
double molecov = atof(lineinfo[4].c_str());
int molelen = atoi(lineinfo[3].c_str());
// pre-controlling on molecule length
if(molelen<1000) continue;
numbTotalMol ++;
numbTotalMolRead += readnum;
if(readnum>maxReadNum)
{
numbadRN ++;
numbad ++;
continue;
}
else
if(molelen>maxMoleLen ||
molelen<2*min_span)
{
// skip potential overlapping molecules, or short molecules
numbadML ++;
numbad ++;
continue;
}
// find range of current molecule
unsigned long msta = strtoul(lineinfo[1].c_str(), NULL, 0);
unsigned long mend = strtoul(lineinfo[3].c_str(), NULL, 0) + msta -1;
// filter away current molecule if covering a heterozygous site
std::pair <multimap<string, unsigned long>::iterator,
multimap<string, unsigned long>::iterator> hetitr_region;
hetitr_region = HeteroBarcode.equal_range(lineinfo[0]);
multimap<string, unsigned long>::iterator hetitr;
bool hetcov = false;
for (hetitr=hetitr_region.first; hetitr!=hetitr_region.second; ++hetitr)
{
if((*hetitr).second>=msta && (*hetitr).second<=mend)
{
cout << " Info: molecule at "
<< lineinfo[0] << ": "
<< lineinfo[1] << "-"
<< lineinfo[2] << " with het-bc at mkdr "
<< (*hetitr).second << " - skipped. " << endl;
numbad ++;
numbadHET ++;
hetcov = true;
break;
}
}
if(hetcov == true) continue;
// "real" moleclues passing checking on RN, ML and HET
vector<string> chrbarcode = split_string(lineinfo[0], '#');
numbGoodTotalMol ++;
numbGoodTotalMolRead += readnum;
goodmolefp << chrbarcode[0] << "\t"
<< lineinfo[1] << "\t"
<< lineinfo[2] << "\t"
<< lineinfo[5] << endl;
// collect length of a real molecule for calculating and checking expected CO molecule
int mlkey = (int)round((double)molelen/1000);
map<int, unsigned long>::iterator mlitr = molecule_len_control.find(mlkey);
if(mlitr == molecule_len_control.end())
{
molecule_len_control.insert(std::pair<int, unsigned long>(mlkey, 1));
}
else
{
(*mlitr).second += 1;
}
// new 2018-05-23
//
// for this molecule, get its alignment-covered positions for selecting markers
// 8.all_reads_aligned_at 15095089,15095230;15102519,15102668
// 9.R1R2 1,2
// 10.read_id rid1,rid2
/* struct READINFO
{
string readname;
string mateid;
unsigned long spansta;
unsigned long spanend;
string mallele; // marker1:allele1,\nmarker2:allele2
};
*/
multimap<unsigned long, READINFO> molAlignMarkerAllele;
map<unsigned long, string> molReadCoveredPos;
vector<string> spaninfo = split_string(lineinfo[8], ';');
vector<string> mateinfo = split_string(lineinfo[9], ',');
vector<string> nameinfo = split_string(lineinfo[10], ',');
vector<string> cigarinfo = split_string(lineinfo[11], ',');
assert(spaninfo.size()==mateinfo.size() && mateinfo.size()==nameinfo.size());
vector<string>::iterator rsp_itr = spaninfo.begin();
vector<string>::iterator rsp_itr_end = spaninfo.end();
int geti = 0;
bool readwarning = false;
while(rsp_itr != rsp_itr_end)
{
string spanMidName = spaninfo[geti] + "\t" + mateinfo[geti] + "\t" + nameinfo[geti];
//
vector<string> indrsp = split_string(*rsp_itr, ',');
unsigned long rstart = strtoul(indrsp[0].c_str(), NULL, 0);
unsigned long rend = strtoul(indrsp[1].c_str(), NULL, 0);
//
if(rcm==true)
for(unsigned long ii = rstart; ii <= rend; ii ++)
{
molReadCoveredPos.insert(std::pair<unsigned long, string>(ii, spanMidName));
}
//
READINFO tmpallinfo;
tmpallinfo.readname = nameinfo[geti]+"\t"+cigarinfo[geti];
tmpallinfo.mateid = mateinfo[geti];
tmpallinfo.spansta = rstart;
tmpallinfo.spanend = rend;
tmpallinfo.mallele = "";
molAlignMarkerAllele.insert(std::pair<unsigned long, READINFO>(rstart, tmpallinfo));
//
rsp_itr ++;
geti ++;
}
//
map<unsigned long, string> alleleinfo; // <pos_sorted, from>: only in given marker list
map<unsigned long, double> allelefreq; // <pos_sorted, af>
map<unsigned long, string> allelecov; // <pos_sorted, cov>
// find allele info at markers of current molecule with the above range
string bckey = lineinfo[0];
std::pair <multimap<string, iMARKER>::iterator,
multimap<string, iMARKER>::iterator> mkritr_region;
mkritr_region = moleculeSet.equal_range(bckey);
if(mkritr_region.first == mkritr_region.second)
{
// current molecule does not cover any markers - skip.
continue;
}
multimap<string, iMARKER>::iterator mkritr;
for (mkritr=mkritr_region.first; mkritr!=mkritr_region.second; ++mkritr)
{
iMARKER mkrtmp = (*mkritr).second;
// firstly check if marker position covered by any alignments 2018-05-23
bool rcmchecking = false;
if(rcm==true && molReadCoveredPos.find(mkrtmp.pos) != molReadCoveredPos.end())
{
rcmchecking = true;
}
if(rcm==false || rcmchecking==true)
{ // record info at given markers
if(mkrtmp.pos>=msta && mkrtmp.pos<=mend && mkrtmp.isGiven==true)
{
if(alleleinfo.find(mkrtmp.pos) != alleleinfo.end())
{
if(alleleinfo[mkrtmp.pos].compare(mkrtmp.from) != 0)
{
// this should not happen as het-cov has been checked above.
cout << " Error: marker at " << bckey << ": " << mkrtmp.pos
<< " covering het but not filtered out. I need to check 1!! " << endl;
return false;
}
}
else
{
alleleinfo.insert(std::pair<unsigned long, string>(mkrtmp.pos, mkrtmp.from));
allelefreq.insert(std::pair<unsigned long, double>(mkrtmp.pos, mkrtmp.af));
allelecov.insert(std::pair<unsigned long, string>(mkrtmp.pos, mkrtmp.cov));
}
}
}
}
assert(allelefreq.size() == alleleinfo.size());
assert(allelecov.size() == alleleinfo.size());
if(alleleinfo.size() == 0)
{
// no marker on this molecule
continue;
}
//
total_check ++;
// build artificial sequence for current molecule
unsigned long firstpos = 0; // first marker pos
unsigned long lastpos = 0; // last marker pos
bool firstposb = true;
string artSeq("");
std::stringstream sspos;
sspos.str("");
map<unsigned long, string>::iterator alitr;
map<unsigned long, string>::iterator alitr_end;
alitr = alleleinfo.begin();
alitr_end = alleleinfo.end();
map<unsigned long, string> toRemoveMarker; // those within the molecule but not covered by any reads
while(alitr != alitr_end)
{
bool currentMmkrAdded = false;
//
std::stringstream sstmp;
sstmp.str("");
sstmp << (*alitr).first;
//
bool markerCoveredbyRead = false;
// new 2018-05-23: reads covering same markers can happen
multimap<unsigned long, READINFO>::iterator xitr;
multimap<unsigned long, READINFO>::iterator xitr_end;
xitr = molAlignMarkerAllele.begin();
xitr_end = molAlignMarkerAllele.end();
while(xitr != xitr_end)
{
unsigned long spansta = (*xitr).first;
unsigned long spanend = (*xitr).second.spanend;
if(spansta<=(*alitr).first && (*alitr).first<=spanend)
{
markerCoveredbyRead = true;
if(currentMmkrAdded == false)
{
artSeq += (*alitr).second;
currentMmkrAdded = true; // avoid adding mut allele multiple times
}
//
if((*xitr).second.mallele.size()>0)
{
//string tmpstr(",\t----------\n ");
string tmpstr(",");
(*xitr).second.mallele += (tmpstr + sstmp.str() + "\t" + (*alitr).second);
}
else
{
(*xitr).second.mallele += ( sstmp.str() + "\t" + (*alitr).second);
}
if(firstposb)
{
firstpos = (*alitr).first;
lastpos = (*alitr).first;
firstposb= false;
}
else
{
if((*alitr).first < firstpos)
{
firstpos = (*alitr).first;
}
else
if((*alitr).first > lastpos)
{
lastpos = (*alitr).first;
}
else ;
}
}
xitr ++;
}
//
if(markerCoveredbyRead == false)
{
// not covered by any read on this molecule: marker info needs to be removed
toRemoveMarker.insert(std::pair<unsigned long, string>((*alitr).first, "NF"));
}
//
alitr ++;
}
// remove marker info that are not covered by any reads on the current molecule
map<unsigned long, string>::iterator cl_itr;
map<unsigned long, string>::iterator cl_itr_end;
cl_itr = toRemoveMarker.begin();
cl_itr_end = toRemoveMarker.end();
while(cl_itr != cl_itr_end)
{
unsigned long clkey = (*cl_itr).first;
alleleinfo.erase(clkey);
allelefreq.erase(clkey);
allelecov.erase(clkey);
cl_itr ++;
}
//
multimap<unsigned long, READINFO>::iterator allitr;
multimap<unsigned long, READINFO>::iterator allitr_end;
allitr = molAlignMarkerAllele.begin();
allitr_end = molAlignMarkerAllele.end();
while(allitr != allitr_end)
{
READINFO tmpallinfo = (*allitr).second;
if(tmpallinfo.mallele.size()==0)
{
tmpallinfo.mallele = "00000000\t0";
}
vector<string> mkrinfo = split_string(tmpallinfo.mallele, ',');
vector<string>::iterator mitr = mkrinfo.begin();
vector<string>::iterator mitr_end = mkrinfo.end();
while(mitr != mitr_end)
{
sspos << " " << (*mitr) << "\t"
<< tmpallinfo.spansta << "\t" << tmpallinfo.spanend << "\t"
<< tmpallinfo.mateid << "\t"
<< tmpallinfo.readname << "\n";
mitr ++;
}
allitr ++;
}
// check if there is breakpoint for current molecule using artificial sequence
int allele1 = std::count(artSeq.begin(), artSeq.end(), '1');
int allele2 = std::count(artSeq.begin(), artSeq.end(), '2');
if(allele1>=min_marker && allele2>=min_marker )
{
cout << " check: bckey " << bckey << " for current molecule " << msta << " to " << mend << ":" << endl;
cout << " : firstmkrpos->lastmkrpos = " << firstpos << "->" << lastpos << endl;
cout << " : artificial seq for current molecule " << artSeq << endl;
cout << sspos.str() << endl;
if(readwarning)
{
cout << " Warning: there were reads with same start position, last kept only. " << endl;
}
unsigned long leftp; // real
unsigned long rightp;// real
int bpPos; // position breaking artSeq
double score;
if(get_break_pos_of_artSeq(artSeq, alleleinfo, min_marker, &leftp, &rightp, &bpPos, &score))
{
// check size of potential CO interval
if(rightp - leftp + 1 <= maxCOSize)
{
unsigned long leftspan = leftp - firstpos + 1;
unsigned long rightspan = lastpos - rightp + 1;
if(leftspan>=min_span && rightspan>=min_span ) // caution - fp and fn
{
if(score > min_score)
{
/* check read density within potential CO interval:
if the size of the CO interval is too large, we still expect
a number of reads within this interval, although
they may not cover any markers.
if the size of the CO interval is too small, however, it can be
due to the false mergeing of molecules.
500 << bp_per_read_in_co <= 5000 bp - can be defined as good candidates in our settings.
Note: this should be set according to real sequencing coverage on molecules.
*/
int bp_per_read_in_co = -1; // read density within potential CO interval
bool bp_boundary_same_fragment = false; // whether reads covering co start/end from the same fragment
int min_RR_dist = 9999999; // minimum R1-R1 or R2-R2 distance within the molecule
int max_RR_dist = 0; // maximum ...
int n_readLeft = 0; // number of uniq reads ids on CO left (including non-marker covering)
int n_readRight = 0; // number of uniq reads ids on CO right (including non-marker covering)
int max_CORR_dist = 0; // maximum R1-R1 distance within CO interval
if(!check_co_read_density(leftp,
rightp,
lineinfo[8],
lineinfo[9],
lineinfo[10],
&bp_per_read_in_co,
&bp_boundary_same_fragment,
&min_RR_dist,
&max_RR_dist,
&n_readLeft,
&n_readRight,
&max_CORR_dist))
{
return false;
}
// find allele freq and cov at markers
std::stringstream artSeqCov;
artSeqCov.str("");
std::stringstream artSeqAF;
artSeqAF.str("");
map<unsigned long, string>::iterator alitri;
map<unsigned long, string>::iterator alitri_end;
alitri = alleleinfo.begin();
alitri_end = alleleinfo.end();
int ibp = 0;
while(alitri != alitri_end)
{
//
if(artSeqAF.str().size()>0 && ibp==bpPos)
{
artSeqAF << ";";
}
else
if(artSeqAF.str().size()>0)
{
artSeqAF << ",";
}
assert(allelefreq.find((*alitri).first) != allelefreq.end());
artSeqAF << allelefreq[(*alitri).first];
//
if(artSeqCov.str().size()>0 && ibp==bpPos)
{
artSeqCov << ";";
}
else
if(artSeqCov.str().size()>0)
{
artSeqCov << ",";
}
assert(allelecov.find((*alitri).first) != allelecov.end());
artSeqCov << allelecov[(*alitri).first];
//
ibp ++;
alitri ++;
}
// set minCORDist as 600
if( (minCORDist<=bp_per_read_in_co && bp_per_read_in_co<=maxCORDist) ||
(minCORDist>=bp_per_read_in_co && n_readLeft>=minRead && n_readRight>=minRead)
)
{
if( 1 )
{
//vector<string> chrbarcode = split_string(lineinfo[0], '#');
std::stringstream sstmp;
sstmp.str("");
sstmp << chrbarcode[0] << "\t"
<< leftp << "\t"
<< rightp << "\t"
<< score << "\t"
<< artSeq.substr(0, bpPos) << "\t"
<< artSeq.substr(bpPos) << "\t"
<< chrbarcode[1] << "\t"
<< lastpos - firstpos + 1 << "\t"
<< leftp - firstpos + 1 << "\t"
<< lastpos - rightp + 1 << "\t"
<< bp_per_read_in_co << "\t"
<< bp_boundary_same_fragment << "\t"
<< min_RR_dist << "\t"
<< max_RR_dist << "\t"
<< molelen << "\t"
<< readnum << "\t"
<< molecov << "\t"
<< artSeqAF.str() << "\t"
<< artSeqCov.str() << "\t"
<< rightp - leftp + 1 << "\t"
<< n_readLeft << "\t"
<< n_readRight << "\t"
<< max_CORR_dist;
rawfp << sstmp.str() << endl;
// caution filter out larger CO intervals
// if(rightp - leftp < 15000)
string aord = check_allele_order(artSeq.substr(0, bpPos), artSeq.substr(bpPos));
insert_raw_bp(&rawbpmap,
chrbarcode[0],
leftp,
rightp,
score,
aord,
sstmp.str());
cout << " bp: " << leftp << " -> " << rightp
<< "\t read density in interval = " << bp_per_read_in_co << " bp/read" << endl;
cout << " allele frequency at markers: " << artSeqAF.str() << endl;
cout << " allele coverage at markers: " << artSeqCov.str() << endl;
// collect detailed molecule info with a CO prediction
std::stringstream coinfo;
coinfo.str("");
cout << " Info: prepare file for " << chrbarcode[0]+"#" << leftp << "#" << rightp << endl;
coinfo << outPrefix+ipre.str() << "/"
<< chrbarcode[0] << "_"
<< leftp << "_"
<< rightp << "_"
<< aord << "_"
<< chrbarcode[1] << ".txt\0";
cout << " Info: details of molecule recorded in file " << coinfo.str() << endl;
ofstream tmpofp;
tmpofp.open((coinfo.str()).c_str(), ios::out | ios::app);
if(!tmpofp.good())
{
cout << " Error: cannot open file " << coinfo.str() << endl;
return false;
}
else
{
tmpofp << "# " << sstmp.str() << endl;
tmpofp << "# bckey " << bckey << " for current molecule " << msta << " to " << mend << ":" << endl;
tmpofp << sspos.str() << endl;
tmpofp.close();
}
}
else
{
cout << " bp: " << leftp << " -> " << rightp
<< "\tsame fragment but not sufficient uniq reads on each side "
<< endl;
}
}
else
{
cout << " bp: " << leftp << " -> " << rightp
<< "\t read density in interval = " << bp_per_read_in_co << " bp/read "
<< " not in given range [" << minCORDist << ", " << maxCORDist << "]" << endl;
cout << " allele frequency at markers: " << artSeqAF.str() << endl;
cout << " allele coverage at markers: " << artSeqCov.str() << endl;
}
}
else
{
cout << " bp: " << leftp << " -> " << rightp << "\tlow score=" << score << endl;
}
}
else
{
cout << " bp: " << leftp << " -> " << rightp << "\tsmall span" << endl;
}
} // end of CO size checking
else
{
cout << " larger interval size: " << rightp - leftp + 1 << " bp." << endl;
}
} // end of if(get_break_pos_of_artSeq(artSeq, alleleinfo, min_marker, &leftp, &rightp, &bpPos, &score))
else
{
cout << " Error: failed in scoring molecule sequence. " << endl;
return false;
}
}
else
{
// length of Col-/Ler-molecules
if(outLerCol && allele1*1.0/(allele1+allele2) >= 0.99) // Col-molecule
{
colMolefp << bckey << "\t" << lineinfo[1] << "\t" << lineinfo[3] << "\t" << artSeq << endl;
}