// io_utils.hpp
#ifndef __io_utils_hpp_included
#define __io_utils_hpp_included
// For definition of DTW moves NIL, RIGHT, UP...
#include "dtw.hpp"
#include "exit_codes.hpp"
// For CONCAT definitions, templateToShort()
#include "cpu_utils.hpp"
// C++ string/file manipulation
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
#if HDF5_SUPPORTED == 1
extern "C"{
#include "hdf5.h"
}
#endif
#define ARRAYSIZE(a) \
((sizeof(a) / sizeof(*(a))) / \
static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
// Text progress bar UI element
static char spinner[] = { '|', '/', '-', '\\'};
static bool warned_about_checkpoint;
__host__
bool file_exists(const char *fileName){
std::ifstream infile(fileName);
return infile.good();
}
// Inspired by http://stackoverflow.com/a/236803/248823
__host__
void split_line_by_delimiter(const std::string &s, char delim, std::vector<std::string> &tokens){
std::stringstream ss;
ss.str(s);
std::string token;
while (std::getline(ss, token, delim)) {
tokens.push_back(token);
}
}
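// Illustrative usage sketch (not part of the original code; the sample line below is hypothetical):
//
//   std::vector<std::string> cols;
//   split_line_by_delimiter("read_42\t7\tread_19", '\t', cols);
//   // cols now holds {"read_42", "7", "read_19"}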
__host__
void deleteCentroidCheckpointFile(const char *checkpoint_file_name){
if(remove(checkpoint_file_name) != 0){
std::cerr << "Warning: could not remove temporary checkpoint file " << checkpoint_file_name << std::endl;
}
}
template <typename T>
__host__
void writeCentroidCheckpointToFile(const char *checkpoint_file_name, T *gpu_barycenter, int centroidLength){
std::ofstream checkpoint_file(checkpoint_file_name);
if(!checkpoint_file.is_open()){
if(!warned_about_checkpoint){
warned_about_checkpoint = true;
std::cerr << "Cannot open centroid convergence checkpoint file " << checkpoint_file_name <<
" for writing, no checkpointing for this cluster will be done (i.e. computation cannot be resumed if the program dies unexpectedly)" << std::endl;
}
return;
}
checkpoint_file << gpu_barycenter[0];
for(int i = 1; i < centroidLength; i++){
checkpoint_file << " " << gpu_barycenter[i];
}
checkpoint_file << std::endl;
checkpoint_file.close();
}
// Read an evolving centroid (as written between rounds of convergence) so the computation can be resumed from a checkpoint.
template <typename T>
__host__
int readCentroidCheckpointFromFile(const char *checkpoint_file_name, T *centroid, int centroidLength){
if(!file_exists(checkpoint_file_name)){
return 0;
}
std::ifstream checkpoint_file(checkpoint_file_name);
if(!checkpoint_file.is_open()){
std::cerr << "Cannot open existing centroid convergence checkpoint file " << checkpoint_file_name <<
" for reading, will have to restart convergence for this cluster from the start (medoid)" << std::endl;
return 0;
}
std::string line;
if(!std::getline(checkpoint_file, line)) {
std::cerr << "Existing centroid convergence checkpoint file " << checkpoint_file_name <<
" is blank, will have to restart convergence for this cluster from the start (centroid = medoid)" << std::endl;
return 0;
}
std::vector<std::string> values;
split_line_by_delimiter(line, ' ', values);
if(values.size() != (size_t) centroidLength){
std::cerr << "Existing centroid convergence checkpoint file " << checkpoint_file_name <<
" contents do not have the same sequence length as the cluster medoid (" << values.size() <<
" != " << centroidLength << "), assuming a corrupt checkpoint file; " <<
"will have to restart convergence for this cluster from the start (centroid = medoid)" << std::endl;
return 0;
}
std::cerr << "Resuming convergence from partially converged centroid in file " << checkpoint_file_name << std::endl;
std::stringstream ss(line);
for (int i = 0; i < centroidLength; i++){
ss >> centroid[i];
}
checkpoint_file.close();
return 1;
}
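// Illustrative checkpoint round trip (a hedged sketch, not from the original source; the file name,
// element type and length below are hypothetical):
//
//   const char *ckpt = "cluster3.centroid.checkpoint";
//   short centroid[4] = {10, 20, 30, 40};
//   writeCentroidCheckpointToFile(ckpt, centroid, 4);      // writes one space-delimited line
//   short resumed[4];
//   if(readCentroidCheckpointFromFile(ckpt, resumed, 4)){  // returns 1 only if the file parses cleanly
//       // resume convergence from 'resumed' instead of the cluster medoid
//   }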
// Values are always read as shorts because this function exists to support the FAST5-writing capability of DBA.
__host__
int readSequenceAverages(const char *avgs_file_name, short **avgSequences, char **avgNames, size_t *avgSeqLengths){
// Are we even in a mode where we want these data?
if(avgSequences == 0 || avgNames == 0 || avgSeqLengths == 0){
return 0;
}
std::ifstream avgs_file(avgs_file_name);
if(!avgs_file.is_open()){
std::cerr << "Cannot open sequence averages file " << avgs_file_name << " for reading" << std::endl;
exit(CANNOT_READ_DBA_AVG);
}
std::string line;
int line_number = 0;
while (std::getline(avgs_file, line)) {
line_number++;
std::vector<std::string> row_values;
split_line_by_delimiter(line, '\t', row_values);
if(row_values.size() < 2){
std::cerr << "The existing cluster average sequences file " << avgs_file_name << " has a line (#" << line_number
<< ") without the expected two-plus columns (found " << row_values.size() << ")" << std::endl;
exit(AVG_FILE_FORMAT_VIOLATION);
}
// Allocate one extra byte so the name can be stored as a NUL-terminated C string (std::string::copy does not terminate)
cudaMallocHost(&avgNames[line_number-1], sizeof(char)*(row_values[0].length()+1)); CUERR("Allocating host memory for a centroid name from file");
row_values[0].copy(avgNames[line_number-1], row_values[0].length());
avgNames[line_number-1][row_values[0].length()] = '\0';
row_values.erase(row_values.begin()); // remove the name
avgSeqLengths[line_number-1] = row_values.size();
short *avg;
cudaMallocHost(&avg, sizeof(short)*row_values.size()); CUERR("Allocating host memory for a centroid sequence from file");
avgSequences[line_number-1] = avg;
// The name column has already been stripped from row_values, so parse the remaining tab-delimited
// values directly; re-reading the whole line would choke on the leading name when extracting shorts.
for (size_t i = 0; i < row_values.size(); i++){
avg[i] = (short) std::stoi(row_values[i]);
}
}
avgs_file.close();
return line_number;
}
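// Illustrative usage sketch (hedged; the file name and capacity constant below are hypothetical).
// The averages file is expected to be tab-delimited: a name column followed by that centroid's
// short values, one centroid per line.
//
//   short *avgSeqs[MAX_CLUSTERS];        // MAX_CLUSTERS is a hypothetical capacity constant
//   char *avgNames[MAX_CLUSTERS];
//   size_t avgLens[MAX_CLUSTERS];
//   int num_avgs = readSequenceAverages("run1.avgs.txt", avgSeqs, avgNames, avgLens);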
/*
* sequences_membership is populated from the tab-delimited membership_filename; the return value is an array of
* sequence indices (per sequence_names) identifying the medoid of each cluster.
*/
int* readMedoidIndices(const char *membership_filename, int num_sequences, char **sequence_names, int *sequences_membership){
// Read an existing cluster membership info set from file
std::ifstream membership_file(membership_filename);
if(!membership_file.is_open()){
std::cerr << "Cannot open sequence cluster membership file " << membership_filename << " for reading" << std::endl;
exit(CANNOT_READ_MEMBERSHIP);
}
std::string line;
int file_num_clusters = 0;
// Ignore first, comment line
std::getline(membership_file, line);
if(line.compare(0, 1, "#")){
std::cerr << "The existing sequence cluster membership file " << membership_filename << " has a first line "
<< "without the expected '#' comment start" << std::endl;
exit(MEMBERSHIP_FILE_FORMAT_VIOLATION);
}
int line_number = 1;
while (std::getline(membership_file, line)) {
line_number++;
std::vector<std::string> row_values;
split_line_by_delimiter(line, '\t', row_values);
if(row_values.size() != 3){
std::cerr << "The existing sequence cluster membership file " << membership_filename << " has a line (#" << line_number
<< ") without the expected three columns (found " << row_values.size() << ")" << std::endl;
exit(MEMBERSHIP_FILE_FORMAT_VIOLATION);
}
try{
int cluster_index = std::stoi(row_values[1]);
if(cluster_index > file_num_clusters){
file_num_clusters = cluster_index;
}
int sequence_index = -1;
// Find the index of the sequence name
for(int i = 0; i < num_sequences; i++){
if(row_values[0].compare(sequence_names[i]) == 0){
sequence_index = i;
break;
}
}
if(sequence_index == -1){
std::cerr << "The existing sequence cluster membership file " << membership_filename << " has a line (#" << line_number
<< ") with a sequence name not found in the input (" << row_values[0] << " not in existing list of " << num_sequences << " names)" << std::endl;
exit(MEMBERSHIP_FILE_FORMAT_VIOLATION);
}
sequences_membership[sequence_index] = cluster_index;
}catch (std::invalid_argument const& ex) {
std::cerr << "The existing sequence cluster membership file " << membership_filename << " has a line (#" << line_number
<< ") where the second tab-delimited column value (" << row_values[1] << ") is not an integer as expected: "
<< ex.what() << std::endl;
exit(MEMBERSHIP_FILE_FORMAT_VIOLATION);
}
catch (std::out_of_range const& ex) {
std::cerr << "The existing sequence cluster membership file " << membership_filename << " has a line (#" << line_number
<< ") where the second tab-delimited column value (" << row_values[1] << ") is not in the standard integer range: "
<< ex.what() << std::endl;
exit(MEMBERSHIP_FILE_FORMAT_VIOLATION);
}
}
// Rewind to capture the medoids now that we know how many there are.
int *medoidIndices = new int[file_num_clusters+1]; // +1 so the highest cluster index seen can be used directly as an array position
membership_file.clear();
membership_file.seekg(0);
std::getline(membership_file, line); // skip the leading '#' comment line again, as on the first pass
while (std::getline(membership_file, line)) {
// Forgoing checks as input's been validated above.
std::vector<std::string> row_values;
split_line_by_delimiter(line, '\t', row_values);
if(row_values[0] == row_values[2]){
int sequence_index = -1;
// Find the index of the sequence name
for(int i = 0; i < num_sequences; i++){
if(row_values[0].compare(sequence_names[i]) == 0){
sequence_index = i;
break;
}
}
medoidIndices[std::stoi(row_values[1])] = sequence_index;
}
}
return medoidIndices;
}
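// Illustrative usage sketch (hedged; the file name and variables below are hypothetical).
// The membership file starts with a '#' comment line, then one tab-delimited line per sequence:
// sequence_name <tab> cluster_index <tab> medoid_name.
//
//   std::vector<int> membership(num_seqs);   // receives each sequence's cluster index
//   int *medoids = readMedoidIndices("run1.membership.txt", num_seqs, seq_names, membership.data());
//   // medoids[c] is the index (into seq_names) of cluster c's medoid; release with delete[] when done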
template <typename T>
__host__
int
writeSequences(T **cpu_sequences, size_t *seq_lengths, char **seq_names, int num_seqs, const char *filename){
std::ofstream out(filename);
if(!out.is_open()){
std::cerr << "Cannot write to " << filename << std::endl;
return CANNOT_WRITE_DTW_PATH_MATRIX;
}
for(int i = 0; i < num_seqs; i++){
T * seq = cpu_sequences[i];
out << seq_names[i] << "\t" << seq[0];
for(int j = 1; j < seq_lengths[i]; j++){
out << "\t" << seq[j];
}
out << std::endl;
}
out.close();
return 0;
}
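// Illustrative usage sketch (hedged; variables below are hypothetical). Each sequence is written as a
// tab-delimited line of "name<TAB>v1<TAB>v2...", which appears to be the same layout that
// readSequenceAverages() parses.
//
//   writeSequences(centroid_seqs, centroid_lengths, centroid_names, num_clusters, "run1.avgs.txt");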
template<typename T>
__host__
int writeDTWPathMatrix(unsigned char *cpu_stepMatrix, const char *step_filename, size_t num_columns, size_t num_rows, size_t pathPitch){
std::ofstream step(step_filename);
if(!step.is_open()){
std::cerr << "Cannot write to " << step_filename << std::endl;
return CANNOT_WRITE_DTW_PATH_MATRIX;
}
for(int i = 0; i < num_rows; i++){
for(int j = 0; j < num_columns; j++){
unsigned char move = cpu_stepMatrix[pitchedCoord(j,i,pathPitch)];
step << (move == DIAGONAL ? "D" : (move == RIGHT ? "R" : (move == UP ? "U" : (move == OPEN_RIGHT ? "O" : (move == NIL || move == NIL_OPEN_RIGHT ? "N" : "?")))));
}
step << std::endl;
}
step.close();
return 0;
}
template <typename T>
__host__
int writeDTWPath(unsigned char *cpu_pathMatrix, std::ofstream *path, T *gpu_seq, char *cpu_seqname, size_t gpu_seq_len, T *cpu_centroid, size_t cpu_centroid_len, size_t num_columns, size_t num_rows, size_t pathPitch, int flip_seq_order, int column_offset = 0, int *stripe_rows = 0){
if((*path).tellp() == 0){ // Print the sequence name at the top of the file
*path << cpu_seqname << std::endl;
}
T *cpu_seq;
// TODO: this is inefficient when running multiple rounds over the same sequences (e.g. during convergence, where we never know which path is the last one we want to capture); consider having the caller pass in the CPU-side sequence instead
cudaMallocHost(&cpu_seq, sizeof(T)*gpu_seq_len); CUERR("Allocating CPU memory for query seq in DTW path printing");
cudaMemcpy(cpu_seq, gpu_seq, sizeof(T)*gpu_seq_len, cudaMemcpyDeviceToHost); CUERR("Copying incoming GPU query to CPU in DTW path printing");
// moveI and moveJ are defined device-side in dtw.hpp, but we are host side so we need to replicate
// NIL sentinel value is for the start of the DTW alignment, the stop condition for backtracking (ergo has no corresponding moveI or moveJ)
int moveI[] = { -1, -1, 0, -1, 0, 0, 0 };
int moveJ[] = { -1, -1, -1, 0, -1, -1, -1 };
int j = num_columns - 1;
int i = stripe_rows ? *stripe_rows - 1 : num_rows - 1;
unsigned char move = cpu_pathMatrix[pitchedCoord(j,i,pathPitch)];
while (move != NIL && move != NIL_OPEN_RIGHT && (column_offset == 0 || (i >= 0 && j >= 0))) { // special stop condition if partially printing the matrix
if(flip_seq_order){
// Technically NIL and NIL_OPEN_RIGHT should never happen in here, but if they do we know there's a bad bug :-)
*path << column_offset+j << "\t" << cpu_seq[j+column_offset] << "\t" << i << "\t" << cpu_centroid[i] << "\t" << (move == DIAGONAL ? "DIAG" : (move == RIGHT ? "RIGHT" : (move == UP ? "UP" : (move == OPEN_RIGHT ? "OPEN_RIGHT" : (move == NIL ? "NIL" : (move == NIL_OPEN_RIGHT ? "NIL_OPEN_RIGHT" : "?")))))) << std::endl;
}
else{
*path << i << "\t" << cpu_seq[i] << "\t" << column_offset+j << "\t" << cpu_centroid[j+column_offset] << "\t" << (move == DIAGONAL ? "DIAG" : (move == RIGHT ? "RIGHT" : (move == UP ? "UP" : (move == OPEN_RIGHT ? "OPEN_RIGHT" : (move == NIL ? "NIL" : (move == NIL_OPEN_RIGHT ? "NIL_OPEN_RIGHT" : "?")))))) << std::endl;
}
i += moveI[move];
j += moveJ[move];
move = cpu_pathMatrix[pitchedCoord(j,i,pathPitch)];
}
// Print the anchor
if(column_offset == 0){
*path << i << "\t" << cpu_seq[i] << "\t" << column_offset+j << "\t" << cpu_centroid[j] << "\t" << (move == NIL ? "NIL" : (move == NIL_OPEN_RIGHT ? "NIL_OPEN_RIGHT" : "?")) << std::endl;
}
cudaFreeHost(cpu_seq); CUERR("Freeing CPU memory for query seq in DTW path printing");
if(stripe_rows){*stripe_rows = i+1;}
return 0;
}
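// Illustrative call sketch (hedged; every variable below is hypothetical and assumed to have been set
// up by earlier DTW code: a pitched host copy of the path matrix, a device-side query sequence, and a
// host-side centroid):
//
//   std::ofstream path_out("read_42.path.txt");
//   writeDTWPath(cpu_pathMatrix, &path_out, gpu_query, query_name, query_len,
//                cpu_centroid, centroid_len, num_columns, num_rows, pathPitch,
//                /*flip_seq_order=*/0);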
template <typename T>
__host__
int writePairDistMatrix(char *output_prefix, char **sequence_names, size_t num_sequences, T *dtwPairwiseDistances){
size_t index_offset = 0;
std::ofstream mats((std::string(output_prefix)+std::string(".pair_dists.txt")).c_str());
if(!mats.good()){
return CANNOT_WRITE_DISTANCE_MATRIX;
}
for(size_t seq_index = 0; seq_index < num_sequences-1; seq_index++){
mats << sequence_names[seq_index];
for(size_t pad = 0; pad < seq_index; ++pad){
mats << "\t";
}
mats << "\t0"; //self-distance
for(size_t paired_seq_index = seq_index + 1; paired_seq_index < num_sequences; ++paired_seq_index){
mats << "\t" << dtwPairwiseDistances[index_offset+paired_seq_index-seq_index-1];
}
index_offset += num_sequences-seq_index-1;
mats << std::endl;
}
// Last line is pro forma as all pair distances have already been printed
mats << sequence_names[num_sequences-1];
for(size_t pad = 0; pad < num_sequences; ++pad){
mats << "\t";
}
mats << "0" << std::endl;
mats.close();
return 0;
}
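// Illustrative usage sketch (hedged; variables below are hypothetical). dtwPairwiseDistances is assumed
// to be the flattened upper triangle (excluding the diagonal) in row-major order, which is how the loop
// above indexes it; the matrix is written to <output_prefix>.pair_dists.txt.
//
//   writePairDistMatrix(output_prefix, seq_names, num_seqs, cpu_pairwise_dists);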
#if SLOW5_SUPPORTED == 1
// Function to copy a selection of sequences from a SLOW5 file. The raw signal data of the chosen sequences is replaced with the data passed in via "sequences".
// slow5_file_name - the slow5 file that we will be copying the sequences from
// new_slow5_file - the name of the new slow5 file where the new data will be written
// sequence_names - a list of the sequence names found in the slow5 file that will be copied over
// sequences - the new sequence data that will be written to the new slow5 file
// sequence_lengths - the lengths of the new sequences. These should equal the lengths of the matching sequences in the original slow5 file
// num_sequences - the number of new sequences passed in
// Returns 1 on a fail, 0 on success
__host__
int writeSlow5Output(const char* slow5_file_name, const char* new_slow5_file, char** sequence_names, short** sequences, size_t *sequence_lengths, int num_sequences){
slow5_file_t *sp = slow5_open(slow5_file_name,"r");
if(sp==NULL){
std::cerr << "Error opening Slow5 file " << slow5_file_name << " Exiting." << std::endl;
return 1;
}
int ret = slow5_idx_load(sp);
if(ret<0){
std::cerr << "Error loading the index for Slow5 file " << slow5_file_name << " Exiting." << std::endl;
return 1;
}
slow5_file_t *sp_new = slow5_open(new_slow5_file, "w");
if(sp_new==NULL){
std::cerr << "Error creating new Slow5 file " << new_slow5_file << " Exiting." << std::endl;
return 1;
}
slow5_hdr_t* header_tmp = sp_new->header;
sp_new->header = sp->header;
if(slow5_hdr_write(sp_new) < 0){
std::cerr << "Error writing header to Slow5 file " << new_slow5_file << " Exiting." << std::endl;
return 1;
}
slow5_rec_t *rec = NULL;
// Start of reads copy
for(int i = 0; i < num_sequences; i++){
// Check if sequence exists in original Slow5 file
ret = slow5_get(sequence_names[i], &rec, sp);
if(ret < 0){
std::cerr << "Error. Sequence " << sequence_names[i] << " does not exist in Slow5 file " << slow5_file_name << " Exiting." << std::endl;
return 1;
}
// Get the length of the Raw Signal and check if it matches with the length of the new sequence that will be replacing it
if(rec->len_raw_signal != sequence_lengths[i]){
std::cerr << "Length of sequence " << sequence_names[i] << " in Slow5 file " << slow5_file_name << " (" << rec->len_raw_signal
<< ") does not match length of sequence given (" << sequence_lengths[i] << ") Exiting." << std::endl;
return 1;
}
for(uint64_t j=0; j < rec->len_raw_signal; j++){
rec->raw_signal[j]=sequences[i][j];
}
//write to file
if (slow5_write(rec, sp_new) < 0){
std::cerr << "Error writing new sequences to new Slow5 file " << new_slow5_file << " Exiting." << std::endl;
return 1;
}
}
slow5_rec_free(rec);
slow5_idx_unload(sp);
slow5_close(sp);
sp_new->header=header_tmp;
slow5_close(sp_new);
return 0;
}
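// Illustrative usage sketch (hedged; names below are hypothetical). The replacement sequences must have
// exactly the same lengths as the corresponding reads in the source SLOW5 file.
//
//   if(writeSlow5Output("run1.blow5", "run1.averaged.blow5", seq_names, new_signals,
//                       seq_lengths, num_seqs) != 0){
//       std::cerr << "SLOW5 output failed" << std::endl;
//   }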
#endif
#if HDF5_SUPPORTED == 1
// Function to copy a selection of sequences from a multi-read Fast5 file. The raw signal data of the chosen sequences is replaced with the data passed in via "sequences".
// fast5_file_name - the fast5 file that we will be copying the sequences from
// new_fast5_file - the name of the new fast5 file where the new data will be written
// sequence_names - a list of the sequence names found in the fast5 file that will be copied over
// sequences - the new sequence data that will be written to the new fast5 file
// sequence_lengths - the lengths of the new sequences. These should equal the lengths of the matching sequences in the original fast5 file
// num_sequences - the number of new sequences passed in
// Returns 1 on a fail, 0 on success
__host__
int writeFast5Output(const char* fast5_file_name, const char* new_fast5_file, char** sequence_names, short** sequences, size_t *sequence_lengths, int num_sequences){
// HDF5 variables needed
hid_t org_file_id, new_file_id, org_read_group, new_read_group, org_attr, new_attr, memtype, space, org_signal_dataset_id, signal_dataspace_id, new_group, new_dataset_prop_list, new_dataset;
hsize_t org_attr_size;
// Initial copy of metadata
// Not sure if you can actually 'copy' this info, so for right now we're creating new data in the new file. Might want to see if this can just be copied in the future
// Open Fast5 file we want to copy data from
if((org_file_id = H5Fopen(fast5_file_name, H5F_ACC_RDONLY, H5P_DEFAULT)) < 0){
std::cerr << "Error opening Fast5 file " << fast5_file_name << " Exiting." << std::endl;
return 1;
}
// Create the new Fast5 file we want to write the copied data to
if((new_file_id = H5Fcreate(new_fast5_file, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT)) < 0){
std::cerr << "Error creating new Fast5 file " << new_fast5_file << " Exiting." << std::endl;
return 1;
}
H5Eset_auto1(NULL, NULL);
// Open the base read group inside the original file
if((org_read_group = H5Gopen(org_file_id, "/", H5P_DEFAULT)) < 0){
std::cerr << "Error opening Group '/' from " << fast5_file_name << " Exiting." << std::endl;
return 1;
}
// Open the base read group in the new file
if((new_read_group = H5Gopen(new_file_id, "/", H5P_DEFAULT)) < 0){
std::cerr << "Error opening Group '/' from " << new_fast5_file << " Exiting." << std::endl;
return 1;
}
// Open the file_type attribute inside of the / read group in the original file
// To be closed after use
if((org_attr = H5Aopen(org_read_group, "file_type", H5P_DEFAULT)) >= 0){
// Get the size of the data in file_type in the original file
if((org_attr_size = H5Aget_storage_size(org_attr)) == 0){
std::cerr << "Error getting Attribute 'file_type' size from " << fast5_file_name << " Exiting." << std::endl;
return 1;
}
// Get the type of the data in file_type in the original file
hid_t org_atype = H5Aget_type(org_attr);
char* data_buffer = (char*) std::malloc(org_attr_size);
// Read in the data from the attribute file_type in the original file
if(H5Aread(org_attr, org_atype, (void*)data_buffer)){
std::cerr << "Error reading attribute 'file_type' from Fast5 file " << fast5_file_name << " Exiting." << std::endl;
return 1;
}
// Set the memory type of the new attribute to be a c string
memtype = H5Tcopy(H5T_C_S1);
if(H5Tset_size(memtype, org_attr_size)){
std::cerr << "Error assigning size for file_type in " << new_fast5_file << " Exiting." << std::endl;
return 1;
}
// Create a new string scalar dataspace for the data
if((space = H5Screate(H5S_SCALAR)) < 0){
std::cerr << "Failed to create a scalar memory space specification in the HDF5 API, please report to the software author(s)." << std::endl;
exit(FAST5_HDF5_API_ERROR);
}
// Create the new attribute with the above memtype and space
// To be closed after use
if((new_attr = H5Acreate(new_read_group, "file_type", memtype, space, H5P_DEFAULT, H5P_DEFAULT)) < 0){
std::cerr << "Error creating attribute 'file_type' for " << new_fast5_file << " Exiting." << std::endl;
return 1;
}
// Write the new attribute to the new file
if(H5Awrite(new_attr, memtype, (void*)data_buffer)){
std::cerr << "Error writing attribute 'file_type' to Fast5 file " << new_fast5_file << " Exiting." << std::endl;
return 1;
}
H5Aclose(org_attr);
H5Aclose(new_attr);
// Open 'file_version' attribute in original file
if((org_attr = H5Aopen(org_read_group, "file_version", H5P_DEFAULT)) < 0){
std::cerr << "Error opening Attribute 'file_version' from " << fast5_file_name << " Exiting." << std::endl;
return 1;
}
// Get size of 'file_version' attribute
if((org_attr_size = H5Aget_storage_size(org_attr)) == 0){
std::cerr << "Error getting Attribute 'file_version' size from " << fast5_file_name << " Exiting." << std::endl;
return 1;
}
H5Tclose(org_atype); // release the previous attribute type handle before fetching the next one
org_atype = H5Aget_type(org_attr);
data_buffer = (char*) std::realloc(data_buffer, org_attr_size);
// Read in the data from the attribute file_version in the original file
if(H5Aread(org_attr, org_atype, (void*)data_buffer)){
std::cerr << "Error reading attribute 'file_version' from Fast5 file " << fast5_file_name << " Exiting." << std::endl;
return 1;
}
if(H5Tset_size (memtype, org_attr_size)){
std::cerr << "Error assigning size for file_version in " << new_fast5_file << " Exiting." << std::endl;
return 1;
}
// Create the new attribute with the above memtype and space
if((new_attr = H5Acreate(new_read_group, "file_version", memtype, space, H5P_DEFAULT, H5P_DEFAULT)) < 0){
std::cerr << "Error creating attribute 'file_version' for " << new_fast5_file << " Exiting." << std::endl;
return 1;
}
// Write the new attribute to the new file
if(H5Awrite(new_attr, memtype, (void*)data_buffer)){
std::cerr << "Error writing attribute 'file_version' to Fast5 file " << new_fast5_file << " Exiting." << std::endl;
return 1;
}
// Close what's no longer needed
H5Tclose(memtype);
H5Sclose(space);
H5Aclose(org_attr);
H5Aclose(new_attr);
free(data_buffer);
}
H5Gclose(org_read_group);
H5Gclose(new_read_group);
// End of metadata copy
// Start of reads copy
for(int i = 0; i < num_sequences; i++){
// Check if sequence exists in original Fast5 file
if(H5Oexists_by_name(org_file_id, CONCAT2("/", sequence_names[i]).c_str(), H5P_DEFAULT) == 0){
std::cerr << "Error. Sequence " << sequence_names[i] << " does not exist in Fast5 file " << fast5_file_name << " Exiting." << std::endl;
return 1;
}
// Get Dataset for the Raw Signal of the sequence
if((org_signal_dataset_id = H5Dopen(org_file_id, (CONCAT3("/", sequence_names[i], "/Raw/Signal")).c_str(), H5P_DEFAULT)) < 0){
std::cerr << "Unable to open " << sequence_names[i] << " Signal in " << fast5_file_name << " Exiting." << std::endl;
return 1;
}
// Get the Dataspace for the Raw Signal
if((signal_dataspace_id = H5Dget_space(org_signal_dataset_id)) < 0){
std::cerr << "Unable to get dataspace for " << sequence_names[i] << " Signal in " << fast5_file_name << " Exiting." << std::endl;
return 1;
}
// Get the length of the Raw Signal and check if it matches with the length of the new sequence that will be replacing it
const hsize_t read_length = H5Sget_simple_extent_npoints(signal_dataspace_id);
if(read_length != sequence_lengths[i]){
std::cerr << "Length of sequence " << sequence_names[i] << " in Fast5 file " << fast5_file_name << " (" << read_length
<< ") does not match length of sequence given (" << sequence_lengths[i] << ") Exiting." << std::endl;
return 1;
}
// Copy over the data of the sequence in the original Fast5 file to the new Fast5 file
if(H5Ocopy(org_file_id, CONCAT2("/", sequence_names[i]).c_str(), new_file_id, CONCAT2("/", sequence_names[i]).c_str(), H5P_DEFAULT, H5P_DEFAULT)){
std::cerr << "Error copying read " << sequence_names[i] << " from " << fast5_file_name << " to " << new_fast5_file << " Exiting." << std::endl;
return 1;
}
// Create a space that will be the size of the sequence length and have a max size of H5S_UNLIMITED
hsize_t max_dims = H5S_UNLIMITED;
if((space = H5Screate_simple(1, &read_length, &max_dims)) < 0){
std::cerr << "Failed to create a simple memory space specification in the HDF5 API, please report to the software author(s)." << std::endl;
exit(FAST5_HDF5_API_ERROR);
}
// Delete the link to the original signal data in the new file
if(H5Ldelete(new_file_id, (CONCAT3("/", sequence_names[i], "/Raw/Signal")).c_str(), H5P_DEFAULT)){
std::cerr << "Unable to delete " << (CONCAT3("/", sequence_names[i], "/Raw/Signal")).c_str() << " from " << new_fast5_file << " Exiting." << std::endl;
return 1;
}
// Open the group where the new signal data will be written
if((new_group = H5Gopen(new_file_id, (CONCAT3("/", sequence_names[i], "/Raw")).c_str(), H5P_DEFAULT)) < 0){
std::cerr << "Unable to open " << (CONCAT3("/", sequence_names[i], "/Raw")).c_str() << " group in " << new_fast5_file << " Exiting." << std::endl;
return 1;
}
// Create a property list for the dataset
if((new_dataset_prop_list = H5Pcreate(H5P_DATASET_CREATE)) < 0){
std::cerr << "Unable to create property list for " << new_fast5_file << " Exiting." << std::endl;
return 1;
}
// Set chunk size for the property list since the max size for the space is UNLIMITED
if(H5Pset_chunk(new_dataset_prop_list, 1, &read_length)){
std::cerr << "Unable to set chunk for property list in " << new_fast5_file << " Exiting." << std::endl;
return 1;
}
// Create the new dataset with the above parameters that we've set
if((new_dataset = H5Dcreate2 (new_group, "Signal", H5T_STD_I16LE, space, H5P_DEFAULT, new_dataset_prop_list, H5P_DEFAULT)) < 0){
std::cerr << "Unable to create dataset " << (CONCAT3("/", sequence_names[i], "/Raw/Signal")).c_str() << " in " << new_fast5_file << " Exiting." << std::endl;
return 1;
}
// Write the new sequences to the newly created dataset
if(H5Dwrite(new_dataset, H5T_STD_I16LE, H5S_ALL, H5S_ALL, H5P_DEFAULT, sequences[i])){
std::cerr << "Error writing new sequences to new Fast5 file " << new_fast5_file << " Exiting." << std::endl;
return 1;
}
// Close what was opened in the loop
H5Gclose(new_group);
H5Pclose(new_dataset_prop_list);
H5Dclose(new_dataset);
H5Dclose(org_signal_dataset_id);
H5Sclose(signal_dataspace_id);
H5Sclose(space);
}
// Close everything else
H5Fclose(org_file_id);
H5Fclose(new_file_id);
return 0;
}
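// Illustrative usage sketch (hedged; names below are hypothetical). As with the SLOW5 variant, the
// replacement signal lengths must match the originals read from the source multi-read Fast5 file.
//
//   if(writeFast5Output("run1.fast5", "run1.averaged.fast5", seq_names, new_signals,
//                       seq_lengths, num_seqs) != 0){
//       std::cerr << "FAST5 output failed" << std::endl;
//   }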
#endif
__host__
void setupPercentageDisplay(std::string title){
std::cerr << title << std::endl;
std::cerr << "0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%" << std::endl;
}
__host__
void teardownPercentageDisplay(){
std::cerr << std::endl;
}
__host__
int updatePercentageComplete(int current_item, int total_items, int alreadyDisplaying){
int newDisplayTotal = 100*((float) current_item/total_items);
if(newDisplayTotal > alreadyDisplaying){
for(; alreadyDisplaying < newDisplayTotal; alreadyDisplaying++){
std::cerr << "\b.|";
}
}
else{
std::cerr << "\b" << spinner[current_item%ARRAYSIZE(spinner)];
}
return newDisplayTotal;
}
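// Illustrative usage of the progress display trio above (hedged; the loop bound is hypothetical):
//
//   setupPercentageDisplay("Computing DTW distances");
//   int shown = 0;
//   for(int item = 0; item < total_items; item++){
//       // ... do the work for this item ...
//       shown = updatePercentageComplete(item+1, total_items, shown);
//   }
//   teardownPercentageDisplay();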
#endif