diff --git a/ribodetector/detect.py b/ribodetector/detect.py index 2982c89..902b484 100644 --- a/ribodetector/detect.py +++ b/ribodetector/detect.py @@ -39,7 +39,7 @@ class Predictor: def __init__(self, config, args): self.config = config self.args = args - self.logger = config.get_logger('predict', 1) + self.logger = config.get_logger('predict', 1, self.args.log) self.chunk_size = self.args.chunk_size def get_state_dict(self): @@ -69,11 +69,17 @@ def get_state_dict(self): self.state_file = os.path.join( cd, self.config['state_file'][model_file_ext]) - self.logger.info('Using high {} model file: {}{}{}{}'.format(model_file_ext.upper(), - colors.BOLD, - colors.OKCYAN, - self.state_file, - colors.ENDC)) + # self.logger.info('Using high {} model file: {}{}{}{}'.format(model_file_ext.upper(), + # colors.BOLD, + # colors.OKCYAN, + # self.state_file, + # colors.ENDC)) + + self.logger.info('Using high {} model'.format(model_file_ext.upper())) + + self.logger.info('Log file: {}'.format( + self.args.log + )) def load_model(self): """Load the model onto CUDA device @@ -117,6 +123,9 @@ def run(self): Load data and run the predictor """ + num_nonrrna = 0 + num_rrna = 0 + if self.is_paired: # Load paired end read files using multiprocessing with Pool(2) as p: @@ -139,7 +148,7 @@ def run(self): colors.ENDC)) rrna1_fh = open_for_write(self.rrna[0]) rrna2_fh = open_for_write(self.rrna[1]) - num_rrna = 0 + # num_rrna = 0 self.logger.info('Writing output non-rRNA sequences into file: {}{}{}'.format( colors.OKBLUE, @@ -180,13 +189,17 @@ def run(self): r1_dict, r2_dict = self.separate_paired_reads( r1, r1_output, r2, r2_output) + + num_nonrrna += len(r1_dict[0]) + num_rrna += len(r1_dict[1]) + if r1_dict[0]: norrna1_fh.write('\n'.join(r1_dict[0]) + '\n') norrna2_fh.write('\n'.join(r2_dict[0]) + '\n') if self.rrna is not None and r1_dict[1]: rrna1_fh.write('\n'.join(r1_dict[1]) + '\n') rrna2_fh.write('\n'.join(r2_dict[1]) + '\n') - num_rrna += len(r1_dict[1]) + if self.args.ensure == 'both' and r1_dict[-1]: unclf1_fh.write('\n'.join(r1_dict[-1]) + '\n') unclf2_fh.write('\n'.join(r2_dict[-1]) + '\n') @@ -195,14 +208,21 @@ def run(self): # del r1_data, r2_data, r1_output, r2_output, r1_batch_labels, r2_batch_labels # Write predicted rRNA sequences if the rRNA output file is given - if self.rrna is not None: - self.logger.info('Done! Detected {}{}{}{} rRNA sequences.'.format( - colors.BOLD, - colors.OKCYAN, - num_rrna, - colors.ENDC - )) + self.logger.info('Detected {}{}{}{} non-rRNA sequences.'.format( + colors.BOLD, + colors.OKCYAN, + num_nonrrna, + colors.ENDC + )) + + self.logger.info('Detected {}{}{}{} rRNA sequences.'.format( + colors.BOLD, + colors.OKCYAN, + num_rrna, + colors.ENDC + )) + if self.rrna is not None: rrna1_fh.close() rrna2_fh.close() @@ -239,7 +259,7 @@ def run(self): colors.ENDC)) rrna_fh = open_for_write(self.rrna[0]) - num_rrna = 0 + # num_rrna = 0 self.logger.info('Writing output non-rRNA sequences into file: {}{}{}'.format( colors.OKBLUE, @@ -262,21 +282,32 @@ def run(self): batch_labels = torch.argmax(output, dim=1).tolist() separated_reads = Predictor.separate_reads( reads, batch_labels) + + num_nonrrna += len(separated_reads[0]) + num_rrna += len(separated_reads[1]) + if separated_reads[0]: norrna_fh.write('\n'.join(separated_reads[0]) + '\n') if self.rrna is not None and separated_reads[1]: rrna_fh.write('\n'.join(separated_reads[1]) + '\n') - num_rrna += len(separated_reads[1]) # del data, output, batch_labels + + self.logger.info('Detected {}{}{}{} non-rRNA sequences.'.format( + colors.BOLD, + colors.OKCYAN, + num_nonrrna, + colors.ENDC + )) + + self.logger.info('Detected {}{}{}{} rRNA sequences'.format( + colors.BOLD, + colors.OKCYAN, + num_rrna, + colors.ENDC + )) if self.rrna is not None: - self.logger.info('Done! Detected {}{}{}{} rRNA sequences'.format( - colors.BOLD, - colors.OKCYAN, - num_rrna, - colors.ENDC - )) rrna_fh.close() norrna_fh.close() @@ -285,6 +316,10 @@ def run_with_chunks(self): Run the model on input sequence files loaded into chunks to reduce the memory comsumption """ + num_read = 0 + num_nonrrna = 0 + num_rrna = 0 + if self.is_paired: if self.rrna is not None: @@ -295,7 +330,6 @@ def run_with_chunks(self): rrna1_fh = open_for_write(self.rrna[0]) rrna2_fh = open_for_write(self.rrna[1]) - num_rrna = 0 self.logger.info('Writing output non-rRNA sequences into file: {}{}{}'.format( colors.OKBLUE, @@ -317,7 +351,7 @@ def run_with_chunks(self): colors.ENDC)) num_unknown = 0 - num_read = 0 + #num_read = 0 with torch.no_grad(): # Load paired end read files into chunks @@ -339,13 +373,14 @@ def run_with_chunks(self): r1_dict, r2_dict = self.separate_paired_reads( r1, r1_output, r2, r2_output) + num_nonrrna += len(r1_dict[0]) + num_rrna += len(r1_dict[1]) if r1_dict[0]: norrna1_fh.write('\n'.join(r1_dict[0]) + '\n') norrna2_fh.write('\n'.join(r2_dict[0]) + '\n') if self.rrna is not None and r1_dict[1]: rrna1_fh.write('\n'.join(r1_dict[1]) + '\n') rrna2_fh.write('\n'.join(r2_dict[1]) + '\n') - num_rrna += len(r1_dict[1]) if self.args.ensure == 'both' and r1_dict[-1]: unclf1_fh.write('\n'.join(r1_dict[-1]) + '\n') @@ -358,14 +393,22 @@ def run_with_chunks(self): colors.ENDC)) # del r1_data, r2_data, r1_output, r2_output, r1_batch_labels, r2_batch_labels + self.logger.info('Detected {}{}{}{} non-rRNA sequences.'.format( + colors.BOLD, + colors.OKCYAN, + num_nonrrna, + colors.ENDC + )) + + self.logger.info('Detected {}{}{}{} rRNA sequences.'.format( + colors.BOLD, + colors.OKCYAN, + num_rrna, + colors.ENDC + )) + if self.rrna is not None: - self.logger.info('Done! Detected {}{}{}{} rRNA sequences.'.format( - colors.BOLD, - colors.OKCYAN, - num_rrna, - colors.ENDC - )) rrna1_fh.close() rrna2_fh.close() @@ -387,7 +430,7 @@ def run_with_chunks(self): self.logger.info('Writing output rRNA sequences into file: {}{}{}'.format( colors.OKBLUE, ", ".join(self.rrna), colors.ENDC)) rrna_fh = open_for_write(self.rrna[0]) - num_rrna = 0 + #num_rrna = 0 self.logger.info('Writing output non-rRNA sequences into file: {}{}{}'.format( colors.OKBLUE, @@ -395,7 +438,7 @@ def run_with_chunks(self): colors.ENDC)) norrna_fh = open_for_write(self.output[0]) - num_read = 0 + #num_read = 0 with torch.no_grad(): # Load single end read file into chunks @@ -417,12 +460,15 @@ def run_with_chunks(self): batch_labels = torch.argmax(output, dim=1).tolist() separated_reads = Predictor.separate_reads( reads, batch_labels) + + num_nonrrna += len(separated_reads[0]) + num_rrna += len(separated_reads[1]) + if separated_reads[0]: norrna_fh.write( '\n'.join(separated_reads[0]) + '\n') if self.rrna is not None and separated_reads[1]: rrna_fh.write('\n'.join(separated_reads[1]) + '\n') - num_rrna += len(separated_reads[1]) # del data, output, batch_labels @@ -431,13 +477,21 @@ def run_with_chunks(self): num_read, colors.ENDC)) + self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format( + colors.BOLD, + colors.OKCYAN, + num_nonrrna, + colors.ENDC + )) + + self.logger.info('Detected {}{}{}{} rRNA sequences'.format( + colors.BOLD, + colors.OKCYAN, + num_rrna, + colors.ENDC + )) + if self.rrna is not None: - self.logger.info('Done! Detected {}{}{}{} rRNA sequences'.format( - colors.BOLD, - colors.OKCYAN, - num_rrna, - colors.ENDC - )) rrna_fh.close() norrna_fh.close() @@ -711,6 +765,8 @@ def main(): 'Not needed when free RAM >=5 * your_file_size (uncompressed, sum of paired ends)', 'When chunk_size=256, memory=16 it will load 256 * 16 * 1024 reads each chunk (use ~20 GB for 100bp paired end)' )) + args.add_argument('--log', default='ribodetector.log', type=str, + help='Log file name') args.add_argument('-v', '--version', action='version', version='%(prog)s {version}'.format(version=__version__)) diff --git a/ribodetector/detect_cpu.py b/ribodetector/detect_cpu.py index b7f9947..21eddef 100644 --- a/ribodetector/detect_cpu.py +++ b/ribodetector/detect_cpu.py @@ -37,7 +37,7 @@ class Predictor: def __init__(self, config, args): self.config = config self.args = args - self.logger = config.get_logger('predict', 1) + self.logger = config.get_logger('predict', 1, self.args.log) self.chunk_size = self.args.chunk_size # self.input = self.args.input # self.output = self.args.output @@ -71,11 +71,16 @@ def load_model(self): self.model_file = os.path.join( cd, self.config['state_file'][model_file_ext]).replace('.pth', '.onnx') - self.logger.info('Using high {} model file: {}{}{}{} on CPU'.format(model_file_ext.upper(), - colors.BOLD, - colors.OKCYAN, - self.model_file, - colors.ENDC)) + # self.logger.info('Using high {} model file: {}{}{}{} on CPU'.format(model_file_ext.upper(), + # colors.BOLD, + # colors.OKCYAN, + # self.model_file, + # colors.ENDC)) + self.logger.info('Using high {} model'.format(model_file_ext.upper())) + + self.logger.info('Log file: {}'.format( + self.args.log + )) so = onnxruntime.SessionOptions() so.intra_op_num_threads = 1 @@ -105,7 +110,10 @@ def run(self): # queue for progressbar signal q_pbar = mp.Queue() - + + num_nonrrna = 0 + num_rrna = 0 + if self.is_paired: # Load paired end read files with multiprocessing with mp.Pool(2) as p: @@ -129,7 +137,6 @@ def run(self): colors.ENDC)) rrna1_fh = open_for_write(self.rrna[0]) rrna2_fh = open_for_write(self.rrna[1]) - num_rrna = 0 self.logger.info('Writing output non-rRNA sequences into file: {}{}{}'.format( colors.OKBLUE, @@ -182,6 +189,9 @@ def run(self): for r1_dict, r2_dict in results: # Load the prediciton results and split the input reads accordingly + + num_nonrrna += len(r1_dict[0]) + num_rrna += len(r1_dict[1]) if r1_dict[0]: norrna1_fh.write('\n'.join(r1_dict[0]) + '\n') @@ -189,7 +199,6 @@ def run(self): if self.rrna is not None and r1_dict[1]: rrna1_fh.write('\n'.join(r1_dict[1]) + '\n') rrna2_fh.write('\n'.join(r2_dict[1]) + '\n') - num_rrna += len(r1_dict[1]) if self.args.ensure == 'both' and r1_dict[-1]: unclf1_fh.write('\n'.join(r1_dict[-1]) + '\n') @@ -198,14 +207,21 @@ def run(self): # del r1_data, r2_data, r1_output, r2_output, r1_batch_labels, r2_batch_labels - if self.rrna is not None: - self.logger.info('Done! Detected {}{}{}{} rRNA sequences.'.format( - colors.BOLD, - colors.OKCYAN, - num_rrna, - colors.ENDC - )) + self.logger.info('Detected {}{}{}{} non-rRNA sequences.'.format( + colors.BOLD, + colors.OKCYAN, + num_nonrrna, + colors.ENDC + )) + self.logger.info('Detected {}{}{}{} rRNA sequences.'.format( + colors.BOLD, + colors.OKCYAN, + num_rrna, + colors.ENDC + )) + + if self.rrna is not None: rrna1_fh.close() rrna2_fh.close() @@ -243,7 +259,7 @@ def run(self): colors.ENDC)) rrna_fh = open_for_write(self.rrna[0]) - num_rrna = 0 +# num_rrna = 0 self.logger.info('Writing output non-rRNA sequences into file: {}{}{}'.format( colors.OKBLUE, @@ -278,18 +294,28 @@ def run(self): for r_dict in results: + num_nonrrna += len(r_dict[0]) + num_rrna += len(r_dict[1]) if r_dict[0]: norrna_fh.write('\n'.join(r_dict[0]) + '\n') if self.rrna is not None and r_dict[1]: rrna_fh.write('\n'.join(r_dict[1]) + '\n') - num_rrna += len(r_dict[1]) + + self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format( + colors.BOLD, + colors.OKCYAN, + num_nonrrna, + colors.ENDC + )) + + self.logger.info('Detected {}{}{}{} rRNA sequences'.format( + colors.BOLD, + colors.OKCYAN, + num_rrna, + colors.ENDC + )) + if self.rrna is not None: - self.logger.info('Done! Detected {}{}{}{} rRNA sequences'.format( - colors.BOLD, - colors.OKCYAN, - num_rrna, - colors.ENDC - )) rrna_fh.close() norrna_fh.close() @@ -302,6 +328,10 @@ def run_with_chunks(self): read_chunk_size = self.batch_size * self.chunk_size + num_read = 0 + num_nonrrna = 0 + num_rrna = 0 + if self.is_paired: if self.rrna is not None: self.logger.info('Writing output rRNA sequences into file: {}{}{}'.format( @@ -310,7 +340,7 @@ def run_with_chunks(self): colors.ENDC)) rrna1_fh = open_for_write(self.rrna[0]) rrna2_fh = open_for_write(self.rrna[1]) - num_rrna = 0 + # num_rrna = 0 self.logger.info('Writing output non-rRNA sequences into file: {}{}{}'.format( colors.OKBLUE, @@ -334,7 +364,7 @@ def run_with_chunks(self): num_unknown = 0 - num_read = 0 + # num_read = 0 # Load paired end reads with chunks for chunk in SeqEncoder.get_pairedread_chunks(*self.input, @@ -366,13 +396,17 @@ def run_with_chunks(self): for r1_dict, r2_dict in results: # Load the prediciton results and split the input reads accordingly + + num_nonrrna += len(r1_dict[0]) + num_rrna += len(r1_dict[1]) + if r1_dict[0]: norrna1_fh.write('\n'.join(r1_dict[0]) + '\n') norrna2_fh.write('\n'.join(r2_dict[0]) + '\n') if self.rrna is not None and r1_dict[1]: rrna1_fh.write('\n'.join(r1_dict[1]) + '\n') rrna2_fh.write('\n'.join(r2_dict[1]) + '\n') - num_rrna += len(r1_dict[1]) + # num_rrna += len(r1_dict[1]) if self.args.ensure == 'both' and r1_dict[-1]: unclf1_fh.write('\n'.join(r1_dict[-1]) + '\n') @@ -388,14 +422,21 @@ def run_with_chunks(self): num_read, colors.ENDC)) - if self.rrna is not None: - self.logger.info('Done! Detected {}{}{}{} rRNA sequences.'.format( - colors.BOLD, - colors.OKCYAN, - num_rrna, - colors.ENDC - )) + self.logger.info('Detected {}{}{}{} non-rRNA sequences.'.format( + colors.BOLD, + colors.OKCYAN, + num_nonrrna, + colors.ENDC + )) + + self.logger.info('Detected {}{}{}{} rRNA sequences.'.format( + colors.BOLD, + colors.OKCYAN, + num_rrna, + colors.ENDC + )) + if self.rrna is not None: rrna1_fh.close() rrna2_fh.close() @@ -413,7 +454,7 @@ def run_with_chunks(self): norrna2_fh.close() else: - num_read = 0 + # num_read = 0 self.logger.info('Classify paired end reads with chunk size {}{}{}'.format( colors.BOLD, self.chunk_size, @@ -426,7 +467,7 @@ def run_with_chunks(self): colors.ENDC)) rrna_fh = open_for_write(self.rrna[0]) - num_rrna = 0 + # num_rrna = 0 self.logger.info('Writing output non-rRNA sequences into file: {}{}{}'.format( colors.OKBLUE, @@ -462,12 +503,14 @@ def run_with_chunks(self): p.join() for r_dict in results: + + num_nonrrna += len(r_dict[0]) + num_rrna += len(r_dict[1]) if r_dict[0]: norrna_fh.write('\n'.join(r_dict[0]) + '\n') if self.rrna is not None and r_dict[1]: rrna_fh.write('\n'.join(r_dict[1]) + '\n') - num_rrna += len(r_dict[1]) num_read += len(chunk) @@ -477,13 +520,21 @@ def run_with_chunks(self): num_read, colors.ENDC)) + self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format( + colors.BOLD, + colors.OKCYAN, + num_nonrrna, + colors.ENDC + )) + + self.logger.info('Detected {}{}{}{} rRNA sequences'.format( + colors.BOLD, + colors.OKCYAN, + num_rrna, + colors.ENDC + )) + if self.rrna is not None: - self.logger.info('Done! Detected {}{}{}{} rRNA sequences'.format( - colors.BOLD, - colors.OKCYAN, - num_rrna, - colors.ENDC - )) rrna_fh.close() norrna_fh.close() @@ -725,7 +776,8 @@ def main(): args.add_argument('--chunk_size', default=None, type=int, help='chunk_size * 1024 reads to load each time. \n{}.'.format( 'When chunk_size=1000 and threads=20, consumming ~20G memory, better to be multiples of the number of threads.')) - + args.add_argument('--log', default='ribodetector.log', type=str, + help='Log file name') args.add_argument('-v', '--version', action='version', version='%(prog)s {version}'.format(version=__version__)) diff --git a/ribodetector/parse_config.py b/ribodetector/parse_config.py index bc688ee..7c15b63 100644 --- a/ribodetector/parse_config.py +++ b/ribodetector/parse_config.py @@ -76,14 +76,18 @@ def __getitem__(self, name): """Access items like ordinary dict.""" return self.config[name] - def get_logger(self, name, verbosity=2): + def get_logger(self, name, verbosity=2, logfile=None): + handlers = [logging.StreamHandler()] + if logfile is not None: + handlers.append(logging.FileHandler(logfile, mode='w')) msg_verbosity = 'verbosity option {} is invalid. Valid options are {}.'.format( verbosity, self.log_levels.keys()) assert verbosity in self.log_levels, msg_verbosity logging.basicConfig( level=self.log_levels[verbosity], format='%(asctime)s : %(levelname)s %(message)s', - datefmt='%Y-%m-%d %H:%M:%S') + datefmt='%Y-%m-%d %H:%M:%S', + handlers=handlers) logger = logging.getLogger(name) return logger diff --git a/setup.py b/setup.py index 6908bfe..aee93e2 100644 --- a/setup.py +++ b/setup.py @@ -11,14 +11,14 @@ "tqdm", "numpy", "biopython", - "onnxruntime >= 1.10.0, <= 1.8.2" + "onnxruntime >= 1.10.0, <= 1.15.1", "torch >= 1.7.1, <= 1.12.1", ] setup( name="ribodetector", version="0.2.8", - python_requires=">=3.8, <3.11", + python_requires=">=3.8, <=3.10", author="Z-L Deng", author_email="dawnmsg@gmail.com", description="Accurate and rapid RiboRNA sequences Detector based on deep learning.",