Skip to content

Commit

Permalink
Refactor file path handling
Browse files — browse the repository at this point in the history
  • Loading branch information
tgurbich committed Feb 25, 2021
1 parent 493b932 commit c0d8dc1
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 95 deletions.
111 changes: 34 additions & 77 deletions ClassifyCNV.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,50 +7,43 @@
import copy
from multiprocessing import Pool
import time
import shutil


__version__ = '1.0.1'
__version__ = '1.1.0'

parser = argparse.ArgumentParser()

parser.add_argument('--infile', required=True, help='Input file in BED format; the first four columns should be '
'chromosome, start position, end position, CNV type (DEL or DUP).')
parser.add_argument('--GenomeBuild', required=True, choices=['hg19', 'hg38'], help='Human assembly version '
'(hg19 or hg38).')
parser.add_argument('--infile', required=True,
help='Input file in BED format; the first four columns should be chromosome, start position, '
'end position, CNV type (DEL or DUP).')
parser.add_argument('--GenomeBuild', required=True, choices=['hg19', 'hg38'],
help='Human assembly version (hg19 or hg38).')
parser.add_argument('--cores', type=int, default=1, help='Maximum number of threads to use. Default: 1')
parser.add_argument('--precise', action='store_true', help='Specify this flag if the CNV breakpoints '
'are precise. WARNING: if the breakpoints are not precise, '
'specifying the flag could lead to incorrect results. '
'Default = False')
parser.add_argument('--outdir', help='Specify the name for the run output directory that will be created inside the'
'ClassifyCNV_results folder. The default name is Result_dd_Mon_yyyy-hh-mm-ss')
parser.add_argument('--precise', action='store_true',
help='Specify this flag if the CNV breakpoints are precise. WARNING: if the breakpoints are not '
'precise, specifying the flag could lead to incorrect results. Default = False')
parser.add_argument('--outdir', default=default_results_folder,
help='Specify path to the run output directory. If no output directory is provided, results will '
'be saved to ClassifyCNV_results/Result_dd_Mon_yyyy-hh-mm-ss-{random}')
args = parser.parse_args()


def make_results_folder():
"""Creates a directory where all results and technical files will be saved to.
"""Creates a directory where all results and technical files will be saved to if it doesn't already exist.
"""
# check if the main results folder already exists
# check if the main results folder already exists and if it is empty
# if not, create it before proceeding
if not os.path.isdir(os.path.join(home_dir, main_results_folder)):
try:
os.mkdir(os.path.join(home_dir, main_results_folder))
except OSError:
print('Cannot create main results folder', main_results_folder)
sys.exit(1)
# make the results folder for this run
try:
os.mkdir(path_to_results)
except OSError:
print('Cannot create results folder', path_to_results)
sys.exit(1)
if os.path.isdir(args.outdir):
assert not os.listdir(args.outdir), "Results directory is not empty"
else:
if args.outdir == default_results_folder:
os.makedirs(args.outdir)
else:
os.mkdir(args.outdir)
# change to the results directory
os.chdir(args.outdir)
# make the intermediate folder where the technical files will be saved to
try:
os.mkdir(path_to_intermediate)
except OSError:
print('Cannot create intermediate results folder', path_to_intermediate)
sys.exit(1)
os.mkdir(intermediate_folder)


def run_in_parallel(function, params_list, cores):
Expand All @@ -76,22 +69,24 @@ def run_in_parallel(function, params_list, cores):
return returncodes


def parse_infile():
def parse_infile(infile):
"""Parses the BED infile.
Adds "chr" to the chromosome number if needed, removes duplicate entries.
Creates a new "clean" BED file for further manipulations.
Args:
infile: Original CNV file submitted by the user.
Returns:
parsed_list: A list of CNVs in the chr_start_end_type format, for example, chr1_1000_2500_DEL.
"""
parsed_list = set()
bed_infile = open(args.infile, 'r')
bed_infile = open(infile, 'r')
parsed_outfile = open(cleaned_bed_path, 'w')
for line in bed_infile:
fields = line.strip().split()
if len(fields) < 4:
shutil.rmtree(path_to_results)
sys.exit("ERROR: the input file must have at least 4 columns")
# check that the line starts with the chromosome number, skip the line if it does not
if fields[0][:3] not in ['chr', 'X', 'Y', 'M', '1', '2', '3', '4', '5', '6', '7', '8', '9',
Expand All @@ -103,7 +98,6 @@ def parse_infile():
continue
# stop execution if the fourth column does not contain the CNV type
if fields[3] not in ['DEL', 'DUP']:
shutil.rmtree(path_to_results)
sys.exit("ERROR: the 4th column of the input file does not contain the CNV type (DEL/DUP).")
# each chromosome number should start with 'chr'
if not fields[0].startswith('chr'):
Expand Down Expand Up @@ -138,7 +132,6 @@ def run_bedtools_intersect(file_b_type):
intersect_proc.wait()
if not intersect_proc.returncode == 0:
print("ERROR when running BEDTools on ", file_b_type)
shutil.rmtree(path_to_results)
sys.exit(1)


Expand Down Expand Up @@ -757,7 +750,7 @@ def generate_results():
and prints full results to file.
"""
results_out = open(scoresheet, 'w')
results_out = open(scoresheet_filename, 'w')
results_out.write(scoresheet_header + '\n')
for cnv in sorted(cnv_list):
# add up individual scores for each element in the rubric to get the final score
Expand Down Expand Up @@ -799,36 +792,6 @@ def generate_results():
results_out.close()


def rename_directory():
"""If a preferred output directory is specified, this function renames the results directory.
If the specified directory already exists, result files are copied over and the temporary results
directory is deleted.
Returns:
Renames the results directory.
"""
new_output_path = os.path.join(home_dir, main_results_folder, args.outdir)
# check if the specified directory already exists; if so - copy files over to this directory
if os.path.isdir(new_output_path):
try:
shutil.copyfile(scoresheet, os.path.join(new_output_path, scoresheet_filename))
except OSError:
print('Incorrect folder name, results are in', path_to_results)
sys.exit(1)
if os.path.isdir(os.path.join(new_output_path, intermediate_folder)):
shutil.rmtree(os.path.join(new_output_path, intermediate_folder))
shutil.copytree(path_to_intermediate, os.path.join(new_output_path, intermediate_folder))
shutil.rmtree(path_to_results)
# if the directory doesn't already exist, rename the temporary directory with user-specified name
else:
try:
os.rename(path_to_results, new_output_path)
except OSError:
print('Incorrect folder name, results are in', path_to_results)
sys.exit(1)


if __name__ == "__main__":
t_start = time.perf_counter() # time the run
print(__file__, 'Version', __version__)
Expand All @@ -840,8 +803,9 @@ def rename_directory():
if args.precise:
breakpoints = dict() # stores intragenic CNVs

make_results_folder() # create a folder where the results will be stored
cnv_list = parse_infile() # save each CNV as chr_start_end_type and print a new file for BEDTools
infile_path = os.path.abspath(args.infile)
make_results_folder() # create a folder where the results will be stored if it doesn't already exist
cnv_list = parse_infile(infile_path) # save each CNV as chr_start_end_type and print a new file for BEDTools

# make empty result dictionaries
for cnv in cnv_list:
Expand All @@ -856,14 +820,7 @@ def rename_directory():
# calculate the total score, determine pathogenicity, print results to file
generate_results()

# rename the output directory if the user specified a preferred name
if args.outdir:
rename_directory()

t_stop = time.perf_counter()
t_fact = t_stop - t_start
if args.outdir:
print('Results saved to', os.path.join(home_dir, main_results_folder, args.outdir) + '/')
else:
print('Results saved to', path_to_results + '/')
print('Results saved to', args.outdir)
print('Elapsed time:', '{0:.2f}'.format(t_fact), 'seconds')
34 changes: 16 additions & 18 deletions resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,21 @@
import sys
from datetime import datetime
from collections import OrderedDict
import random
import string

# Results path
# Default results path
home_dir = os.path.dirname(os.path.realpath(sys.argv[0])) # directory where the script is actually located
run_origin_dir = os.path.abspath(os.getcwd()) # directory where the script is run from
main_results_folder = 'ClassifyCNV_results'
run_folder_prefix = 'Result_'
run_results_folder = run_folder_prefix + datetime.now().strftime("%d-%b-%Y-%H-%M-%S")
path_to_results = os.path.join(home_dir, main_results_folder, run_results_folder)
random_string = ''.join(random.choices(string.ascii_lowercase + string.digits, k=8))
run_results_folder = run_folder_prefix + datetime.now().strftime("%d-%b-%Y-%H-%M-%S-") + random_string
default_results_folder = os.path.join(main_results_folder, run_results_folder)
intermediate_folder = 'Intermediate_files'
path_to_intermediate = os.path.join(path_to_results, intermediate_folder)


# Filename for the cleaned input
cleaned_bed = 'infile.cleaned.bed'
cleaned_bed_path = os.path.join(path_to_intermediate, cleaned_bed)
cleaned_bed_path = os.path.join(intermediate_folder, cleaned_bed)

# Resources and databases
main_resources_folder = 'Resources'
Expand Down Expand Up @@ -57,16 +57,15 @@
gene_features_intersect = 'gene_features_intersect.bed'
pop_freqs_intersect = 'population_freqs_intersect.bed'

refgenes_intersect_path = os.path.join(path_to_intermediate, refgenes_intersect)
promoters_intersect_path = os.path.join(path_to_intermediate, promoters_intersect)
enhancers_intersect_path = os.path.join(path_to_intermediate, enhancers_intersect)
clingen_hi_intersect_path = os.path.join(path_to_intermediate, clingen_hi_intersect)
clingen_ts_intersect_path = os.path.join(path_to_intermediate, clingen_ts_intersect)
clingen_regions_hi_intersect_path = os.path.join(path_to_intermediate, clingen_regions_hi_intersect)
clingen_regions_ts_intersect_path = os.path.join(path_to_intermediate, clingen_regions_ts_intersect)
gene_features_intersect_path = os.path.join(path_to_intermediate, gene_features_intersect)
pop_freqs_intersect_path = os.path.join(path_to_intermediate, pop_freqs_intersect)

refgenes_intersect_path = os.path.join(intermediate_folder, refgenes_intersect)
promoters_intersect_path = os.path.join(intermediate_folder, promoters_intersect)
enhancers_intersect_path = os.path.join(intermediate_folder, enhancers_intersect)
clingen_hi_intersect_path = os.path.join(intermediate_folder, clingen_hi_intersect)
clingen_ts_intersect_path = os.path.join(intermediate_folder, clingen_ts_intersect)
clingen_regions_hi_intersect_path = os.path.join(intermediate_folder, clingen_regions_hi_intersect)
clingen_regions_ts_intersect_path = os.path.join(intermediate_folder, clingen_regions_ts_intersect)
gene_features_intersect_path = os.path.join(intermediate_folder, gene_features_intersect)
pop_freqs_intersect_path = os.path.join(intermediate_folder, pop_freqs_intersect)

databases = {
'genes': {'source': refgenes_db, 'result_path': refgenes_intersect_path},
Expand All @@ -89,7 +88,6 @@

# Printed results
scoresheet_filename = 'Scoresheet.txt'
scoresheet = os.path.join(path_to_results, scoresheet_filename)
scoresheet_header = '\t'.join(['VariantID', 'Chromosome', 'Start', 'End', 'Type', 'Classification', 'Total score']) + '\t'
scoresheet_header += '\t'.join(rubric.keys()) + '\t' + 'Known or predicted dosage-sensitive genes' + \
'\t' + 'All protein coding genes'
Expand Down

0 comments on commit c0d8dc1

Please sign in to comment.