MPNST sample updates AND PDX code addition #251

Merged
merged 6 commits on Dec 6, 2024
321 changes: 321 additions & 0 deletions build/build_dataset.py
@@ -0,0 +1,321 @@
"""
Script that builds a single dataset.
"""

import os
import sys
import argparse
import subprocess
import shutil
import gzip
import glob
from concurrent.futures import ThreadPoolExecutor

def run_docker_cmd(cmd_arr, filename):
'''
Wrapper for 'docker run' command. Executes a Docker container with the specified command.
'''
print('Running...', filename)
env = os.environ.copy()
    if 'SYNAPSE_AUTH_TOKEN' not in env:
        print('Warning: SYNAPSE_AUTH_TOKEN is not set; it is required to access the MPNST and beatAML datasets')
docker_run = ['docker', 'run', '-v', f"{env['PWD']}/local/:/tmp/", '--platform=linux/amd64']
else:
docker_run = ['docker', 'run', '-v', f"{env['PWD']}/local/:/tmp/", '-e', f"SYNAPSE_AUTH_TOKEN={env['SYNAPSE_AUTH_TOKEN']}", '--platform=linux/amd64']
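    # For illustration, the assembled command ends up looking like:
    #   docker run -v $PWD/local/:/tmp/ -e SYNAPSE_AUTH_TOKEN=... --platform=linux/amd64 mpnst bash build_samples.sh
    # where cmd_arr supplies the image name and the script to run inside it.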

cmd = docker_run + cmd_arr
print('Executing command:', ' '.join(cmd))
res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if res.returncode != 0:
print(res.stderr.decode())
        sys.exit(f'{filename} failed')
else:
print(f'{filename} completed successfully')

def process_docker(dataset, validate):
'''
Build Docker images required for the specified dataset.
'''
compose_file = 'build/docker/docker-compose.yml'
dataset_map = {
'broad_sanger': ['broad_sanger_exp', 'broad_sanger_omics'],
'hcmi': ['hcmi'],
'beataml': ['beataml'],
'mpnst': ['mpnst'],
'mpnstpdx': ['mpnstpdx'],
'cptac': ['cptac'],
'genes': ['genes'],
'upload': ['upload']
}

# Collect container names to build based on the dataset provided. Always build 'genes'.
datasets_to_build = ['genes']
# Append upload if validation step is included
    if validate:
datasets_to_build.append('upload')

datasets_to_build.extend(dataset_map.get(dataset, []))

compose_command = ['docker-compose', '-f', compose_file, 'build'] + datasets_to_build
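    # e.g. docker-compose -f build/docker/docker-compose.yml build genes upload mpnstpdx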

log_file_path = 'local/docker.log'
env = os.environ.copy()

print(f"Docker-compose is building images for {', '.join(datasets_to_build)}. View output in {log_file_path}.")

with open(log_file_path, 'w') as log_file:
try:
subprocess.run(compose_command, env=env, stdout=log_file, stderr=log_file, text=True, check=True)
log_file.write("Docker images built successfully.\n")
print(f"Docker images for {', '.join(datasets_to_build)} built successfully. Details logged in {log_file_path}.")
except subprocess.CalledProcessError as e:
log_file.write(f"Docker compose build failed with error: {e}\n")
print(f"Docker compose build failed. See {log_file_path} for details.")
raise

def process_genes(executor):
'''
Build the genes file if it does not exist.
'''
if not os.path.exists('local/genes.csv'):
executor.submit(run_docker_cmd, ['genes', 'bash', 'build_genes.sh'], 'genes file')

def process_samples(executor, dataset, use_prev_dataset, should_continue):
'''
Build the samples file for the specified dataset.
'''
samples_file = f'local/{dataset}_samples.csv'
if should_continue and os.path.exists(samples_file):
print(f"Samples file for {dataset} already exists. Skipping samples build.")
return

prev_samples_file = f'/tmp/{use_prev_dataset}_samples.csv' if use_prev_dataset else ''
di = 'broad_sanger_omics' if dataset == 'broad_sanger' else dataset
filename = f'{dataset} samples'
executor.submit(run_docker_cmd, [di, 'bash', 'build_samples.sh', prev_samples_file], filename)

def process_drugs(executor, dataset, use_prev_dataset, should_continue):
'''
Build the drugs file for the specified dataset.
'''
if dataset in ['cptac', 'hcmi']:
return # No drugs to process for these datasets

drugs_file = f'local/{dataset}_drugs.tsv'
if should_continue and os.path.exists(drugs_file):
print(f"Drugs file for {dataset} already exists. Skipping drugs build.")
return

    dflist = [f'/tmp/{use_prev_dataset}_drugs.tsv'] if use_prev_dataset else []
di = 'broad_sanger_exp' if dataset == 'broad_sanger' else dataset
filename = f'{dataset} drugs'
executor.submit(run_docker_cmd, [di, 'bash', 'build_drugs.sh', ','.join(dflist)], filename)


def process_omics(executor, dataset, should_continue):
'''
Build the omics files for the specified dataset.
'''
# Map datasets to their expected omics files
dataset_omics_files = {
'beataml': ['mutations', 'proteomics', 'transcriptomics'],
'mpnst': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
'broad_sanger': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
'cptac': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
'hcmi': ['mutations', 'transcriptomics'],
        'mpnstpdx': ['copy_number', 'mutations', 'proteomics', 'transcriptomics']
}

expected_omics = dataset_omics_files.get(dataset, [])

if not expected_omics:
print(f"No omics data expected for dataset {dataset}. Skipping omics build.")
return

# Check if all expected omics files exist
omics_files_exist = True
for omics_type in expected_omics:
patterns = [
f'local/{dataset}_{omics_type}.csv',
f'local/{dataset}_{omics_type}.csv.gz',
f'local/{dataset}_{omics_type}.tsv',
f'local/{dataset}_{omics_type}.tsv.gz'
]
file_found = False
for pattern in patterns:
matches = glob.glob(pattern)
if matches:
file_found = True
break
if not file_found:
omics_files_exist = False
break # If any omics files are missing, just build / rebuild them all.

if should_continue and omics_files_exist:
print(f"Omics files for {dataset} already exist. Skipping omics build.")
return

di = 'broad_sanger_omics' if dataset == 'broad_sanger' else dataset
filename = f'{dataset} omics'
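    # Host-side local/ is mounted at /tmp/ in the container (see run_docker_cmd), so
    # /tmp/genes.csv and /tmp/{dataset}_samples.csv resolve to the files built earlier.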
executor.submit(run_docker_cmd, [di, 'bash', 'build_omics.sh', '/tmp/genes.csv', f'/tmp/{dataset}_samples.csv'], filename)


def process_experiments(executor, dataset, should_continue):
'''
Build the experiments files for the specified dataset.
'''
if dataset in ['cptac', 'hcmi']:
return # No experiments to process for these datasets

experiments_file = f'local/{dataset}_experiments.tsv'
if should_continue and os.path.exists(experiments_file):
print(f"Experiments file for {dataset} already exists. Skipping experiments build.")
return

di = 'broad_sanger_exp' if dataset == 'broad_sanger' else dataset
filename = f'{dataset} experiments'
executor.submit(run_docker_cmd, [di, 'bash', 'build_exp.sh', f'/tmp/{dataset}_samples.csv', f'/tmp/{dataset}_drugs.tsv'], filename)



def process_misc(executor, datasets):
    '''
    Run any final misc build scripts, one dataset at a time.
    '''
    last_misc_future = None
    # Currently this only applies to broad_sanger. Add others here if they need a final step.
    if "broad_sanger" in datasets:
        datasets = ["broad_sanger"]
    else:
        return
    for da in datasets:
        di = 'broad_sanger_omics' if da == 'broad_sanger' else da
        # Wait for the previous misc job to finish before starting the next one:
        if last_misc_future:
            last_misc_future.result()
        last_misc_future = executor.submit(run_docker_cmd, [di, 'bash', 'build_misc.sh'], f'{da} misc')



def decompress_file(file_path):
"""Decompress a gzip file and delete the original compressed file."""
with gzip.open(file_path, 'rb') as f_in:
decompressed_file_path = file_path[:-3] # Remove '.gz' from the filename
with open(decompressed_file_path, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
os.remove(file_path)

def compress_file(file_path):
"""Compress a file using gzip and delete the original uncompressed file."""
compressed_file_path = file_path + '.gz'
with open(file_path, 'rb') as f_in:
with gzip.open(compressed_file_path, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
os.remove(file_path)

def run_docker_validate_cmd(cmd_arr, all_files_dir, name):
'''
Wrapper for 'docker run' command used during validation and uploads.
'''
env = os.environ.copy()
docker_run = ['docker', 'run', '-v', f"{env['PWD']}/local/{all_files_dir}:/tmp"]
docker_run.extend(['upload'])
docker_run.extend(cmd_arr)
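    # For illustration, the resulting command looks like:
    #   docker run -v $PWD/local/all_files_dir:/tmp upload python3 scripts/check_schema.py --datasets mpnst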
print('Executing:', ' '.join(docker_run))
res = subprocess.run(docker_run, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if res.returncode != 0:
print(res.stderr.decode())
        sys.exit(f'{name} failed')
else:
print(f'{name} completed successfully')

def run_schema_checker(dataset):
'''
Run schema checker on the built files for the specified dataset.
'''
# Prepare the directory with the built files
prefixes = ['genes', dataset]
datasets = [dataset]
    broad_sanger_datasets = ["ccle", "ctrpv2", "fimm", "gdscv1", "gdscv2", "gcsi", "prism", "nci60"]
all_files_dir = 'all_files_dir'
if "broad_sanger" == dataset:
prefixes.extend(broad_sanger_datasets)
datasets.extend(broad_sanger_datasets)
datasets.remove("broad_sanger")
prefixes.remove("broad_sanger")

if not os.path.exists(f'local/{all_files_dir}'):
os.makedirs(f'local/{all_files_dir}')

# Move relevant files to all_files_dir
for file in os.listdir('local'):
if any(file.startswith(prefix) for prefix in prefixes):
shutil.move(os.path.join('local', file), os.path.join('local', all_files_dir, file))

# Decompress any compressed files
for file in os.listdir(f'local/{all_files_dir}'):
if file.endswith('.gz'):
decompress_file(os.path.join('local', all_files_dir, file))

# Run schema checker
schema_check_command = ['python3', 'scripts/check_schema.py', '--datasets'] + datasets
run_docker_validate_cmd(schema_check_command, all_files_dir, 'Validation')

def main():
parser = argparse.ArgumentParser(
description="This script builds a single dataset."
)
parser.add_argument('--dataset', required=True, help='Name of the dataset to build')
parser.add_argument('--use_prev_dataset', help='Prefix of the previous dataset for sample and drug ID assignment')
parser.add_argument('--build', action='store_true', help='Run data build.')
parser.add_argument('--validate', action='store_true', help='Run schema checker on the built files')
parser.add_argument('--continue', dest='should_continue', action='store_true', help='Continue from where the build left off by skipping existing files')

args = parser.parse_args()

if not os.path.exists('local'):
os.mkdir('local')

# Build Docker Image
    process_docker(args.dataset, args.validate)

if args.build:
# Use ThreadPoolExecutor for parallel execution
with ThreadPoolExecutor() as executor:
# Always build genes file
process_genes(executor)

# Build samples and drugs
samples_future = executor.submit(process_samples, executor, args.dataset, args.use_prev_dataset, args.should_continue)
drugs_future = executor.submit(process_drugs, executor, args.dataset, args.use_prev_dataset, args.should_continue)

samples_future.result()
drugs_future.result()
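            # Note: .result() only waits for the wrapper functions above; the docker jobs
            # they submit are awaited when this with-block exits (shutdown waits on all futures).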

print("Samples and Drugs Files Completed.")

with ThreadPoolExecutor() as executor:

# Build omics and experiments
omics_future = executor.submit(process_omics, executor, args.dataset, args.should_continue)
experiments_future = executor.submit(process_experiments, executor, args.dataset, args.should_continue)

omics_future.result()
experiments_future.result()

print("Experiments and Omics Files completed.")

    if args.build:
        with ThreadPoolExecutor() as executor:
            misc_thread = executor.submit(process_misc, executor, args.dataset)
            misc_thread.result()
        print("Final build step complete.")

if args.validate:
run_schema_checker(args.dataset)
print("Validation completed.")

if __name__ == '__main__':
main()
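
A minimal sketch of how the new script is invoked (assuming Docker, docker-compose, and Python 3 on the host; the token only matters for the Synapse-backed MPNST and beatAML data):

    export SYNAPSE_AUTH_TOKEN=<your token>
    python3 build/build_dataset.py --dataset mpnstpdx --build --validate
    # Resume a partial build, skipping any files that already exist:
    python3 build/build_dataset.py --dataset mpnstpdx --build --continue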
44 changes: 36 additions & 8 deletions build/docker/Dockerfile.mpnst
@@ -1,16 +1,44 @@
FROM r-base:4.3.2

# Set environment to noninteractive
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update --allow-insecure-repositories
#RUN apt-get install -y --allow-unauthenticated build-essential --fix-missing libpq-dev python3-pip python3-setuptools python3-dev python3-venv libcurl4-openssl-dev libxml2-dev
RUN apt-get install -y --allow-unauthenticated build-essential --fix-missing python3-pip python3-setuptools python3-dev python3-venv libcurl4-openssl-dev libxml2-dev

# Update package list and install required packages
RUN apt-get update && \
apt-get install -y build-essential wget curl libcurl4-openssl-dev libxml2-dev \
zlib1g-dev libssl-dev libbz2-dev libreadline-dev libsqlite3-dev libffi-dev

# Download and compile Python 3.10 with shared library support
RUN wget https://www.python.org/ftp/python/3.10.12/Python-3.10.12.tgz && \
tar -xf Python-3.10.12.tgz && \
cd Python-3.10.12 && \
./configure --enable-optimizations --enable-shared && \
make -j$(nproc) && \
make altinstall && \
cd .. && \
rm -rf Python-3.10.12.tgz Python-3.10.12

# Set Python 3.10 as default
RUN ln -s /usr/local/bin/python3.10 /usr/bin/python3 && \
ln -s /usr/local/bin/pip3.10 /usr/bin/pip3

# Update library paths for Python shared library
RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/python3.10.conf && ldconfig
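
# Note: reticulate (configured via RETICULATE_PYTHON below) embeds Python through
# libpython, which is why the interpreter is built with --enable-shared and the
# library path is registered with ldconfig above.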

# Create a Python virtual environment
RUN python3 -m venv /opt/venv
RUN /opt/venv/bin/pip3 install --upgrade pip

RUN /opt/venv/bin/pip install --upgrade pip

ENV PYTHONPATH "${PYTHONPATH}:/app"
# Set environment variables for reticulate
ENV RETICULATE_PYTHON="/opt/venv/bin/python3"
# was: ENV PYTHONPATH "${PYTHONPATH}:/app"
ENV PYTHONPATH=/app
WORKDIR /app

# Set MPLCONFIGDIR to a writable directory
ENV MPLCONFIGDIR=/app/tmp/matplotlib
RUN mkdir -p /app/tmp/matplotlib

# Add necessary files to the container
ADD build/mpnst/requirements.txt .
ADD build/mpnst/requirements.r .
ADD build/mpnst/* ./
@@ -19,8 +47,8 @@ ADD build/utils/* ./
# installing python libraries
RUN /opt/venv/bin/pip3 install -r requirements.txt

# installing r libraries
# Install all R libraries from requirements.r
RUN Rscript requirements.r


# Set up volume for temporary storage
VOLUME ["/tmp"]
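
For reference, build_dataset.py builds this image through docker-compose; presumably the mpnst service in build/docker/docker-compose.yml points at this Dockerfile:

    docker-compose -f build/docker/docker-compose.yml build mpnst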