MPNST sample updates AND PDX code addition #251

Merged
merged 6 commits on Dec 6, 2024
321 changes: 321 additions & 0 deletions build/build_dataset.py
@@ -0,0 +1,321 @@
"""
Script that builds a single dataset.
"""

import os
import sys
import argparse
import subprocess
import shutil
import gzip
import glob
from concurrent.futures import ThreadPoolExecutor

def run_docker_cmd(cmd_arr, filename):
'''
Wrapper for 'docker run' command. Executes a Docker container with the specified command.
'''
print('Running...', filename)
env = os.environ.copy()
    if 'SYNAPSE_AUTH_TOKEN' not in env:
        print('Warning: SYNAPSE_AUTH_TOKEN is not set; it is required to access the MPNST and beatAML datasets')
docker_run = ['docker', 'run', '-v', f"{env['PWD']}/local/:/tmp/", '--platform=linux/amd64']
else:
docker_run = ['docker', 'run', '-v', f"{env['PWD']}/local/:/tmp/", '-e', f"SYNAPSE_AUTH_TOKEN={env['SYNAPSE_AUTH_TOKEN']}", '--platform=linux/amd64']
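    # For illustration, the assembled command ends up looking like:
    #   docker run -v $PWD/local/:/tmp/ -e SYNAPSE_AUTH_TOKEN=... --platform=linux/amd64 mpnst bash build_samples.sh
    # where cmd_arr supplies the image name and the script to run inside it.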

cmd = docker_run + cmd_arr
print('Executing command:', ' '.join(cmd))
res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if res.returncode != 0:
print(res.stderr.decode())
        sys.exit(f'{filename} failed')
else:
print(f'{filename} completed successfully')

def process_docker(dataset, validate):
'''
Build Docker images required for the specified dataset.
'''
compose_file = 'build/docker/docker-compose.yml'
dataset_map = {
'broad_sanger': ['broad_sanger_exp', 'broad_sanger_omics'],
'hcmi': ['hcmi'],
'beataml': ['beataml'],
'mpnst': ['mpnst'],
'mpnstpdx': ['mpnstpdx'],
'cptac': ['cptac'],
'genes': ['genes'],
'upload': ['upload']
}

# Collect container names to build based on the dataset provided. Always build 'genes'.
datasets_to_build = ['genes']
# Append upload if validation step is included
    if validate:
datasets_to_build.append('upload')

datasets_to_build.extend(dataset_map.get(dataset, []))

compose_command = ['docker-compose', '-f', compose_file, 'build'] + datasets_to_build
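    # e.g. docker-compose -f build/docker/docker-compose.yml build genes upload mpnstpdx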

log_file_path = 'local/docker.log'
env = os.environ.copy()

print(f"Docker-compose is building images for {', '.join(datasets_to_build)}. View output in {log_file_path}.")

with open(log_file_path, 'w') as log_file:
try:
subprocess.run(compose_command, env=env, stdout=log_file, stderr=log_file, text=True, check=True)
log_file.write("Docker images built successfully.\n")
print(f"Docker images for {', '.join(datasets_to_build)} built successfully. Details logged in {log_file_path}.")
except subprocess.CalledProcessError as e:
log_file.write(f"Docker compose build failed with error: {e}\n")
print(f"Docker compose build failed. See {log_file_path} for details.")
raise

def process_genes(executor):
'''
Build the genes file if it does not exist.
'''
if not os.path.exists('local/genes.csv'):
executor.submit(run_docker_cmd, ['genes', 'bash', 'build_genes.sh'], 'genes file')

def process_samples(executor, dataset, use_prev_dataset, should_continue):
'''
Build the samples file for the specified dataset.
'''
samples_file = f'local/{dataset}_samples.csv'
if should_continue and os.path.exists(samples_file):
print(f"Samples file for {dataset} already exists. Skipping samples build.")
return

prev_samples_file = f'/tmp/{use_prev_dataset}_samples.csv' if use_prev_dataset else ''
di = 'broad_sanger_omics' if dataset == 'broad_sanger' else dataset
filename = f'{dataset} samples'
executor.submit(run_docker_cmd, [di, 'bash', 'build_samples.sh', prev_samples_file], filename)

def process_drugs(executor, dataset, use_prev_dataset, should_continue):
'''
Build the drugs file for the specified dataset.
'''
if dataset in ['cptac', 'hcmi']:
return # No drugs to process for these datasets

drugs_file = f'local/{dataset}_drugs.tsv'
if should_continue and os.path.exists(drugs_file):
print(f"Drugs file for {dataset} already exists. Skipping drugs build.")
return

    dflist = [f'/tmp/{use_prev_dataset}_drugs.tsv'] if use_prev_dataset else []
di = 'broad_sanger_exp' if dataset == 'broad_sanger' else dataset
filename = f'{dataset} drugs'
executor.submit(run_docker_cmd, [di, 'bash', 'build_drugs.sh', ','.join(dflist)], filename)


def process_omics(executor, dataset, should_continue):
'''
Build the omics files for the specified dataset.
'''
# Map datasets to their expected omics files
dataset_omics_files = {
'beataml': ['mutations', 'proteomics', 'transcriptomics'],
'mpnst': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
'broad_sanger': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
'cptac': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
'hcmi': ['mutations', 'transcriptomics'],
        'mpnstpdx': ['copy_number', 'mutations', 'proteomics', 'transcriptomics']
}

expected_omics = dataset_omics_files.get(dataset, [])

if not expected_omics:
print(f"No omics data expected for dataset {dataset}. Skipping omics build.")
return

# Check if all expected omics files exist
omics_files_exist = True
for omics_type in expected_omics:
patterns = [
f'local/{dataset}_{omics_type}.csv',
f'local/{dataset}_{omics_type}.csv.gz',
f'local/{dataset}_{omics_type}.tsv',
f'local/{dataset}_{omics_type}.tsv.gz'
]
file_found = False
for pattern in patterns:
matches = glob.glob(pattern)
if matches:
file_found = True
break
if not file_found:
omics_files_exist = False
break # If any omics files are missing, just build / rebuild them all.

if should_continue and omics_files_exist:
print(f"Omics files for {dataset} already exist. Skipping omics build.")
return

di = 'broad_sanger_omics' if dataset == 'broad_sanger' else dataset
filename = f'{dataset} omics'
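    # Host-side local/ is mounted at /tmp/ in the container (see run_docker_cmd), so
    # /tmp/genes.csv and /tmp/{dataset}_samples.csv resolve to the files built earlier.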
executor.submit(run_docker_cmd, [di, 'bash', 'build_omics.sh', '/tmp/genes.csv', f'/tmp/{dataset}_samples.csv'], filename)


def process_experiments(executor, dataset, should_continue):
'''
Build the experiments files for the specified dataset.
'''
if dataset in ['cptac', 'hcmi']:
return # No experiments to process for these datasets

experiments_file = f'local/{dataset}_experiments.tsv'
if should_continue and os.path.exists(experiments_file):
print(f"Experiments file for {dataset} already exists. Skipping experiments build.")
return

di = 'broad_sanger_exp' if dataset == 'broad_sanger' else dataset
filename = f'{dataset} experiments'
executor.submit(run_docker_cmd, [di, 'bash', 'build_exp.sh', f'/tmp/{dataset}_samples.csv', f'/tmp/{dataset}_drugs.tsv'], filename)



def process_misc(executor, datasets):
    '''
    Run any final misc build scripts, one dataset at a time.
    '''
    last_misc_future = None
    # Currently this only applies to broad_sanger. Add others here if they need a final step.
    if "broad_sanger" in datasets:
        datasets = ["broad_sanger"]
    else:
        return
    for da in datasets:
        di = 'broad_sanger_omics' if da == 'broad_sanger' else da
        # Wait for the previous misc job to finish before starting the next one:
        if last_misc_future:
            last_misc_future.result()
        last_misc_future = executor.submit(run_docker_cmd, [di, 'bash', 'build_misc.sh'], f'{da} misc')



def decompress_file(file_path):
"""Decompress a gzip file and delete the original compressed file."""
with gzip.open(file_path, 'rb') as f_in:
decompressed_file_path = file_path[:-3] # Remove '.gz' from the filename
with open(decompressed_file_path, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
os.remove(file_path)

def compress_file(file_path):
"""Compress a file using gzip and delete the original uncompressed file."""
compressed_file_path = file_path + '.gz'
with open(file_path, 'rb') as f_in:
with gzip.open(compressed_file_path, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
os.remove(file_path)

def run_docker_validate_cmd(cmd_arr, all_files_dir, name):
'''
Wrapper for 'docker run' command used during validation and uploads.
'''
env = os.environ.copy()
docker_run = ['docker', 'run', '-v', f"{env['PWD']}/local/{all_files_dir}:/tmp"]
docker_run.extend(['upload'])
docker_run.extend(cmd_arr)
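    # For illustration, the resulting command looks like:
    #   docker run -v $PWD/local/all_files_dir:/tmp upload python3 scripts/check_schema.py --datasets mpnst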
print('Executing:', ' '.join(docker_run))
res = subprocess.run(docker_run, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if res.returncode != 0:
print(res.stderr.decode())
        sys.exit(f'{name} failed')
else:
print(f'{name} completed successfully')

def run_schema_checker(dataset):
'''
Run schema checker on the built files for the specified dataset.
'''
# Prepare the directory with the built files
prefixes = ['genes', dataset]
datasets = [dataset]
    broad_sanger_datasets = ["ccle", "ctrpv2", "fimm", "gdscv1", "gdscv2", "gcsi", "prism", "nci60"]
all_files_dir = 'all_files_dir'
if "broad_sanger" == dataset:
prefixes.extend(broad_sanger_datasets)
datasets.extend(broad_sanger_datasets)
datasets.remove("broad_sanger")
prefixes.remove("broad_sanger")

if not os.path.exists(f'local/{all_files_dir}'):
os.makedirs(f'local/{all_files_dir}')

# Move relevant files to all_files_dir
for file in os.listdir('local'):
if any(file.startswith(prefix) for prefix in prefixes):
shutil.move(os.path.join('local', file), os.path.join('local', all_files_dir, file))

# Decompress any compressed files
for file in os.listdir(f'local/{all_files_dir}'):
if file.endswith('.gz'):
decompress_file(os.path.join('local', all_files_dir, file))

# Run schema checker
schema_check_command = ['python3', 'scripts/check_schema.py', '--datasets'] + datasets
run_docker_validate_cmd(schema_check_command, all_files_dir, 'Validation')

def main():
parser = argparse.ArgumentParser(
description="This script builds a single dataset."
)
parser.add_argument('--dataset', required=True, help='Name of the dataset to build')
parser.add_argument('--use_prev_dataset', help='Prefix of the previous dataset for sample and drug ID assignment')
parser.add_argument('--build', action='store_true', help='Run data build.')
parser.add_argument('--validate', action='store_true', help='Run schema checker on the built files')
parser.add_argument('--continue', dest='should_continue', action='store_true', help='Continue from where the build left off by skipping existing files')

args = parser.parse_args()

if not os.path.exists('local'):
os.mkdir('local')

# Build Docker Image
    process_docker(args.dataset, args.validate)

if args.build:
# Use ThreadPoolExecutor for parallel execution
with ThreadPoolExecutor() as executor:
# Always build genes file
process_genes(executor)

# Build samples and drugs
samples_future = executor.submit(process_samples, executor, args.dataset, args.use_prev_dataset, args.should_continue)
drugs_future = executor.submit(process_drugs, executor, args.dataset, args.use_prev_dataset, args.should_continue)

samples_future.result()
drugs_future.result()
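            # Note: .result() only waits for the wrapper functions above; the docker jobs
            # they submit are awaited when this with-block exits (shutdown waits on all futures).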

print("Samples and Drugs Files Completed.")

with ThreadPoolExecutor() as executor:

# Build omics and experiments
omics_future = executor.submit(process_omics, executor, args.dataset, args.should_continue)
experiments_future = executor.submit(process_experiments, executor, args.dataset, args.should_continue)

omics_future.result()
experiments_future.result()

print("Experiments and Omics Files completed.")

    if args.build:
        with ThreadPoolExecutor() as executor:
            misc_thread = executor.submit(process_misc, executor, args.dataset)
            misc_thread.result()
        print("Final build step complete.")

if args.validate:
run_schema_checker(args.dataset)
print("Validation completed.")

if __name__ == '__main__':
main()
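
A minimal sketch of how the new script is invoked (assuming Docker, docker-compose, and Python 3 on the host; the token only matters for the Synapse-backed MPNST and beatAML data):

    export SYNAPSE_AUTH_TOKEN=<your token>
    python3 build/build_dataset.py --dataset mpnstpdx --build --validate
    # Resume a partial build, skipping any files that already exist:
    python3 build/build_dataset.py --dataset mpnstpdx --build --continue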
44 changes: 36 additions & 8 deletions build/docker/Dockerfile.mpnst
@@ -1,16 +1,44 @@
FROM r-base:4.3.2

# Set environment to noninteractive
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update --allow-insecure-repositories
#RUN apt-get install -y --allow-unauthenticated build-essential --fix-missing libpq-dev python3-pip python3-setuptools python3-dev python3-venv libcurl4-openssl-dev libxml2-dev
RUN apt-get install -y --allow-unauthenticated build-essential --fix-missing python3-pip python3-setuptools python3-dev python3-venv libcurl4-openssl-dev libxml2-dev

# Update package list and install required packages
RUN apt-get update && \
apt-get install -y build-essential wget curl libcurl4-openssl-dev libxml2-dev \
zlib1g-dev libssl-dev libbz2-dev libreadline-dev libsqlite3-dev libffi-dev

# Download and compile Python 3.10 with shared library support
RUN wget https://www.python.org/ftp/python/3.10.12/Python-3.10.12.tgz && \
tar -xf Python-3.10.12.tgz && \
cd Python-3.10.12 && \
./configure --enable-optimizations --enable-shared && \
make -j$(nproc) && \
make altinstall && \
cd .. && \
rm -rf Python-3.10.12.tgz Python-3.10.12

# Set Python 3.10 as default
RUN ln -s /usr/local/bin/python3.10 /usr/bin/python3 && \
ln -s /usr/local/bin/pip3.10 /usr/bin/pip3

# Update library paths for Python shared library
RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/python3.10.conf && ldconfig
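
# Note: reticulate (configured via RETICULATE_PYTHON below) embeds Python through
# libpython, which is why the interpreter is built with --enable-shared and the
# library path is registered with ldconfig above.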

# Create a Python virtual environment
RUN python3 -m venv /opt/venv
RUN /opt/venv/bin/pip3 install --upgrade pip

RUN /opt/venv/bin/pip install --upgrade pip

ENV PYTHONPATH "${PYTHONPATH}:/app"
# Set environment variables for reticulate
ENV RETICULATE_PYTHON="/opt/venv/bin/python3"
# was: ENV PYTHONPATH "${PYTHONPATH}:/app"
ENV PYTHONPATH=/app
WORKDIR /app

# Set MPLCONFIGDIR to a writable directory
ENV MPLCONFIGDIR=/app/tmp/matplotlib
RUN mkdir -p /app/tmp/matplotlib

# Add necessary files to the container
ADD build/mpnst/requirements.txt .
ADD build/mpnst/requirements.r .
ADD build/mpnst/* ./
@@ -19,8 +47,8 @@ ADD build/utils/* ./
# installing python libraries
RUN /opt/venv/bin/pip3 install -r requirements.txt

# installing r libraries
# Install all R libraries from requirements.r
RUN Rscript requirements.r


# Set up volume for temporary storage
VOLUME ["/tmp"]
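
For reference, build_dataset.py builds this image through docker-compose; presumably the mpnst service in build/docker/docker-compose.yml points at this Dockerfile:

    docker-compose -f build/docker/docker-compose.yml build mpnst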