diff --git a/.gitignore b/.gitignore index abe11b1..fb27253 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ data/ - +executions/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/compute1/launch_notebook.sh b/compute1/launch_notebook.sh new file mode 100644 index 0000000..d9db527 --- /dev/null +++ b/compute1/launch_notebook.sh @@ -0,0 +1,4 @@ +export LSF_DOCKER_VOLUMES="/storage1/fs1/dinglab:/storage1/fs1/dinglab /scratch1/fs1/dinglab:/scratch1/fs1/dinglab /home/estorrs:/home/estorrs" +export PATH="/miniconda/envs/ancestry/bin:$PATH" + +LSF_DOCKER_PORTS='8282:8888' bsub -R 'select[mem>10GB,port8282=1] rusage[mem=10GB] span[hosts=1]' -M 11GB -q general-interactive -G compute-dinglab -Is -a 'docker(estorrs/ancestry-pipeline:0.0.1)' 'jupyter notebook --port 8888 --no-browser --ip=0.0.0.0' diff --git a/notebooks/make_test_data.ipynb b/notebooks/make_test_data.ipynb new file mode 100644 index 0000000..1078006 --- /dev/null +++ b/notebooks/make_test_data.ipynb @@ -0,0 +1,596 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "a63150d1", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import re\n", + "\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "accd9693", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | # sample_name | \n", + "case | \n", + "disease | \n", + "experimental_strategy | \n", + "sample_type | \n", + "data_path | \n", + "filesize | \n", + "data_format | \n", + "reference | \n", + "UUID | \n", + "system | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|
29 | \n", + "C1230738.WXS.T.ADNA_eb44394c.hg38 | \n", + "C1230738 | \n", + "GBM | \n", + "WXS | \n", + "tumor | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "34561898412 | \n", + "BAM | \n", + "hg38 | \n", + "f6c72dde-4426-4a98-9716-e4490b425df3 | \n", + "storage1 | \n", + "
31 | \n", + "C1230738.WXS.T.hg38 | \n", + "C1230738 | \n", + "GBM | \n", + "WXS | \n", + "tumor | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "34971128893 | \n", + "BAM | \n", + "hg38 | \n", + "b06c50fe-383f-4cce-a9b0-9f70f118d5e0 | \n", + "storage1 | \n", + "
51 | \n", + "C1245129.WXS.T.ADNA_f4f0a623.hg38 | \n", + "C1245129 | \n", + "GBM | \n", + "WXS | \n", + "tumor | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "37402084587 | \n", + "BAM | \n", + "hg38 | \n", + "9e05a653-da61-49f4-b2a7-b58f4781e760 | \n", + "storage1 | \n", + "
53 | \n", + "C1245129.WXS.T.hg38 | \n", + "C1245129 | \n", + "GBM | \n", + "WXS | \n", + "tumor | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "39273198570 | \n", + "BAM | \n", + "hg38 | \n", + "923cea70-4973-4a54-ad99-151028fe7669 | \n", + "storage1 | \n", + "
81 | \n", + "C204057.WXS.T.ADNA_fb79d37d.hg38 | \n", + "C204057 | \n", + "GBM | \n", + "WXS | \n", + "tumor | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "36138400689 | \n", + "BAM | \n", + "hg38 | \n", + "6a93dee0-802a-47cc-90a5-37f686a9aebf | \n", + "storage1 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
7190 | \n", + "C761370.WXS.T.ADNA_260f1df4.hg38 | \n", + "C761370 | \n", + "GBM | \n", + "WXS | \n", + "tumor | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "44821987147 | \n", + "BAM | \n", + "hg38 | \n", + "fd66e1f9-ce3b-4a76-9e48-23bf78568f41 | \n", + "storage1 | \n", + "
7192 | \n", + "C761370.WXS.T.hg38 | \n", + "C761370 | \n", + "GBM | \n", + "WXS | \n", + "tumor | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "34824648092 | \n", + "BAM | \n", + "hg38 | \n", + "8541768d-a439-4abf-987e-d08039a4ec1d | \n", + "storage1 | \n", + "
7214 | \n", + "C827913.WXS.T.hg38 | \n", + "C827913 | \n", + "GBM | \n", + "WXS | \n", + "tumor | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "31571237792 | \n", + "BAM | \n", + "hg38 | \n", + "295d757c-a37e-49f5-9d26-65dff2dbbf13 | \n", + "storage1 | \n", + "
7238 | \n", + "C846363.WXS.T.ADNA_51dcc4a3.hg38 | \n", + "C846363 | \n", + "GBM | \n", + "WXS | \n", + "tumor | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "47361689745 | \n", + "BAM | \n", + "hg38 | \n", + "83a989e8-836e-42b2-a240-f87f0858d189 | \n", + "storage1 | \n", + "
7240 | \n", + "C846363.WXS.T.hg38 | \n", + "C846363 | \n", + "GBM | \n", + "WXS | \n", + "tumor | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "22124861490 | \n", + "BAM | \n", + "hg38 | \n", + "e3e1eeb8-e749-4520-83e2-3ae1091a7207 | \n", + "storage1 | \n", + "
137 rows × 11 columns
\n", + "\n", + " | sample_id | \n", + "filepath | \n", + "
---|---|---|
29 | \n", + "C1230738.WXS.T.ADNA_eb44394c.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
31 | \n", + "C1230738.WXS.T.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
51 | \n", + "C1245129.WXS.T.ADNA_f4f0a623.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
53 | \n", + "C1245129.WXS.T.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
81 | \n", + "C204057.WXS.T.ADNA_fb79d37d.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
... | \n", + "... | \n", + "... | \n", + "
7190 | \n", + "C761370.WXS.T.ADNA_260f1df4.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
7192 | \n", + "C761370.WXS.T.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
7214 | \n", + "C827913.WXS.T.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
7238 | \n", + "C846363.WXS.T.ADNA_51dcc4a3.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
7240 | \n", + "C846363.WXS.T.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
137 rows × 2 columns
\n", + "\n", + " | sample_id | \n", + "filepath | \n", + "
---|---|---|
29 | \n", + "C1230738.WXS.T.ADNA_eb44394c.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
31 | \n", + "C1230738.WXS.T.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
51 | \n", + "C1245129.WXS.T.ADNA_f4f0a623.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
53 | \n", + "C1245129.WXS.T.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
81 | \n", + "C204057.WXS.T.ADNA_fb79d37d.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
... | \n", + "... | \n", + "... | \n", + "
7190 | \n", + "C761370.WXS.T.ADNA_260f1df4.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
7192 | \n", + "C761370.WXS.T.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
7214 | \n", + "C827913.WXS.T.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
7238 | \n", + "C846363.WXS.T.ADNA_51dcc4a3.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
7240 | \n", + "C846363.WXS.T.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
69 rows × 2 columns
\n", + "\n", + " | sample_id | \n", + "filepath | \n", + "
---|---|---|
0 | \n", + "C1230738.WXS.T.ADNA_eb44394c.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
1 | \n", + "C1230738.WXS.T.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
2 | \n", + "C1245129.WXS.T.ADNA_f4f0a623.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
3 | \n", + "C1245129.WXS.T.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
4 | \n", + "C204057.WXS.T.ADNA_fb79d37d.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
... | \n", + "... | \n", + "... | \n", + "
64 | \n", + "C761370.WXS.T.ADNA_260f1df4.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
65 | \n", + "C761370.WXS.T.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
66 | \n", + "C827913.WXS.T.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
67 | \n", + "C846363.WXS.T.ADNA_51dcc4a3.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
68 | \n", + "C846363.WXS.T.hg38 | \n", + "/storage1/fs1/dinglab/Active/Primary/CPTAC3.sh... | \n", + "
69 rows × 2 columns
\n", + "\n", + " | 0 | \n", + "1 | \n", + "2 | \n", + "
---|---|---|---|
0 | \n", + "chr1 | \n", + "13273 | \n", + "13273 | \n", + "