forked from gcorso/DiffDock
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrelaunchFailedCompounds.py
123 lines (98 loc) · 4.26 KB
/
relaunchFailedCompounds.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 25 13:46:19 2024
@author: Jochem Nelen ([email protected])
"""
# This script is used to rerun molecules that failed to dock during a DiffDockHPC job. It checks the output directory for missing sdf files and prompts the user to relaunch the failed molecules.
# To use this script, simply run it from the command line and provide the main output directory as an argument:
# python relaunchFailedCompounds.py VS_DD_.../
import glob
import os
import re
import shutil
import subprocess
import sys
# The script takes one positional argument: the main output directory of a
# DiffDockHPC run. Abort with a readable message when it is missing or is
# not a directory.
try:
    inputPath = sys.argv[1]
except IndexError:
    sys.exit("You have to put in a DiffDockHPC run as an argument")
if not os.path.isdir(inputPath):
    sys.exit("The input path doesn't seem to be a valid directory")

# Shared bookkeeping used by the rest of the script:
#   pathDict       - molecule name -> ligand path, filled from the run's csvs
#   finishedList   - names of molecules for which an sdf file was found
#   successCounter - how many molecules were found to have finished
pathDict, finishedList, successCounter = {}, [], 0
## Distribute the query ligands across the requested number of jobs
def split(a, n):
    """Distribute the items of ``a`` over ``n`` contiguous chunks.

    Chunk sizes differ by at most one item; the first ``len(a) % n`` chunks
    receive the extra item. When more chunks are requested than there are
    items, a notice is printed and one single-item chunk per item is
    returned instead.

    Args:
        a: Sequence supporting ``len()`` and slicing (e.g. a list of paths).
        n: Number of chunks (jobs) to create; must be at least 1.

    Returns:
        A list of single-item slices when ``n > len(a)``, otherwise a
        generator yielding ``n`` slices of ``a``. Wrap the result in
        ``list()`` when a concrete list is needed.

    Raises:
        ValueError: If ``n`` is smaller than 1. Without this check ``n == 0``
            would surface as an unexplained ZeroDivisionError and a negative
            ``n`` would silently produce no chunks at all.
    """
    if n < 1:
        raise ValueError(f"the number of jobs must be at least 1, got {n}")
    if n > len(a):
        print("more jobs than files, launching 1 job per file")
        return [a[i:i + 1] for i in range(len(a))]
    # k items per chunk, with the first m chunks taking one extra item.
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
## Check the csv files and store the information to a dict
# Every csv in <inputPath>/csvs/ is read as ';'-separated rows with one
# header line. The first field is used as the molecule name and the last
# field as the ligand path.
for path in glob.glob(f"{inputPath}/csvs/*.csv"):
    with open(path) as inputFile:
        next(inputFile, None)  # skip the header row (no-op on an empty file)
        for line in inputFile:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line would otherwise register an
                # empty name/path pair that is later reported as a failure.
                continue
            lineSplit = line.split(';')
            molName = lineSplit[0]
            molPath = lineSplit[-1]
            pathDict[molName] = molPath
finishedPaths = glob.glob(f"{inputPath}/molecules/*.sdf")
## Identify which compounds finished successfully
# The molecule name is the text between the last "VS_DD_" and the first
# "_rank" that follows it in the sdf path.
for finishedPath in finishedPaths:
    finishedName = finishedPath.split("VS_DD_")[-1].split("_rank")[0]
    finishedList.append(finishedName)
## Loop through the finished compounds and remove the values from the pathDict
# Several sdf files can map to the same name, and an sdf may belong to a
# molecule that is not listed in the csvs. Deduplicate and only remove names
# that are actually present, so neither case raises a KeyError.
for name in set(finishedList):
    if name in pathDict:
        del pathDict[name]
        successCounter += 1
print(f"Failed to process {len(pathDict)} files, {successCounter} did process successfully")
## Ask with how many jobs it should be rerun (interactively)
answer = input("With how many jobs would you like to relaunch the failed compounds? ").strip()
# isdecimal() only accepts characters that int() can parse (isdigit() also
# accepts e.g. superscript digits, which make int() raise). A job count of 0
# is rejected as well, because the ligands cannot be distributed over zero
# jobs. Anything that is not a positive whole number aborts the relaunch.
if not answer.isdecimal() or int(answer) < 1:
    sys.exit("Not launching any jobs..")
jobNumber = int(answer)
print(f"launching {jobNumber} jobs..")
# One list of ligand paths per job.
ligandPathsSplit = list(split(list(pathDict.values()), jobNumber))
## If a redo directory from an earlier relaunch exists, ask whether it may be removed
redoDir = f"{inputPath}/redo/"
if os.path.isdir(redoDir):
    print(f"The directory {redoDir} already exists. To continue you must delete this directory or choose another outputname.")
    # Keep asking until the reply is one of the accepted yes/no spellings;
    # the wording of the question changes after the first invalid reply.
    prompt = "Do you want to remove it? (y/n) "
    while True:
        answer = input(prompt).lower()
        if answer in ("y", "n", "yes", "no"):
            break
        print("Invalid input. Please enter y(es) or n(o).")
        prompt = "Do you want to remove or overwrite it? (y/n) "
    # Anything other than an explicit yes stops the script without changes.
    if answer not in ("y", "yes"):
        sys.exit()
    shutil.rmtree(redoDir)
## Create a fresh "redo" directory tree inside the output directory; it receives the csvs, job output files and job scripts of the relaunch
os.mkdir(redoDir)
for subDir in ("csvs", "jobs_out", "jobs"):
    os.mkdir(f"{redoDir}/{subDir}/")
## Get a job file to copy the settings automatically
jobPaths = glob.glob(f"{inputPath}/jobs/job_*.sh")
if not jobPaths:
    sys.exit(f"No job files found in {inputPath}/jobs/, cannot recover the original job settings")
with open(jobPaths[0]) as jobFile:
    jobFileLines = jobFile.readlines()
# The launch command is read from the second line of the job file, so the
# file must have at least two lines.
if len(jobFileLines) < 2:
    sys.exit(f"The job file {jobPaths[0]} does not contain a command on its second line")
jobLine = jobFileLines[1]
## Get the protein path
proteinPaths = glob.glob(f"{inputPath}/*.pdb")
if not proteinPaths:
    sys.exit(f"No .pdb file found in {inputPath}, cannot determine the protein path")
# If several pdb files are present, the first one returned by glob is used.
protein_path = proteinPaths[0]
## Write and launch jobs
for i, jobLigands in enumerate(ligandPathsSplit):
    # Job files are numbered from 1; the same number is used for the csv,
    # the job script and the job output file so they can be matched up.
    jobIndex = i + 1
    csvFilePath = f"{redoDir}/csvs/job_csv_{jobIndex}.csv"
    with open(csvFilePath, 'w') as jobCSV:
        jobCSV.write("complex_name;protein_path;ligand_description\n")
        for jobLigand in jobLigands:
            # Complex name = ligand file name up to its first '.'
            complexName = os.path.basename(jobLigand).split('.')[0]
            jobCSV.write(f"{complexName};{protein_path};{jobLigand}\n")
    ## Modify the command to use the correct csv and job output path
    jobCMD = re.sub(r"/jobs_out/job_.*\.out", f"/redo/jobs_out/redo_job_{jobIndex}_%j.out", jobLine)
    # A function replacement inserts the csv path literally. Passing the path
    # inside a replacement template would misread a path that starts with a
    # digit (it fuses with the "\1" group reference) or contains a backslash.
    jobCMD = re.sub(
        r"(--protein_ligand_csv\s+)[^ ]+(\s+--samples_per_complex)",
        lambda match: match.group(1) + csvFilePath + match.group(2),
        jobCMD,
    )
    ## Write the DiffDockHPC job file
    with open(f"{redoDir}/jobs/redo_job_{jobIndex}.sh", "w") as jobfile:
        jobfile.write("#!/usr/bin/env bash\n")
        jobfile.write(jobCMD)
    ## Run the DiffDockHPC job file
    # The command line comes from the run's own job script and is executed
    # through the shell exactly as it is written to the job file above.
    subprocess.run(jobCMD, shell=True)