-
Notifications
You must be signed in to change notification settings - Fork 195
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[JENKINS-55287] Improve error messages for nonresumable Pipelines #363
Changes from 8 commits
ebb5d76
b360ae0
f631a44
686a6e4
eb08b5b
2214c42
b3a6847
2ad7e70
d0c638c
22e7086
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -712,17 +712,26 @@ public void onLoad(FlowExecutionOwner owner) throws IOException { | |
this.owner = owner; | ||
|
||
try { | ||
initializeStorage(); // Throws exception and bombs out if we can't load FlowNodes | ||
} catch (Exception ex) { | ||
try { | ||
initializeStorage(); // Throws exception and bombs out if we can't load FlowNodes | ||
} catch (Exception ex) { | ||
LOGGER.log(Level.WARNING, "Error initializing storage and loading nodes, will try to create placeholders for: "+this, ex); | ||
if (!canResume()) { | ||
// This case is expected for PERFORMANCE_OPTIMIZED Pipelines after a Jenkins crash (JENKINS-55287). The specific issue is | ||
// usually that only the FlowStartNode has been persisted and so we cannot find the current head node in storage. | ||
String message = "Unable to resume " + owner.getExecutable() + " because it was not saved before Jenkins shut down. Did Jenkins crash?"; | ||
LOGGER.log(Level.WARNING, message, ex); | ||
ex = new AbortException(message); // Change the exception displayed in the build log to one that is more useful for users. | ||
} else { | ||
LOGGER.log(Level.WARNING, "Unable to resume " + owner.getExecutable() + " because its flow nodes could not be loaded", ex); | ||
} | ||
createPlaceholderNodes(ex); | ||
return; | ||
} catch (Exception ex2) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is all mostly refactoring. The old code had this structure:
Now it has this structure:
|
||
ex2.addSuppressed(ex); | ||
done = true; | ||
programPromise = Futures.immediateFailedFuture(ex2); | ||
throw new IOException("Failed to even create placeholder nodes for execution", ex2); | ||
} | ||
} catch (Exception ex) { | ||
done = true; | ||
programPromise = Futures.immediateFailedFuture(ex); | ||
throw new IOException("Failed to even create placeholder nodes for execution", ex); | ||
return; | ||
} | ||
|
||
try { | ||
|
@@ -737,12 +746,16 @@ public void onLoad(FlowExecutionOwner owner) throws IOException { | |
} | ||
} else { // See if we can/should resume build | ||
if (canResume()) { | ||
// Before we resume, we need to unset persistedClean in case Jenkins restarts again. | ||
persistedClean = null; | ||
saveOwner(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should add a test case for this. The test would use a PERFORMANCE_OPTIMIZED Pipeline, restart Jenkins normally, then restart Jenkins again with a hard shutdown and make sure we get the error message that the Pipeline can't be resumed rather than a raw exception. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I came up with a test case, but it doesn't fail without this line because of a random save that happens when |
||
loadProgramAsync(getProgramDataFile()); | ||
} else { | ||
// TODO if possible, consider trying to close out unterminated blocks to keep existing graph history | ||
// That way we can visualize the graph in some error cases. | ||
LOGGER.log(Level.WARNING, "Pipeline state not properly persisted, cannot resume "+owner.getUrl()); | ||
throw new IOException("Cannot resume build -- was not cleanly saved when Jenkins shut down."); | ||
String message = "Unable to resume " + owner.getExecutable() + " because it was not saved before Jenkins shut down. Did Jenkins crash?"; | ||
LOGGER.log(Level.WARNING, message); | ||
throw new IOException(message); | ||
} | ||
} | ||
} catch (Exception e) { // Broad catch ensures that failure to load do NOT nuke the master | ||
|
@@ -1526,10 +1539,6 @@ public boolean isPaused() { | |
return false; | ||
} | ||
|
||
private void setPersistedClean(boolean persistedClean) { // Workaround for some issues with anonymous classes. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unused |
||
this.persistedClean = persistedClean; | ||
} | ||
|
||
/** | ||
* Pause or unpause the execution. | ||
* | ||
|
@@ -1548,8 +1557,12 @@ public void pause(final boolean v) throws IOException { | |
if (v) { | ||
g.pause(); | ||
checkAndAbortNonresumableBuild(); // TODO Verify if we can rely on just killing paused builds at shutdown via checkAndAbortNonresumableBuild() | ||
checkpoint(); | ||
checkpoint(false); | ||
} else { | ||
// Pausing the build sets persistedClean to true so the build can resume, so if we unpause the build | ||
// we need to unset persistedClean again. | ||
persistedClean = null; | ||
saveOwner(); | ||
Comment on lines
+1566
to
+1569
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should add a test case for this. The Pipeline would need to be paused, then unpaused, and then we do a hard shutdown and make sure we get the error message that the Pipeline can't be resumed rather than a raw exception. |
||
g.unpause(); | ||
} | ||
try { | ||
|
@@ -1591,7 +1604,7 @@ public void pause(final boolean v) throws IOException { | |
LOGGER.log(Level.WARNING, "Error waiting for Pipeline to suspend: "+exec, ex); | ||
} | ||
} | ||
cpsExec.checkpoint(); | ||
cpsExec.checkpoint(true); | ||
} | ||
} catch (Exception ex) { | ||
LOGGER.log(Level.WARNING, "Error persisting Pipeline execution at shutdown: "+((CpsFlowExecution) execution).owner, ex); | ||
|
@@ -1954,7 +1967,6 @@ void saveOwner() { | |
try { | ||
if (this.owner != null && this.owner.getExecutable() instanceof Saveable) { // Null-check covers some anomalous cases we've seen | ||
Saveable saveable = (Saveable)(this.owner.getExecutable()); | ||
persistedClean = true; | ||
if (storage != null && storage.delegate != null) { | ||
// Defensively flush FlowNodes to storage | ||
try { | ||
|
@@ -1972,72 +1984,95 @@ void saveOwner() { | |
} | ||
} | ||
|
||
/** Save everything we can to disk - program, run, flownodes. */ | ||
private void checkpoint() { | ||
/** | ||
* Save everything we can to disk - program, run, flownodes. | ||
* @param shuttingDown True if this checkpoint is happening because Jenkins is shutting down, false if it is happening because execution was paused. | ||
*/ | ||
private void checkpoint(boolean shuttingDown) { | ||
if (isComplete() || this.getDurabilityHint().isPersistWithEveryStep()) { | ||
// Nothing to persist OR we've already persisted it along the way. | ||
return; | ||
} | ||
LOGGER.log(Level.INFO, "Attempting to checkpoint all data for {0}{1}", new Object[] { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The logging-related changes were originally part of #354 (there are some slight modifications here), so it might be easier to review that PR first. |
||
this, (shuttingDown ? " before shutdown" : "") | ||
}); | ||
boolean persistOk = true; | ||
FlowNodeStorage storage = getStorage(); | ||
if (storage != null) { | ||
try { // Node storage must be flushed first so program can be restored | ||
storage.flush(); | ||
} catch (IOException ioe) { | ||
persistOk=false; | ||
LOGGER.log(Level.WARNING, "Error persisting FlowNode storage before shutdown", ioe); | ||
LOGGER.log(Level.WARNING, "Error persisting FlowNode storage for: " + this, ioe); | ||
} | ||
|
||
// Try to ensure we've saved the appropriate things -- the program is the last stumbling block. | ||
try { | ||
final SettableFuture<Void> myOutcome = SettableFuture.create(); | ||
LOGGER.log(Level.INFO, "About to try to checkpoint the program for build"+this); | ||
LOGGER.log(Level.FINE, "About to try to checkpoint the program for: {0}", this); | ||
if (programPromise != null && programPromise.isDone()) { | ||
runInCpsVmThread(new FutureCallback<CpsThreadGroup>() { | ||
@Override | ||
public void onSuccess(CpsThreadGroup result) { | ||
try { | ||
LOGGER.log(Level.INFO, "Trying to save program before shutdown "+this); | ||
LOGGER.log(Level.FINE, "Trying to save program for: {0}", CpsFlowExecution.this); | ||
result.saveProgramIfPossible(true); | ||
LOGGER.log(Level.INFO, "Finished saving program before shutdown "+this); | ||
LOGGER.log(Level.FINE, "Finished saving program for: {0}", CpsFlowExecution.this); | ||
myOutcome.set(null); | ||
} catch (Exception ex) { | ||
LOGGER.log(Level.WARNING, "Error persisting program: "+ex); | ||
// Logged at Level.WARNING when we call `myOutcome.get` and it throws an exception. | ||
myOutcome.setException(ex); | ||
} | ||
} | ||
|
||
@Override | ||
public void onFailure(Throwable t) { | ||
LOGGER.log(Level.WARNING, "Failed trying to save program before shutdown "+this); | ||
// Logged at Level.WARNING when we call `myOutcome.get` and it throws an exception. | ||
myOutcome.setException(t); | ||
} | ||
}); | ||
myOutcome.get(30, TimeUnit.SECONDS); | ||
LOGGER.log(Level.FINE, "Successfully saved program for "+this); | ||
LOGGER.log(Level.FINE, "Successfully saved program for: {0}", this); | ||
} else { | ||
persistOk = false; | ||
LOGGER.log(Level.WARNING, "Unable to persist program because it was never loaded for: {0}", this); | ||
} | ||
|
||
} catch (TimeoutException te) { | ||
persistOk = false; | ||
LOGGER.log(Level.WARNING, "Timeout persisting program at execution checkpoint", te); | ||
LOGGER.log(Level.WARNING, "Timeout persisting program for: " + this, te); | ||
} catch (ExecutionException | InterruptedException ex) { | ||
persistOk = false; | ||
LOGGER.log(Level.FINE, "Error saving program, that should be handled elsewhere.", ex); | ||
LOGGER.log(Level.WARNING, "Error saving program for: " + this, ex); | ||
} | ||
try { // Flush node storage just in case the Program mutated it, just to be sure | ||
storage.flush(); | ||
LOGGER.log(Level.FINE, "Successfully did final flush of storage for "+this); | ||
LOGGER.log(Level.FINE, "Successfully did final flush of storage for: {0}", this); | ||
} catch (IOException ioe) { | ||
persistOk=false; | ||
LOGGER.log(Level.WARNING, "Error persisting FlowNode storage before shutdown", ioe); | ||
LOGGER.log(Level.WARNING, "Error persisting FlowNode storage for: " + this, ioe); | ||
} | ||
persistedClean = persistOk; | ||
try { | ||
saveOwner(); | ||
} catch (Exception ex) { | ||
LOGGER.log(Level.WARNING, "Error saving build for "+this, ex); | ||
persistOk = false; | ||
LOGGER.log(Level.WARNING, "Error saving build for: " + this, ex); | ||
} | ||
|
||
} else { | ||
persistOk = false; | ||
LOGGER.log(Level.WARNING, "No FlowNode storage for: {0}", this); | ||
} | ||
|
||
if (persistOk) { | ||
LOGGER.log(Level.INFO, "Successfully checkpointed {0}{1}", new Object[] { | ||
this, (shuttingDown ? " before shutdown" : "") | ||
}); | ||
} else { | ||
LOGGER.log(Level.WARNING, "Unable to successfully checkpoint {0}{1}", new Object[] { | ||
this, (shuttingDown ? " before shutdown, so this build will probably fail when Jenkins restarts" : "") | ||
}); | ||
} | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,7 +16,6 @@ | |
import org.jenkinsci.plugins.workflow.support.steps.input.InputStepExecution; | ||
import org.junit.Assert; | ||
import org.junit.ClassRule; | ||
import org.junit.Ignore; | ||
import org.junit.Rule; | ||
import org.junit.Test; | ||
import org.jvnet.hudson.test.BuildWatcher; | ||
|
@@ -244,7 +243,6 @@ public void inProgressNormal() throws Exception { | |
} | ||
|
||
@Test | ||
@Ignore | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These tests have been ignored since they were added. I am not sure why, but I went ahead and unignored them. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You probably already knew this, but I just noticed that these test cases seem to be duplicated in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I think I will probably delete the copies in workflow-job after this. Ideally we would run the PCT in the CI builds here with just the core Pipeline plugins so that changes are tested against the newest versions of everything. |
||
public void inProgressMaxPerfCleanShutdown() throws Exception { | ||
final int[] build = new int[1]; | ||
story.then( j -> { | ||
|
@@ -263,14 +261,14 @@ public void inProgressMaxPerfCleanShutdown() throws Exception { | |
}); | ||
} | ||
|
||
@Issue("JENKINS-55287") | ||
@Test | ||
@Ignore | ||
public void inProgressMaxPerfDirtyShutdown() throws Exception { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the exact situation reported in JENKINS-55287 as far as I can tell. |
||
final int[] build = new int[1]; | ||
final String[] finalNodeId = new String[1]; | ||
story.thenWithHardShutdown( j -> { | ||
runBasicPauseOnInput(j, DEFAULT_JOBNAME, build, FlowDurabilityHint.PERFORMANCE_OPTIMIZED); | ||
// SHOULD still save at end via persist-at-shutdown hooks | ||
// Will not be saved since the persist-at-shutdown hooks will occur after the snapshot for thenWithHardShutdown is taken. | ||
}); | ||
story.then( j->{ | ||
WorkflowJob r = j.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class); | ||
|
@@ -279,6 +277,7 @@ public void inProgressMaxPerfDirtyShutdown() throws Exception { | |
j.waitForCompletion(run); | ||
assertCompletedCleanly(run); | ||
Assert.assertEquals(Result.FAILURE, run.getResult()); | ||
j.assertLogContains("was not saved before Jenkins shut down", run); | ||
finalNodeId[0] = run.getExecution().getCurrentHeads().get(0).getId(); | ||
}); | ||
story.then(j-> { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Alternatively, we could check this even before we attempt to initialize storage and just bail out right away. I don't know if that is too pessimistic, maybe there is a case where the Pipeline is not resumable but flow nodes are all up to date and so the optimistic behavior here is useful because you will still be able to see the existing flow graph in those cases?