Skip to content

Commit

Permalink
Fix snapshot scheduling with expired jobs (apache#8832)
Browse files Browse the repository at this point in the history
Co-authored-by: Henrique Sato <[email protected]>
  • Loading branch information
hsato03 and Henrique Sato authored Aug 23, 2024
1 parent 1e12a80 commit c9f1c57
Show file tree
Hide file tree
Showing 6 changed files with 113 additions and 93 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
import javax.persistence.TemporalType;

import com.cloud.storage.snapshot.SnapshotSchedule;
import org.apache.commons.lang3.builder.ReflectionToStringBuilder;
import org.apache.commons.lang3.builder.ToStringStyle;

@Entity
@Table(name = "snapshot_schedule")
Expand Down Expand Up @@ -132,4 +134,11 @@ public String getUuid() {
public void setUuid(String uuid) {
this.uuid = uuid;
}

@Override
public String toString() {
ReflectionToStringBuilder reflectionToStringBuilder = new ReflectionToStringBuilder(this, ToStringStyle.JSON_STYLE);
reflectionToStringBuilder.setExcludeFieldNames("id");
return reflectionToStringBuilder.toString();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,11 @@
*/
public interface SnapshotScheduleDao extends GenericDao<SnapshotScheduleVO, Long> {

List<SnapshotScheduleVO> getCoincidingSnapshotSchedules(long volumeId, Date date);

List<SnapshotScheduleVO> getSchedulesToExecute(Date currentTimestamp);

SnapshotScheduleVO getCurrentSchedule(Long volumeId, Long policyId, boolean executing);
List<SnapshotScheduleVO> getSchedulesAssignedWithAsyncJob();

SnapshotScheduleVO findOneByVolume(long volumeId);
SnapshotScheduleVO getCurrentSchedule(Long volumeId, Long policyId, boolean executing);

SnapshotScheduleVO findOneByVolumePolicy(long volumeId, long policyId);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
public class SnapshotScheduleDaoImpl extends GenericDaoBase<SnapshotScheduleVO, Long> implements SnapshotScheduleDao {
protected final SearchBuilder<SnapshotScheduleVO> executableSchedulesSearch;
protected final SearchBuilder<SnapshotScheduleVO> coincidingSchedulesSearch;
private final SearchBuilder<SnapshotScheduleVO> VolumeIdSearch;
protected final SearchBuilder<SnapshotScheduleVO> schedulesAssignedWithAsyncJob;
private final SearchBuilder<SnapshotScheduleVO> VolumeIdPolicyIdSearch;

protected SnapshotScheduleDaoImpl() {
Expand All @@ -48,36 +48,14 @@ protected SnapshotScheduleDaoImpl() {
coincidingSchedulesSearch.and("asyncJobId", coincidingSchedulesSearch.entity().getAsyncJobId(), SearchCriteria.Op.NULL);
coincidingSchedulesSearch.done();

VolumeIdSearch = createSearchBuilder();
VolumeIdSearch.and("volumeId", VolumeIdSearch.entity().getVolumeId(), SearchCriteria.Op.EQ);
VolumeIdSearch.done();

VolumeIdPolicyIdSearch = createSearchBuilder();
VolumeIdPolicyIdSearch.and("volumeId", VolumeIdPolicyIdSearch.entity().getVolumeId(), SearchCriteria.Op.EQ);
VolumeIdPolicyIdSearch.and("policyId", VolumeIdPolicyIdSearch.entity().getPolicyId(), SearchCriteria.Op.EQ);
VolumeIdPolicyIdSearch.done();

}

/**
* {@inheritDoc}
*/
@Override
public List<SnapshotScheduleVO> getCoincidingSnapshotSchedules(long volumeId, Date date) {
SearchCriteria<SnapshotScheduleVO> sc = coincidingSchedulesSearch.create();
sc.setParameters("volumeId", volumeId);
sc.setParameters("scheduledTimestamp", date);
// Don't return manual snapshots. They will be executed through another
// code path.
sc.addAnd("policyId", SearchCriteria.Op.NEQ, 1L);
return listBy(sc);
}

@Override
public SnapshotScheduleVO findOneByVolume(long volumeId) {
SearchCriteria<SnapshotScheduleVO> sc = VolumeIdSearch.create();
sc.setParameters("volumeId", volumeId);
return findOneBy(sc);
schedulesAssignedWithAsyncJob = createSearchBuilder();
schedulesAssignedWithAsyncJob.and("asyncJobId", schedulesAssignedWithAsyncJob.entity().getAsyncJobId(), SearchCriteria.Op.NNULL);
schedulesAssignedWithAsyncJob.done();
}

@Override
Expand All @@ -98,6 +76,11 @@ public List<SnapshotScheduleVO> getSchedulesToExecute(Date currentTimestamp) {
return listBy(sc);
}

@Override
public List<SnapshotScheduleVO> getSchedulesAssignedWithAsyncJob() {
return listBy(schedulesAssignedWithAsyncJob.create());
}

/**
* {@inheritDoc}
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ CREATE TABLE IF NOT EXISTS `cloud_usage`.`quota_email_configuration`(
CONSTRAINT `FK_quota_email_configuration_account_id` FOREIGN KEY (`account_id`) REFERENCES `cloud_usage`.`quota_account`(`account_id`),
CONSTRAINT `FK_quota_email_configuration_email_template_id` FOREIGN KEY (`email_template_id`) REFERENCES `cloud_usage`.`quota_email_templates`(`id`));

-- Remove on delete cascade from snapshot schedule
ALTER TABLE `cloud`.`snapshot_schedule` DROP CONSTRAINT `fk__snapshot_schedule_async_job_id`;

-- Add `is_implicit` column to `host_tags` table
CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.host_tags', 'is_implicit', 'int(1) UNSIGNED NOT NULL DEFAULT 0 COMMENT "If host tag is implicit or explicit" ');

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import org.apache.cloudstack.framework.jobs.AsyncJobManager;
import org.apache.cloudstack.framework.jobs.dao.AsyncJobDao;
import org.apache.cloudstack.framework.jobs.impl.AsyncJobVO;
import org.apache.cloudstack.jobs.JobInfo;
import org.apache.cloudstack.managed.context.ManagedContextTimerTask;
import org.springframework.stereotype.Component;

Expand All @@ -47,7 +48,6 @@
import com.cloud.storage.Snapshot;
import com.cloud.storage.SnapshotPolicyVO;
import com.cloud.storage.SnapshotScheduleVO;
import com.cloud.storage.SnapshotVO;
import com.cloud.storage.VolumeVO;
import com.cloud.storage.dao.SnapshotDao;
import com.cloud.storage.dao.SnapshotPolicyDao;
Expand All @@ -64,7 +64,6 @@
import com.cloud.utils.concurrency.TestClock;
import com.cloud.utils.db.DB;
import com.cloud.utils.db.GlobalLock;
import com.cloud.utils.db.SearchCriteria;
import com.cloud.utils.db.TransactionLegacy;
import com.cloud.vm.snapshot.VMSnapshotManager;
import com.cloud.vm.snapshot.VMSnapshotVO;
Expand Down Expand Up @@ -144,7 +143,7 @@ public void poll(final Date currentTimestamp) {
try {
if (scanLock.lock(ACQUIRE_GLOBAL_LOCK_TIMEOUT_FOR_COOPERATION)) {
try {
checkStatusOfCurrentlyExecutingSnapshots();
scheduleNextSnapshotJobsIfNecessary();
} finally {
scanLock.unlock();
}
Expand Down Expand Up @@ -174,68 +173,37 @@ public void poll(final Date currentTimestamp) {
}
}

private void checkStatusOfCurrentlyExecutingSnapshots() {
final SearchCriteria<SnapshotScheduleVO> sc = _snapshotScheduleDao.createSearchCriteria();
sc.addAnd("asyncJobId", SearchCriteria.Op.NNULL);
final List<SnapshotScheduleVO> snapshotSchedules = _snapshotScheduleDao.search(sc, null);
for (final SnapshotScheduleVO snapshotSchedule : snapshotSchedules) {
final Long asyncJobId = snapshotSchedule.getAsyncJobId();
final AsyncJobVO asyncJob = _asyncJobDao.findByIdIncludingRemoved(asyncJobId);
switch (asyncJob.getStatus()) {
case SUCCEEDED:
// The snapshot has been successfully backed up.
// The snapshot state has also been cleaned up.
// We can schedule the next job for this snapshot.
// Remove the existing entry in the snapshot_schedule table.
scheduleNextSnapshotJob(snapshotSchedule);
break;
case FAILED:
// Check the snapshot status.
final Long snapshotId = snapshotSchedule.getSnapshotId();
if (snapshotId == null) {
// createSnapshotAsync exited, successfully or unsuccessfully,
// even before creating a snapshot record
// No cleanup needs to be done.
// Schedule the next snapshot.
scheduleNextSnapshotJob(snapshotSchedule);
} else {
final SnapshotVO snapshot = _snapshotDao.findById(snapshotId);
if (snapshot == null || snapshot.getRemoved() != null) {
// This snapshot has been deleted successfully from the primary storage
// Again no cleanup needs to be done.
// Schedule the next snapshot.
// There's very little probability that the code reaches this point.
// The snapshotId is a foreign key for the snapshot_schedule table
// set to ON DELETE CASCADE. So if the snapshot entry is deleted, the snapshot_schedule entry will be too.
// But what if it has only been marked as removed?
scheduleNextSnapshotJob(snapshotSchedule);
} else {
// The management server executing this snapshot job appears to have crashed
// while creating the snapshot on primary storage/or backing it up.
// We have no idea whether the snapshot was successfully taken on the primary or not.
// Schedule the next snapshot job.
// The ValidatePreviousSnapshotCommand will take appropriate action on this snapshot
// If the snapshot was taken successfully on primary, it will retry backing it up.
// and cleanup the previous snapshot
// Set the userId to that of system.
//_snapshotManager.validateSnapshot(1L, snapshot);
// In all cases, schedule the next snapshot job
scheduleNextSnapshotJob(snapshotSchedule);
}
}
private void scheduleNextSnapshotJobsIfNecessary() {
List<SnapshotScheduleVO> snapshotSchedules = _snapshotScheduleDao.getSchedulesAssignedWithAsyncJob();
logger.info("Verifying the current state of [{}] snapshot schedules and scheduling next jobs, if necessary.", snapshotSchedules.size());
for (SnapshotScheduleVO snapshotSchedule : snapshotSchedules) {
scheduleNextSnapshotJobIfNecessary(snapshotSchedule);
}
}

break;
case IN_PROGRESS:
// There is no way of knowing from here whether
// 1) Another management server is processing this snapshot job
// 2) The management server has crashed and this snapshot is lying
// around in an inconsistent state.
// Hopefully, this can be resolved at the backend when the current snapshot gets executed.
// But if it remains in this state, the current snapshot will not get executed.
// And it will remain in stasis.
break;
}
protected void scheduleNextSnapshotJobIfNecessary(SnapshotScheduleVO snapshotSchedule) {
Long asyncJobId = snapshotSchedule.getAsyncJobId();
AsyncJobVO asyncJob = _asyncJobDao.findByIdIncludingRemoved(asyncJobId);

if (asyncJob == null) {
logger.debug("The async job [{}] of snapshot schedule [{}] does not exist anymore. Considering it as finished and scheduling the next snapshot job.",
asyncJobId, snapshotSchedule);
scheduleNextSnapshotJob(snapshotSchedule);
return;
}

JobInfo.Status status = asyncJob.getStatus();

if (JobInfo.Status.SUCCEEDED.equals(status)) {
logger.debug("Last job of schedule [{}] succeeded; scheduling the next snapshot job.", snapshotSchedule);
} else if (JobInfo.Status.FAILED.equals(status)) {
logger.debug("Last job of schedule [{}] failed with [{}]; scheduling a new snapshot job.", snapshotSchedule, asyncJob.getResult());
} else {
logger.debug("Schedule [{}] is still in progress, skipping next job scheduling.", snapshotSchedule);
return;
}

scheduleNextSnapshotJob(snapshotSchedule);
}

@DB
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@
import com.cloud.user.Account;
import com.cloud.user.AccountVO;
import com.cloud.user.dao.AccountDao;
import org.apache.cloudstack.framework.jobs.dao.AsyncJobDao;
import org.apache.cloudstack.framework.jobs.impl.AsyncJobVO;
import org.apache.cloudstack.jobs.JobInfo;
import org.junit.Assert;
import org.junit.Test;
import org.junit.runner.RunWith;
Expand Down Expand Up @@ -65,6 +68,16 @@ public class SnapshotSchedulerImplTest {
@Mock
AccountVO accountVoMock;

@Mock
private SnapshotScheduleVO snapshotScheduleVoMock;

@Mock
private AsyncJobDao asyncJobDaoMock;

@Mock
private AsyncJobVO asyncJobVoMock;


@Test
public void scheduleNextSnapshotJobTestParameterIsNullReturnNull() {
SnapshotScheduleVO snapshotScheduleVO = null;
Expand Down Expand Up @@ -215,4 +228,50 @@ public void canSnapshotBeScheduledTestSnapshotPolicyIsNotRemovedDoNotCallRemove(

Mockito.verify(snapshotScheduleDaoMock, Mockito.never()).remove(Mockito.anyLong());
}

@Test
public void scheduleNextSnapshotJobIfNecessaryTestAsyncJobIsNullThenScheduleNextSnapshot() {
Mockito.doReturn(1L).when(snapshotScheduleVoMock).getAsyncJobId();
Mockito.doReturn(null).when(asyncJobDaoMock).findByIdIncludingRemoved(Mockito.any());
Mockito.doReturn(new Date()).when(snapshotSchedulerImplSpy).scheduleNextSnapshotJob(Mockito.any(SnapshotScheduleVO.class));

snapshotSchedulerImplSpy.scheduleNextSnapshotJobIfNecessary(snapshotScheduleVoMock);

Mockito.verify(snapshotSchedulerImplSpy).scheduleNextSnapshotJob(Mockito.any(SnapshotScheduleVO.class));
}

@Test
public void scheduleNextSnapshotJobIfNecessaryTestAsyncJobSucceededThenScheduleNextSnapshot() {
Mockito.doReturn(1L).when(snapshotScheduleVoMock).getAsyncJobId();
Mockito.doReturn(asyncJobVoMock).when(asyncJobDaoMock).findByIdIncludingRemoved(Mockito.any());
Mockito.doReturn(JobInfo.Status.SUCCEEDED).when(asyncJobVoMock).getStatus();
Mockito.doReturn(new Date()).when(snapshotSchedulerImplSpy).scheduleNextSnapshotJob(Mockito.any(SnapshotScheduleVO.class));

snapshotSchedulerImplSpy.scheduleNextSnapshotJobIfNecessary(snapshotScheduleVoMock);

Mockito.verify(snapshotSchedulerImplSpy).scheduleNextSnapshotJob(Mockito.any(SnapshotScheduleVO.class));
}

@Test
public void scheduleNextSnapshotJobIfNecessaryTestAsyncJobFailedThenScheduleNextSnapshot() {
Mockito.doReturn(1L).when(snapshotScheduleVoMock).getAsyncJobId();
Mockito.doReturn(asyncJobVoMock).when(asyncJobDaoMock).findByIdIncludingRemoved(Mockito.any());
Mockito.doReturn(JobInfo.Status.FAILED).when(asyncJobVoMock).getStatus();
Mockito.doReturn(new Date()).when(snapshotSchedulerImplSpy).scheduleNextSnapshotJob(Mockito.any(SnapshotScheduleVO.class));

snapshotSchedulerImplSpy.scheduleNextSnapshotJobIfNecessary(snapshotScheduleVoMock);

Mockito.verify(snapshotSchedulerImplSpy).scheduleNextSnapshotJob(Mockito.any(SnapshotScheduleVO.class));
}

@Test
public void scheduleNextSnapshotJobIfNecessaryTestAsyncJobInProgressThenDoNothing() {
Mockito.doReturn(1L).when(snapshotScheduleVoMock).getAsyncJobId();
Mockito.doReturn(asyncJobVoMock).when(asyncJobDaoMock).findByIdIncludingRemoved(Mockito.any());
Mockito.doReturn(JobInfo.Status.IN_PROGRESS).when(asyncJobVoMock).getStatus();

snapshotSchedulerImplSpy.scheduleNextSnapshotJobIfNecessary(snapshotScheduleVoMock);

Mockito.verify(snapshotSchedulerImplSpy, Mockito.never()).scheduleNextSnapshotJob(Mockito.any(SnapshotScheduleVO.class));
}
}

0 comments on commit c9f1c57

Please sign in to comment.