From c9f1c5790d131b744fb16cc417c7f9540d7c604d Mon Sep 17 00:00:00 2001 From: Henrique Sato Date: Fri, 23 Aug 2024 06:19:13 -0300 Subject: [PATCH] Fix snapshot scheduling with expired jobs (#8832) Co-authored-by: Henrique Sato --- .../com/cloud/storage/SnapshotScheduleVO.java | 9 ++ .../storage/dao/SnapshotScheduleDao.java | 6 +- .../storage/dao/SnapshotScheduleDaoImpl.java | 35 ++----- .../META-INF/db/schema-41910to42000.sql | 3 + .../snapshot/SnapshotSchedulerImpl.java | 94 ++++++------------- .../snapshot/SnapshotSchedulerImplTest.java | 59 ++++++++++++ 6 files changed, 113 insertions(+), 93 deletions(-) diff --git a/engine/schema/src/main/java/com/cloud/storage/SnapshotScheduleVO.java b/engine/schema/src/main/java/com/cloud/storage/SnapshotScheduleVO.java index 80a890aacad6..86e0da53666f 100644 --- a/engine/schema/src/main/java/com/cloud/storage/SnapshotScheduleVO.java +++ b/engine/schema/src/main/java/com/cloud/storage/SnapshotScheduleVO.java @@ -29,6 +29,8 @@ import javax.persistence.TemporalType; import com.cloud.storage.snapshot.SnapshotSchedule; +import org.apache.commons.lang3.builder.ReflectionToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; @Entity @Table(name = "snapshot_schedule") @@ -132,4 +134,11 @@ public String getUuid() { public void setUuid(String uuid) { this.uuid = uuid; } + + @Override + public String toString() { + ReflectionToStringBuilder reflectionToStringBuilder = new ReflectionToStringBuilder(this, ToStringStyle.JSON_STYLE); + reflectionToStringBuilder.setExcludeFieldNames("id"); + return reflectionToStringBuilder.toString(); + } } diff --git a/engine/schema/src/main/java/com/cloud/storage/dao/SnapshotScheduleDao.java b/engine/schema/src/main/java/com/cloud/storage/dao/SnapshotScheduleDao.java index 7ca0a3915f54..284a42cf9e1d 100644 --- a/engine/schema/src/main/java/com/cloud/storage/dao/SnapshotScheduleDao.java +++ b/engine/schema/src/main/java/com/cloud/storage/dao/SnapshotScheduleDao.java @@ -27,13 +27,11 @@ */ public interface SnapshotScheduleDao extends GenericDao { - List getCoincidingSnapshotSchedules(long volumeId, Date date); - List getSchedulesToExecute(Date currentTimestamp); - SnapshotScheduleVO getCurrentSchedule(Long volumeId, Long policyId, boolean executing); + List getSchedulesAssignedWithAsyncJob(); - SnapshotScheduleVO findOneByVolume(long volumeId); + SnapshotScheduleVO getCurrentSchedule(Long volumeId, Long policyId, boolean executing); SnapshotScheduleVO findOneByVolumePolicy(long volumeId, long policyId); diff --git a/engine/schema/src/main/java/com/cloud/storage/dao/SnapshotScheduleDaoImpl.java b/engine/schema/src/main/java/com/cloud/storage/dao/SnapshotScheduleDaoImpl.java index 925d02dd90b4..14669ce1d438 100644 --- a/engine/schema/src/main/java/com/cloud/storage/dao/SnapshotScheduleDaoImpl.java +++ b/engine/schema/src/main/java/com/cloud/storage/dao/SnapshotScheduleDaoImpl.java @@ -32,7 +32,7 @@ public class SnapshotScheduleDaoImpl extends GenericDaoBase implements SnapshotScheduleDao { protected final SearchBuilder executableSchedulesSearch; protected final SearchBuilder coincidingSchedulesSearch; - private final SearchBuilder VolumeIdSearch; + protected final SearchBuilder schedulesAssignedWithAsyncJob; private final SearchBuilder VolumeIdPolicyIdSearch; protected SnapshotScheduleDaoImpl() { @@ -48,36 +48,14 @@ protected SnapshotScheduleDaoImpl() { coincidingSchedulesSearch.and("asyncJobId", coincidingSchedulesSearch.entity().getAsyncJobId(), SearchCriteria.Op.NULL); coincidingSchedulesSearch.done(); - VolumeIdSearch = createSearchBuilder(); - VolumeIdSearch.and("volumeId", VolumeIdSearch.entity().getVolumeId(), SearchCriteria.Op.EQ); - VolumeIdSearch.done(); - VolumeIdPolicyIdSearch = createSearchBuilder(); VolumeIdPolicyIdSearch.and("volumeId", VolumeIdPolicyIdSearch.entity().getVolumeId(), SearchCriteria.Op.EQ); VolumeIdPolicyIdSearch.and("policyId", VolumeIdPolicyIdSearch.entity().getPolicyId(), SearchCriteria.Op.EQ); VolumeIdPolicyIdSearch.done(); - } - - /** - * {@inheritDoc} - */ - @Override - public List getCoincidingSnapshotSchedules(long volumeId, Date date) { - SearchCriteria sc = coincidingSchedulesSearch.create(); - sc.setParameters("volumeId", volumeId); - sc.setParameters("scheduledTimestamp", date); - // Don't return manual snapshots. They will be executed through another - // code path. - sc.addAnd("policyId", SearchCriteria.Op.NEQ, 1L); - return listBy(sc); - } - - @Override - public SnapshotScheduleVO findOneByVolume(long volumeId) { - SearchCriteria sc = VolumeIdSearch.create(); - sc.setParameters("volumeId", volumeId); - return findOneBy(sc); + schedulesAssignedWithAsyncJob = createSearchBuilder(); + schedulesAssignedWithAsyncJob.and("asyncJobId", schedulesAssignedWithAsyncJob.entity().getAsyncJobId(), SearchCriteria.Op.NNULL); + schedulesAssignedWithAsyncJob.done(); } @Override @@ -98,6 +76,11 @@ public List getSchedulesToExecute(Date currentTimestamp) { return listBy(sc); } + @Override + public List getSchedulesAssignedWithAsyncJob() { + return listBy(schedulesAssignedWithAsyncJob.create()); + } + /** * {@inheritDoc} */ diff --git a/engine/schema/src/main/resources/META-INF/db/schema-41910to42000.sql b/engine/schema/src/main/resources/META-INF/db/schema-41910to42000.sql index 2bcf41a4042f..f7c78670ddd8 100644 --- a/engine/schema/src/main/resources/META-INF/db/schema-41910to42000.sql +++ b/engine/schema/src/main/resources/META-INF/db/schema-41910to42000.sql @@ -90,6 +90,9 @@ CREATE TABLE IF NOT EXISTS `cloud_usage`.`quota_email_configuration`( CONSTRAINT `FK_quota_email_configuration_account_id` FOREIGN KEY (`account_id`) REFERENCES `cloud_usage`.`quota_account`(`account_id`), CONSTRAINT `FK_quota_email_configuration_email_template_id` FOREIGN KEY (`email_template_id`) REFERENCES `cloud_usage`.`quota_email_templates`(`id`)); +-- Remove on delete cascade from snapshot schedule +ALTER TABLE `cloud`.`snapshot_schedule` DROP CONSTRAINT `fk__snapshot_schedule_async_job_id`; + -- Add `is_implicit` column to `host_tags` table CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.host_tags', 'is_implicit', 'int(1) UNSIGNED NOT NULL DEFAULT 0 COMMENT "If host tag is implicit or explicit" '); diff --git a/server/src/main/java/com/cloud/storage/snapshot/SnapshotSchedulerImpl.java b/server/src/main/java/com/cloud/storage/snapshot/SnapshotSchedulerImpl.java index 29955066062c..2a53021636c5 100644 --- a/server/src/main/java/com/cloud/storage/snapshot/SnapshotSchedulerImpl.java +++ b/server/src/main/java/com/cloud/storage/snapshot/SnapshotSchedulerImpl.java @@ -35,6 +35,7 @@ import org.apache.cloudstack.framework.jobs.AsyncJobManager; import org.apache.cloudstack.framework.jobs.dao.AsyncJobDao; import org.apache.cloudstack.framework.jobs.impl.AsyncJobVO; +import org.apache.cloudstack.jobs.JobInfo; import org.apache.cloudstack.managed.context.ManagedContextTimerTask; import org.springframework.stereotype.Component; @@ -47,7 +48,6 @@ import com.cloud.storage.Snapshot; import com.cloud.storage.SnapshotPolicyVO; import com.cloud.storage.SnapshotScheduleVO; -import com.cloud.storage.SnapshotVO; import com.cloud.storage.VolumeVO; import com.cloud.storage.dao.SnapshotDao; import com.cloud.storage.dao.SnapshotPolicyDao; @@ -64,7 +64,6 @@ import com.cloud.utils.concurrency.TestClock; import com.cloud.utils.db.DB; import com.cloud.utils.db.GlobalLock; -import com.cloud.utils.db.SearchCriteria; import com.cloud.utils.db.TransactionLegacy; import com.cloud.vm.snapshot.VMSnapshotManager; import com.cloud.vm.snapshot.VMSnapshotVO; @@ -144,7 +143,7 @@ public void poll(final Date currentTimestamp) { try { if (scanLock.lock(ACQUIRE_GLOBAL_LOCK_TIMEOUT_FOR_COOPERATION)) { try { - checkStatusOfCurrentlyExecutingSnapshots(); + scheduleNextSnapshotJobsIfNecessary(); } finally { scanLock.unlock(); } @@ -174,68 +173,37 @@ public void poll(final Date currentTimestamp) { } } - private void checkStatusOfCurrentlyExecutingSnapshots() { - final SearchCriteria sc = _snapshotScheduleDao.createSearchCriteria(); - sc.addAnd("asyncJobId", SearchCriteria.Op.NNULL); - final List snapshotSchedules = _snapshotScheduleDao.search(sc, null); - for (final SnapshotScheduleVO snapshotSchedule : snapshotSchedules) { - final Long asyncJobId = snapshotSchedule.getAsyncJobId(); - final AsyncJobVO asyncJob = _asyncJobDao.findByIdIncludingRemoved(asyncJobId); - switch (asyncJob.getStatus()) { - case SUCCEEDED: - // The snapshot has been successfully backed up. - // The snapshot state has also been cleaned up. - // We can schedule the next job for this snapshot. - // Remove the existing entry in the snapshot_schedule table. - scheduleNextSnapshotJob(snapshotSchedule); - break; - case FAILED: - // Check the snapshot status. - final Long snapshotId = snapshotSchedule.getSnapshotId(); - if (snapshotId == null) { - // createSnapshotAsync exited, successfully or unsuccessfully, - // even before creating a snapshot record - // No cleanup needs to be done. - // Schedule the next snapshot. - scheduleNextSnapshotJob(snapshotSchedule); - } else { - final SnapshotVO snapshot = _snapshotDao.findById(snapshotId); - if (snapshot == null || snapshot.getRemoved() != null) { - // This snapshot has been deleted successfully from the primary storage - // Again no cleanup needs to be done. - // Schedule the next snapshot. - // There's very little probability that the code reaches this point. - // The snapshotId is a foreign key for the snapshot_schedule table - // set to ON DELETE CASCADE. So if the snapshot entry is deleted, the snapshot_schedule entry will be too. - // But what if it has only been marked as removed? - scheduleNextSnapshotJob(snapshotSchedule); - } else { - // The management server executing this snapshot job appears to have crashed - // while creating the snapshot on primary storage/or backing it up. - // We have no idea whether the snapshot was successfully taken on the primary or not. - // Schedule the next snapshot job. - // The ValidatePreviousSnapshotCommand will take appropriate action on this snapshot - // If the snapshot was taken successfully on primary, it will retry backing it up. - // and cleanup the previous snapshot - // Set the userId to that of system. - //_snapshotManager.validateSnapshot(1L, snapshot); - // In all cases, schedule the next snapshot job - scheduleNextSnapshotJob(snapshotSchedule); - } - } + private void scheduleNextSnapshotJobsIfNecessary() { + List snapshotSchedules = _snapshotScheduleDao.getSchedulesAssignedWithAsyncJob(); + logger.info("Verifying the current state of [{}] snapshot schedules and scheduling next jobs, if necessary.", snapshotSchedules.size()); + for (SnapshotScheduleVO snapshotSchedule : snapshotSchedules) { + scheduleNextSnapshotJobIfNecessary(snapshotSchedule); + } + } - break; - case IN_PROGRESS: - // There is no way of knowing from here whether - // 1) Another management server is processing this snapshot job - // 2) The management server has crashed and this snapshot is lying - // around in an inconsistent state. - // Hopefully, this can be resolved at the backend when the current snapshot gets executed. - // But if it remains in this state, the current snapshot will not get executed. - // And it will remain in stasis. - break; - } + protected void scheduleNextSnapshotJobIfNecessary(SnapshotScheduleVO snapshotSchedule) { + Long asyncJobId = snapshotSchedule.getAsyncJobId(); + AsyncJobVO asyncJob = _asyncJobDao.findByIdIncludingRemoved(asyncJobId); + + if (asyncJob == null) { + logger.debug("The async job [{}] of snapshot schedule [{}] does not exist anymore. Considering it as finished and scheduling the next snapshot job.", + asyncJobId, snapshotSchedule); + scheduleNextSnapshotJob(snapshotSchedule); + return; } + + JobInfo.Status status = asyncJob.getStatus(); + + if (JobInfo.Status.SUCCEEDED.equals(status)) { + logger.debug("Last job of schedule [{}] succeeded; scheduling the next snapshot job.", snapshotSchedule); + } else if (JobInfo.Status.FAILED.equals(status)) { + logger.debug("Last job of schedule [{}] failed with [{}]; scheduling a new snapshot job.", snapshotSchedule, asyncJob.getResult()); + } else { + logger.debug("Schedule [{}] is still in progress, skipping next job scheduling.", snapshotSchedule); + return; + } + + scheduleNextSnapshotJob(snapshotSchedule); } @DB diff --git a/server/src/test/java/com/cloud/storage/snapshot/SnapshotSchedulerImplTest.java b/server/src/test/java/com/cloud/storage/snapshot/SnapshotSchedulerImplTest.java index 971af289ef70..3827531891fd 100644 --- a/server/src/test/java/com/cloud/storage/snapshot/SnapshotSchedulerImplTest.java +++ b/server/src/test/java/com/cloud/storage/snapshot/SnapshotSchedulerImplTest.java @@ -26,6 +26,9 @@ import com.cloud.user.Account; import com.cloud.user.AccountVO; import com.cloud.user.dao.AccountDao; +import org.apache.cloudstack.framework.jobs.dao.AsyncJobDao; +import org.apache.cloudstack.framework.jobs.impl.AsyncJobVO; +import org.apache.cloudstack.jobs.JobInfo; import org.junit.Assert; import org.junit.Test; import org.junit.runner.RunWith; @@ -65,6 +68,16 @@ public class SnapshotSchedulerImplTest { @Mock AccountVO accountVoMock; + @Mock + private SnapshotScheduleVO snapshotScheduleVoMock; + + @Mock + private AsyncJobDao asyncJobDaoMock; + + @Mock + private AsyncJobVO asyncJobVoMock; + + @Test public void scheduleNextSnapshotJobTestParameterIsNullReturnNull() { SnapshotScheduleVO snapshotScheduleVO = null; @@ -215,4 +228,50 @@ public void canSnapshotBeScheduledTestSnapshotPolicyIsNotRemovedDoNotCallRemove( Mockito.verify(snapshotScheduleDaoMock, Mockito.never()).remove(Mockito.anyLong()); } + + @Test + public void scheduleNextSnapshotJobIfNecessaryTestAsyncJobIsNullThenScheduleNextSnapshot() { + Mockito.doReturn(1L).when(snapshotScheduleVoMock).getAsyncJobId(); + Mockito.doReturn(null).when(asyncJobDaoMock).findByIdIncludingRemoved(Mockito.any()); + Mockito.doReturn(new Date()).when(snapshotSchedulerImplSpy).scheduleNextSnapshotJob(Mockito.any(SnapshotScheduleVO.class)); + + snapshotSchedulerImplSpy.scheduleNextSnapshotJobIfNecessary(snapshotScheduleVoMock); + + Mockito.verify(snapshotSchedulerImplSpy).scheduleNextSnapshotJob(Mockito.any(SnapshotScheduleVO.class)); + } + + @Test + public void scheduleNextSnapshotJobIfNecessaryTestAsyncJobSucceededThenScheduleNextSnapshot() { + Mockito.doReturn(1L).when(snapshotScheduleVoMock).getAsyncJobId(); + Mockito.doReturn(asyncJobVoMock).when(asyncJobDaoMock).findByIdIncludingRemoved(Mockito.any()); + Mockito.doReturn(JobInfo.Status.SUCCEEDED).when(asyncJobVoMock).getStatus(); + Mockito.doReturn(new Date()).when(snapshotSchedulerImplSpy).scheduleNextSnapshotJob(Mockito.any(SnapshotScheduleVO.class)); + + snapshotSchedulerImplSpy.scheduleNextSnapshotJobIfNecessary(snapshotScheduleVoMock); + + Mockito.verify(snapshotSchedulerImplSpy).scheduleNextSnapshotJob(Mockito.any(SnapshotScheduleVO.class)); + } + + @Test + public void scheduleNextSnapshotJobIfNecessaryTestAsyncJobFailedThenScheduleNextSnapshot() { + Mockito.doReturn(1L).when(snapshotScheduleVoMock).getAsyncJobId(); + Mockito.doReturn(asyncJobVoMock).when(asyncJobDaoMock).findByIdIncludingRemoved(Mockito.any()); + Mockito.doReturn(JobInfo.Status.FAILED).when(asyncJobVoMock).getStatus(); + Mockito.doReturn(new Date()).when(snapshotSchedulerImplSpy).scheduleNextSnapshotJob(Mockito.any(SnapshotScheduleVO.class)); + + snapshotSchedulerImplSpy.scheduleNextSnapshotJobIfNecessary(snapshotScheduleVoMock); + + Mockito.verify(snapshotSchedulerImplSpy).scheduleNextSnapshotJob(Mockito.any(SnapshotScheduleVO.class)); + } + + @Test + public void scheduleNextSnapshotJobIfNecessaryTestAsyncJobInProgressThenDoNothing() { + Mockito.doReturn(1L).when(snapshotScheduleVoMock).getAsyncJobId(); + Mockito.doReturn(asyncJobVoMock).when(asyncJobDaoMock).findByIdIncludingRemoved(Mockito.any()); + Mockito.doReturn(JobInfo.Status.IN_PROGRESS).when(asyncJobVoMock).getStatus(); + + snapshotSchedulerImplSpy.scheduleNextSnapshotJobIfNecessary(snapshotScheduleVoMock); + + Mockito.verify(snapshotSchedulerImplSpy, Mockito.never()).scheduleNextSnapshotJob(Mockito.any(SnapshotScheduleVO.class)); + } }