From f1d22db0fda00ae89a399ea62f42e549fcd7f8b4 Mon Sep 17 00:00:00 2001 From: Steve Wirt Date: Wed, 20 Nov 2024 17:19:05 -0500 Subject: [PATCH] DKAN-4287 Make harvest_run id not a primary key. --- modules/harvest/harvest.install | 161 +++++++++++++++++- .../harvest/src/Commands/HarvestCommands.php | 5 - modules/harvest/src/Entity/HarvestRun.php | 29 +++- .../src/Entity/HarvestRunRepository.php | 50 +++--- modules/harvest/src/HarvestService.php | 27 ++- modules/harvest/src/HarvestUtility.php | 6 +- 6 files changed, 221 insertions(+), 57 deletions(-) diff --git a/modules/harvest/harvest.install b/modules/harvest/harvest.install index 64c25a4134..5c611c1f7f 100644 --- a/modules/harvest/harvest.install +++ b/modules/harvest/harvest.install @@ -1,5 +1,10 @@ select($table_name_temp, 'hrt') + ->fields('hrt', ['id']); + $query->orderBy('id', 'ASC'); + $result = $query->execute()->fetchCol(0); + // Can't do orderBy as the sort end up natural, not numeric. + asort($result, SORT_NUMERIC); + + return $result ?? []; +} + +/** + * Reads a single harvest row from the temp table. + * + * @param string $table_name_temp + * Name of the table to read from. + * + * @param string $time_id + * The id to read from, which was also the timestamp. + * + * @return array + * Elements from the row ['id', 'harvest_plan_id', 'data', 'extract_status']. 
+ */
+function harvest_read_harvest_run(string $table_name_temp, string $time_id): array {
+  $connection = Database::getConnection();
+  $query = $connection->select($table_name_temp, 'hrt')
+    ->fields('hrt', ['id', 'harvest_plan_id', 'data', 'extract_status'])
+    ->condition('id', $time_id, '=');
+  $result = $query->execute()->fetchAll(PDO::FETCH_ASSOC);
+  // reset() gives FALSE for an empty result set, which would violate the
+  // declared array return type, so fall back to an empty array.
+  return reset($result) ?: [];
+}
+
+/**
+ * Inserts one legacy harvest run row into the harvest_runs table.
+ *
+ * @param string $id
+ *   The legacy run id; it is cast to an integer and stored as the timestamp.
+ * @param string $harvest_plan_id
+ *   The harvest plan identifier.
+ * @param string $data
+ *   The run data column value, copied as-is.
+ * @param string $extract_status
+ *   The extract status column value, copied as-is.
+ */
+function harvest_write_harvest_run(string $id, string $harvest_plan_id, string $data, string $extract_status): void {
+  /** @var \Drupal\Core\Database\Connection $connection */
+  $connection = \Drupal::service('database');
+  $connection->insert('harvest_runs')
+    ->fields([
+      'timestamp' => (int) $id,
+      'harvest_plan_id' => $harvest_plan_id,
+      'data' => $data,
+      'extract_status' => $extract_status,
+    ])
+    ->execute();
+}
+
 /**
  * Uninstall obsolete submodule harvest_dashboard.
  */
@@ -95,8 +170,100 @@ function harvest_update_8007(&$sandbox) {
  * This finishes the process started by harvest_update_8007.
  */
 function harvest_update_8008(&$sandbox) {
+  // Moved to harvest_update_8011(), which repeats this step.
+}
+
+/**
+ * Update harvest_run schema to add timestamp, uuid, and true id.
+ *
+ * @see https://github.com/GetDKAN/dkan/issues/4287
+ */
+function harvest_update_8009(&$sandbox) {
+  $table_name = 'harvest_runs';
+  $table_name_temp = "{$table_name}_temp";
+  $entity_type_name = 'harvest_run';
+
+  $definition_update_manager = \Drupal::entityDefinitionUpdateManager();
+  $entity_type_manager = \Drupal::entityTypeManager();
+  $schema = \Drupal::database()->schema();
+
+  // Move the table so we can rebuild from it.
+  $schema->renameTable($table_name, $table_name_temp);
+  $messages = "Table {$table_name} moved to {$table_name_temp}. " . PHP_EOL;
+  // Uninstall the original entity.
+  $original_type = $definition_update_manager->getEntityType($entity_type_name);
+  $definition_update_manager->uninstallEntityType($original_type);
+  $messages .= "Old harvest_run entity removed. " . PHP_EOL;
+  // Install the new entity from freshly discovered definitions.
+  $entity_type_manager->clearCachedDefinitions();
+  $entity_type_def = $entity_type_manager->getDefinition($entity_type_name);
+  $definition_update_manager->installEntityType($entity_type_def);
+  $messages .= "New harvest_run entity installed. " . PHP_EOL;
+
+  return $messages;
+}
+
+/**
+ * Move data from temp table back into harvest_run.
+ *
+ * @see https://github.com/GetDKAN/dkan/issues/4287
+ */
+function harvest_update_8010(&$sandbox) {
+  $table_name = 'harvest_runs';
+  $table_name_temp = "{$table_name}_temp";
+  $schema = \Drupal::database()->schema();
+
+  if (!isset($sandbox['total'])) {
+    // Sandbox has not been initiated, so initiate it.
+    $sandbox['items_to_process'] = harvest_get_temp_run_ids($table_name_temp);
+    $sandbox['total'] = count($sandbox['items_to_process']);
+    $sandbox['current'] = 0;
+  }
+  // Process them in batches of 25.
+  $harvest_runs_batch = array_slice($sandbox['items_to_process'], 0, 25, TRUE);
+  // Loop through all the entries in temp table and save them new.
+  foreach ($harvest_runs_batch as $key => $time_id) {
+    // Load the old row.
+    $row = harvest_read_harvest_run($table_name_temp, $time_id);
+    // Write the new harvest run, skipping ids whose row could not be read.
+    if ($row) {
+      harvest_write_harvest_run($row['id'], $row['harvest_plan_id'], $row['data'], $row['extract_status']);
+    }
+    // The item has been processed, remove it from the array.
+    unset($sandbox['items_to_process'][$key]);
+  }
+
+  // Determine when to stop batching.
+  $sandbox['current'] = ($sandbox['total'] - count($sandbox['items_to_process']));
+  $sandbox['#finished'] = (empty($sandbox['total'])) ? 1 : ($sandbox['current'] / $sandbox['total']);
+  $vars = [
+    '@completed' => $sandbox['current'],
+    '@total' => $sandbox['total'],
+  ];
+
+  $messages = t('Processed: @completed/@total.', $vars) . PHP_EOL;
+  // Log the all finished notice. Compare with >= so the check does not hinge
+  // on the division above producing an integer rather than a float.
+  if ($sandbox['#finished'] >= 1) {
+    // The update of the harvest_runs is complete.
+    $messages .= t('Data in harvest_runs updated to new schema:') . PHP_EOL;
+    $dropped = $schema->dropTable($table_name_temp);
+    if ($dropped) {
+      $messages .= t('Temporary table dropped.') . PHP_EOL;
+    }
+  }
+
+  return $messages;
+}
+
+/**
+ * Move entries from harvest_[ID]_runs to harvest_runs.
+ *
+ * This finishes the process started by harvest_update_8007 and re-runs 8008.
+ */
+function harvest_update_8011(&$sandbox) {
   /** @var \Drupal\harvest\HarvestUtility $harvest_utility */
   $harvest_utility = \Drupal::service('dkan.harvest.utility');
   $harvest_utility->harvestRunsUpdate();
-  return 'Harvest runs coalesced into table harvest_runs.';
+  return 'Harvest plan specific run tables coalesced into table harvest_runs.';
 }
diff --git a/modules/harvest/src/Commands/HarvestCommands.php b/modules/harvest/src/Commands/HarvestCommands.php
index 9308c98ee8..f28ed9d89b 100644
--- a/modules/harvest/src/Commands/HarvestCommands.php
+++ b/modules/harvest/src/Commands/HarvestCommands.php
@@ -221,11 +221,6 @@ public function runAll($options = ['new' => FALSE]) {
     foreach ($plan_ids as $plan_id) {
       $result = $this->harvestService->runHarvest($plan_id);
       $runs_info[] = $result;
-      // Since run IDs are also one-second-resolution timestamps, we must wait
-      // one second before running the next harvest.
-      // @todo Remove this sleep when we've switched to a better system for
-      // timestamps.
-      sleep(1);
     }
     $this->renderHarvestRunsInfo($runs_info);
   }
diff --git a/modules/harvest/src/Entity/HarvestRun.php b/modules/harvest/src/Entity/HarvestRun.php
index 66de79c20d..42c5f3f96d 100644
--- a/modules/harvest/src/Entity/HarvestRun.php
+++ b/modules/harvest/src/Entity/HarvestRun.php
@@ -46,15 +46,12 @@
  *   admin_permission = "administer harvest_run",
  *   entity_keys = {
  *     "id" = "id",
  *     "label" = "id",
  *   },
  *   links = {
  *     "canonical" = "/harvest-run/{harvest_run}",
  *   },
  * )
- *
- * @todo Convert to using microtime() or other better system for the timestamp/
- *   id.
  */
 final class HarvestRun extends HarvestEntityBase implements HarvestRunInterface {
 
@@ -64,11 +61,25 @@ final class HarvestRun extends HarvestEntityBase implements HarvestRunInterface
   public static function baseFieldDefinitions(EntityTypeInterface $entity_type) {
     $base_fields = parent::baseFieldDefinitions($entity_type);
 
-    // The id is the unique ID for the harvest run, and also the timestamp at
-    // which the run occurred, generated by time().
-    $base_fields['id'] = static::getBaseFieldIdentifier(
-      new TranslatableMarkup('Harvest Run')
-    );
+    $base_fields['id'] = BaseFieldDefinition::create('integer')
+      ->setLabel(new TranslatableMarkup('ID'))
+      ->setDescription(new TranslatableMarkup('The ID of the Harvest Run entity.'))
+      ->setDisplayOptions('view', [
+        'label' => 'inline',
+        'weight' => 0,
+      ])
+      ->setReadOnly(TRUE);
+
+    $base_fields['timestamp'] = BaseFieldDefinition::create('timestamp')
+      ->setLabel(new TranslatableMarkup('timestamp'))
+      ->setDescription(new TranslatableMarkup('The timestamp of when this harvest was run.'))
+      ->setRequired(TRUE);
+
+    $base_fields['uuid'] = BaseFieldDefinition::create('uuid')
+      ->setLabel(new TranslatableMarkup('UUID'))
+      ->setDescription(new TranslatableMarkup('The unique identifier for this harvest_run'))
+      ->setRequired(TRUE)
+      ->setReadOnly(TRUE);
 
     // Harvest plan id. This is the name of the harvest plan as seen in the UI.
     $base_fields['harvest_plan_id'] = BaseFieldDefinition::create('string')
diff --git a/modules/harvest/src/Entity/HarvestRunRepository.php b/modules/harvest/src/Entity/HarvestRunRepository.php
index 04af86439b..e80848e552 100644
--- a/modules/harvest/src/Entity/HarvestRunRepository.php
+++ b/modules/harvest/src/Entity/HarvestRunRepository.php
@@ -89,8 +89,8 @@ public function destructForPlanId(string $plan_id) {
    *   Run data. Usually the result returned by Harvester::harvest().
    * @param string $plan_id
    *   The plan identifier.
-   * @param string $run_id
-   *   The run identifier, which is also a timestamp.
+   * @param string $timestamp
+   *   The run timestamp.
    *
    * @return string
    *   The run identifier.
@@ -100,9 +100,9 @@ public function destructForPlanId(string $plan_id) {
    * @todo Eventually all the subsystems will be able to understand the entity
    *   rather than needing conversion to and from the array format.
    */
-  public function storeRun(array $run_data, string $plan_id, string $run_id): string {
+  public function storeRun(array $run_data, string $plan_id, string $timestamp): string {
     $field_values = [
-      'id' => $run_id,
+      'timestamp' => (int) $timestamp,
       'harvest_plan_id' => $plan_id,
     ];
     $field_values['extract_status'] = $run_data['status']['extract'] ?? 'FAILURE';
@@ -139,7 +139,7 @@ public function storeRun(array $run_data, string $plan_id, string $run_id): stri
 
     // JSON encode remaining run data.
     $field_values['data'] = json_encode($run_data);
-    return $this->writeEntity($field_values, $plan_id, $run_id);
+    return $this->writeEntity($field_values, $plan_id, $timestamp);
   }
 
   /**
@@ -147,14 +147,14 @@ public function storeRun(array $run_data, string $plan_id, string $run_id): stri
    *
    * @param string $plan_id
    *   The harvest plan identifier.
-   * @param string $run_id
-   *   The harvest run identifier.
+   * @param string $timestamp
+   *   The harvest run timestamp.
    *
    * @return string|null
    *   JSON-encoded run result data, or NULL if none could be found.
    */
-  public function retrieveRunJson(string $plan_id, string $run_id): ?string {
-    if ($entity = $this->loadEntity($plan_id, $run_id)) {
+  public function retrieveRunJson(string $plan_id, string $timestamp): ?string {
+    if ($entity = $this->loadEntity($plan_id, $timestamp)) {
       return json_encode($entity->toResult());
     }
     return NULL;
@@ -220,16 +220,16 @@ public function getUniqueHarvestPlanIds(): array {
    *
    * @param string $plan_id
    *   The harvest plan ID.
-   * @param string $run_id
-   *   The harvest run ID.
+   * @param string $timestamp
+   *   The harvest run timestamp.
    *
    * @return string[]
    *   Array of UUIDs, keyed by UUID. Note that these are UUIDs by convention;
    *   they could be any string value.
    */
-  public function getExtractedUuids(string $plan_id, string $run_id): array {
+  public function getExtractedUuids(string $plan_id, string $timestamp): array {
     $extracted = [];
-    if ($entity = $this->loadEntity($plan_id, $run_id)) {
+    if ($entity = $this->loadEntity($plan_id, $timestamp)) {
       foreach ($entity->get('extracted_uuid')->getValue() as $field) {
         $uuid = $field['value'];
         $extracted[$uuid] = $uuid;
@@ -243,16 +243,19 @@ public function getExtractedUuids(string $plan_id, string $run_id): array {
    *
    * @param string $plan_id
    *   Plan ID.
-   * @param string $run_id
-   *   Run ID, which is a timestamp.
+   * @param string $timestamp
+   *   The timestamp for the run. Formerly the id.
    *
    * @return \Drupal\harvest\HarvestRunInterface|\Drupal\Core\Entity\EntityInterface|null
    *   The loaded entity or NULL if none could be loaded.
+   *
+   * @deprecated in dkan:2.19.11 and is removed from dkan:3.0.0 Use HarvestService::load().
    */
-  public function loadEntity(string $plan_id, string $run_id): ?HarvestRunInterface {
+  public function loadEntity(string $plan_id, string $timestamp): ?HarvestRunInterface {
     if ($ids = $this->runStorage->getQuery()
-      ->condition('id', $run_id)
+      ->condition('timestamp', $timestamp)
       ->condition('harvest_plan_id', $plan_id)
+      ->sort('id', 'DESC')
       ->range(0, 1)
       ->accessCheck(FALSE)
       ->execute()
@@ -269,21 +272,23 @@ public function loadEntity(string $plan_id, string $run_id): ?HarvestRunInterfac
    *   Structured data ready to send to entity_storage->create().
    * @param string $plan_id
    *   Harvest plan identifier.
-   * @param string $run_id
-   *   Harvest run identifier.
+   * @param mixed $timestamp
+   *   Harvest run timestamp.
    *
    * @return string
-   *   Harvest plan identifier for the entity that was written.
+   *   Harvest run id.
    */
-  public function writeEntity(array $field_values, string $plan_id, string $run_id) {
+  public function writeEntity(array $field_values, string $plan_id, mixed $timestamp) {
+    $timestamp = (int) $timestamp;
     /** @var \Drupal\harvest\HarvestRunInterface $entity */
-    $entity = $this->loadEntity($plan_id, $run_id);
+    // loadEntity() declares a string timestamp, so pass one explicitly.
+    $entity = $this->loadEntity($plan_id, (string) $timestamp);
     if ($entity) {
       // Modify entity.
-      unset($field_values['id']);
       foreach ($field_values as $key => $value) {
         $entity->set($key, $value);
       }
+      $field_values['id'] = $entity->id();
     }
     else {
       // Create new entity.
diff --git a/modules/harvest/src/HarvestService.php b/modules/harvest/src/HarvestService.php
index 6d3ba1ac19..2f6ae3ac77 100644
--- a/modules/harvest/src/HarvestService.php
+++ b/modules/harvest/src/HarvestService.php
@@ -208,7 +208,7 @@ public function revertHarvest($id) {
   public function runHarvest($plan_id) {
     $harvester = $this->getHarvester($plan_id);
 
-    $run_id = (string) time();
+    $timestamp = (string) time();
     $result = $harvester->harvest();
 
     if (empty($result['status']['extracted_items_ids'])) {
@@ -218,8 +218,8 @@ public function runHarvest($plan_id) {
       $this->getOrphanIdsFromResult($plan_id, $result['status']['extracted_items_ids']);
     $this->processOrphanIds($result['status']['orphan_ids']);
 
-    $result['identifier'] = $run_id;
-    $this->runRepository->storeRun($result, $plan_id, $run_id);
+    $result['identifier'] = $timestamp;
+    $this->runRepository->storeRun($result, $plan_id, $timestamp);
 
     return $result;
   }
@@ -229,15 +229,15 @@ public function runHarvest($plan_id) {
    *
    * @param string $plan_id
    *   The harvest plan ID.
-   * @param string $run_id
-   *   The harvest run ID.
+   * @param string $timestamp
+   *   The timestamp of the harvest_run.
    *
    * @return bool|string
    *   JSON-encoded run information for the given run, or FALSE if no matching
    *   runID is found.
    */
-  public function getHarvestRunInfo(string $plan_id, string $run_id): bool|string {
-    if ($info = $this->runRepository->retrieveRunJson($plan_id, $run_id)) {
+  public function getHarvestRunInfo(string $plan_id, string $timestamp): bool|string {
+    if ($info = $this->runRepository->retrieveRunJson($plan_id, $timestamp)) {
       return $info;
     }
     return FALSE;
@@ -248,14 +248,14 @@ public function getHarvestRunInfo(string $plan_id, string $run_id): bool|string
    *
    * @param string $plan_id
    *   Harvest plan ID.
-   * @param string $run_id
-   *   Harvest run ID.
+   * @param string $timestamp
+   *   Harvest run timestamp.
    *
    * @return array
    *   Array of status info from the run.
    */
-  public function getHarvestRunResult(string $plan_id, string $run_id): array {
-    if ($entity = $this->runRepository->loadEntity($plan_id, $run_id)) {
+  public function getHarvestRunResult(string $plan_id, string $timestamp): array {
+    if ($entity = $this->runRepository->loadEntity($plan_id, $timestamp)) {
       return $entity->toResult();
     }
     else {
@@ -301,14 +301,13 @@ public function getRunIdsForHarvest(string $plan_id): array {
-   * Since the run record id is a timestamp, we can sort on the id.
+   * Uses the first id the repository returns; assumed newest-first (confirm).
    *
    * @param string $plan_id
-   *   The harvest identifier.
+   *   The harvest plan identifier.
    *
    * @return string
-   *   The most recent harvest run record identifier.
+   *   The entity id of the most recent harvest run.
    */
   public function getLastHarvestRunId(string $plan_id): string {
     $run_ids = $this->runRepository->retrieveAllRunIds($plan_id);
-    rsort($run_ids);
     return reset($run_ids);
   }
 
diff --git a/modules/harvest/src/HarvestUtility.php b/modules/harvest/src/HarvestUtility.php
index 54cee32996..5a8e03b5cb 100644
--- a/modules/harvest/src/HarvestUtility.php
+++ b/modules/harvest/src/HarvestUtility.php
@@ -217,10 +217,10 @@ public function harvestHashUpdate() {
    */
   public function convertRunTable(string $plan_id) {
     $old_runs_table = $this->storeFactory->getInstance('harvest_' . $plan_id . '_runs');
-    foreach ($old_runs_table->retrieveAll() as $id) {
-      if ($data = $old_runs_table->retrieve($id)) {
+    foreach ($old_runs_table->retrieveAll() as $timestamp) {
+      if ($data = $old_runs_table->retrieve($timestamp)) {
         // Explicitly decode the data as an array.
-        $this->runRepository->storeRun(json_decode($data, TRUE), $plan_id, $id);
+        $this->runRepository->storeRun(json_decode($data, TRUE), $plan_id, $timestamp);
       }
     }
   }