Skip to content

Commit

Permalink
DKAN-4287 Make harvest_run id not a primary key.
Browse files Browse the repository at this point in the history
  • Loading branch information
Steve Wirt authored and Steve Wirt committed Dec 11, 2024
1 parent 61e73f4 commit c5c9048
Show file tree
Hide file tree
Showing 11 changed files with 288 additions and 93 deletions.
161 changes: 158 additions & 3 deletions modules/harvest/harvest.install
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
<?php

use Drupal\Component\Render\FormattableMarkup;
use Drupal\Core\Database\Database;
use Drupal\harvest\Entity\HarvestRunRepository;
use Drupal\harvest\HarvestUtility;

/**
 * @file
 * Install, update, and requirements hooks for the harvest module.
 */
Expand Down Expand Up @@ -28,6 +33,64 @@ function harvest_requirements($phase): array {
return $requirements;
}

/**
 * Get the ids from the temp harvest run table.
 *
 * @param mixed $table_name_temp
 *   The name of the temp table (expected to be a string).
 *
 * @return array
 *   The ids of all the harvest runs in the table, sorted numerically from
 *   lowest to highest. The ids in the temp table double as timestamps, so this
 *   is oldest to newest. The sort preserves the keys assigned by fetchCol(),
 *   so keys are unique but not necessarily sequential in iteration order.
 */
function harvest_get_temp_run_ids($table_name_temp) : array {
  $ids = Database::getConnection()
    ->select($table_name_temp, 'hrt')
    ->fields('hrt', ['id'])
    ->execute()
    ->fetchCol(0);
  // Sort in PHP instead of using orderBy(): per the original author, a
  // database-side sort on this column ends up natural rather than numeric.
  asort($ids, SORT_NUMERIC);

  return $ids;
}

/**
 * Reads a single harvest row from the temp table.
 *
 * @param string $table_name_temp
 *   Name of the table to read from.
 * @param string $time_id
 *   The id to read from, which was also the timestamp.
 *
 * @return array
 *   Elements from the row ['id', 'harvest_plan_id', 'data', 'extract_status'],
 *   or an empty array when no row has the given id.
 */
function harvest_read_harvest_run(string $table_name_temp, string $time_id): array {
  $row = Database::getConnection()
    ->select($table_name_temp, 'hrt')
    ->fields('hrt', ['id', 'harvest_plan_id', 'data', 'extract_status'])
    ->condition('id', $time_id, '=')
    ->execute()
    ->fetchAssoc();

  // fetchAssoc() gives FALSE when nothing matched. Returning that would
  // violate the declared array return type, so normalize it to an empty array.
  return is_array($row) ? $row : [];
}

/**
 * Writes one harvest run row into the harvest_runs table.
 *
 * The legacy id is stored in the 'timestamp' column (cast to an integer); no
 * value is supplied for the table's own id column.
 *
 * @param string $id
 *   The legacy run id, which was also the timestamp of the run.
 * @param string $harvest_plan_id
 *   The harvest plan id the run belongs to.
 * @param string $data
 *   The run data, written unchanged to the 'data' column.
 * @param string $extract_status
 *   The extract status, written unchanged to the 'extract_status' column.
 */
function harvest_write_harvest_run(string $id, string $harvest_plan_id, string $data, string $extract_status): void {
  /** @var \Drupal\Core\Database\Connection $connection */
  $connection = \Drupal::service('database');
  $connection->insert('harvest_runs')
    ->fields([
      'timestamp' => (int) $id,
      'harvest_plan_id' => $harvest_plan_id,
      'data' => $data,
      'extract_status' => $extract_status,
    ])
    ->execute();
}

/**
* Uninstall obsolete submodule harvest_dashboard.
*/
Expand Down Expand Up @@ -95,8 +158,100 @@ function harvest_update_8007(&$sandbox) {
* This finishes the process started by harvest_update_8007.
*/
function harvest_update_8008(&$sandbox) {
  // Intentionally empty: the harvestRunsUpdate() call that used to live here
  // now runs in harvest_update_8011().
}

/**
 * Update harvest_run schema to add timestamp, uuid, and true id.
 *
 * Renames the existing harvest_runs table to harvest_runs_temp, uninstalls the
 * currently installed harvest_run entity type, and installs the entity type
 * again from its current definition. Copying the rows back out of the temp
 * table is left to harvest_update_8010().
 *
 * @see https://github.com/GetDKAN/dkan/issues/4287
 */
function harvest_update_8009(&$sandbox) {
  $table_name = 'harvest_runs';
  $table_name_temp = "{$table_name}_temp";
  $entity_type_name = 'harvest_run';

  $definition_update_manager = \Drupal::entityDefinitionUpdateManager();
  $entity_type_manager = \Drupal::entityTypeManager();

  // Move the table so we can rebuild from it.
  \Drupal::database()->schema()->renameTable($table_name, $table_name_temp);
  $messages = "Table {$table_name} moved to {$table_name_temp}. " . PHP_EOL;

  // Uninstall the original entity, using the definition that is currently
  // installed rather than the one in code.
  $original_type = $definition_update_manager->getEntityType($entity_type_name);
  $definition_update_manager->uninstallEntityType($original_type);
  $messages .= "Old harvest_run entity removed. " . PHP_EOL;

  // Clear cached definitions before reading the definition to install.
  $entity_type_manager->clearCachedDefinitions();
  $entity_type_def = $entity_type_manager->getDefinition($entity_type_name);
  $definition_update_manager->installEntityType($entity_type_def);
  $messages .= "New harvest_run entity installed. " . PHP_EOL;

  return $messages;
}

/**
 * Move data from temp table back into harvest_run.
 *
 * Runs as a batched update: the ids to copy are collected into the sandbox on
 * the first pass, then up to 25 rows are copied from harvest_runs_temp into
 * harvest_runs on each pass. Once no ids remain, the temp table is dropped.
 *
 * @see https://github.com/GetDKAN/dkan/issues/4287
 */
function harvest_update_8010(&$sandbox) {
  $table_name = 'harvest_runs';
  $table_name_temp = "{$table_name}_temp";

  if (!isset($sandbox['total'])) {
    // Sandbox has not been initiated, so initiate it.
    $sandbox['items_to_process'] = harvest_get_temp_run_ids($table_name_temp);
    $sandbox['total'] = count($sandbox['items_to_process']);
    $sandbox['current'] = 0;
  }

  // Process them in batches of 25. Keys are preserved so each processed item
  // can be removed from the sandbox list by its key.
  $harvest_runs_batch = array_slice($sandbox['items_to_process'], 0, 25, TRUE);
  foreach ($harvest_runs_batch as $key => $time_id) {
    // Load the old row and write it as a new harvest run.
    $row = harvest_read_harvest_run($table_name_temp, $time_id);
    harvest_write_harvest_run($row['id'], $row['harvest_plan_id'], $row['data'], $row['extract_status']);
    // The item has been processed, remove it from the array.
    unset($sandbox['items_to_process'][$key]);
  }

  // Decide completion from what is left to do instead of comparing the result
  // of a division against 1. This also covers an empty temp table without
  // dividing by zero.
  $remaining = count($sandbox['items_to_process']);
  $is_finished = ($remaining === 0);
  $sandbox['current'] = $sandbox['total'] - $remaining;
  $sandbox['#finished'] = $is_finished ? 1 : ($sandbox['current'] / $sandbox['total']);

  $vars = [
    '@completed' => $sandbox['current'],
    '@total' => $sandbox['total'],
  ];
  $messages = t('Processed: @completed/@total.', $vars) . PHP_EOL;

  if ($is_finished) {
    // The update of the harvest_runs is complete, so the temp table can go.
    $messages .= t('Data in harvest_runs updated to new schema:') . PHP_EOL;
    $dropped = \Drupal::database()->schema()->dropTable($table_name_temp);
    if ($dropped) {
      $messages .= t('Temporary table dropped.') . PHP_EOL;
    }
  }

  return $messages;
}

/**
 * Move entries from harvest_[ID]_runs to harvest_runs.
 *
 * This finishes the process started by harvest_update_8007 and re-runs 8008.
 */
function harvest_update_8011(&$sandbox) {
  /** @var \Drupal\harvest\HarvestUtility $harvest_utility */
  $harvest_utility = \Drupal::service('dkan.harvest.utility');
  $harvest_utility->harvestRunsUpdate();
  return 'Harvest runs coalesced into table harvest_runs.';
}
5 changes: 0 additions & 5 deletions modules/harvest/src/Commands/HarvestCommands.php
Original file line number Diff line number Diff line change
Expand Up @@ -221,11 +221,6 @@ public function runAll($options = ['new' => FALSE]) {
foreach ($plan_ids as $plan_id) {
$result = $this->harvestService->runHarvest($plan_id);
$runs_info[] = $result;
// Since run IDs are also one-second-resolution timestamps, we must wait
// one second before running the next harvest.
// @todo Remove this sleep when we've switched to a better system for
// timestamps.
sleep(1);
}
$this->renderHarvestRunsInfo($runs_info);
}
Expand Down
29 changes: 20 additions & 9 deletions modules/harvest/src/Entity/HarvestRun.php
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,12 @@
* admin_permission = "administer harvest_run",
* entity_keys = {
* "id" = "id",
* "label" = "id",
* "label" = "ID",
* },
* links = {
* "canonical" = "/harvest-run/{harvest_run}",
* },
* )
*
* @todo Convert to using microtime() or other better system for the timestamp/
* id.
*/
final class HarvestRun extends HarvestEntityBase implements HarvestRunInterface {

Expand All @@ -64,11 +61,25 @@ final class HarvestRun extends HarvestEntityBase implements HarvestRunInterface
public static function baseFieldDefinitions(EntityTypeInterface $entity_type) {
$base_fields = parent::baseFieldDefinitions($entity_type);

// The id is the unique ID for the harvest run, and also the timestamp at
// which the run occurred, generated by time().
$base_fields['id'] = static::getBaseFieldIdentifier(
new TranslatableMarkup('Harvest Run')
);
$base_fields['id'] = BaseFieldDefinition::create('integer')
->setLabel(t('ID'))
->setDescription(t('The ID of the Harvest Run entity.'))
->setDisplayOptions('view', [
'label' => 'inline',
'weight' => 0,
])
->setReadOnly(TRUE);

$base_fields['timestamp'] = BaseFieldDefinition::create('timestamp')
->setLabel(t('timestamp'))
->setDescription(t('The timestamp of when this harvest was run.'))
->setRequired(TRUE);

$base_fields['uuid'] = BaseFieldDefinition::create('uuid')
->setLabel(t('UUID'))
->setDescription(t('The unique identifier for this harvest_run'))
->setRequired(TRUE)
->setReadOnly(TRUE);

// Harvest plan id. This is the name of the harvest plan as seen in the UI.
$base_fields['harvest_plan_id'] = BaseFieldDefinition::create('string')
Expand Down
Loading

0 comments on commit c5c9048

Please sign in to comment.