From 7fa499f3d111ef51ba86d3589ef7a504a61a12c6 Mon Sep 17 00:00:00 2001 From: Paul Mitchum Date: Thu, 16 Nov 2023 15:47:40 -0600 Subject: [PATCH] Clean up leftover harvest data tables (#4049) --- modules/harvest/drush.services.yml | 1 + modules/harvest/harvest.install | 24 +++ modules/harvest/harvest.services.yml | 6 + .../harvest/src/Commands/HarvestCommands.php | 58 ++++++- modules/harvest/src/HarvestService.php | 27 ++- modules/harvest/src/HarvestUtility.php | 155 ++++++++++++++++++ .../tests/src/Kernel/HarvestServiceTest.php | 121 ++++++++++++++ .../tests/src/Kernel/HarvestUtilityTest.php | 59 +++++++ ...ServiceTest.php => HarvestServiceTest.php} | 141 +--------------- 9 files changed, 444 insertions(+), 148 deletions(-) create mode 100644 modules/harvest/src/HarvestUtility.php create mode 100644 modules/harvest/tests/src/Kernel/HarvestServiceTest.php create mode 100644 modules/harvest/tests/src/Kernel/HarvestUtilityTest.php rename modules/harvest/tests/src/Unit/{ServiceTest.php => HarvestServiceTest.php} (61%) diff --git a/modules/harvest/drush.services.yml b/modules/harvest/drush.services.yml index 4bbb7c7b53..a1bd1730fd 100644 --- a/modules/harvest/drush.services.yml +++ b/modules/harvest/drush.services.yml @@ -4,5 +4,6 @@ services: arguments: - '@dkan.harvest.service' - '@dkan.harvest.logger_channel' + - '@dkan.harvest.utility' tags: - { name: drush.command } diff --git a/modules/harvest/harvest.install b/modules/harvest/harvest.install index 74825560b9..9b4796501b 100644 --- a/modules/harvest/harvest.install +++ b/modules/harvest/harvest.install @@ -4,6 +4,30 @@ * @file */ +function harvest_requirements($phase): array { + $requirements = []; + if ($phase == 'runtime') { + /** @var \Drupal\harvest\HarvestUtility $harvest_utility */ + if ($harvest_utility = \Drupal::service('dkan.harvest.utility')) { + if ($leftover_harvest_data_ids = $harvest_utility->findOrphanedHarvestDataIds()) { + $requirements['dkan harvest leftover data'] = [ + 'title' => t('DKAN Harvest Leftover Plan Data'), + 'value' => t('Leftover harvest data for plans: @plans', [ + '@plans' => implode(', ', $leftover_harvest_data_ids) + ]), + 'description' => t( + 'DKAN\'s harvest module has detected extra unneeded data tables. + You can remove them using this Drush command from the CLI: + drush dkan:harvest:cleanup' + ), + 'severity' => REQUIREMENT_WARNING, + ]; + } + } + } + return $requirements; +} + /** * Uninstall obsolete submodule harvest_dashboard. */ diff --git a/modules/harvest/harvest.services.yml b/modules/harvest/harvest.services.yml index cd80ec01cb..fa9a481ee5 100644 --- a/modules/harvest/harvest.services.yml +++ b/modules/harvest/harvest.services.yml @@ -7,6 +7,12 @@ services: - '@entity_type.manager' calls: - [ setLoggerFactory, [ '@logger.factory' ] ] + dkan.harvest.utility: + class: Drupal\harvest\HarvestUtility + arguments: + - '@dkan.harvest.service' + - '@dkan.harvest.storage.database_table' + - '@database' dkan.harvest.storage.database_table: class: Drupal\harvest\Storage\DatabaseTableFactory arguments: diff --git a/modules/harvest/src/Commands/HarvestCommands.php b/modules/harvest/src/Commands/HarvestCommands.php index 0f456b5254..053546d90c 100644 --- a/modules/harvest/src/Commands/HarvestCommands.php +++ b/modules/harvest/src/Commands/HarvestCommands.php @@ -3,6 +3,7 @@ namespace Drupal\harvest\Commands; use Drupal\Core\Logger\LoggerChannelInterface; +use Drupal\harvest\HarvestUtility; use Drupal\harvest\Load\Dataset; use Drupal\harvest\HarvestService; use Drush\Commands\DrushCommands; @@ -26,14 +27,26 @@ class HarvestCommands extends DrushCommands { */ protected $harvestService; + /** + * Harvest utility service. + * + * @var \Drupal\harvest\HarvestUtility + */ + protected HarvestUtility $harvestUtility; + /** * Constructor. */ - public function __construct(HarvestService $service, LoggerChannelInterface $logger) { + public function __construct( + HarvestService $service, + LoggerChannelInterface $logger, + HarvestUtility $harvestUtility + ) { parent::__construct(); // @todo passing via arguments doesn't seem play well with drush.services.yml $this->harvestService = $service; $this->logger = $logger; + $this->harvestUtility = $harvestUtility; } /** @@ -51,7 +64,7 @@ function ($id) { return [$id]; }, $this->harvestService->getAllHarvestIds() - ); + ); (new Table(new ConsoleOutput()))->setHeaders(['plan id'])->setRows($rows)->render(); } @@ -347,6 +360,47 @@ public function orphanDatasets(string $harvestId) : int { } } + /** + * Report and cleanup harvest data which may be cluttering your database. + * + * Will print a report. Add -y or --no-interaction to automatically perform + * this cleanup. + * + * @command dkan:harvest:cleanup + * + * @return int + * Bash status code. + * + * @bootstrap full + */ + public function harvestCleanup(): int { + $logger = $this->logger(); + $orphaned = $this->harvestUtility->findOrphanedHarvestDataIds(); + if ($orphaned) { + $logger->notice('Detected leftover harvest data for these plans: ' . implode(', ', $orphaned)); + if ($this->io()->confirm('Do you want to remove this data?', FALSE)) { + $this->cleanupHarvestDataTables($orphaned); + } + } + else { + $logger->notice('No leftover harvest data detected.'); + } + return DrushCommands::EXIT_SUCCESS; + } + + /** + * Perform the harvest data table cleanup. + * + * @param array $plan_ids + * An array of plan identifiers to clean up. + */ + protected function cleanupHarvestDataTables(array $plan_ids) : void { + foreach ($plan_ids as $plan_id) { + $this->logger()->notice('Cleaning up: ' . $plan_id); + $this->harvestUtility->destructOrphanTables($plan_id); + } + } + /** * Throw error if Harvest ID does not exist. * diff --git a/modules/harvest/src/HarvestService.php b/modules/harvest/src/HarvestService.php index 45e93d5082..b7cb445fc0 100644 --- a/modules/harvest/src/HarvestService.php +++ b/modules/harvest/src/HarvestService.php @@ -52,7 +52,7 @@ class HarvestService implements ContainerInjectionInterface { */ public static function create(ContainerInterface $container) { return new self( - $container->get("dkan.harvest.storage.database_table"), + $container->get('dkan.harvest.storage.database_table'), $container->get('dkan.metastore.service'), $container->get('entity_type.manager') ); @@ -129,17 +129,26 @@ public function registerHarvest($plan) { /** * Deregister harvest. * - * @param string $id - * Id. + * @param string $plan_id + * Plan identifier. * * @return bool - * Boolean. + * Whether this happened successfully. */ - public function deregisterHarvest(string $id) { - - $plan_store = $this->storeFactory->getInstance("harvest_plans"); - - return $plan_store->remove($id); + public function deregisterHarvest(string $plan_id) { + // Remove all the support tables for this plan id. + foreach ([ + 'harvest_' . $plan_id . '_items', + 'harvest_' . $plan_id . '_hashes', + 'harvest_' . $plan_id . '_runs', + ] as $table_name) { + /** @var \Drupal\common\Storage\DatabaseTableInterface $store */ + $store = $this->storeFactory->getInstance($table_name); + $store->destruct(); + } + // Remove the plan id from the harvest_plans table. + $plan_store = $this->storeFactory->getInstance('harvest_plans'); + return $plan_store->remove($plan_id); } /** diff --git a/modules/harvest/src/HarvestUtility.php b/modules/harvest/src/HarvestUtility.php new file mode 100644 index 0000000000..179c027bcc --- /dev/null +++ b/modules/harvest/src/HarvestUtility.php @@ -0,0 +1,155 @@ +get('dkan.harvest.service'), + $container->get('dkan.harvest.storage.database_table'), + $container->get('database'), + ); + } + + /** + * Constructor. + */ + public function __construct( + HarvestService $harvestService, + DatabaseTableFactory $storeFactory, + Connection $connection + ) { + $this->harvestService = $harvestService; + $this->storeFactory = $storeFactory; + $this->connection = $connection; + } + + /** + * Get the plan ID from a given harvest table name. + * + * Harvest table names are assumed to look like this: + * harvest_ID_that_might_have_underscores_[something]. For example: + * 'harvest_ABC_123_runs'. + * + * @param string $table_name + * The table name. + * + * @return string + * The ID gleaned from the table name. If no ID could be gleaned, returns + * an empty string. + */ + public static function planIdFromTableName(string $table_name): string { + $name_explode = explode('_', $table_name); + if (count($name_explode) < 3) { + return ''; + } + // Remove first and last item. + array_shift($name_explode); + array_pop($name_explode); + return implode('_', $name_explode); + } + + /** + * Find harvest IDs with data tables that aren't in the harvest_plans table. + * + * @return array + * Array of orphan plan ids, as both key and value. Empty if there are no + * orphaned plan ids. + */ + public function findOrphanedHarvestDataIds(): array { + $existing_plans = $this->harvestService->getAllHarvestIds(); + + $table_names = $this->findAllHarvestDataTables(); + + $orphan_ids = []; + // Find IDs that are not in the existing plans. + foreach ($table_names as $table_name) { + $plan_id = static::planIdFromTableName($table_name); + if (!in_array($plan_id, $existing_plans)) { + $orphan_ids[$plan_id] = $plan_id; + } + } + return $orphan_ids; + } + + /** + * Find all the potential harvest data tables names in the database. + * + * @return array + * All the table names that might be harvest data tables. + */ + protected function findAllHarvestDataTables(): array { + $tables = []; + foreach ([ + // @todo Figure out an expression for harvest_%_thing, since underscore + // is a special character. + 'harvest%runs', + 'harvest%items', + 'harvest%hashes', + ] as $table_expression) { + if ($found_tables = $this->connection->schema()->findTables($table_expression)) { + $tables = array_merge($tables, $found_tables); + } + } + return $tables; + } + + /** + * Remove existing harvest data tables for the given plan identifier. + * + * Will not remove data tables for existing plans. + * + * @param string $plan_id + * Plan identifier to work with. + */ + public function destructOrphanTables(string $plan_id): void { + if (!in_array($plan_id, $this->harvestService->getAllHarvestIds())) { + foreach ([ + 'harvest_' . $plan_id . '_runs', + 'harvest_' . $plan_id . '_items', + 'harvest_' . $plan_id . '_hashes', + ] as $table) { + $this->storeFactory->getInstance($table)->destruct(); + } + } + } + +} diff --git a/modules/harvest/tests/src/Kernel/HarvestServiceTest.php b/modules/harvest/tests/src/Kernel/HarvestServiceTest.php new file mode 100644 index 0000000000..e16401972a --- /dev/null +++ b/modules/harvest/tests/src/Kernel/HarvestServiceTest.php @@ -0,0 +1,121 @@ +container->get('dkan.harvest.service'); + $storage_factory = $this->container->get('dkan.harvest.storage.database_table'); + + $plan = (object) [ + 'identifier' => 'test_plan', + 'extract' => (object) [ + 'type' => DataJson::class, + 'uri' => 'file://' . __DIR__ . '/../../files/data.json', + ], + 'transforms' => [], + 'load' => (object) [ + 'type' => Simple::class, + ], + ]; + + // Register a harvest. + $result = $harvest_service->registerHarvest($plan); + + $this->assertEquals('test_plan', $result); + + $storedTestPlan = json_decode($storage_factory->getInstance('harvest_plans')->retrieve('test_plan')); + $this->assertEquals('test_plan', $storedTestPlan->identifier); + + // Run a harvest. + $result = $harvest_service->runHarvest('test_plan'); + + $this->assertEquals('SUCCESS', $result['status']['extract']); + $this->assertEquals(2, count($result['status']['extracted_items_ids'])); + $this->assertEquals(json_encode(['NEW', 'NEW']), json_encode(array_values($result['status']['load']))); + + $storedObject = $storage_factory->getInstance('harvest_test_plan_items')->retrieve('cedcd327-4e5d-43f9-8eb1-c11850fa7c55'); + $this->assertTrue(is_string($storedObject)); + $storedObject = json_decode($storedObject); + $this->assertTrue(is_object($storedObject)); + + // Run harvest again, no changes. + $result = $harvest_service->runHarvest('test_plan'); + + $this->assertEquals('SUCCESS', $result['status']['extract']); + $this->assertEquals(2, count($result['status']['extracted_items_ids'])); + $this->assertEquals(json_encode(['UNCHANGED', 'UNCHANGED']), json_encode(array_values($result['status']['load']))); + + // Run harvest with changes. + $plan2 = clone $plan; + $plan2->extract->uri = 'file://' . __DIR__ . '/../../files/data2.json'; + $harvest_service->registerHarvest($plan2); + $result = $harvest_service->runHarvest('test_plan'); + + $this->assertEquals('SUCCESS', $result['status']['extract']); + $this->assertEquals(2, count($result['status']['extracted_items_ids'])); + $this->assertEquals(json_encode(['UPDATED', 'UNCHANGED']), json_encode(array_values($result['status']['load']))); + + $storedObject = $storage_factory->getInstance('harvest_test_plan_items')->retrieve('cedcd327-4e5d-43f9-8eb1-c11850fa7c55'); + $this->assertTrue(is_string($storedObject)); + $storedObject = json_decode($storedObject); + $this->assertTrue(is_object($storedObject)); + $this->assertEquals('Florida Bike Lanes 2', $storedObject->title); + + /** @var \Drupal\Core\Database\Schema $schema */ + $schema = $this->container->get('database')->schema(); + + // Reverting the harvest should leave behind the items and hashes tables, + // but remove the runs table. + $harvest_service->revertHarvest('test_plan'); + foreach ([ + 'harvest_test_plan_items', + 'harvest_test_plan_hashes', + ] as $storageId) { + $this->assertTrue($schema->tableExists($storageId), $storageId . ' does not exist.'); + } + $this->assertFalse($schema->tableExists('harvest_test_plan_runs', 'harvest_test_plan_runs exists.')); + + // All these tables should be empty. The runs table will be re-created + // as a side effect of calling retrieveAll() on it. + $storageTables = [ + 'harvest_test_plan_items', + 'harvest_test_plan_hashes', + 'harvest_test_plan_runs', + ]; + foreach ($storageTables as $storageId) { + $this->assertCount(0, $storage_factory->getInstance($storageId)->retrieveAll()); + } + + // Deregister harvest. + $harvest_service->deregisterHarvest('test_plan'); + $this->assertNull($harvest_service->getHarvestPlan('test_plan')); + $this->assertNotContains('test_plan', $harvest_service->getAllHarvestIds()); + // Check the data tables. They should have been removed. + foreach ($storageTables as $storageId) { + $this->assertFalse($schema->tableExists($storageId), $storageId . ' exists.'); + } + } + +} diff --git a/modules/harvest/tests/src/Kernel/HarvestUtilityTest.php b/modules/harvest/tests/src/Kernel/HarvestUtilityTest.php new file mode 100644 index 0000000000..13396f95bc --- /dev/null +++ b/modules/harvest/tests/src/Kernel/HarvestUtilityTest.php @@ -0,0 +1,59 @@ +container->get('dkan.harvest.service'); + + // Use a database table to store a fake plan so we don't have to actually + // store a plan. + /** @var \Drupal\harvest\Storage\DatabaseTableFactory $table_factory */ + $table_factory = $this->container->get('dkan.harvest.storage.database_table'); + /** @var \Drupal\harvest\Storage\DatabaseTable $plan_storage */ + $plan_storage = $table_factory->getInstance('harvest_plans'); + $plan_storage->store('{we do not have a plan}', $existing_plan_id); + + // Getting all harvest run info for a non-existent plan results in a run + // table being created. + // @todo This is probably something that needs fixing. + $harvest_service->getAllHarvestRunInfo($orphan_plan_id); + + /** @var \Drupal\harvest\HarvestUtility $harvest_utility */ + $harvest_utility = $this->container->get('dkan.harvest.utility'); + $orphaned = $harvest_utility->findOrphanedHarvestDataIds(); + $this->assertNotContains($existing_plan_id, $orphaned); + $this->assertEquals([$orphan_plan_id => $orphan_plan_id], $orphaned); + + // Remove the orphans. + foreach ($orphaned as $orphan) { + $harvest_utility->destructOrphanTables($orphan); + } + $this->assertEmpty( + $this->container->get('database')->schema() + ->findTables('harvest_' . $orphan_plan_id . '%') + ); + } + +} diff --git a/modules/harvest/tests/src/Unit/ServiceTest.php b/modules/harvest/tests/src/Unit/HarvestServiceTest.php similarity index 61% rename from modules/harvest/tests/src/Unit/ServiceTest.php rename to modules/harvest/tests/src/Unit/HarvestServiceTest.php index 5264778de7..53dd5d6efb 100644 --- a/modules/harvest/tests/src/Unit/ServiceTest.php +++ b/modules/harvest/tests/src/Unit/HarvestServiceTest.php @@ -2,8 +2,6 @@ namespace Drupal\Tests\harvest\Unit; -use Contracts\FactoryInterface; -use Contracts\Mock\Storage\Memory; use Drupal\Component\DependencyInjection\Container; use Drupal\Core\Entity\EntityTypeManager; use Drupal\Core\Logger\LoggerChannelFactory; @@ -14,112 +12,21 @@ use Drupal\harvest\Storage\DatabaseTableFactory; use Drupal\metastore\MetastoreService; use Drupal\node\NodeStorage; -use Harvest\ETL\Extract\DataJson; -use Harvest\ETL\Load\Simple; use MockChain\Chain; use MockChain\Options; use MockChain\Sequence; use PHPUnit\Framework\TestCase; -use Symfony\Component\DependencyInjection\ContainerInterface; /** + * @covers \Drupal\harvest\HarvestService * @coversDefaultClass \Drupal\harvest\HarvestService + * + * @group dkan * @group harvest */ -class ServiceTest extends TestCase { +class HarvestServiceTest extends TestCase { use ServiceCheckTrait; - private $storageFactory; - - /** - * - */ - public function test() { - $options = (new Options()) - ->add('dkan.harvest.storage.database_table', $this->getStorageFactory()) - ->add('dkan.metastore.service', $this->getMetastoreMockChain()) - ->add('entity_type.manager', $this->getEntityTypeManagerMockChain()) - ->index(0); - - $this->checkService('dkan.harvest.storage.database_table', 'harvest'); - - $container = (new Chain($this)) - ->add(ContainerInterface::class, 'get', $options) - ->getMock(); - - $service = HarvestService::create($container); - - $plan = (object) [ - 'identifier' => 'test_plan', - 'extract' => (object) [ - "type" => DataJson::class, - "uri" => "file://" . __DIR__ . '/../../files/data.json', - ], - 'transforms' => [], - 'load' => (object) [ - "type" => Simple::class, - ], - ]; - - // Register a harvest. - $result = $service->registerHarvest($plan); - - $this->assertEquals('test_plan', $result); - - $storedTestPlan = json_decode($this->getStorageFactory()->getInstance('harvest_plans')->retrieve('test_plan')); - $this->assertEquals('test_plan', $storedTestPlan->identifier); - - // Run a harvest. - $result = $service->runHarvest('test_plan'); - - $this->assertEquals("SUCCESS", $result['status']['extract']); - $this->assertEquals(2, count($result['status']['extracted_items_ids'])); - $this->assertEquals(json_encode(["NEW", "NEW"]), json_encode(array_values($result['status']['load']))); - - $storedObject = $this->getStorageFactory()->getInstance('harvest_test_plan_items')->retrieve("cedcd327-4e5d-43f9-8eb1-c11850fa7c55"); - $this->assertTrue(is_string($storedObject)); - $storedObject = json_decode($storedObject); - $this->assertTrue(is_object($storedObject)); - - // Run harvest again, no changes. - $result = $service->runHarvest('test_plan'); - - $this->assertEquals("SUCCESS", $result['status']['extract']); - $this->assertEquals(2, count($result['status']['extracted_items_ids'])); - $this->assertEquals(json_encode(["UNCHANGED", "UNCHANGED"]), json_encode(array_values($result['status']['load']))); - - // Run harvest with changes. - $plan2 = clone $plan; - $plan2->extract->uri = "file://" . __DIR__ . '/../../files/data2.json'; - $service->registerHarvest($plan2); - $result = $service->runHarvest('test_plan'); - - $this->assertEquals("SUCCESS", $result['status']['extract']); - $this->assertEquals(2, count($result['status']['extracted_items_ids'])); - $this->assertEquals(json_encode(["UPDATED", "UNCHANGED"]), json_encode(array_values($result['status']['load']))); - - $storedObject = $this->getStorageFactory()->getInstance('harvest_test_plan_items')->retrieve("cedcd327-4e5d-43f9-8eb1-c11850fa7c55"); - $this->assertTrue(is_string($storedObject)); - $storedObject = json_decode($storedObject); - $this->assertTrue(is_object($storedObject)); - $this->assertEquals("Florida Bike Lanes 2", $storedObject->title); - - // Revert harvest. - $service->revertHarvest('test_plan'); - $storageTypes = [ - 'harvest_test_plan_items', - 'harvest_test_plan_hashes', - 'harvest_test_plan_runs', - ]; - foreach ($storageTypes as $storageId) { - $this->assertEquals(0, count($this->getStorageFactory()->getInstance($storageId)->retrieveAll())); - } - - // Deregister harvest. - $service->deregisterHarvest('test_plan'); - $this->assertEquals(0, count($this->getStorageFactory()->getInstance('harvest_plans')->retrieveAll())); - } - /** * */ @@ -160,46 +67,6 @@ public function testGetHarvestRunInfo() { $this->assertFalse($result); } - /** - * Private. - */ - private function getStorageFactory() { - if (!isset($this->storageFactory)) { - $this->storageFactory = new class() implements FactoryInterface { - private $stores = []; - - /** - * Getter. - */ - public function getInstance(string $identifier, array $config = []) { - if (!isset($this->stores[$identifier])) { - $this->stores[$identifier] = new class() extends Memory { - - /** - * - */ - public function retrieveAll(): array { - return array_keys(parent::retrieveAll()); - } - - /** - * - */ - public function destruct() { - $this->storage = []; - } - - }; - } - return $this->stores[$identifier]; - } - - }; - } - - return $this->storageFactory; - } - /** * Private. */