From 16d8c14e02031c8b97fcb37aa07d4a32f79aba21 Mon Sep 17 00:00:00 2001 From: fmizzell Date: Thu, 20 Jun 2019 21:51:22 -0500 Subject: [PATCH] Improvements (#6) * Simplifying config schema, generalizing the "get id" function, better checks in factory and allowing access to factory properties, adding revert functionality to the harvester, and bringing compatibility back to php 7.1. --- schema/schema.json | 50 +++---------------------------- src/ETL/Extract/DataJson.php | 17 ++--------- src/ETL/Extract/Extract.php | 6 ++-- src/ETL/Factory.php | 40 +++++++++++++++---------- src/ETL/Load/Load.php | 4 +-- src/ETL/Load/Simple.php | 5 ++++ src/ETL/Transform/Transform.php | 2 +- src/Harvester.php | 28 +++++++++++++++-- src/Util.php | 17 +++++++++++ tests/ETL/Extract/ExtractTest.php | 30 +++++++++++++++++++ tests/ETL/FactoryTest.php | 36 ++++++++++++++++++++++ tests/HarvesterTest.php | 17 +++++++++-- tests/UtilTest.php | 11 +++++++ tests/json/badplan.json | 2 +- tests/json/badplan2.json | 11 +++++++ tests/json/plan.json | 2 +- 16 files changed, 188 insertions(+), 90 deletions(-) create mode 100644 tests/ETL/Extract/ExtractTest.php create mode 100644 tests/ETL/FactoryTest.php create mode 100644 tests/UtilTest.php create mode 100644 tests/json/badplan2.json diff --git a/schema/schema.json b/schema/schema.json index daa0576..8690944 100644 --- a/schema/schema.json +++ b/schema/schema.json @@ -5,7 +5,7 @@ "title": "Harvest Plan", "required": [ "identifier", - "source", + "extract", "load" ], "properties": { @@ -14,9 +14,9 @@ "title": "The plan's identifier", "pattern": "^(.*)$" }, - "source": { + "extract": { "type": "object", - "title": "The Source to harvest", + "title": "Extract", "required": [ "type", "uri" @@ -40,25 +40,7 @@ "title": "The Transforms for the Harvest", "additionalProperties": false, "items": { - "anyOf": [ - { - "type":"object", - "title": "The Items Schema", - "properties": { - "Filter": { - "type": "object", - "title": "The Filter to use on the harvest" - }, - "Override": { - "type": "object", - "title": "The Filter to use on the harvest" - } - } - }, - { - "type": "string" - } - ] + "type": "string" } }, "load": { @@ -68,30 +50,6 @@ "type" ], "properties": { - "migrate": { - "type": "boolean", - "title": "Whether or not to fully pull in the source", - "default": false, - "examples": [ - false - ] - }, - "collectionsToUpdate": { - "type": "array", - "title": "The Collections from the source to update in the catalog", - "description":"These collection should be defined i the active schema. ", - "items": { - "type": "string", - "examples": [ - "dataset", - "organization", - "license", - "theme", - "keyword" - ], - "pattern": "^(.*)$" - } - }, "type": { "type": "string", "title": "Class utilized to load the harvested data." diff --git a/src/ETL/Extract/DataJson.php b/src/ETL/Extract/DataJson.php index 208780c..eaeb4a6 100644 --- a/src/ETL/Extract/DataJson.php +++ b/src/ETL/Extract/DataJson.php @@ -3,6 +3,7 @@ namespace Harvest\ETL\Extract; use GuzzleHttp\Client; +use Harvest\Util; class DataJson extends Extract { @@ -13,7 +14,7 @@ function __construct($harvest_plan) { } public function getItems() { - $file_location = $this->harvest_plan->source->uri; + $file_location = $this->harvest_plan->extract->uri; if (substr_count($file_location, "file://") > 0) { $json = file_get_contents($file_location); } @@ -33,23 +34,11 @@ public function getItems() { $datasets = []; foreach ($data->dataset as $dataset) { - $datasets[$this->getDatasetId($dataset)] = $dataset; + $datasets[Util::getDatasetId($dataset)] = $dataset; } return $datasets; } - private function getDatasetId(object $dataset): string - { - if (filter_var($dataset->identifier, FILTER_VALIDATE_URL)) { - $i = explode("/", $dataset->identifier); - $id = end($i); - } - else { - $id = $dataset->identifier; - } - return "{$id}"; - } - private function httpRequest($uri) { try { $client = new Client(); diff --git a/src/ETL/Extract/Extract.php b/src/ETL/Extract/Extract.php index 8fbab40..5d9b6d8 100644 --- a/src/ETL/Extract/Extract.php +++ b/src/ETL/Extract/Extract.php @@ -2,9 +2,6 @@ namespace Harvest\ETL\Extract; -use Harvest\Log\MakeItLog; -use Harvest\Storage\Storage; - abstract class Extract implements IExtract { /** @@ -20,7 +17,8 @@ public function run(): array $copy = array_values($items); if (!is_object($copy[0])) { - throw new \Exception("The items extracted are not php objects: {json_encode($copy[0])}"); + $item = json_encode($copy[0]); + throw new \Exception("The items extracted are not php objects: {$item}"); } return $items; diff --git a/src/ETL/Factory.php b/src/ETL/Factory.php index fecf5f8..3658f6c 100644 --- a/src/ETL/Factory.php +++ b/src/ETL/Factory.php @@ -9,9 +9,9 @@ class Factory { - private $harvestPlan; - private $itemStorage; - private $hashStorage; + public $harvestPlan; + public $itemStorage; + public $hashStorage; public function __construct($harvest_plan, Storage $item_storage, Storage $hash_storage) { if (self::validateHarvestPlan($harvest_plan)) { @@ -22,12 +22,23 @@ public function __construct($harvest_plan, Storage $item_storage, Storage $hash_ } public function get($type) { + if ($type == "extract") { - $class = $this->harvestPlan->source->type; + $class = $this->harvestPlan->extract->type; + + if (!class_exists($class)) { + throw new \Exception("Class {$class} does not exist"); + } + return new $class($this->harvestPlan); } elseif ($type == "load") { $class = $this->harvestPlan->load->type; + + if (!class_exists($class)) { + throw new \Exception("Class {$class} does not exist"); + } + return new $class($this->harvestPlan, $this->hashStorage, $this->itemStorage); } elseif($type == "transforms") { @@ -35,14 +46,12 @@ public function get($type) { if ($this->harvestPlan->transforms) { foreach ($this->harvestPlan->transforms as $info) { $config = NULL; + $class = $info; - if (is_object($info)) { - $info = (array) $info; - $class = array_keys($info)[0]; - } - else { - $class = $info; + if (!class_exists($class)) { + throw new \Exception("Class {$class} does not exist"); } + $transforms[] = $this->getOne($class, $this->harvestPlan); } } @@ -58,14 +67,13 @@ private function getOne($class, $config = NULL) { return new $class($config); } - public static function validateHarvestPlan(object $harvest_plan) { + public static function validateHarvestPlan($harvest_plan) { + if (!is_object($harvest_plan)) { + throw new \Exception("Harvest plan must be a php object."); + } + $path_to_schema = __DIR__ . "/../../schema/schema.json"; $json_schema = file_get_contents($path_to_schema); - $schema = json_decode($json_schema); - - if ($schema == null) { - throw new \Exception("the json-schema is invalid json."); - } $data = $harvest_plan; $schema = Schema::fromJsonString($json_schema); diff --git a/src/ETL/Load/Load.php b/src/ETL/Load/Load.php index e342af7..2be522a 100644 --- a/src/ETL/Load/Load.php +++ b/src/ETL/Load/Load.php @@ -29,7 +29,7 @@ public function run($item) { $this->saveItem($item); - $identifier = $item->identifier; + $identifier = Util::getDatasetId($item); $hash = Util::generateHash($item); $object = (object) ['harvest_plan_id' => $this->harvestPlan->identifier, "hash" => $hash]; @@ -41,7 +41,7 @@ public function run($item) { private function itemState($item) { if (isset($item->identifier)) { - $identifier = $item->identifier; + $identifier = Util::getDatasetId($item); $json = $this->hashStorage->retrieve($identifier); diff --git a/src/ETL/Load/Simple.php b/src/ETL/Load/Simple.php index b7b9d61..bcc3e53 100644 --- a/src/ETL/Load/Simple.php +++ b/src/ETL/Load/Simple.php @@ -12,4 +12,9 @@ protected function saveItem($item) } $this->itemStorage->store(json_encode($item), $id); } + + public function removeItem($id) + { + $this->itemStorage->remove($id); + } } \ No newline at end of file diff --git a/src/ETL/Transform/Transform.php b/src/ETL/Transform/Transform.php index 5d7350f..acda782 100644 --- a/src/ETL/Transform/Transform.php +++ b/src/ETL/Transform/Transform.php @@ -12,5 +12,5 @@ function __construct($harvest_plan) { $this->harvestPlan = $harvest_plan; } - abstract function run($item): object; + abstract function run($item); } diff --git a/src/Harvester.php b/src/Harvester.php index a53065e..62d9e85 100644 --- a/src/Harvester.php +++ b/src/Harvester.php @@ -16,8 +16,23 @@ public function __construct(Factory $factory) { $this->factory = $factory; } + public function revert() { + $ids = array_keys($this->factory->hashStorage->retrieveAll()); + $load = $this->factory->get("load"); + $counter = 0; + foreach($ids as $id) { + if (method_exists($load, "removeItem")) { + $load->removeItem($id); + $this->factory->hashStorage->remove($id); + $counter++; + } + } + return $counter; + } + public function harvest() { $items = $this->extract(); + $result['plan'] = json_encode($this->factory->harvestPlan); if (is_string($items)) { $result['status']['extract'] = "FAILURE"; @@ -32,7 +47,13 @@ public function harvest() { $result['status']['transform'] = []; $transformed_items = []; - $transformers = $this->factory->get("transforms"); + try { + $transformers = $this->factory->get("transforms"); + } + catch (\Exception $e) { + $result['errors']['transform']['loading'] = $e->getMessage(); + } + if ($transformers) { /** @var $transform Transform */ foreach ($items as $identifier => $item) { @@ -56,6 +77,9 @@ public function harvest() { } } } + else { + $transformed_items = $items; + } if (empty($transformed_items)) { return $result; @@ -78,8 +102,8 @@ public function harvest() { } private function extract() { - $extract = $this->factory->get('extract'); try { + $extract = $this->factory->get('extract'); $items = $extract->run(); } catch(\Exception $e) { diff --git a/src/Util.php b/src/Util.php index 11f9550..e3748fd 100644 --- a/src/Util.php +++ b/src/Util.php @@ -8,4 +8,21 @@ public static function generateHash($item) { return hash('sha256', serialize($item)); } + + public static function getDatasetId($dataset): string + { + if (!is_object($dataset)) { + throw new \Exception("The dataset " . json_encode($dataset) . " is not an object."); + } + + if (filter_var($dataset->identifier, FILTER_VALIDATE_URL)) { + $i = explode("/", $dataset->identifier); + $id = end($i); + } + else { + $id = $dataset->identifier; + } + return "{$id}"; + } + } \ No newline at end of file diff --git a/tests/ETL/Extract/ExtractTest.php b/tests/ETL/Extract/ExtractTest.php new file mode 100644 index 0000000..52f5ce5 --- /dev/null +++ b/tests/ETL/Extract/ExtractTest.php @@ -0,0 +1,30 @@ +expectExceptionMessage("No Items were extracted."); + (new TestExtract())->run(); + } + + public function testNoObjects() { + $item = json_encode("Hello World!!"); + $this->expectExceptionMessage("The items extracted are not php objects: {$item}"); + (new TestExtractNoObjects())->run(); + } +} + +class TestExtract extends \Harvest\ETL\Extract\Extract { + protected function getItems() + { + return []; + } +} + +class TestExtractNoObjects extends \Harvest\ETL\Extract\Extract { + protected function getItems() + { + return ["Hello World!!"]; + } +} \ No newline at end of file diff --git a/tests/ETL/FactoryTest.php b/tests/ETL/FactoryTest.php new file mode 100644 index 0000000..d1261c8 --- /dev/null +++ b/tests/ETL/FactoryTest.php @@ -0,0 +1,36 @@ +expectExceptionMessage("Class NoClass does not exist"); + $this->getFactory()->get('extract'); + } + + public function testTransform() { + $this->expectExceptionMessage("Class NoClass does not exist"); + $this->getFactory()->get('transforms'); + } + + public function testLoad() { + $this->expectExceptionMessage("Class NoClass does not exist"); + $this->getFactory()->get('load'); + } + + public function testPlanCheck() { + $this->expectExceptionMessage("Harvest plan must be a php object."); + new \Harvest\ETL\Factory("hello", new MemStore(), new MemStore()); + } + + private function getFactory() { + return new \Harvest\ETL\Factory($this->getPlan("badplan2"), new MemStore(), new MemStore()); + } + + private function getPlan($name) { + $path = __DIR__ . "/../json/{$name}.json"; + $content = file_get_contents($path); + return json_decode($content); + } + +} \ No newline at end of file diff --git a/tests/HarvesterTest.php b/tests/HarvesterTest.php index c300806..84e8215 100644 --- a/tests/HarvesterTest.php +++ b/tests/HarvesterTest.php @@ -22,7 +22,7 @@ public function basicData() { */ public function testBasic($uri) { $plan = $this->getPlan("plan"); - $plan->source->uri = $uri; + $plan->extract->uri = $uri; $item_store = new MemStore(); $hash_store = new MemStore(); @@ -49,7 +49,7 @@ public function testBasic($uri) { $this->assertEquals(10, count($item_store->retrieveAll())); if (substr_count($uri, "file://") > 0) { - $plan->source->uri = str_replace("data.json", "data2.json", $uri); + $plan->extract->uri = str_replace("data.json", "data2.json", $uri); $harvester = $this->getHarvester($plan, $item_store, $hash_store); $result = $harvester->harvest(); @@ -61,13 +61,24 @@ public function testBasic($uri) { $this->assertEquals(10, $interpreter->countProcessed()); $this->assertEquals(11, count($item_store->retrieveAll())); } + + $harvester->revert(); + + if (substr_count($uri, "file://") > 0) { + $expected = 1; + } + else { + $expected = 0; + } + + $this->assertEquals($expected, count($item_store->retrieveAll())); } public function testBadUri() { $uri = "httpp://asdfnde.exo/data.json"; $plan = $this->getPlan("plan"); - $plan->source->uri = $uri; + $plan->extract->uri = $uri; $harvester = $this->getHarvester($plan, new MemStore()); $result = $harvester->harvest(); diff --git a/tests/UtilTest.php b/tests/UtilTest.php new file mode 100644 index 0000000..3f419ce --- /dev/null +++ b/tests/UtilTest.php @@ -0,0 +1,11 @@ +expectExceptionMessage("The dataset " . json_encode($fake_dataset) . " is not an object."); + \Harvest\Util::getDatasetId($fake_dataset); + } +} \ No newline at end of file diff --git a/tests/json/badplan.json b/tests/json/badplan.json index c3eaf09..ad18685 100644 --- a/tests/json/badplan.json +++ b/tests/json/badplan.json @@ -1,6 +1,6 @@ { "identifier": "test", - "source": { + "extract": { "type": "DataJson", "uri": "http://demo.getdkan.com/data.json" }, diff --git a/tests/json/badplan2.json b/tests/json/badplan2.json new file mode 100644 index 0000000..51d3556 --- /dev/null +++ b/tests/json/badplan2.json @@ -0,0 +1,11 @@ +{ + "identifier": "test", + "extract": { + "type": "NoClass", + "uri": "http://demo.getdkan.com/data.json" + }, + "transforms": ["NoClass"], + "load": { + "type": "NoClass" + } +} \ No newline at end of file diff --git a/tests/json/plan.json b/tests/json/plan.json index 2717546..22c6020 100644 --- a/tests/json/plan.json +++ b/tests/json/plan.json @@ -1,6 +1,6 @@ { "identifier": "test", - "source": { + "extract": { "type": "\\Harvest\\ETL\\Extract\\DataJson", "uri": "" },