Skip to content

Commit

Permalink
Improvements (#6)
Browse files Browse the repository at this point in the history
* Simplifying config schema, generalizing the "get id" function, better checks in factory and allowing access to factory properties, adding revert functionality to the harvester, and bringing compatibility back to php 7.1.
  • Loading branch information
fmizzell authored Jun 21, 2019
1 parent fb0aed7 commit 16d8c14
Show file tree
Hide file tree
Showing 16 changed files with 188 additions and 90 deletions.
50 changes: 4 additions & 46 deletions schema/schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"title": "Harvest Plan",
"required": [
"identifier",
"source",
"extract",
"load"
],
"properties": {
Expand All @@ -14,9 +14,9 @@
"title": "The plan's identifier",
"pattern": "^(.*)$"
},
"source": {
"extract": {
"type": "object",
"title": "The Source to harvest",
"title": "Extract",
"required": [
"type",
"uri"
Expand All @@ -40,25 +40,7 @@
"title": "The Transforms for the Harvest",
"additionalProperties": false,
"items": {
"anyOf": [
{
"type":"object",
"title": "The Items Schema",
"properties": {
"Filter": {
"type": "object",
"title": "The Filter to use on the harvest"
},
"Override": {
"type": "object",
"title": "The Filter to use on the harvest"
}
}
},
{
"type": "string"
}
]
"type": "string"
}
},
"load": {
Expand All @@ -68,30 +50,6 @@
"type"
],
"properties": {
"migrate": {
"type": "boolean",
"title": "Whether or not to fully pull in the source",
"default": false,
"examples": [
false
]
},
"collectionsToUpdate": {
"type": "array",
"title": "The Collections from the source to update in the catalog",
"description":"These collection should be defined i the active schema. ",
"items": {
"type": "string",
"examples": [
"dataset",
"organization",
"license",
"theme",
"keyword"
],
"pattern": "^(.*)$"
}
},
"type": {
"type": "string",
"title": "Class utilized to load the harvested data."
Expand Down
17 changes: 3 additions & 14 deletions src/ETL/Extract/DataJson.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
namespace Harvest\ETL\Extract;

use GuzzleHttp\Client;
use Harvest\Util;

class DataJson extends Extract {

Expand All @@ -13,7 +14,7 @@ function __construct($harvest_plan) {
}

public function getItems() {
$file_location = $this->harvest_plan->source->uri;
$file_location = $this->harvest_plan->extract->uri;
if (substr_count($file_location, "file://") > 0) {
$json = file_get_contents($file_location);
}
Expand All @@ -33,23 +34,11 @@ public function getItems() {

$datasets = [];
foreach ($data->dataset as $dataset) {
$datasets[$this->getDatasetId($dataset)] = $dataset;
$datasets[Util::getDatasetId($dataset)] = $dataset;
}
return $datasets;
}

private function getDatasetId(object $dataset): string
{
if (filter_var($dataset->identifier, FILTER_VALIDATE_URL)) {
$i = explode("/", $dataset->identifier);
$id = end($i);
}
else {
$id = $dataset->identifier;
}
return "{$id}";
}

private function httpRequest($uri) {
try {
$client = new Client();
Expand Down
6 changes: 2 additions & 4 deletions src/ETL/Extract/Extract.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,6 @@

namespace Harvest\ETL\Extract;

use Harvest\Log\MakeItLog;
use Harvest\Storage\Storage;

abstract class Extract implements IExtract {

/**
Expand All @@ -20,7 +17,8 @@ public function run(): array

$copy = array_values($items);
if (!is_object($copy[0])) {
throw new \Exception("The items extracted are not php objects: {json_encode($copy[0])}");
$item = json_encode($copy[0]);
throw new \Exception("The items extracted are not php objects: {$item}");
}

return $items;
Expand Down
40 changes: 24 additions & 16 deletions src/ETL/Factory.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@

class Factory {

private $harvestPlan;
private $itemStorage;
private $hashStorage;
public $harvestPlan;
public $itemStorage;
public $hashStorage;

public function __construct($harvest_plan, Storage $item_storage, Storage $hash_storage) {
if (self::validateHarvestPlan($harvest_plan)) {
Expand All @@ -22,27 +22,36 @@ public function __construct($harvest_plan, Storage $item_storage, Storage $hash_
}

public function get($type) {

if ($type == "extract") {
$class = $this->harvestPlan->source->type;
$class = $this->harvestPlan->extract->type;

if (!class_exists($class)) {
throw new \Exception("Class {$class} does not exist");
}

return new $class($this->harvestPlan);
}
elseif ($type == "load") {
$class = $this->harvestPlan->load->type;

if (!class_exists($class)) {
throw new \Exception("Class {$class} does not exist");
}

return new $class($this->harvestPlan, $this->hashStorage, $this->itemStorage);
}
elseif($type == "transforms") {
$transforms = [];
if ($this->harvestPlan->transforms) {
foreach ($this->harvestPlan->transforms as $info) {
$config = NULL;
$class = $info;

if (is_object($info)) {
$info = (array) $info;
$class = array_keys($info)[0];
}
else {
$class = $info;
if (!class_exists($class)) {
throw new \Exception("Class {$class} does not exist");
}

$transforms[] = $this->getOne($class, $this->harvestPlan);
}
}
Expand All @@ -58,14 +67,13 @@ private function getOne($class, $config = NULL) {
return new $class($config);
}

public static function validateHarvestPlan(object $harvest_plan) {
public static function validateHarvestPlan($harvest_plan) {
if (!is_object($harvest_plan)) {
throw new \Exception("Harvest plan must be a php object.");
}

$path_to_schema = __DIR__ . "/../../schema/schema.json";
$json_schema = file_get_contents($path_to_schema);
$schema = json_decode($json_schema);

if ($schema == null) {
throw new \Exception("the json-schema is invalid json.");
}

$data = $harvest_plan;
$schema = Schema::fromJsonString($json_schema);
Expand Down
4 changes: 2 additions & 2 deletions src/ETL/Load/Load.php
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ public function run($item) {

$this->saveItem($item);

$identifier = $item->identifier;
$identifier = Util::getDatasetId($item);

$hash = Util::generateHash($item);
$object = (object) ['harvest_plan_id' => $this->harvestPlan->identifier, "hash" => $hash];
Expand All @@ -41,7 +41,7 @@ public function run($item) {

private function itemState($item) {
if (isset($item->identifier)) {
$identifier = $item->identifier;
$identifier = Util::getDatasetId($item);

$json = $this->hashStorage->retrieve($identifier);

Expand Down
5 changes: 5 additions & 0 deletions src/ETL/Load/Simple.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,9 @@ protected function saveItem($item)
}
$this->itemStorage->store(json_encode($item), $id);
}

/**
 * Remove the entry stored under the given id from the item storage.
 *
 * @param mixed $id Identifier passed unchanged to the storage's remove().
 */
public function removeItem($id)
{
    $storage = $this->itemStorage;
    $storage->remove($id);
}
}
2 changes: 1 addition & 1 deletion src/ETL/Transform/Transform.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@ function __construct($harvest_plan) {
$this->harvestPlan = $harvest_plan;
}

abstract function run($item): object;
abstract function run($item);
}
28 changes: 26 additions & 2 deletions src/Harvester.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,23 @@ public function __construct(Factory $factory) {
$this->factory = $factory;
}

/**
 * Remove every item tracked in the factory's hash storage.
 *
 * For each id known to the hash storage, the item is removed through the
 * "load" object and its hash entry is removed afterwards. Nothing is removed
 * when the load object does not provide a removeItem() method.
 *
 * @return int Number of ids that were removed.
 */
public function revert() {
    $hashes = $this->factory->hashStorage;
    $storedIds = array_keys($hashes->retrieveAll());
    $loader = $this->factory->get("load");

    // Without removeItem() there is nothing this method can revert.
    if (!method_exists($loader, "removeItem")) {
        return 0;
    }

    $removed = 0;
    foreach ($storedIds as $storedId) {
        $loader->removeItem($storedId);
        $hashes->remove($storedId);
        $removed++;
    }

    return $removed;
}

public function harvest() {
$items = $this->extract();
$result['plan'] = json_encode($this->factory->harvestPlan);

if (is_string($items)) {
$result['status']['extract'] = "FAILURE";
Expand All @@ -32,7 +47,13 @@ public function harvest() {
$result['status']['transform'] = [];

$transformed_items = [];
$transformers = $this->factory->get("transforms");
try {
$transformers = $this->factory->get("transforms");
}
catch (\Exception $e) {
$result['errors']['transform']['loading'] = $e->getMessage();
}

if ($transformers) {
/** @var $transform Transform */
foreach ($items as $identifier => $item) {
Expand All @@ -56,6 +77,9 @@ public function harvest() {
}
}
}
else {
$transformed_items = $items;
}

if (empty($transformed_items)) {
return $result;
Expand All @@ -78,8 +102,8 @@ public function harvest() {
}

private function extract() {
$extract = $this->factory->get('extract');
try {
$extract = $this->factory->get('extract');
$items = $extract->run();
}
catch(\Exception $e) {
Expand Down
17 changes: 17 additions & 0 deletions src/Util.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,21 @@ public static function generateHash($item) {
return hash('sha256', serialize($item));
}


/**
 * Derive an id from a dataset's "identifier" property.
 *
 * When the identifier is a URL, the id is its last "/"-separated segment;
 * trailing slashes are ignored so that "http://host/datasets/abc/" yields
 * "abc" instead of an empty string. Any other identifier is used as is.
 * A dataset without an identifier yields an empty string.
 *
 * @param mixed $dataset The dataset; must be a php object.
 *
 * @return string The id, always converted to a string.
 *
 * @throws \Exception When $dataset is not an object.
 */
public static function getDatasetId($dataset): string
{
    if (!is_object($dataset)) {
        throw new \Exception("The dataset " . json_encode($dataset) . " is not an object.");
    }

    // Read defensively: a missing identifier becomes "" without raising an
    // undefined-property notice.
    $identifier = $dataset->identifier ?? '';

    if (filter_var($identifier, FILTER_VALIDATE_URL)) {
        // Drop trailing slashes first, otherwise the last segment is empty.
        $segments = explode("/", rtrim($identifier, "/"));
        $id = end($segments);
    }
    else {
        $id = $identifier;
    }

    return "{$id}";
}

}
30 changes: 30 additions & 0 deletions tests/ETL/Extract/ExtractTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
<?php


/**
 * Exercises the failure paths of Extract::run() through the stub extractors
 * declared in this file.
 */
class ExtractTest extends \PHPUnit\Framework\TestCase
{
    /**
     * Running an extractor that yields no items is expected to fail with the
     * "No Items were extracted." message.
     */
    public function testNoItems() {
        $this->expectExceptionMessage("No Items were extracted.");

        $extract = new TestExtract();
        $extract->run();
    }

    /**
     * Running an extractor whose first item is not an object is expected to
     * fail with a message that embeds the JSON-encoded item.
     */
    public function testNoObjects() {
        $item = json_encode("Hello World!!");
        $message = "The items extracted are not php objects: {$item}";
        $this->expectExceptionMessage($message);

        $extract = new TestExtractNoObjects();
        $extract->run();
    }
}

/**
 * Stub extractor whose getItems() returns an empty array.
 */
class TestExtract extends \Harvest\ETL\Extract\Extract {
    protected function getItems()
    {
        $items = [];

        return $items;
    }
}

/**
 * Stub extractor whose getItems() returns a single plain string instead of
 * an object.
 */
class TestExtractNoObjects extends \Harvest\ETL\Extract\Extract {
    protected function getItems()
    {
        $items = ["Hello World!!"];

        return $items;
    }
}
Loading

0 comments on commit 16d8c14

Please sign in to comment.