Skip to content

Commit

Permalink
add support for sha1/md5 provenance logging; related to #52
Browse files (browse the repository at this point in the history)
  • Loading branch information
Jorrit Poelen committed Nov 30, 2023
1 parent 0ca2f7f commit ea977fa
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 16 deletions.
39 changes: 23 additions & 16 deletions src/main/java/org/globalbioticinteractions/elton/cmd/CmdLog.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,13 @@
import bio.guoda.preston.Hasher;
import bio.guoda.preston.RefNodeConstants;
import bio.guoda.preston.RefNodeFactory;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.NullAppendable;
import org.apache.commons.io.output.NullOutputStream;
import org.apache.commons.rdf.api.IRI;
import org.apache.commons.rdf.api.Quad;
import org.apache.jena.tdb.store.Hash;
import org.eol.globi.data.NodeFactory;
import org.eol.globi.service.ResourceService;
import org.eol.globi.tool.NullImportLogger;
import org.eol.globi.util.ResourceServiceLocal;
import org.eol.globi.util.ResourceServiceLocalAndRemote;
import org.eol.globi.util.ResourceUtil;
import org.globalbioticinteractions.cache.CacheUtil;
import org.globalbioticinteractions.dataset.Dataset;
Expand All @@ -23,20 +19,17 @@
import org.globalbioticinteractions.dataset.DatasetRegistry;
import org.globalbioticinteractions.dataset.DatasetRegistryException;
import org.globalbioticinteractions.dataset.DatasetRegistryProxy;
import org.globalbioticinteractions.dataset.DatasetUtil;
import org.globalbioticinteractions.elton.util.DatasetRegistryUtil;
import org.globalbioticinteractions.elton.util.NodeFactoryNull;
import picocli.CommandLine;

import javax.validation.constraints.Null;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.net.URI;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.Collections;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
Expand All @@ -48,6 +41,13 @@
)
public class CmdLog extends CmdDefaultParams {

/**
 * Hash algorithm selected via the command-line option below.
 * Initialized to {@link HashType#sha256}, which is the value used
 * when the option is not provided.
 */
@CommandLine.Option(
names = {"--hash-algorithm", "--algo", "-a"},
description = "Hash algorithm used to generate primary content identifiers. Supported values: ${COMPLETION-CANDIDATES}."
)
private HashType hashType = HashType.sha256;


@Override
public void run() {
run(System.out);
Expand All @@ -64,15 +64,13 @@ void run(PrintStream out) {
public Dataset datasetFor(String namespace) throws DatasetRegistryException {
Dataset dataset = super.datasetFor(namespace);
return new DatasetProxy(dataset) {
ResourceService service = new LoggingResourceService(out, dataset);
ResourceService service = new LoggingResourceService(out, dataset, hashType);

public InputStream retrieve(URI resourcePath) throws IOException {
return service.retrieve(resourcePath);
}
};
}

;
};

NodeFactory nodeFactory = new NodeFactoryNull();
Expand All @@ -87,14 +85,14 @@ public InputStream retrieve(URI resourcePath) throws IOException {

private static class LoggingResourceService implements ResourceService {
private final PrintStream out;
private HashType sha256;
private HashType hashType;
private final ResourceService local;
private final AtomicReference<IRI> archiveContentId = new AtomicReference<>(null);

public LoggingResourceService(PrintStream out, ResourceService resourceService) {
public LoggingResourceService(PrintStream out, ResourceService resourceService, HashType hashType) {
this.out = out;
this.local = resourceService;
sha256 = HashType.sha256;
this.hashType = hashType;
}

@Override
Expand All @@ -104,7 +102,7 @@ public InputStream retrieve(URI uri) throws IOException {

private InputStream logVersion(URI uri, InputStream retrieve) throws IOException {
try {
final MessageDigest md = MessageDigest.getInstance(sha256.getAlgorithm());
final MessageDigest md = MessageDigest.getInstance(hashType.getAlgorithm());
final URI resource = local instanceof Dataset
? getLocationInDataset(uri, (Dataset) local) : uri;

Expand All @@ -122,7 +120,11 @@ private URI getLocationInDataset(URI uri, Dataset dataset) throws IOException {
resourceLocation = ResourceUtil.getAbsoluteResourceURI(archiveURI, uri);
} else {
if (this.archiveContentId.get() == null) {
IRI archiveContentId = Hasher.calcHashIRI(local.retrieve(archiveURI), NullOutputStream.NULL_OUTPUT_STREAM, sha256);
IRI archiveContentId = Hasher.calcHashIRI(
local.retrieve(archiveURI),
NullOutputStream.NULL_OUTPUT_STREAM,
hashType
);
this.archiveContentId.set(archiveContentId);
Quad quad = RefNodeFactory.toStatement(
RefNodeFactory.toIRI(archiveURI),
Expand Down Expand Up @@ -177,7 +179,7 @@ public void close() throws IOException {
Quad quad = RefNodeFactory.toStatement(
RefNodeFactory.toIRI(resourceLocation),
RefNodeConstants.HAS_VERSION,
Hasher.toHashIRI(md, HashType.sha256)
Hasher.toHashIRI(md, hashType)
);
if (isEOF.get() && !hasLogged.get()) {
out.println(quad.toString());
Expand All @@ -186,6 +188,11 @@ public void close() throws IOException {
}
}
}

/**
 * Sets the hash type stored in this object's {@code hashType} field,
 * replacing whatever value was assigned before.
 *
 * @param hashType the hash type to use from now on (the tests in this
 *                 change pass {@code HashType.sha1} and {@code HashType.md5})
 */
public void setHashType(HashType hashType) {
this.hashType = hashType;
}

}


Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.globalbioticinteractions.elton.cmd;

import bio.guoda.preston.HashType;
import org.junit.Test;

import java.io.ByteArrayOutputStream;
Expand Down Expand Up @@ -28,4 +29,36 @@ public void logTemplate() throws URISyntaxException {

}

/**
 * Runs the log command against the template dataset with sha1 selected and
 * verifies the three emitted statements: one for the zip archive and one for
 * each of globi.json and interactions.tsv inside it.
 */
@Test
public void logTemplateSha1() throws URISyntaxException {
    // Configure the command under test.
    CmdLog logCommand = new CmdLog();
    logCommand.setHashType(HashType.sha1);
    logCommand.setCacheDir(CmdTestUtil.cacheDirTest());
    logCommand.setNamespaces(Collections.singletonList("globalbioticinteractions/template-dataset"));

    // Capture everything the command prints.
    ByteArrayOutputStream captured = new ByteArrayOutputStream();
    logCommand.run(new PrintStream(captured));

    String expectedArchive = "<https://zenodo.org/record/207958/files/globalbioticinteractions/template-dataset-0.0.2.zip> <http://purl.org/pav/hasVersion> <hash://sha1/1bffa147ccca290482329e42a4f7d4c5db5f1d04> .";
    String expectedConfig = "<zip:hash://sha1/1bffa147ccca290482329e42a4f7d4c5db5f1d04!/globalbioticinteractions-template-dataset-e68f448/globi.json> <http://purl.org/pav/hasVersion> <hash://sha1/bec707471bcd75ebb69ae4b2a155ff64cfe7221a> .";
    String expectedInteractions = "<zip:hash://sha1/1bffa147ccca290482329e42a4f7d4c5db5f1d04!/globalbioticinteractions-template-dataset-e68f448/interactions.tsv> <http://purl.org/pav/hasVersion> <hash://sha1/ee1b4b58d9c356d5ef20e4076b929094516beb35> .";

    String[] statements = captured.toString().split("\n");
    assertThat(statements.length, is(3));
    assertThat(statements[0], is(expectedArchive));
    assertThat(statements[1], is(expectedConfig));
    assertThat(statements[2], is(expectedInteractions));
}

/**
 * Runs the log command against the template dataset with md5 selected and
 * verifies the three emitted statements: one for the zip archive and one for
 * each of globi.json and interactions.tsv inside it.
 */
@Test
public void logTemplateMD5() throws URISyntaxException {
    // Configure the command under test.
    CmdLog logCommand = new CmdLog();
    logCommand.setHashType(HashType.md5);
    logCommand.setCacheDir(CmdTestUtil.cacheDirTest());
    logCommand.setNamespaces(Collections.singletonList("globalbioticinteractions/template-dataset"));

    // Capture everything the command prints.
    ByteArrayOutputStream captured = new ByteArrayOutputStream();
    logCommand.run(new PrintStream(captured));

    String expectedArchive = "<https://zenodo.org/record/207958/files/globalbioticinteractions/template-dataset-0.0.2.zip> <http://purl.org/pav/hasVersion> <hash://md5/98ea358786947a5c3217a12a0810ddea> .";
    String expectedConfig = "<zip:hash://md5/98ea358786947a5c3217a12a0810ddea!/globalbioticinteractions-template-dataset-e68f448/globi.json> <http://purl.org/pav/hasVersion> <hash://md5/5d4fa61630858b39ddf527f390883487> .";
    String expectedInteractions = "<zip:hash://md5/98ea358786947a5c3217a12a0810ddea!/globalbioticinteractions-template-dataset-e68f448/interactions.tsv> <http://purl.org/pav/hasVersion> <hash://md5/7a53de3ea4bde18126a32f2f95b56843> .";

    String[] statements = captured.toString().split("\n");
    assertThat(statements.length, is(3));
    assertThat(statements[0], is(expectedArchive));
    assertThat(statements[1], is(expectedConfig));
    assertThat(statements[2], is(expectedInteractions));
}

}

0 comments on commit ea977fa

Please sign in to comment.