Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrated DSpace-VIVO project into harvester #62

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/data
/logs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Harvesting mechanism ("OAI" = OAI-PMH).
type = OAI
# Prefix added to files produced during processing.
filePrefix=d7_
# Maximum number of pages to harvest (useful for testing or limiting a run).
harvestTotalCount=5
# Base URI prefix used to construct resource identifiers.
uriPrefix = https://dspace7.org/resource/
# Endpoint URL of the server from which metadata is harvested.
endpoint = https://api7.dspace.org/server/oai/request
# Directory where extracted data is stored.
etl.dir.extract=data_src_dspace7/extract
# Directory where transformed data is stored.
etl.dir.transform=data_src_dspace7/transform
# Harvest window: records created or modified between startDate and endDate.
startDate=2024-06-15
endDate=2025-01-01
# OAI-PMH set to harvest from.
set=com_10673_1190
# Metadata format to harvest ("DC" = Dublin Core).
metadataFormat=DC
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Specifies the type of data source, i.e. the harvesting mechanism.
# "OAI" selects the OAI-PMH protocol, which is supported by all DSpace versions.
# "RESTv7" is also supported when harvesting from DSpace 7 or later.
type = OAI

# A prefix to add to files during the processing to ensure uniqueness or identification.
# This could help in managing files for different datasets or harvesting jobs.
filePrefix = d7_

# The maximum number of pages to harvest during the operation.
# This is useful for testing or limiting the size of the operation.
harvestTotalCount = 5

# The base URI prefix used to construct resource identifiers.
uriPrefix = https://dspace7.org/resource/

# The endpoint URL of the OAI-PMH or REST server from which metadata is harvested.
# This is the entry point for the OAI-PMH protocol requests.
endpoint = https://api7.dspace.org/server/oai/request

# Directory where extracted data will be stored.
etl.dir.extract = data_src_dspace7/extract

# Directory where transformed data will be stored after applying processing or mapping.
# This ensures separation of raw and processed data.
etl.dir.transform = data_src_dspace7/transform

# Start date for the metadata harvesting process.
# Records modified or created on or after this date will be included.
startDate = 2024-06-15

# End date for the metadata harvesting process.
# Records modified or created before or on this date will be included.
endDate = 2025-01-01

# The specific OAI-PMH set to harvest from.
# Sets are used to group records in an OAI-PMH repository, e.g., by collection or subject.
set = com_10673_1190

# The metadata format to use for harvesting.
# "DC" stands for Dublin Core
# "DIM" stands for DSpace Internal Model
metadataFormat = DC
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
/list-of-all-expertises_demodspace.data
/list-of-all-expertises.data
/list-of-all-persons.data
/list-of-itemsType.data
/all_combined.nt
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"abstract" <http://purl.org/ontology/bibo/abstract>
"animation" <http://purl.org/ontology/bibo/AudioVisualDocument>
"article" <http://purl.org/ontology/bibo/Article>
"bachelorthesis" <http://purl.org/ontology/bibo/Thesis>
"book" <http://purl.org/ontology/bibo/Book>
"book chapter" <http://purl.org/ontology/bibo/BookSection>
"booklet" <http://purl.org/ontology/bibo/BookSection>
"conferenceobject" <http://purl.org/ontology/bibo/Conference>
"conference proceeding" <http://purl.org/ontology/bibo/Proceedings>
"conference proceedings" <http://purl.org/ontology/bibo/Proceedings>
"consensus study report" <http://purl.org/ontology/bibo/Report>
"consensus study report (concise)" <http://purl.org/ontology/bibo/Report>
"dataset" <http://vivoweb.org/ontology/core#Dataset>
"doctoralthesis" <http://purl.org/ontology/bibo/Thesis>
"editorial" <http://vivoweb.org/ontology/core#EditorialArticle>
"erratum" <http://purl.org/spar/fabio/Erratum>
"forum proceedings" <http://purl.org/ontology/bibo/Proceedings>
"image" <http://purl.org/ontology/bibo/Image>
"image, 3-d" <http://purl.org/ontology/bibo/Image>
"info:eu-repo/semantics/article" <http://purl.org/ontology/bibo/Article>
"info:eu-repo/semantics/publishedversion" <http://purl.org/ontology/bibo/Article>
"journal article" <http://purl.obolibrary.org/obo/IAO_0000013>
"learning object" <http://vivoweb.org/ontology/core#Video>
"magazine" <http://purl.org/ontology/bibo/Magazine>
"masterthesis" <http://purl.org/ontology/bibo/Thesis>
"meeting proceedings" <http://purl.org/ontology/bibo/Proceedings>
"other" <http://purl.org/ontology/bibo/Document>
"peer review report" <http://purl.org/ontology/bibo/Report>
"policy brief" <http://purl.org/ontology/bibo/Report>
"policymakers’ booklet" <http://purl.org/ontology/bibo/Report>
"presentation" <http://vivoweb.org/ontology/core#Presentation>
"proceedings report" <http://purl.org/ontology/bibo/Report>
"recording, musical" <http://purl.org/ontology/bibo/AudioDocument>
"report" <http://purl.org/ontology/bibo/Report>
"research article" <http://purl.org/ontology/bibo/AcademicArticle>
"retraction" <http://purl.org/spar/fabio/Comment>
"review article" <http://vivoweb.org/ontology/core#Review>
"software" <http://purl.obolibrary.org/obo/ERO_0000071>
"statement" <http://purl.org/ontology/bibo/Issue>
"survey report" <http://purl.org/ontology/bibo/Report>
"symposium proceedings" <http://purl.org/ontology/bibo/Proceedings>
"technical report" <http://purl.org/ontology/bibo/Report>
"thesis" <http://purl.org/ontology/bibo/Thesis>
"video" <http://vivoweb.org/ontology/core#Video>
"workshop proceedings" <http://purl.org/ontology/bibo/Proceedings>
"text" <http://purl.org/ontology/bibo/Manuscript>
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
PREFIX bibo: <http://purl.org/ontology/bibo/>
PREFIX c4o: <http://purl.org/spar/c4o/>
PREFIX cito: <http://purl.org/spar/cito/>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX event: <http://purl.org/NET/c4dm/event.owl#>
PREFIX fabio: <http://purl.org/spar/fabio/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX geo: <http://aims.fao.org/aos/geopolitical.owl#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX ocrer: <http://purl.org/net/OCRe/research.owl#>
PREFIX ocresd: <http://purl.org/net/OCRe/study_design.owl#>
PREFIX ocresp: <http://purl.org/net/OCRe/study_protocol.owl#>
PREFIX ocresst: <http://purl.org/net/OCRe/statistics.owl#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX ro: <http://purl.obolibrary.org/obo/ro.owl#>
PREFIX scires: <http://vivoweb.org/ontology/scientific-research#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX swo: <http://www.ebi.ac.uk/efo/swo/>
PREFIX swrlb: <http://www.w3.org/2003/11/swrlb#>
PREFIX swrl: <http://www.w3.org/2003/11/swrl#>
PREFIX vann: <http://purl.org/vocab/vann/>
PREFIX vcard: <http://www.w3.org/2006/vcard/ns#>
PREFIX vitro: <http://vitro.mannlib.cornell.edu/ns/vitro/0.7#>
PREFIX vitro-public: <http://vitro.mannlib.cornell.edu/ns/vitro/public#>
PREFIX vivo: <http://vivoweb.org/ontology/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/*.json
/data/
/ETL-migration.sh
etl.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/bin/bash

###################################################################
# Script Name : 00-env.sh
# Description : This file is used to define the environment variables
# needed to run the extract/transform/load (ETL)
# process of dspace2vivo
# Args :
# Author : Michel Héon PhD
# Institution : Université du Québec à Montréal
# Copyright : Université du Québec à Montréal (c) 2022
# Email : [email protected]
###################################################################
# Scripts root directory (physical path of the directory holding this file)
export LOC_SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd -P)"
###################################################################
## Root variables
# Load cleanup.sh from this script's own directory so that sourcing 00-env.sh
# works from any working directory. A bare `source cleanup.sh` is resolved
# through PATH and then the current directory, which fails when the caller is
# somewhere else and LOC_SCRIPT_DIR is not on PATH yet. The bare form is kept
# as a fallback for setups that provide cleanup.sh through PATH.
if [ -f "$LOC_SCRIPT_DIR/cleanup.sh" ]; then
    source "$LOC_SCRIPT_DIR/cleanup.sh"
else
    source cleanup.sh
fi
# Build output directory, five levels above this script.
# NOTE: if that directory does not exist, `cd` fails and `pwd -P` reports the
# caller's current directory instead.
export DEPLOY="$(cd "$LOC_SCRIPT_DIR/../../../../../build"; pwd -P)"
export LIB="$DEPLOY/dependency"

export VIVO_APP_NAME=vivo

export VIVO_URL=http://localhost:8080/$VIVO_APP_NAME
# CATALINA_HOME, SOLR_DIR and DSPACE_HOME are expected to come from the
# caller's environment; they are not set in this file.
export PATH=$CATALINA_HOME/bin:$SOLR_DIR/bin:$DSPACE_HOME/bin:$PATH

###################################################################
## Extract username and password
# etl.properties is looked up in the current directory first (the original
# behaviour). When it is not there, the copy next to this script is used, so
# the result no longer depends on the caller's working directory.
export RUNTIME_PROP=etl.properties
if ! test -f "$RUNTIME_PROP" && test -f "${LOC_SCRIPT_DIR:-.}/etl.properties"; then
    export RUNTIME_PROP="$LOC_SCRIPT_DIR/etl.properties"
fi

# Print the value of the last non-comment line of $RUNTIME_PROP that matches
# the grep pattern $1.
#   - comment lines (starting with '#') are ignored;
#   - every space is removed, as before, so values cannot contain spaces;
#   - everything after the FIRST '=' is kept, so values may contain '='.
_etl_read_prop () {
    grep -v '^[[:space:]]*#' < "$RUNTIME_PROP" | grep "$1" | tr -d ' ' | cut -f 2- -d '=' | tail -n 1
}

if test -f "$RUNTIME_PROP"; then
    # A variable that is already set (even to the empty string) wins over the
    # file. "${VAR+x}" is the portable spelling of `[ -v VAR ]`, which needs
    # bash >= 4.2.
    [ -n "${ROOT_PASSWD+x}" ] || export ROOT_PASSWD="$(_etl_read_prop 'rootUser.password')"
    [ -n "${ROOT_USER+x}" ] || export ROOT_USER="$(_etl_read_prop 'rootUser.emailAddress')"
    [ -n "${JENA_PATH+x}" ] || export JENA_PATH="$(_etl_read_prop 'jenaPath')"
    [ -n "${GRAPH_NAME+x}" ] || export GRAPH_NAME="$(_etl_read_prop 'graphName=')"
    # The aliases capture the values at definition time.
    alias vivo_passwd="echo $ROOT_PASSWD"
    alias vivo_user="echo $ROOT_USER"
    alias jena_path="echo $JENA_PATH"
    alias vivo_graph="echo $GRAPH_NAME"
fi

###################################################################
## Variables for dspace backend/frontend runtime

# "rest" section
export DSPACE_REST_SSL=false
export DSPACE_REST_HOST=localhost
export DSPACE_REST_PORT=8080
export DSPACE_REST_NAMESPACE=/server

###################################################################
# Executable and script path needed to run dspace2VIVO
export PATH="$LOC_SCRIPT_DIR:$PATH"

# NOTE for every `$(cd DIR ; pwd -P)` below: the expansions are quoted so that
# paths containing whitespace work. The `;` is kept on purpose: when DIR does
# not exist, `cd` fails and the variable receives the caller's current
# directory rather than an empty string.

###################################################################
# Working directory of scripts (parent of the script directory)
export WORKDIR="$(cd "$LOC_SCRIPT_DIR/../"; pwd -P)"

###################################################################
# Directory of resources needed to configure the expected operation of the scripts
#export RESSOURCESDIR=$(cd $WORKDIR/src/main/resources ; pwd -P)
export RESSOURCESDIR="$(cd "$WORKDIR/resources" ; pwd -P)"

###################################################################
# Directory containing the correspondence files between DSpace values and VIVO values
export MAPPING_DATA_DIR="$(cd "$RESSOURCESDIR/mapping_data" ; pwd -P)"

###################################################################
# Resource directories after compilation. This directory is modified at each compilation (Do not edit)
export RESSOURCES_TARGET_DIR="$(cd "$WORKDIR/../../../../build/classes" ; pwd -P)"

###################################################################
# Directory containing the queries necessary for the execution of SPARQL
export QUERY_DIR="$(cd "$RESSOURCESDIR/query" ; pwd -P)"

###################################################################
# Repositories containing transient data from the extract/transform/load process
export DATA_DIR="$(cd "$WORKDIR/data" ; pwd -P)"

###################################################################
# Data transition sub-directories for each step of the ETL process.
# ETL_DIR_EXTRACT and ETL_DIR_TRANSFORM are plain concatenations; the four
# transform_* variables are resolved to physical paths.
export ETL_DIR_EXTRACT="$DATA_DIR/extract"
export ETL_DIR_TRANSFORM="$DATA_DIR/transform"
export ETL_DIR_TRANSFORM_DOC_TYPE="$(cd "${ETL_DIR_TRANSFORM}_doc_type" ; pwd -P)"
export ETL_DIR_TRANSFORM_PERSON="$(cd "${ETL_DIR_TRANSFORM}_person" ; pwd -P)"
export ETL_DIR_TRANSFORM_EXPERTISES="$(cd "${ETL_DIR_TRANSFORM}_expertises" ; pwd -P)"
export ETL_DIR_TRANSFORM_PERSON_EXPERTISES="$(cd "${ETL_DIR_TRANSFORM}_person_expertises" ; pwd -P)"

###################################################################
# Setup Jena environment
# Only configure Jena when JENA_PATH names an existing directory. With an
# empty JENA_PATH a bare `cd` moves to $HOME and JENA_HOME becomes the home
# directory; with a missing directory JENA_HOME becomes the current one.
# As before, a successful setup leaves the caller in the Jena directory.
if [ -n "$JENA_PATH" ] && [ -d "$JENA_PATH" ]; then
    cd "$JENA_PATH"
    export JENA_HOME="$(pwd)"
    export PATH="$PATH:$(pwd)/bin"
else
    echo "00-env.sh: JENA_PATH ('$JENA_PATH') is not a directory; Jena environment not configured" >&2
fi
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/bin/bash

###################################################################
# Script Name :
# Description : This script encapsulates the functions call allowing the migration of DSpace Demo(6&7) data into VIVO
# Args :
# Author : Michel Héon PhD
# Institution : Université du Québec à Montréal
# Copyright : Université du Québec à Montréal (c) 2022
# Email : [email protected]
###################################################################
# Directory holding this script. All path expansions below are quoted so the
# migration also works when the checkout path contains whitespace.
export SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd -P)"
source "$SCRIPT_DIR/00-env.sh"

# Recreate the data directories, then load the environment a second time.
# The four variables are unset first because 00-env.sh only reads a value
# from etl.properties when the variable is not already set.
# NOTE(review): the second load is presumably needed so the ETL_DIR_* paths
# resolve against the freshly created directories - confirm.
"$SCRIPT_DIR/create-all-transformation-directory.sh"
unset ROOT_USER
unset ROOT_PASSWD
unset JENA_PATH
unset GRAPH_NAME
source "$SCRIPT_DIR/00-env.sh"
cd "$SCRIPT_DIR"

##################################################################
# Clean and setup up data directories and properties
cp "$RESSOURCESDIR"/*.conf "$RESSOURCES_TARGET_DIR"
# Error output of the flush step is discarded.
flush_data_dspace.sh 2>/dev/null

###################################################################
# Extract dspace data
./extract-dspace.sh

###################################################################
# Produce all list
echo run produce-list-of-expertise.sh
produce-list-of-expertise.sh

###########################
echo run produce-list-of-itemtype.sh
produce-list-of-itemtype.sh

###########################
echo run produce-list-of-persons.sh
produce-list-of-persons.sh

###################################################################
# Process transformation and load to VIVO
# Each load is followed by an index recomputation; the `;` means the
# recomputation runs whether or not the load succeeded.
load-data-to-vivo.sh

transform-map-vivo-doc-type.sh
load-data-doc_type-to-vivo.sh ; vivo-recomputeIndex.sh

transform-map-vivo-person.sh
load-data-person-to-vivo.sh ; vivo-recomputeIndex.sh

transform-map-vivo-expertises.sh
load-data-expertises-to-vivo.sh ; vivo-recomputeIndex.sh

transform-map-expertise-and-item-to-a-person-to-vivo.sh
load-data-person-expertise-to-vivo.sh ; vivo-recomputeIndex.sh

###################################################################
# Done ETL Process
echo "Done!"
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash

###################################################################
# Script Name :
# Description :
# Args :
# Author : Michel Héon PhD
# Institution : Université du Québec à Montréal
# Copyright : Université du Québec à Montréal (c) 2022
# Email : [email protected]
###################################################################
export SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd -P)"
source "$SCRIPT_DIR/00-env.sh"

# Remove the (non-hidden) files of every ETL working directory.
#
# The files are removed through an explicit path instead of `cd DIR; rm -f *`:
# with that form a failed `cd` makes `rm` delete the files of whatever
# directory the script happens to be in. For the same reason a directory is
# only flushed when it exists and lies inside DATA_DIR. 00-env.sh can leave
# an ETL_DIR_* variable pointing at an unrelated directory when the real one
# is missing.
if [ -z "$DATA_DIR" ]; then
    echo "flush: DATA_DIR is not set; nothing flushed" >&2
    exit 1
fi

for etl_dir in \
    "$ETL_DIR_EXTRACT" \
    "$ETL_DIR_TRANSFORM" \
    "$ETL_DIR_TRANSFORM_DOC_TYPE" \
    "$ETL_DIR_TRANSFORM_PERSON" \
    "$ETL_DIR_TRANSFORM_EXPERTISES" \
    "$ETL_DIR_TRANSFORM_PERSON_EXPERTISES"; do
    case "$etl_dir" in
        "$DATA_DIR"/?*)
            if [ -d "$etl_dir" ]; then
                rm -f -- "$etl_dir"/*
            else
                echo "flush: skipping missing directory '$etl_dir'" >&2
            fi
            ;;
        *)
            echo "flush: refusing to flush '$etl_dir' (not inside '$DATA_DIR')" >&2
            ;;
    esac
done
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash

###################################################################
# Script Name :
# Description :
# Args :
# Author : Michel Héon PhD
# Institution : Université du Québec à Montréal
# Copyright : Université du Québec à Montréal (c) 2022
# Email : [email protected]
###################################################################

# Fallback error handler: print the message on stderr and exit with status 1.
# It is only defined when the sourcing script has not provided its own
# error_exit, which the mktemp line below relies on.
if ! declare -F error_exit > /dev/null; then
    error_exit () {
        echo "$*" >&2
        exit 1
    }
fi

# Create a temporary directory to hold the downloaded contents, and make sure
# it's removed later, unless the user set KEEP_BATCH_FILE_CONTENTS.
cleanup () {
    if [ -z "${KEEP_BATCH_FILE_CONTENTS}" ] \
        && [ -n "${TMPDIR}" ] \
        && [ "${TMPDIR}" != "/" ]; then
        rm -rf -- "${TMPDIR}"
    fi
}
trap 'cleanup' EXIT HUP INT QUIT TERM
# mktemp arguments are not very portable. We make a temporary directory with
# portable arguments, then use a consistent filename within.
# When this file is sourced again in the same shell, the directory created by
# the previous pass is reused. Creating a new one each time would nest it
# inside the old TMPDIR and leave the old one behind at exit.
if [ -z "${_CLEANUP_OWNED_TMPDIR:-}" ] \
    || [ "${TMPDIR:-}" != "${_CLEANUP_OWNED_TMPDIR}" ] \
    || [ ! -d "${TMPDIR}" ]; then
    TMPDIR="$(mktemp -d -t tmp.XXXXXXXXX)" || error_exit "Failed to create temp directory."
    _CLEANUP_OWNED_TMPDIR="${TMPDIR}"
fi
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash

###################################################################
# Script Name :
# Description :
# Args :
# Author : Michel Héon PhD
# Institution : Université du Québec à Montréal
# Copyright : Université du Québec à Montréal (c) 2022
# Email : [email protected]
###################################################################
# Directory holding this script, and its parent, which hosts the data tree.
export LOC_SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd -P)"
DATA_ROOT="$( cd "$LOC_SCRIPT_DIR/../" && pwd -P)"

# Refuse to continue when either path could not be resolved. An empty
# LOC_SCRIPT_DIR or DATA_ROOT would turn the commands below into
# `rm -fr /data` and `mkdir -p /data`.
if [ -z "$LOC_SCRIPT_DIR" ] || [ -z "$DATA_ROOT" ]; then
    echo "create-all-transformation-directory: cannot resolve the data root directory" >&2
    exit 1
fi

# Start from an empty data tree. The path is quoted so that a checkout path
# containing whitespace is not split into several `rm -fr` targets.
rm -fr -- "$DATA_ROOT/data"

mkdir -p "$DATA_ROOT/data"
# Stop rather than create the sub-directories in the wrong place.
cd "$DATA_ROOT/data" || exit 1
mkdir -p extract transform transform_doc_type transform_person transform_expertises transform_person_expertises
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# The email address of the root VIVO user
rootUser.emailAddress=vivo_root@mydomain.edu

# The password for the root VIVO user.
# Replace this sample value in your local copy; do not commit real credentials.
rootUser.password=admin

# The file system path to the Apache Jena installation directory.
# This path is used to locate the necessary tools and libraries for interacting with RDF data.
jenaPath=/opt/apache-jena-4.10.0

# The URI identifying the desired import graph in the RDF dataset.
# In this case, it refers to the default graph of the Vitro knowledge base.
graphName=http://vitro.mannlib.cornell.edu/default/vitro-kb-2
Loading