This repository has been archived by the owner on Jun 25, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
load-new-data.sh
executable file
·140 lines (126 loc) · 5.53 KB
/
load-new-data.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/bin/bash
# https://github.com/subhh/HOS-MetadataTransformations

# Change directory to the location of this shell script so that the relative
# paths below (opt/, data/) resolve independently of the caller's working
# directory. The expansions are quoted so that a script path containing
# whitespace works, and the script stops if the directory cannot be entered,
# because every later relative path would otherwise point to the wrong place.
cd -- "$(dirname -- "$0")" || exit 1

# pathnames
metha_sync="/usr/sbin/metha-sync"
metha_cat="/usr/sbin/metha-cat"
recordpath=(Records Record) # metha-cat default xml path to harvested records
# absolute paths, resolved relative to the script directory entered above
openrefine_client="$(readlink -f opt/openrefine-client)"
data_dir="$(readlink -f data)"
# help screen
# Writes the usage text (synopsis, options, example calls) to stdout and then
# terminates the script with exit status 1. Takes no arguments.
usage() {
  printf '%s\n' \
    'Usage: ./load-new-data.sh [-c CODENAME] [-i OAIURL] [-s OAISET] [-f OAIFORMAT] [-r RECORDPATH] [-d OPENREFINEURL]' \
    '== options ==' \
    '-c CODENAME code name of the new data source' \
    '-i OAIURL url to the oai endpoint of the new data source' \
    '-s OAISET setSpec of the oai endpoint' \
    '-f OAIFORMAT metadataFormat of the oai endpoint' \
    '-r RECORDPATH filter data by additional xml node(s), e.g. "-r metadata" selects /Records/Record/metadata (ignoring /Records/Record/header)' \
    '-d OPENREFINEURL ingest data to external OpenRefine service (default: http://localhost:3333)' \
    '== examples ==' \
    './load-new-data.sh -c ediss -i http://ediss.sub.uni-hamburg.de/oai2/oai2.php -d http://localhost:3333' \
    './load-new-data.sh -c tubdok -i http://tubdok.tub.tuhh.de/oai/request -r metadata -d http://localhost:3333' \
    './load-new-data.sh -c tuhh-fdm -i https://zenodo.org/oai2d -s user-tuhh -f datacite -r metadata -r resource -d http://localhost:3333' \
    './load-new-data.sh -c uhh-fis -i https://fis-www-test.rrz.uni-hamburg.de/ws/oai -s publications:all -d http://localhost:3333'
  exit 1
}
# defaults
openrefine_url="http://localhost:3333"

# check input: without any argument show the help screen
NUMARGS=$#
if [ "$NUMARGS" -eq 0 ]; then
  usage
fi

# get user input
# The leading ":" puts getopts into silent error reporting mode. Only in this
# mode does getopts report a missing option argument as ":" and store the
# offending option character in OPTARG, which the messages below print.
# Without it the ":" branch can never match and OPTARG is empty for "?".
options=":c:i:s:f:r:d:h"
while getopts "$options" opt; do
  case "$opt" in
    c) codename=${OPTARG} ;;
    i) oai_url=${OPTARG} ;;
    s) oai_set=${OPTARG} ;;
    f) oai_format=${OPTARG} ;;
    r) recordpath+=("${OPTARG}") ;; # repeatable; each value is appended to the recordpath array
    d) openrefine_url=${OPTARG} ;;
    h) usage ;;
    \?) echo 1>&2 "Unknown option: -$OPTARG"; usage; exit 1 ;;
    :) echo 1>&2 "Missing option argument for -$OPTARG"; usage; exit 1 ;;
    *) echo 1>&2 "Unimplemented option: -$OPTARG"; usage; exit 1 ;;
  esac
done
# drop the parsed options so that only positional arguments remain
shift $((OPTIND - 1))
# check for mandatory options
# A code name and an OAI endpoint url are both required. If either one is
# empty, print a hint plus an example call to stderr and stop with status 1.
[[ -n "${codename}" ]] || {
  printf '%s\n' \
    "please provide a name for the new data source" \
    "example: ./load-new-data.sh -c ediss-test -i http://ediss.sub.uni-hamburg.de/oai2/oai2.php" >&2
  exit 1
}
[[ -n "${oai_url}" ]] || {
  printf '%s\n' \
    "please provide the url to the oai endpoint of the new data source" \
    "example: ./load-new-data.sh -c ediss-test -i http://ediss.sub.uni-hamburg.de/oai2/oai2.php" >&2
  exit 1
}
# declare additional variables
date=$(date +%Y%m%d_%H%M%S) # timestamp used in the name of the output file

# Checkpoints are kept in two parallel arrays indexed from 1: checkpointdate
# holds the start time of a step in epoch seconds, checkpointname its label.
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
checkpointname[$((checkpoints + 1))]="Start process"

# split_openrefine_url URL
# Splits a service url into host and port.
# Arguments: $1 - url, e.g. http://localhost:3333
# Globals written:
#   external      - "host[:port]" part of the url (scheme and path removed)
#   external_host - host name
#   external_port - port given in the url; if the url has none, the default
#                   port of the scheme (443 for https, otherwise 80)
# A trailing slash or path is ignored. An empty url yields empty values.
split_openrefine_url() {
  local url=$1
  local default_port=80
  if [[ "$url" == https://* ]]; then
    default_port=443
  fi
  external=${url#*://}     # drop the scheme, if any
  external=${external%%/*} # drop everything from the first slash of the path
  if [[ "$external" =~ :([0-9]+)$ ]]; then
    external_port=${BASH_REMATCH[1]}
    external_host=${external%:*}
  else
    external_host=$external
    # only fall back to a default port if there is a host at all
    external_port=${external:+$default_port}
  fi
}
split_openrefine_url "$openrefine_url"

# print variables
echo "Code name: $codename"
echo "OAI server: $oai_url"
echo "OAI set: $oai_set"
echo "OAI metadata format: $oai_format"
echo "Record path: $(for i in "${recordpath[@]}"; do printf '/%s' "$i"; done)"
echo "OpenRefine service URL: $openrefine_url"
echo ""
# Download data via OAI with metha
# Register the checkpoint for this step (start time in epoch seconds + label).
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
checkpointname[$((checkpoints + 1))]="Download via OAI with metha"
echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
echo ""
echo "starting time: $(date --date=@"${checkpointdate[$((checkpoints + 1))]}")"
echo ""

# Collect the optional arguments once in an array so that each value reaches
# metha as exactly one argument (no word splitting or globbing of the values).
metha_args=()
if [ -n "$oai_set" ]; then
  metha_args+=(-set "$oai_set")
fi
if [ -n "$oai_format" ]; then
  metha_args+=(-format "$oai_format")
fi
oai_file="${data_dir}/01_oai/${codename}_${date}.xml"

# metha-sync is run first, then metha-cat with the same arguments; the output
# of metha-cat is written to a single xml file.
"$metha_sync" "${metha_args[@]}" "$oai_url"
"$metha_cat" "${metha_args[@]}" "$oai_url" > "$oai_file"

# count the lines of the file that contain an opening <Record> tag
records_metha=$(grep -c '<Record>' "$oai_file")
echo "saved $records_metha records in $oai_file"
echo ""
# Ingest data into OpenRefine
# Skipped entirely when the OpenRefine url is empty.
if [ -n "$openrefine_url" ]; then
  # Register the checkpoint for this step (start time in epoch seconds + label).
  checkpoints=${#checkpointdate[@]}
  checkpointdate[$((checkpoints + 1))]=$(date +%s)
  checkpointname[$((checkpoints + 1))]="Ingest data into OpenRefine"
  echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
  echo ""
  echo "starting time: $(date --date=@"${checkpointdate[$((checkpoints + 1))]}")"
  echo ""

  # The project is always named "<codename>_new"; a project of that name is
  # deleted first and then created again from the harvested file.
  echo "delete existing project ${codename}_new..."
  "${openrefine_client}" -H "${external_host}" -P "${external_port}" --delete "${codename}_new"
  echo ""

  echo "create new project ${codename}_new..."
  # One --recordPath argument per xml node of the record path. Built as an
  # array so that every node is passed as exactly one argument.
  recordpath_args=()
  for node in "${recordpath[@]}"; do
    recordpath_args+=("--recordPath=${node}")
  done
  "${openrefine_client}" -H "${external_host}" -P "${external_port}" --create "${data_dir}/01_oai/${codename}_${date}.xml" "${recordpath_args[@]}" --projectName="${codename}_new"
  echo ""
fi
# calculate and print checkpoints

# format_duration SECONDS
# Prints a duration given in seconds as hh:mm:ss. The hours are computed
# arithmetically and therefore do not wrap around after 24 hours (a run of
# 25 hours is printed as 25:00:00, not 01:00:00).
format_duration() {
  local total=$1
  printf '%02d:%02d:%02d\n' "$((total / 3600))" "$((total % 3600 / 60))" "$((total % 60))"
}

echo "=== Statistics ==="
echo ""
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
checkpointname[$((checkpoints + 1))]="End process"
echo "starting time and run time of each step:"
# Append one more timestamp without a label. It only serves as the end time of
# the last labelled checkpoint, so that the loop below can compute a run time
# for every checkpoint as "start of next entry - start of this entry".
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
for ((i = 1; i <= checkpoints; i++)); do
  diffsec=$((checkpointdate[i + 1] - checkpointdate[i]))
  # values are passed as printf arguments, not as part of the format string
  printf '%35s %s (%s)\n' "${checkpointname[$i]}" "$(date --date=@"${checkpointdate[$i]}")" "$(format_duration "$diffsec")"
done
echo ""
# total run time: from the first checkpoint to the last labelled checkpoint
diffsec=$((checkpointdate[checkpoints] - checkpointdate[1]))
echo "$records_metha records"
echo "total run time: $(format_duration "$diffsec") (hh:mm:ss)"