-
Notifications
You must be signed in to change notification settings - Fork 1
/
eprints2dpn_ep3xml_split.pl
executable file
·314 lines (259 loc) · 10.6 KB
/
eprints2dpn_ep3xml_split.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
#!/usr/bin/perl -w
#
# eprints_ep3xml_split.pl - B. Coles.
#
# Last modified 1/4/2018
#
# PROJECT: Archiving EPrints material to DPN - Step 2 in creating Bags for DPN.
#
# This script reads a file of EP3XML records exported from eprints, either using
# the batch 'export' command or the Export functionality in the web interface.
# It collects records and files in separate directories for each record, suitable
# for later bagging.
#
# NOTE: the script eprints2dpn_dc_split.pl should be run before this script. It will create
# the directories needed to hold the parts of each EPrints record that are to be bagged.
#
###########################################################################
# CONFIGURATION: change $repo_name variable to name of EPrints repository
# being processed before running, e.g. "caltechthesis"
#
# Also change the $curl user variable, to
# an admin user for the current repo.
#
###########################################################################
#
# Specific functions:
#
# 1. It creates individual EP3XML records, each in its own directory. Directories are named
# according to the eprints ID (e.g. 12345) and XML records are named, for ex. '12345.xml'.
# If eprints2dpn_dc_split has already been run, these directories will already exist and will
# contain the DC record for this item.
#
# 2. It collects the URLs of all <document> elements in the records and retrieves the
# documents using curl. These documents are placed in the same directory as the XML record
# and also the DC record produced by the corresponding eprints2dpn_dc_split script, which
# should have been run first to create the directories.
#
# Sample input:
#
# Starting EPrints Repository.
# Connecting to DB ... done.
# <?xml version='1.0' encoding='utf-8'?>
# <eprints xmlns='http://eprints.org/ep2/data/2.0'>
# <eprint id='http://oralhistories.library.caltech.edu/id/eprint/15'>
# <eprintid>15</eprintid>
# <rev_number>7</rev_number>
# <documents>
# <document id='http://oralhistories.library.caltech.edu/id/document/643'>
# <docid>643</docid>
# <rev_number>2</rev_number>
# <files>
# <file id='http://oralhistories.library.caltech.edu/id/file/1684'>
# <fileid>1684</fileid>
# <datasetid>document</datasetid>
# <objectid>643</objectid>
# <filename>OH_Bonner_J.pdf</filename>
# <mime_type>application/pdf</mime_type>
# <hash>c495b7a88582aaa0f07c7a49ea9278af</hash>
# <hash_type>MD5</hash_type>
# <filesize>3370653</filesize>
# <mtime>2015-03-10 20:08:14</mtime>
# <url>http://oralhistories.library.caltech.edu/15/1/OH_Bonner_J.pdf</url>
# </file>
# </files>
# <eprintid>15</eprintid>
# <pos>1</pos>
# <placement>1</placement>
# <mime_type>application/pdf</mime_type>
# <format>application/pdf</format>
# <language>en</language>
# <security>public</security>
# <license>other</license>
# <main>OH_Bonner_J.pdf</main>
# <content>published</content>
# </document>
# </documents>
# <eprint_status>archive</eprint_status>
# <userid>1</userid>
# <dir>disk0/00/00/00/15</dir>
# <datestamp>2002-10-21</datestamp>
# <lastmod>2015-03-10 20:08:35</lastmod>
# <status_changed>2011-10-20 19:44:28</status_changed>
# <type>oral_history</type>
# <metadata_visibility>show</metadata_visibility>
# <item_issues_count>0</item_issues_count>
# <creators>
# <item>
# <name>
# <family>Bonner</family>
# <given>James</given>
# </name>
# <id>Bonner-J</id>
# </item>
# </creators>
# <title>Interview with James F. Bonner</title>
# <ispublished>unpub</ispublished>
# <subjects>
# <item>name</item>
# <item>bio</item>
# </subjects>
# <full_text_status>public</full_text_status>
# <abstract>Interview in 1980 with professor of biology James Bonner begins with his re
# collections of growing up in an academic family. In 1929, his father, a physical chemist
# at the University of Utah, was a visitor at Caltech, where Bonner enrolled as a junior.
# Recalls course work with X-ray crystallographer Roscoe G. Dickinson and activities of Di
# vision of Chemistry and Chemical Engineering under Arthur Amos Noyes; humanities courses
# with William B. Munro; physics with Earnest Watson, William V. Houston, and Carl Anderson
# ; geology with John P. Buwalda; and biology with Thomas Hunt Morgan, Henry Borsook, and T
# heodosius Dobzhansky. Became Dobzhansky's summer researcher and editor; switched fr
# om chemistry to biology. Graduate work with Dobzhansky on Drosophila genetics and Kennet
# h Thimann on plant hormone auxin. Friendship with Noyes. NRC postdoctoral fellowship to
# Utrecht, Leiden, and ETH, 1934-35. Joined Caltech's Biology Division in 1936 as an
# instructor: recalls colleagues Frits Went, Arie J. Haagen-Smit, Johannes van Overbeek; p
# lant labs at Caltech; coining of term phytotron. Recollections of Robert A. Millikan. W
# ar work for U.S. Emergency Rubber Project on guayule and Cryptostegia. Work on cell biol
# ogy with Sam Wildman; discovery of Fraction 1, central enzyme of photosynthesis. Foundin
# g of Caltech's Industrial Associates program in 1950. Recalls graduate student Paul
# Tso, discovery of plant actomycin, isolation of ribosomes. Work of Robert Holley on tra
# nsfer RNAs. Consultant to Malaysian rubber industry. "Next 100 Years" project
# , with Harrison Brown. Studies RNA in 1960s with R. C. Huang and histone chemistry with
# Douglas Fambrough. Visitor at Oxford, 1963. Remarks on underdeveloped countries. Study
# of population growth with H. Brown. Comments on his recent work on cloning genes, and v
# isits to Singapore and China. His hopes for genetic engineering. Stint as acting chairm
# an of the Biology Division; comments on Robert L. Sinsheimer. [See also 1978 joint inter
# view with Bonner, N. H. Horowitz, D. F. Poulson, and S. H. Emerson.]</abstract>
# <date>1982-01-01</date>
# <date_type>published</date_type>
# <id_number>CaltechOH:OH_Bonner_J</id_number>
# <refereed>FALSE</refereed>
# <official_url>http://resolver.caltech.edu/CaltechOH:OH_Bonner_J</official_url>
# <rights>No commercial reproduction, distribution, display or performance rights in th
# is work are provided.</rights>
# <collection>CaltechOralHistories</collection>
# <interviewer>Graham Berry</interviewer>
# <interviewdate>March 13-14, 1980</interviewdate>
# </eprint>
#
#############################################################################
use Data::Dumper;
use strict;
# my $debug = 1;
my $debug = 0;
# CONFIGURATION: change to repository name of EPrints repo being processed,
# e.g., "caltechthesis"
my $repo_name = "caltechauthors";
my $curl_user = "admin:admin\@authors";
my $line = 1;
my $input = '';
my $records_created = 0;
my $documents_written = 0;
my $eprint_number = '';
my $eprint_status = '';
my $output_directory_name;
my $output_file_name;
my $output_record = '';
my $eprints_xml_header = "<?xml version='1.0' encoding='utf-8'?>\n<eprints xmlns='http://eprints.org/ep2/data/2.0'>\n";
my $eprints_xml_footer = "</eprints>\n";
my @document_urls = '';
my $this_url = '';
my $num_docs = 0;
open(IN, "<:encoding(UTF-8)", "/coda/dpn/" . $repo_name . ".xml") or die "*** Cannot open /coda/dpn/" . $repo_name . ".xml for input - terminating\n";
while(<IN>) # loop through input one line at a time
{
$input = $_;
if($debug)
{
print $input;
}
# skip lines with no useful content, including first 5 lines
if (($input ne '') && ($input ne ' ') && ($input ne "\n") && ($line > 5))
{
if (substr($input,2,8) eq "<eprint " ) # we have encountered a new record
{
# Write the current $output record to the directory
if ($output_record ne '') {
# add XML header and footer elements
$output_record = $eprints_xml_header . $output_record .
$eprints_xml_footer;
write_record ($output_record, $eprint_number);
$records_created++;
$output_record = '';
# fetch all the document files for this record and write
# them to the same directory
if ($num_docs > 0)
{
fetch_docs (\@document_urls, $eprint_number);
$documents_written += $num_docs;
$num_docs = 0;
@document_urls = '';
}
}
}
# Collect data for the curent record
$output_record .= $input;
# save the eprint id - will be needed for file/dir naming
if ( length($input) > 15 ) # don't process short lines
{
if (substr($input,4,10) eq "<eprintid>" )
{
$eprint_number = substr($input,14);
chomp($eprint_number); # remove trailing newline
$eprint_number =~ s/<\/eprintid>//; # remove trailing end tag
if ( $debug)
{
print "EPRINTID=" . $eprint_number . "\n";
}
}
}
# save the urls for the document files attached to this eprint
if ( length($input) > 12 )
{
if (substr($input,12,5) eq "<url>" )
{
$this_url = substr($input,17);
# remove end tag
$this_url = substr($this_url,0,length($this_url)-7);
if ($ debug )
{
print "THIS_URL=" . $this_url . "\n";
}
$num_docs ++;
$document_urls[$num_docs-1] = $this_url;
}
}
}
$line++;
}
close(IN);
print "Number of input lines processed: $line\n\n";
print "Number of output records created: $records_created\n\n";
print "Number of documents written: $documents_written\n";
sub write_record {
my $record = shift;
my $dirname = shift;
my $target_dir = "/coda/dpn/" . $repo_name . "/" . $repo_name . "-" . $dirname;
my $output_file_name = $repo_name . "-" . $dirname . ".xml";
# $target_dir should already exist - created by eprints2dpn_dc_split.pl. If it's not
# there, next step will fail.
open(XML_OUT, ">:utf8", $target_dir . "/" . $output_file_name) or die "***cannot open " . $output_file_name. " for writing.\n\n";
print XML_OUT $record;
close(XML_OUT);
}
sub fetch_docs {
my @url = @{$_[0]};
my $dirname = $_[1];
foreach (@url)
{
my $fetch_url = $_;
my $target_dir = "/coda/dpn/" . $repo_name . "/" . $repo_name . "-" . $dirname;
# get the filename out of the URL so we can preserve it on output
my $pos = rindex($fetch_url, '/');
my $output_file_name = substr($fetch_url, $pos);
# use curl to get the document into a variable for output
my $document = `curl -u $curl_user $fetch_url 2>&1`;
open(DOC_OUT, ">", $target_dir . "/" . $output_file_name) or die "***cannot open " . $output_file_name. " for writing.\n\n";
print DOC_OUT $document;
close(DOC_OUT);
}
}