-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathomekacsv.py
executable file
·167 lines (144 loc) · 5.55 KB
/
omekacsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#! /usr/bin/python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from collections import defaultdict
import csv
import json
import math
import time
try:
import readline
except ImportError:
pass
try:
from urllib.parse import urlencode
from urllib.request import urlopen
from urllib.error import URLError, HTTPError
except ImportError:
from urllib import urlencode
from urllib2 import urlopen, URLError, HTTPError
'''
Extract top-level metadata and element_texts from items returned by
an Omeka 2.x API request, and then write them to a CSV file. Intended for
requests to items, collections, element sets, elements, files, tags, exhibits, exhibit pages, and geolocations.
Based on Caleb McDaniel's original Python CSV file generator: https://github.com/wcaleb/omekadd
'''
# Python 2/3 compatibility shims.

# On Python 2, rebind input() to raw_input() so prompts return the typed text.
try:
    input = raw_input
except NameError:
    # raw_input does not exist here; keep the built-in input().
    pass

# Probe for the Python 2 `unicode` builtin. When it is missing, alias it to
# `str` and record which interpreter family we are running on in `py2`.
try:
    unicode
except NameError:
    py2 = False
    unicode = str
else:
    py2 = True
def request(endpoint, resource, query=None):
    """Send a GET request to an Omeka API resource.

    Args:
        endpoint: Base URL of the API, without a trailing slash.
        resource: Resource path appended to the endpoint (e.g. 'items').
        query: Optional mapping of query-string parameters. It is copied and
            never modified.

    Returns:
        A ``(headers, body)`` tuple: the value of ``response.info()`` and the
        payload returned by ``response.read()``.

    Exceptions raised by ``urlopen`` are not caught here.
    """
    # Work on a private copy. The previous ``query={}`` default was shared
    # between calls and mutated in place, so the API key stuck to the default
    # dict and was also written into any dict the caller passed in.
    params = dict(query) if query else {}
    if apikey is not None:
        params["key"] = apikey
    url = endpoint + "/" + resource + "?" + urlencode(params)
    response = urlopen(url)
    try:
        return response.info(), response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
def unicodify(v):
    """Turn a decoded JSON scalar into text suitable for a CSV cell.

    Lists and dicts yield None. Values whose exact type is bool, int or float
    are passed through ``unicode()``. Anything else is returned unchanged.
    """
    kind = type(v)
    if kind in (list, dict):
        return None
    if kind in (bool, int, float):
        return unicode(v)
    return v
def get_all_pages(endpoint, resource):
    """Fetch every results page for a resource and return the combined list.

    The first response is used to work out how many pages exist: the
    'omeka-total-results' header gives the total record count and the length
    of the first page is taken as the site's per-page setting.

    Args:
        endpoint: Base URL of the API, without a trailing slash.
        resource: Name of the resource to list (e.g. 'items').

    Returns:
        A list containing the decoded JSON records from all pages.
    """
    data = []
    page = 1
    # default pages to 1 before we see the omeka-total-results header
    pages = 1
    while page <= pages:
        response, content = request(endpoint, resource, {'page': str(page)})
        content_list = json.loads(content)
        data.extend(content_list)

        # use first page to determine site's per_page setting
        if page == 1:
            total = int(response['omeka-total-results'])
            page_length = len(content_list)
            # An empty first page with a non-zero total would otherwise
            # divide by zero; in that case keep the single-page default.
            if 0 < page_length < total:
                # True division (the file imports it from __future__), so
                # ceil() rounds a partial last page up.
                pages = int(math.ceil(total / page_length))
            total_text = '\tTotal results: ' + str(total)
            if pages > 1:
                total_text += ' (across ' + str(pages) + ' pages)'
            print(total_text)
        if pages > 1:
            print('\tGot results page ' + str(page))

        page += 1
        # Pause between requests only; there is nothing to wait for after
        # the last page has been fetched.
        if page <= pages:
            time.sleep(1)
    return data
# Keep prompting until the *normalized* endpoint is non-empty, so input that
# consists only of whitespace or slashes is asked for again instead of
# collapsing to an empty base URL.
endpoint = ''
while not endpoint:
    endpoint = input('Enter your Omeka API endpoint\n').strip().rstrip('/')

# An empty answer means no key; request() only adds the key when it is not None.
apikey = input('\nIf you are using an API key, please enter it now. Otherwise press Enter.\n') or None

# Separator used when several values are joined into one CSV cell.
multivalue_separator = input('\nEnter a character to separate mutiple values within a single cell.\nThis character must not be used anywhere in your actual data.\nLeave blank to use the default separator: |\n') or '|'
# get list of supported resources by this site
response, content = request(endpoint, 'resources')
available_resources = json.loads(content)

resources = ['items', 'files', 'collections', 'elements', 'element_sets', 'tags', 'exhibits', 'exhibit_pages', 'geolocations']
for resource in resources:
    # Skip anything this site does not report as available.
    if resource not in available_resources:
        continue
    print('\nExporting ' + resource)

    # get all pages
    data = get_all_pages(endpoint, resource)

    # Column names seen so far; a set keeps the membership test cheap and the
    # final order is fixed by the sort below.
    fields = set()
    csv_rows = []

    for D in data:
        csv_row = {}
        for k, v in D.items():
            if k == 'tags':
                # Tag names are always comma-joined, independent of the
                # chosen multivalue separator.
                csv_row['tags'] = ','.join(tag['name'] for tag in v)
            elif k == 'element_texts':
                # Group texts under an "<element set>:<element>" header so
                # repeated elements end up in a single cell.
                texts_by_element = defaultdict(list)
                for element_text in v:
                    if element_text['text'] is None:
                        continue
                    element_header = element_text['element_set']['name'] + ':' + element_text['element']['name']
                    texts_by_element[element_header].append(element_text['text'])
                for element_header, texts in texts_by_element.items():
                    csv_row[element_header] = multivalue_separator.join(texts)
            elif k == 'page_blocks':
                # Join the non-empty block texts into one 'Text' column.
                text = [block['text'] for block in v]
                csv_row['Text'] = multivalue_separator.join(filter(None, text))
            elif isinstance(v, dict):
                # Flatten one level of nesting into "<key>_<subkey>" columns,
                # leaving out the 'url' and 'resource' entries.
                for subkey, subvalue in v.items():
                    if subkey == 'url' or subkey == 'resource':
                        continue
                    subvalue_string = unicodify(subvalue)
                    if subvalue_string is not None:
                        csv_row[k + '_' + subkey] = subvalue_string
            elif v is not None and not isinstance(v, list):
                # Plain scalar; other lists and nulls are dropped.
                csv_row[k] = unicodify(v)
        fields.update(csv_row)
        csv_rows.append(csv_row)

    # 'id' first, then every other column alphabetically.
    fields = sorted(fields, key=lambda field: (field != 'id', field))

    filename = resource + '_output.csv'
    if py2:
        # The Python 2 csv module works on byte strings.
        o = open(filename, 'wb')
        fieldnames = [f.encode('utf-8', 'replace') for f in fields]
    else:
        o = open(filename, 'w', encoding='utf-8', newline='')
        fieldnames = fields
    # The context manager closes the file even if writing a row fails.
    with o:
        c = csv.DictWriter(o, fieldnames, extrasaction='ignore')
        c.writeheader()
        for row in csv_rows:
            if py2:
                # Encode the keys as well as the values: the fieldnames are
                # UTF-8 bytes, and a unicode key with non-ASCII characters
                # would never match them, leaving that column empty.
                row = {
                    k.encode('utf-8', 'replace'): v.encode('utf-8', 'replace')
                    for k, v in row.items()
                    if isinstance(v, unicode)
                }
            c.writerow(row)
    print('File created: ' + filename)