-
Notifications
You must be signed in to change notification settings - Fork 7
/
bulk_get_unprocessed.py
76 lines (57 loc) · 2.43 KB
/
bulk_get_unprocessed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
""" MetaDB bulk processing tools.
get_unprocessed: Get mbids and metadata that have not been looked up
for a particular source
Usage:
bulk_get_unprocessed.py --source lastfm --outname x [-n n|-p p]
use -p to write p rows per file
use -n to write exactly n files
omit to write one file
"""
import argparse
import csv
import math
import metadb.data
import metadb.db
import metadb.log
import metadb.util
import config
# Module-level side effect: runs on import as well as on script execution.
# Passes the SQLAlchemy URI from the local config module to metadb's engine
# initialiser; presumably the metadb.data calls in main() rely on this having
# happened first -- confirm against metadb.db.
metadb.db.init_db_engine(config.SQLALCHEMY_DATABASE_URI)
def dump_items(filename, data, keys):
    """Write ``data`` to ``filename`` as CSV with a header row.

    Args:
        filename: path of the file to create; an existing file is overwritten.
        data: iterable of dicts, one per row, keyed by the names in ``keys``.
        keys: column names, in output order; also written as the header row.

    Raises:
        ValueError: raised by ``csv.DictWriter`` if a row contains a key that
            is not listed in ``keys``.
    """
    # newline="" lets the csv module control line endings itself; without it
    # rows get an extra "\r" on platforms that translate "\n" on write.
    # The explicit UTF-8 encoding keeps non-ASCII names from depending on the
    # platform's default locale encoding.
    with open(filename, "w", newline="", encoding="utf-8") as fp:
        writer = csv.DictWriter(fp, keys)
        writer.writeheader()
        writer.writerows(data)
def main(source_name, outname, perfile=None, numfiles=None):
    """Dump the unprocessed items for a source into one or more CSV files.

    Loads the source called ``source_name`` and its latest scraper, fetches
    the items that scraper has not processed yet, and writes them out with
    ``dump_items``.

    Args:
        source_name: name handed to ``metadb.data.load_source``.
        outname: output filename template, without extension. Split output
            goes to ``<outname>-<i>.csv`` (numbered from 1); unsplit output
            goes to ``<outname>.csv``.
        perfile: number of rows per output file. Overridden when ``numfiles``
            is given.
        numfiles: number of files to split across; the rows-per-file value is
            derived as ``ceil(len(data) / numfiles)``.

    Raises:
        ValueError: if the scraper's ``mb_type`` is neither ``"recording"``
            nor ``"release_group"``.
    """
    source = metadb.data.load_source(source_name)
    scraper = metadb.data.load_latest_scraper_for_source(source)

    mb_type = scraper["mb_type"]
    if mb_type == "recording":
        metadb.log.info("Dumping recording items")
        keys = ["mbid", "name", "artist_credit"]
        data = metadb.data.get_unprocessed_recordings_for_scraper(scraper)
    elif mb_type == "release_group":
        metadb.log.info("Dumping release_group items")
        keys = ["mbid", "name", "artist_credit", "first_release_date"]
        data = metadb.data.get_unprocessed_release_groups_for_scraper(scraper)
    else:
        # Without this branch ``keys``/``data`` stay unbound and the function
        # fails later with an unhelpful UnboundLocalError.
        raise ValueError(
            "Unsupported mb_type for scraper: {!r}".format(mb_type))

    datalen = len(data)
    metadb.log.info("Got {} items".format(datalen))

    if numfiles:
        # numfiles takes precedence over perfile when both are supplied.
        perfile = math.ceil(datalen / numfiles)
        metadb.log.info("Dumping into {} files of {} each".format(numfiles, perfile))
    elif perfile:
        metadb.log.info("Dumping into files of {} each".format(perfile))

    if perfile:
        for i, chunk in enumerate(metadb.util.chunks(data, perfile), 1):
            filename = "%s-%d.csv" % (outname, i)
            dump_items(filename, chunk, keys)
    else:
        # Reached when neither option is given, and also when numfiles is
        # given but there are no items (ceil(0 / numfiles) == 0): a single
        # file is written.
        filename = "%s.csv" % (outname,)
        dump_items(filename, data, keys)
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and hand them to main().
    parser = argparse.ArgumentParser(description="Dump unprocessed items")
    parser.add_argument("--source", help="source name", required=True)
    parser.add_argument("--outname", help="Filename template to write to (no extension)", required=True)
    # -p and -n are mutually exclusive but optional: when neither is given,
    # both default to None and main() writes everything to a single file.
    # Requiring one of them made that single-file mode unreachable.
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-p", type=int, help="Number of items to dump per file")
    group.add_argument("-n", type=int, help="Number of files to dump")
    args = parser.parse_args()
    # Positional order matches main(source_name, outname, perfile, numfiles).
    main(args.source, args.outname, args.p, args.n)