-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfacebook_exporter.py
105 lines (74 loc) · 3.62 KB
/
facebook_exporter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from sfmutils.exporter import BaseExporter, BaseTable
from facebook_warc_iter import FacebookWarcIter
import logging
# Queue name handed to the exporter's main() entry point at the bottom of
# this section.
QUEUE = "facebook_exporter"

# Routing key handed to main() alongside QUEUE; presumably the key announcing
# Facebook user-timeline export requests -- confirm against the message
# producer.
TIMELINE_ROUTING_KEY = "export.start.facebook.facebook_user_timeline"

# Module-level logger named after this module.
log = logging.getLogger(__name__)
class BaseTwitterStatusTable(BaseTable):
    """
    PETL table whose header and rows come from twarc's json2csv helpers.

    NOTE(review): QUEUE in this module names a Facebook exporter, yet the
    rows here are formatted by ``twarc.json2csv`` -- confirm which platform
    this table is meant to serve.
    """

    def __init__(self, warc_paths, dedupe, item_date_start, item_date_end,
                 seed_uids, warc_iter_cls, segment_row_size):
        """Hand every argument, unchanged and in order, to BaseTable."""
        BaseTable.__init__(
            self, warc_paths, dedupe, item_date_start, item_date_end,
            seed_uids, warc_iter_cls, segment_row_size)

    def _header_row(self):
        """Return the column headings reported by ``twarc.json2csv``."""
        # NOTE(review): ``twarc`` is not imported in the visible part of this
        # file -- confirm it is brought into scope somewhere.
        headings = twarc.json2csv.get_headings()
        return headings

    def _row(self, item):
        """Return the row twarc builds for ``item`` with ``excel=True``."""
        row = twarc.json2csv.get_row(item, excel=True)
        return row

    def id_field(self):
        """Return the name of the identifier field, ``"id"``."""
        return "id"
class TwitterRestStatusTable(BaseTwitterStatusTable):
    """Status table bound to ``TwitterRestWarcIter`` as its iterator class."""

    def __init__(self, warc_paths, dedupe, item_date_start, item_date_end,
                 seed_uids, segment_row_size=None):
        """
        Forward the arguments to BaseTwitterStatusTable, inserting
        TwitterRestWarcIter as the ``warc_iter_cls`` argument.
        """
        # NOTE(review): TwitterRestWarcIter is not imported in the visible
        # part of this file (only FacebookWarcIter is) -- confirm which
        # iterator class is intended here.
        iter_cls = TwitterRestWarcIter
        BaseTwitterStatusTable.__init__(
            self, warc_paths, dedupe, item_date_start, item_date_end,
            seed_uids, iter_cls, segment_row_size)
class TwitterRestExporter(BaseExporter):
    """
    Exporter that hands TwitterRestWarcIter and TwitterRestStatusTable to
    BaseExporter.
    """

    def __init__(self, api_base_url, working_path, mq_config=None,
                 warc_base_path=None):
        """
        Initialise BaseExporter with this module's iterator and table classes.

        ``api_base_url`` and ``working_path`` are passed positionally;
        ``mq_config`` and ``warc_base_path`` are passed through by keyword.
        """
        # NOTE(review): TwitterRestWarcIter is not imported in the visible
        # part of this file -- confirm where it is defined.
        BaseExporter.__init__(
            self,
            api_base_url,
            TwitterRestWarcIter,
            TwitterRestStatusTable,
            working_path,
            mq_config=mq_config,
            warc_base_path=warc_base_path)
if __name__ == "__main__":
    # Script entry point: start the exporter on QUEUE.
    # Only TIMELINE_ROUTING_KEY has been assigned at this point in the module.
    # SEARCH_ROUTING_KEY is not bound until further down the file, so naming
    # it here raised NameError before the exporter could start.
    TwitterRestExporter.main(TwitterRestExporter, QUEUE,
                             [TIMELINE_ROUTING_KEY])
# NOTE(review): these assignments rebind log, QUEUE and TIMELINE_ROUTING_KEY,
# which are already assigned earlier in this module -- confirm that the
# duplication is intentional.

# Queue name handed to the exporter's main() entry point.
QUEUE = "twitter_rest_exporter"

# Routing keys handed to main() alongside QUEUE; presumably the keys
# announcing Twitter search and user-timeline export requests -- confirm
# against the message producer.
SEARCH_ROUTING_KEY = "export.start.twitter.twitter_search"
TIMELINE_ROUTING_KEY = "export.start.twitter.twitter_user_timeline"

# Module-level logger named after this module.
log = logging.getLogger(__name__)
class BaseTwitterStatusTable(BaseTable):
    """
    PETL table of Twitter statuses, with header and rows supplied by twarc's
    json2csv helpers.
    """

    def __init__(self, warc_paths, dedupe, item_date_start, item_date_end,
                 seed_uids, warc_iter_cls, segment_row_size):
        """Hand every argument, unchanged and in order, to BaseTable."""
        BaseTable.__init__(
            self, warc_paths, dedupe, item_date_start, item_date_end,
            seed_uids, warc_iter_cls, segment_row_size)

    def _header_row(self):
        """Return the column headings reported by ``twarc.json2csv``."""
        # NOTE(review): ``twarc`` is not imported in the visible part of this
        # file -- confirm it is brought into scope somewhere.
        headings = twarc.json2csv.get_headings()
        return headings

    def _row(self, item):
        """Return the row twarc builds for ``item`` with ``excel=True``."""
        row = twarc.json2csv.get_row(item, excel=True)
        return row

    def id_field(self):
        """Return the name of the identifier field, ``"id"``."""
        return "id"
class TwitterRestStatusTable(BaseTwitterStatusTable):
    """Status table bound to ``TwitterRestWarcIter`` as its iterator class."""

    def __init__(self, warc_paths, dedupe, item_date_start, item_date_end,
                 seed_uids, segment_row_size=None):
        """
        Forward the arguments to BaseTwitterStatusTable, inserting
        TwitterRestWarcIter as the ``warc_iter_cls`` argument.
        """
        # NOTE(review): TwitterRestWarcIter is not imported in the visible
        # part of this file -- confirm where it is defined.
        iter_cls = TwitterRestWarcIter
        BaseTwitterStatusTable.__init__(
            self, warc_paths, dedupe, item_date_start, item_date_end,
            seed_uids, iter_cls, segment_row_size)
class TwitterRestExporter(BaseExporter):
    """
    Exporter that hands TwitterRestWarcIter and TwitterRestStatusTable to
    BaseExporter.
    """

    def __init__(self, api_base_url, working_path, mq_config=None,
                 warc_base_path=None):
        """
        Initialise BaseExporter with this module's iterator and table classes.

        ``api_base_url`` and ``working_path`` are passed positionally;
        ``mq_config`` and ``warc_base_path`` are passed through by keyword.
        """
        # NOTE(review): TwitterRestWarcIter is not imported in the visible
        # part of this file -- confirm where it is defined.
        BaseExporter.__init__(
            self,
            api_base_url,
            TwitterRestWarcIter,
            TwitterRestStatusTable,
            working_path,
            mq_config=mq_config,
            warc_base_path=warc_base_path)
if __name__ == "__main__":
    # Script entry point: start the exporter on QUEUE with both the search
    # and the user-timeline routing keys.
    routing_keys = [SEARCH_ROUTING_KEY, TIMELINE_ROUTING_KEY]
    TwitterRestExporter.main(TwitterRestExporter, QUEUE, routing_keys)