retrieve_data.py
'''
This module retrieves the names of people in Facebook groups that
relate to Brazil and Portugal using the Facebook Graph API.
The names are appended to three CSV files (training.csv, testing.csv,
and eval.csv) where the first column is the first and middle names,
the second column is the last name, and the third column is a label
(1 for the Brazil-related groups, 0 otherwise).
To get an access token, go to https://developers.facebook.com/tools/explorer
Used for a machine learning project to distinguish Brazilian and Portuguese names.
'''
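# A minimal usage sketch (assuming a valid token has been pasted into
# `key` below); the --limit value caps the names gathered per label:
#
#   python retrieve_data.py --limit 5000
#
# This appends up to 5000 labeled names per get_names() call to
# training.csv, testing.csv, and eval.csv in the current directory.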
import http.client
import json
import csv
import argparse
import regex  # third-party 'regex' package; supports \p{Latin}, unlike stdlib re
# Graph API access token; tokens from the Graph API Explorer are
# short-lived, so paste a fresh one here before running.
key = "EAACEdEose0cBALsqxTaHjAVNZCZCt3aixAuJKCTdG1yG4eOLSH1ojtQRGUObeXU7MIEuT23PlwbAFbhliZBrWU1ULwzMqPrbGY4qMnnVDrksWxVpihZB5PyCrbIMDxXTDXKCUhF3WiGrthoKUOdMATPRgOwk8qNo77VgYyhcmcZCmYqH9JJhhUAvVVBVfD4UZD"
def get_names(group_ids, limit, value):
    # newline="" keeps the csv module from inserting blank rows, and
    # utf-8 preserves the accented characters common in Portuguese names.
    f_training = open("training.csv", "a", newline="", encoding="utf-8")
    writer_training = csv.writer(f_training, quoting=csv.QUOTE_NONE)
    f_testing = open("testing.csv", "a", newline="", encoding="utf-8")
    writer_testing = csv.writer(f_testing, quoting=csv.QUOTE_NONE)
    f_eval = open("eval.csv", "a", newline="", encoding="utf-8")
    writer_eval = csv.writer(f_eval, quoting=csv.QUOTE_NONE)
    count = 0
    for group_id in group_ids:
        conn = http.client.HTTPSConnection("graph.facebook.com")
        next_link = ""
        while next_link is not None:
            conn.request("GET", "/v2.10/" + group_id + "/members?access_token="
                         + key + "&fields=name&after=" + next_link)
            res = conn.getresponse()
            data = res.read()
            try:
                json_data = json.loads(data)
            except json.JSONDecodeError:
                print("JSON data failed to load. Did you add the access token?")
                return
            if "data" not in json_data:
                print(json_data["error"]["message"])
                return
            for user in json_data["data"]:
                data_name = user["name"].lower()
                pre_name = data_name.split()
                # Everything before the final token counts as first/middle names.
                fname = ' '.join(pre_name[:-1]).replace(",", " ")
                lname = pre_name[-1].replace(",", " ")
                # Strip anything that is not a Latin-script letter (digits,
                # emoji, punctuation); keep the spaces between first and
                # middle names.
                fname = regex.sub(r'[^\p{Latin} ]', '', fname)
                lname = regex.sub(r'[^\p{Latin}]', '', lname)
                if fname == "" or lname == "":
                    continue
                name = [fname, lname, value]
                # Split: every 10th name goes to testing, every other 5th
                # to eval, and the remaining ~80% to training.
                if count % 10 == 0:
                    writer_testing.writerow(name)
                elif count % 5 == 0:
                    writer_eval.writerow(name)
                else:
                    writer_training.writerow(name)
                count += 1
                if count >= limit:
                    f_training.close()
                    f_testing.close()
                    f_eval.close()
                    return
            # Follow the pagination cursor to the next page of members.
            if "paging" in json_data:
                next_link = str(json_data["paging"]["cursors"]["after"])
            else:
                next_link = None
    f_training.close()
    f_testing.close()
    f_eval.close()
    return
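# For reference, get_names() expects each Graph API page to look roughly
# like the following (field values here are hypothetical):
#
#   {
#       "data": [{"name": "João Silva"}, ...],
#       "paging": {"cursors": {"before": "...", "after": "..."}}
#   }
#
# The "after" cursor is fed back into the next request until "paging"
# disappears from the response.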
# These are the group ids of the groups we are taking names from.
brazil_ids = ["422121401312686", "435932449946192", "538824349628836", "248974475276092"]
not_brazil_ids = ["369769286554402", "135263893484028", "716828768412499"]
# Default limit is one million names per call.
limit = 1000000
# Command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument("-l", "--limit", type=int,
                    help="Specify a limit for how many names to gather")
args = parser.parse_args()
if args.limit:
    limit = args.limit
# Names from the Brazil-related groups are labeled 1, the rest 0.
get_names(brazil_ids, limit, 1)
get_names(not_brazil_ids, limit, 0)
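# With the split above, the resulting CSV rows look like the following
# (hypothetical example):
#
#   joão pedro,silva,1
#   maria,santos,0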