-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsimpleGenderPredictor.py
69 lines (58 loc) · 2.13 KB
/
simpleGenderPredictor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""
Download baby-name data from the U.S. Social Security Administration website and use historical name frequencies to predict gender.
Modified from genderPredictor by Stephen Holiday
Feng Mai [email protected]
2016-02-17
"""
from __future__ import absolute_import, division, print_function
from six.moves import urllib
import os
from zipfile import ZipFile
import csv
import io
class simpleGenderPredictor():
    """Predict gender from a first name using SSA baby-name counts.

    On construction the predictor loads ``names.zip`` (downloading it into
    the current working directory first if it is not already there) and
    builds ``name_dict``, a mapping of upper-cased name to a two-element
    list ``[male_count, female_count]`` summed over every ``.txt`` member
    of the archive.
    """

    # Remote archive and the relative path it is cached under.
    NAMES_URL = 'https://www.ssa.gov/oact/babynames/names.zip'
    NAMES_ZIP = 'names.zip'

    # Column index inside each name_dict value for a gender code.
    GENDER_INDEX = {'M': 0, 'F': 1}

    def __init__(self):
        """Build the name dictionary (may download the archive)."""
        self.name_dict = self.extractNamesDict()

    def downloadNames(self):
        """Fetch the names archive from ssa.gov and save it as names.zip."""
        urllib.request.urlretrieve(self.NAMES_URL, self.NAMES_ZIP)

    def extractNamesDict(self):
        """
        download names.zip from SSA if necessary
        construct a dict from SSA name data: NAME: [number of M, number of F]

        Every archive member whose name ends in ``.txt`` is parsed as
        comma-separated rows of ``name, gender code, count``; other members
        are ignored. Blank rows are skipped. A row with an unknown gender
        code, a missing field or a non-integer count still raises
        (KeyError / IndexError / ValueError), so corrupt data is not
        silently accepted.

        Returns:
            dict mapping upper-cased name -> [male_count, female_count].
        """
        if not os.path.exists(self.NAMES_ZIP):
            print('names.zip does not exist, downloading from ssa.gov')
            self.downloadNames()
        else:
            print('names.zip exists, not downloading')

        names = dict()
        # Context managers guarantee the archive and each member are closed
        # even when a malformed row raises part-way through.
        with ZipFile(self.NAMES_ZIP, 'r') as archive:
            for member in archive.namelist():
                if not member.endswith('.txt'):
                    continue
                with io.TextIOWrapper(
                        archive.open(member, 'r'),
                        encoding="latin-1") as text:
                    for row in csv.reader(text, delimiter=','):
                        if not row:
                            # csv.reader yields [] for a blank line.
                            continue
                        name = row[0].upper()
                        gender = self.GENDER_INDEX[row[1]]
                        count = int(row[2])
                        counts = names.setdefault(name, [0, 0])
                        counts[gender] += count
        print('name dictionary constructed')
        return names

    def predict_name(self, a_name):
        """
        Outputs a tuple: prediction, probability from historical data

        The lookup is case-insensitive. Returns ``('Unknown', None)`` when
        the name is not in ``name_dict`` (or its recorded counts are both
        zero). Otherwise returns ``('M', p)`` or ``('F', p)`` where ``p`` is
        the share of the winning gender; a tie is reported as ``'M'``.
        """
        freq = self.name_dict.get(a_name.upper())
        if freq is None:
            return 'Unknown', None
        males, females = freq[0], freq[1]
        total = males + females
        if total == 0:
            # No observations at all: avoid dividing by zero.
            return 'Unknown', None
        # True division relies on the module-level
        # `from __future__ import division` when run under Python 2.
        if males >= females:
            return 'M', males / total
        return 'F', females / total