# -*- coding: utf-8 -*-
"""
Created on Sat Mar 10 13:58:30 2018
@author: lengchun
"""
import pandas as pd
import numpy as np
import random
import os
import argparse
import sys
import ast
import pdb


# Check whether a value (typically a string read from the spreadsheet) represents an int
def repsInt(test_str):
    try:
        int(test_str)
        return True
    except (ValueError, TypeError):
        return False
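
# For example (a quick sanity check, not part of the pipeline):
#   repsInt("12000")   -> True
#   repsInt("Alabama") -> False
#   repsInt("1,200")   -> False  (comma-formatted numbers are handled separately below)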


def parse_args():
    # Create the parser and add arguments specifying where the data is and how to read it
    roi = np.arange(3, 40, 4)  # note: unused below; mirrors the default column indices
    parser = argparse.ArgumentParser(description="")
    parser.add_argument('-f', '--filename', default='data/HSG/HSG01.xls',
                        help="Dataset spreadsheet", required=False)
    parser.add_argument('-c', '--columns_to_read', default='[ 3, 7, 11, 15, 19, 23, 27, 31, 35, 39]',
                        help="Which columns to read from the spreadsheet", required=False)
    parser.add_argument('-b', '--is_b_vec', default='True',
                        help="Whether we are trying to find the b vector", required=False)
    parser.add_argument('-w', '--work_sheet_to_read', default='HSG01A',
                        help="Worksheet to read from the Excel file", required=False)
    return parser
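
# A sketch of how the script might be invoked; the worksheet name and column
# indices depend on which HSG spreadsheet is being processed:
#   python build_dataset_census.py -f data/HSG/HSG01.xls -w HSG01A \
#       -c '[3, 7, 11, 15, 19, 23, 27, 31, 35, 39]' -b False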


def main(args):
    # Check that the dataset is where we expect it to be
    # pdb.set_trace()
    assert os.path.isfile(args.filename), "Couldn't find the dataset at {}".format(args.filename)
    filename = args.filename
    all_columns = ast.literal_eval(args.columns_to_read)

    # Read data from the spreadsheet
    if filename.endswith('.xls') or filename.endswith('.xlsx'):
        # current pandas uses the keyword sheet_name (formerly sheetname)
        df = pd.read_excel(filename, sheet_name=args.work_sheet_to_read)
    elif filename.endswith('.csv'):
        df = pd.read_csv(filename)
    else:
        raise ValueError("Unsupported file type: {}".format(filename))

    # Convert from pandas DataFrame to numpy array
    full_array = df.values
    array_size = full_array.shape
    A = []
    # Used later to detect the first data row, so that each selected column of that
    # row starts its own list in A; initialised to a sentinel larger than any row index
    first_data_ix = 100000
    if args.is_b_vec == "False":
        # Loop over the rows of the full array
        for i in range(0, array_size[0]):
            if repsInt(full_array[i, 1]):
                # If the county part of the code in the 2nd column is 0 (i.e. the code is a
                # positive multiple of 1000), the row is a state-level total. This is what
                # we want. Is this the case in other datasets?
                if (int(full_array[i, 1]) % 1000 == 0) and (int(full_array[i, 1]) > 0):
                    for col_num in range(len(all_columns)):
                        col = all_columns[col_num]
                        # Check whether the cell is a string. If not, cast directly to float
                        if type(full_array[i, col]) is str:
                            temp_str = full_array[i, col]
                            # The data has thousands-separator commas; remove them to cast to float
                            try:
                                temp = float(temp_str.replace(',', ''))
                            except AttributeError:
                                print('Trying to use replace on temp_str when it is not a str')
                                pdb.set_trace()
                        else:
                            temp = float(full_array[i, col])
                        # Start a new column list in A on the first data row;
                        # otherwise append to the existing column list
                        if not A or (i == first_data_ix):
                            A.append([temp])
                            first_data_ix = i
                        else:
                            A[col_num].append(temp)
    else:
        print('Building the b vector: reading all rows of the worksheet')
        for i in range(0, array_size[0]):
            for col_num in range(len(all_columns)):
                col = all_columns[col_num]
                if type(full_array[i, col]) is str:
                    temp_str = full_array[i, col]
                    # The data has thousands-separator commas; remove them to cast to float
                    try:
                        temp = float(temp_str.replace(',', ''))
                    except AttributeError:
                        print('Trying to use replace on temp_str when it is not a str')
                        pdb.set_trace()
                else:
                    temp = float(full_array[i, col])
                # Start a new column list in A on the first data row;
                # otherwise append to the existing column list
                # pdb.set_trace()
                if not A or (i == first_data_ix):
                    A.append([temp])
                    first_data_ix = i
                else:
                    A[col_num].append(temp)
    # pdb.set_trace()
    return A


if __name__ == '__main__':
    parser = parse_args()
    args = parser.parse_args()
    main(args)
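
# Note on the return value: A is column-major, i.e. A[k] holds the values of column
# all_columns[k] across the selected rows. A minimal sketch of converting it to a
# row-major numpy array (assuming every column list ends up the same length):
#   A = main(args)
#   A_mat = np.array(A).T  # shape: (number of data rows, number of selected columns)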