-
Notifications
You must be signed in to change notification settings - Fork 0
/
createDictOfVariableTypesFromCSV.py
49 lines (37 loc) · 1.7 KB
/
createDictOfVariableTypesFromCSV.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 11 14:42:21 2014

filename: createDictOfVariableTypesFromCSV.py

@author: Misha

input: topxxxivs_CODED.csv
    This CSV file has the following structure:
    the first 2 lines are the column names and the legend;
    lines 3 and on are:
        VAR NAME, number of times used, TYPE, NOTES
    TYPE = {C = CONTINUOUS, CL = CONTINUOUS-LIKE,
            CAT? = CATEGORICAL AND I DON'T KNOW HOW MANY LEVELS,
            # = CATEGORICAL WITH THAT MANY LEVELS}
    NOTES = sometimes there will be a note of the form NUM=adfsdf.., which
            usually specifies the "other" category. That category is not
            missing, but it screws up the continuous-like-ness of the
            variable. Ex: levels 1-5 are ordinal, 6=other.

output: a dict {varname: type}, pickled to variableTypes.pickle
"""
import cPickle as cp
# Raw string: the Windows path contains backslashes (e.g. "\U") that must not
# be interpreted as escape sequences. The trailing '/' is kept so that file
# names can be appended directly.
pathToData = r'C:\Users\Misha\Dropbox\GSS Project\Data/'
fileName = 'top300ivs_CODED.csv'


def _classifyVariable(vartype, notes):
    """Return the value to store in variableTypes for one CSV row.

    vartype -- the TYPE column, already stripped of surrounding whitespace
    notes   -- the NOTES column, already stripped ('' when the row has none)

    Returns 'DONOTUSE' when the notes start with "<digit>=" or when the type
    is 'CAT?'; an int when the type starts with a digit (the number of
    levels); otherwise the type string unchanged.

    Raises ValueError if the type starts with a digit but is not a valid
    integer, so malformed rows are not silently accepted.
    """
    # Slices instead of indexing: an empty notes/type field must not raise
    # IndexError.
    # NOTE(review): only a single leading digit is recognised ("6=other");
    # a two-digit code such as "10=other" is not flagged -- confirm whether
    # that is intended.
    if notes[:1].isdigit() and notes[1:2] == '=':
        return 'DONOTUSE'
    if vartype == 'CAT?':
        return 'DONOTUSE'
    # Convert integer variable types (int = how many levels) to actual ints.
    if vartype[:1].isdigit():
        return int(vartype)
    return vartype


# Read the whole file, closing the handle as soon as the lines are in memory.
with open(pathToData + fileName) as csvFile:
    lines = csvFile.readlines()

# The first two lines hold the column names and the legend, not data.
lines = lines[2:]

variableTypes = {}
for line in lines:
    # Tolerate blank lines (e.g. at the end of the file).
    if not line.strip():
        continue

    # Split into at most 4 fields so that commas inside the free-text notes
    # stay in the notes; strip each field so the trailing newline never ends
    # up in a stored value or breaks the 'CAT?' comparison.
    fields = [field.strip() for field in line.split(',', 3)]
    if len(fields) < 3:
        raise ValueError(
            'expected at least 3 comma-separated fields, got: %r' % line)

    varname = fields[0]
    vartype = fields[2]
    notes = fields[3] if len(fields) > 3 else ''

    variableTypes[varname] = _classifyVariable(vartype, notes)

# Persist the mapping next to the input data; 'wb' because pickles are binary.
with open(pathToData + 'variableTypes.pickle', 'wb') as pickleFile:
    cp.dump(variableTypes, pickleFile)