raw_data_processing.py
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os, shutil
# script takes about 6 minutes in total to run
start = 1974 # beginning of record
stop = 2017 # end of record (exclusive; the last year processed is 2016)
dir_path = os.path.dirname(os.path.realpath(__file__))
print(dir_path)
dist_list = open("dist_names.txt").read().splitlines() # read in irrigation
# districts as a list - alphabetical order
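# A sketch of the inputs this script expects, inferred from the reads below
# (file layouts are assumptions, not documented specs):
#   dist_names.txt                    - one irrigation district name per line
#   irrigation_districts_with_comtrs/ - one CSV per district, indexed by the
#                                       COMTRS location codes it contains
#   raw_data/                         - one tab-separated CSV per year holding
#                                       all locations (see process 1)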
################################################################################
# begin process 1 - parse all data by irrigation districts
print('------------------------ process 1 ------------------------')
# the following section of code consolidates the per-year CSVs covering all
# locations into one CSV of all data per irrigation district per year, creating
# a directory for each irrigation district and writing one CSV per year inside it
# time ~ 6 minutes
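# Output layout produced by this process (a sketch derived from the code below):
#   Irrigation District Data/<district>/'<year> <district>'
# where each CSV keeps only the rows of that year's raw data whose COMTRS index
# lies inside the district.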
dir1 = 'Irrigation District Data' # name of first data processing directory
if dir1 in os.listdir(dir_path):
    # quit()  # alternatively, stop here instead of rebuilding
    shutil.rmtree(dir1)  # remove the existing copy so it can be regenerated
os.mkdir(dir1)  # make directory
for file_ in dist_list:  # loop over irrigation districts
    print(file_)
    comtrs = pd.read_csv(dir_path + '/irrigation_districts_with_comtrs/' +
                         file_ + '.csv', index_col=0)  # get locations in irrigation district
    os.chdir(dir_path + '/' + dir1)  # switch from main directory into dir1
    os.mkdir(file_)  # make directory for irrigation district
    os.chdir(dir_path + '/' + dir1 + '/' + file_)  # switch into directory for
    # irrigation district
    for i in np.arange(start, stop):  # loop over years 1974-2016 for this
        # irrigation district
        path = dir_path + '/raw_data/all_data_normalized_year' + str(i)[2:] + '_by_COMTRS.csv'  # path of CSV with all of the data from a given year
        filename = '%d ' % i + file_  # name of CSV: 'year ' + 'irrigation dist'
        df = pd.read_csv(path, index_col=0, sep='\t')  # read in all data CSV
        df.loc[comtrs.index].to_csv(filename, index=True)  # write data in df
        # from irrigation district to a new CSV by using location index from
        # the district
# end process 1
os.chdir(dir_path)
################################################################################
# begin process 2 - sum irrigation district crop acreage totals by year
print('------------------------ process 2 ------------------------')
# the following section of code sums crop acreage over all locations in a given
# irrigation district for each crop type ID, and outputs a summary CSV of the
# per-crop totals for that irrigation district in each year
# time ~ 20 seconds
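# Each summary CSV written here is a single column of totals indexed by crop
# type ID, roughly (values purely illustrative):
#   <crop type ID>,<total acreage in district that year>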
dir2 = 'Irrigation District Summary Data' # name of second data processing
# directory
if dir2 in os.listdir(dir_path):
    # quit()  # alternatively, stop here instead of rebuilding
    shutil.rmtree(dir2)  # remove the existing copy so it can be regenerated
os.mkdir(dir2)  # make directory
for file_ in dist_list:  # loop over irrigation districts
    print(file_)
    os.chdir(dir_path + '/' + dir2)  # switch from main directory into dir2
    os.mkdir(file_)  # make directory for irrigation district
    os.chdir(dir_path + '/' + dir2 + '/' + file_)  # switch into directory for
    # irrigation district
    for i in np.arange(start, stop):  # loop over years 1974-2016 for this
        # irrigation district
        path = dir_path + '/' + dir1 + '/' + file_ + '/%d ' % i + file_  # path of
        # CSV with irrigation district data from a given year
        filename = '%d ' % i + file_  # name of CSV: 'year ' + 'irrigation dist'
        df = pd.read_csv(path, index_col=0, header=0)  # read in irrigation
        # district data CSV
        df.sum(axis=0).to_csv(filename)  # sum data for all locations for each
        # crop type ID
# end process 2
os.chdir(dir_path)
################################################################################
# begin process 3 - construct step 1 time series
print('------------------------ process 3 ------------------------')
# the following section concatenates the yearly data kept so far into time
# series. since there are two sets of crop type codes, one that applies over
# 1974-1989 and another that applies over 1990-2016, two CSVs are created for
# each irrigation district. these are concatenated together in the next step;
# this intermediate step is the last unaltered data product, because the next
# step requires assumptions about crop type code equivalencies.
# time ~ 10 seconds
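# Shapes assembled below, implied by the column slices [:16] and [16:]:
#   df1 - rows are the 1974 crop codes, columns are the 16 years 1974-1989
#   df2 - rows are the 1990 crop codes, columns are the 27 years 1990-2016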
dir3 = 'Irrigation District Time Series' # name of third data processing
# directory
if dir3 in os.listdir(dir_path):
    # quit()  # alternatively, stop here instead of rebuilding
    shutil.rmtree(dir3)  # remove the existing copy so it can be regenerated
os.mkdir(dir3)  # make directory
code74 = pd.read_csv(dir_path+'/codes_1974.csv',index_col=0,header=None) # load
# crop type ID codes that apply from 1974-1989
code90 = pd.read_csv(dir_path+'/codes_1990.csv',index_col=0,header=None) # load
# crop type ID codes that apply from 1990-2016
colum = np.arange(start,stop,1,dtype=int) # creates column header years
for file_ in dist_list:  # loop over irrigation districts
    print(file_)
    df1 = pd.DataFrame(0, index=code74.values.astype(str).T[0],
                       columns=colum.astype(str)[:16])  # initialize dataframe to
    # load summary CSVs into from years 1974-1989
    df2 = pd.DataFrame(0, index=code90.values.astype(str).T[0],
                       columns=colum.astype(str)[16:])  # initialize dataframe to
    # load summary CSVs into from years 1990-2016
    for i in np.arange(start, stop):  # loop over years 1974-2016 for this
        # irrigation district
        path = dir_path + '/' + dir2 + '/' + file_ + '/%d ' % i + file_  # path of
        # CSV with irrigation district summary data from a given year
        roast = pd.read_csv(path, index_col=0, header=None)  # loads summary data
        rindex = np.array(roast.index, dtype=int).astype(str)  # loads index
        rindex = rindex[(rindex.astype(int) <= 99999) & (rindex.astype(int) >= 0)]
        # cleans index to be in-bound values
        if i < 1990:
            df1.loc[rindex, [str(i)]] = roast.loc[rindex.astype(int)].values.astype(int)
            # loads 1974-1989 data into df1
        else:
            df2.loc[rindex, [str(i)]] = roast.loc[rindex.astype(int)].values.astype(int)
            # loads 1990-2016 data into df2
    name1 = '1974-1989 ' + file_  # name df1 file output
    name2 = '1990-2016 ' + file_  # name df2 file output
    os.mkdir(dir_path + '/' + dir3 + '/' + file_)  # make irrigation district
    # folder underneath dir3 directory
    os.chdir(dir_path + '/' + dir3 + '/' + file_)  # change into irrigation
    # district folder underneath dir3 directory
    df1.to_csv(name1)  # write 1974-1989 time series data to CSV
    df2.to_csv(name2)  # write 1990-2016 time series data to CSV
# end process 3
os.chdir(dir_path)
################################################################################
# begin process 4 - construct whole period 1974-2016 time series
print('------------------------ process 4 ------------------------')
# the following section concatenates the two time series data sets (1974-1989
# and 1990-2016) into a single 1974-2016 record. this requires a library of
# code equivalencies, which had to be constructed by inspection.
# time ~ 20 seconds
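# Assumed structure of code_equiv.csv, inferred from the loop below (column
# semantics are not documented, so treat this as a sketch): the index holds the
# master code labels used for the final output, a nonzero 'code1' flags the
# codes kept as active, 'site_code_1990_2016' gives the matching 1990-2016
# code, and the remaining columns list the 1974-1989 code(s) folded into it,
# with 0 meaning no equivalent.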
dir4 = 'Irrigation District Data - Final' # name of fourth data processing
# directory
if dir4 in os.listdir(dir_path):
    # quit()  # alternatively, stop here instead of rebuilding
    shutil.rmtree(dir4)  # remove the existing copy so it can be regenerated
os.mkdir(dir4)  # make directory
match = pd.read_csv(dir_path + '/code_equiv.csv', index_col=0,
                    header=0).fillna(0).astype(int)  # loads the
# CSV that equates 1974-1989 codes to 1990-2016 codes. note that code
# equivalencies are not always one-to-one, hence the conditional statements in
# the loop below
active = list(match.index[match['code1'].values != 0]) # creates index of
# current codes used for final df3.to_csv command
m2 = match.set_index(['site_code_1990_2016'])  # re-index the equivalency table
# by the 1990-2016 site code so the loop below iterates over the new codes
for file_ in dist_list:  # loop over irrigation districts
    print(file_)
    path1 = dir_path + '/' + dir3 + '/' + file_ + '/1974-1989 ' + file_  # path
    # to 1974-1989 time series data
    path2 = dir_path + '/' + dir3 + '/' + file_ + '/1990-2016 ' + file_  # path
    # to 1990-2016 time series data
    df1 = pd.read_csv(path1, index_col=0, header=0)  # loads 1974-1989 data
    df2 = pd.read_csv(path2, index_col=0, header=0)  # loads 1990-2016 data
    new_yrs = list(map(str, np.arange(start, stop)))  # string list of 1974-2016
    df3 = pd.DataFrame(columns=new_yrs)  # create empty df3 w/ 1974-2016 columns
    count = 0  # initialize count once per irrigation district
    for index, row in m2.iterrows():
        # fold each older (1974-1989) code into its equivalent newer (1990-2016)
        # code, reading the equivalencies row by row; a 0 entry means there is
        # no further code to add
        if row.values[0] != 0:
            df2.loc[row.values[0]] += df2.loc[index]
        if row.values[2] != 0:
            df1.loc[row.values[1]] += df1.loc[row.values[2]]
        if row.values[3] != 0:
            df1.loc[row.values[1]] += df1.loc[row.values[3]]
        if row.values[4] != 0:
            df1.loc[row.values[1]] += df1.loc[row.values[4]]
        if row.values[5] != 0:
            df1.loc[row.values[1]] += df1.loc[row.values[5]]
        if row.values[1] != 0:
            df3.loc[match.index[count]] = np.append(
                [df1.loc[row.values[1]].values], [df2.loc[index].values])
        count += 1  # increment count
    path3 = dir_path + '/' + dir4 + '/' + file_
    df3 = df3.loc[active]  # pare df3 down to current crop codes
    df3.to_csv(path3)  # write irrigation district time series to CSV
# end process 4
os.chdir(dir_path)
print('------------------------ end script ------------------------')
# end script