-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStep2_PerformAggregation.py
313 lines (248 loc) · 15.6 KB
/
Step2_PerformAggregation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
#-------------------------------------------------------------------------------
# Name: Neighboring parcel ownership aggregations
# Purpose: Identify and group neighboring parcels with common ownership
#
# Author: shannon.thol
#
# Created: 03/04/2020
# Copyright: (c) shannon.thol 2020
# Licence: <your licence>
#-------------------------------------------------------------------------------
#load necessary packages
import arcpy
from arcpy import env
import sys, re,csv
sys.path.append(r"C:\gisdata\resources\conda\arcgispro-py3-clone\Lib\site-packages")
from fuzzywuzzy import fuzz
import time
#create initial start time reference
initialStart = time.time()
arcpy.env.overwriteOutput = True
########################################################################################################################################################################################################################
#Criteria for finding neighboring parcels
#NOTE(review): dist and count are not referenced anywhere else in this script - presumably they record how the adjacency tables were generated; confirm
#dist is the maximum distance (polygon edge to edge) to search away from the target parcel when identifying neighbors
dist = "50 Meters" #50 meters will capture immediately adjacent neighbors and those across a right-of-way
#count is the maximum number of features to return when identifying neighbors
count = 1000 #1000 features will capture all neighbors with an extremely generous buffer
#Criteria for aggregating ownerships with fuzzy string matching
#ownThresh is the fuzzy string matching score two owner names must exceed to be classified as part of the same ownership collection
ownThresh = 93
#compLen is the longer/shorter name length ratio at or above which two names are considered too different in length to be compared
compLen = 1.25
#########################################################################################################################################################################################################################
#Project folder that holds every input and output of this step
projDir = r"C:\gisdata\projects\NYParcelAggregation"
#Folder holding the BigQuery adjacency results (one csv file per region)
bqDir = projDir + "\\BigQuery_ny_adjacency_results"
#Single-region alternative kept for reference
##bigQuery = r"C:\gisdata\projects\NYParcelAggregation\BigQuery_ny_adjacency_results\NY_Manhattan_ParcelAdjacency.csv"
#Region names exactly as they are spelled in the csv file names
bqRegions = ("Central", "Manhattan", "NorthEast", "South", "Southeast", "West")
#List of paths to the BigQuery results in csv files
bqResults = [bqDir + "\\NY_" + region + "_ParcelAdjacency.csv" for region in bqRegions]
#Statewide parcel feature class; it is used both as the parcel dataset and as the statewide reference
statewide = projDir + "\\NY_Statewide.gdb\\ny_statewide"
#Path to parcel dataset
parcels = statewide
#Path to statewide parcel dataset
state = statewide
#Path to scratch gdb where all intermediate results will be written
scratch = projDir + "\\scratch.gdb"
#Name of ID field in the parcel dataset
idField = "PARCEL_ID"
#Name of owner name field in the parcel dataset
ownField = "OWNER"
#Name of the collection id field in the parcel dataset
collField = "COLL_ID"
#########################################################################################################################################################################################################################
#Read the BigQuery adjacency results into a dictionary of neighboring parcels
#Each data row is read as (focal parcel id, neighboring parcel id) from its first two columns; both are stored as ints
nghbrDict = dict() ##nghbrDict = {focPar: [nearPar1, nearPar2, nearPar3 ...] ...}
for bigQuery in bqResults:
    print ("loading " + bigQuery)
    #newline="" hands line-ending handling to the csv module, as its documentation recommends
    with open(bigQuery, newline="") as csvFile:
        readCsv = csv.reader(csvFile, delimiter = ',')
        #The first row of every file is skipped (it is treated as the header); the default stops an empty file from raising StopIteration
        next(readCsv, None)
        for row in readCsv:
            #csv.reader returns an empty list for a blank line - skip it instead of failing on row[0]
            if not row:
                continue
            focal = int(row[0])
            nghbr = int(row[1])
            #Start a new neighbor list the first time a focal parcel is seen, then append to it
            nghbrDict.setdefault(focal, []).append(nghbr)
#Lookup of cleaned owner names keyed by integer parcel ID
ownDict = dict() #ownDict = {PARC_ID: PARC_OWN ...}
#Walk the statewide parcel dataset once, recording every parcel ID with its cleaned owner name
print ("Creating reference dictionary of parcel IDs and owner names ...")
with arcpy.da.SearchCursor(state, [idField, ownField]) as cursor:
    for rawId, rawOwn in cursor:
        parcId = int(rawId)
        #Missing owner names and names made only of spaces are stored as an empty string
        if rawOwn is None or rawOwn.replace(" ","") == "":
            ownDict[parcId] = ""
            continue
        #Blank out punctuation, trim the ends, then squeeze runs of spaces down to a single space
        blanked = rawOwn
        for mark in (",", ".", ";", "-", ":"):
            blanked = blanked.replace(mark, " ")
        ownDict[parcId] = re.sub(' +', ' ', blanked.strip())
#Dictionary of collection membership in the form collPar = {coll ID: [parcel ID1, parcel ID2, etc.]}
collPar = dict()
#Dictionary of collection assignments in the form parColl = {parcel ID: coll ID}
#The two dictionaries are kept in step: a parcel is listed in collPar[c] exactly when parColl[parcel] == c
parColl = dict()
#collId is the next unused collection ID
collId = 1
#A progress message is printed for the first parcel and then every progressEvery parcels
progressEvery = 100000

def _cleanOwner(rawOwn):
    """Return rawOwn with , . ; - : blanked out, ends trimmed and runs of spaces collapsed; "" when rawOwn is None or blank."""
    if rawOwn is None:
        return ""
    blanked = rawOwn
    for mark in (",", ".", ";", "-", ":"):
        blanked = blanked.replace(mark, " ")
    return re.sub(' +', ' ', blanked.strip())

def _ownerScore(focOwn, nearOwn):
    """Return the fuzzy match score of two cleaned owner names, or 0 when they should not be compared."""
    shorter, longer = sorted((len(focOwn), len(nearOwn)))
    #An empty name cannot be compared (and would divide by zero in the length ratio below)
    if shorter == 0:
        return 0
    #Names whose longer/shorter length ratio reaches compLen are too dissimilar in length to compare meaningfully
    if longer/shorter >= compLen:
        return 0
    #Use the better of the token-sorted and the plain partial ratio
    return max(fuzz.partial_token_sort_ratio(focOwn, nearOwn), fuzz.partial_ratio(focOwn, nearOwn))

start = time.time()
#Iterate through features in the parcel dataset again, finding neighbors with similar owner names and grouping them into collections
print ("Determining collection assignments ...")
with arcpy.da.SearchCursor(parcels, [idField, ownField]) as cursor:
    for x, row in enumerate(cursor, 1):
        if x == 1 or x % progressEvery == 0:
            if x != 1:
                print(" cumulative duration: " + str(round((time.time()-start)/60.0,1)) + " minutes")
            print("Working on parcel number " + str(x))
        focId = int(row[0])
        focOwn = _cleanOwner(row[1])
        #currColl collects the neighbors whose owner name is similar to the focal owner name
        #A focal parcel with a blank owner name cannot be compared, so its list stays empty
        currColl = list()
        if focOwn != "":
            #A parcel that is absent from the adjacency results is treated as having no neighbors
            for nearId in nghbrDict.get(focId, ()):
                #The adjacency results may pair the focal parcel with itself; skip that pair
                if nearId == focId:
                    continue
                #Neighbors without a known owner name cannot be aggregated based on ownership
                nearOwn = ownDict.get(nearId, "")
                if nearOwn == "":
                    continue
                if _ownerScore(focOwn, nearOwn) > ownThresh:
                    currColl.append(nearId)
        #Collect the collection ids already held by the matched neighbors ...
        colls = {parColl[currMem] for currMem in currColl if currMem in parColl}
        #... and by the focal parcel itself. An earlier focal parcel may already have pulled this parcel into its
        #collection; ignoring that (e.g. when the match is not found again from this side) would give the parcel a
        #second id while collPar still listed it under the first one
        if focId in parColl:
            colls.add(parColl[focId])
        if colls:
            #Reuse the smallest collection id that is already in use
            assignedColl = min(colls)
        else:
            #Nothing is assigned yet (blank owner, no similar neighbors, or all-new neighbors): open a new collection
            assignedColl = collId
            collPar[assignedColl] = []
            collId+=1
        members = collPar[assignedColl]
        #Fold every other collection that was touched into the assigned one and retire its id
        for other in colls:
            if other != assignedColl:
                for reassign in collPar.pop(other):
                    parColl[reassign] = assignedColl
                    members.append(reassign)
        #Whatever is still unassigned (the focal parcel and/or new neighbors) joins the assigned collection
        #The parColl check also keeps a neighbor that is listed twice from being added twice
        for parcel in [focId] + currColl:
            if parcel not in parColl:
                parColl[parcel] = assignedColl
                members.append(parcel)
#Write the collection id of every parcel back to the collection id field of the parcel dataset
print ("Writing results to attribute table ...")
with arcpy.da.UpdateCursor(parcels, [idField, collField]) as upCursor:
    for upRow in upCursor:
        #parColl is keyed by int(parcel id), so the id read here is converted the same way before the lookup
        currParc = int(upRow[0])
        upRow[1] = parColl[currParc]
        upCursor.updateRow(upRow)
#Report the total run time, measured from the initialStart reference taken at the top of the script
finalStop = time.time()
finalElapsed = round(finalStop - initialStart, 1) #seconds
print ('All done!')
#Round after converting to minutes so the message does not show a long floating point tail
print (" took " + str(round(finalElapsed/60.0, 1)) + " mins")