# test_missing.py
import argparse
import logging
import os
from logging import info, warning
from pathlib import Path
from typing import List, Tuple, Union

import geopandas as gpd
import pandas as pd
from joblib import Parallel, delayed
from shapely.geometry import MultiPolygon, Polygon

from prclz.blocks.methods import BufferedLineDifference

from midway_blocks import extract
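# NOTE: extract() is assumed to be the block-extraction worker defined in the sibling
# midway_blocks.py, with the same signature as the reference version commented out below.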

def get_gadm_level_column(gadm: gpd.GeoDataFrame, level: int) -> Tuple[str, int]:
    """Return the finest available GID_<level> column name and the level actually used."""
    gadm_level_column = "GID_{}".format(level)
    while gadm_level_column not in gadm.columns and level > 0:
        warning("GID column for GADM level %s not found, trying with level %s", level, level - 1)
        level -= 1
        gadm_level_column = "GID_{}".format(level)
    info("Using GID column for GADM level %s", level)
    return gadm_level_column, level
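# Illustrative example (assuming the Malawi shapefile actually carries a GID_3 column):
#     get_gadm_level_column(gadm, 3)  ->  ("GID_3", 3)
# If GID_3 were absent, the loop would fall back to GID_2, then GID_1, and so on.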
# Reference implementation of extract(), kept (commented out) for comparison with the
# version imported from midway_blocks:
# def extract(linestrings: gpd.GeoDataFrame, index: str, geometry: Union[Polygon, MultiPolygon], ls_idx: List[int], output_dir: Path) -> None:
#     # minimize synchronization barrier by constructing a new extractor
#     block_polygons = BufferedLineDifference().extract(geometry, linestrings.iloc[ls_idx].unary_union)
#     print("Type is : ", type(block_polygons))
#     blocks = gpd.GeoDataFrame(
#         [(index + "_" + str(i), polygon) for (i, polygon) in enumerate(block_polygons)],
#         columns=["block_id", "geometry"])
#     blocks = blocks.set_index("block_id")
#     filename = os.path.join(str(output_dir), "blocks_{}.csv".format(index))
#     blocks.to_csv(filename)
#     info("Serialized blocks from %s to %s", index, filename)
# DATA PATHS
level = 3
gadm_path = "../data/GADM/MWI/gadm36_MWI_3.shp"
linestrings_path = "../data/geojson/Africa/malawi_lines.geojson"
blocks_path = "../data/blocks/Africa/MWI"
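# The paths above are relative to this script: GADM level-3 admin boundaries for Malawi,
# a GeoJSON of linestrings for Malawi, and the directory of previously generated
# blocks_<GID>.csv outputs.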
# All blocks files currently output for Malawi
all_blocks_files = [x.replace("blocks_", "").replace(".csv", "") for x in os.listdir(blocks_path)]
all_blocks_files = pd.DataFrame({'out_files':all_blocks_files})
all_blocks_files['block_file_index'] = range(all_blocks_files.shape[0])
all_blocks_files.set_index('out_files', inplace=True)
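# block_file_index is just a positional counter; only its presence after the left join
# below matters, i.e. whether a blocks_<GID>.csv already exists for a given GADM unit.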
# Now do the actual block-ing for Malawi
gadm = gpd.read_file(gadm_path)
linestrings = gpd.read_file(linestrings_path)
gadm_column, level = get_gadm_level_column(gadm, level)
# Index GADM units by their GID code so the joins below align on it.
gadm = gadm.set_index(gadm_column)
info("Overlaying GADM boundaries on linestrings.")
overlay = gpd.sjoin(gadm, linestrings, how="left", op="intersects")\
    .groupby(lambda idx: idx)["index_right"]\
    .agg(list)
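# overlay maps each GADM unit (keyed by its GID) to the list of linestring row indices
# that intersect it. Note: newer geopandas releases use predicate= instead of op=.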
info("Aggregating linestrings by GADM-%s delineation.", level)
gadm_aggregation = gadm.join(overlay)[["geometry", "index_right"]]\
    .rename({"index_right": "linestring_index"}, axis=1)
gadm_aggregation = gadm_aggregation.join(all_blocks_files, how='left')
gadm_aggregation['success'] = gadm_aggregation['block_file_index'].notnull()
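# 'success' is True for GADM units that already have a blocks file on disk;
# the False rows are the "missing" units this script re-runs.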
# Make a smaller dataset of both failures and successes
cols = ['geometry', 'linestring_index']
test_fail = gadm_aggregation[~gadm_aggregation['success']][cols].iloc[0:10]
test_success = gadm_aggregation[gadm_aggregation['success']][cols].iloc[0:10]
# Run through the successes, for reference
# for index, geometry, ls_idx in test_success.itertuples():
#     extract(linestrings, index, geometry, ls_idx, "small_test")

# Run through the failures, which are the cases this script is meant to reproduce
for index, geometry, ls_idx in test_fail.itertuples():
    extract(linestrings, index, geometry, ls_idx, "small_test")
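# Assuming the imported extract() mirrors the reference implementation above, each call
# writes a blocks_<GID>.csv under "small_test" for one of the units missing from blocks_path.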
# Commented-out CLI entry points, kept for reference:
#
# def main(gadm_path, linestrings_path, output_dir, level, parallelism):
#     info("Reading geospatial data from files.")
#     gadm = gpd.read_file(str(gadm_path))
#     linestrings = gpd.read_file(str(linestrings_path))
#     info("Setting up indices.")
#     gadm_column, level = get_gadm_level_column(gadm, level)
#     gadm = gadm.set_index(gadm_column)
#     info("Overlaying GADM boundaries on linestrings.")
#     overlay = gpd.sjoin(gadm, linestrings, how="left", op="intersects")\
#         .groupby(lambda idx: idx)["index_right"]\
#         .agg(list)
#     info("Aggregating linestrings by GADM-%s delineation.", level)
#     gadm_aggregation = gadm.join(overlay)[["geometry", "index_right"]]\
#         .rename({"index_right": "linestring_index"}, axis=1)
#     extractor = BufferedLineDifference()
#     info("Extracting blocks for each delineation using method: %s.", extractor)
#     Parallel(n_jobs=parallelism, verbose=50)(
#         delayed(extract)(linestrings, index, geometry, ls_idx, output_dir)
#         for (index, geometry, ls_idx) in gadm_aggregation.itertuples())
#     info("Done.")
#
# def setup(args=None):
#     # logging
#     logging.basicConfig(format="%(asctime)s/%(filename)s/%(funcName)s | %(levelname)s - %(message)s", datefmt='%Y-%m-%d %H:%M:%S')
#     logging.getLogger().setLevel("INFO")
#     # read arguments
#     parser = argparse.ArgumentParser(description='Run parcelization workflow on midway2.')
#     parser.add_argument('--gadm', required=True, type=Path, help='path to GADM file', dest="gadm_path")
#     parser.add_argument('--linestrings', required=True, type=Path, help='path to linestrings', dest="linestrings_path")
#     parser.add_argument('--output', required=True, type=Path, help='path to output', dest="output_dir")
#     parser.add_argument('--level', default=3, type=int, help='GADM level to use')
#     parser.add_argument('--parallelism', default=4, type=int, help='number of cores to use')
#     return parser.parse_args(args)
#
# if __name__ == "__main__":
#     main(**vars(setup()))