-
Notifications
You must be signed in to change notification settings - Fork 0
/
s4_validation.py
153 lines (103 loc) · 5.06 KB
/
s4_validation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import os
import errno
import rasterio
import pandas as pd
import numpy as np
from scipy.stats.stats import pearsonr
"""
specify survey layer (with year) to validate
load s4_surface for same year (use json or fixed args to determine which surface?)
compare point (or buffer zs?) value of surface to survey value for each survey point
output dataframe of survey value, point value, buffer value along with percent differences? (include lat/lon to map errors)
"""
def make_dir(path):
try:
os.makedirs(path)
except OSError as exception:
if exception.errno != errno.EEXIST:
raise
def surface_win_mean(src, dim):
r, c = src.index(row.lon, row.lat)
win = ((r-dim/2, r+dim/2), (c-dim/2, c+dim/2))
data = src.read(1, window=win, boundless=True)
win_val = np.mean(data)
return win_val
def sign(val):
if val > 0:
return 1
elif val < 0:
return -1
else:
return 0
# survey_path = "/sciclone/aiddata10/REU/projects/mcc_tanzania/data/surveys/final/tanzania_2015_dhs_cluster.csv"
# cnn_surface_path = "/sciclone/aiddata10/REU/projects/mcc_tanzania/output/s4_surface/surface_cnn_ridge-cv10_ca5cf25_2810232_adm0_2015_v10_p10_m10_s10.tif"
survey_path = "/sciclone/aiddata10/REU/projects/mcc_ghana/data/surveys/final/ghana_2014_dhs_cluster.csv"
cnn_surface_path = "/sciclone/aiddata10/REU/projects/mcc_ghana/output/s4_surface/surface_cnn_ridge-cv10_e8b3cac_8f3999d_adm0_2014_v10_p10_m10_s10.tif"
ntl_surface_path = "/sciclone/aiddata10/REU/projects/mcc_ghana/output/s4_surface/surface_ntl_ridge-cv10_e8b3cac_8f3999d_adm0_2014_v10_p10_m10_s10.tif"
# -------------------------------------
cnn_surface_src = rasterio.open(cnn_surface_path, 'r')
ntl_surface_src = rasterio.open(ntl_surface_path, 'r')
# /////////
# tmp to run this without rewriting a bunch
cnn_surface_path = ntl_surface_path
cnn_surface_src = ntl_surface_src
# /////////
survey_df = pd.read_csv(survey_path)
validation_data = []
for i, row in survey_df.iterrows():
validation_data.append({
"point": cnn_surface_src.sample([(row.lon, row.lat)]).next()[0],
"dim3": surface_win_mean(cnn_surface_src, 3),
"dim16": surface_win_mean(cnn_surface_src, 16),
"dim33": surface_win_mean(cnn_surface_src, 33),
})
validation_df = pd.DataFrame(validation_data)
wealthscore_sign = survey_df.wealthscore.apply(lambda x: sign(x))
print "Survey points: {}".format(len(survey_df))
for i in validation_df.columns:
survey_df[i] = validation_df[i]
survey_df[i+"err"] = (survey_df.wealthscore - survey_df[i]) / survey_df.wealthscore
survey_df[i+"sign"] = (wealthscore_sign != survey_df[i].apply(lambda x: sign(x))).astype(int)
print i
print "\tcorr\t\t{}".format(round(pearsonr(survey_df.wealthscore, survey_df[i])[0], 5))
print "\tavg_abs_err\t{}".format(round(np.mean(np.abs(survey_df[i+"err"])), 5))
print "\tinv_sign\t{}".format(np.sum(survey_df[i+"sign"]))
final_cols = ["lon", "lat", "wealthscore"]
for i in validation_df.columns:
final_cols.append(i)
final_cols.append(i+"err")
final_cols.append(i+"sign")
survey_df = survey_df[final_cols]
validation_dir = os.path.dirname(os.path.dirname(cnn_surface_path)) + "/s4_validation"
validation_fname = os.path.basename(cnn_surface_path)[:-4] + "_" + os.path.basename(survey_path)
validation_path = os.path.join(validation_dir, validation_fname)
make_dir(validation_dir)
survey_df.to_csv(validation_path, index=False)
# -----------------------------------------------------------------------------
raise Exception("Not running second half")
import os
import rasterio
import numpy as np
# surface_a_path = "/sciclone/aiddata10/REU/projects/mcc_tanzania/output/s4_surface/surface_cnn_ridge-cv10_ca5cf25_2810232_adm0_2010_v10_p10_m10_s10.tif"
# surface_b_path = "/sciclone/aiddata10/REU/projects/mcc_tanzania/output/s4_surface/surface_cnn_ridge-cv10_ca5cf25_2810232_adm0_2015_v10_p10_m10_s10.tif"
surface_a_path = "/sciclone/aiddata10/REU/projects/mcc_ghana/output/s4_surface/surface_cnn_ridge-cv10_e8b3cac_8f3999d_adm0_2008_v10_p10_m10_s10.tif"
surface_b_path = "/sciclone/aiddata10/REU/projects/mcc_ghana/output/s4_surface/surface_cnn_ridge-cv10_e8b3cac_8f3999d_adm0_2014_v10_p10_m10_s10.tif"
surface_a_src = rasterio.open(surface_a_path, 'r')
surface_b_src = rasterio.open(surface_b_path, 'r')
surface_a = surface_a_src.read(1)
surface_b = surface_b_src.read(1)
flat_a = surface_a.flatten()
flat_b = surface_b.flatten()
c1 = pearsonr(flat_a, flat_b)[0]
c2 = np.corrcoef(flat_a, flat_b)[0,1]
raw_diff = surface_a - surface_b
abs_diff = np.abs(raw_diff)
validation_dir = os.path.dirname(os.path.dirname(surface_a_path)) + "/s4_validation"
raw_diff_path = os.path.join(validation_dir, "raw_diff_" + os.path.basename(surface_a_path))
abs_diff_path = os.path.join(validation_dir, "abs_diff_" + os.path.basename(surface_a_path))
meta = surface_a_src.profile
meta["crs"] = rasterio.crs.CRS.from_epsg(4236)
with rasterio.open(raw_diff_path, 'w', **meta) as result:
result.write(np.array([raw_diff]))
with rasterio.open(abs_diff_path, 'w', **meta) as result:
result.write(np.array([abs_diff]))