-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclassCombine.py
41 lines (30 loc) · 1.44 KB
/
classCombine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import pandas as pd
import numpy as np
import sys
from collections import Counter
# Combine the per-run classification tables into a single table and add a
# majority vote per parcel.
#
# The join of the first and second run will include ALL parcels,
# because the outer join of df0 and df1 introduces the 20% subset of
# training parcels that were used in run 0 and not in run 1.


def merge_first_two_runs(df0, df1):
    """Outer-join runs 0 and 1 and keep a single 'klass' column.

    Both frames must carry a 'klass' column.  Rows present only in df1 have
    no 'klass' after the join, so their label is taken from df1.

    Returns a new DataFrame; the inputs are not modified.
    """
    # Column names that occur in both frames (notably 'klass') get the
    # '_1' suffix on the df1 side.
    df = df0.join(df1, how="outer", rsuffix='_1')
    # Records that were not yet in df0 have 'klass' missing (NA), so
    # overwrite with those of 'klass_1'.  The whole column is assigned at
    # once: the chained form `df['klass'].loc[mask] = ...` may write to a
    # temporary object and silently leave `df` unchanged.
    df['klass'] = df['klass'].fillna(df['klass_1'])
    # Drop the now redundant 'klass_1' label.
    return df.drop('klass_1', axis=1)


def join_run(df, dfN):
    """Outer-join one further run onto the combined table.

    The 'klass' column of dfN is left out because the combined table
    already has one.  dfN itself is not modified.
    """
    return df.join(dfN.drop('klass', axis=1), how="outer")


def add_majority_vote(df):
    """Add 'majclass' and 'majcount' columns to df and return it.

    Every column except the first is treated as one vote.  For each row,
    'majclass' is the most frequent value among those columns and
    'majcount' is how often it occurs.  On Python 3.7+ ties go to the
    value that comes first in column order.

    Raises IndexError if df has rows but no vote columns.
    """
    # Select the vote columns by position, before the result columns are
    # added, so the results never take part in the vote.
    # NOTE(review): the script fills missing values with -1 before calling
    # this, so -1 is counted like any other label - confirm this is intended.
    rows = df.iloc[:, 1:].values.tolist()
    # One Counter per row yields both the winning value and its count.
    winners = [Counter(row).most_common(1)[0] for row in rows]
    df['majclass'] = [value for value, _ in winners]
    df['majcount'] = [count for _, count in winners]
    return df


def _read_run(root, i):
    """Read the classification table of run i, indexed by its first column."""
    return pd.read_csv('{}_cropselect_{}_class.csv'.format(root, i),
                       index_col=0, low_memory=False)


def main(argv=None):
    """Combine '<root>_cropselect_<i>_class.csv' for i in 0..nchunks-1.

    argv defaults to sys.argv; argv[1] is the file name root and argv[2]
    the number of runs.  Runs 0 and 1 are always read.  The result is
    written to '<root>_cropselect_classes.csv' with all values cast to int.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        sys.exit('usage: classCombine.py <root> <nchunks>')
    root = argv[1]
    nchunks = int(argv[2])

    # Create the join of the first two runs with a complete 'klass' label.
    df = merge_first_two_runs(_read_run(root, 0), _read_run(root, 1))

    for i in range(2, nchunks):
        # Load the other runs
        print(i)
        df = join_run(df, _read_run(root, i))

    # Overwrite NA values, add the majority vote and save as int
    df = add_majority_vote(df.fillna(-1))
    df.astype(int).to_csv('{}_cropselect_classes.csv'.format(root))


if __name__ == "__main__":
    main()