04_inexact_duplicate_checks.py

import pandas as pd
import numpy as np
from tqdm import trange
import os
from helpers import get_profile_st_en_idx

# Need to do inexact duplicate checking if there are bottle
# and CTD data from the same time/location.
# Keep the CTD data since it is higher resolution.


def coords_are_close(lat1, lon1, lat2, lon2, criteria):
    # Bounding box criteria for observation coordinates being close together
    return abs(lat1 - lat2) < criteria and abs(lon1 - lon2) < criteria


def time_is_close(t1: pd.Timestamp, t2: pd.Timestamp, criteria):
    # Take the absolute time difference and check whether it is less than the criteria
    return abs(t2 - t1) < pd.to_timedelta(criteria)
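
# Example (hypothetical values; 'criteria' for time_is_close is any string that
# pandas.to_timedelta() accepts, e.g. '1 hour'):
#   coords_are_close(48.65, -126.67, 48.66, -126.70, 0.5)  -> True
#   time_is_close(pd.Timestamp('2015-06-01 10:00'), pd.Timestamp('2015-06-01 10:40'), '1 hour')  -> True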


def is_ios_profile(fname: str):
    """Detect whether a netCDF file is from IOS, assuming it has the format YYYY-AAA-BBB.qrx.nc,
    where YYYY is the year, AAA the cruise number, and BBB the event number.
    qrx is the instrument type: ctd, che, or bot."""
    fname_no_ext = fname.split('.')[0]
    return all([x.isdigit() for x in fname_no_ext.split('-')]) and len(fname.split('.')) == 3
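
# Example (hypothetical filenames):
#   is_ios_profile('2015-009-0042.ctd.nc')  -> True
#   is_ios_profile('P4_cast_007.nc')        -> False (does not match the YYYY-AAA-BBB.qrx.nc pattern)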


def ios_profiles_match(fname1: str, fname2: str):
    """
    Identify IOS profile duplicates from the same cast but a different instrument type
    (between ctd, che (chemistry), and bot).
    :param fname1: name of the first netCDF file
    :param fname2: name of the second netCDF file
    :return: True if both files are IOS profiles from the same cast
    """
    return (is_ios_profile(fname1) and is_ios_profile(fname2)
            and fname1.split('.')[0] == fname2.split('.')[0])


def run_check(input_file_path, output_file_path, sampling_station):
    # Within a set of duplicates, check which profile is longer
    # and flag the shorter one as an inexact duplicate
    dfin = pd.read_csv(input_file_path)
    profile_start_idx, profile_end_idx = get_profile_st_en_idx(dfin.loc[:, 'Profile number'])

    # Convert time to pandas datetime format
    dfin['Time_dt'] = pd.to_datetime(dfin.Time)

    # Add column containing flags
    dfin['Inexact_duplicate_flag'] = np.zeros(len(dfin), dtype='int32')

    for i in trange(len(profile_start_idx)):
        start_idx_i = profile_start_idx[i]
        end_idx_i = profile_end_idx[i]
        time_i = dfin.loc[start_idx_i, 'Time_dt']
        lat_i = dfin.loc[start_idx_i, 'Latitude [deg N]']
        lon_i = dfin.loc[start_idx_i, 'Longitude [deg E]']
        file_i = dfin.loc[start_idx_i, 'File']
        # Iterate through the rest of the profiles
        for j in range(i + 1, len(profile_start_idx)):
            start_idx_j = profile_start_idx[j]
            end_idx_j = profile_end_idx[j]
            # Skip profiles that were already flagged in an earlier iteration
            if dfin.loc[start_idx_j, 'Inexact_duplicate_flag'] == 1:
                continue
            time_j = dfin.loc[start_idx_j, 'Time_dt']
            lat_j = dfin.loc[start_idx_j, 'Latitude [deg N]']
            lon_j = dfin.loc[start_idx_j, 'Longitude [deg E]']
            file_j = dfin.loc[start_idx_j, 'File']
            # TODO: check if profiles are empty or contain only NaNs?
            # Compare the two selected profiles in time and space,
            # plus a check for duplicate IOS profiles (same cast, different instrument)
            if ios_profiles_match(file_i, file_j) or (
                    coords_are_close(lat_i, lon_i, lat_j, lon_j, 0.5)
                    and time_is_close(time_i, time_j, '1 hour')
            ):
                # Check which profile is longer and flag the shorter one as an inexact duplicate
                if len(dfin.loc[start_idx_i:end_idx_i]) >= len(dfin.loc[start_idx_j:end_idx_j]):
                    dfin.loc[start_idx_j:end_idx_j, 'Inexact_duplicate_flag'] = 1
                else:
                    dfin.loc[start_idx_i:end_idx_i, 'Inexact_duplicate_flag'] = 1

    # Print summary statistics to a text file
    summary_statistics_file = os.path.join(
        os.path.dirname(output_file_path),
        '{}_inexact_duplicate_check_summary_statistics.txt'.format(sampling_station))
    with open(summary_statistics_file, 'a') as txtfile:
        txtfile.write('Source file: {}\n'.format(input_file_path))
        txtfile.write('Output file: {}\n'.format(output_file_path))
        txtfile.write('Number of profiles in: {}\n'.format(len(profile_start_idx)))
        txtfile.write('Number of profiles out: {}\n\n'.format(
            sum(dfin.loc[profile_start_idx, 'Inexact_duplicate_flag'] == 0)))

    # Apply the inexact duplicate flag
    msk = dfin.loc[:, 'Inexact_duplicate_flag'] == 0
    dfout = dfin.loc[msk, :]

    # Remove the inexact duplicate flag column without SettingWithCopyWarning
    dfout = dfout.drop(columns='Inexact_duplicate_flag')

    # Save the dataframe to csv
    dfout.to_csv(output_file_path, index=False)
    return
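
# Direct usage sketch (paths mirror run_line_p() below; adjust to your own layout).
# The input CSV is assumed to contain the columns 'Profile number', 'Time', 'File',
# 'Latitude [deg N]', and 'Longitude [deg E]' referenced above.
#   run_check('D:\\lineP\\processing\\03_merge\\P4_data.csv',
#             'D:\\lineP\\processing\\04_inexact_duplicate_check\\P4_data.csv',
#             'P4')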


def run_cs09():
    sampling_station = 'CS09'
    # input_dir = (f'C:\\Users\\hourstonh\\Documents\\charles\\more_oxygen_projects\\'
    #              f'{sampling_station}_03_station_qc_checks\\')
    # input_file_path = os.path.join(input_dir, '{}_CTD_BOT_CHE_data.csv'.format(sampling_station))
    # output_file_path = input_file_path.replace(
    #     '03_station_qc_checks', '04_inexact_duplicate_checks'
    # )
    input_dir = (f'C:\\Users\\hourstonh\\Documents\\charles\\more_oxygen_projects\\'
                 f'{sampling_station}_02b_remove_casts_missing_o2\\')
    input_file_path = os.path.join(input_dir, '{}_CTD_BOT_CHE_data.csv'.format(sampling_station))
    output_file_path = input_file_path.replace(
        '02b_remove_casts_missing_o2', '04_inexact_duplicate_checks_skip03'
    )
    run_check(input_file_path, output_file_path, sampling_station)
    return


def run_line_p():
    for sampling_station in ['P4', 'P26']:
        # data_types = 'ctd'
        # data_types = 'CTD_BOT_CHE_OSD'
        # input_dir = 'C:\\Users\\HourstonH\\Documents\\charles\\' \
        #             'line_P_data_products\\csv\\has_osd_ctd_flags\\03_QC\\'
        # input_dir = 'C:\\Users\\HourstonH\\Documents\\charles\\' \
        #             'our_warming_ocean\\osp_sst\\csv\\03_QC\\'
        # data_file_path = 'C:\\Users\\HourstonH\\Documents\\charles\\' \
        #                  'line_P_data_products\\csv\\03_QC\\' \
        #                  '{}_{}_data.csv'.format(sampling_station, data_types)
        # input_file_path = os.path.join(
        #     input_dir, '{}_data.csv'.format(sampling_station))
        # output_file_path = 'C:\\Users\\HourstonH\\Documents\\charles\\' \
        #                    'line_P_data_products\\csv\\' \
        #                    '{}_{}_data_idc.csv'.format(sampling_station, data_types)
        # ----------------------------------------------------------------------------
        input_dir = 'D:\\lineP\\processing\\03_merge\\'
        input_file_path = os.path.join(input_dir, '{}_data.csv'.format(sampling_station))
        output_file_path = input_file_path.replace(
            '03_merge', '04_inexact_duplicate_check')

        # -------2024 update-------
        # input_dir = ('C:\\Users\\hourstonh\\Documents\\charles\\line_P_data_products\\'
        #              'update_jan2024_sopo\\csv_data\\02_QC\\')
        # input_file_path = os.path.join(input_dir, '{}_CTD_CHE_data.csv'.format(sampling_station))
        # output_file_path = input_file_path.replace('02_QC', '04_inexact_duplicate_check')

        # Iterate through all the profile numbers;
        # if there is a match in time/lat/lon, keep the profile
        # that has more measurements in it (corresponding to CTD)
        run_check(input_file_path, output_file_path, sampling_station)
    return
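

# Illustrative entry point (an assumption; the original script defines run_cs09()
# and run_line_p() but calls neither). Pick whichever station set applies:
if __name__ == '__main__':
    run_line_p()
    # run_cs09()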