GitHub Actions; Preliminary Documentation #6

Merged: 39 commits, Aug 10, 2023

Commits (the diff below shows the changes from one of the 39 commits):
All 39 commits are by RickytheGuy:

276e400  test files added (Aug 3, 2023)
1a499b0  environment (Aug 3, 2023)
1e922a6  added simple test github actions (Aug 3, 2023)
09fd515  allow nc files (Aug 3, 2023)
7bc5d94  github tests (Aug 3, 2023)
9d88212  test actions (Aug 7, 2023)
0d4e635  Merge branch 'geoglows:main' into main (Aug 7, 2023)
a54430f  manual dispatch (Aug 7, 2023)
3247edb  Merge branch 'main' of https://github.com/RickytheGuy/rapid-inflows (Aug 7, 2023)
8e10efd  updated env (Aug 7, 2023)
b641c75  fixed env file (Aug 7, 2023)
f4389d3  update (Aug 7, 2023)
db87caf  updated test script (Aug 7, 2023)
440881a  updated yml (Aug 7, 2023)
59c7cd4  updated tests (Aug 7, 2023)
c5dd869  fixed path (Aug 7, 2023)
550ff97  updates (Aug 7, 2023)
7f29baa  completed test comparison (Aug 7, 2023)
c38339b  updated path names (Aug 7, 2023)
b0fc173  formatted (Aug 7, 2023)
aa76c47  updated path (Aug 7, 2023)
767efed  Create README.md (Aug 7, 2023)
b35245f  updated readme (Aug 7, 2023)
b13efe5  removed bioconda, package versions (Aug 8, 2023)
6f0d62c  Update test_compare.py (Aug 8, 2023)
a91e397  Update README.md (Aug 8, 2023)
5327612  Update test.yml, removed bionconda (Aug 8, 2023)
af0ed5a  updated .gitignore (Aug 8, 2023)
927c487  added to readme (Aug 8, 2023)
d569360  provided functinality for multiple wts (Aug 8, 2023)
85932ff  additional tests (Aug 8, 2023)
ec9fecf  more test files (Aug 8, 2023)
c6f24d8  Merge branch 'main' of https://github.com/RickytheGuy/rapid-inflows (Aug 8, 2023)
0476ec0  renamed (Aug 8, 2023)
4a85b1c  check weight table dims vs. xr dims (Aug 8, 2023)
075c092  updated readme for weigh table (Aug 8, 2023)
7797942  removed multiple wts functionality (Aug 9, 2023)
2471ad1  removed old paths (Aug 9, 2023)
7af4d6c  removed push (Aug 9, 2023)
Commit 7797942803b9a70a10177b2126db4d4e7b9f512d: "removed multiple wts functionality" (RickytheGuy, Aug 9, 2023)

rapid_inflows/inflow_fast.py (340 changes: 165 additions, 175 deletions)
@@ -55,22 +55,13 @@ def create_inflow_file(lsm_file_list: list,
         Path to the comid_lat_lon_z.csv corresponding to the weight table
     inflow_file_path: str
         Path and name of the output netcdf
-    input_tuples: list
-        A list of tuples like the following: [(weight_1.csv, comid_1.csv, out_1.nc), (weight_2.csv, comid_2.csv, out_2.nc), ...]
     """
-    # Create the input_tuples object if not created already: this will be what is iterated
-    if input_tuples is None:
-        if weight_table is not None and comid_lat_lon_z is not None and inflow_file_path is not None:
-            input_tuples = [(weight_table, comid_lat_lon_z, inflow_file_path)]
-        else:
-            raise ValueError("Either input_tuples OR all other inputs must be defined.")
-
     # Ensure that every input file exists
-    for weight_table, comid_lat_lon_z, _ in input_tuples:
-        if not os.path.exists(weight_table):
-            raise FileNotFoundError(f'{weight_table} does not exist')
-        if not os.path.exists(comid_lat_lon_z):
-            raise FileNotFoundError(f'{comid_lat_lon_z} does not exist')
+    if not os.path.exists(weight_table):
+        raise FileNotFoundError(f'{weight_table} does not exist')
+    if not os.path.exists(comid_lat_lon_z):
+        raise FileNotFoundError(f'{comid_lat_lon_z} does not exist')

     # open all the ncs and select only the area within the weight table
     logging.info('Opening LSM files multi-file dataset')
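
For context on the interface change in this hunk: before the commit a caller could hand create_inflow_file several (weight_table, comid_lat_lon_z, inflow_file_path) tuples at once; afterwards each call handles exactly one table. A minimal sketch of the two calling styles, with illustrative paths modeled on the test fixtures and the import path assumed from the repository layout:

    import glob

    from rapid_inflows.inflow_fast import create_inflow_file  # import path assumed

    lsm_files = glob.glob('./tests/inputs/era5_721x1440_sample_data/*.nc')
    regions = [('weight_era5_721x1440_1.csv', 'comid_lat_lon_z_1.csv', 'inflow_1.nc'),
               ('weight_era5_721x1440_2.csv', 'comid_lat_lon_z_2.csv', 'inflow_2.nc')]

    # Before this commit (removed API): all regions in a single call.
    # create_inflow_file(lsm_files, input_tuples=regions)

    # After this commit: one weight table per call; loop over regions instead.
    for weight_table, comid_lat_lon_z, inflow_file_path in regions:
        create_inflow_file(lsm_files,
                           weight_table=weight_table,
                           comid_lat_lon_z=comid_lat_lon_z,
                           inflow_file_path=inflow_file_path)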
@@ -82,176 +73,175 @@ def create_inflow_file(lsm_file_list: list,
     lat_variable = [x for x in ['lat', 'latitude', 'LATITUDE', 'LAT'] if x in lsm_dataset.variables][0]

     # Check that the input table dimensions match the dataset dimensions
-    # We ignore time dimension
+    # This gets us the shape, while ignoring the time dimension
     variable_dims = lsm_dataset[runoff_variable].dims
     dataset_shape = [lsm_dataset[runoff_variable].shape[variable_dims.index(lat_variable)],
                      lsm_dataset[runoff_variable].shape[variable_dims.index(lon_variable)]]

-    for weight_table, _, _ in input_tuples:
-        matches = re.findall(r'(\d+)x(\d+)', weight_table)[0]
-        if len(matches) == 2:
-            if all(int(item) in dataset_shape for item in matches):
-                continue
-            else:
-                raise ValueError(f"{weight_table} dimensions don't match the input dataset shape: {dataset_shape}")
-        else:
-            raise ValueError(f"Could not find a shape (###x####) in {weight_table}. Consider renaming")
+    matches = re.findall(r'(\d+)x(\d+)', weight_table)[0]
+    if len(matches) == 2:
+        if all(int(item) in dataset_shape for item in matches):
+            pass
+        else:
+            raise ValueError(f"{weight_table} dimensions don't match the input dataset shape: {dataset_shape}")
+    else:
+        raise ValueError(f"Could not find a shape (###x####) in {weight_table}. Consider renaming")
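
The check above assumes the weight table filename encodes the LSM grid shape, e.g. weight_era5_721x1440.csv. A standalone sketch of the same pattern; the filename and shape are made up, and a guard is added because findall(...)[0] raises IndexError when no '###x###' token is present:

    import re

    dataset_shape = [721, 1440]  # (lat, lon) sizes read from the LSM dataset
    weight_table = 'weight_era5_721x1440.csv'  # hypothetical filename

    # findall returns a list of (rows, cols) string tuples, one per '###x###' token
    matches = re.findall(r'(\d+)x(\d+)', weight_table)
    if not matches:
        raise ValueError(f"Could not find a shape (###x####) in {weight_table}. Consider renaming")
    if not all(int(item) in dataset_shape for item in matches[0]):
        raise ValueError(f"{weight_table} dimensions don't match the input dataset shape: {dataset_shape}")
    print('weight table matches dataset shape')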

-    # Iterate over each weight table
-    for weight_table, comid_lat_lon_z, inflow_file_path in input_tuples:

The rest of the hunk dedents the former loop body by one indentation level. After the commit the body reads:

    # load in weight table and get some information
    logging.info('Reading weight table and comid_lat_lon_z csvs')
    weight_df = pd.read_csv(weight_table)
    comid_df = pd.read_csv(comid_lat_lon_z)

    sorted_rivid_array = comid_df.iloc[:, 0].to_numpy()

    min_lon = weight_df['lon'].min()
    max_lon = weight_df['lon'].max()
    min_lat = weight_df['lat'].min()
    max_lat = weight_df['lat'].max()

    min_lon_idx = weight_df.loc[weight_df['lon'] == min_lon, 'lon_index'].values[0].astype(int)
    max_lon_idx = weight_df.loc[weight_df['lon'] == max_lon, 'lon_index'].values[0].astype(int)
    min_lat_idx = weight_df.loc[weight_df['lat'] == min_lat, 'lat_index'].values[0].astype(int)
    max_lat_idx = weight_df.loc[weight_df['lat'] == max_lat, 'lat_index'].values[0].astype(int)

    if min_lon_idx > max_lon_idx:
        min_lon_idx, max_lon_idx = max_lon_idx, min_lon_idx
    if min_lat_idx > max_lat_idx:
        min_lat_idx, max_lat_idx = max_lat_idx, min_lat_idx

    # for readability, select certain cols from the weight table
    n_wt_rows = weight_df.shape[0]
    stream_ids = weight_df.iloc[:, 0].to_numpy()
    lat_indices = weight_df['lat_index'].values - min_lat_idx
    lon_indices = weight_df['lon_index'].values - min_lon_idx

    spatial_slices = {lon_variable: slice(min_lon_idx, max_lon_idx + 1),
                      lat_variable: slice(min_lat_idx, max_lat_idx + 1)}

    ds = (
        lsm_dataset
        .isel(**spatial_slices)
        [runoff_variable]
    )

    # Get approximate sizes of arrays and check if we have enough memory
    logging.info('Checking anticipated memory requirement')
    out_array_size = ds['time'].shape[0] * sorted_rivid_array.shape[0]
    in_array_size = ds['time'].shape[0] * n_wt_rows
    if ds.ndim == 4:
        in_array_size *= 2
    total_size = out_array_size + in_array_size
    _memory_check(total_size)

    # Get conversion factor
    logging.info('Getting conversion factor')
    conversion_factor = 1
    units = ds.attrs.get('units', False)
    if not units:
        logging.warning("No units attribute found. Assuming meters")
    elif ds.attrs['units'] == 'm':
        conversion_factor = 1
    elif ds.attrs['units'] == 'mm':
        conversion_factor = .001
    else:
        raise ValueError(f"Unknown units: {ds.attrs['units']}")

    # get the time array from the dataset
    logging.info('Reading Time values')
    datetime_array = ds['time'].to_numpy()

    logging.info('Creating inflow array')
    # todo: more checks on names and order of dimensions
    if ds.ndim == 3:
        inflow_array = ds.values[:, lat_indices, lon_indices]
    elif ds.ndim == 4:
        inflow_array = ds.values[:, :, lat_indices, lon_indices]
        inflow_array = np.where(np.isnan(inflow_array[:, 0, :]), inflow_array[:, 1, :], inflow_array[:, 0, :])
    else:
        raise ValueError(f"Unknown number of dimensions: {ds.ndim}")
    inflow_array = np.nan_to_num(inflow_array, nan=0)
    inflow_array[inflow_array < 0] = 0
    inflow_array = inflow_array * weight_df['area_sqm'].values * conversion_factor
    inflow_array = pd.DataFrame(inflow_array, columns=stream_ids)
    inflow_array = inflow_array.groupby(by=stream_ids, axis=1).sum()
    inflow_array = inflow_array[sorted_rivid_array].to_numpy()

    ds.close()

    # Create output inflow netcdf data
    logging.info("Writing inflows to file")
    os.makedirs(os.path.dirname(inflow_file_path), exist_ok=True)
    with nc.Dataset(inflow_file_path, "w", format="NETCDF3_CLASSIC") as inflow_nc:
        # create dimensions
        inflow_nc.createDimension('time', datetime_array.shape[0])
        inflow_nc.createDimension('rivid', sorted_rivid_array.shape[0])
        inflow_nc.createDimension('nv', 2)

        # m3_riv
        m3_riv_var = inflow_nc.createVariable('m3_riv', 'f4', ('time', 'rivid'), fill_value=0)
        m3_riv_var[:] = inflow_array
        m3_riv_var.long_name = 'accumulated inflow volume in river reach boundaries'
        m3_riv_var.units = 'm3'
        m3_riv_var.coordinates = 'lon lat'
        m3_riv_var.grid_mapping = 'crs'
        m3_riv_var.cell_methods = "time: sum"

        # rivid
        rivid_var = inflow_nc.createVariable('rivid', 'i4', ('rivid',))
        rivid_var[:] = sorted_rivid_array
        rivid_var.long_name = 'unique identifier for each river reach'
        rivid_var.units = '1'
        rivid_var.cf_role = 'timeseries_id'

        # time
        reference_time = datetime_array[0]
        time_step = (datetime_array[1] - datetime_array[0]).astype('timedelta64[s]')
        time_var = inflow_nc.createVariable('time', 'i4', ('time',))
        time_var[:] = (datetime_array - reference_time).astype('timedelta64[s]').astype(int)
        time_var.long_name = 'time'
        time_var.standard_name = 'time'
        time_var.units = f'seconds since {reference_time.astype("datetime64[s]")}'  # Must be seconds
        time_var.axis = 'T'
        time_var.calendar = 'gregorian'
        time_var.bounds = 'time_bnds'

        # time_bnds
        time_bnds = inflow_nc.createVariable('time_bnds', 'i4', ('time', 'nv',))
        time_bnds_array = np.stack([datetime_array, datetime_array + time_step], axis=1)
        time_bnds_array = (time_bnds_array - reference_time).astype('timedelta64[s]').astype(int)
        time_bnds[:] = time_bnds_array

        # longitude
        lon_var = inflow_nc.createVariable('lon', 'f8', ('rivid',), fill_value=-9999.0)
        lon_var[:] = comid_df['lon'].values
        lon_var.long_name = 'longitude of a point related to each river reach'
        lon_var.standard_name = 'longitude'
        lon_var.units = 'degrees_east'
        lon_var.axis = 'X'

        # latitude
        lat_var = inflow_nc.createVariable('lat', 'f8', ('rivid',), fill_value=-9999.0)
        lat_var[:] = comid_df['lat'].values
        lat_var.long_name = 'latitude of a point related to each river reach'
        lat_var.standard_name = 'latitude'
        lat_var.units = 'degrees_north'
        lat_var.axis = 'Y'

        # crs
        crs_var = inflow_nc.createVariable('crs', 'i4')
        crs_var.grid_mapping_name = 'latitude_longitude'
        crs_var.epsg_code = 'EPSG:4326'  # WGS 84
        crs_var.semi_major_axis = 6378137.0
        crs_var.inverse_flattening = 298.257223563

        # add global attributes
        inflow_nc.Conventions = 'CF-1.6'
        inflow_nc.history = 'date_created: {0}'.format(datetime.utcnow())
        inflow_nc.featureType = 'timeSeries'
        inflow_nc.geospatial_lat_min = min_lat
        inflow_nc.geospatial_lat_max = max_lat
        inflow_nc.geospatial_lon_min = min_lon
        inflow_nc.geospatial_lon_max = max_lon

    lsm_dataset.close()
    return
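
The core of the function is the weighted aggregation a few lines above: each weight-table row maps one LSM grid cell to a river reach, and runoff depth times cell area is summed per reach. A self-contained sketch with made-up numbers; the column names follow the weight-table schema used above:

    import numpy as np
    import pandas as pd

    # Two timesteps of runoff depth (m) at three grid cells; the first two
    # cells drain to reach 101, the third to reach 202. Values are made up.
    runoff = np.array([[0.001, 0.002, 0.004],
                       [0.000, 0.003, 0.001]])
    stream_ids = np.array([101, 101, 202])   # reach id per weight-table row
    areas = np.array([1.0e6, 2.0e6, 5.0e5])  # 'area_sqm' per weight-table row

    volumes = runoff * areas  # depth (m) * area (m^2) -> volume (m^3) per cell
    df = pd.DataFrame(volumes, columns=stream_ids)
    # groupby(axis=1) mirrors the code above; newer pandas prefers df.T.groupby(...).sum().T
    m3_riv = df.groupby(by=stream_ids, axis=1).sum()
    print(m3_riv)
    #       101     202
    # 0  5000.0  2000.0
    # 1  6000.0   500.0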
tests/test_compare.py (11 changes: 2 additions, 9 deletions)
@@ -14,7 +14,7 @@
 import netCDF4 as nc
 import sys
 import os
-
+import time
 # Add the project_root directory to the Python path
 project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
 sys.path.append(project_root)
@@ -63,16 +63,9 @@ def check_function(validation_ds, output_ds, test):
 out_ds = nc.Dataset('./tests/test.nc', 'r')
 val_ds = nc.Dataset('tests/validation/1980_01_01to10_last10.nc', 'r')

-# TEST 2: Multiple weight tables
-input_tuples = [('./tests/inputs/test_2/region1/weight_era5_721x1440_split_1.csv', './tests/inputs/test_2/region1/comid_lat_lon_z_1.csv', './tests/test_1.nc'),
-                ('./tests/inputs/test_2/region2/weight_era5_721x1440_split_2.csv', './tests/inputs/test_2/region2/comid_lat_lon_z_2.csv', './tests/test_2.nc')]
-create_inflow_file(glob.glob('./tests/inputs/era5_721x1440_sample_data/*.nc'), input_tuples=input_tuples)
-
-out_ds_1 = nc.Dataset('./tests/test_1.nc', 'r')
-validation_ds_1 = nc.Dataset('./tests/validation/1980_01_01to10_split_1.nc', 'r')
-out_ds_2 = nc.Dataset('./tests/test_2.nc', 'r')
-validation_ds_2 = nc.Dataset('./tests/validation/1980_01_01to10_split_2.nc', 'r')
-
-check_function(val_ds, out_ds, 'TEST 1: Normal inputs')
-check_function(validation_ds_1, out_ds_1, 'TEST 2.0: Multiple weight tables')
-check_function(validation_ds_2, out_ds_2, 'TEST 2.1: Multiple weight tables')
+check_function(val_ds, out_ds, 'TEST 1: Normal inputs')
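
check_function itself is defined near the top of test_compare.py and is not shown in this hunk. A minimal sketch of what such a netCDF comparison helper can look like; the variable names follow the inflow file schema above, but the tolerances and exact implementation are assumptions, not the repository's code:

    import numpy as np
    import netCDF4 as nc

    def check_function(validation_ds: nc.Dataset, output_ds: nc.Dataset, test: str) -> None:
        # Compare the key variables of the validation and output inflow files.
        for var in ('m3_riv', 'rivid', 'time'):
            expected = np.asarray(validation_ds[var][:])
            actual = np.asarray(output_ds[var][:])
            assert expected.shape == actual.shape, f'{test}: shape mismatch on {var!r}'
            assert np.allclose(expected, actual, rtol=1e-5, atol=1e-8), \
                f'{test}: values differ on {var!r}'
        print(f'{test}: passed')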