-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathuseful_functions.py
152 lines (133 loc) · 5.88 KB
/
useful_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/env python3
# Library of useful functions
# Authors: Denisse Fierro Arcos and Tormey Reimer
# Date last update: 2024-09-06
# Loading libraries
import xarray as xr
import zarr
import pandas as pd
import datetime as dt
import os
import geopandas as gpd
import rioxarray
import numpy as np
# Converting downloaded WOA data to useable format ----
# Consolidates netCDF files downloaded from WOA converts them into
# cloud-friendly zarr files.
# Defining function to save netCDF files to zarr files
def netcdf_to_zarr(list_files, var_name, file_out):
'''
Inputs:
list_files (character list) Full paths to WOA files that need to be
merged and save as a single zarr collection. All files must contain
the same variable of interest.
var_name (character) Name of variable to be saved as zarr file.
file_out (character) Full file path where zarr file will be saved.
Outputs:
None. This function saves a zarr file to disk.
'''
da = xr.open_mfdataset(list_files, decode_times = False)[var_name]
# Fix time coordinate variable - Keep name of month only
if len(da.time) == 12:
units, reference_date = da.time.attrs['units'].split('since')
da['time'] = pd.date_range(start = reference_date,
periods = da.sizes['time'],
freq = 'MS').strftime('%B')
da = da.rename({'time': 'month'})
#Rechunk data
da_rechunk = da.chunk({'month': 12, 'depth': 57, 'lat': 120,
'lon': 240})
elif len(da.time) == 1:
#Removing time dimension because it is not needed
da = da.isel(time = 0)
da = da.reset_coords('time', drop = True)
#Rechunk data
da_rechunk = da.chunk({'depth': 57, 'lat': 120, 'lon': 240})
#Save data, but ensure directory exists
out_dir = os.path.dirname(file_out)
if not os.path.exists(out_dir):
os.makedirs(out_dir)
da_rechunk.to_zarr(file_out, consolidated = True, mode = 'w')
#Defining functions
def mask_boolean_ard_data(file_path, boolean_mask_ras):
'''
Open netCDF files in analysis ready data (ARD) format. That is apply chunks
that make data analysis easier.
Inputs:
file_path (character): Full file path where netCDF file is stored.
boolean_mask_ras (boolean data array): Data array to be used as initial mask
to decrease the size of the original dataset. This mask makes no distinction
between regional models, it simply identifies grid cells within regional
model boundaries with the value of 1.
Outputs:
da (data array): ARD data array containing data only for grid cells within
regional model boundaries.
'''
#Getting chunks from gridded mask to apply it to model data array
[lat_chunk] = np.unique(boolean_mask_ras.chunksizes['lat'])
[lon_chunk] = np.unique(boolean_mask_ras.chunksizes['lon'])
#Open data array
da = xr.open_zarr(file_path)
[var] = list(da.keys())
da = da[var]
#Ensure chunks are the same as mask
if 'month' in da.dims:
da = da.chunk({'month': 12, 'depth': 57, 'lat': lat_chunk, 'lon': lon_chunk})
#Make sure months are in correct format
da['month'] = pd.date_range(start = '2010-01-01', periods = 12,
freq = 'MS').strftime('%B')
else:
da = da.chunk({'depth': 57, 'lat': lat_chunk, 'lon': lon_chunk})
#Apply mask for all regions to decrease dataset size
da = da.where(boolean_mask_ras == 1)
#Add spatial information to dataset
da.rio.set_spatial_dims(x_dim = 'lon', y_dim = 'lat', inplace = True)
da.rio.write_crs('epsg:4326', inplace = True)
return da
def mask_ard_data(ard_da, shp_mask, file_out):
'''
Open netCDF files in analysis ready data (ARD) format. That is apply chunks
that make data analysis easier.
Inputs:
ard_da (data array): Analysis ready data (ARD) data array produced by the
function "open_ard_data"
shp_mask (shapefile): Shapefile containing the boundaries of regional models
file_out (character): Full file path where masked data should be stored.
Outputs:
No data is returned, but masked file will be stored in specified file path.
'''
#Clip data using regional shapefile
da_mask = ard_da.rio.clip(shp_mask.geometry, shp_mask.crs, drop = True,
all_touched = True)
#Remove spatial information
if 'spatial_ref' in da_mask.coords:
da_mask = da_mask.drop_vars('spatial_ref')
elif 'crs' in da_mask.coords:
da_mask = da_mask.drop_vars('crs')
da_mask.encoding = {}
#Check file extension included in path to save data
if file_out.endswith('zarr'):
for i, c in enumerate(da_mask.chunks):
if len(c) > 1 and len(set(c)) > 1:
print(f'Rechunking {file_out}.')
print(f'Dimension "{da_mask.dims[i]}" has unequal chunks.')
da_mask = da_mask.chunk({da_mask.dims[i]: '200MB'})
da_mask.to_zarr(file_out, consolidated = True, mode = 'w')
if file_out.endswith('parquet'):
#Keep data array attributes to be recorded in final data frame
da_attrs = ard_da.attrs
da_attrs = pd.DataFrame([da_attrs])
if 'month' in ard_da.dims:
ind_wider = ['lat', 'lon', 'month', 'depth', 'vals']
else:
ind_wider = ['lat', 'lon', 'depth', 'vals']
#Turn extracted data into data frame and remove rows with NA values
df = da_mask.to_series().to_frame().reset_index().dropna()
#Changing column name to standardise across variables
df = df.rename(columns = {ard_da.name: 'vals'}).reset_index(drop = True)
#Reorganise data
df = df[ind_wider]
#Include original dataset attributes
df = pd.concat([df, da_attrs], axis = 1)
#Saving data frame
df.to_parquet(file_out)