update read-in module for ATL11 #398

Merged: 25 commits, Feb 17, 2023

Commits
1e20ed6  remove sc_orient from atl11 required vars list (JessicaS11, Nov 22, 2022)
a452ef9  introduce new function to determine if data product uses ground track… (JessicaS11, Nov 22, 2022)
8524148  add some custom treatments for 2d delta_times and cases when there is… (JessicaS11, Nov 28, 2022)
941f69a  add atl11 path parsing and docs (JessicaS11, Nov 29, 2022)
9cbfd57  handle merging given non-unique ref_pt coordinates (JessicaS11, Dec 2, 2022)
33e319d  add var to coordinate conversion for cycles prior to merge (JessicaS11, Jan 9, 2023)
76576ae  hopefully get ATL11 read in working (JessicaS11, Jan 18, 2023)
31594a9  Merge branch 'development' into atl11 (JessicaS11, Jan 18, 2023)
79c4b7c  Merge branch 'development' into atl11 (JessicaS11, Jan 27, 2023)
4e5f533  add test for new track type function (JessicaS11, Feb 6, 2023)
4a02879  add note about function that needs testing (JessicaS11, Feb 6, 2023)
9226737  clean up and apply some of review suggestions (JessicaS11, Feb 6, 2023)
35088ba  Merge branch 'development' into atl11 (JessicaS11, Feb 8, 2023)
191851f  Apply suggestions from code review (JessicaS11, Feb 9, 2023)
0a31021  change path to pair_track (JessicaS11, Feb 9, 2023)
4bb4cf9  specify typeerror in try/except (JessicaS11, Feb 9, 2023)
5b2da3a  minor debugging and PR updates (JessicaS11, Feb 9, 2023)
1b26978  remove comment (JessicaS11, Feb 9, 2023)
8568fae  update failing test to match updated var name (JessicaS11, Feb 9, 2023)
45540ce  add note for creating a test and return atl11 granule names correctly… (JessicaS11, Feb 14, 2023)
03befa0  Merge branch 'development' into atl11 (JessicaS11, Feb 15, 2023)
dfaf2e9  address pr review: gt to pt, cycle_number dtype, atl11 filename pattern (JessicaS11, Feb 16, 2023)
775dba1  Update icepyx/core/read.py (JessicaS11, Feb 17, 2023)
2b7a348  update test for updated function (JessicaS11, Feb 17, 2023)
bf6e632  update test for updated function (JessicaS11, Feb 17, 2023)
Files changed
icepyx/core/read.py (165 additions, 16 deletions)

@@ -54,6 +54,48 @@ def _make_np_datetime(df, keyword):
    return df


+# TODO: add tests, round out docs, and test for atl09 and atl06, for this new function!!
+def _get_track_type_str(grp_path):
+    """
+    Determine whether the product contains ground tracks, paths, or profiles and
+    parse the string/label the dimension accordingly.
+
+    Parameters
+    ----------
+    grp_path : str
+        The group path for the ground track, path, or profile.
+
+    Returns
+    -------
+    track_str : str
+        The string for the ground track, path, or profile of this group
+    spot_dim_name : str
+        What the dimension should be named in the dataset
+    """
+
+    import re
+
+    # TODO: This won't work for profile (e.g. atmos) data --> needs to be generalized!
+    if re.match(r"gt[1-3]['r','l']", grp_path):
+        track_str = re.match(r"gt[1-3]['r','l']", grp_path).group()
+        # spot = is2ref.gt2spot(track_str, is2ds.sc_orient.values[0])
+        # FIX THIS (line above)!!
+        spot_dim_name = "spot"
+        # add a test for the gt2spot function (called here)!
+
+    elif re.match(r"profile_[1-3]", grp_path):
+        track_str = re.match(r"profile_[1-3]", grp_path).group()
+        spot = int(track_str[-1])
+        spot_dim_name = "profile"
+
+    elif re.match(r"pt[1-3]", grp_path):
+        track_str = re.match(r"pt[1-3]", grp_path).group()
+        spot = int(track_str[-1])
+        spot_dim_name = "path"
+
+    return track_str, spot_dim_name


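A note on the regexes in this new function: ['r','l'] is a regex character class, so it matches any single one of the characters ', r, comma, or l rather than the alternation it suggests; gt[1-3][rl] is a stricter equivalent. Also, spot = int(track_str[-1]) in the profile and path branches is assigned but never returned. A minimal self-contained sketch of the same dispatch logic (hypothetical helper name and group paths, not the PR's code):

import re

def track_type(grp_path):
    # Try each track naming scheme in turn; the first match wins.
    for pattern, dim in (
        (r"gt[1-3][rl]", "spot"),       # beam groups, e.g. ATL03/ATL06
        (r"profile_[1-3]", "profile"),  # ATL09 profiles
        (r"pt[1-3]", "path"),           # ATL11 pair tracks
    ):
        match = re.match(pattern, grp_path)
        if match:
            return match.group(), dim
    raise ValueError(f"unrecognized group path: {grp_path}")

assert track_type("gt2r/land_ice_segments") == ("gt2r", "spot")
assert track_type("pt3/cycle_stats") == ("pt3", "path")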
# Dev note: function fully tested (except else, which don't know how to get to)
def _check_datasource(filepath):
    """
@@ -395,35 +437,65 @@ def _add_vars_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict):
                except NameError:
                    import random

-                   is2ds["gran_idx"] = [random.randint(900000, 999999)]
+                   is2ds["gran_idx"] = [random.randint(800000, 899998)]
                    warnings.warn("Your granule index is made up of random values.")
            # You must include the orbit/cycle_number and orbit/rgt variables to generate
            except KeyError:
-               pass
+               # Added this when dealing with ATL11 - need to see if it breaks with other datasets
+               is2ds["gran_idx"] = [np.nanmax(is2ds["gran_idx"]) - 1]
+               # pass

            if hasattr(is2ds, "data_start_utc"):
                is2ds = _make_np_datetime(is2ds, "data_start_utc")
                is2ds = _make_np_datetime(is2ds, "data_end_utc")

        else:
-           import re
+           track_str, spot_dim_name = _get_track_type_str(grp_path)

-           gt_str = re.match(r"gt[1-3]['r','l']", grp_path).group()
-           spot = is2ref.gt2spot(gt_str, is2ds.sc_orient.values[0])
-           # add a test for the new function (called here)!
+           # get the spot number if relevant
+           if spot_dim_name == "spot":
+               spot = is2ref.gt2spot(track_str, is2ds.sc_orient.values[0])
+           else:
+               spot = track_str

            grp_spec_vars = [
                k
                for k, v in wanted_dict.items()
                if any(f"{grp_path}/{k}" in x for x in v)
            ]

+           # NEXT TODO: handle the case where it's the second time through a 2d delta_time...
+
+           # handle delta_times with 1 or more dimensions
+           idx_range = range(0, len(ds.delta_time.data))
+           # if hasattr(is2ds, "photon_idx"):
+
+           #     # if is2ds already has a 2d photon idx/delta time AND the current delta time does too
+           #     if np.ndim(ds.delta_time.data) > 1:  # and np.ndim(is2ds.photon_idx) > 1:
+           #         # repeat the range the number of times needed, then transpose to match the shape of the existing photon_idx
+           #         photon_ids = (
+           #             np.broadcast_to([*idx_range], (np.shape(ds.delta_time)[1], np.shape(ds.delta_time)[0])).transpose()
+           #             + np.full_like(
+           #                 ds.delta_time, np.max(is2ds.photon_idx), dtype="int64"
+           #             )
+           #             + 1
+           #         )
+           #     # the original case, where delta_time is 2d but the existing photon_idx is 1d
+           #     else:
+           #         photon_ids = (
+           #             range(0, len(ds.delta_time.data))
+           #             + np.full_like(
+           #                 ds.delta_time, np.max(is2ds.photon_idx), dtype="int64"
+           #             )
+           #             + 1
+           #         )
+           # else:
+           #     photon_ids = range(0, len(ds.delta_time.data))

            try:
                photon_ids = (
-                   range(0, len(ds.delta_time.data))
-                   + np.full_like(
-                       ds.delta_time, np.max(is2ds.photon_idx), dtype="int64"
-                   )
+                   idx_range
+                   + np.full_like(idx_range, np.max(is2ds.photon_idx), dtype="int64")
                    + 1
                )
            except AttributeError:
@@ -432,16 +504,52 @@ def _add_vars_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict):
            hold_delta_times = ds.delta_time.data
            ds = (
                ds.reset_coords(drop=False)
-               .expand_dims(dim=["spot", "gran_idx"])
+               .expand_dims(dim=[spot_dim_name, "gran_idx"])
                .assign_coords(
-                   spot=("spot", [spot]), delta_time=("delta_time", photon_ids)
+                   {
+                       spot_dim_name: (spot_dim_name, [spot]),
+                       "delta_time": ("delta_time", photon_ids),
+                   }
                )
-               .assign(gt=(("gran_idx", "spot"), [[gt_str]]))
+               .assign(gt=(("gran_idx", spot_dim_name), [[track_str]]))
                .rename_dims({"delta_time": "photon_idx"})
                .rename({"delta_time": "photon_idx"})
-               .assign_coords(delta_time=("photon_idx", hold_delta_times))
                # .set_index("photon_idx")
            )

+           # handle cases where the delta time is 2d due to multiple cycles in that group
+           if spot_dim_name == "path" and np.ndim(hold_delta_times) > 1:
+               ds = ds.assign_coords(
+                   {"delta_time": (("photon_idx", "cycle_number"), hold_delta_times)}
+               )
+           else:
+               ds = ds.assign_coords({"delta_time": ("photon_idx", hold_delta_times)})
+
+           # for ATL11
+           if "ref_pt" in ds.coords:
+               ds = (
+                   ds.drop_indexes(["ref_pt", "photon_idx"])
+                   .drop(["ref_pt", "photon_idx"])
+                   .swap_dims({"ref_pt": "photon_idx"})
+                   .assign_coords(
+                       ref_pt=("photon_idx", ds.ref_pt.data),
+                       photon_idx=ds.photon_idx.data,
+                   )
+               )
+
+           # for the subgroups where there is 1d delta time data, make sure that the cycle number is still a coordinate for merging
+           try:
+               ds = ds.assign_coords(
+                   {
+                       "cycle_number": (
+                           "photon_idx",
+                           ds.cycle_number["photon_idx"].data,
+                       )
+                   }
+               )
+           except KeyError:
+               pass
+
            grp_spec_vars.extend(["gt", "photon_idx"])

            is2ds = is2ds.merge(
@@ -450,7 +558,10 @@ def _add_vars_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict):

        # re-cast some dtypes to make array smaller
        is2ds["gt"] = is2ds.gt.astype(str)
-       is2ds["spot"] = is2ds.spot.astype(np.uint8)
+       try:
+           is2ds[spot_dim_name] = is2ds[spot_dim_name].astype(np.uint8)
+       except:
+           pass

        return is2ds, ds[grp_spec_vars]

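The hunk above does two things for ATL11 that a toy sketch can make concrete: segment numbering continues after the maximum photon_idx already in the merged dataset, and a two-dimensional delta_time (reference point x cycle) is attached as a coordinate rather than flattened. All shapes and values below are made up for illustration:

import numpy as np
import xarray as xr

# Continue numbering after the existing maximum, mirroring
# idx_range + np.full_like(idx_range, np.max(is2ds.photon_idx), dtype="int64") + 1
existing_max = 41
idx_range = np.arange(4)
photon_ids = idx_range + np.full_like(idx_range, existing_max, dtype="int64") + 1
print(photon_ids)  # [42 43 44 45]

# Attach delta_time as a 2-D coordinate over (photon_idx, cycle_number),
# as in the spot_dim_name == "path" branch above.
ds = xr.Dataset(
    {"h_corr": (("photon_idx", "cycle_number"), np.zeros((4, 3)))},
    coords={"photon_idx": photon_ids, "cycle_number": [1, 2, 3]},
)
delta_time_2d = np.arange(12, dtype="float64").reshape(4, 3)
ds = ds.assign_coords(delta_time=(("photon_idx", "cycle_number"), delta_time_2d))
print(ds.delta_time.dims)  # ('photon_idx', 'cycle_number')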
@@ -642,8 +753,10 @@ def _build_single_file_dataset(self, file, groups_list):
        # with h5py.File(filepath,'r') as h5pt:
        #     prod_id = h5pt.attrs["identifier_product_type"]

-       # DEVNOTE: does not actually apply wanted variable list, and has not been tested for merging multiple files into one ds
+       # DEVNOTE: if and elif does not actually apply wanted variable list, and has not been tested for merging multiple files into one ds
        # if a gridded product
+       # TODO: all products need to be tested, and quicklook products added or explicitly excluded
+       # Level 3b, gridded (netcdf): ATL14, 15, 16, 17, 18, 19, 20, 21
        if self._prod in [
            "ATL14",
            "ATL15",
@@ -656,6 +769,39 @@ def _build_single_file_dataset(self, file, groups_list):
        ]:
            is2ds = xr.open_dataset(file)

+       # Level 3b, hdf5: ATL11
+       elif self._prod in ["ATL11"]:
+           is2ds = self._build_dataset_template(file)
+
+           # returns the wanted groups as a single list of full group path strings
+           wanted_dict, wanted_groups = Variables.parse_var_list(
+               groups_list, tiered=False
+           )
+           wanted_groups_set = set(wanted_groups)
+
+           # orbit_info is used automatically as the first group path so the info is available for the rest of the groups
+           # wanted_groups_set.remove("orbit_info")
+           wanted_groups_set.remove("ancillary_data")
+           # Note: the sorting is critical for datasets with highly nested groups
+           wanted_groups_list = ["ancillary_data"] + sorted(wanted_groups_set)
+
+           # returns the wanted groups as a list of lists with group path string elements separated
+           _, wanted_groups_tiered = Variables.parse_var_list(
+               groups_list, tiered=True, tiered_vars=True
+           )
+
+           while wanted_groups_list:
+               # print(wanted_groups_list)
+               grp_path = wanted_groups_list[0]
+               wanted_groups_list = wanted_groups_list[1:]
+               ds = self._read_single_grp(file, grp_path)
+               is2ds, ds = Read._add_vars_to_ds(
+                   is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict
+               )
Comment on lines +772 to +779

[Member] Won't this while loop code break on L796 once the wanted_groups_list variable is assigned a value? Or does it run to the end of the while-block first before breaking? Maybe I'm confusing Python with some other programming language...

[Member Author] No, because it will either get to this version of the while loop from the elif on L743 (which ends with a return) or the L796 version of the while loop via the else on L775.
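For readers with the same question: a while condition is only re-evaluated at the top of each iteration, so reassigning wanted_groups_list inside the body never exits the loop mid-iteration; the loop ends once the list is empty. A standalone demonstration with made-up group paths:

wanted_groups_list = ["ancillary_data", "pt1/cycle_stats", "pt1/ref_surf"]
while wanted_groups_list:
    grp_path = wanted_groups_list[0]
    wanted_groups_list = wanted_groups_list[1:]  # shrinks the list; the loop keeps going
    print(grp_path)
# prints all three paths, then exits once the list is empty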


+           return is2ds

+       # Level 2 and 3a Products: ATL03, 06, 07, 08, 09, 10, 12, 13
        else:
            is2ds = self._build_dataset_template(file)

@@ -677,6 +823,7 @@ def _build_single_file_dataset(self, file, groups_list):
            )

            while wanted_groups_list:
+               # print(wanted_groups_list)
                grp_path = wanted_groups_list[0]
                wanted_groups_list = wanted_groups_list[1:]
                ds = self._read_single_grp(file, grp_path)
@@ -685,7 +832,9 @@ def _build_single_file_dataset(self, file, groups_list):
                )

                # if there are any deeper nested variables, get those so they have actual coordinates and add them
+               # this may apply to (at a minimum): ATL08
                if any(grp_path in grp_path2 for grp_path2 in wanted_groups_list):
+                   print("nested var")
                    for grp_path2 in wanted_groups_list:
                        if grp_path in grp_path2:
                            sub_ds = self._read_single_grp(file, grp_path2)
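The nested-variable pass at the end of the hunk above hinges on a plain substring test between group paths. In isolation (hypothetical ATL08-style paths, per the diff's note):

# Any remaining wanted path that contains the current group path is treated
# as a deeper nested group and is read in a second pass.
grp_path = "gt1l/land_segments"
wanted_groups_list = ["gt1l/land_segments/canopy", "gt2r/land_segments"]
nested = [g for g in wanted_groups_list if grp_path in g]
print(nested)  # ['gt1l/land_segments/canopy']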
icepyx/core/variables.py (8 additions, 0 deletions)

@@ -289,6 +289,8 @@ def _check_valid_lists(
        # check if the list of beams, if specified, are available in the product
        if self.product == "ATL09":
            beam_avail = ["profile_" + str(i + 1) for i in range(3)]
+       elif self.product == "ATL11":
+           beam_avail = ["pt" + str(i + 1) for i in range(3)]
        else:
            beam_avail = ["gt" + str(i + 1) + "l" for i in range(3)]
            beam_avail = beam_avail + ["gt" + str(i + 1) + "r" for i in range(3)]
@@ -403,6 +405,7 @@ def append(self, defaults=False, var_list=None, beam_list=None, keyword_list=None):
        beam_list : list of strings, default None
            A list of beam strings, if only selected beams are wanted (the default value of None will automatically
            include all beams). For ATL09, acceptable values are ['profile_1', 'profile_2', 'profile_3'].
+           For ATL11, acceptable values are ['pt1','pt2','pt3'].
            For all other products, acceptable values are ['gt1l', 'gt1r', 'gt2l', 'gt2r', 'gt3l', 'gt3r'].

        keyword_list : list of strings, default None
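Based on the docstring above, a hedged usage sketch for requesting ATL11 variables on specific pair tracks; the bounding box, dates, and the variable name h_corr are illustrative, not from this PR:

import icepyx as ipx

# Assumed setup: an ATL11 query over an arbitrary region and date range.
region = ipx.Query("ATL11", [-55, 68, -48, 71], ["2019-02-20", "2019-02-28"])
# Restrict the wanted-variables list to pair tracks 1 and 3.
region.order_vars.append(beam_list=["pt1", "pt3"], var_list=["h_corr"])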
@@ -479,6 +482,10 @@ def append(self, defaults=False, var_list=None, beam_list=None, keyword_list=None):
"data_end_utc",
]

# Adjust the nec_varlist for individual products
if self.product == "ATL11":
nec_varlist.remove("sc_orient")

try:
self._check_valid_lists(vgrp, allpaths, var_list=nec_varlist)
except ValueError:
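In isolation, the product-specific adjustment above looks like the sketch below (abbreviated list; sc_orient is only needed to map ground-track beams to spots, which does not apply to ATL11's pair tracks):

# Abbreviated stand-in for the full required-variables list defined above.
nec_varlist = ["sc_orient", "cycle_number", "rgt", "data_start_utc", "data_end_utc"]
product = "ATL11"
if product == "ATL11":
    nec_varlist.remove("sc_orient")
print(nec_varlist)  # ['cycle_number', 'rgt', 'data_start_utc', 'data_end_utc']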
@@ -533,6 +540,7 @@ def remove(self, all=False, var_list=None, beam_list=None, keyword_list=None):
        beam_list : list of strings, default None
            A list of beam strings, if only selected beams are wanted (the default value of None will automatically
            include all beams). For ATL09, acceptable values are ['profile_1', 'profile_2', 'profile_3'].
+           For ATL11, acceptable values are ['pt1','pt2','pt3'].
            For all other products, acceptable values are ['gt1l', 'gt1r', 'gt2l', 'gt2r', 'gt3l', 'gt3r'].

        keyword_list : list of strings, default None