From 34a2146fccbd81d54b36c7a41db15b6359a5f539 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Fri, 27 Sep 2024 13:42:52 -0400 Subject: [PATCH 1/3] moved unit tests into stcal and out of here --- .../tests/test_algorithms.py | 12 - jwst/outlier_detection/tests/test_utils.py | 164 ----------- jwst/outlier_detection/utils.py | 272 +----------------- 3 files changed, 7 insertions(+), 441 deletions(-) delete mode 100644 jwst/outlier_detection/tests/test_utils.py diff --git a/jwst/outlier_detection/tests/test_algorithms.py b/jwst/outlier_detection/tests/test_algorithms.py index d1c8ffd85d..b3b9255bb4 100644 --- a/jwst/outlier_detection/tests/test_algorithms.py +++ b/jwst/outlier_detection/tests/test_algorithms.py @@ -1,7 +1,6 @@ import numpy as np from jwst.outlier_detection.tso import moving_median_over_zeroth_axis -from jwst.outlier_detection.utils import nanmedian3D def test_rolling_median(): @@ -15,14 +14,3 @@ def test_rolling_median(): result = moving_median_over_zeroth_axis(arr, w) expected = expected_time_axis[:, np.newaxis, np.newaxis] * spatial_axis[np.newaxis, :, :] assert np.allclose(result, expected) - - -def test_nanmedian3D(): - - shp = (11, 50, 60) - cube = np.random.normal(size=shp) - cube[5, 5:7, 5:8] = np.nan - med = nanmedian3D(cube) - - assert med.dtype == np.float32 - assert np.allclose(med, np.nanmedian(cube, axis=0), equal_nan=True) \ No newline at end of file diff --git a/jwst/outlier_detection/tests/test_utils.py b/jwst/outlier_detection/tests/test_utils.py deleted file mode 100644 index d1fc0bd576..0000000000 --- a/jwst/outlier_detection/tests/test_utils.py +++ /dev/null @@ -1,164 +0,0 @@ -import pytest -from jwst.outlier_detection.utils import DiskAppendableArray, OnDiskMedian -from pathlib import Path -import numpy as np -import os - - -def test_disk_appendable_array(tmp_cwd): - - slice_shape = (8,7) - dtype = "float32" - tempdir = tmp_cwd / Path("tmptest") - os.mkdir(tempdir) - fname = tempdir / "test.bin" - - arr = DiskAppendableArray(slice_shape, dtype, fname) - - # check temporary file setup - assert str(arr._filename).split("/")[-1] in os.listdir(tempdir) - assert len(os.listdir(tempdir)) == 1 - assert arr.shape == (0,) + slice_shape - - # check cwd contains no files - assert all([not os.path.isfile(f) for f in os.listdir(tmp_cwd)]) - - # check expected append failures - with pytest.raises(ValueError): - candidate = np.zeros((7,7), dtype=dtype) - arr.append(candidate) - with pytest.raises(ValueError): - candidate = np.zeros((8,7), dtype='float64') - arr.append(candidate) - - # check append and read - candidate0 = np.zeros(slice_shape, dtype=dtype) - candidate1 = np.full(slice_shape, 2, dtype=dtype) - candidate2 = np.full(slice_shape, np.nan, dtype=dtype) - for candidate in [candidate0, candidate1, candidate2]: - arr.append(candidate) - - arr_in_memory = arr.read() - - assert arr_in_memory.shape == (3,) + slice_shape - assert np.all(arr_in_memory[0] == candidate0) - assert np.all(arr_in_memory[1] == candidate1) - assert np.allclose(arr_in_memory[2], candidate2, equal_nan=True) - - -def test_disk_appendable_array_bad_inputs(tmp_cwd): - - slice_shape = (8,7) - dtype = "float32" - tempdir = tmp_cwd / Path("tmptest") - fname = "test.bin" - - # test input directory does not exist - with pytest.raises(FileNotFoundError): - DiskAppendableArray(slice_shape, dtype, tempdir / fname) - - # make the input directory - os.mkdir(tempdir) - - # ensure failure if slice_shape is not 2-D - with pytest.raises(ValueError): - DiskAppendableArray((3,5,7), dtype, tempdir / fname) - - # ensure failure if dtype is not valid - with pytest.raises(TypeError): - DiskAppendableArray(slice_shape, "float3", tempdir / fname) - - # ensure failure if pass directory instead of filename - with pytest.raises(IsADirectoryError): - DiskAppendableArray(slice_shape, "float3", tempdir) - - -# filter warnings such that "ResourceWarning: Implictly cleaning up" is raised as error -# unfortunately need to do this indirectly by finding "finalize" string in PytestUnraisableException -@pytest.mark.filterwarnings("error:.*finalize.*:pytest.PytestUnraisableExceptionWarning") -def test_on_disk_median(tmp_cwd): - - library_length = 3 - frame_shape = (21,20) - dtype = 'float32' - tempdir = tmp_cwd / Path("tmptest") - os.mkdir(tempdir) - shape = (library_length,) + frame_shape - - median_computer = OnDiskMedian(shape, dtype=dtype, tempdir=tempdir) - - # test compute buffer indices - # buffer size equals size of single input model by default - # which means we expect same number of sections as library length - # in reality there is often one more section than that because - # of necessity of integer number of rows per section, but math is exact in this case - expected_buffer_size = frame_shape[0] * frame_shape[1] * np.dtype(dtype).itemsize - expected_section_nrows = frame_shape[0] // library_length - assert median_computer.nsections == library_length - assert median_computer.section_nrows == expected_section_nrows - assert median_computer.buffer_size == expected_buffer_size - - # test temp file setup - assert len(os.listdir(tempdir)) == 1 - assert str(median_computer._temp_path).startswith(str(tempdir)) - assert len(os.listdir(median_computer._temp_path)) == library_length - # check cwd and parent tempdir contain no files - assert all([not os.path.isfile(f) for f in os.listdir(tmp_cwd)]) - assert all([not os.path.isfile(f) for f in os.listdir(tempdir)]) - - # test validate data - with pytest.raises(ValueError): - candidate = np.zeros((20,20), dtype=dtype) - median_computer.add_image(candidate) - with pytest.raises(ValueError): - candidate = np.zeros((21,20), dtype='float64') - median_computer.add_image(candidate) - - # test add and compute - candidate0 = np.full(frame_shape, 3, dtype=dtype) - candidate1 = np.full(frame_shape, 2, dtype=dtype) - candidate2 = np.full(frame_shape, np.nan, dtype=dtype) - for candidate in [candidate0, candidate1, candidate2]: - median_computer.add_image(candidate) - median = median_computer.compute_median() - median_computer.cleanup() - assert median.shape == frame_shape - assert np.all(median == 2.5) - - # test expected error trying to add too many frames - # for loop to ensure always happens, not just the first time - candidate3 = np.zeros_like(candidate0) - for _ in range(2): - with pytest.raises(IndexError): - median_computer.add_image(candidate3) - - # test cleanup of tmpdir and everything else - assert not os.path.exists(median_computer._temp_path) - assert len(os.listdir(tempdir)) == 0 - assert len(os.listdir(os.getcwd())) == 1 # this is the "tmptest" directory - - -@pytest.mark.filterwarnings("error:.*finalize.*:pytest.PytestUnraisableExceptionWarning") -def test_on_disk_median_bad_inputs(tmp_cwd): - - library_length = 3 - frame_shape = (21,20) - dtype = 'float32' - tempdir = tmp_cwd / Path("tmptest") - os.mkdir(tempdir) - shape = (library_length,) + frame_shape - - with pytest.raises(ValueError): - OnDiskMedian(frame_shape, dtype=dtype, tempdir=tempdir) - - with pytest.raises(TypeError): - OnDiskMedian(shape, dtype="float3", tempdir=tempdir) - - with pytest.raises(FileNotFoundError): - OnDiskMedian(shape, dtype="float32", tempdir="dne") - - # ensure unreasonable buffer size will get set to minimum reasonable buffer - min_buffer = np.dtype(dtype).itemsize*frame_shape[1]*library_length - median_computer = OnDiskMedian(shape, dtype=dtype, tempdir=tempdir, buffer_size=-1) - assert median_computer.buffer_size == min_buffer - median_computer.cleanup() \ No newline at end of file diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 03446aac5e..1563c2565d 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -4,15 +4,13 @@ import copy from functools import partial -import warnings import numpy as np -import tempfile -from pathlib import Path from jwst.lib.pipe_utils import match_nans_and_flags from jwst.resample.resample import compute_image_pixel_area from jwst.resample.resample_utils import build_driz_weight from stcal.outlier_detection.utils import compute_weight_threshold, gwcs_blot, flag_crs, flag_resampled_crs +from stcal.outlier_detection.median import MedianComputer, nanmedian3D from stdatamodels.jwst import datamodels from . import _fileio @@ -23,27 +21,6 @@ DO_NOT_USE = datamodels.dqflags.pixel['DO_NOT_USE'] OUTLIER = datamodels.dqflags.pixel['OUTLIER'] -_ONE_MB = 1 << 20 - - -def nanmedian3D(cube, overwrite_input=True): - """Compute the median of a cube ignoring warnings and with memory efficiency. - np.nanmedian always uses at least 64-bit precision internally, and this is too - memory-intensive. Instead, loop over the median calculation to avoid the - memory usage of the internal upcasting and temporary array allocation. - The additional runtime of this loop is indistinguishable from zero, - but this loop cuts overall step memory usage roughly in half for at least one - test association. - """ - with warnings.catch_warnings(): - warnings.filterwarnings(action="ignore", - message="All-NaN slice encountered", - category=RuntimeWarning) - output_arr = np.empty(cube.shape[1:], dtype=np.float32) - for i in range(output_arr.shape[0]): - # this for loop looks silly, but see docstring above - np.nanmedian(cube[:,i,:], axis=0, overwrite_input=overwrite_input, out=output_arr[i,:]) - return output_arr def create_cube_median(cube_model, maskpt): @@ -115,14 +92,14 @@ def median_without_resampling(input_models, if i == 0: input_shape = (ngroups,)+drizzled_model.data.shape dtype = drizzled_model.data.dtype - median_computer = _make_median_computer(input_shape, in_memory, buffer_size, dtype) + computer = MedianComputer(input_shape, in_memory, buffer_size, dtype) weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan - _append_to_median_computer(median_computer, i, drizzled_model.data, in_memory) + computer.append(drizzled_model.data, i) # Perform median combination on set of drizzled mosaics - median_data = _evaluate_median_computer(median_computer, in_memory) + median_data = computer.evaluate() if save_intermediate_results: # Save median model to fits @@ -187,15 +164,14 @@ def median_with_resampling(input_models, if i == 0: input_shape = (ngroups,)+drizzled_model.data.shape dtype = drizzled_model.data.dtype - median_computer = _make_median_computer(input_shape, in_memory, buffer_size, dtype) + computer = MedianComputer(input_shape, in_memory, buffer_size, dtype) weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan - _append_to_median_computer(median_computer, i, drizzled_model.data, in_memory) - + computer.append(drizzled_model.data, i) # Perform median combination on set of drizzled mosaics - median_data = _evaluate_median_computer(median_computer, in_memory) + median_data = computer.evaluate() if save_intermediate_results: # Save median model to fits @@ -210,240 +186,6 @@ def median_with_resampling(input_models, return median_data, median_wcs -def _make_median_computer(full_shape, in_memory, buffer_size, dtype): - - if in_memory: - # allocate memory for data arrays that go into median - median_computer = np.empty(full_shape, dtype=np.float32) - else: - # set up temporary storage for data arrays that go into median - median_computer = OnDiskMedian(full_shape, - dtype=dtype, - buffer_size=buffer_size) - return median_computer - - -def _append_to_median_computer(median_computer, idx, data, in_memory): - if in_memory: - # populate pre-allocated memory with the drizzled data - median_computer[idx] = data - else: - # distribute the drizzled data into the temporary storage - median_computer.add_image(data) - - -def _evaluate_median_computer(median_computer, in_memory): - if in_memory: - median_data = nanmedian3D(median_computer) - del median_computer - else: - median_data = median_computer.compute_median() - median_computer.cleanup() - return median_data - - -class DiskAppendableArray: - """ - Creates a temporary file to which to append data, in order to perform - timewise operations on a stack of input images without holding all of them - in memory. - - This class is purpose-built for the median computation during outlier detection - and is not very flexible. It is assumed that each data array passed to `append` - represents the same spatial segment of the full dataset. It is also assumed that - each data array passed to `append` represents only a single instant in time; - the append operation will stack them along a new axis. - - The `read` operation is only capable of loading the full array back into memory. - When working with large datasets that do not fit in memory, the - required workflow is to create many DiskAppendableArray objects, each holding - a small spatial segment of the full dataset. - """ - - def __init__(self, slice_shape, dtype, filename): - """ - Parameters - ---------- - slice_shape : tuple - The shape of the spatial section of input data to be appended to the array. - - dtype : str - The data type of the array. Must be a valid numpy array datatype. - - filename : str - The full file path in which to store the array - """ - if len(slice_shape) != 2: - raise ValueError(f"Invalid slice_shape {slice_shape}. Only 2-D arrays are supported.") - self._filename = Path(filename) - with open(filename, "wb") as f: # noqa: F841 - pass - self._slice_shape = slice_shape - self._dtype = np.dtype(dtype) - self._append_count = 0 - - - @property - def shape(self): - return (self._append_count,) + self._slice_shape - - - def append(self, data): - """Add a new slice to the temporary file""" - if data.shape != self._slice_shape: - raise ValueError(f"Data shape {data.shape} does not match slice shape {self._slice_shape}") - if data.dtype != self._dtype: - raise ValueError(f"Data dtype {data.dtype} does not match array dtype {self._dtype}") - with open(self._filename, "ab") as f: - data.tofile(f, sep="") - self._append_count += 1 - - - def read(self): - """Read the 3-D array into memory""" - shp = (self._append_count,) + self._slice_shape - with open(self._filename, "rb") as f: - output = np.fromfile(f, dtype=self._dtype).reshape(shp) - return output - - -class OnDiskMedian: - - def __init__(self, shape, dtype='float32', tempdir="", buffer_size=None): - """ - Set up temporary files to perform operations on a stack of 2-D input arrays - along the stacking axis (e.g., a time axis) without - holding all of them in memory. Currently the only supported operation - is the median. - - Parameters - ---------- - shape: tuple - The shape of the entire input, (n_images, imrows, imcols). - - dtype : str - The data type of the input data. - - tempdir : str - The parent directory in which to create the temporary directory, - which itself holds all the DiskAppendableArray tempfiles. - Default is the current working directory. - - buffer_size : int, optional - The buffer size, units of bytes. - Default is the size of one input image. - """ - if len(shape) != 3: - raise ValueError(f"Invalid input shape {shape}; only three-dimensional data are supported.") - self._expected_nframes = shape[0] - self.frame_shape = shape[1:] - self.dtype = np.dtype(dtype) - self.itemsize = self.dtype.itemsize - self._temp_dir = tempfile.TemporaryDirectory(dir=tempdir) - self._temp_path = Path(self._temp_dir.name) - - # figure out number of sections and rows per section that are needed - self.nsections, self.section_nrows = self._get_buffer_indices(buffer_size=buffer_size) - self.slice_shape = (self.section_nrows, shape[2]) - self._n_adds = 0 - - # instantiate a temporary DiskAppendableArray for each section - self._temp_arrays = self._temparray_setup(dtype) - - - def _get_buffer_indices(self, buffer_size=None): - """ - Parameters - ---------- - buffer_size : int, optional - The buffer size for the median computation, units of bytes. - - Returns - ------- - nsections : int - The number of sections to divide the input data into. - - section_nrows : int - The number of rows in each section (except the last one). - """ - imrows, imcols = self.frame_shape - if buffer_size is None: - buffer_size = imrows * imcols * self.itemsize - per_model_buffer_size = buffer_size / self._expected_nframes - min_buffer_size = imcols * self.itemsize - section_nrows = min(imrows, int(per_model_buffer_size // min_buffer_size)) - - if section_nrows <= 0: - buffer_size = min_buffer_size * self._expected_nframes - log.warning("Buffer size is too small to hold a single row." - f"Increasing buffer size to {buffer_size / _ONE_MB}MB") - section_nrows = 1 - self.buffer_size = buffer_size - - nsections = int(np.ceil(imrows / section_nrows)) - log.info(f"Computing median over {self._expected_nframes} groups in {nsections} " - f"sections with total memory buffer {buffer_size / _ONE_MB} MB") - return nsections, section_nrows - - - def _temparray_setup(self, dtype): - """Set up temp file handlers for each spatial section""" - temp_arrays = [] - for i in range(0, self.nsections): - shp = self.slice_shape - if i == self.nsections - 1: - # last section has whatever shape is left over - shp = (self.frame_shape[0] - (self.nsections-1) * self.section_nrows, self.frame_shape[1]) - arr = DiskAppendableArray(shp, dtype, self._temp_path / f"{i}.bin") - temp_arrays.append(arr) - return temp_arrays - - - def add_image(self, data): - """Split resampled model data into spatial sections and write to disk.""" - if self._n_adds >= self.nsections: - raise IndexError(f"Too many calls to add_image. Expected at most {self.nsections} input models.") - self._validate_data(data) - self._n_adds += 1 - for i in range(self.nsections): - row1 = i * self.section_nrows - row2 = min(row1 + self.section_nrows, self.frame_shape[0]) - arr = self._temp_arrays[i] - arr.append(data[row1:row2]) - - - def _validate_data(self, data): - if data.shape != self.frame_shape: - raise ValueError(f"Data shape {data.shape} does not match expected shape {self.frame_shape}") - if data.dtype != self.dtype: - raise ValueError(f"Data dtype {data.dtype} does not match expected dtype {self.dtype}") - - - def cleanup(self): - """Remove the temporary files and directory when finished""" - self._temp_dir.cleanup() - return - - - def compute_median(self): - """Read spatial sections from disk and compute the median across groups - (median over number of exposures on a per-pixel basis)""" - row_indices = [(i * self.section_nrows, min((i+1) * self.section_nrows, self.frame_shape[0])) - for i in range(self.nsections)] - - output_rows = row_indices[-1][1] - output_cols = self._temp_arrays[0].shape[2] - median_image = np.full((output_rows, output_cols), np.nan, dtype=self.dtype) - - for i, disk_arr in enumerate(self._temp_arrays): - row1, row2 = row_indices[i] - arr = disk_arr.read() - median_image[row1:row2] = nanmedian3D(arr) - del arr, disk_arr - - return median_image - - def flag_crs_in_models( input_models, median_data, From a2c7f3b87a075c9ba51f9aa5cac8b50c812b4a74 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Fri, 27 Sep 2024 14:31:57 -0400 Subject: [PATCH 2/3] added changelog entry --- changes/8840.outlier_detection.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/8840.outlier_detection.rst diff --git a/changes/8840.outlier_detection.rst b/changes/8840.outlier_detection.rst new file mode 100644 index 0000000000..0d1fe91bb7 --- /dev/null +++ b/changes/8840.outlier_detection.rst @@ -0,0 +1 @@ +Moved median computers out of the jwst repository and into stcal. \ No newline at end of file From b353951ad1b35e8754629c182add3b1c97257699 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Tue, 1 Oct 2024 14:28:09 -0400 Subject: [PATCH 3/3] update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index def298ae11..c73a69fef9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ dependencies = [ "scikit-image>=0.19", "scipy>=1.9.3", "spherical-geometry>=1.2.22", - "stcal>=1.9.0,<1.10.0", + "stcal @ git+https://github.com/spacetelescope/stcal.git@main", "stdatamodels>=2.1.0,<2.2.0", "stpipe>=0.7.0,<0.8.0", "stsci.imagestats>=1.6.3",