From 34a2146fccbd81d54b36c7a41db15b6359a5f539 Mon Sep 17 00:00:00 2001
From: Ned Molter <emolter@stsci.edu>
Date: Fri, 27 Sep 2024 13:42:52 -0400
Subject: [PATCH 1/3] moved unit tests into stcal and out of here

---
 .../tests/test_algorithms.py                  |  12 -
 jwst/outlier_detection/tests/test_utils.py    | 164 -----------
 jwst/outlier_detection/utils.py               | 272 +-----------------
 3 files changed, 7 insertions(+), 441 deletions(-)
 delete mode 100644 jwst/outlier_detection/tests/test_utils.py

diff --git a/jwst/outlier_detection/tests/test_algorithms.py b/jwst/outlier_detection/tests/test_algorithms.py
index d1c8ffd85d..b3b9255bb4 100644
--- a/jwst/outlier_detection/tests/test_algorithms.py
+++ b/jwst/outlier_detection/tests/test_algorithms.py
@@ -1,7 +1,6 @@
 import numpy as np
 
 from jwst.outlier_detection.tso import moving_median_over_zeroth_axis
-from jwst.outlier_detection.utils import nanmedian3D
 
 
 def test_rolling_median():
@@ -15,14 +14,3 @@ def test_rolling_median():
     result = moving_median_over_zeroth_axis(arr, w)
     expected = expected_time_axis[:, np.newaxis, np.newaxis] * spatial_axis[np.newaxis, :, :]
     assert np.allclose(result, expected)
-
-
-def test_nanmedian3D():
-
-    shp = (11, 50, 60)
-    cube = np.random.normal(size=shp)
-    cube[5, 5:7, 5:8] = np.nan
-    med = nanmedian3D(cube)
-
-    assert med.dtype == np.float32
-    assert np.allclose(med, np.nanmedian(cube, axis=0), equal_nan=True)
\ No newline at end of file
diff --git a/jwst/outlier_detection/tests/test_utils.py b/jwst/outlier_detection/tests/test_utils.py
deleted file mode 100644
index d1fc0bd576..0000000000
--- a/jwst/outlier_detection/tests/test_utils.py
+++ /dev/null
@@ -1,164 +0,0 @@
-import pytest
-from jwst.outlier_detection.utils import DiskAppendableArray, OnDiskMedian
-from pathlib import Path
-import numpy as np
-import os
-
-
-def test_disk_appendable_array(tmp_cwd):
-
-    slice_shape = (8,7)
-    dtype = "float32"
-    tempdir = tmp_cwd / Path("tmptest")
-    os.mkdir(tempdir)
-    fname = tempdir / "test.bin"
-
-    arr = DiskAppendableArray(slice_shape, dtype, fname)
-
-    # check temporary file setup
-    assert str(arr._filename).split("/")[-1] in os.listdir(tempdir)
-    assert len(os.listdir(tempdir)) == 1
-    assert arr.shape == (0,) + slice_shape
-
-    # check cwd contains no files
-    assert all([not os.path.isfile(f) for f in os.listdir(tmp_cwd)])
-
-    # check expected append failures
-    with pytest.raises(ValueError):
-        candidate = np.zeros((7,7), dtype=dtype)
-        arr.append(candidate)
-    with pytest.raises(ValueError):
-        candidate = np.zeros((8,7), dtype='float64')
-        arr.append(candidate)
-
-    # check append and read
-    candidate0 = np.zeros(slice_shape, dtype=dtype)
-    candidate1 = np.full(slice_shape, 2, dtype=dtype)
-    candidate2 = np.full(slice_shape, np.nan, dtype=dtype)
-    for candidate in [candidate0, candidate1, candidate2]:
-        arr.append(candidate)
-    
-    arr_in_memory = arr.read()
-
-    assert arr_in_memory.shape == (3,) + slice_shape
-    assert np.all(arr_in_memory[0] == candidate0)
-    assert np.all(arr_in_memory[1] == candidate1)
-    assert np.allclose(arr_in_memory[2], candidate2, equal_nan=True)
-
-
-def test_disk_appendable_array_bad_inputs(tmp_cwd):
-
-    slice_shape = (8,7)
-    dtype = "float32"
-    tempdir = tmp_cwd / Path("tmptest")
-    fname = "test.bin"
-    
-    # test input directory does not exist
-    with pytest.raises(FileNotFoundError):
-        DiskAppendableArray(slice_shape, dtype, tempdir / fname)
-
-    # make the input directory
-    os.mkdir(tempdir)
-
-    # ensure failure if slice_shape is not 2-D
-    with pytest.raises(ValueError):
-        DiskAppendableArray((3,5,7), dtype, tempdir / fname)
-
-    # ensure failure if dtype is not valid
-    with pytest.raises(TypeError):
-        DiskAppendableArray(slice_shape, "float3", tempdir / fname)
-
-    # ensure failure if pass directory instead of filename
-    with pytest.raises(IsADirectoryError):
-        DiskAppendableArray(slice_shape, "float3", tempdir)
-
-
-# filter warnings such that "ResourceWarning: Implictly cleaning up" is raised as error
-# unfortunately need to do this indirectly by finding "finalize" string in PytestUnraisableException
-@pytest.mark.filterwarnings("error:.*finalize.*:pytest.PytestUnraisableExceptionWarning")
-def test_on_disk_median(tmp_cwd):
-
-    library_length = 3
-    frame_shape = (21,20)
-    dtype = 'float32'
-    tempdir = tmp_cwd / Path("tmptest")
-    os.mkdir(tempdir)
-    shape = (library_length,) + frame_shape
-
-    median_computer = OnDiskMedian(shape, dtype=dtype, tempdir=tempdir)
-
-    # test compute buffer indices
-    # buffer size equals size of single input model by default
-    # which means we expect same number of sections as library length
-    # in reality there is often one more section than that because
-    # of necessity of integer number of rows per section, but math is exact in this case
-    expected_buffer_size = frame_shape[0] * frame_shape[1] * np.dtype(dtype).itemsize
-    expected_section_nrows = frame_shape[0] // library_length
-    assert median_computer.nsections == library_length
-    assert median_computer.section_nrows == expected_section_nrows
-    assert median_computer.buffer_size == expected_buffer_size
-
-    # test temp file setup
-    assert len(os.listdir(tempdir)) == 1
-    assert str(median_computer._temp_path).startswith(str(tempdir))
-    assert len(os.listdir(median_computer._temp_path)) == library_length
-    # check cwd and parent tempdir contain no files
-    assert all([not os.path.isfile(f) for f in os.listdir(tmp_cwd)])
-    assert all([not os.path.isfile(f) for f in os.listdir(tempdir)])
-    
-    # test validate data
-    with pytest.raises(ValueError):
-        candidate = np.zeros((20,20), dtype=dtype)
-        median_computer.add_image(candidate)
-    with pytest.raises(ValueError):
-        candidate = np.zeros((21,20), dtype='float64')
-        median_computer.add_image(candidate)
-
-    # test add and compute
-    candidate0 = np.full(frame_shape, 3, dtype=dtype)
-    candidate1 = np.full(frame_shape, 2, dtype=dtype)
-    candidate2 = np.full(frame_shape, np.nan, dtype=dtype)
-    for candidate in [candidate0, candidate1, candidate2]:
-        median_computer.add_image(candidate)
-    median = median_computer.compute_median()
-    median_computer.cleanup()
-    assert median.shape == frame_shape
-    assert np.all(median == 2.5)
-
-    # test expected error trying to add too many frames
-    # for loop to ensure always happens, not just the first time
-    candidate3 = np.zeros_like(candidate0)
-    for _ in range(2):
-        with pytest.raises(IndexError):
-            median_computer.add_image(candidate3)
-
-    # test cleanup of tmpdir and everything else
-    assert not os.path.exists(median_computer._temp_path)
-    assert len(os.listdir(tempdir)) == 0
-    assert len(os.listdir(os.getcwd())) == 1 # this is the "tmptest" directory
-
-
-@pytest.mark.filterwarnings("error:.*finalize.*:pytest.PytestUnraisableExceptionWarning")
-def test_on_disk_median_bad_inputs(tmp_cwd):
-
-    library_length = 3
-    frame_shape = (21,20)
-    dtype = 'float32'
-    tempdir = tmp_cwd / Path("tmptest")
-    os.mkdir(tempdir)
-    shape = (library_length,) + frame_shape
-
-    with pytest.raises(ValueError):
-        OnDiskMedian(frame_shape, dtype=dtype, tempdir=tempdir)
-
-    with pytest.raises(TypeError):
-        OnDiskMedian(shape, dtype="float3", tempdir=tempdir)
-
-    with pytest.raises(FileNotFoundError):
-        OnDiskMedian(shape, dtype="float32", tempdir="dne")
-
-    # ensure unreasonable buffer size will get set to minimum reasonable buffer
-    min_buffer = np.dtype(dtype).itemsize*frame_shape[1]*library_length
-    median_computer = OnDiskMedian(shape, dtype=dtype, tempdir=tempdir, buffer_size=-1)
-    assert median_computer.buffer_size == min_buffer
-    median_computer.cleanup()
\ No newline at end of file
diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py
index 03446aac5e..1563c2565d 100644
--- a/jwst/outlier_detection/utils.py
+++ b/jwst/outlier_detection/utils.py
@@ -4,15 +4,13 @@
 
 import copy
 from functools import partial
-import warnings
 import numpy as np
-import tempfile
-from pathlib import Path
 
 from jwst.lib.pipe_utils import match_nans_and_flags
 from jwst.resample.resample import compute_image_pixel_area
 from jwst.resample.resample_utils import build_driz_weight
 from stcal.outlier_detection.utils import compute_weight_threshold, gwcs_blot, flag_crs, flag_resampled_crs
+from stcal.outlier_detection.median import MedianComputer, nanmedian3D
 from stdatamodels.jwst import datamodels
 from . import _fileio
 
@@ -23,27 +21,6 @@
 
 DO_NOT_USE = datamodels.dqflags.pixel['DO_NOT_USE']
 OUTLIER = datamodels.dqflags.pixel['OUTLIER']
-_ONE_MB = 1 << 20
-
-
-def nanmedian3D(cube, overwrite_input=True):
-    """Compute the median of a cube ignoring warnings and with memory efficiency.
-    np.nanmedian always uses at least 64-bit precision internally, and this is too
-    memory-intensive. Instead, loop over the median calculation to avoid the
-    memory usage of the internal upcasting and temporary array allocation.
-    The additional runtime of this loop is indistinguishable from zero,
-    but this loop cuts overall step memory usage roughly in half for at least one
-    test association.
-    """
-    with warnings.catch_warnings():
-        warnings.filterwarnings(action="ignore",
-                                message="All-NaN slice encountered",
-                                category=RuntimeWarning)
-        output_arr = np.empty(cube.shape[1:], dtype=np.float32)
-        for i in range(output_arr.shape[0]):
-            # this for loop looks silly, but see docstring above
-            np.nanmedian(cube[:,i,:], axis=0, overwrite_input=overwrite_input, out=output_arr[i,:])
-        return output_arr
 
 
 def create_cube_median(cube_model, maskpt):
@@ -115,14 +92,14 @@ def median_without_resampling(input_models,
             if i == 0:
                 input_shape = (ngroups,)+drizzled_model.data.shape
                 dtype = drizzled_model.data.dtype
-                median_computer = _make_median_computer(input_shape, in_memory, buffer_size, dtype)
+                computer = MedianComputer(input_shape, in_memory, buffer_size, dtype)
 
             weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt)
             drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan
-            _append_to_median_computer(median_computer, i, drizzled_model.data, in_memory)
+            computer.append(drizzled_model.data, i)
 
     # Perform median combination on set of drizzled mosaics
-    median_data = _evaluate_median_computer(median_computer, in_memory)
+    median_data = computer.evaluate()
 
     if save_intermediate_results:
         # Save median model to fits
@@ -187,15 +164,14 @@ def median_with_resampling(input_models,
             if i == 0:
                 input_shape = (ngroups,)+drizzled_model.data.shape
                 dtype = drizzled_model.data.dtype
-                median_computer = _make_median_computer(input_shape, in_memory, buffer_size, dtype)
+                computer = MedianComputer(input_shape, in_memory, buffer_size, dtype)
 
             weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt)
             drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan
-            _append_to_median_computer(median_computer, i, drizzled_model.data, in_memory)
-
+            computer.append(drizzled_model.data, i)
 
     # Perform median combination on set of drizzled mosaics
-    median_data = _evaluate_median_computer(median_computer, in_memory)
+    median_data = computer.evaluate()
 
     if save_intermediate_results:
         # Save median model to fits
@@ -210,240 +186,6 @@ def median_with_resampling(input_models,
     return median_data, median_wcs
 
 
-def _make_median_computer(full_shape, in_memory, buffer_size, dtype):
-
-    if in_memory:
-        # allocate memory for data arrays that go into median
-        median_computer = np.empty(full_shape, dtype=np.float32)
-    else:
-        # set up temporary storage for data arrays that go into median
-        median_computer = OnDiskMedian(full_shape,
-                                        dtype=dtype,
-                                        buffer_size=buffer_size)
-    return median_computer
-
-
-def _append_to_median_computer(median_computer, idx, data, in_memory):
-    if in_memory:
-        # populate pre-allocated memory with the drizzled data
-        median_computer[idx] = data
-    else:
-        # distribute the drizzled data into the temporary storage
-        median_computer.add_image(data)
-
-
-def _evaluate_median_computer(median_computer, in_memory):
-    if in_memory:
-        median_data = nanmedian3D(median_computer)
-        del median_computer
-    else:
-        median_data = median_computer.compute_median()
-        median_computer.cleanup()
-    return median_data
-
-
-class DiskAppendableArray:
-    """
-    Creates a temporary file to which to append data, in order to perform
-    timewise operations on a stack of input images without holding all of them
-    in memory.
-    
-    This class is purpose-built for the median computation during outlier detection 
-    and is not very flexible. It is assumed that each data array passed to `append`
-    represents the same spatial segment of the full dataset. It is also assumed that 
-    each data array passed to `append` represents only a single instant in time; 
-    the append operation will stack them along a new axis.
-    
-    The `read` operation is only capable of loading the full array back into memory.
-    When working with large datasets that do not fit in memory, the
-    required workflow is to create many DiskAppendableArray objects, each holding
-    a small spatial segment of the full dataset.
-    """
-    
-    def __init__(self, slice_shape, dtype, filename):
-        """
-        Parameters
-        ----------
-        slice_shape : tuple
-            The shape of the spatial section of input data to be appended to the array.
-
-        dtype : str
-            The data type of the array. Must be a valid numpy array datatype.
-
-        filename : str
-            The full file path in which to store the array
-        """
-        if len(slice_shape) != 2:
-            raise ValueError(f"Invalid slice_shape {slice_shape}. Only 2-D arrays are supported.")
-        self._filename = Path(filename)
-        with open(filename, "wb") as f:   # noqa: F841
-            pass
-        self._slice_shape = slice_shape
-        self._dtype = np.dtype(dtype)
-        self._append_count = 0
-
-
-    @property
-    def shape(self):
-        return (self._append_count,) + self._slice_shape
-
-
-    def append(self, data):
-        """Add a new slice to the temporary file"""
-        if data.shape != self._slice_shape:
-            raise ValueError(f"Data shape {data.shape} does not match slice shape {self._slice_shape}")
-        if data.dtype != self._dtype:
-            raise ValueError(f"Data dtype {data.dtype} does not match array dtype {self._dtype}")
-        with open(self._filename, "ab") as f:
-            data.tofile(f, sep="")
-        self._append_count += 1
-
-
-    def read(self):
-        """Read the 3-D array into memory"""
-        shp = (self._append_count,) + self._slice_shape
-        with open(self._filename, "rb") as f:
-            output = np.fromfile(f, dtype=self._dtype).reshape(shp)
-        return output
-
-
-class OnDiskMedian:
-
-    def __init__(self, shape, dtype='float32', tempdir="", buffer_size=None):
-        """
-        Set up temporary files to perform operations on a stack of 2-D input arrays
-        along the stacking axis (e.g., a time axis) without
-        holding all of them in memory. Currently the only supported operation
-        is the median.
-
-        Parameters
-        ----------
-        shape: tuple
-            The shape of the entire input, (n_images, imrows, imcols).
-
-        dtype : str
-            The data type of the input data.
-
-        tempdir : str
-            The parent directory in which to create the temporary directory,
-            which itself holds all the DiskAppendableArray tempfiles.
-            Default is the current working directory.
-
-        buffer_size : int, optional
-            The buffer size, units of bytes.
-            Default is the size of one input image.
-        """
-        if len(shape) != 3:
-            raise ValueError(f"Invalid input shape {shape}; only three-dimensional data are supported.")
-        self._expected_nframes = shape[0]
-        self.frame_shape = shape[1:]
-        self.dtype = np.dtype(dtype)
-        self.itemsize = self.dtype.itemsize
-        self._temp_dir = tempfile.TemporaryDirectory(dir=tempdir)
-        self._temp_path = Path(self._temp_dir.name)
-
-        # figure out number of sections and rows per section that are needed
-        self.nsections, self.section_nrows = self._get_buffer_indices(buffer_size=buffer_size)
-        self.slice_shape = (self.section_nrows, shape[2])
-        self._n_adds = 0
-
-        # instantiate a temporary DiskAppendableArray for each section
-        self._temp_arrays = self._temparray_setup(dtype)
-
-
-    def _get_buffer_indices(self, buffer_size=None):
-        """
-        Parameters
-        ----------
-        buffer_size : int, optional
-            The buffer size for the median computation, units of bytes.
-
-        Returns
-        -------
-        nsections : int
-            The number of sections to divide the input data into.
-
-        section_nrows : int
-            The number of rows in each section (except the last one).
-        """
-        imrows, imcols = self.frame_shape
-        if buffer_size is None:
-            buffer_size = imrows * imcols * self.itemsize
-        per_model_buffer_size = buffer_size / self._expected_nframes
-        min_buffer_size = imcols * self.itemsize
-        section_nrows = min(imrows, int(per_model_buffer_size // min_buffer_size))
-
-        if section_nrows <= 0:
-            buffer_size = min_buffer_size * self._expected_nframes
-            log.warning("Buffer size is too small to hold a single row."
-                            f"Increasing buffer size to {buffer_size / _ONE_MB}MB")
-            section_nrows = 1
-        self.buffer_size = buffer_size
-
-        nsections = int(np.ceil(imrows / section_nrows))
-        log.info(f"Computing median over {self._expected_nframes} groups in {nsections} "
-                    f"sections with total memory buffer {buffer_size / _ONE_MB} MB")
-        return nsections, section_nrows
-
-
-    def _temparray_setup(self, dtype):
-        """Set up temp file handlers for each spatial section"""
-        temp_arrays = []
-        for i in range(0, self.nsections):
-            shp = self.slice_shape
-            if i == self.nsections - 1:
-                # last section has whatever shape is left over
-                shp = (self.frame_shape[0] - (self.nsections-1) * self.section_nrows, self.frame_shape[1])
-            arr = DiskAppendableArray(shp, dtype, self._temp_path / f"{i}.bin")
-            temp_arrays.append(arr)
-        return temp_arrays
-
-
-    def add_image(self, data):
-        """Split resampled model data into spatial sections and write to disk."""
-        if self._n_adds >= self.nsections:
-            raise IndexError(f"Too many calls to add_image. Expected at most {self.nsections} input models.")
-        self._validate_data(data)
-        self._n_adds += 1
-        for i in range(self.nsections):
-            row1 = i * self.section_nrows
-            row2 = min(row1 + self.section_nrows, self.frame_shape[0])
-            arr = self._temp_arrays[i]
-            arr.append(data[row1:row2])
-
-
-    def _validate_data(self, data):
-        if data.shape != self.frame_shape:
-            raise ValueError(f"Data shape {data.shape} does not match expected shape {self.frame_shape}")
-        if data.dtype != self.dtype:
-            raise ValueError(f"Data dtype {data.dtype} does not match expected dtype {self.dtype}")
-        
-
-    def cleanup(self):
-        """Remove the temporary files and directory when finished"""
-        self._temp_dir.cleanup()
-        return
-
-
-    def compute_median(self):
-        """Read spatial sections from disk and compute the median across groups
-        (median over number of exposures on a per-pixel basis)"""
-        row_indices = [(i * self.section_nrows, min((i+1) * self.section_nrows, self.frame_shape[0]))
-                       for i in range(self.nsections)]
-
-        output_rows = row_indices[-1][1]
-        output_cols = self._temp_arrays[0].shape[2]
-        median_image = np.full((output_rows, output_cols), np.nan, dtype=self.dtype)
-
-        for i, disk_arr in enumerate(self._temp_arrays):
-            row1, row2 = row_indices[i]
-            arr = disk_arr.read()
-            median_image[row1:row2] = nanmedian3D(arr)
-            del arr, disk_arr
-
-        return median_image
-
-
 def flag_crs_in_models(
     input_models,
     median_data,

From a2c7f3b87a075c9ba51f9aa5cac8b50c812b4a74 Mon Sep 17 00:00:00 2001
From: Ned Molter <emolter@stsci.edu>
Date: Fri, 27 Sep 2024 14:31:57 -0400
Subject: [PATCH 2/3] added changelog entry

---
 changes/8840.outlier_detection.rst | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changes/8840.outlier_detection.rst

diff --git a/changes/8840.outlier_detection.rst b/changes/8840.outlier_detection.rst
new file mode 100644
index 0000000000..0d1fe91bb7
--- /dev/null
+++ b/changes/8840.outlier_detection.rst
@@ -0,0 +1 @@
+Moved median computers out of the jwst repository and into stcal.
\ No newline at end of file

From b353951ad1b35e8754629c182add3b1c97257699 Mon Sep 17 00:00:00 2001
From: Ned Molter <emolter@stsci.edu>
Date: Tue, 1 Oct 2024 14:28:09 -0400
Subject: [PATCH 3/3] update pyproject.toml

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index def298ae11..c73a69fef9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,7 +34,7 @@ dependencies = [
     "scikit-image>=0.19",
     "scipy>=1.9.3",
     "spherical-geometry>=1.2.22",
-    "stcal>=1.9.0,<1.10.0",
+    "stcal @ git+https://github.com/spacetelescope/stcal.git@main",
     "stdatamodels>=2.1.0,<2.2.0",
     "stpipe>=0.7.0,<0.8.0",
     "stsci.imagestats>=1.6.3",