Merge pull request #92 from TUW-GEO/dev
merge for v1.1.7
raphaelquast committed May 5, 2020
2 parents 2c304d4 + a22cbfa commit 1ead235
Showing 3 changed files with 105 additions and 44 deletions.
2 changes: 1 addition & 1 deletion rt1/__init__.py
@@ -2,5 +2,5 @@
Import module for RT1 module
"""

__version__ = '1.1.6'
__version__ = '1.1.7'
__author__ = 'Raphael Quast'
46 changes: 25 additions & 21 deletions rt1/general_functions.py
@@ -7,8 +7,9 @@
from itertools import tee, islice
from collections import OrderedDict


def rectangularize(array, return_mask=False, dim=None,
return_masked=False):
return_masked=False, dtype=None):
'''
return a rectangularized version of the input-array by repeating the
last value to obtain the smallest possible rectangular shape.
@@ -31,7 +32,9 @@ def rectangularize(array, return_mask=False, dim=None,
if None, the shortest length of all sub-lists will be used
return_masked: bool (default=False)
indicator if a masked-array should be returned
dtype: type (default=None)
the dtype of the returned array. If None, the dtype of the first
element will be used
Returns:
----------
new_array: array-like
@@ -40,36 +43,37 @@ def rectangularize(array, return_mask=False, dim=None,
a mask indicating the added values
'''
# use this method to get the dtype of the first element since it works with
# pandas-Series, lists, arrays, dict-value views, etc.
if dtype is None:
dtype = np.array(next(islice(array, 1))).dtype

if dim is None:
# get longest dimension of sub-arrays
dim = len(max(array, key=len))

if return_mask is True or return_masked is True:
newarray, mask = [], []
for s in array:
adddim = dim - len(s)
m = np.full_like(s, False, dtype=bool)
if adddim > 0:
s = np.append(s, np.full(adddim, s[-1]))
m = np.append(m, np.full(adddim, True))
newarray += [s]
mask += [m]

newarray = np.array(newarray)
mask = np.array(mask, dtype=bool)
newarray = np.empty((len(array), dim), dtype=dtype)
mask = np.full((len(array), dim), False, dtype=bool)

for i, s in enumerate(array):
l = len(s)
newarray[i, :l] = s
newarray[i, l:] = s[-1]
mask[i, l:] = True

if return_masked is True:
return np.ma.masked_array(newarray, mask)
else:
return [newarray, mask]
else:
newarray = []
for s in array:
adddim = dim - len(s)
if adddim > 0:
s = np.append(s, np.full(adddim, s[-1]))
newarray += [s]
return np.array(newarray)
newarray = np.empty((len(array), dim), dtype=dtype)
for i, s in enumerate(array):
l = len(s)
newarray[i, :l] = s
newarray[i, l:] = s[-1]
return newarray
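
A minimal usage sketch of the updated rectangularize (assuming rt1 v1.1.7 is installed so the new dtype keyword exists; the sample data is illustrative):

import numpy as np
from rt1.general_functions import rectangularize

# ragged input: sub-lists of different lengths
ragged = [[1, 2, 3], [4], [5, 6]]

# pad each row with its own last value up to the longest row
arr = rectangularize(ragged, dtype=float)
# arr -> [[1., 2., 3.],
#         [4., 4., 4.],
#         [5., 6., 6.]]

# a masked array hides the padded entries again, e.g. for row-means
marr = rectangularize(ragged, return_masked=True)
print(np.ma.mean(marr, axis=1))  # -> [2.0, 4.0, 5.5]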



def meandatetime(datetimes):
101 changes: 79 additions & 22 deletions rt1/rtfits.py
@@ -130,9 +130,14 @@ class Fits(Scatter):
will be used together with the dataset-index to
assign the temporal variability within the fit
(see http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases)
- if both a pandas offset-alias and a dataset-column
"key_dyn" are provided, the provided variability
will be superimposed onto the variability resulting
- if freq is an integer (N), the dataset will be grouped
such that each group contains N unique dataset-indexes
(if an exact split is not possible, the split is
performed such that the group-sizes are as similar as
possible)
- if both a freq AND a dataset-column "key_dyn" are
provided, the provided variability will be
superimposed onto the variability resulting
from the chosen offset-alias
min, max: float (only needed if fitQ is True)
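
For context, this is how the (unchanged) offset-alias variant groups a datetime-index via pandas; a sketch with made-up dates, 'M' being the monthly alias:

import pandas as pd

idx = pd.date_range('2020-01-01', periods=90, freq='D', name='date')
# group the index by a monthly offset-alias, keeping the original order
grp_idx = idx.to_frame().groupby(pd.Grouper(freq='M'), sort=False)
print([len(val) for _, val in grp_idx])  # -> [31, 29, 30]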
@@ -454,21 +459,56 @@ def param_dyn_dict(self):
param_dyn_dict[key] = list(repeat(1, len(self.dataset.index)))
if freq is not None:
for i, f in enumerate(freq):
try:
grp_idx = self.dataset.index.to_frame().groupby(
pd.Grouper(freq=f),
sort=False)
except ValueError:
raise ValueError(f'The provided frequency ({f}) of ' +
f'{freqkeys[i]} is not a valid ' +
'pandas datetime-offset string. ' +
'Check the assignments in defdict!')
# get unique group indices for each datetime-group
for key in freqkeys[i]:
grp_data = []
for nval, [_, val] in enumerate(grp_idx):
grp_data += repeat(nval, len(val))
param_dyn_dict[key] = grp_data
if isinstance(f, str):
try:
grp_idx = self.dataset.index.to_frame().groupby(
pd.Grouper(freq=f),
sort=False)
except ValueError:
raise ValueError(f'The provided frequency ({f}) of ' +
f'{freqkeys[i]} is not a valid ' +
'pandas datetime-offset string. ' +
'Check the assignments in defdict!')
# get unique group indices for each datetime-group
for key in freqkeys[i]:
grp_data = []
for nval, [_, val] in enumerate(grp_idx):
grp_data += repeat(nval, len(val))
param_dyn_dict[key] = grp_data
elif isinstance(f, int):
# find the number of groups required to split the
# dataset into measurement-bins of length "f"
n_dat = self.dataset.index.nunique()
ngrps = n_dat // f
rest = n_dat % f
res = [0 for i in range(ngrps)]
# distribute the rest as equally as possible
# (to the first groups)
for r in range(rest):
res[r % len(res)] += 1
if rest >= ngrps:
print(f'warning: grouping {f} of {freqkeys}',
'is actually between',
f'{min([f + i for i in res])} and',
f'{max([f + i for i in res])}')

# repeat each group-index for its number of elements (f + rest-share)
dyn = chain(*[repeat(ni, f + r) for ni, r in
zip(range(ngrps), res)])
# get the number of observations for each unique
# index in the dataset
dat = groupby_unsorted(
zip(dyn,
groupby_unsorted(self.dataset.index).values()),
get=lambda x: len(x[1]),
key=itemgetter(0))

for key in freqkeys[i]:
param_dyn_dict[key] = list(chain(*[repeat(grp, sum(val))
for grp, val in
dat.items()]))
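
To make the integer-grouping concrete, here is a self-contained sketch of the binning logic above (the helper name integer_groups is illustrative and not part of rt1; it assumes f does not exceed the number of unique indexes):

from itertools import chain, repeat

def integer_groups(n_dat, f):
    # split n_dat unique indexes into bins of ~f members; the remainder
    # is distributed as equally as possible to the first groups
    ngrps, rest = divmod(n_dat, f)
    res = [0] * ngrps
    for r in range(rest):
        res[r % ngrps] += 1
    return list(chain(*(repeat(gid, f + r)
                        for gid, r in zip(range(ngrps), res))))

print(integer_groups(10, 3))  # -> [0, 0, 0, 0, 1, 1, 1, 2, 2, 2]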


manual_dyn_df = self._manual_dyn_df
if manual_dyn_df is not None:
@@ -901,13 +941,30 @@ def _startvaldict(self):
f'you must provide a column {key + "_start"} in ' +
'the dataset if you want to use "auxiliary" start-vals'
)
startval = list(groupby_unsorted(
meanstartvals = list(groupby_unsorted(
zip(self._groupindex, self.dataset[key + '_start']),
key=itemgetter(0), get=itemgetter(1)).values())

meanstartvals = []
for dyn, idx in self._param_assigns[key].items():
meanstartvals += [np.mean(np.take(startval, idx))]
# evaluate the mean start-value for each group
if val[2] == 'index':
# avoid grouping with _param_assigns since only a
# single value is given for each group anyway
meanstartvals = np.mean(meanstartvals, axis=1)
else:
# get a rectangularized list of indices for each
# parameter (instead of using a loop which can be VERY
# slow for a large number of parameters)
st, sm = rectangularize(
self._param_assigns[key].values(),
return_mask=True)
# average over each group in the dataset
meanstartvals = np.ma.mean(
rectangularize(meanstartvals, return_masked=True),
axis=1)
# assign individual values to each parameter-group
meanstartvals = np.ma.mean(
np.ma.masked_array(np.take(meanstartvals, st), sm),
axis=1).compressed()

startvaldict[key] = meanstartvals
else:
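
The vectorized start-value averaging can be reproduced outside of the Fits class roughly as follows (a sketch with made-up data; group_vals and param_assigns stand in for the grouped dataset start-values and self._param_assigns[key]):

import numpy as np
from rt1.general_functions import rectangularize

# start-values of each dataset-group (ragged lengths)
group_vals = [[0.1, 0.2], [0.3], [0.5, 0.7, 0.9]]
# parameter-groups mapped to the dataset-group indices they contain
param_assigns = {1: [0, 1], 2: [2]}

# mean start-value of each dataset-group (padding is masked out)
grp_means = np.ma.mean(rectangularize(group_vals, return_masked=True), axis=1)

# rectangularized index-lists avoid a slow python-loop over the parameters
st, sm = rectangularize(param_assigns.values(), return_mask=True)
meanstartvals = np.ma.mean(np.ma.masked_array(np.take(grp_means, st), sm),
                           axis=1).compressed()
print(meanstartvals)  # -> [0.225 0.7]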
