Merge pull request #92 from TUW-GEO/dev
merge for v1.1.7
raphaelquast committed May 5, 2020
2 parents 2c304d4 + a22cbfa commit 1ead235
Showing 3 changed files with 105 additions and 44 deletions.
2 changes: 1 addition & 1 deletion rt1/__init__.py
@@ -2,5 +2,5 @@
Import module for RT1 module
"""

__version__ = '1.1.6'
__version__ = '1.1.7'
__author__ = 'Raphael Quast'
46 changes: 25 additions & 21 deletions rt1/general_functions.py
@@ -7,8 +7,9 @@
from itertools import tee, islice
from collections import OrderedDict


def rectangularize(array, return_mask=False, dim=None,
return_masked=False):
return_masked=False, dtype=None):
'''
return a rectangularized version of the input-array by repeating the
last value to obtain the smallest possible rectangular shape.
@@ -31,7 +32,9 @@ def rectangularize(array, return_mask=False, dim=None,
if None, the shortest length of all sub-lists will be used
return_masked: bool (default=False)
indicator if a masked-array should be returned
dtype: type (default=None)
the dtype of the returned array. If None, the dtype of the first
element will be used
Returns:
----------
new_array: array-like
@@ -40,36 +43,37 @@ def rectangularize(array, return_mask=False, dim=None,
a mask indicating the added values
'''
# use this method to get the dtype of the first element since it works with
# pandas-Series, lists, arrays, dict-value views, etc.
if dtype is None:
dtype = np.array(next(islice(array, 1))).dtype

if dim is None:
# get longest dimension of sub-arrays
dim = len(max(array, key=len))

if return_mask is True or return_masked is True:
newarray, mask = [], []
for s in array:
adddim = dim - len(s)
m = np.full_like(s, False, dtype=bool)
if adddim > 0:
s = np.append(s, np.full(adddim, s[-1]))
m = np.append(m, np.full(adddim, True))
newarray += [s]
mask += [m]

newarray = np.array(newarray)
mask = np.array(mask, dtype=bool)
newarray = np.empty((len(array), dim), dtype=dtype)
mask = np.full((len(array), dim), False, dtype=bool)

for i, s in enumerate(array):
l = len(s)
newarray[i, :l] = s
newarray[i, l:] = s[-1]
mask[i, l:] = True

if return_masked is True:
return np.ma.masked_array(newarray, mask)
else:
return [newarray, mask]
else:
newarray = []
for s in array:
adddim = dim - len(s)
if adddim > 0:
s = np.append(s, np.full(adddim, s[-1]))
newarray += [s]
return np.array(newarray)
newarray = np.empty((len(array), dim), dtype=dtype)
for i, s in enumerate(array):
l = len(s)
newarray[i, :l] = s
newarray[i, l:] = s[-1]
return newarray
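
A minimal usage sketch of the updated rectangularize (assuming rt1 v1.1.7 is installed so the new dtype keyword exists; the sample data is illustrative):

import numpy as np
from rt1.general_functions import rectangularize

# ragged input: sub-lists of different lengths
ragged = [[1, 2, 3], [4], [5, 6]]

# pad each row with its own last value up to the longest row
arr = rectangularize(ragged, dtype=float)
# arr -> [[1., 2., 3.],
#         [4., 4., 4.],
#         [5., 6., 6.]]

# a masked array hides the padded entries again, e.g. for row-means
marr = rectangularize(ragged, return_masked=True)
print(np.ma.mean(marr, axis=1))  # -> [2.0, 4.0, 5.5]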



def meandatetime(datetimes):
101 changes: 79 additions & 22 deletions rt1/rtfits.py
@@ -130,9 +130,14 @@ class Fits(Scatter):
will be used together with the dataset-index to
assign the temporal variability within the fit
(see http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases)
- if both a pandas offset-alias and a dataset-column
"key_dyn" are provided, the provided variability
will be superimposed onto the variability resulting
- if freq is an integer (N), the dataset will be grouped
such that each group contains N unique dataset-indexes
(if an exact split is not possible, the split is
performed such that the group-sizes are as similar as
possible)
- if both a freq AND a dataset-column "key_dyn" are
provided, the provided variability will be
superimposed onto the variability resulting
from the chosen offset-alias
min, max: float (only needed if fitQ is True)
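
For context, this is how the (unchanged) offset-alias variant groups a datetime-index via pandas; a sketch with made-up dates, 'M' being the monthly alias:

import pandas as pd

idx = pd.date_range('2020-01-01', periods=90, freq='D', name='date')
# group the index by a monthly offset-alias, keeping the original order
grp_idx = idx.to_frame().groupby(pd.Grouper(freq='M'), sort=False)
print([len(val) for _, val in grp_idx])  # -> [31, 29, 30]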
@@ -454,21 +459,56 @@ def param_dyn_dict(self):
param_dyn_dict[key] = list(repeat(1, len(self.dataset.index)))
if freq is not None:
for i, f in enumerate(freq):
try:
grp_idx = self.dataset.index.to_frame().groupby(
pd.Grouper(freq=f),
sort=False)
except ValueError:
raise ValueError(f'The provided frequency ({f}) of ' +
f'{freqkeys[i]} is not a valid ' +
'pandas datetime-offset string. ' +
'Check the assignments in defdict!')
# get unique group indices for each datetime-group
for key in freqkeys[i]:
grp_data = []
for nval, [_, val] in enumerate(grp_idx):
grp_data += repeat(nval, len(val))
param_dyn_dict[key] = grp_data
if isinstance(f, str):
try:
grp_idx = self.dataset.index.to_frame().groupby(
pd.Grouper(freq=f),
sort=False)
except ValueError:
raise ValueError(f'The provided frequency ({f}) of ' +
f'{freqkeys[i]} is not a valid ' +
'pandas datetime-offset string. ' +
'Check the assignments in defdict!')
# get unique group indices for each datetime-group
for key in freqkeys[i]:
grp_data = []
for nval, [_, val] in enumerate(grp_idx):
grp_data += repeat(nval, len(val))
param_dyn_dict[key] = grp_data
elif isinstance(f, int):
# find the number of groups required to split the
# dataset into measurement-bins of length "f"
n_dat = self.dataset.index.nunique()
ngrps = n_dat // f
rest = n_dat % f
res = [0 for i in range(ngrps)]
# distribute the rest as equally as possible
# (to the first groups)
for r in range(rest):
res[r % len(res)] += 1
if rest >= ngrps:
print(f'warning: grouping {f} of {freqkeys}',
'is actually between',
f'{min([f + i for i in res])} and',
f'{max([f + i for i in res])}')

# repeat each group-index for its number of elements (f + rest-share)
dyn = chain(*[repeat(ni, f + r) for ni, r in
zip(range(ngrps), res)])
# get the number of observations for each unique
# index in the dataset
dat = groupby_unsorted(
zip(dyn,
groupby_unsorted(self.dataset.index).values()),
get=lambda x: len(x[1]),
key=itemgetter(0))

for key in freqkeys[i]:
param_dyn_dict[key] = list(chain(*[repeat(grp, sum(val))
for grp, val in
dat.items()]))
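
To make the integer-grouping concrete, here is a self-contained sketch of the binning logic above (the helper name integer_groups is illustrative and not part of rt1; it assumes f does not exceed the number of unique indexes):

from itertools import chain, repeat

def integer_groups(n_dat, f):
    # split n_dat unique indexes into bins of ~f members; the remainder
    # is distributed as equally as possible to the first groups
    ngrps, rest = divmod(n_dat, f)
    res = [0] * ngrps
    for r in range(rest):
        res[r % ngrps] += 1
    return list(chain(*(repeat(gid, f + r)
                        for gid, r in zip(range(ngrps), res))))

print(integer_groups(10, 3))  # -> [0, 0, 0, 0, 1, 1, 1, 2, 2, 2]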


manual_dyn_df = self._manual_dyn_df
if manual_dyn_df is not None:
@@ -901,13 +941,30 @@ def _startvaldict(self):
f'you must provide a column {key + "_start"} in ' +
'the dataset if you want to use "auxiliary" start-vals'
)
startval = list(groupby_unsorted(
meanstartvals = list(groupby_unsorted(
zip(self._groupindex, self.dataset[key + '_start']),
key=itemgetter(0), get=itemgetter(1)).values())

meanstartvals = []
for dyn, idx in self._param_assigns[key].items():
meanstartvals += [np.mean(np.take(startval, idx))]
# evaluate the mean start-value for each group
if val[2] == 'index':
# avoid grouping with _param_assigns since only a
# single value is given for each group anyway
meanstartvals = np.mean(meanstartvals, axis=1)
else:
# get a rectangularized list of indices for each
# parameter (instead of using a loop which can be VERY
# slow for a large number of parameters)
st, sm = rectangularize(
self._param_assigns[key].values(),
return_mask=True)
# average over each group in the dataset
meanstartvals = np.ma.mean(
rectangularize(meanstartvals, return_masked=True),
axis=1)
# assign individual values to each parameter-group
meanstartvals = np.ma.mean(
np.ma.masked_array(np.take(meanstartvals, st), sm),
axis=1).compressed()

startvaldict[key] = meanstartvals
else:
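
The vectorized start-value averaging can be reproduced outside of the Fits class roughly as follows (a sketch with made-up data; group_vals and param_assigns stand in for the grouped dataset start-values and self._param_assigns[key]):

import numpy as np
from rt1.general_functions import rectangularize

# start-values of each dataset-group (ragged lengths)
group_vals = [[0.1, 0.2], [0.3], [0.5, 0.7, 0.9]]
# parameter-groups mapped to the dataset-group indices they contain
param_assigns = {1: [0, 1], 2: [2]}

# mean start-value of each dataset-group (padding is masked out)
grp_means = np.ma.mean(rectangularize(group_vals, return_masked=True), axis=1)

# rectangularized index-lists avoid a slow python-loop over the parameters
st, sm = rectangularize(param_assigns.values(), return_mask=True)
meanstartvals = np.ma.mean(np.ma.masked_array(np.take(grp_means, st), sm),
                           axis=1).compressed()
print(meanstartvals)  # -> [0.225 0.7]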
