diff --git a/.gitignore b/.gitignore index 4a49655..5d7bc53 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,9 @@ var/ .installed.cfg *.egg setup.cfg +doc/_build +doc/_static +doc/_templates # PyInstaller # Usually these files are written by a python script from a template @@ -90,4 +93,7 @@ ENV/ .ropeproject # IDEA project settings -.idea \ No newline at end of file +.idea + +# VSCode settings +.vscode \ No newline at end of file diff --git a/README.md b/README.md index 3b1598e..12501e8 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,51 @@ -# balsa -Collection of modelling utilities. Works with both Python 2.7 and Python 3.5+ +# Balsa -The current documentation can be found [on the Wiki page](https://github.com/pbsag/balsa/wiki). Some components are under construction. +Balsa is a collection of functions and tools for Python to facilitate travel demand +forecasting applications and analyses. It is designed to work with the “scientific +stack” of Python, namely NumPy, Pandas, and Matplotlib, which are optimized for speed +and usability. Most of Balsa consists of standalone functions - for input/output, for +analysis, etc. - as well as a few lightweight class-based data structures for specific +applications. -## Contents: +Balsa is published by the Systems Analytics for Policy group inside WSP Canada. + +## Key features + + + I/O routines to convert from binary matrix formats (INRO, OMX, and more) to + Pandas DataFrames and Series. + + Matrix operations such as balancing, dis/aggregation, and bucket rounding. + + Plotting functions such as Trip Length Frequency Distributions. + + Pretty logging utilities for use in program applications. + + Management of JSON configuration files, including support for comments. + + and more! + + Balsa is compatible with Python 2.7 and 3.5+ + +## Installation + +As a private package, Balsa **is not hosted on PyPI or other services that do not +permit private code**. Currently the best way to install Balsa is using `pip` to +install directly from GitHub: + +`pip install git+https://github.com/wsp-sag/balsa.git` + +Git will prompt you to log in to your account (also works with 2FA) before installing. +This requires you to download and install a +[standalone Git client](https://git-scm.com/downloads) to communicate with GitHub. + +**Windows Users:** It is recommended to install Balsa from inside an activated Conda +environment. Balsa uses several packages (NumPy, Pandas, etc.) that will otherwise +not install correctly from `pip`. For example: + +``` +C:\> conda activate base + +(base) C:\> pip install git+https://github.com/wsp-sag/balsa.git +... +``` + +## Documentation + +HTML documentation is available upon request, until we can find a suitable hosting +service. Just email peter.kucirek@wsp.com to request it. -* __cheval__: High-performance engine for evaluating discrete-choice (logit) models over DataFrames where utilities can be specified as expressions. Works with multinomial or nested models. Also includes the LinkedDataFrame class, a subclass of Pandas DataFrame which can be linked to other data frames. -* __matrices__: Matrix balancing, as well as I/O for binary matrices. -* __pandas_utils__: Utilities (such as `fast_stack`, and `align_cateogires`) for the Pandas library -* __configuration__: Parsing and validation of JSON-based configuration files -* __scribe__: Convenient functions for logging model information during a run.
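The README above mentions the configuration and logging features, and the new `balsa/__init__.py` in the next hunk re-exports `Config`, `get_model_logger`, `init_root`, `log_to_file`, and `ModelLogger` at the package top level. The sketch below shows how a model script might use those exports; only the imported names come from this diff, while the call signatures, file names, and logger names are assumptions for illustration.

```python
# Illustrative sketch only. The imported names match balsa/__init__.py in this diff,
# but the signatures, file names, and config attributes below are assumptions.
from balsa import Config, get_model_logger, init_root, log_to_file

# Parse a JSON configuration file (Balsa's Config is described as comment-tolerant)
config = Config.from_file("model_config.json")  # assumed constructor and file name

init_root("model")                            # assumed: initialize the root "pretty" logger
logger = get_model_logger("model.assignment")  # child logger for one model component

with log_to_file("model_run.log"):            # assumed: context manager echoing output to a file
    logger.info("Starting assignment step")
```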
diff --git a/balsa/__init__.py b/balsa/__init__.py index 199f92c..d3ee540 100644 --- a/balsa/__init__.py +++ b/balsa/__init__.py @@ -1,12 +1,3 @@ -import balsa.configuration -import balsa.cheval -import balsa.matrices -import balsa.pandas_utils -import balsa.utils - -from balsa.cheval import LinkedDataFrame, ChoiceModel, sample_from_weights -from balsa.configuration import Config -from balsa.matrices import * -from balsa.logging import * -from balsa.pandas_utils import fast_stack, fast_unstack -from balsa.models import * +from .routines import * +from .logging import get_model_logger, init_root, log_to_file, ModelLogger +from .configuration import Config, ConfigParseError, ConfigSpecificationError, ConfigTypeError diff --git a/balsa/cheval/__init__.py b/balsa/cheval/__init__.py deleted file mode 100644 index a13950f..0000000 --- a/balsa/cheval/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .api import ChoiceModel, sample_from_weights -from .ldf import LinkedDataFrame, LinkageSpecificationError -from .scope import ScopeOrientationError, InconsistentUsageError, UnsupportedSyntaxError diff --git a/balsa/cheval/api.py b/balsa/cheval/api.py deleted file mode 100644 index 65f5dd0..0000000 --- a/balsa/cheval/api.py +++ /dev/null @@ -1,303 +0,0 @@ -from __future__ import division, absolute_import, print_function, unicode_literals - -import numpy as np -import pandas as pd -from threading import Thread - -from .scope import Scope, ExpressionContainer -from .tree import ChoiceTree -from .core import sample_multinomial_worker, sample_nested_worker, stochastic_multinomial_worker, \ - stochastic_nested_worker, weighted_sample_worker - - -class ChoiceModel(object): - - def __init__(self): - self._expression_container = ExpressionContainer(self) - self._scope_container = Scope(self) - self._tree_container = ChoiceTree(self) - - @property - def scope(self): - return self._scope_container - - @property - def expressions(self): - return self._expression_container - - @property - def tree(self): - return self._tree_container - - def run_discrete(self, randomizer=None, n_draws=1, astype='category', squeeze=True, n_threads=1, - override_utilities=None, logger=None, release_scope=True): - """ - For each record, discretely sample one or more times (with replacement) from the probability distribution. - - Args: - randomizer (RandomState, int, or None): If a RandomState instance is given, it will be used to generate - random draws for the model. If int is provided, it will be passed as the seed to new RandomState. If - None, numpy.random will be used instead of a consistent RandomState. - n_draws (int): The number of times to draw (with replacement) for each record. Must be >= 1. Run time is - proportional to the number of draws. - astype (dtype or 'category' or 'index'): The dtype of the return array; the result will be cast to the - given dtype. The special value 'category' returns a Categorical Series (or a DataFrame for n_draws > 1). - The special value 'index' returns the positional index in the sorted array of node names. - squeeze (bool): Only used when n_draws == 1. If True, then a Series will be returned, otherwise a DataFrame - with one column will be returned. - n_threads (int): The number of threads to uses in the computation. Must be >= 1 - override_utilities (None or DataFrame): If not None, then the model will assume a pre-computed set of - utilities for each record x each alternative; otherwise the built-in utility computation framework will - be used. 
The columns of the given DataFrame MUST match the sorted list of node (alternative) names. - logger (None or Logger): Optionally provide a Logger to report progress during the run. Progress will be - reported at the INFO level. - release_scope (bool): If True and override_utilities not provided, data stored in the scope for - utility computation will be released, freeing up memory. Turning this off is of limited use. - - Returns: - Series or DataFrame, depending on squeeze and n_draws. The dtype of the returned object depends on astype. - - """ - self._check_model_is_ready(compute_utilities=override_utilities is None) - - if randomizer is None: - randomizer = np.random - elif isinstance(randomizer, (int, np.int_)): - randomizer = np.random.RandomState(randomizer) - - assert n_draws >= 1 - - if override_utilities is None: - utilities = self._scope_container._compute_utilities(n_threads, logger=logger) - record_index = self._scope_container._records - if release_scope: - self._scope_container.clear() - else: - record_index = override_utilities.index - utilities = self._prep_override_utilities(override_utilities) - - result_indices = self._eval_probabilities_and_sample(utilities, randomizer, n_draws, n_threads) - - return self._convert_result(result_indices, astype, squeeze, record_index) - - def run_stochastic(self, n_threads=1, override_utilities=None, logger=None, release_scope=True): - """ - For each record, compute the probability distribution of the logit model. A DataFrame will be returned whose - columns match the sorted list of node names (alternatives) in the model. Probabilities over all alternatives for - each record will sum to 1.0. - - Args: - n_threads (int): The number of threads to be used in the computation. Must be >= 1. - override_utilities (None or DataFrame): If not None, then the model will assume a pre-computed set of - utilities for each record x each alternative; otherwise the built-in utility computation framework will - be used. The columns of the given DataFrame MUST match the sorted list of node (alternative) names. - logger (None or Logger): Provide a Logger object to record errors during expression evaluation. - release_scope (bool): If True and override_utilities not provided, data stored in the scope for - utility computation will be released, freeing up memory. Turning this off is of limited use. - - Returns: - DataFrame of probabilities of each record x each alternative. - - """ - self._check_model_is_ready(compute_utilities=override_utilities is None) - - if override_utilities is None: - utilities = self._scope_container._compute_utilities(n_threads, logger=logger) - record_index = self._scope_container._records - if release_scope: - self._scope_container.clear() - else: - record_index = override_utilities.index - utilities = self._prep_override_utilities(override_utilities) - - raw_results = self._eval_probabilities_only(utilities, n_threads) - - return pd.DataFrame(raw_results, record_index, self._scope_container._alternatives) - - def copy(self, expressions=False, scope=False): - """ - Creates a copy of this model's structure, optionally copying the scope and expressions. 
- - Args: - expressions (bool): - scope: - - Returns: - - """ - raise NotImplementedError() - - def _check_model_is_ready(self, compute_utilities=True): - - assert len(self._tree_container.node_index) > 1, "At least two choices are required for a model to be valid" - - if compute_utilities: - assert len(self._expression_container) > 0, "Must define at least one utility expression" - assert len(self._scope_container._empty_symbols) == 0, "Not all scope symbols have been filled with data" - - def _eval_probabilities_and_sample(self, utilities, randomizer, n_draws, n_threads): - - # TODO: Try out saving the results in the random draws array to save on memory. - result_shape = utilities.shape[0], n_draws - random_draws = randomizer.uniform(size=result_shape) - result = np.zeros(result_shape, dtype=np.int64) - - utility_chunks = np.array_split(utilities, n_threads, axis=0) - random_chunks = np.array_split(random_draws, n_threads, axis=0) - result_chunks = np.array_split(result, n_threads, axis=0) - - if self.tree.max_level() == 1: - # Multinomial model - - threads = [ - Thread(target=sample_multinomial_worker, args=[ - utility_chunks[i], random_chunks[i], result_chunks[i] - ]) - for i in range(n_threads) - ] - else: - # Nested model - - # instructions1, instructions2 = self.tree.flatten() - hierarchy, levels, ls_scales = self.tree.flatten() - h_copies = [hierarchy.copy() for _ in range(n_threads)] - l_copies = [levels.copy() for _ in range(n_threads)] - s_copies = [ls_scales.copy() for _ in range(n_threads)] - - threads = [ - Thread(target=sample_nested_worker, args=[ - utility_chunks[i], random_chunks[i], h_copies[i], l_copies[i], s_copies[i], result_chunks[i] - ]) - for i in range(n_threads) - ] - - for t in threads: t.start() - for t in threads: t.join() - - return result - - def _convert_result(self, results, astype, squeeze, record_index): - """ - Takes the discrete outcomes as an ndarray and converts it to a Series or DataFrame of the user-specified type. 
- """ - - n_draws = results.shape[1] - column_index = pd.Index(range(n_draws)) - - if astype == 'index': - if squeeze and n_draws == 1: - return pd.Series(results[:, 0], index=record_index) - return pd.DataFrame(results, index=record_index, columns=column_index) - elif astype == 'category': - lookup_table = pd.Categorical(self._tree_container.node_index) - else: - lookup_table = self._tree_container.node_index.values.astype(astype) - - retval = [] - for col in range(n_draws): - indices = results[:, col] - retval.append(pd.Series(lookup_table.take(indices), index=record_index)) - retval = pd.concat(retval, axis=1) - retval.columns = column_index - - if n_draws == 1 and squeeze: - return retval.iloc[:, 0] - return retval - - def _eval_probabilities_only(self, utilities, n_threads): - result = np.zeros(shape=utilities.shape, dtype=np.float64, order='C') - - utility_chunks = np.array_split(utilities, n_threads, axis=0) - result_chunks = np.array_split(result, n_threads, axis=0) - - if self.tree.max_level() == 1: - # Multinomial model - - threads = [ - Thread(target=stochastic_multinomial_worker, args=[ - utility_chunks[i], result_chunks[i] - ]) - for i in range(n_threads) - ] - else: - # Nested model - - hierarchy, levels, ls_scales = self.tree.flatten() - - threads = [ - Thread(target=stochastic_nested_worker, args=[ - utility_chunks[i], hierarchy, levels, ls_scales, result_chunks[i] - ]) - for i in range(n_threads) - ] - - for t in threads: t.start() - for t in threads: t.join() - - return result - - def _prep_override_utilities(self, override_utilities): - assert override_utilities.columns.equals(self._tree_container.node_index) - return override_utilities.values - - -def sample_from_weights(weights, randomizer=None, astype='category', n_threads=1, n_draws=1, squeeze=True): - """ - - Args: - weights (pd.DataFrame): - randomizer (inr, RandomState, or None): - astype (type or str): - n_threads (int): - - Returns (pd.Series): - - """ - assert np.all(weights.sum(axis=1) > 0) - - if randomizer is None: - randomizer = np.random - elif isinstance(randomizer, (int, np.int_)): - randomizer = np.random.RandomState(randomizer) - - nrows = len(weights) - - raw_weight_table = np.ascontiguousarray(weights.values.astype(np.float64)) - random_draws = randomizer.uniform(size=[nrows, n_draws]) - out = np.zeros(shape=[nrows, n_draws], dtype=np.int64) - - if n_threads <= 1: - weighted_sample_worker(raw_weight_table, random_draws, out) - else: - weight_chunks = np.array_split(raw_weight_table, n_threads, axis=0) - random_chunks = np.array_split(random_draws, n_threads, axis=0) - out_chunks = np.array_split(out, n_threads, axis=0) - - threads = [Thread(target=weighted_sample_worker, args=[ - weight_chunks[i], random_chunks[i], out_chunks[i] - ]) for i in range(n_threads)] - for t in threads: t.start() - for t in threads: t.join() - - if astype == 'index': - if n_draws == 1 and squeeze: - return pd.Series(out[:, 0], index=weights.index) - return pd.DataFrame(out, index=weights.index) - elif astype == 'category': - lookup_table = pd.Categorical(weights.columns) - else: - lookup_table = weights.columns.values.astype(astype) - - if n_draws == 1 and squeeze: - return pd.Series(lookup_table.take(out[:, 0]), index=weights.index) - - column_index = pd.Index(range(n_draws)) - - retval = [] - for col in range(n_draws): - indices = out[:, col] - retval.append(pd.Series(lookup_table.take(indices), index=weights.index)) - retval = pd.concat(retval, axis=1) - retval.columns = column_index - - return retval diff --git 
a/balsa/cheval/api.pyi b/balsa/cheval/api.pyi deleted file mode 100644 index 8889d24..0000000 --- a/balsa/cheval/api.pyi +++ /dev/null @@ -1,61 +0,0 @@ -from typing import Union -from logging import Logger - -import numpy as np -import pandas as pd - -from balsa.cheval.scope import Scope, ExpressionContainer -from balsa.cheval.tree import ChoiceTree - -class ChoiceModel: - - _expression_container: ExpressionContainer - _scope_container: Scope - _tree_container: ChoiceTree - - def __init__(self): - pass - - @property - def scope(self) -> Scope: - pass - - @property - def expressions(self) -> ExpressionContainer: - pass - - @property - def tree(self) -> ChoiceTree: - pass - - def run_discrete(self, randomizer: Union[np.random.RandomState, int]=None, n_draws: int=1, - astype: Union[str, np.dtype, type] ='category', squeeze: bool=True, n_threads :int=1, - override_utilities: pd.DataFrame=None, logger: Logger=None - ) -> Union[pd.DataFrame, pd.Series, np.ndarray]: - pass - - def run_stochastic(self, n_threads: int=1, override_utilities: pd.DataFrame=None, logger: Logger=None - ) -> pd.DataFrame: - pass - - def _check_model_is_ready(self, compute_utilities=True) -> bool: - pass - - def _eval_probabilities_and_sample(self, utilities: np.ndarray, randomizer: np.random.RandomState, n_draws: int, - n_threads: int) -> np.ndarray: - pass - - def _convert_result(self, results: np.ndarray, astype: Union[str, np.dtype, type], squeeze: bool - ) -> Union[pd.DataFrame, pd.Series, np.ndarray]: - pass - - def _eval_probabilities_only(self, utilities: np.ndarray, n_threads: int) -> np.ndarray: - pass - - def _prep_override_utilities(self, override_utilities: pd.DataFrame) -> np.ndarray: - pass - -def sample_from_weights(weights: pd.DataFrame, randomizer: Union[np.random.RandomState, int], - astype: Union[str, np.dtype, type]='category', n_threads: int=1 - ) -> pd.Series: - pass diff --git a/balsa/cheval/core.py b/balsa/cheval/core.py deleted file mode 100644 index 78b5bde..0000000 --- a/balsa/cheval/core.py +++ /dev/null @@ -1,232 +0,0 @@ -from __future__ import division, absolute_import, print_function -# Note: unicode literals is deliberately NOT imported because it causes an error with np.dtype in Py2 - -import numba as nb -import numpy as np - - -MIN_RANDOM_VALUE = np.finfo(np.float64).tiny - - -@nb.jit(nb.int64(nb.float64, nb.float64[:]), nogil=True, nopython=True) -def logarithmic_search(r, cps): - """ - Logarithmic (binary) search algorithm for finding the greatest index whose cumulative probability is <= the random - draw. - - Allows for cells with 0 probability. - - Args: - r (float): The random draw to compare against. - cps (float[]): The cumulative probabilities to search - - Returns (int): The found index. - """ - - # The check below is required to avoid a very specific edge case in which there is more than one 0-probability - # choice at the start of the probability array, e.g. [0, 0, 0, 0.1, 0.3, 0.7, 1.0]. 
The randomizer draws on the - # interval [0, 1), so it's a (very) small possibility, but nonetheless would yield potentially very wrong results - if r == 0: - r = MIN_RANDOM_VALUE - - ncols = len(cps) - - lower_bound, upper_bound = 0, ncols - 1 - while (upper_bound - lower_bound) > 1: - mid_index = np.uint32((upper_bound + lower_bound) // 2) - cp_at_mid = cps[mid_index] - if r <= cp_at_mid: # left branch - upper_bound = mid_index - else: # right branch - lower_bound = mid_index - - cp_at_left = cps[lower_bound] - if r <= cp_at_left: - return lower_bound - else: - return upper_bound - - -@nb.jit(nb.int64(nb.float64, nb.float64[:]), nogil=True, nopython=True) -def binary_sample(r, cps): - """ - Optimized sampler for a Binary Logit Model (e.g. with exactly 2 choices). - - Args: - r: - cps: - - Returns: - - """ - return r <= cps[0] - - -@nb.jit(nb.float64[:](nb.float64[:]), nogil=True, nopython=True) -def multinomial_probabilities(utilities): - n_cols = len(utilities) - p = np.zeros(n_cols, dtype=np.float64) # Return value - - ls = 0.0 # Logsum - for i, u in enumerate(utilities): - expu = np.exp(u) - ls += expu - p[i] = expu - - for i in range(n_cols): - p[i] = p[i] / ls - - return p - - -@nb.jit(nb.void(nb.float64[:]), nogil=True, nopython=True) -def cumsum(array): - accum = 0.0 - length = len(array) - for i in range(length): - accum += array[i] - array[i] = accum - -# region Nested Probabilities - - -@nb.jit(nb.float64[:](nb.float64[:], nb.int64[:], nb.int64[:], nb.float64[:]), nopython=True, nogil=True) -def nested_probabilities(utilities, hierarchy, levels, logsum_scales): - n_cells = len(utilities) - probabilities = utilities.copy() - top_logsum = 0 - logsums = np.zeros(n_cells, dtype=np.float64) - - # Step 1: Exponentiate the utilities and collect logsums - max_level =levels.max() - current_level = max_level - for _ in range(max_level + 1): - # Go through levels in reverse order (e.g. 
starting at the bottom) - for index, level in enumerate(levels): - if level != current_level: continue # This is still faster than using np.where() - parent = hierarchy[index] - - existing_logsum = logsums[index] - parent_ls_scale = logsum_scales[parent] if parent >= 0 else 1.0 - if existing_logsum != 0: - current_ls_scale = logsum_scales[index] - expu = np.exp((probabilities[index] + current_ls_scale * np.log(existing_logsum)) / parent_ls_scale) - else: - expu = np.exp(probabilities[index] / parent_ls_scale) - if parent >= 0: logsums[parent] += expu - else: top_logsum += expu - probabilities[index] = expu - current_level -= 1 - - # Step 2: Use logsums to compute conditional probabilities - for index, parent in enumerate(hierarchy): - ls = top_logsum if parent == -1 else logsums[parent] - probabilities[index] = probabilities[index] / ls - - # Step 3: Compute absolute probabilities for child nodes, collecting parent nodes - for current_level in range(1, max_level + 1): - for index, level in enumerate(levels): - if level != current_level: continue - parent = hierarchy[index] - probabilities[index] *= probabilities[parent] - - # Step 4: Zero-out parent node probabilities - # This does not use a Set because Numba sets are really slow - for parent in hierarchy: - if parent < 0: continue - probabilities[parent] = 0.0 - - return probabilities - - -# endregion - -# region Mid-level functions - - -@nb.jit(nb.void(nb.float64[:], nb.float64[:], nb.int64[:], nb.int64[:], nb.float64[:], nb.int64[:]), nopython=True, - nogil=True) -def sample_nested(utility_row, random_numbers, hierarchy, levels, logsum_scales, out): - probabilities = nested_probabilities(utility_row, hierarchy, levels, logsum_scales) - cumsum(probabilities) # Convert to cumulative sum - - for i, r in enumerate(random_numbers): - result = logarithmic_search(r, probabilities) - out[i] = result - - -@nb.jit(nb.void(nb.float64[:], nb.float64[:], nb.int64[:]), nopython=True, nogil=True) -def sample_multinomial(utility_row, random_numbers, out): - probabilities = multinomial_probabilities(utility_row) - cumsum(probabilities) - - for i, r in enumerate(random_numbers): - result = logarithmic_search(r, probabilities) - out[i] = result - - -@nb.jit(nb.void(nb.float64[:], nb.float64[:], nb.int64[:]), nopython=True, nogil=True) -def weighted_sample(weights, random_numbers, out): - total = weights.sum() - cps = weights / total - cumsum(cps) - - for i, r in enumerate(random_numbers): - result = logarithmic_search(r, cps) - out[i] = result - - -# endregion - -# region High-level functions (to be wrapped in Threads) - - -@nb.jit(nb.void(nb.float64[:, :], nb.float64[:, :], nb.int64[:], nb.int64[:], nb.float64[:], nb.int64[:, :]), - nogil=True, nopython=True) -def sample_nested_worker(utilities, random_numbers, hierarchy, levels, logsum_scales, out): - nrows, n_draws = random_numbers.shape - - for i in range(nrows): - util_row = utilities[i, :] - sample_nested(util_row, random_numbers[i, :], hierarchy, levels, logsum_scales, out[i, :]) - - -@nb.jit(nb.void(nb.float64[:, :], nb.float64[:, :], nb.int64[:, :]), nopython=True, nogil=True) -def sample_multinomial_worker(utilities, random_numbers, out): - nrows, n_draws = random_numbers.shape - - for i in range(nrows): - util_row = utilities[i, :] - sample_multinomial(util_row, random_numbers[i, :], out[i, :]) - - -@nb.jit(nb.void(nb.float64[:, :], nb.int64[:], nb.int64[:], nb.float64[:], nb.float64[:, :]), - nopython=True, nogil=True) -def stochastic_nested_worker(utilities, hierarchy, levels, logsum_scales, 
out): - nrows = utilities.shape[0] - - for i in range(nrows): - util_row = utilities[i, :] - probabilities = nested_probabilities(util_row, hierarchy, levels, logsum_scales) - out[i, :] = probabilities - - -@nb.jit(nb.void(nb.float64[:, :], nb.float64[:, :]), nopython=True, nogil=True) -def stochastic_multinomial_worker(utilities, out): - nrows = utilities.shape[0] - - for i in range(nrows): - util_row = utilities[i, :] - probabilities = multinomial_probabilities(util_row) - out[i, :] = probabilities - - -@nb.jit(nb.void(nb.float64[:, :], nb.float64[:, :], nb.int64[:, :]), nogil=True, nopython=True) -def weighted_sample_worker(weights, random_numbers, out): - nrows = weights.shape[0] - - for i in range(nrows): - weighted_sample(weights[i, :], random_numbers[i, :], out[i, :]) - -# endregion - diff --git a/balsa/cheval/ldf.py b/balsa/cheval/ldf.py deleted file mode 100644 index b6d34a5..0000000 --- a/balsa/cheval/ldf.py +++ /dev/null @@ -1,525 +0,0 @@ -from __future__ import division, absolute_import, print_function, unicode_literals - -import pandas as pd -import numpy as np - -from collections import namedtuple, Hashable -from six import iteritems, itervalues, string_types -from balsa.utils import is_identifier - -LinkageEntry = namedtuple("LinkageEntry", ['other_frame', 'self_indexer', 'other_indexer', 'fill_value', - 'self_names', 'self_index_flag', 'other_names', 'other_index_flag', - 'aggregation_required']) - -SUPPORTED_AGGREGATIONS = { - 'count', 'first', 'last', 'max', 'min', 'mean', 'median', 'prod', 'std', 'sum', 'var' -} - - -class LinkageSpecificationError(ValueError): - pass - - -class LinkageNode(object): - """ - Linkage derived from LinkedDataFrame - """ - - def __init__(self, frame, root_index, history): - self._history = history - self._df = frame - self._root_index = root_index - - def __dir__(self): - return [col for col in self._df.columns if is_identifier(col)] + list(self._df._links.keys()) - - def __getitem__(self, item): - return self.__getattr__(item) - - def __getattr__(self, item): - if item in self._df._links: - return self._get_link(item) - if item not in self._df: - raise AttributeError(item) - - s = self._df[item] - for left_indexer, right_indexer, fill_value in self._history: - s.index = right_indexer - s = s.reindex(left_indexer, fill_value=fill_value) - s.index = self._root_index - return s - - def _get_link(self, item): - - linkage_entry = self._df._links[item] - - history_copy = list(self._history) # Make a copy - history_copy.insert(0, (linkage_entry.self_indexer, linkage_entry.other_indexer, linkage_entry.fill_value)) - - other = linkage_entry.other_frame - if linkage_entry.aggregation_required: - return LinkageAttributeAggregator(other, self._root_index, history_copy) - if isinstance(other, LinkedDataFrame): - return LinkageNode(other, self._root_index, history_copy) - return LinkageLeaf(other, self._root_index, history_copy) - - -class LinkageLeaf(object): - """ - Linkage derived from regular DataFrame. 
- """ - - def __init__(self, frame, root_index, history): - self._history = history - self._df = frame - self._root_index = root_index - - def __dir__(self): - return self._df.columns - - def __getitem__(self, item): - return self.__getattr__(item) - - def __getattr__(self, item): - if item not in self._df.columns: - raise AttributeError(item) - - s = self._df[item] - for left_indexer, right_indexer, fill_value in self._history: - s.index = right_indexer - s = s.reindex(left_indexer, fill_value=fill_value) - s.index = self._root_index - return s - - -class LinkageAttributeAggregator(object): - """ - Many-to-one linkage manager. - """ - - def __init__(self, frame, root_index, history): - self._df = frame - self._history = history - self._root_index = root_index - - def __dir__(self): - return dir(LinkageAttributeAggregator) + list(self._df.columns) - - def _apply_expr(self, expr, grouper_func): - self_indexer, other_indexer, fill_value = self._history[0] - - s = self._df.eval(expr) - if not isinstance(s, pd.Series): - # Sometimes a single value is returned, so ensure that it is cast to Series - s = pd.Series(s, index=self._df.index) - s.index = other_indexer - grouper = 0 if other_indexer.nlevels == 1 else other_indexer.names - grouped = s.groupby(level=grouper) - - s = grouper_func(grouped) - s = s.reindex(self_indexer, fill_value=fill_value) - - for left_indexer, right_indexer, fill_value in self._history[1:]: - s.index = right_indexer - s = s.reindex(left_indexer, fill_value=fill_value) - s.index = self._root_index - return s - - def first(self, expr="1"): - """ - Analagous to pandas.GroupBy.first() - - Args: - expr (basestring): An attribute or expression to evaluate in the context of this DataFrame - - Returns: Series - - """ - return self._apply_expr(expr, lambda grouped: grouped.first()) - - def last(self, expr="1"): - """ - Analagous to pandas.GroupBy.last() - - Args: - expr (basestring): An attribute or expression to evaluate in the context of this DataFrame - - Returns: Series - - """ - return self._apply_expr(expr, lambda grouped: grouped.last()) - # return self._apply('last', expr) - - def max(self, expr="1"): - """ - Analagous to pandas.GroupBy.max() - - Args: - expr (basestring): An attribute or expression to evaluate in the context of this DataFrame - - Returns: Series - """ - return self._apply_expr(expr, lambda grouped: grouped.max()) - - def mean(self, expr="1"): - """ - Analagous to pandas.GroupBy.mean() - - Args: - expr (basestring): An attribute or expression to evaluate in the context of this DataFrame - - Returns: Series - """ - return self._apply_expr(expr, lambda grouped: grouped.mean()) - - def median(self, expr="1"): - """ - Analagous to pandas.GroupBy.median() - - Args: - expr (basestring): An attribute or expression to evaluate in the context of this DataFrame - - Returns: Series - """ - return self._apply_expr(expr, lambda grouped: grouped.median()) - - def min(self, expr="1"): - """ - Analagous to pandas.GroupBy.min() - - Args: - expr (basestring): An attribute or expression to evaluate in the context of this DataFrame - - Returns: Series - """ - return self._apply_expr(expr, lambda grouped: grouped.min()) - - def prod(self, expr="1"): - """ - Analagous to pandas.GroupBy.prod() - - Args: - expr (basestring): An attribute or expression to evaluate in the context of this DataFrame - - Returns: Series - """ - return self._apply_expr(expr, lambda grouped: grouped.prod()) - - def std(self, expr="1"): - """ - Analagous to pandas.GroupBy.std() - - Args: - expr 
(basestring): An attribute or expression to evaluate in the context of this DataFrame - - Returns: Series - """ - return self._apply_expr(expr, lambda grouped: grouped.std()) - - def sum(self, expr="1"): - """ - Analagous to pandas.GroupBy.sum() - - Args: - expr (basestring): An attribute or expression to evaluate in the context of this DataFrame - - Returns: Series - """ - return self._apply_expr(expr, lambda grouped: grouped.sum()) - - def var(self, expr="1"): - """ - Analagous to pandas.GroupBy.var() - - Args: - expr (basestring): An attribute or expression to evaluate in the context of this DataFrame - - Returns: Series - """ - return self._apply_expr(expr, lambda grouped: grouped.var()) - - -def _is_aggregation_required(self_indexer, other_indexer): - """ - Optimized code to test if Linkage relationship is many-to-one or one-to-one. - - Args: - self_indexer: - other_indexer: - - Returns: bool - - """ - # If the right indexer is 100% unique, then no aggregation is required - if other_indexer.is_unique: - return False - - # Otherwise, aggregation is only required if at least one duplicate value - # is in the left indexer. - dupes = other_indexer.get_duplicates() - for dupe in dupes: - # Eager loop through the index. For this to be slow, a large number of duplicates must be missing - # in self_indexer, which is practically never the case. - if dupe in self_indexer: - return True - return False - - -def _get_indexer_from_frame(frame, names, get_from_index): - """ - Gets or constructs an indexer object from a given DataFrame - - Args: - frame (DataFrame): - names (list): A list of names. If None, this function returns the index of the frame. - get_from_index (bool): True if new indexer is to be generated from the index of the frame, - False if it is to be generated from the columns of the DataFrame - - Returns: Either an Index or MultiIndex, depending on the arguments - - Raises: LinkageSpecificationError - - """ - if names is None: return frame.index - - if get_from_index: - if len(names) > frame.index.nlevels: - raise LinkageSpecificationError("Cannot specify more levels than in the index") - arrays = [] - for name in names: - # `get_level_values` works on both Index and MultiIndex objects and accepts both - # integer levels AND level names - try: arrays.append(frame.index.get_level_values(name)) - except KeyError: raise LinkageSpecificationError("'%s' not in index" % name) - else: - arrays = [] - for name in names: - if name not in frame: raise LinkageSpecificationError("'%s' not in columns" % name) - arrays.append(frame[name]) - - if len(arrays) == 1: - return pd.Index(arrays[0], name=names[0]) - return pd.MultiIndex.from_arrays(arrays, names=names) - - -class LinkedDataFrame(pd.DataFrame): - """ - Subclass of DataFrame, which links to other DataFrames. This allows "chained" attribute access. - - To create a link between a LinkedDataFrame and another DataFrame, call the `link(other, alias, *args, **kwargs)` - method. Links can be established either on level(s) in the index, or on column(s) in either Frame. The other Frame - will be set to the given alias. 
- """ - - @property - def _constructor(self): - return LinkedDataFrame - - def __init__(self, *args, **kwargs): - super(LinkedDataFrame, self).__init__(*args, **kwargs) - object.__setattr__(self, '_links', {}) - object.__setattr__(self, '_pythonic_links', set()) - - def __finalize__(self, original_ldf, method=None, **kwargs): - """ Essential functionality to ensure that slices of LinkedDataFrame retain the same behaviour """ - - # Super - pd.DataFrame.__finalize__(self, original_ldf, method=method, **kwargs) - - # Sometimes the original frame is not necessarily a LinkedDataFrame - if not isinstance(original_ldf, LinkedDataFrame): return self - - # Copy the links across - self._links = {} - for alias, linkage_entry in iteritems(original_ldf._links): - try: - self_names = linkage_entry.self_names - get_self_indexer_from_index = linkage_entry.self_index_flag - - new_self_indexer = _get_indexer_from_frame(self, self_names, get_self_indexer_from_index) - - new_linkage_entry = LinkageEntry(linkage_entry.other_frame, new_self_indexer, linkage_entry.other_indexer, - linkage_entry.fill_value, - linkage_entry.self_names, linkage_entry.self_index_flag, - linkage_entry.other_names, linkage_entry.other_index_flag, - linkage_entry.aggregation_required) - - self._links[alias] = new_linkage_entry - except: - # If there's a problem, silently drop the linkage. This is deliberate as a lot of pandas' methods - # make slices and therefore this needs to be very very flexible to accommodate a lot of different use - # cases. - pass - self._pythonic_links = set(original_ldf._pythonic_links) - - return self - - def __dir__(self): - """ Override dir() to show links as valid attributes """ - return super(LinkedDataFrame, self).__dir__() + sorted(self._pythonic_links) - - def __getitem__(self, item): - if isinstance(item, Hashable) and item in self._links: - return self._get_link(item) - return super(LinkedDataFrame, self).__getitem__(item) - - def __getattr__(self, item): - if isinstance(item, Hashable) and item in self._links: - return self._get_link(item) - return super(LinkedDataFrame, self).__getattr__(item) - - def link_to(self, other, alias, on=None, on_self=None, on_other=None, levels=None, self_levels=None, other_levels=None, - fill_value=np.NaN): - """ - Creates a new link from this DataFrame to another, assigning it to the given name. - - The relationship between the left-hand-side (this DataFrame itself) and the right-hand-side (the other - DataFrame) must be pre-specified to create the link. The relationship can be based on the index (or a subset of - it levels in a MultiIndex) OR based on columns in either DataFrame. - - By default, if both the "levels" and "on" args of one side are None, then the join will be made on ALL levels - of the side's index. - - Regardless of whether the join is based on an index or columns, the same number of levels must be given. For - example, if the left-hand indexer uses two levels from the index, then the right-hand indexer must also use - two levels in the index or two columns.s - - When the link is established, it is tested whether the relationship is one-to-one or many-to-one (the latter - indicates that aggregation is required). The result of this test is returned by this method. - - Args: - other (DataFrame or LinkedDataFrame): - alias (basestring): The alias (symbolic name) of the new link. If Pythonic, this will show up as an - attribute; otherwise the link will need to be accessed using []. 
- on (list or basestring or None): If given, the join will be made on the provided **column(s)** in both this - and the other DataFrame. This arg cannot be used with `levels` and will override `on_self` and - `on_other`. - on_self (list or basestring or None): If provided, the left-hand side of the join will be made on the - column(s) in this DataFrame. This arg cannot be used with `self_levels`. - on_other: (list or basestring or None): If provided, th right-hand-side of the join will be made on the - column(s) in the other DataFrame. This arg cannot be used with `other_levels`. - levels (list or basestring or None): If provided, the join will be made on the given **level(s)** - in both this and the other DataFrame's index. It can be specified as an integer or a string, - if both indexes have the same level names. This arg cannot be used with `on` and will override - `self_levels` and `other_levels`. - self_levels (list or basestring or None): If provided, the left-hand-side of the join will be made on the - level(s) in this DataFrame. This arg cannot be used with `on_self`. - other_levels (list or basestring or None): If provided, the right-hand-side of the join will be made on the - level(s) in the other DataFrame. This arg cannot be used with `on_other`. - fill_value (numeric): The value to fill in the results if there are any missing values. Defaults to NaN. - - Returns: - bool: True if aggregation is required for this link. False otherwise. - - Raises: - LinkageSpecificationError - - """ - - if isinstance(on, string_types): on = [on] - if isinstance(on_self, string_types): on_self = [on_self] - if isinstance(on_other, string_types): on_other = [on_other] - if isinstance(levels, string_types): levels = [levels] - if isinstance(self_levels, string_types): self_levels = [self_levels] - if isinstance(other_levels, string_types): other_levels = [other_levels] - - self_indexer_names = None - get_self_indexer_from_index = True - other_indexer_names = None - get_other_indexer_from_index = True - - if on is not None and levels is not None: - raise LinkageSpecificationError() - elif on is not None: - self_indexer_names = on - get_self_indexer_from_index = False - other_indexer_names = on - get_other_indexer_from_index = False - elif levels is not None: - self_indexer_names = levels - get_self_indexer_from_index = True - other_indexer_names = levels - get_other_indexer_from_index = True - else: - if on_self is not None and self_levels is not None: - raise LinkageSpecificationError() - elif on_self is not None: - self_indexer_names = on_self - get_self_indexer_from_index = False - elif self_levels is not None: - self_indexer_names = self_levels - get_self_indexer_from_index = True - - if on_other is not None and other_levels is not None: - raise LinkageSpecificationError() - elif on_other is not None: - other_indexer_names = on_other - get_other_indexer_from_index = False - elif other_levels is not None: - other_indexer_names = other_levels - get_other_indexer_from_index = True - - self_indexer = _get_indexer_from_frame(self, self_indexer_names, get_self_indexer_from_index) - other_indexer = _get_indexer_from_frame(other, other_indexer_names, get_other_indexer_from_index) - - aggregation_flag = _is_aggregation_required(self_indexer, other_indexer) - - link = LinkageEntry(other, self_indexer, other_indexer, fill_value, - self_indexer_names, get_self_indexer_from_index, - other_indexer_names, get_other_indexer_from_index, - aggregation_flag) - - self._links[alias] = link - - if 
is_identifier(alias): - self._pythonic_links.add(alias) - - return aggregation_flag - - def _get_link(self, name): - - linkage_entry = self._links[name] - - history = [(linkage_entry.self_indexer, linkage_entry.other_indexer, linkage_entry.fill_value)] - - other = linkage_entry.other_frame - if linkage_entry.aggregation_required: - return LinkageAttributeAggregator(other, self.index, history) - if isinstance(other, LinkedDataFrame): - return LinkageNode(other, self.index, history) - return LinkageLeaf(other, self.index, history) - - def remove_link(self, alias): - """ - Destroys a linkage. - - Args: - alias (basestring): The name of the linkage to remove. - - Raises: KeyError if no linkage matches the given alias. - - """ - del self._links[alias] - if alias in self._pythonic_links: - self._pythonic_links.remove(alias) - - def refresh_links(self): - """ - Updates all outgoing linkages, as there is no callback when the other indexer changes. In practise, this doesn't - happen very often - the most frequent example is if `set_index(*args, inplace=True)` is called. The join might - break if it is based on the index. - - Raises: - LinkageSpecificationError - """ - for linkage_entry in itervalues(self._links): - linkage_entry.other_indexer = _get_indexer_from_frame(linkage_entry.other_frame, - linkage_entry.other_names, - linkage_entry.other_index_flag) - - def links(self): - """ - Iterate over the names of links in this frame. - """ - for link_name in self._links: - yield link_name diff --git a/balsa/cheval/ldf.pyi b/balsa/cheval/ldf.pyi deleted file mode 100644 index 2dce290..0000000 --- a/balsa/cheval/ldf.pyi +++ /dev/null @@ -1,31 +0,0 @@ -from typing import Any, List, Dict, Set, Iterator - -from pandas import DataFrame, Index -import numpy as np - - -class LinkageEntry(object): - def __init__(self, *args, **kwargs): - self.other_frame: DataFrame - self.self_indexer: Index - self.other_indexer: Index - self.fill_value: Any - self.self_names: List[str] - self.self_index_flag: bool - self.other_names: List[str] - self.other_index_flag: bool - self.aggregation_required: bool - - -class LinkedDataFrame(DataFrame): - - def __init__(self, *args, **kwargs): - self._links: Dict[str, LinkageEntry] - self._pythonic_links: Set[str] - - def link_to(self, other: DataFrame, alias: str, on: str=None, on_self: str=None, on_other: str=None, - levels: str=None, self_levels: str=None, other_levels: str=None, fill_value: Any=np.NaN) -> bool: pass - - def remove_link(self, alis: str): pass - - def links(self) -> Iterator[str]: pass diff --git a/balsa/cheval/scope/__init__.py b/balsa/cheval/scope/__init__.py deleted file mode 100644 index 4620ae8..0000000 --- a/balsa/cheval/scope/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .expressions import ExpressionContainer, InconsistentUsageError -from .scope import Scope, ScopeOrientationError -from .parsing import UnsupportedSyntaxError, SimpleUsage, DictLiteral, AttributedUsage, LinkedFrameUsage diff --git a/balsa/cheval/scope/expressions.py b/balsa/cheval/scope/expressions.py deleted file mode 100644 index 335b819..0000000 --- a/balsa/cheval/scope/expressions.py +++ /dev/null @@ -1,160 +0,0 @@ -from __future__ import division, absolute_import, print_function, unicode_literals - -from balsa.cheval.scope.parsing import ExpressionProcessor, SimpleUsage, DictLiteral, AttributedUsage, LinkedFrameUsage -from six import iteritems, string_types - - -class InconsistentUsageError(RuntimeError): - pass - - -class Expression(object): - - def __init__(self, expr): - 
self._raw_expr = expr - parsed, symbols = ExpressionProcessor.parse(expr) - self._parsed_expr = parsed - self._symbols = symbols - - def symbols(self): - for item in iteritems(self._symbols): yield item - - -class ExpressionContainer(object): - - def __init__(self, model): - self._expressions = [] - self._model_ref = model - self._modified = True - self._cached_types = None - self._cached_literals = None - - def __iter__(self): - for expr in self._expressions: yield expr - - def __len__(self): - return len(self._expressions) - - def append(self, expression_or_iterable): - """ - Appends an expression into the LogitModel scope. Expressions are assumed to be order-dependant, just in case. - - If the given argument is an iterable of strings, then this method will perform a "batch-add", adding them all - to the model in order. - - Args: - expression_or_iterable (str or Iterable): The expression or sequence of expressions to append to the model - """ - if isinstance(expression_or_iterable, string_types): - self._append_single(expression_or_iterable) - else: - # Assume that it's an iterable - self._batch_add_expressions(expression_or_iterable) - - def _append_single(self, expression): - expr_wrapper = Expression(expression) - self._expressions.append(expr_wrapper) - self._modify_event() - - def _batch_add_expressions(self, list_of_expressions): - for expr in list_of_expressions: - try: - expr_wrapper = Expression(expr) - except Exception as e: - raise SyntaxError("Error parsing '%s': %s: %s" % (expr, e.__class__.__name__, e)) - self._expressions.append(expr_wrapper) - self._modify_event() - - def insert(self, expression, index): - """ - Inserts an expression into the LogitModel scope at a given location. Expressions are assumed to be - order-dependant, just in case. - - Args: - expression (str): The expression to insert. - index (int): The 0-based position in which to insert the expression. - """ - expr_wrapper = Expression(expression) - self._expressions.insert(index, expr_wrapper) - self._modify_event() - - def remove_expression(self, index): - """ - Removes an expression at the provided index. - - Args: - index (int): The 0-based index at which to remove an expression - """ - del self._expressions[index] - self._modify_event() - - def clear(self): - """ - Clears the LogitModel of all expressions. Any symbols already filled in the Scope will be cleared as well. 
- """ - self._expressions.clear() - self._modify_event() - - def _modify_event(self): - self._modified = True - self._model_ref.scope.clear() - - def get_symbols(self): - if self._modified or self._cached_types is None or self._cached_literals is None: - symbol_types = {} - dict_literals = {} - for alias, list_of_usages in iteritems(self._all_symbols()): - symbol_type = self._check_symbol(alias, list_of_usages) - - if symbol_type is DictLiteral: - dict_literals[alias] = list_of_usages[0] # Dict literals only have one use case - else: - symbol_types[alias] = symbol_type - self._modified = False - self._cached_types = symbol_types - self._cached_literals = dict_literals - return symbol_types, dict_literals - else: - return self._cached_types, self._cached_literals - - @staticmethod - def _check_symbol(alias, list_of_usages): - """ - Checks a symbol's usages to ensure that they are consistently used; then return the inferred type of that symbol - """ - inferred_type = None - for usage in list_of_usages: - usage_type = usage.__class__ - - if inferred_type is None: - inferred_type = usage_type - else: - usage_is_attributed = usage_type == AttributedUsage - inferred_is_attributed = inferred_type == AttributedUsage - usage_is_linked = usage_type == LinkedFrameUsage - inferred_is_linked = inferred_type == LinkedFrameUsage - - if usage_is_linked and inferred_is_attributed: - inferred_type = LinkedFrameUsage - elif usage_is_attributed and inferred_is_linked: - pass - elif usage_type != inferred_type: - raise InconsistentUsageError( - "Symbol '%s' is used inconsistently" % alias - ) - if inferred_type is None: - raise RuntimeError("Inferred type should never be None") - - return inferred_type - - def _all_symbols(self): - all_symbols = {} - for expression in self._expressions: - for alias, usages in expression.symbols(): - if isinstance(usages, DictLiteral): - usages = [usages] - if alias not in all_symbols: - all_symbols[alias] = list(usages) # Make a copy - else: - all_symbols[alias] += usages - return all_symbols diff --git a/balsa/cheval/scope/expressions.pyi b/balsa/cheval/scope/expressions.pyi deleted file mode 100644 index e3d4fe5..0000000 --- a/balsa/cheval/scope/expressions.pyi +++ /dev/null @@ -1,33 +0,0 @@ -import pandas as pd -import numpy as np -from typing import List, Union, Generator, Iterable, Any - -class Expression: - def __init__(self, expr: str): - pass - -class ExpressionContainer: - - _expressions: list - _model_ref: Any - - def __init__(self, model): - pass - - def __iter__(self) -> Generator[Expression]: - pass - - def append(self, expression_or_iterable: Union[str, Iterable[str]]): - pass - - def clear(self): - pass - - def _append_single(self, expression): - pass - - def _batch_add_expressions(self, list_of_expressions): - pass - - def _modify_event(self): - pass diff --git a/balsa/cheval/scope/parsing.py b/balsa/cheval/scope/parsing.py deleted file mode 100644 index ccc39fd..0000000 --- a/balsa/cheval/scope/parsing.py +++ /dev/null @@ -1,239 +0,0 @@ -from __future__ import division, absolute_import, print_function, unicode_literals - -import ast -from collections import namedtuple, deque -import astor -import numpy as np -import pandas as pd -from numexpr import expressions as nee -import six - -from ..ldf import SUPPORTED_AGGREGATIONS - -NUMEXPR_FUNCTIONS = set(nee.functions.keys()) -MAX_ATTRIBUTE_CHAINS = 50 # To avoid infinite loop in __get_name_from_attribute() -NAN_STR = '__NAN' - - -class SimpleUsage(object): - pass - - -class DictLiteral(object): - def 
__init__(self, substitution, series): - self.substitution = substitution - self.series = series - - -class AttributedUsage(object): - def __init__(self, substitution, attribute): - self.substitution = substitution - self.attribute = attribute - - -class LinkedFrameUsage(object): - def __init__(self, substitution, stack, func, func_expr): - self.substitution = substitution - self.stack = stack - self.func = func - self.func_expr = func_expr - - -class UnsupportedSyntaxError(SyntaxError): - pass - - -class ExpressionProcessor(ast.NodeTransformer): - - # Only nodes used in expressions are included, due to the limited parsing - UNSUPPORTED_NODES = [ - ast.Load, ast.Store, ast.Del, ast.IfExp, ast.Subscript, ast.ListComp, ast.DictComp - ] - if six.PY3: - UNSUPPORTED_NODES.append(ast.Starred) - UNSUPPORTED_NODES = tuple(UNSUPPORTED_NODES) - - @staticmethod - def parse(expression): - tree = ast.parse(expression, mode='eval').body - transformer = ExpressionProcessor() - new_tree = transformer.visit(tree) - new_expression = astor.to_source(new_tree) - - return new_expression, transformer.__symbols - - def __init__(self): - self.__symbols = {} - self.__n_dicts = 0 - - def __append_symbol(self, name, usage): - if name not in self.__symbols: - self.__symbols[name] = [usage] - else: - self.__symbols[name].append(usage) - - def __generate_substitution(self, name): - return name + ('0' if name not in self.__symbols else str(len(self.__symbols[name]))) - - def visit(self, node): - return self.__get_visitor(node)(node) - - def __get_visitor(self, node): - if isinstance(node, ExpressionProcessor.UNSUPPORTED_NODES): - raise UnsupportedSyntaxError(node.__class__.__name__) - name = "visit_" + node.__class__.__name__.lower() - return getattr(self, name) if hasattr(self, name) else self.generic_visit - - def visit_call(self, node): - func_node = node.func - - if isinstance(func_node, ast.Name): - # Top-level function - return self.__visit_toplevel_func(node, func_node) - elif isinstance(func_node, ast.Attribute): - # Method of an object - return self.__visit_method(node, func_node) - else: - return self.generic_visit(node) - - def __visit_toplevel_func(self, node, func_node): - func_name = func_node.id - if func_name not in NUMEXPR_FUNCTIONS: - raise UnsupportedSyntaxError("Function '%s' not supported." % func_name) - - node.args = [self.__get_visitor(arg)(arg) for arg in node.args] - node.starargs = None - if not hasattr(node, 'kwargs'): - node.kwargs = None - - return node - - def __visit_method(self, call_node, func_node): - name, stack = self.__get_name_from_attribute(func_node) - func_name = stack.popleft() - if func_name not in SUPPORTED_AGGREGATIONS: - raise UnsupportedSyntaxError("Linked Data Frame aggregation '%s' is not supported." 
% func_name) - - if not hasattr(call_node, 'starargs'): call_node.starargs = None - if not hasattr(call_node, 'kwargs'): call_node.kwargs = None - - if len(call_node.keywords) > 0: - raise UnsupportedSyntaxError("Keyword args are not supported inside Linked Data Frame aggregations") - if call_node.starargs is not None or call_node.kwargs is not None: - raise UnsupportedSyntaxError("Star-args or star-kwargs are not supported inside Linked Data Frame " - "aggregation") - arg_expression = astor.to_source(call_node.args[0]) - substitution = self.__generate_substitution(name) - - usage = LinkedFrameUsage(substitution, stack, func_name, arg_expression) - self.__append_symbol(name, usage) - - new_node = ast.Name(substitution, ast.Load()) - return new_node - - def visit_name(self, node): - symbol_name = node.id - - if symbol_name.lower() == 'nan' or symbol_name == 'None': - # Allow None or NaN or nan to mean 'null' - node.id = NAN_STR - else: - # Register the symbol but do not change it. - self.__append_symbol(symbol_name, SimpleUsage()) - return node - - def visit_attribute(self, node): - name, stack = self.__get_name_from_attribute(node) - substitution = self.__generate_substitution(name) - if len(stack) == 1: - attribute = stack.pop() - usage = AttributedUsage(substitution, attribute) - self.__append_symbol(name, usage) - else: - usage = LinkedFrameUsage(substitution, stack, None, None) - self.__append_symbol(name, usage) - - return ast.Name(substitution, ast.Load()) - - @staticmethod - def __get_name_from_attribute(node): - current_node = node - stack = deque() - while not isinstance(current_node, ast.Name): - if not isinstance(current_node, ast.Attribute): - raise UnsupportedSyntaxError() - if len(stack) > MAX_ATTRIBUTE_CHAINS: - raise RuntimeError("Recursion error") - stack.append(current_node.attr) - current_node = current_node.value - - return current_node.id, stack - - def visit_dict(self, node): - substitution = '__dict%s' % self.__n_dicts - self.__n_dicts += 1 - new_node = ast.Name(substitution, ast.Load()) - - try: - values = [] - for val in node.values: - if isinstance(val, ast.UnaryOp): - assert isinstance(val.operand, ast.Num) - assert isinstance(val.op, ast.USub) - values.append(np.float32(-val.operand.n)) - elif isinstance(val, ast.Num): - values.append(np.float32(val.n)) - else: - raise ValueError() - - keys = [self.__get_dict_key(key) for key in node.keys] - except (ValueError, AssertionError): - raise UnsupportedSyntaxError("Dict literals are supported for numeric values only") - - s = pd.Series(values, index=keys) - usage = DictLiteral(substitution, s) - self.__symbols[substitution] = usage - - return new_node - - def visit_boolop(self, node): - # Converts 'and' and 'or' into '&' and '|' which NumExpr supports - # BoolOp objects have a list of values but need to be converted into a tree of BinOps - - if isinstance(node.op, ast.And): - new_op = ast.BitAnd() - elif isinstance(node.op, ast.Or): - new_op = ast.BitOr() - else: - raise NotImplementedError(type(node.op)) - - values = node.values - left = self.visit(values[0]) - i = 1 - while i < len(values): - right = self.visit(values[i]) - left = ast.BinOp(left=left, right=right, op=new_op) - i += 1 - return left - - def visit_unaryop(self, node): - # Converts 'not' into '~' which NumExpr supports - if isinstance(node.op, ast.Not): - return ast.UnaryOp(op=ast.Invert(), operand=self.visit(node.operand)) - elif isinstance(node.op, ast.USub): - return node - raise NotImplementedError(type(node.op)) - - def visit_str(self, node): - # 
Converts text-strings to NumExpr-supported byte-strings - if six.PY3: - return ast.Bytes(node.s.encode()) - return node - - @staticmethod - def __get_dict_key(node): - if isinstance(node, ast.Name): - return node.id - if isinstance(node, ast.Str): - return node.s - raise UnsupportedSyntaxError("Dict key of type '%s' unsupported" % node) diff --git a/balsa/cheval/scope/scope.py b/balsa/cheval/scope/scope.py deleted file mode 100644 index 3cf79fc..0000000 --- a/balsa/cheval/scope/scope.py +++ /dev/null @@ -1,478 +0,0 @@ -from __future__ import division, absolute_import, print_function, unicode_literals - -import pandas as pd -import numpy as np -import numexpr as ne -from six import iteritems - -from balsa.cheval.scope.parsing import SimpleUsage, DictLiteral, AttributedUsage, LinkedFrameUsage, NAN_STR -from balsa.cheval.ldf import LinkedDataFrame - - -class ScopeOrientationError(IndexError): - pass - - -class ScalarSymbol(object): - - def __init__(self, value): - self._value = value - - def get_value(self, usage): - return self._value - - -class Array1DSymbol(object): - - def __init__(self, array, orientation): - self._data = array - self._orientation = orientation - - def get_value(self, usage): - converted = convert_series(self._data, allow_raw=True) - - length = len(converted) - new_shape = [1, 1] - new_shape[self._orientation] = length - - view = converted[...] # Make a shallow copy - view.shape = new_shape - return view - - -class Array2DSymbol(object): - - def __init__(self, array, orientation): - self._data = np.transpose(array) if orientation == 1 else array[...] - - def get_value(self, usage): - return self._data - - -class FrameSymbol(object): - - def __init__(self, frame, orientation): - self._frame = frame - self._orientation = orientation - - def get_value(self, usage): - series = self._frame[usage.attribute] - data = convert_series(series) - - length = len(self._frame) - shape_tuple = [1, 1] - shape_tuple[self._orientation] = length - - data = data[...] # Make a shallow copy - data.shape = shape_tuple - return data - - -class LinkedFrameSymbol(object): - def __init__(self, frame, orientation): - self._frame = frame - self._orientation = orientation - - def get_value(self, usage): - if isinstance(usage, AttributedUsage): - series = self.__get_attributed_value(usage) - elif isinstance(usage, LinkedFrameUsage): - series = self.__get_linked_usage(usage) - else: - raise NotImplementedError("This should never happen") - - data = convert_series(series)[...] 
# Make a shallow copy so we can change the shape - n = len(series) - new_shape = [1, 1] - new_shape[self._orientation] = n - data.shape = new_shape - - return data - - def __get_attributed_value(self, usage): - return self._frame[usage.attribute] - - def __get_linked_usage(self, usage): - item = self._frame - for attribute in reversed(usage.stack): - item = getattr(item, attribute) - - if usage.func is not None: - method = getattr(item, usage.func) - func_expr = "1" if usage.func_expr is None else usage.func_expr - item = method(func_expr) - - if not isinstance(item, pd.Series): - pretty_stack = '.'.join(str(c) for c in reversed(usage.stack)) - raise AttributeError("Chained lookup '%s' does not point to a valid Series" % pretty_stack) - - return item - - -class PanelSymbol(object): - - def __init__(self, panel): - self._data = panel - - def get_value(self, usage): - raise NotImplementedError() - - -class Scope(object): - - def __init__(self, model): - self._root = model - self._empty_symbols = None - self._filled_symbols = None - - self._records = None - self._alternatives = None - - def set_record_index(self, index): - """ - Manually set the index of records being processed. - - If some symbols are already filled, calling this method CLEARS their data, resetting them back to 'empty'. - - Args: - index (List or Index): Any sequence that Pandas accepts when constructing an Index. - - """ - self._initialize() - if not isinstance(index, pd.Index): - index = pd.Index(index) - self._records = index - - def empty_symbols(self): - """ - Iterate through the set of empty (not yet filled) symbols. - - Returns: - Iterable[str]: Symbols that are empty (have no set data just yet) - - """ - raise NotImplementedError() - - def fill_symbol(self, symbol_name, data, orientation=None, strict=True, allow_unused=True): - """ - Associate an empty symbol with actual data, usually ararys, DataFrames, and/or Series. Symbol usages are - collected from the list of model expressions already-loaded, so the type of `data` must conform to the rules of - usage. - - Args: - symbol_name (str): The name of the symbol being filled. - data: Numerical data to be associated with the filled symbol. Supported types include scalar numbers, 1-D - arrays, 2-D arrays, Series, DataFrames, and Dictionaries of DataFrames / Panels. Not all data types are - supported by all use cases, refer to usage rules for further details. - orientation (int): Specifies which dimension of the utility table to which the FIRST dimension (axis) - of the data is oriented: 0: oriented to the records, 1: oriented to the alternatives. This is useful in - cases where the records index and alternatives index are the same (the labels are therefore ambiguous), - or the same length (for adding unlabelled data). - strict (bool): In the case that Cheval is unable to recognize or support the type of `data`, turning off - `strict` will permit the data to be filled at this time, and checked by the NumExpr engine later. In - most cases, this should probably be left to True. - allow_unused (bool): Disables raising KeyError if the symbol is not found in the expressions. This can be a - common occurrence if the expressions are read in from user input and the code makes more symbols - available for computation than are used. - - Raises: - KeyError: If the symbol_name is not found used in any Expression and allow_unused=False - TypeError: If the type of `data` was not understood or not supported by the scoping rules. 
- ScopeOrientationError: If the data do not conform to the rules for scoping. - - """ - self._initialize() - - if symbol_name not in self._empty_symbols: - if allow_unused: - return # Do not actually add the symbol, since it is known to be unused - else: - raise KeyError("Symbol '%s' is not used in any expression" % symbol_name) - - symbol_usage = self._empty_symbols[symbol_name] - - if symbol_usage is LinkedFrameUsage: - symbol_meta = self._fill_linked(data, orientation) - elif symbol_usage is AttributedUsage: - symbol_meta = self._fill_attributed(data, orientation) - elif symbol_usage is DictLiteral: - symbol_meta = self._fill_simple(symbol_usage.series, orientation=1) - elif symbol_usage is SimpleUsage: - symbol_meta = self._fill_simple(data, orientation, strict) - else: - raise NotImplementedError("Usage type '%s' not understood" % symbol_usage) - - del self._empty_symbols[symbol_name] - self._filled_symbols[symbol_name] = symbol_meta - - def _compute_utilities(self, n_threads, logger=None): - assert len(self._empty_symbols) == 0 - - model = self._root - - # Allocate an empty utility table - shape = len(self._records), len(model.tree.node_index) - utility_table = np.zeros(shape, dtype=np.float64, order='C') - - ne.set_num_threads(n_threads) - - # Evaluate each expression - for expr in model.expressions: - try: - self._evaluate_single_expression(expr, utility_table) - except Exception as e: - if logger is not None: - logger.error("Error while evaluating '%s'" % expr._raw_expr) - raise e - - return utility_table - - def _evaluate_single_expression(self, expr, utility_table): - # Setup local dictionary of data - local_dict = {NAN_STR: np.nan} - columns = self._root.tree.node_index - for symbol_name, list_of_usages in expr.symbols(): - if isinstance(list_of_usages, DictLiteral): - s = list_of_usages.series - reindexed = s.reindex(columns, fill_value=0) - vector = reindexed.values - vector.shape = 1, utility_table.shape[1] - local_dict[symbol_name] = vector - continue - - for usage in list_of_usages: - # Usage is one of SimpleUsage, DictLiteral, AttributedUsage, or LinkedFrameUsage - - # Symbol meta is an instance of scope.AbstractSymbol - symbol_meta = self._filled_symbols[symbol_name] - data = symbol_meta.get_value(usage) - - if isinstance(usage, SimpleUsage): - # In this case, no substitution was performed, so we can just use the symbol name - local_dict[symbol_name] = data - else: - # Otherwise, we need to set the data to another alias - local_dict[usage.substitution] = data - - # Run the expression. 
- final_expression = '__out + (%s)' % expr._parsed_expr - local_dict['__out'] = utility_table - ne.evaluate(final_expression, local_dict=local_dict, out=utility_table) - - def _fill_simple(self, data, orientation=None, strict=True): - """""" - - ''' - This is a bit ugly and should probably be broken out into separate functions for different types - ''' - - if isinstance(data, np.ndarray): - # Unlabelled 1- or 2-D array - self._check_records() - - if data.ndim == 1: - # 1-D array - n_rows = len(data) - n_rec = len(self._records) - n_alts = len(self._alternatives) - if n_rec == n_alts: - # Orientation matters - if n_rows != n_rec: - raise ScopeOrientationError("1D data array shape incompatible with records or alternatives") - if orientation is None: - raise ScopeOrientationError("Orientation must be provided for arrays when length of records " - "matches length of alternatives") - return Array1DSymbol(data, orientation) - else: - # Infer orientation from length of array - if n_rows == n_rec: - # Oriented to the records - return Array1DSymbol(data, 0) - elif n_rows == n_alts: - return Array1DSymbol(data, 1) - else: - raise ScopeOrientationError("1D arrays must be as long as either the records or alternatives") - elif data.ndim == 2: - # 2-D array - n_rows, n_cols = data.shape - n_rec, n_alts = len(self._records), len(self._alternatives) - - if n_rec == n_alts: - # Orientation matters - if n_rows != n_rec or n_cols != n_alts: - raise ScopeOrientationError("2D data array shape incompatible with records and alternatives") - if orientation is None: - raise ScopeOrientationError("Orientation must be provided for arrays when length of records " - "matches length of alternatives") - return Array2DSymbol(data, orientation) - elif n_rows == n_rec and n_cols == n_alts: - return Array2DSymbol(data, 0) - elif n_rows == n_alts and n_cols == n_rec: - return Array2DSymbol(data, 1) - else: - raise ScopeOrientationError("2D array shapes must align with both the records and alternatives") - else: - raise ScopeOrientationError("Numpy arrays are permitted, but only with 1 or 2 dimensions (found more)") - elif isinstance(data, pd.DataFrame): - # Labelled 2-D array - self._check_records() - # For a simple symbols, this is only permitted if BOTH axes align with the utilities table - if self._records.equals(self._alternatives): - # Orientation is ambiguous - self._check_orientation(orientation) - if not data.index.equals(self._records) or not data.columns.equals(self._records): - raise ScopeOrientationError("Simple DataFrames must align with both the records and alternatives") - return Array2DSymbol(data.values, orientation) - elif data.index.equals(self._records) and data.columns.equals(self._alternatives): - return Array2DSymbol(data.values, 0) - elif data.index.equals(self._alternatives) and data.columns.equals(self._records): - return Array2DSymbol(data.values, 1) - else: - raise ScopeOrientationError("Simple DataFrames must align with both the records and alternatives") - elif isinstance(data, pd.Series): - # Labelled 1-D array - self._check_records() - - if self._records.equals(self._alternatives): - # Orientation is ambiguous - self._check_orientation(orientation) - if not data.index.equals(self._records): - raise ScopeOrientationError("Series must align with either the records or alternatives") - - return Array1DSymbol(data, orientation) - elif data.index.equals(self._records): - return Array1DSymbol(data, 0) - elif data.index.equals(self._alternatives): - return Array1DSymbol(data, 1) - else: - raise 
ScopeOrientationError("Series must align with either the records or the alternatives") - elif strict: - # Rigourous type-checking for scalars if strict is required - if not isinstance(data, (int, float, np.int_, np.float_)): - raise TypeError("Unsupported simple symbol data type: %s" % type(data)) - return ScalarSymbol(data) - else: - # Permissive; let NumExpr figure out if the data is valid - return ScalarSymbol(data) - - @staticmethod - def _check_orientation(orientation): - if orientation is None: - raise ScopeOrientationError("Orientation must be provided when the record index exactly equals the" - " alternatives index.") - if orientation != 0 and orientation != 1: - raise ValueError("Orientation must be either 0 or 1") - - def _check_records(self): - assert self._records is not None, "This operation is not allowed if the records have not been set." - - def _fill_attributed(self, data, orientation=None): - self._check_records() - - if isinstance(data, pd.DataFrame): - if self._records.equals(self._alternatives): - self._check_orientation(orientation) - if not data.index.equals(self._records): - raise ScopeOrientationError("Filling attributed usage with a DataFrame is permitted, but the index " - "must align with either the records or the alternatives") - return FrameSymbol(data, orientation) - elif data.index.equals(self._records): - return FrameSymbol(data, 0) - elif data.index.equals(self._alternatives): - return FrameSymbol(data, 1) - else: - raise ScopeOrientationError("Filling attributed usage with a DataFrame is permitted, but the index " - "must align with either the records or the alternatives") - - elif isinstance(data, (pd.Panel, dict)): - if isinstance(data, dict): - data = pd.Panel(dict) - - if data.major_axis.equals(self._alternatives) and data.minor_axis.equals(self._records): - data = data.transpose('items', 'minor_axis', 'major_axis') - elif not (data.major_axis.equals(self._records) and data.minor_axis.equals(self._alternatives)): - raise ScopeOrientationError("Panel symbols major and minor axes must align with the records and " - "alternatives") - return PanelSymbol(data) - - else: - raise TypeError("Only DataFrames, dictionaries of DataFrames, or Panels can fill attributed symbols") - - def _fill_linked(self, data, orientation): - assert isinstance(data, LinkedDataFrame), "Only LinkedDataFrames can fill linked symbols." 
- self._check_records() - - if self._records.equals(self._alternatives): - self._check_orientation(orientation) - - if not data.index.equals(self._records): - raise ScopeOrientationError("Filling linked usage with a LinkedDataFrame is permitted, but the index " - "must align with either the records or alternatives") - - return LinkedFrameSymbol(data, orientation) - elif data.index.equals(self._records): - return LinkedFrameSymbol(data, 0) - elif data.index.equals(self._alternatives): - return LinkedFrameSymbol(data, 1) - else: - raise ScopeOrientationError("Filling linked usage with a LinkedDataFrame is permitted, but the index must " - "align with either the records or alternatives") - - def clear(self): - self._empty_symbols = None - self._filled_symbols = None - self._records = None - self._alternatives = None - - def _initialize(self): - if self._empty_symbols is None or self._filled_symbols is None: - self._empty_symbols, dict_literals = self._root.expressions.get_symbols() - self._filled_symbols = {} - self._alternatives = self._root.tree.node_index - - self._fill_dict_literals(dict_literals) - - def _fill_dict_literals(self, dict_literals): - for alias, usage in iteritems(dict_literals): - expanded_series = usage.series.reindex(self._alternatives, fill_value=0.0) - symbol = Array1DSymbol(expanded_series.values, orientation=1) - self._filled_symbols[alias] = symbol - - def _symbolize(self): - if self._empty_symbols: - raise AttributeError("Cannot evaluate expressions when there are still empty symbols that need to be " - "filled") - return self._filled_symbols - - -def convert_series(s, allow_raw=False): - dtype = s.dtype - - if dtype.name == 'category': - # Categorical column - categorical = s.values - - category_index = categorical.categories - if category_index.dtype.name == 'object': - max_len = category_index.str.len().max() - typename = 'S%s' % max_len - else: - typename = category_index.dtype - - return categorical.astype(typename) - elif dtype.name == 'object': - # Object or text column - max_length = s.str.len().max() - if np.isnan(max_length): - raise TypeError("Could not get max string length") - - return s.values.astype("S%s" % max_length) - elif np.issubdtype(dtype, np.datetime64): - raise TypeError("Datetime columns are not supported") - elif np.issubdtype(dtype, np.timedelta64): - raise TypeError("Timedelta columns are not supported") - try: - return s.values - except: - if allow_raw: return s - raise diff --git a/balsa/cheval/scope/scope.pyi b/balsa/cheval/scope/scope.pyi deleted file mode 100644 index e181eb6..0000000 --- a/balsa/cheval/scope/scope.pyi +++ /dev/null @@ -1,57 +0,0 @@ -import pandas as pd -import numpy as np -from typing import Iterable, Union, Any -from balsa.cheval.ldf import LinkedDataFrame - -class Scope: - - _root: Any - _empty_symbols: set - _filled_symbols: set - _records: pd.Index - _alternatives: pd.Index - - def __init__(self, model): - pass - - def set_record_index(self, index: Union[pd.Index, Iterable]): - pass - - def fill_symbol(self, symbol_name: str, - data: Union[int, float, np.ndarray, pd.Series, pd.DataFrame, LinkedDataFrame], - orientation: int=None, strict: bool=True, allow_unused: bool=True): - pass - - def _initialize(self): - pass - - def _compute_utilities(self, n_threads, logger=None): - pass - - def _evaluate_single_expression(self, expr, utility_table): - pass - - def _fill_simple(self, data, orientation=None, strict=True): - pass - - @staticmethod - def _check_orientation(orientation): - pass - - def _check_records(self): - 
pass - - def _fill_attributed(self, data, orientation=None): - pass - - def _fill_linked(self, data, orientation): - pass - - def clear(self): - pass - - def _fill_dict_literals(self, dict_literals): - pass - - def _symbolize(self): - pass diff --git a/balsa/cheval/tree.py b/balsa/cheval/tree.py deleted file mode 100644 index 725ab3f..0000000 --- a/balsa/cheval/tree.py +++ /dev/null @@ -1,212 +0,0 @@ -from __future__ import division, absolute_import, print_function, unicode_literals - -from six import iterkeys, itervalues - -import numpy as np -import pandas as pd - - -class _ChoiceNode(object): - - def __init__(self, *args): - self._name, self._root, self._parent, self.logsum_scale, self._level = args - self._children = set() - - def __str__(self): - return self._name - - def __repr__(self): - return "ChoiceNode(%s)" % self._name - - @property - def name(self): - return self._name - - @property - def root(self): - return self._root - - @property - def parent(self): - return self._parent - - @property - def level(self): - return self._level - - @property - def is_parent(self): - return len(self._children) > 0 - - def max_level(self): - max_level = self._level - - for c in self.children(): - max_level = max(max_level, c.max_level()) - - return max_level - - def children(self): - """ - Iterates through child nodes - - Yields: - _ChoiceNode: This node's children, if they exist - - """ - for c in self._children: - yield c - - def add_node(self, name, logsum_scale=1.0): - """ - Adds a nested alternative to the logit model. The name must be unique. - - Args: - name (str): The name of the alternative in the choice set. - logsum_scale (int): The logsum scale parameter. Not used in the probability computation if this node has no - children. A value of 1.0 can be used if the estimated coefficients are already scaled. - - Returns: - The added node, which also has an 'add_node' method. - - """ - return self._root._root_add(self, name, logsum_scale, self._level + 1) - - -class ChoiceTree(object): - - def __init__(self, root): - self._root = root - - self._all_nodes = {} - self._children = set() - self._cached_node_index = None - - def __getitem__(self, item): - return self._all_nodes[item] - - def max_level(self): - """ - Gets the maximum depth of the tree, with 1 being the lowest valid level. - - Returns: - int - - """ - - max_level = 1 - - for c in self.children(): - max_level = max(max_level, c.max_level()) - - return max_level - - def children(self): - """ - Iterates through child nodes - - Yields: - _ChoiceNode: Top-level child nodes - - """ - for c in self._children: - yield c - - @property - def node_index(self): - if self._cached_node_index is None: - idx = pd.Index(sorted(iterkeys(self._all_nodes))) - self._cached_node_index = idx - return idx - return self._cached_node_index - - def _root_add(self, parent, new_name, logsum_scale, level): - assert 0 < logsum_scale <= 1, "Logsum scale must be on the interval (0, 1]; got %s instead" % logsum_scale - if new_name in self._all_nodes: - old_node = self._all_nodes[new_name] - old_node.parent._children.remove(old_node) - new_node = _ChoiceNode(new_name, self, parent, logsum_scale, level) - parent._children.add(new_node) - self._all_nodes[new_name] = new_node - - # Clear the cached node index because the set of alternatives is being changed - self._cached_node_index = None - return new_node - - def add_node(self, name, logsum_scale=1.0): - """ - Adds a top-level alternative to the logit model. The name must be unique. 
- - Args: - name (str): The name of the alternative in the choice set. - logsum_scale (int): The logsum scale parameter. Not used in the probability computation if this node has no - children. A value of 1.0 can be used if the estimated coefficients are already scaled. - - Returns: - _ChoiceNode: The added node, which also has an 'add_node' method. - - """ - return self._root_add(self, name, logsum_scale, 0) - - def add_nodes(self, names, logsum_scales=None): - """ - Convenience function to "batch" add several alternatives at once (useful for Multinomial models with large - choice sets). - - Args: - names (Iterable[str]): Iterable of names to add at once. - logsum_scales (Iterable[float] or None): Iterable of logsum scales to use for the new nodes. `names` and - `logsum_scales` are `zip()`ed together, so use an ordered collection (like a List) for both. For most - use cases, the default value of None is sufficient. - - Returns: - dict[str, _ChoiceNode]: Dictionary whose keys are the names used, and whose values are the nodes that were - created. - - """ - if isinstance(names, str): - raise TypeError("To add a single node, use the singular `add_node` method") - if logsum_scales is None: logsum_scales = [1.0] * len(names) - - nodes = {} - for name, scale in zip(names, logsum_scales): - node = self.add_node(name, logsum_scale=scale) - nodes[name] = node - return nodes - - def remove_node(self, name): - """ - Removes a node (at any level) from the logit model. - - Args: - name (str): The name of the node to remove - - """ - - old_node = self._all_nodes[name] - old_node.parent._children.remove(old_node) - del self._all_nodes[name] - - # Clear the cached node index because the set of alternatives is being changed - self._cached_node_index = None - - def flatten(self): - node_index = self.node_index - node_positions = {name: i for i, name in enumerate(node_index)} - - hierarchy = np.full(len(node_index), -1, dtype='i8') - levels = np.zeros(len(node_index), dtype='i8') - logsum_scales = np.ones(len(node_index), dtype='f8') - - for node in itervalues(self._all_nodes): - position = node_positions[node.name] - levels[position] = node.level - - if node.parent is not self: - parent_position = node_positions[node.parent.name] - hierarchy[position] = parent_position - - if node.is_parent: - logsum_scales[position] = node.logsum_scale - - return hierarchy, levels, logsum_scales diff --git a/balsa/cheval/tree.pyi b/balsa/cheval/tree.pyi deleted file mode 100644 index 8aceb6d..0000000 --- a/balsa/cheval/tree.pyi +++ /dev/null @@ -1,72 +0,0 @@ -from typing import Iterable, List, Tuple, Any, Set, Union, Dict -import pandas as pd -import numpy as np - -class _ChoiceNode: - - _root: 'ChoiceTree' - _parent: Union['ChoiceTree', '_ChoiceNode'] - _name: str - _level: int - _children: Set['_ChoiceNode'] - - def __init__(self, *args): - pass - - @property - def name(self) -> str: - pass - - @property - def parent(self) -> bool: - pass - - @property - def level(self) -> int: - pass - - @property - def is_parent(self) -> bool: - pass - - def max_level(self) -> int: - pass - - def children(self) -> Iterable['_ChoiceNode']: - pass - - def add_node(self, name: str, logsum_scale: float=1.0) -> '_ChoiceNode': - pass - -class ChoiceTree: - - _children: Set[_ChoiceNode] - _all_nodes: Dict[str, _ChoiceNode] - - def __init__(self, *args): - pass - - def max_level(self) -> int: - pass - - def children(self) -> Iterable['_ChoiceNode']: - pass - - @property - def node_index(self) -> pd.Index: - pass - - def add_node(self, name: 
str, logsum_scale: float=1.0) -> _ChoiceNode: - pass - - def add_nodes(self, names: List[str], logsum_scales: List[float]=None) -> List[_ChoiceNode]: - pass - - def remove_node(self, name: str): - pass - - def flatten(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - pass - - def _root_add(self, parent, new_name, logsum_scale, level) -> _ChoiceNode: - pass diff --git a/balsa/configuration/configuration.py b/balsa/configuration.py similarity index 74% rename from balsa/configuration/configuration.py rename to balsa/configuration.py index 5fec435..58a2b31 100644 --- a/balsa/configuration/configuration.py +++ b/balsa/configuration.py @@ -12,7 +12,8 @@ except ImportError: PATHLIB_LOADED = False -from balsa.utils import is_identifier, open_file +from .routines.general import is_identifier +from .routines.io.common import open_file class ConfigParseError(IOError): @@ -30,11 +31,6 @@ class ConfigTypeError(ValueError): class ConfigValue(object): """ Wraps the value of a Config attribute to facilitate type-checking and pretty error messages. - - Attributes: - value: Get or set the underlying value of the wrapper. - namespace: The dot-separated namespace of this attribute within the full Config. - """ def __init__(self, value, name, owner=None): @@ -48,6 +44,7 @@ def __repr__(self): return "ConfigValue(%r)" % self.value @property def namespace(self): + """ Dot-separated name of this config value""" if self._owner is not None: return self._owner.namespace + '.' + self._name return self._name @@ -59,9 +56,11 @@ def as_type(self, type_): Args: type_ (type): The type (e.g. int, float, etc.) to try to cast - Returns: The value cast as type + Returns: + The value cast as type - Raises: ConfigTypeError if the casting could not be performed. + Raises: + ConfigTypeError: if the casting could not be performed. """ @@ -74,17 +73,45 @@ def as_type(self, type_): ) raise ConfigTypeError(message) - def as_bool(self): return self.as_type(bool) + def as_bool(self): + """ + Resolves the value to bool + + Raises: + ConfigTypeError: If the value cannot be resolved to bool + """ + return self.as_type(bool) + + def as_int(self): + """ + Resolves the value to int + + Raises: + ConfigTypeError: If the value cannot be resolved to int + """ + return self.as_type(int) + + def as_float(self): + """ + Resolves the value to float - def as_int(self): return self.as_type(int) + Raises: + ConfigTypeError: If the value cannot be resolved to float + """ + return self.as_type(float) - def as_float(self): return self.as_type(float) + def as_str(self): + """ + Resolves the value to str - def as_str(self): return self.as_type(str) + Raises: + ConfigTypeError: If the value cannot be resolved to str + """ + return self.as_type(str) def as_list(self, sub_type=None): """ - Converts the value to a list. + Resolves the value to a list. Args: sub_type (type): Optional. Specifies the expected contiguous (uniform) type of the list to convert to. @@ -103,6 +130,15 @@ def as_list(self, sub_type=None): if PATHLIB_LOADED: def as_path(self, parent=None): + """ + Resolves the value to Path type (available only when using Python 3) + + Args: + parent: Optional parent folder if this is a relative path + + Raises: + ConfigTypeError: If the value cannot be resolved to Path + """ if parent is not None: return Path(parent) / Path(self.as_str()) return Path(self.as_str()) @@ -136,29 +172,22 @@ class Config(object): (beginning with '//') stripped out. Keys in the JSON file which conform to Python variable names (e.g. 
"my_attribute" but not "My Attribute") become *attributes* of the Config object (e.g. instance.my_attribute). - Value attributes (e.g. `value` in `{"key": value}`) are stored as ConfigValue objects to facilitate type conversion - and checking. So to access the raw value, write "instance.my_attribute.value" or, to convert it to a specified type, - write "instance.my_attribute.as_bool()". + Value attributes (e.g. ``value`` in ``{"key": value}``) are stored as ConfigValue objects to facilitate type + conversion and checking. So to access the raw value, write "instance.my_attribute.value" or, to convert it to a + specified type, write ``instance.my_attribute.as_bool()``. This all facilitates "pretty" error message generation, to provide the end-user with as much information about the source of an error as these are common when specifying a model. - A Config can be constructed from three static methods: - - from_file() to construct from a JSON file on-disk - - from_string() to construct from a JSON-formatted string in-memory - - from_dict() to construct from a dictionary in-memory - - Notes: - - Config implements __contains__ for testing if a name is 'in' the set of attributes. - - To use __getitem__, __setitem__ (like a Dictionary), use the `as_dict()` method to convert to a dictionary - representation. This also exposes dictionary iteration methods. + A `Config` can be constructed from three static methods: - Attributes: - name: Short name of each part of the config. For non-root Configs, this will be the name of the attribute used - to access this Config from the parent. - parent: Pointer to the parent of non-root Configs. - namespace: The dot-separated namespace of this part of the full Config. + - ``from_file()`` to construct from a JSON file on-disk + - ``from_string()`` to construct from a JSON-formatted string in-memory + - ``from_dict()`` to construct from a dictionary in-memory + Note: + - Config implements ``__contains__`` for testing if a name is 'in' the set of attributes. + - To use ``__getitem__``, ``__setitem__`` (like a Dictionary), use the ``as_dict()`` method to convert to a dictionary representation. This also exposes dictionary iteration methods. """ def __init__(self, config_dict, name=None, parent=None, file_=None): @@ -191,13 +220,19 @@ def __init__(self, config_dict, name=None, parent=None, file_=None): self._contents[key] = value @property - def name(self): return self._name + def name(self): + """Short name of each part of the config. For non-root Configs, this will be the name of the attribute used + to access this Config from the parent.""" + return self._name @property - def parent(self): return self._parent + def parent(self): + """Pointer to the parent of non-root Configs.""" + return self._parent @property def namespace(self): + """The dot-separated namespace of this part of the full Config.""" name = self._name if self._name is not None else '' if self._parent is None: return name @@ -224,10 +259,13 @@ def as_dict(self, key_type=None, value_type=None): Converts this entry to a primitive dictionary, using specified types for the keys and values. Args: - key_type (type): The type to which the keys will be cast, or None to ignore casting. - value_type (type): The type to which the values will be cast, or None to ignore casting. + key_type (type, optional): Defaults to ``None``. The type to which the keys will be cast, or None to ignore + casting. + value_type (type, optional): Defaults to ``None``. 
The type to which the values will be cast, or None to + ignore casting. - Returns: dict + Returns: + dict: A dictionary containing the entry's keys and values """ @@ -290,7 +328,7 @@ def from_file(cls, fp): Config: The Config object representing the JSON file. Raises: - ConfigParseError if there's a problem parsing the JSON file + ConfigParseError: if there's a problem parsing the JSON file """ with open_file(fp, mode='r') as reader: @@ -314,10 +352,11 @@ def from_string(cls, s, file_name='', root_name=''): root_name (str): Optional root name for display purposes. Returns: - Config: The Config object representing the JSON file. + Config: + The Config object representing the JSON file. Raises: - ConfigParseError if there's a problem parsing the JSON file + ConfigParseError: if there's a problem parsing the JSON file """ sio = StringIO(s) @@ -334,11 +373,12 @@ def from_dict(dict_, file_name='', root_name=''): Converts a raw dictionary to a Config object. Args: - dict_ (dict): The + dict_ (dict): The dictionary to create a Config from file_name: root_name: Returns: + Config """ return Config(dict_, name=root_name, file_=file_name) diff --git a/balsa/configuration/configuration.pyi b/balsa/configuration.pyi similarity index 50% rename from balsa/configuration/configuration.pyi rename to balsa/configuration.pyi index 629bb8b..d58d85a 100644 --- a/balsa/configuration/configuration.pyi +++ b/balsa/configuration.pyi @@ -1,25 +1,65 @@ from io import FileIO -from typing import Union, Optional, Dict, Any +from typing import Union, Optional, Dict, Any, List from six import string_types try: from pathlib import Path + PATHLIB_LOADED = True file_types = Union[string_types, Path, FileIO] except ImportError: Path = None + PATHLIB_LOADED = False file_types = Union[string_types, FileIO] -class ConfigValue: +class ConfigParseError(IOError): + pass + + +class ConfigSpecificationError(AttributeError): + pass + + +class ConfigTypeError(ValueError): pass +class ConfigValue: + + value: Union[str, List[Union[str, ConfigValue]]] + _name: str + _owner: Config + + def namespace(self) -> str: + pass + + def as_type(self, type_): pass + + def as_bool(self) -> bool: pass + + def as_int(self) -> int: pass + + def as_float(self) -> float: pass + + def as_str(self) -> str: pass + + def as_list(self, sub_type=None) -> list: pass + + if PATHLIB_LOADED: + def as_path(self, parent: Optional[Path]=None) -> Path: + pass + + def as_set(self, sub_type=None) -> set: pass + + def serialize(self) -> Union[list, Any]: pass + + class Config: _contents: dict _name: str - _parent: Optional[Config] - _file = Optional[str] + _parent = Optional['Config'] + _file = file_types def name(self) -> string_types: pass @@ -41,3 +81,6 @@ class Config: @staticmethod def from_dict(dict_: Dict[string_types, Any], file_name: string_types, root_name: string_types) -> 'Config': pass + + @staticmethod + def _parse_comments(reader): pass diff --git a/balsa/configuration/__init__.py b/balsa/configuration/__init__.py deleted file mode 100644 index fbdd3f9..0000000 --- a/balsa/configuration/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .configuration import Config, ConfigParseError, ConfigSpecificationError, ConfigTypeError diff --git a/balsa/logging/logging.py b/balsa/logging.py similarity index 68% rename from balsa/logging/logging.py rename to balsa/logging.py index ed1a95b..e6509e6 100644 --- a/balsa/logging/logging.py +++ b/balsa/logging.py @@ -5,7 +5,7 @@ from contextlib import contextmanager import traceback as tb -_DEFAULT_FMT = "%(asctime)s 
%(levelname)s %(name)s -> %(message)s" +_DEFAULT_FMT = "%(asctime)s %(levelname)s %(name)s ➔ %(message)s" _COLOUR_CODES = {'red': 31, 'green': 32, 'gold': 33, 'yellow': 33, 'blue': 34} # region Custom levels @@ -19,7 +19,7 @@ # region Classes -class SetFilter(object): +class _SetFilter(object): def __init__(self): self._excluded = set() @@ -34,7 +34,7 @@ def include(self, level): self._excluded.remove(level) -class RangeFilter(object): +class _RangeFilter(object): def __init__(self, low, high): self._low = int(low) @@ -44,10 +44,10 @@ def filter(self, record): return self. _low <= record.levelno <= self._high -class SwitchFormatter(logging.Formatter): +class _SwitchFormatter(logging.Formatter): def __init__(self, default_format): - super(SwitchFormatter, self).__init__() + super(_SwitchFormatter, self).__init__() self._default = logging.Formatter(default_format) self._formats = {} @@ -68,14 +68,23 @@ def clear(self): class ModelLogger(logging.Logger): + """ + Extends the standard Python Logger object, adding additional logging statements such as ``.report()``. + """ def report(self, msg, *args, **kwargs): + """Report useful model statistics or results to the user. Distinct from ``.info()`` which provides status + information. Printed in green when colours are available.""" self.log(REPORT_LEVEL, msg, *args, **kwargs) def tip(self, msg, *args, **kwargs): + """Provide a more significant status statement (e.g. new section of the model). Similar to ``.info()``, but + more emphasized. Printed in blue when colours are available.""" self.log(TIP_LEVEL, msg, *args, **kwargs) def pipe(self, msg, levelno=0, levelname=None, asctime=None, name=None): + """Turn a string message into a logging statement. Designed to work with stdout when working with + sub-processes.""" d = {"msg": msg} if levelno is not None: d['levelno'] = levelno if levelname is not None: d['levelname'] = levelname @@ -111,17 +120,17 @@ def include_console_level(level): _CONSOLE_FILTER.include(level) -_CONSOLE_FILTER = SetFilter() -_CONSOLE_FORMATTER = SwitchFormatter(_DEFAULT_FMT) +_CONSOLE_FILTER = _SetFilter() +_CONSOLE_FORMATTER = _SwitchFormatter(_DEFAULT_FMT) -_STDOUT_HANLDER = logging.StreamHandler(sys.stdout) -_STDOUT_HANLDER.addFilter(_CONSOLE_FILTER) -_STDOUT_HANLDER.addFilter(RangeFilter(0, logging.ERROR - 1)) -_STDOUT_HANLDER.setFormatter(_CONSOLE_FORMATTER) +_STDOUT_HANDLER = logging.StreamHandler(sys.stdout) +_STDOUT_HANDLER.addFilter(_CONSOLE_FILTER) +_STDOUT_HANDLER.addFilter(_RangeFilter(0, logging.ERROR - 1)) +_STDOUT_HANDLER.setFormatter(_CONSOLE_FORMATTER) _STDERR_HANDLER = logging.StreamHandler(sys.stderr) _STDERR_HANDLER.addFilter(_CONSOLE_FILTER) -_STDERR_HANDLER.addFilter(RangeFilter(logging.ERROR, 100)) +_STDERR_HANDLER.addFilter(_RangeFilter(logging.ERROR, 100)) _STDERR_HANDLER.setFormatter(_CONSOLE_FORMATTER) set_console_format(_DEFAULT_FMT, level=TIP_LEVEL, colour='blue') @@ -134,13 +143,25 @@ def include_console_level(level): def init_root(root_name: str) -> ModelLogger: + """ + Call this function **at the start of a program** to setup Balsa's logging features, including colours. + + Args: + root_name: Loggers use dotted namespaces to determine inheritance, which also gets passed into the logging + statements. So use a descriptive name for your program, with no spaces in it. Subsequent loggers should + include this name, then a dot, and then the model-specific name. + + Returns: + ModelLogger: The root ModelLogger object. 
+ + """ logging.setLoggerClass(ModelLogger) root = logging.getLogger(root_name) root.propagate = True root.setLevel(1) # Log everything root.handlers.clear() # Remove any existing handlers - root.addHandler(_STDOUT_HANLDER) + root.addHandler(_STDOUT_HANDLER) root.addHandler(_STDERR_HANDLER) return root @@ -156,9 +177,9 @@ def log_to_file(file_name: str, name, *, append=False): root = logging.getLogger(name) write_mode = 'a' if append else 'w' - handler = logging.FileHandler(file_name, mode=write_mode) + handler = logging.FileHandler(file_name, mode=write_mode, encoding='utf-8') handler.setFormatter(logging.Formatter(_DEFAULT_FMT)) - handler.addFilter(RangeFilter(0, 100)) + handler.addFilter(_RangeFilter(0, 100)) root.addHandler(handler) try: diff --git a/balsa/logging/__init__.py b/balsa/logging/__init__.py deleted file mode 100644 index 2066c10..0000000 --- a/balsa/logging/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .logging import (exclude_console_level, get_model_logger, include_console_level, init_root, log_to_file, - set_console_format, ModelLogger, RangeFilter, SetFilter, SwitchFormatter) diff --git a/balsa/matrices/__init__.py b/balsa/matrices/__init__.py deleted file mode 100644 index 63679e2..0000000 --- a/balsa/matrices/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .routines import matrix_balancing_1d, matrix_balancing_2d, matrix_bucket_rounding, split_zone_in_matrix, \ - aggregate_matrix -from .io import read_omx, read_emx, read_fortran_rectangle, read_fortran_square, read_mdf, to_omx, to_emx, to_fortran, \ - to_mdf, expand_array, peek_mdf, coerce_matrix diff --git a/balsa/matrices/io.py b/balsa/matrices/io.py deleted file mode 100644 index b8bcab8..0000000 --- a/balsa/matrices/io.py +++ /dev/null @@ -1,665 +0,0 @@ -from __future__ import division -from __future__ import print_function - -import numpy as np -import pandas as pd -from six import iteritems, itervalues, iterkeys - -from balsa.utils.utils import open_file -from balsa.pandas_utils import fast_unstack - -try: - from pathlib import Path -except ImportError: - Path = None - -try: - import openmatrix as omx -except ImportError: - omx = None - -# region Common functions - - -def coerce_matrix(matrix, allow_raw=True, force_square=True): - """ - Infers a NumPy array from given input - - Args: - matrix (DataFrame or Series or ndarray or Iterable): - - Returns: - 2D ndarray of type float32 - """ - if isinstance(matrix, pd.DataFrame): - if force_square: - assert matrix.index.equals(matrix.columns) - return matrix.values.astype(np.float32) - elif isinstance(matrix, pd.Series): - assert matrix.index.nlevels == 2, "Cannot infer a matrix from a Series with more or fewer than 2 levels" - wide = matrix.unstack() - - union = wide.index | wide.columns - wide = wide.reindex_axis(union, fill_value=0.0, axis=0).reindex_axis(union, fill_value=0.0, axis=1) - return wide.values.astype(np.float32) - - if not allow_raw: - raise NotImplementedError() - - matrix = np.array(matrix, dtype=np.float32) - assert len(matrix.shape) == 2 - i, j = matrix.shape - assert i == j - - return matrix - - -def expand_array(a, n, axis=None): - """ - Expands an array across all dimensions by a set amount - - Args: - a: The array to expand - n: The (non-negative) number of items to expand by. 
- axis (int or None): The axis to expand along, or None to exapnd along all axes - - Returns: The expanded array - """ - - if axis is None: new_shape = [dim + n for dim in a.shape] - else: - new_shape = [] - for i, dim in enumerate(a.shape): - dim += n if i == axis else 0 - new_shape.append(dim) - - out = np.zeros(new_shape, dtype=a.dtype) - - indexer = [slice(0, dim) for dim in a.shape] - out[indexer] = a - - return out - - -# endregion - -# region INRO MDF (MatrixData File) format - - -def read_mdf(file, raw=False, tall=False): - """ - Reads Emme's official matrix "binary serialization" format, created using inro.emme.matrix.MatrixData.save(). There - is no official extension for this type of file; '.mdf' is recommended. '.emxd' is also sometimes encountered. - - Args: - file (str or File or Path): The file to read. - raw (bool): If True, returns an unlabelled ndarray. Otherwise, a DataFrame will be returned. - tall (bool): If True, a 1D data structure will be returned. If `raw` is False, a Series will be returned, - otherwise a 1D ndarray. - Returns: - ndarray or DataFrame of the matrix stored in the file. - """ - with open_file(file, mode='rb') as file_handler: - magic, version, dtype_index, ndim = np.fromfile(file_handler, np.uint32, count=4) - - if magic != 0xC4D4F1B2 or version != 1 or not (0 < dtype_index <= 4) or not (0 < ndim <= 2): - raise IOError("Unexpected file header: magic number: %X, version: %d, data type: %d, dimensions: %d." - % (magic, version, dtype_index, ndim)) - - shape = np.fromfile(file_handler, np.uint32, count=ndim) - - index_list = [] - for n_items in shape: - indices = np.fromfile(file_handler, np.int32, n_items) - index_list.append(indices) - - dtype = {1: np.float32, 2: np.float64, 3: np.int32, 4: np.uint32}[dtype_index] - flat_length = shape.prod() # Multiply the shape tuple - matrix = np.fromfile(file_handler, dtype, count=flat_length) - - if raw and tall: return matrix - - matrix.shape = shape - - if raw: return matrix - - if ndim == 1: - return pd.Series(matrix, index=index_list[0]) - elif ndim == 2: - matrix = pd.DataFrame(matrix, index=index_list[0], columns=index_list[1]) - - return matrix.stack() if tall else matrix - - raise NotImplementedError() # This should never happen - - -def to_mdf(matrix, file): - """ - Writes a matrix to Emme's official "binary serialization" format, to load using inro.emme.matrix.MatrixData.load(). - There is no official extension for this type of file; '.mdf' is recommended. - - Args: - matrix (DataFrame or Series): The matrix to write to disk. If a Series is given, it MUST have a - MultiIndex with exactly 2 levels to unstack. - file (basestring or File or Path): The path or file handler to write to. - """ - if isinstance(matrix, pd.Series): - row_index = matrix.index.get_level_values(0).unique() - column_index = matrix.index.get_level_values(1).unique() - elif isinstance(matrix, pd.DataFrame): - row_index = matrix.index - column_index = matrix.columns - else: - raise TypeError("Only labelled matrix objects are supported") - - with open_file(file, mode='wb') as writer: - data = coerce_matrix(matrix, allow_raw=False) - - np.array([0xC4D4F1B2, 1, 1, 2], dtype=np.uint32).tofile(writer) # Header - np.array(data.shape, dtype=np.uint32).tofile(writer) # Shape - - np.array(row_index, dtype=np.int32).tofile(writer) - np.array(column_index, dtype=np.int32).tofile(writer) - - data.tofile(writer) - - -def peek_mdf(file, as_index=True): - """ - Partially opens an MDF file to get the zone system of its rows and its columns. 
- Args: - file (str or File or Path): The file to read. - as_index (bool): Set to True to return pandas.Index objects rather than List[int] - - Returns: - list: One item for each dimension. If as_index is True, the items will be pandas.Index objects, - otherwise they will be List[int] - - """ - with open_file(file, mode='rb') as file_handler: - magic, version, dtype_index, ndim = np.fromfile(file_handler, np.uint32, count=4) - - if magic != 0xC4D4F1B2 or version != 1 or not (0 < dtype_index <= 4) or not (0 < ndim <= 2): - raise IOError("Unexpected file header: magic number: %X, version: %d, data type: %d, dimensions: %d." - % (magic, version, dtype_index, ndim)) - - shape = np.fromfile(file_handler, np.uint32, count=ndim) - - index_list = [] - for n_items in shape: - indices = np.fromfile(file_handler, np.int32, n_items) - index_list.append(indices) - - if not as_index: return index_list - - return [pd.Index(zones) for zones in index_list] - -# endregion -# region Raw INRO binary matrix (EMX) format - - -def read_emx(file, zones=None, tall=False): - """ - Reads an "internal" Emme matrix (found in /Database/emmemat); with an '.emx' extension. This data - format does not contain information about zones. Its size is determined by the dimensions of the Emmebank - (Emmebank.dimensions['centroids']), regardless of the number of zones actually used in all scenarios. - - Args: - file (str or File or Path): The file to read. - zones (Index or int or None): An Index or Iterable will be interpreted as the zone labels for the matrix rows - and columns; returning a DataFrame or Series (depending on `tall`). If an integer is provided, the returned - ndarray will be truncated to this 'number of zones'. Otherwise, the returned ndarray will be sized to the - maximum number of zones dimensioned by the Emmebank. - tall (bool): If True, a 1D data structure will be returned. If `zones` is provided, a Series will be - returned, otherwise a 1D ndarray. - - Returns: - DataFrame or Series or ndarray. - - Examples: - For a project with 20 zones: - - matrix = read_emx("Database/emmemat/mf1.emx") - print type(matrix), matrix.shape - >> (numpy.ndarray, (20, 20)) - - matrix = read_emx("Database/emmemat/mf1.emx", zones=10) - print type(matrix), matrix.shape - >> (numpy.ndarray, (10, 10)) - - matrix = read_emx("Database/emmemat/mf1.emx", zones=range(10)) - print type(matrix), matrix.shape - >> (10, 10) - - matrix = read_emx("Database/emmemat/mf1.emx", zones=range(10), tall=True) - print type(matrix), matrix.shape - >> 100 - - """ - with open_file(file, mode='rb') as reader: - data = np.fromfile(reader, dtype=np.float32) - - n = int(len(data) ** 0.5) - assert len(data) == n ** 2 - - if zones is None and tall: - return data - - data.shape = n, n - - if isinstance(zones, (int, np.int_)): - data = data[:zones, :zones] - - if tall: - data.shape = zones * zones - return data - return data - elif zones is None: - return data - - zones = pd.Index(zones) - n = len(zones) - data = data[:n, :n] - - matrix = pd.DataFrame(data, index=zones, columns=zones) - - return matrix.stack() if tall else matrix - - -def to_emx(matrix, file, emmebank_zones): - """ - Writes an "internal" Emme matrix (found in /Database/emmemat); with an '.emx' extension. The number of - zones that the Emmebank is dimensioned for must be known in order for the file to be written correctly. - - Args: - matrix (DataFrame or Series or ndarray): The matrix to write to disk. If a Series is given, it MUST have a - MultiIndex with exactly 2 levels to unstack.
- file (basestring or File): The path or file handler to write to. - emmebank_zones (int): The number of zones the target Emmebank is dimensioned for. - """ - assert emmebank_zones > 0 - - with open_file(file, mode='wb') as writer: - data = coerce_matrix(matrix) - n = data.shape[0] - if n > emmebank_zones: - out = data[:emmebank_zones, :emmebank_zones].astype(np.float32) - else: - out = np.zeros([emmebank_zones, emmebank_zones], dtype=np.float32) - out[:n, :n] = data - - out.tofile(writer) - -# endregion -# region FORTRAN Optimized formats - - -def _infer_fortran_zones(n_words): - """Returns the inverse of n_words = matrix_size * (matrix_size + 1)""" - n = int(0.5 + ((1 + 4 * n_words)**0.5)/2) - 1 - assert n_words == (n * (n + 1)), "Could not infer a square matrix from file" - return n - - -def read_fortran_rectangle(file, n_columns, zones=None, tall=False, reindex_rows=False, fill_value=None): - """ - Reads a FORTRAN-friendly .bin file (a.k.a. 'simple binary format') which is known to NOT be square. Also works with - square matrices. - - This file format is an array of 4-bytes, where each row is prefaced by an integer referring to the 1-based positional - index that FORTRAN uses. The rest of the data are in 4-byte floats. To read this, the number of columns present - must be known, since the format does not self-specify. - - Args: - file(str or File or Path): The file to read. - n_columns (int): The number of columns in the matrix. - zones (None or int or pandas.Index): An Index or Iterable will be interpreted as the zone labels for the matrix - rows and columns; returning a DataFrame or Series (depending on `tall`). If an integer is provided, the - returned ndarray will be truncated to this 'number of zones'. - tall (bool): If true, a 'tall' version of the matrix will be returned. - reindex_rows (bool): If true, and zones is an Index, the returned DataFrame will be reindexed to fill-in any - missing rows. - fill_value: The value to pass to pandas.reindex() - - Returns: - ndarray or DataFrame or Series - - Raises: - AssertionError if the shape is not valid. - """ - with open_file(file, mode='rb') as reader: - n_columns = int(n_columns) - - matrix = np.fromfile(reader, dtype=np.float32) - rows = len(matrix) // (n_columns + 1) - assert len(matrix) == (rows * (n_columns + 1)) - - matrix.shape = rows, n_columns + 1 - - # Convert binary representation from float to int, then subtract 1 since FORTRAN uses 1-based positional - # indexing - row_index = np.frombuffer(matrix[:, 0].tobytes(), dtype=np.int32) - 1 - matrix = matrix[:, 1:] - - if zones is None: - if tall: - matrix.shape = matrix.shape[0] * matrix.shape[1] - return matrix - - if isinstance(zones, (int, np.int_)): - matrix = matrix[: zones, :zones] - - if tall: - matrix.shape = zones * zones - return matrix - - nzones = len(zones) - matrix = matrix[: nzones, : nzones] - row_labels = zones.take(row_index[:nzones]) - matrix = pd.DataFrame(matrix, index=row_labels, columns=zones) - - if reindex_rows: - matrix = matrix.reindex_axis(zones, axis=0, fill_value=fill_value) - - if tall: - return matrix.stack() - return matrix - - -def read_fortran_square(file, zones=None, tall=False): - """ - Reads a FORTRAN-friendly .bin file (a.k.a. 'simple binary format') which is known to be square. - - This file format is an array of 4-bytes, where each row is prefaced by an integer referring to the 1-based positional - index that FORTRAN uses. The rest of the data are in 4-byte floats. 
To read this, the number of columns present - must be known, since the format does not self-specify. This method can infer the shape if it is square. - - Args: - file (str or File or Path): The file to read. - zones (Index or int or None): An Index or Iterable will be interpreted as the zone labels for the matrix rows - and columns; returning a DataFrame or Series (depending on `tall`). If an integer is provided, the returned - ndarray will be truncated to this 'number of zones'. Otherwise, the returned ndarray will be sized to the - maximum number of zones dimensioned by the Emmebank. - tall (bool): If True, a 1D data structure will be returned. If `zones` is provided, a Series will be - returned, otherwise a 1D ndarray. - - Returns: - DataFrame or ndarray - - """ - with open_file(file, mode='rb') as reader: - floats = np.fromfile(reader, dtype=np.float32) - n_words = len(floats) - matrix_size = _infer_fortran_zones(n_words) - floats.shape = matrix_size, matrix_size + 1 - - data = floats[:, 1:] - - if zones is None: - if tall: - n = np.prod(data.shape) - data.shape = n - return data - return data - - if isinstance(zones, (int, np.int_)): - data = data[:zones, :zones] - - if tall: - data.shape = zones * zones - return data - return data - elif zones is None: - return data - - zones = pd.Index(zones) - n = len(zones) - data = data[:n, :n] - - matrix = pd.DataFrame(data, index=zones, columns=zones) - - return matrix.stack() if tall else matrix - - -def to_fortran(matrix, file, n_columns=None, min_index=1, force_square=True): - """ - Writes a FORTRAN-friendly .bin file (a.k.a. 'simple binary format'), in a square format. - - Args: - matrix (DataFrame or Series or ndarray): The matrix to write to disk. If a Series is given, it MUST have a - MultiIndex with exactly 2 levels to unstack. - file (basestring or File): The path or file handler to write to. - n_columns (int): Optionally specify a desired "width" of the matrix file. For example, n_columns=4000 on a - matrix 3500x3500 will pad the width with 500 extra columns containing 0. If None is provided or if the - number of columns <= the width of the given matrix, no padding will be performed. - min_index (int): The lowest numbered row. Used when slicing matrices. - force_square (bool): - - """ - assert min_index >= 1 - array = coerce_matrix(matrix, force_square=force_square) - - if n_columns is not None and n_columns > array.shape[1]: - extra_columns = n_columns - array.shape[1] - array = expand_array(array, extra_columns, axis=1) - - with open_file(file, mode='wb') as writer: - rows, columns = array.shape - temp = np.zeros([rows, columns + 1], dtype=np.float32) - temp[:, 1:] = array - - index = np.arange(min_index, rows + 1, dtype=np.int32) - # Mask the integer binary representation as floating point - index_as_float = np.frombuffer(index.tobytes(), dtype=np.float32) - temp[:, 0] = index_as_float - - temp.tofile(writer) - -# endregion - -# region OMX Files - -if omx is not None: - - def read_omx(file, matrices=None, mapping=None, raw=False, tall=False, squeeze=True): - """ - Reads Open Matrix (OMX) files. An OMX file can contain multiple matrices, so this function - typically returns a Dict. - - Args: - file: OMX file from which to read. Cannot be an open file handler. - matrices: List of matrices to read from the file. If None, all matrices will be read. - mapping: The zone number mapping to use, if known in advance. If None, and the OMX file only contains - one mapping, then that one is used. No mapping is read if raw is True.
- raw: If True, matrices will be returned as raw Numpy arrays. Otherwise, Pandas objects are returned - tall: If True, matrices will be returned in 1D format (pd.Series if raw is False). Otherwise, a 2D object - is returned. - squeeze: If True, and the file contains exactly one matrix, return that matrix instead of a Dict. - - Returns: - The matrix, or matrices contained in the OMX file. - - """ - file = str(file) - with omx.open_file(file, mode='r') as omx_file: - if mapping is None and not raw: - all_mappings = omx_file.list_mappings() - assert len(all_mappings) == 1 - mapping = all_mappings[0] - - if matrices is None: - matrices = sorted(omx_file.list_matrices()) - else: - matrices = sorted(matrices) - - if not raw: - labels = pd.Index(omx_file.mapping(mapping).keys()) - if tall: - labels = pd.MultiIndex.from_product([labels, labels], names=['o', 'd']) - - return_value = {} - for matrix_name in matrices: - wrapper = omx_file[matrix_name] - matrix = wrapper.read() - - if tall: - n = matrix.shape[0] * matrix.shape[1] - matrix.shape = n - - if not raw: - if tall: matrix = pd.Series(matrix, index=labels) - else: matrix = pd.DataFrame(matrix, index=labels, columns=labels) - - matrix.name = matrix_name - - return_value[matrix_name] = matrix - - if len(matrices) == 1 and squeeze: - return return_value[matrices[0]] - return return_value - - - def to_omx(file, matrices, zone_index=None, title='', descriptions=None, attrs=None, mapping='zone_numbers'): - """ - Creates a new (or overwrites an old) OMX file with a collection of matrices. - - Args: - file: OMX to write. - matrices: Collection of matrices to write. MUST be a dict, to permit the encoding of matrix metadata, - and must contain the same types: all Numpy arrays, all Series, or all DataFrames. Checking is done to - ensure that all items have the same shape and labels. - zone_index: Override zone labels to use. Generally only useful if writing a dict of raw Numpy arrays. - title: The title saved in the OMX file. - descriptions: A dict of descriptions (one for each given matrix), or None to not use. 
- attrs: A dict of dicts (one for each given matrix), or None to not use - mapping: Name of the mapping internal to the OMX file - """ - - matrices, zone_index = _prep_matrix_dict(matrices, zone_index) - - if descriptions is None: - descriptions = {name: '' for name in iterkeys(matrices)} - if attrs is None: - attrs = {name: None for name in iterkeys(matrices)} - - file = str(file) # Converts from Path - with omx.open_file(file, mode='w', title=title) as omx_file: - omx_file.create_mapping(mapping, zone_index.tolist()) - - for name, array in iteritems(matrices): - description = descriptions[name] - attr = attrs[name] - - omx_file.create_matrix(name, obj=np.ascontiguousarray(array), title=description, attrs=attr) - - return - - def _prep_matrix_dict(matrices, desired_zone_index): - collection_type = _check_types(matrices) - - if collection_type == 'RAW': - checked, n = _check_raw_matrices(matrices) - zone_index = pd.Index(range(n)) - elif collection_type == 'SERIES': - checked, zone_index = _check_matrix_series(matrices) - elif collection_type == 'FRAME': - checked, zone_index = _check_matrix_frames(matrices) - else: - raise NotImplementedError(collection_type) - - if desired_zone_index is not None: - assert desired_zone_index.equals(zone_index) - - return checked, zone_index - - def _check_types(matrices): - gen = iter(itervalues(matrices)) - first = next(gen) - - item_type = 'RAW' - if isinstance(first, pd.Series): item_type = 'SERIES' - elif isinstance(first, pd.DataFrame): item_type = 'FRAME' - - msg = "All items must be the same type" - - for item in gen: - if item_type == 'FRAME': - assert isinstance(item, pd.DataFrame), msg - elif item_type == 'SERIES': - assert isinstance(item, pd.Series), msg - else: - assert isinstance(item, np.ndarray), msg - - return item_type - - def _check_raw_matrices(matrices): - gen = iter(iteritems(matrices)) - name, matrix = next(gen) - - n_dim = len(matrix.shape) - if n_dim == 1: - shape = matrix.shape[0] - n = int(shape ** 0.5) - assert n * n == shape, "Only tall matrices that decompose to square shapes are permitted." - matrix = matrix[...] - matrix.shape = n, n - elif n_dim == 2: - n, cols = matrix.shape - assert n == cols, "Only square matrices are permitted" - else: - raise TypeError("Only 1D and 2D arrays can be saved to OMX files") - - retval = {name: matrix} - for name, matrix in gen: - assert len(matrix.shape) == n_dim - - if n_dim == 1: - assert matrix.shape[0] == (n * n) - matrix = matrix[...] 
- matrix.shape = n, n - else: - assert matrix.shape == (n, n) - - retval[name] = matrix - - return retval, n - - def _check_matrix_series(matrices): - gen = iter(iteritems(matrices)) - name, matrix = next(gen) - - tall_index = matrix.index - assert tall_index.nlevels == 2 - matrix = matrix.unstack() - zone_index = matrix.index - assert zone_index.equals(matrix.columns) - - retval = {name: matrix.values} - for name, matrix in gen: - assert tall_index.equals(matrix.index) - matrix = fast_unstack(matrix, zone_index, zone_index).values - retval[name] = matrix - return retval, zone_index - - def _check_matrix_frames(matrices): - gen = iter(iteritems(matrices)) - name, matrix = next(gen) - - zone_index = matrix.index - assert zone_index.equals(matrix.columns) - - retval = {name: matrix.values} - for name, matrix in gen: - assert zone_index.equals(matrix.index) - assert zone_index.equals(matrix.columns) - retval[name] = matrix.values - return retval, zone_index - -else: - def read_omx(*args, **kwargs): - raise NotImplementedError() - - def to_omx(*args, **kwargs): - raise NotImplementedError() - -# endregion diff --git a/balsa/matrices/io.pyi b/balsa/matrices/io.pyi deleted file mode 100644 index fc8a6b8..0000000 --- a/balsa/matrices/io.pyi +++ /dev/null @@ -1,59 +0,0 @@ -from typing import Union, List, Any, Iterable, Dict, Tuple -from io import FileIO - -import numpy as np -import pandas as pd - -try: - from pathlib import Path - file_type = Union[str, FileIO, Path] -except ImportError: - Path = None - file_type = Union[str, FileIO] - -def coerce_matrix(matrix, allow_raw=True, force_square=True): pass - -def expand_array(a: np.ndarray, n: int, axis: int=None) -> np.ndarray: pass - -def read_mdf(file: file_type, raw: bool=False, tall: bool=False) -> Union[pd.DataFrame, np.ndarray]: pass - -def to_mdf(matrix: Union[pd.Series, pd.DataFrame], file: file_type): pass - -def peek_mdf(file: file_type, as_index: bool=True) -> Union[List[pd.Index], List[List[int]]]: pass - -def read_emx(file: file_type, zones: pd.Index=None, tall: bool=False) -> Union[pd.Series, pd.DataFrame, np.ndarray]: - pass - -def to_emx(matrix: Union[pd.Series, pd.DataFrame, np.ndarray], file: file_type, emmebank_zones: int): pass - -def _infer_fortran_zones(n_words): pass - -def read_fortran_rectangle(file: file_type, n_columns: int, zones: pd.Index=None, tall: bool=False, - reindex_rows: bool=False, fill_value: Any=None - ) -> Union[pd.Series, pd.DataFrame, np.ndarray]: pass - -def read_fortran_square(file: file_type, zones: pd.Index=None, tall: bool=False - ) -> Union[pd.Series, pd.DataFrame, np.ndarray]: pass - - -def to_fortran(matrix: Union[pd.Series, pd.DataFrame, np.ndarray], file: file_type, n_columns: int=None, - min_index: int=1, forec_square: bool=True): pass - -MATRIX_TYPES = Union[pd.DataFrame, pd.Series, np.ndarray] - -def read_omx(file: Union[Path, str], matrices: Iterable[str]=None, mapping: str=None, raw=False, tall=False, - squeeze=True) -> Union[MATRIX_TYPES, Dict[str, MATRIX_TYPES]]: pass - -def _check_types(matrices: Dict[str, MATRIX_TYPES]) -> str: pass - -def _check_raw_matrices(matrices: Dict[str, np.ndarray]) -> Tuple[Dict[str, np.ndarray], int]: pass - -def _check_matrix_series(matrices: Dict[str, pd.Series]) -> Tuple[Dict[str, np.ndarray], pd.Index]: pass - -def _check_matrix_frames(matrices: Dict[str, pd.DataFrame]) -> Tuple[Dict[str, np.ndarray], pd.Index]: pass - -def _prep_matrix_dict(matrices: Dict[str, MATRIX_TYPES], desired_zone_index: pd.Index - ) -> Tuple[Dict[str, np.ndarray], pd.Index]: pass - 
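For reference, a minimal round-trip sketch of the `read_omx`/`to_omx` pair documented above. It assumes the `openmatrix` (`omx`) dependency is installed (otherwise these functions raise `NotImplementedError`); the file name `demand.omx`, zone numbers, and matrix names are purely illustrative.

```python
import numpy as np
import pandas as pd

from balsa.routines.io import read_omx, to_omx

# Three hypothetical zones; the labels become the OMX mapping ('zone_numbers')
zones = pd.Index([101, 102, 103])
auto = pd.DataFrame(np.random.rand(3, 3) * 100, index=zones, columns=zones)
transit = pd.DataFrame(np.random.rand(3, 3) * 25, index=zones, columns=zones)

# Write both matrices to one OMX file; dict keys become the matrix names
to_omx('demand.omx', {'auto': auto, 'transit': transit},
       descriptions={'auto': 'AM auto demand', 'transit': 'AM transit demand'})

# Read everything back as labelled DataFrames, keyed by matrix name
tables = read_omx('demand.omx', squeeze=False)
print(tables['auto'].shape)  # (3, 3)

# Or pull a single matrix in 'tall' format: a Series with a 2-level (o, d) index
auto_tall = read_omx('demand.omx', matrices=['auto'], tall=True)
print(auto_tall.index.nlevels)  # 2
```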
-def to_omx(file: str, matrices: Dict[str, MATRIX_TYPES], zone_index: pd.Index=None, title: str='', - descriptions: Dict[str, str]=None, attrs: Dict[str, dict]=None, mapping: str='zone_numbers'): pass diff --git a/balsa/matrices/routines.py b/balsa/matrices/routines.py deleted file mode 100644 index 15b5c0e..0000000 --- a/balsa/matrices/routines.py +++ /dev/null @@ -1,369 +0,0 @@ -""" -Core Matrix Manipulation Tools -============================== - -========================================================================== -Matrix Balancing -========================================================================== -matrix_balancing_1d Singly-constrained matrix balancing -matrix_balancing_2d Doubly-constrained balancing using - iterative-proportional fitting -========================================================================== - -""" - -from __future__ import division as _division - -import multiprocessing as _mp -import numba as _nb -import numpy as _np -import pandas as _pd - -EPS = 1.0e-7 - - -def matrix_balancing_1d(m, a, axis): - """ Balances a matrix using a single constraint. - - Args: - m (numpy ndarray (M, M)): Matrix to be balanced - a (numpy ndarray (M)): Totals - axis (int): Direction to constrain (0 = along columns, 1 = along rows) - - Return: - w : Numpy ndarray(..., M, M) - """ - - assert axis in [0, 1], "axis must be either 0 or 1" - assert m.ndim == 2, "m must be a two-dimensional matrix" - assert a.ndim == 1, "a must be a two-dimensional matrix" - assert m.shape[axis] == a.shape[0], "axis %d of matrice 'm' and 'a' must be the same." % axis - - return _balance(m, a, axis) - - -def matrix_balancing_2d(m, a, b, totals_to_use='raise', max_iterations=1000, - rel_error=0.0001, n_procs=1): - """ Balances a two-dimensional matrix using iterative proportional fitting. - - Args: - m (numpy ndarray (M, M): Matrix to be balanced - a (numpy ndarray (M)): Row totals - b (numpy ndarray (M)): Column totals - totals_to_use (str, optional): - Describes how to scale the row and column totals if their sums do not match - Must be one of ['rows', 'columns', 'average', 'raise']. Defaults to 'raise' - - rows: scales the columns totals so that their sums matches the row totals - - columns: scales the row totals so that their sums matches the colum totals - - average: scales both row and column totals to the average value of their sums - - raise: raises an Exception if the sums of the row and column totals do not match - max_iterations (int, optional): Maximum number of iterations, defaults to 1000 - rel_error (float, optional): Relative error stopping criteria, defaults to 1e-4 - n_procs (int, optional): Number of processors for parallel computation. Defaults to 1. 
(Not used) - - Return: - Numpy ndarray(M, M): balanced matrix - float: residual - int: n_iterations - """ - max_iterations = int(max_iterations) - n_procs = int(n_procs) - - # Test if matrix is Pandas DataFrame - data_type = '' - if isinstance(m, _pd.DataFrame): - data_type = 'pd' - m_pd = m - m = m_pd.values - - if isinstance(a, _pd.Series) or isinstance(a, _pd.DataFrame): - a = a.values - if isinstance(b, _pd.Series) or isinstance(b, _pd.DataFrame): - b = b.values - - # ################################################################################## - # Validations: - # - m is an MxM square matrix, a and b are vectors of size M - # - totals_to_use is one of ['rows', 'columns', 'average'] - # - the max_iterations is a +'ve integer - # - rel_error is a +'ve float between 0 and 1 - # - the n_procs is a +'ve integer between 1 and the number of available processors - # ################################################################################## - valid_totals_to_use = ['rows', 'columns', 'average', 'raise'] - assert m.ndim == 2 and m.shape[0] == m.shape[1], "m must be a two-dimensional square matrix" - assert a.ndim == 1 and a.shape[0] == m.shape[0], \ - "'a' must be a one-dimensional array, whose size matches that of 'm'" - assert b.ndim == 1 and b.shape[0] == m.shape[0], \ - "'a' must be a one-dimensional array, whose size matches that of 'm'" - assert totals_to_use in valid_totals_to_use, "totals_to_use must be one of %s" % valid_totals_to_use - assert max_iterations >= 1, "max_iterations must be integer >= 1" - assert 0 < rel_error < 1.0, "rel_error must be float between 0.0 and 1.0" - assert 1 <= n_procs <= _mp.cpu_count(), \ - "n_procs must be integer between 1 and the number of processors (%d) " % _mp.cpu_count() - if n_procs > 1: - raise NotImplementedError("Multiprocessing capability is not implemented yet.") - - # Scale row and column totals, if required - a_sum = a.sum() - b_sum = b.sum() - if not _np.isclose(a_sum, b_sum): - if totals_to_use == 'rows': - b = _np.multiply(b, a_sum / b_sum) - elif totals_to_use == 'columns': - a = _np.multiply(a, b_sum / a_sum) - elif totals_to_use == 'average': - avg_sum = 0.5 * (a_sum + b_sum) - a = _np.multiply(a, avg_sum / a_sum) - b = _np.multiply(b, avg_sum / b_sum) - else: - raise RuntimeError("a and b vector totals do not match.") - - initial_error = _calc_error(m, a, b) - err = 1.0 - i = 0 - while err > rel_error: - if i > max_iterations: - # todo: convert to logger, if possible - print("Matrix balancing did not converge") - break - m = _balance(m, a, 1) - m = _balance(m, b, 0) - err = _calc_error(m, a, b) / initial_error - i += 1 - - if data_type == 'pd': - new_df = _pd.DataFrame(m, index=m_pd.index, columns=m_pd.columns) - return new_df, err, i - else: - return m, err, i - - -def _balance(matrix, tot, axis): - """ Balances a matrix using a single constraint. - - Args: - matrix (numpy ndarray: Matrix to be balanced - tot (numpy ndarray): Totals - axis (int): Direction to constrain (0 = along columns, 1 = along rows) - - Return: - w : Numpy ndarray(..., M, M) - """ - sc = tot / (matrix.sum(axis) + EPS) - sc = _np.nan_to_num(sc) # replace divide by 0 errors from the prev. 
line - if axis: # along rows - matrix = _np.multiply(matrix.T, sc).T - else: # along columns - matrix = _np.multiply(matrix, sc) - return matrix - - -def _calc_error(m, a, b): - row_sum = _np.absolute(a - m.sum(1)).sum() - col_sum = _np.absolute(b - m.sum(0)).sum() - return row_sum + col_sum - - -@_nb.jit(_nb.float64[:, :](_nb.float64[:, :], _nb.int64)) -def _nbf_bucket_round(a_, decimals=0): - a = a_.ravel() - b = _np.copy(a) - - residual = 0 - for i in range(0, len(b)): - b[i] = _np.round(a[i] + residual, decimals) - residual += a[i] - b[i] - - return b.reshape(a_.shape) - - -def matrix_bucket_rounding(m, decimals=0): - """ Bucket rounds to the given number of decimals. - - Args: - m (np.ndarray or pd.DataFrame): Matrix to be balanced - decimals (int, optional): - Number of decimal places to round to (default: 0). If decimals is negative, - it specifies the number of positions to the left of the decimal point. - - Return: - np.ndarray: rounded matrix - """ - - # Test if matrix is Pandas DataFrame - data_type = '' - if isinstance(m, _pd.DataFrame): - data_type = 'pd' - m_pd = m - m = m_pd.values - - decimals = int(decimals) - - # I really can't think of a way to vectorize bucket rounding, - # so here goes the slow for loop - b = _nbf_bucket_round(m, decimals) - - if decimals <= 0: - b = b.astype(_np.int32) - - if data_type == 'pd': - new_df = _pd.DataFrame(b.reshape(m.shape), index=m_pd.index, columns=m_pd.columns) - return new_df - else: - return b.reshape(m.shape) - - -def split_zone_in_matrix(base_matrix, old_zone, new_zones, proportions): - """ - Takes a zone in a matrix (represented as a DataFrame) and splits it into several new zones, - prorating affected cells by a vector of proportions (one value for each new zone). The old - zone is removed. - - Args: - base_matrix: The matrix to re-shape, as a DataFrame - old_zone: Integer number of the original zone to split - new_zones: List of integers of the new zones to add - proportions: List of floats of proportions to split the original zone to. Must be the same - length as `new_zones` and sum to 1.0 - - Returns: Re-shaped DataFrame - """ - - assert isinstance(base_matrix, _pd.DataFrame), "Base matrix must be a DataFrame" - - old_zone = int(old_zone) - new_zones = _np.array(new_zones, dtype=_np.int32) - proportions = _np.array(proportions, dtype=_np.float64) - - assert len(new_zones) == len(proportions), "Proportion array must be the same length as the new zone array" - assert len(new_zones.shape) == 1, "New zones must be a vector" - assert base_matrix.index.equals(base_matrix.columns), "DataFrame is not a matrix" - assert _np.isclose(proportions.sum(), 1.0), "Proportions must sum to 1.0 " - - n_new_zones = len(new_zones) - - intersection_index = base_matrix.index.drop(old_zone) - new_index = intersection_index - for z in new_zones: new_index = new_index.insert(-1, z) - new_index = _pd.Index(sorted(new_index)) - - new_matrix = _pd.DataFrame(0, index=new_index, columns=new_index, dtype=base_matrix.dtypes.iat[0]) - - # 1. Copy over the values from the regions of the matrix not being updated - new_matrix.loc[intersection_index, intersection_index] = base_matrix - - # 2. 
Prorate the row corresponding to the dropped zone - # This section (and the next) works with the underlying Numpy arrays, since they handle - # broadcasting better than Pandas does - original_row = base_matrix.loc[old_zone, intersection_index] - original_row = original_row.values[:] # Make a shallow copy to preserve shape of the original data - original_row.shape = 1, len(intersection_index) - proportions.shape = n_new_zones, 1 - result = _pd.DataFrame(original_row * proportions, index=new_zones, columns=intersection_index) - new_matrix.loc[result.index, result.columns] = result - - # 3. Proprate the column corresponding to the dropped zone - original_column = base_matrix.loc[intersection_index, old_zone] - original_column = original_column.values[:] - original_column.shape = len(intersection_index), 1 - proportions.shape = 1, n_new_zones - result = _pd.DataFrame(original_column * proportions, index=intersection_index, columns=new_zones) - new_matrix.loc[result.index, result.columns] = result - - # 4. Expand the old intrazonal - proportions_copy = proportions[:,:] - proportions_copy.shape = 1, n_new_zones - proportions.shape = n_new_zones, 1 - - intrzonal_matrix = proportions * proportions_copy - intrazonal_scalar = base_matrix.at[old_zone, old_zone] - - result = _pd.DataFrame(intrazonal_scalar * intrzonal_matrix, index=new_zones, columns=new_zones) - new_matrix.loc[result.index, result.columns] = result - - return new_matrix - - -def aggregate_matrix(matrix, groups=None, row_groups=None, col_groups=None, aggfunc=_np.sum): - """ - Aggregates a matrix based on mappings provided for each axis, using a specified aggregation function. - - Args: - matrix: Matrix data to aggregate. DataFrames and Series with 2-level indices are supported - groups: Syntactic sugar to specify both row_groups and col_groups to use the same grouping series. - row_groups: Groups for the rows. If aggregating a DataFrame, this must match the index of the matrix. For a - "tall" matrix, this series can match either the "full" index of the series, or it can match the first level - of the matrix (it would be the same as if aggregating a DataFrame). Alternatively, an array can be provided, - but it must be the same length as the DataFrame's index, or the full length of the Series. - col_groups: Groups for the columns. If aggregating a DataFrame, this must match the columns of the matrix. For a - "tall" matrix, this series can match either the "full" index of the series, or it can match the second level - of the matrix (it would be the same as if aggregating a DataFrame). Alternatively, an array can be provided, - but it must be the same length as the DataFrame's columns, or the full length of the Series. - aggfunc: The aggregation function to use. Default is sum. - - Returns: The aggregated matrix, in the same type as was provided, e.g. Series -> Series, DataFrame -> DataFrame. 
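A short illustrative sketch of `aggregate_matrix`, whose behaviour is described just above; the four-zone matrix and the zone-to-district mapping are made up for the example.

```python
import numpy as np
import pandas as pd

from balsa.routines import aggregate_matrix

zones = pd.Index([1, 2, 3, 4])
trips = pd.DataFrame(np.arange(16, dtype=float).reshape(4, 4), index=zones, columns=zones)

# Hypothetical mapping of the four zones into two districts
districts = pd.Series(['A', 'A', 'B', 'B'], index=zones)

district_trips = aggregate_matrix(trips, groups=districts)
print(district_trips)
#       A     B
# A  10.0  18.0
# B  42.0  50.0
```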
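Similarly, a hedged sketch of the doubly-constrained balancing routine (`matrix_balancing_2d`) defined earlier in this module; the flat seed matrix and the row/column targets are invented for illustration.

```python
import numpy as np
import pandas as pd

from balsa.routines import matrix_balancing_2d

zones = pd.Index([1, 2, 3])
seed = pd.DataFrame(np.ones((3, 3)), index=zones, columns=zones)  # flat seed matrix

row_targets = np.array([100.0, 200.0, 300.0])  # e.g. productions
col_targets = np.array([150.0, 150.0, 300.0])  # e.g. attractions

balanced, residual, n_iterations = matrix_balancing_2d(
    seed, row_targets, col_targets,
    totals_to_use='average', max_iterations=500, rel_error=1e-5)

print(balanced.sum(axis=1))  # approximately the row targets
print(balanced.sum(axis=0))  # approximately the column targets
```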
- - """ - if groups is not None: - row_groups = groups - col_groups = groups - - assert row_groups is not None, "Row groups must be specified" - assert col_groups is not None, "Column groups must be specified" - - if isinstance(matrix, _pd.DataFrame): - row_groups = _prep_square_index(matrix.index, row_groups) - col_groups = _prep_square_index(matrix.columns, col_groups) - - return _aggregate_frame(matrix, row_groups, col_groups, aggfunc) - elif isinstance(matrix, _pd.Series): - assert matrix.index.nlevels == 2 - - row_groups, col_groups = _prep_tall_index(matrix.index, row_groups, col_groups) - return _aggregate_series(matrix, row_groups, col_groups, aggfunc) - else: - raise NotImplementedError() - - -def _prep_tall_index(target_index, row_aggregator, col_aggregator): - - if isinstance(row_aggregator, _pd.Series): - if row_aggregator.index.equals(target_index): - row_aggregator = row_aggregator.values - else: - assert target_index.levels[0].equals(row_aggregator.index) - reindexed = row_aggregator.reindex(target_index, level=0) - row_aggregator = reindexed.values - else: - assert len(row_aggregator) == len(target_index) - row_aggregator = _np.array(row_aggregator) - - if isinstance(col_aggregator, _pd.Series): - if col_aggregator.index.equals(target_index): - col_aggregator = col_aggregator.values - else: - assert target_index.levels[1].equals(col_aggregator.index) - reindexed = col_aggregator.reindex(target_index, level=1) - col_aggregator = reindexed.values - else: - assert len(col_aggregator) == len(target_index) - col_aggregator = _np.array(col_aggregator) - - return row_aggregator, col_aggregator - - -def _prep_square_index(index, aggregator): - if isinstance(aggregator, _pd.Series): - assert aggregator.index.equals(index) - return aggregator.values - else: - assert len(aggregator) == len(index) - return _np.array(aggregator) - - -def _aggregate_frame(matrix, row_aggregator, col_aggregator, aggfunc): - return matrix.groupby(row_aggregator, axis=0).aggregate(aggfunc).groupby(col_aggregator, axis=1).aggregate(aggfunc) - - -def _aggregate_series(matrix, row_aggregator, col_aggregator, aggfunc): - return matrix.groupby([row_aggregator, col_aggregator]).aggregate(aggfunc) diff --git a/balsa/models/__init__.py b/balsa/models/__init__.py deleted file mode 100644 index 51a31ed..0000000 --- a/balsa/models/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .analyses import tlfd, distance_matrix -# from .plots import location_summary, convergence_boxplot diff --git a/balsa/models/analyses.py b/balsa/models/analyses.py deleted file mode 100644 index a5b814d..0000000 --- a/balsa/models/analyses.py +++ /dev/null @@ -1,252 +0,0 @@ -import pandas as pd -import numpy as np -import numexpr as ne -from six import iteritems - - -def tlfd(values, bin_start=0, bin_end=200, bin_step=2, weights=None, intrazonal=None, label_type='MULTI', - include_top=False): - """ - Generates a Trip Length Frequency Distribution (i.e. a histogram) from given data. Produces a "pretty" Pandas object - suitable for charting. - - Args: - values (ndarray or Series): Vector of trip lengths, with a length of "N". Can be provided from a table of - trips, or from a matrix (in "tall" format). - bin_start (int): The minimum bin value, in the same units as `values`. Default is 0. - bin_end (int): The maximum bin value, in the same units as `values`. Defaults to 200. Values over this limit - are either ignored, or counted under a separate category (see `include top`) - bin_step (int): The size of each bin, in the same unit as `values`. 
Default is 2. - weights (ndarray, Series, or None: Optional vector of weights to use of length "N", to produce a weighted - histogram. - intrazonal (ndarray, Seires, or None): Optional boolean vector indicating which values are considered - "intrazonal". When specified, prepends an "intrazonal" category to the front of the histogram. - label_type (str): String indicating the format of the returned index. Options are: - - "MULTI": The returned index will be a 2-level MultiIndex ['from', 'to']; - - "TEXT": The returned index will be text-based: "0 to 2"; - - "BOTTOM": The returned index will be the bottom of each bin; and - - "TOP": The returned index will be the top of each bin. - include_top (bool): If True, the function will count all values (and weights, if provided) above the `bin_top`, - and add them to the returned Series. This bin is described as going from `bin_top` to `inf`. - - Returns: - Series: The weighted or unweighted histogram, depending on the options configured above. - - """ - bins = list(range(bin_start, bin_end + bin_step, bin_step)) - - if intrazonal is not None: - if weights is not None: - iz_total = weights.loc[intrazonal].sum() - weights = weights.loc[~intrazonal] - else: - iz_total = intrazonal.sum() - - values = values.loc[~intrazonal] - - if weights is not None: - hist, _ = np.histogram(values, weights=weights, bins=bins) - else: - hist, _ = np.histogram(values, bins=bins) - - new_len = len(hist) - if intrazonal is not None: new_len += 1 - if include_top: new_len += 1 - new_hist = np.zeros(shape=new_len, dtype=hist.dtype) - lower_index = 0 - upper_index = new_len - - if intrazonal is not None: - new_hist[0] = iz_total - bins.insert(0, 'intrazonal') - lower_index += 1 - if include_top: - if weights is not None: - top = weights.loc[values >= bin_end].sum() - else: - top = (values >= bin_end).sum() - - new_hist[-1] = top - bins.append(np.inf) - upper_index -= 1 - new_hist[lower_index: upper_index] = hist - - label_type = label_type.upper() - if label_type == 'MULTI': - index = pd.MultiIndex.from_arrays([bins[:-1], bins[1:]], names=['from', 'to']) - elif label_type == 'TOP': - index = pd.Index(bins[1:]) - elif label_type == 'BOTTOM': - index = pd.Index(bins[:-1]) - elif label_type == 'TEXT': - s0 = pd.Series(bins[:-1], dtype=str).astype(str) - s1 = pd.Series(bins[1:], dtype=str).astype(str) - index = pd.Index(s0 + ' to ' + s1) - else: - raise NotImplementedError(label_type) - - new_hist = pd.Series(new_hist, index=index) - - return new_hist - - -def _get_distance_equation(method): - if method.lower() == 'euclidean': - expr = "sqrt((x0 - x1)**2 + (y0 - y1) ** 2) * coord_unit" - elif method.lower() == 'manhattan': - expr = "(abs(x0 - x1) + abs(y0 - y1)) * coord_unit" - elif method.lower() == 'haversine': - y0 = "(y0 * pi / 180)" - y1 = "(y1 * pi / 180)" - delta_lon = "((x1 - x0) * pi / 180)" - delta_lat = "((y1 - y0) * pi / 180)" - part1 = "sin({delta_lat} / 2.0)**2 + cos({y0}) * cos({y1}) * (sin({delta_lon} / 2.0))**2"\ - .format(delta_lat=delta_lat, delta_lon=delta_lon, y0=y0, y1=y1) - expr = "6371.0 * earth_radius_factor* 2.0 * arctan2(sqrt({part1}), sqrt(1.0 - {part1})) * coord_unit".\ - format(part1=part1) - else: - raise NotImplementedError(method.lower()) - return expr - - -def _prepare_distance_kwargs(kwargs): - defaults = {'coord_unit': 1.0, 'earth_radius_factor': 1.0, 'pi': np.pi} - for key, val in iteritems(defaults): - if key not in kwargs: - kwargs[key] = val - - -def distance_matrix(x, y, tall=False, method='euclidean', labels=None, **kwargs): - """ - 
Fastest method of computing a distance matrix from 2 vectors of coordinates, using the NumExpr package. Supports - several equations for computing distances - - Args: - x (ndarray or Series): Vector of x-coordinates, of length N. Can be a Series to specify labels. - y (ndarray or Series): Vector of y-coordinates, of length N. Can be a Series to specify labels. - tall (bool): If True, returns a vecotr whose shape is (N x N,). Otherwise, returns a matrix whose shape is - (N, N). - method (str): Specifies the method by which to compute distance. Valid options are: - 'EUCLIDEAN': Computes straight-line, 'as-the-crow flies' distance. - 'MANHATTAN': Computes the Manhattan distance - 'HAVERSINE': Computes distance based on lon/lat. - labels (None or list or Index): Optional labels for each item in the x, y vectors; of length N. - - **kwargs: Additional scalars to pass into the evaluation context - coord_unit (float): Factor applies directly to the result, defaulting to 1.0 (no conversion). Useful when - the coordinates are provided in one unit (e.g. m) and the desired result is in a different unit (e.g. - km). Only used for Euclidean or Manhattan distance - earth_radius_factor (float): Factor to convert from km to other units when using Haversine distance - - Returns: - Series: Returned when `tall=True`, and labels can be inferred (see note below). Will always be have 2-level - MultiIndex. - DataFrame: Returned when `tall=False` and labels can be inferred (see notes below). - ndarray: Returned when labels could not be inferred (see notes below). If `tall=True` the array will be - 1-dimensional, with shape (N x N,). Otherwise, it will 2-dimensional with shape (N, N) - - Notes: - The type of the returned object depends on whether labels can be inferred from the arguments. This is always - true when the `labels` argument is specified, and the returned value will use cross-product of the `labels` - vector. - - Otherwise, the function will try and infer the labels from the `x` and `y` objects, if one or both of them are - provided as Series. - - """ - - x_is_series, y_is_series = isinstance(x, pd.Series), isinstance(y, pd.Series) - - if x_is_series and y_is_series: - assert x.index.equals(y.index), "X and Y series must have the same index" - if labels is None: labels = x.index - x0, y0 = x.values[...], y.values[...] - elif x_is_series: - assert len(y) == len(x), "The length of the Y array does not match the length of the X series" - if labels is None: labels = x.index - x0, y0 = x.values[...], y[...] - elif y_is_series: - assert len(y) == len(x), "The length of the X array does not match the length of the Y series" - if labels is None: labels = y.index - x0, y0 = x[...], y.values[...] - else: - assert len(x) == len(y), "X and Y arrays are not the same length" - if labels is not None: assert len(labels) == len(x), "Vector length of labels does not match X/Y vector length" - x0, y0 = x[...], y[...] - - x1, y1 = x0[...], y0[...] 
- n = len(x0) - - x0.shape = 1, n - y0.shape = 1, n - x1.shape = n, 1 - y1.shape = n, 1 - - expr = _get_distance_equation(method) - kwargs = kwargs.copy() - _prepare_distance_kwargs(kwargs) - kwargs['x0'] = x0 - kwargs['x1'] = x1 - kwargs['y0'] = y0 - kwargs['y1'] = y1 - - raw_matrix = ne.evaluate(expr, local_dict=kwargs) - - if tall: - raw_matrix.shape = n * n - if labels is None: return raw_matrix - - mi = pd.MultiIndex.from_product([labels, labels]) - return pd.Series(raw_matrix, index=mi) - elif labels is None: - return raw_matrix - - return pd.DataFrame(raw_matrix, index=labels, columns=labels) - - -def distance_array(x0, y0, x1, y1, method='euclidean', **kwargs): - """ - Fast method to compute distance between 2 (x, y) points, represented by 4 separate arrays, using the NumExpr - package. Supports several equations for computing distances - - Args: - x0: X or Lon coordinate of first point - y0: Y or Lat coordinate of first point - x1: X or Lon coordinate of second point - y1: Y or Lat coordinate of second point - method: method (str): Specifies the method by which to compute distance. Valid options are: - 'EUCLIDEAN': Computes straight-line, 'as-the-crow flies' distance. - 'MANHATTAN': Computes the Manhattan distance - 'HAVERSINE': Computes distance based on lon/lat. - **kwargs: Additional scalars to pass into the evaluation context - coord_unit (float): Factor applies directly to the result, defaulting to 1.0 (no conversion). Useful when - the coordinates are provided in one unit (e.g. m) and the desired result is in a different unit (e.g. - km). Only used for Euclidean or Manhattan distance - earth_radius_factor (float): Factor to convert from km to other units when using Haversine distance - - Returns: - ndarray: Distance from the vectors of first points to the vectors of second points. 
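The distance helpers and `tlfd` above are often chained: compute zone-to-zone distances, then bin trip lengths into a frequency distribution. Below is a minimal sketch with made-up coordinates; it assumes these functions are re-exported from `balsa.routines`, as declared in the new `routines/__init__.py`.

```python
import numpy as np
import pandas as pd

from balsa.routines import distance_array, distance_matrix, tlfd

# Hypothetical zone centroids, in metres
zones = pd.Index([1, 2, 3])
x = pd.Series([0.0, 3000.0, 6000.0], index=zones)
y = pd.Series([0.0, 4000.0, 8000.0], index=zones)

# Labelled 3x3 DataFrame of straight-line distances, converted from m to km
dists = distance_matrix(x, y, method='euclidean', coord_unit=0.001)

# 'Tall' format yields a Series with a 2-level MultiIndex - convenient for tlfd()
tall_dists = distance_matrix(x, y, tall=True, method='euclidean', coord_unit=0.001)

# Trip length frequency distribution in 1 km bins up to 20 km; a demand matrix in
# the same tall format would normally be passed as `weights`
hist = tlfd(tall_dists, bin_start=0, bin_end=20, bin_step=1, label_type='TEXT')
print(hist.head())

# distance_array works point-to-point rather than building a full matrix
d = distance_array(np.array([0.0]), np.array([0.0]),
                   np.array([3000.0]), np.array([4000.0]),
                   method='euclidean', coord_unit=0.001)
print(d)  # approximately [5.0] km
```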
- - """ - assert len(x0) == len(y0) == len(x1) == len(y1) - - def coerce_to_numpy(item): - if isinstance(item, pd.Series): - return item.values - return item - - x0 = coerce_to_numpy(x0) - x1 = coerce_to_numpy(x1) - y0 = coerce_to_numpy(y0) - y1 = coerce_to_numpy(y1) - - expr = _get_distance_equation(method) - kwargs = kwargs.copy() - _prepare_distance_kwargs(kwargs) - kwargs['x0'] = x0 - kwargs['x1'] = x1 - kwargs['y0'] = y0 - kwargs['y1'] = y1 - - result_array = ne.evaluate(expr, local_dict=kwargs) - return result_array diff --git a/balsa/models/analyses.pyi b/balsa/models/analyses.pyi deleted file mode 100644 index a8d0b50..0000000 --- a/balsa/models/analyses.pyi +++ /dev/null @@ -1,18 +0,0 @@ -from typing import Iterable, Union -from pandas import DataFrame, Series, Index -from numpy import ndarray - - -_vector_type = Union[ndarray, Series] - - -def tlfd(values: _vector_type, bin_start: int=0, bin_end: int=200, bin_step: int=2, weights: _vector_type=None, - intrazonal: _vector_type=None, label_type: str='MULTI', include_top: bool=False - ) -> Series: - pass - - -def distance_matrix(x: _vector_type, y: _vector_type, tall: bool=False, method: str='euclidean', coord_unit: float=1.0, - labels: Union[Iterable, Index]=None - ) -> Union[ndarray, Series, DataFrame]: - pass diff --git a/balsa/pandas_utils/__init__.py b/balsa/pandas_utils/__init__.py deleted file mode 100644 index cae4b1f..0000000 --- a/balsa/pandas_utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .pandas_utils import align_categories, fast_unstack, fast_stack, reindex_series diff --git a/balsa/pandas_utils/pandas_utils.pyi b/balsa/pandas_utils/pandas_utils.pyi deleted file mode 100644 index d97b158..0000000 --- a/balsa/pandas_utils/pandas_utils.pyi +++ /dev/null @@ -1,14 +0,0 @@ -import pandas as pd -from typing import Union, Iterable, List - -def fast_stack(frame: pd.DataFrame, multi_index: pd.MultiIndex, deep_copy: bool=True) -> pd.Series: - pass - -def fast_unstack(series: pd.Series, index: pd.Index, columns: pd.Index, deep_copy: bool=True) -> pd.DataFrame: - pass - -def align_categories(iterable: Iterable[Union[pd.Series, pd.DataFrame]]) -> None: - pass - -def sum_df_sequence(seq: Iterable[pd.DataFrame], fill_value: Union[int, float]=0) -> pd.DataFrame: - pass diff --git a/balsa/routines/__init__.py b/balsa/routines/__init__.py new file mode 100644 index 0000000..bbe3dc2 --- /dev/null +++ b/balsa/routines/__init__.py @@ -0,0 +1,6 @@ +from .general import align_categories, is_identifier, reindex_series, sum_df_sequence +from .io import * +from .matrices import (fast_unstack, fast_stack, aggregate_matrix, matrix_balancing_1d, matrix_balancing_2d, + matrix_bucket_rounding, split_zone_in_matrix, disaggregate_matrix) +from .modelling import (tlfd, distance_array, distance_matrix) +from .plotting import trumpet_diagram, convergence_boxplot, location_summary diff --git a/balsa/routines/best_intermediates.py b/balsa/routines/best_intermediates.py new file mode 100644 index 0000000..c7ef536 --- /dev/null +++ b/balsa/routines/best_intermediates.py @@ -0,0 +1,235 @@ +from typing import Tuple, List, Dict, Union +from threading import Thread +from multiprocessing import cpu_count + +from numba import njit +from numpy import int16 as nshort, inf, ndarray, float32 as nfloat +import numpy as np +from pandas import DataFrame, Index, MultiIndex + + +_NULL_INDEX = -1 +_NEG_INF = -inf + + +def _get_breaks(n_items: int, n_workers: int) -> List[Tuple[int, int]]: + """Gets starts and stops for breaks similar to numpy.array_slice()""" + div, 
remainder = divmod(n_items, n_workers) + slices = [] + bottom = 0 + for i in range(n_workers): + top = bottom + div + top += int(i < remainder) + slices.append((bottom, top)) + bottom = top + return slices + + +@njit +def _update_heap(utilities: ndarray, zones: ndarray, new_u: float, new_zone: int): + """Inserts a value into a sorted array, maintaining sort order, and the number of items in that array. The array + is sorted lowest-to-highest""" + i = 0 + top = len(utilities) + while i < top: + current_u = utilities[i] + if new_u < current_u: break + i += 1 + if i <= 0: return + for j in range(i - 1): + utilities[j] = utilities[j + 1] + zones[j] = zones[j + 1] + utilities[i - 1] = new_u + zones[i - 1] = new_zone + + +@njit(nogil=True) +def _nbf_twopart_worker(access_utils: ndarray, egress_utils: ndarray, result_utils: ndarray, result_stations: ndarray, + start: int, stop: int, k: int): + """Performance-tuned NBF to operate on its own thread""" + n_origins, n_intermediate = access_utils.shape + n_destinations = egress_utils.shape[1] + + # Allocate the sorted heap of utilities (and associated indices) once per thread + util_heap = np.full(shape=k, fill_value=-inf) + zones_heap = np.full(shape=k, fill_value=_NULL_INDEX) + + for offset in range(start, stop): + origin_zone, destination_zone = divmod(offset, n_destinations) + + # Reset the heap for this OD + util_heap[:] = _NEG_INF + zones_heap[:] = _NULL_INDEX + + for interim_zone in range(n_intermediate): + + if interim_zone >= n_intermediate: + print("ERR", offset, origin_zone, destination_zone, interim_zone) + raise AssertionError() + + interim_util = access_utils[origin_zone, interim_zone] + egress_utils[interim_zone, destination_zone] + + # In general, for problems where (n_origins * n_destinations) >> k, most values will not be in the top k. + # So quickly check against the lowest utility in the heap to avoid calling the updater func + if interim_util < util_heap[0] or interim_util == _NEG_INF: continue + _update_heap(util_heap, zones_heap, interim_util, interim_zone) + + result_utils[origin_zone, destination_zone, :] = util_heap + result_stations[origin_zone, destination_zone, :] = zones_heap + + +def _validate_access_egress_tables(access_table: DataFrame, egress_table: DataFrame) -> Tuple[Index, Index, Index]: + + assert access_table.index.nlevels == 2, "Access table index must have two levels" + assert egress_table.index.nlevels == 2, "Egress table index must have two levels" + + # Take the unique index of each level, as Pandas can return more items than is present, if the frame is a slice + origin_zones: Index = access_table.index.unique(level=0) + intermediate_zones: Index = access_table.index.unique(level=1) + destination_zones: Index = egress_table.index.unique(level=1) + + # Check that the access and egress tables have compatible indices + assert intermediate_zones.equals(egress_table.index.unique(level=0)), \ + "Access index level 2 and egress index level 1 must be the same" + + return origin_zones, intermediate_zones, destination_zones + + +def best_intermediate_zones(access_table: DataFrame, egress_table: DataFrame, cost_column: str, k: int = 1, + n_threads: int = None, squeeze=True, other_columns=True, + intermediate_name: str = "intermediate_zone", maximize=True, + availability_column: str = "available", null_index=0 + ) -> Union[DataFrame, Dict[int, DataFrame]]: + """ + Numba-accelerated. + + Triple-index operation for two matrices, finding the most- or least-cost intermediate zones. 
Takes an access matrix + of the shape (O, I) and an egress matrix of the shape (I, D) to produce a combined matrix of the shape (O, D), with + the best intermediate I. Also works to construct multiple (O, D) matrices - for the top _k_ intermediate zones in + _I_. + + There is no restriction on the label dtypes, as long as the access and egress tables share the same _I_ index. + + Both the input matrices must be provided in "tall" format - as Pandas Series with a 2-level MultiIndex. + Essentially, the access and egress tables are DataFrames with multiple matrices defined within. The output table(s) + are also returned in a tall format. + + When constructing the result tables, columns in the access and egress tables are "carried forward" such that the + results columns will be the union of columns in the input tables. Columns in one table only will be carried forward + unmodified and retain their dtype. Columns in both tables will be added together, and thus MUST be numeric. + + In the specified cost column, a value of `-inf` (or `inf` when minimizing) is respected as the sentinel value for + unavailable. (O, I) or (I, D) interchanges with this sentinel value will not be considered. + + Args: + access_table: DataFrame with 2-level MultiIndex of the shape ((O, I), A). Must include the specified cost column + egress_table: DataFrame with 2-level MultiIndex of the shape ((I, D), E). Must include the specified cost column + cost_column: Name of the column in the access and egress table to use as the cost to minimize/maximize. Values + of `+/- inf` are respected to indicate unavailable choices + k: The number of ranks to return (e.g., find the _k_ best intermediate zones). If k <= 0, it will be corrected + to 1. + n_threads: Number of threads to use. Defaults to cpu_count() + squeeze: If k == 1 and squeeze=True, a single DataFrame is returned. Otherwise, a Dictionary of DataFrames will + be returned. + other_columns: If True, the result DataFrame will include all columns in the access and egress tables. The + result table will be of the shape ((O, D), A | E + 3) + intermediate_name: Name of the column in the result table containing the selected intermediate zone. + maximize: If True, this function maximize the result. If False, it minimizes it. + availability_column: Name of the column in the result table containing a flag whether ANY intermediate zone + was found to be available. + null_index: Fill value used if NO intermediate zone is available. + + Returns: + DataFrame: If k == 1 and squeeze=True. A DataFrame of the shape ((O, D), A | E + 3), containing the intermediate + zone selected, the associated max/min cost, and a flag indicating its availability. Additional columns from + the access and egress tables, indexed for the appropriately chosen intermediate zone, will also be included + if other_columns=True. + Dict[int, DataFrame]: If k > 1. The keys represent the ranks, so result[1] is the best intermediate zone, + result[2] is the second-best, etc. The value DataFrames are in the same format as if k == 1, just with + different intermediate zones chosen. 
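To make the triple-index operation concrete, the following is a brute-force, plain-NumPy equivalent of what `best_intermediate_zones` computes for `k=1`. It is not the accelerated routine itself, and the array sizes and utilities are arbitrary.

```python
import numpy as np

# access[o, i] = utility from origin o to intermediate zone i (shape O x I)
# egress[i, d] = utility from intermediate zone i to destination d (shape I x D)
rng = np.random.default_rng(0)
n_o, n_i, n_d = 3, 4, 2
access = rng.uniform(-2.0, 0.0, size=(n_o, n_i))
egress = rng.uniform(-2.0, 0.0, size=(n_i, n_d))

# Combined utility for every (o, i, d) triple, then reduce over the intermediate axis
combined = access[:, :, np.newaxis] + egress[np.newaxis, :, :]  # shape (O, I, D)
best_zone = combined.argmax(axis=1)     # best intermediate index for each (o, d)
best_utility = combined.max(axis=1)     # the associated maximum utility

print(best_zone)     # shape (O, D)
print(best_utility)  # shape (O, D)
```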
+ + """ + + # Check inputs + k = max(1, k) + if n_threads is None: n_threads = cpu_count() + origins, intermediates, destinations = _validate_access_egress_tables(access_table, egress_table) + n_origins, n_intermediate, n_destinations = len(origins), len(intermediates), len(destinations) + + # Compute best path(s) + access_cost = access_table[cost_column].values.reshape([n_origins, n_intermediate]).astype(nfloat) + egress_cost = egress_table[cost_column].values.reshape([n_intermediate, n_destinations]).astype(nfloat) + + if not maximize: # Invert the cost to use code set to maximize + access_cost *= -1 + egress_cost *= -1 + + result_cost = np.zeros(shape=(n_origins, n_destinations, k), dtype=nfloat) + result_indices = np.zeros(dtype=nshort, shape=(n_origins, n_destinations, k)) + + # Setup the workers (1 per thread) to select best paths for a subset of ODs + breaks = _get_breaks(n_origins * n_destinations, n_threads) + threads = [ + Thread(target=_nbf_twopart_worker, args=[ + access_cost, egress_cost, result_cost, result_indices, + start, stop, k + ]) + for start, stop in breaks + ] + for t in threads: t.start() + for t in threads: t.join() + + # Construct composite result tables + if other_columns: + remaining_columns = set(access_table.columns | egress_table.columns) - {cost_column, intermediate_name, + availability_column} + else: + remaining_columns = set() + + row_index = MultiIndex.from_product([origins, destinations]) # Labels for the rows + access_indexer = np.repeat(np.arange(n_origins), n_destinations) # Indexer for the access table + egress_indexer = np.tile(np.arange(n_destinations), n_origins) # Indexer for the egress table + tables = {} # Results + + for i in range(k): + table = DataFrame(index=row_index) + + offsets_i = result_indices[:, :, i] # 2D indexer for this (i ∈ k) path + flat_offsets_i = offsets_i.flatten() # Convert to 1D indexer + availability_i = flat_offsets_i != _NULL_INDEX + + intermediate_result_i = intermediates.take(flat_offsets_i) + intermediate_result_i[~availability_i] = null_index + table[intermediate_name] = intermediate_result_i + + table[availability_column] = availability_i + table[cost_column] = result_cost[offsets_i] + + # If there are any columns left, add them to the composite table + for column in remaining_columns: + in_access = column in access_table + in_egress = column in egress_table + + if in_access: + access_matrix = access_table[column].values.reshape([n_origins, n_intermediate]) + access_component = access_matrix[access_indexer, flat_offsets_i] + if in_egress: + egress_matrix = egress_table[column].values.reshape([n_intermediate, n_destinations]) + egress_component = egress_matrix[flat_offsets_i, egress_indexer] + + if in_access and in_egress: + composite_data = access_component + egress_component + elif in_access: + composite_data = access_component + elif in_egress: + composite_data = egress_component + else: + raise RuntimeError("This shouldn't happen") + + table[column] = composite_data + + tables[k - i] = table + + if k == 1 and squeeze: + return tables[1] + return tables diff --git a/balsa/pandas_utils/pandas_utils.py b/balsa/routines/general.py similarity index 51% rename from balsa/pandas_utils/pandas_utils.py rename to balsa/routines/general.py index 09a35df..8dd7386 100644 --- a/balsa/pandas_utils/pandas_utils.py +++ b/balsa/routines/general.py @@ -1,77 +1,12 @@ from __future__ import division, absolute_import, print_function, unicode_literals +from keyword import kwlist + from pandas import DataFrame, Series, Index, 
MultiIndex -import numpy as np from six import iteritems - - -def fast_stack(frame, multi_index, deep_copy=True): - """ - Performs the same action as DataFrame.stack(), but provides better performance when the target stacked index is - known before hand. Useful in converting a lot of matrices from "wide" to "tall" format. The inverse of fast_unstack() - - Notes: - This function does not check that the entries in the multi_index are compatible with the index and columns of the - source DataFrame, only that the lengths are compatible. It can therefore be used to assign a whole new set of - labels to the result. - - Args: - frame (DataFrame): DataFrame to stack. - multi_index (Index): The 2-level MultiIndex known ahead-of-time. - deep_copy (bool): Flag indicating if the returned Series should be a view of the underlying data - (deep_copy=False) or a copy of it (deep_copy=True). A deep copy takes a little longer to convert and takes - up more memory but preserves the original data of the DataFrame. The default value of True is recommended - for most uses. - - Returns: - Series: The stacked data. - - """ - - assert multi_index.nlevels == 2, "Target index must be a MultiIndex with exactly 2 levels" - assert len(multi_index) == len(frame.index) * len(frame.columns), "Target index and source index and columns do " \ - "not have compatible lengths" - - array = np.ascontiguousarray(frame.values) - array = array.copy() if deep_copy else array[:, :] - array.shape = len(frame.index) * len(frame.columns) - - return Series(array, index=multi_index) - - -def fast_unstack(series, index, columns, deep_copy=True): - """ - Performs the same action as DataFrame.unstack(), but provides better performance when the target unstacked index and - columns are known before hand. Useful in converting a lot of matrices from "tall" to "wide" format. The inverse of - fast_stack(). - - Notes: - This function does not check that the entries in index and columns are compatible with the MultiIndex of the - source Series, only that the lengths are compatible. It can therefore be used to assign a whole new set of - labels to the result. - - Args: - series (Series): Series with 2-level MultiIndex to stack() - index (Index): The row index known ahead-of-time - columns (Index): The columns index known ahead-of-time. - deep_copy (bool): Flag indicating if the returned DataFrame should be a view of the underlying data - (deep_copy=False) or a copy of it (deep_copy=True). A deep copy takes a little longer to convert and takes - up more memory but preserves the original data of the Series. The default value of True is recommended - for most uses. - - Returns: - DataFrame: The unstacked data - - """ - - assert series.index.nlevels == 2, "Source Series must have an index with exactly 2 levels" - assert len(series) == len(index) * len(columns), "Source index and target index and columns do not have " \ - "compatible lengths" - - array = series.values.copy() if deep_copy else series.values[:] - array.shape = len(index), len(columns) - - return DataFrame(array, index=index, columns=columns) +import six +import re +import tokenize def reindex_series(series, target_series, source_levels=None, target_levels=None, fill_value=None): @@ -96,16 +31,17 @@ def reindex_series(series, target_series, source_levels=None, target_levels=None def align_categories(iterable): """ - Pre-processing step for pandas.concat() which attempts to align any Categorical series in the sequence to using the - same set of categories. 
It passes through the sequence twice: once to accumulate the complete set of all categories - used in the sequence; and a second time to modify the sequence's contents to use this full set. The contents of the - sequence are modified in-place. + Pre-processing step for ``pd.concat()`` which attempts to align any Categorical series in the sequence to using + the same set of categories. It passes through the sequence twice: once to accumulate the complete set of all + categories used in the sequence; and a second time to modify the sequence's contents to use this full set. The + contents of the sequence are modified in-place. - Notes: - The resulting categories will be lex-sorted (based on the sorted() builtin) + Note: + The resulting categories will be lex-sorted (based on the ``sorted()`` builtin) Args: - iterable: Any iterable of Series or DataFrame objects (anything that is acceptable to pandas.concat()) + iterable (Union[pandas.Series, pandas.DataFrame]): Any iterable of Series or DataFrame objects (anything that is + acceptable to ``pandas.concat()``) """ iterable_type = None @@ -174,11 +110,11 @@ def sum_df_sequence(seq, fill_value=0): the same indexes and columns but might be missing a few values. Args: - seq (Iterable[DataFrame]): Any iterable of DataFrame type, ordered or unordered. - fill_value: The value fo use for missing cells. Preferably a number to avoid erros. + seq (Iterable[pandas.DataFrame]): Any iterable of DataFrame type, ordered or unordered. + fill_value: Defaults to ``0``. The value to use for missing cells. Preferably a number to avoid errors. Returns: - DataFrame: The sum over all items in seq. + pandas.DataFrame: The sum over all items in seq. """ common_index = Index([]) @@ -196,3 +132,34 @@ def sum_df_sequence(seq, fill_value=0): df = df.reindex_axis(common_columns, axis=1, fill_value=fill_value) accumulator += df return accumulator + + +if six.PY3: + def is_identifier(name): + """ + Tests that the name is a valid Python variable name and does not collide with reserved keywords + + Args: + name (str): Name to test + + Returns: + bool: If the name is 'Pythonic' + + """ + + return name.isidentifier() and name not in kwlist +else: + def is_identifier(name): + """ + Tests that the name is a valid Python variable name and does not collide with reserved keywords + + Args: + name (str): Name to test + + Returns: + bool: If the name is 'Pythonic' + + """ + + return bool(re.match(tokenize.Name + '$', name)) and name not in kwlist + diff --git a/balsa/routines/general.pyi b/balsa/routines/general.pyi new file mode 100644 index 0000000..884e7f9 --- /dev/null +++ b/balsa/routines/general.pyi @@ -0,0 +1,15 @@ +import pandas as pd +from typing import Union, Iterable + + +def reindex_series(series, target_series, source_levels=None, target_levels=None, fill_value=None): + pass + +def align_categories(iterable: Iterable[Union[pd.Series, pd.DataFrame]]) -> None: + pass + +def sum_df_sequence(seq: Iterable[pd.DataFrame], fill_value: Union[int, float]=0) -> pd.DataFrame: + pass + +def is_identifier(name: str) -> bool: + pass diff --git a/balsa/routines/io/__init__.py b/balsa/routines/io/__init__.py new file mode 100644 index 0000000..155cc84 --- /dev/null +++ b/balsa/routines/io/__init__.py @@ -0,0 +1,4 @@ +from .common import coerce_matrix, expand_array, open_file +from .fortran import read_fortran_rectangle, read_fortran_square, to_fortran +from .inro import read_mdf, read_emx, peek_mdf, to_mdf, to_emx +from .omx import read_omx, to_omx diff --git a/balsa/routines/io/common.py 
b/balsa/routines/io/common.py new file mode 100644 index 0000000..dbf617a --- /dev/null +++ b/balsa/routines/io/common.py @@ -0,0 +1,107 @@ +import numpy as np +import pandas as pd +from contextlib import contextmanager + +from six import string_types + +try: + from pathlib import Path +except ImportError: + Path = None + + +def coerce_matrix(matrix, allow_raw=True, force_square=True): + """ + Infers a NumPy array from given input + + Args: + matrix: + allow_raw (bool, optional): Defaults to ``True``. + force_square (bool, optional): Defaults to ``True``. + + Returns: + numpy.ndarray: + A 2D ndarray of type float32 + """ + if isinstance(matrix, pd.DataFrame): + if force_square: + assert matrix.index.equals(matrix.columns) + return matrix.values.astype(np.float32) + elif isinstance(matrix, pd.Series): + assert matrix.index.nlevels == 2, "Cannot infer a matrix from a Series with more or fewer than 2 levels" + wide = matrix.unstack() + + union = wide.index | wide.columns + wide = wide.reindex_axis(union, fill_value=0.0, axis=0).reindex_axis(union, fill_value=0.0, axis=1) + return wide.values.astype(np.float32) + + if not allow_raw: + raise NotImplementedError() + + matrix = np.array(matrix, dtype=np.float32) + assert len(matrix.shape) == 2 + i, j = matrix.shape + assert i == j + + return matrix + + +def expand_array(a, n, axis=None): + """ + Expands an array across all dimensions by a set amount + + Args: + a (numpy.ndarray): The array to expand + n (numpy.ndarray): The (non-negative) number of items to expand by. + axis (int, optional): Defaults to ``None``. The axis to expand along, or None to expand along all axes. + + Returns: + numpy.ndarray: + The expanded array + """ + + if axis is None: new_shape = [dim + n for dim in a.shape] + else: + new_shape = [] + for i, dim in enumerate(a.shape): + dim += n if i == axis else 0 + new_shape.append(dim) + + out = np.zeros(new_shape, dtype=a.dtype) + + indexer = [slice(0, dim) for dim in a.shape] + out[indexer] = a + + return out + + +@contextmanager +def open_file(file_handle, **kwargs): + """ + Context manager for opening files provided as several different types. Supports a file handler as a str, unicode, + ``pathlib.Path``, or an already-opened handler. + + Args: + file_handle (Union[str, unicode, Path, File]): The item to be opened or is already open. + **kwargs: Keyword args passed to ``open()``. Usually mode='w'. + + Yields: + File: + The opened file handler. Automatically closed once out of context. 
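A small sketch of `open_file` in use; the path `skims.bin` is hypothetical. Strings, `pathlib.Path` objects, and already-open handles are all accepted, and only handles opened by the context manager itself are closed on exit.

```python
from pathlib import Path

from balsa.routines.io import open_file

# Strings and pathlib.Path objects are opened (and closed) by the context manager
with open_file('skims.bin', mode='rb') as f:
    header = f.read(4)

with open_file(Path('skims.bin'), mode='rb') as f:
    header = f.read(4)

# An already-open handle is passed through unchanged and left open afterwards
handle = open('skims.bin', 'rb')
with open_file(handle) as f:
    assert f is handle
handle.close()
```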
+ + """ + opened = False + if isinstance(file_handle, string_types): + f = open(file_handle, **kwargs) + opened = True + elif Path is not None and isinstance(file_handle, Path): + f = file_handle.open(**kwargs) + opened = True + else: + f = file_handle + + try: + yield f + finally: + if opened: + f.close() diff --git a/balsa/routines/io/common.pyi b/balsa/routines/io/common.pyi new file mode 100644 index 0000000..95824d5 --- /dev/null +++ b/balsa/routines/io/common.pyi @@ -0,0 +1,17 @@ +from typing import Union +from io import FileIO + +import numpy as np + +try: + from pathlib import Path + file_type = Union[str, FileIO, Path] +except ImportError: + Path = None + file_type = Union[str, FileIO] + +def coerce_matrix(matrix, allow_raw=True, force_square=True): pass + +def expand_array(a: np.ndarray, n: int, axis: int=None) -> np.ndarray: pass + +def open_file(file_handle, **kwargs): pass diff --git a/balsa/routines/io/fortran.py b/balsa/routines/io/fortran.py new file mode 100644 index 0000000..ebf0661 --- /dev/null +++ b/balsa/routines/io/fortran.py @@ -0,0 +1,166 @@ +import pandas as pd +import numpy as np + +from .common import coerce_matrix, open_file, expand_array + + +def _infer_fortran_zones(n_words): + """Returns the inverse of n_words = matrix_size * (matrix_size + 1)""" + n = int(0.5 + ((1 + 4 * n_words)**0.5)/2) - 1 + assert n_words == (n * (n + 1)), "Could not infer a square matrix from file" + return n + + +def read_fortran_rectangle(file, n_columns, zones=None, tall=False, reindex_rows=False, fill_value=None): + """ + Reads a FORTRAN-friendly .bin file (a.k.a. 'simple binary format') which is known to NOT be square. Also works with + square matrices. + + This file format is an array of 4-bytes, where each row is prefaced by an integer referring to the 1-based positional + index that FORTRAN uses. The rest of the data are in 4-byte floats. To read this, the number of columns present + must be known, since the format does not self-specify. + + Args: + file(Union[str, File, Path]): The file to read. + n_columns (int): The number of columns in the matrix. + zones (Union[int, pandas.Index], optional): Defaults to ``None``. An Index or Iterable will be interpreted as + the zone labels for the matrix rows and columns; returning a DataFrame or Series (depending on `tall`). If + an integer is provided, the returned ndarray will be truncated to this 'number of zones'. + tall (bool, optional): Defaults to ``False``. If true, a 'tall' version of the matrix will be returned. + reindex_rows (bool, optional): Defaults to ``False``. If true, and zones is an Index, the returned DataFrame + will be reindexed to fill-in any missing rows. + fill_value (optional): Defaults to ``None``. The value to pass to ``pandas.reindex()`` + + Returns: + numpy.ndarray, pandas.DataFrame or pandas.Series + + Raises: + AssertionError: if the shape is not valid. 
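A hedged sketch of writing and re-reading a 'simple binary' FORTRAN matrix with `to_fortran` and `read_fortran_rectangle`; the file name and five-zone system are invented for illustration.

```python
import numpy as np
import pandas as pd

from balsa.routines.io import read_fortran_rectangle, to_fortran

zones = pd.Index(range(1, 6))  # five hypothetical zones
demand = pd.DataFrame(np.random.rand(5, 5), index=zones, columns=zones)

# Each row in the binary file is prefaced by its 1-based FORTRAN row index
to_fortran(demand, 'demand.bin')

# The format is not self-describing, so the number of columns must be supplied
table = read_fortran_rectangle('demand.bin', n_columns=5, zones=zones)
print(table.shape)  # (5, 5)
```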
+ """ + with open_file(file, mode='rb') as reader: + n_columns = int(n_columns) + + matrix = np.fromfile(reader, dtype=np.float32) + rows = len(matrix) // (n_columns + 1) + assert len(matrix) == (rows * (n_columns + 1)) + + matrix.shape = rows, n_columns + 1 + + # Convert binary representation from float to int, then subtract 1 since FORTRAN uses 1-based positional + # indexing + row_index = np.frombuffer(matrix[:, 0].tobytes(), dtype=np.int32) - 1 + matrix = matrix[:, 1:] + + if zones is None: + if tall: + matrix.shape = matrix.shape[0] * matrix.shape[1] + return matrix + + if isinstance(zones, (int, np.int_)): + matrix = matrix[: zones, :zones] + + if tall: + matrix.shape = zones * zones + return matrix + + nzones = len(zones) + matrix = matrix[: nzones, : nzones] + row_labels = zones.take(row_index[:nzones]) + matrix = pd.DataFrame(matrix, index=row_labels, columns=zones) + + if reindex_rows: + matrix = matrix.reindex_axis(zones, axis=0, fill_value=fill_value) + + if tall: + return matrix.stack() + return matrix + + +def read_fortran_square(file, zones=None, tall=False): + """ + Reads a FORTRAN-friendly .bin file (a.k.a. 'simple binary format') which is known to be square. + + This file format is an array of 4-bytes, where each row is prefaced by an integer referring to the 1-based + positional index that FORTRAN uses. The rest of the data are in 4-byte floats. To read this, the number of columns + present must be known, since the format does not self-specify. This method can infer the shape if it is square. + + Args: + file (Union[str, File, Path]): The file to read. + zones (Union[pandas.Index, int], optional): Defaults to ``None``. An Index or Iterable will be interpreted as + the zone labels for the matrix rows and columns; returning a DataFrame or Series (depending on ``tall``). + If an integer is provided, the returned ndarray will be truncated to this 'number of zones'. Otherwise, the + returned ndarray will be size to the maximum number of zone dimensioned by the Emmebank. + tall (bool, optional): Defaults to ``False``. If True, a 1D data structure will be returned. If ``zone_index`` + is provided, a Series will be returned, otherwise a 1D ndarray. + + Returns: + pandas.DataFrame, pandas.Series or numpy.ndarray + + """ + with open_file(file, mode='rb') as reader: + floats = np.fromfile(reader, dtype=np.float32) + n_words = len(floats) + matrix_size = _infer_fortran_zones(n_words) + floats.shape = matrix_size, matrix_size + 1 + + data = floats[:, 1:] + + if zones is None: + if tall: + n = np.prod(data.shape) + data.shape = n + return data + return data + + if isinstance(zones, (int, np.int_)): + data = data[:zones, :zones] + + if tall: + data.shape = zones * zones + return data + return data + elif zones is None: + return data + + zones = pd.Index(zones) + n = len(zones) + data = data[:n, :n] + + matrix = pd.DataFrame(data, index=zones, columns=zones) + + return matrix.stack() if tall else matrix + + +def to_fortran(matrix, file, n_columns=None, min_index=1, force_square=True): + """ + Reads a FORTRAN-friendly .bin file (a.k.a. 'simple binary format'), in a square format. + + Args: + matrix (Union[pandas.DataFrame, pandas.Series, numpy.ndarray]): The matrix to write to disk. If a Series is + given, it MUST have a MultiIndex with exactly 2 levels to unstack. + file (Union[basestring, File]): The path or file handler to write to. + n_columns (int, optional): Defaults to ``None``. Specifies a desired "width" of the matrix file. 
For example, + ``n_columns=4000`` on a 3500x3500 matrix will pad the width with 500 extra columns containing 0. If ``None`` + is provided or the value is <= the width of the given matrix, no padding will be performed. + min_index (int, optional): Defaults to ``1``. The lowest numbered row. Used when slicing matrices + force_square (bool, optional): Defaults to ``True``. + + """ + assert min_index >= 1 + array = coerce_matrix(matrix, force_square=force_square) + + if n_columns is not None and n_columns > array.shape[1]: + extra_columns = n_columns - array.shape[1] + array = expand_array(array, extra_columns, axis=1) + + with open_file(file, mode='wb') as writer: + rows, columns = array.shape + temp = np.zeros([rows, columns + 1], dtype=np.float32) + temp[:, 1:] = array + + index = np.arange(min_index, rows + 1, dtype=np.int32) + # Mask the integer binary representation as floating point + index_as_float = np.frombuffer(index.tobytes(), dtype=np.float32) + temp[:, 0] = index_as_float + + temp.tofile(writer) diff --git a/balsa/routines/io/fortran.pyi b/balsa/routines/io/fortran.pyi new file mode 100644 index 0000000..a22f9d5 --- /dev/null +++ b/balsa/routines/io/fortran.pyi @@ -0,0 +1,18 @@ +from typing import Any, Union + +import pandas as pd +import numpy as np + +from .common import file_type + + +def read_fortran_rectangle(file: file_type, n_columns: int, zones: pd.Index=None, tall: bool=False, + reindex_rows: bool=False, fill_value: Any=None + ) -> Union[pd.Series, pd.DataFrame, np.ndarray]: pass + +def read_fortran_square(file: file_type, zones: pd.Index=None, tall: bool=False + ) -> Union[pd.Series, pd.DataFrame, np.ndarray]: pass + + +def to_fortran(matrix: Union[pd.Series, pd.DataFrame, np.ndarray], file: file_type, n_columns: int=None, + min_index: int=1, forec_square: bool=True): pass \ No newline at end of file diff --git a/balsa/routines/io/inro.py b/balsa/routines/io/inro.py new file mode 100644 index 0000000..779b95a --- /dev/null +++ b/balsa/routines/io/inro.py @@ -0,0 +1,211 @@ +import numpy as np +import pandas as pd + +from .common import open_file, coerce_matrix + + +def read_mdf(file, raw=False, tall=False): + """ + Reads Emme's official matrix "binary serialization" format, created using ``inro.emme.matrix.MatrixData.save()``. + There is no official extension for this type of file; '.mdf' is recommended. '.emxd' is also sometimes encountered. + + Args: + file (Union[str, File, Path]): The file to read. + raw (bool, optional): Defaults to ``False``. If ``True``, returns an unlabelled ndarray. Otherwise, a DataFrame + will be returned. + tall (bool, optional): Defaults to ``False``. If ``True``, a 1D data structure will be returned. If + ``raw=False``, a Series will be returned, otherwise a 1D ndarray. + Returns: + numpy.ndarray or pandas.DataFrame: + The matrix stored in the file. + """ + with open_file(file, mode='rb') as file_handler: + magic, version, dtype_index, ndim = np.fromfile(file_handler, np.uint32, count=4) + + if magic != 0xC4D4F1B2 or version != 1 or not (0 < dtype_index <= 4) or not (0 < ndim <= 2): + raise IOError("Unexpected file header: magic number: %X, version: %d, data type: %d, dimensions: %d." 
+ % (magic, version, dtype_index, ndim)) + + shape = np.fromfile(file_handler, np.uint32, count=ndim) + + index_list = [] + for n_items in shape: + indices = np.fromfile(file_handler, np.int32, n_items) + index_list.append(indices) + + dtype = {1: np.float32, 2: np.float64, 3: np.int32, 4: np.uint32}[dtype_index] + flat_length = shape.prod() # Multiply the shape tuple + matrix = np.fromfile(file_handler, dtype, count=flat_length) + + if raw and tall: return matrix + + matrix.shape = shape + + if raw: return matrix + + if ndim == 1: + return pd.Series(matrix, index=index_list[0]) + elif ndim == 2: + matrix = pd.DataFrame(matrix, index=index_list[0], columns=index_list[1]) + + return matrix.stack() if tall else matrix + + raise NotImplementedError() # This should never happen + + +def to_mdf(matrix, file): + """ + Writes a matrix to Emme's official "binary serialization" format, to load using + ``inro.emme.matrix.MatrixData.load()``. There is no official extension for this type of file; '.mdf' is recommended. + + Args: + matrix (Union[pandas.DataFrame, panda.Series]): The matrix to write to disk. If a Series is given, it MUST have + a MultiIndex with exactly 2 levels to unstack. + file (Union[basestring, File, Path]): The path or file handler to write to. + """ + if isinstance(matrix, pd.Series): + row_index = matrix.index.get_level_values(0).unique() + column_index = matrix.index.get_level_values(1).unique() + elif isinstance(matrix, pd.DataFrame): + row_index = matrix.index + column_index = matrix.columns + else: + raise TypeError("Only labelled matrix objects are supported") + + with open_file(file, mode='wb') as writer: + data = coerce_matrix(matrix, allow_raw=False) + + np.array([0xC4D4F1B2, 1, 1, 2], dtype=np.uint32).tofile(writer) # Header + np.array(data.shape, dtype=np.uint32).tofile(writer) # Shape + + np.array(row_index, dtype=np.int32).tofile(writer) + np.array(column_index, dtype=np.int32).tofile(writer) + + data.tofile(writer) + + +def peek_mdf(file, as_index=True): + """ + Partially opens an MDF file to get the zone system of its rows and its columns. + + Args: + file (Union[str, File, Path]): The file to read. + as_index (bool, optional): Defaults to ``True``. Set to ``True`` to return a pandas.Index object rather than + List[int] + + Returns: + List[int] or pandas.Index: + One item for each dimension. If ``as_index=True``, the items will be pandas.Index objects, otherwise they + will be List[int] + + """ + with open_file(file, mode='rb') as file_handler: + magic, version, dtype_index, ndim = np.fromfile(file_handler, np.uint32, count=4) + + if magic != 0xC4D4F1B2 or version != 1 or not (0 < dtype_index <= 4) or not (0 < ndim <= 2): + raise IOError("Unexpected file header: magic number: %X, version: %d, data type: %d, dimensions: %d." + % (magic, version, dtype_index, ndim)) + + shape = np.fromfile(file_handler, np.uint32, count=ndim) + + index_list = [] + for n_items in shape: + indices = np.fromfile(file_handler, np.int32, n_items) + index_list.append(indices) + + if not as_index: return index_list + + return [pd.Index(zones) for zones in index_list] + + +def read_emx(file, zones=None, tall=False): + """ + Reads an "internal" Emme matrix (found in `/Database/emmemat`); with an '.emx' extension. This data + format does not contain information about zones. Its size is determined by the dimensions of the Emmebank + (``Emmebank.dimensions['centroids']``), regardless of the number of zones actually used in all scenarios. 
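+    When reading, the matrix dimension is therefore inferred from the file length, which must contain a square number
+    of 4-byte floats.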
+
+    Args:
+        file (Union[str, File, Path]): The file to read.
+        zones (Union[pandas.Index, int], optional): Defaults to ``None``. An Index or Iterable will be interpreted as
+            the zone labels for the matrix rows and columns; returning a DataFrame or Series (depending on ``tall``).
+            If an integer is provided, the returned ndarray will be truncated to this 'number of zones'. Otherwise,
+            the returned ndarray will be sized to the maximum number of zones the Emmebank is dimensioned for.
+        tall (bool, optional): Defaults to ``False``. If True, a 1D data structure will be returned. If ``zones``
+            is provided, a Series will be returned, otherwise a 1D ndarray.
+
+    Returns:
+        pandas.DataFrame, pandas.Series or numpy.ndarray
+
+    Examples:
+        For a project with 20 zones:
+
+        >>> matrix = read_emx("Database/emmemat/mf1.emx")
+        >>> type(matrix), matrix.shape
+        (<class 'numpy.ndarray'>, (20, 20))
+
+        >>> matrix = read_emx("Database/emmemat/mf1.emx", zones=10)
+        >>> type(matrix), matrix.shape
+        (<class 'numpy.ndarray'>, (10, 10))
+
+        >>> matrix = read_emx("Database/emmemat/mf1.emx", zones=range(10))
+        >>> type(matrix), matrix.shape
+        (<class 'pandas.core.frame.DataFrame'>, (10, 10))
+
+        >>> matrix = read_emx("Database/emmemat/mf1.emx", zones=range(10), tall=True)
+        >>> type(matrix), matrix.shape
+        (<class 'pandas.core.series.Series'>, (100,))
+
+    """
+    with open_file(file, mode='rb') as reader:
+        data = np.fromfile(reader, dtype=np.float32)
+
+        n = int(len(data) ** 0.5)
+        assert len(data) == n ** 2
+
+        if zones is None and tall:
+            return data
+
+        data.shape = n, n
+
+        if isinstance(zones, (int, np.int_)):
+            data = data[:zones, :zones]
+
+            if tall:
+                data.shape = zones * zones
+                return data
+            return data
+        elif zones is None:
+            return data
+
+        zones = pd.Index(zones)
+        n = len(zones)
+        data = data[:n, :n]
+
+        matrix = pd.DataFrame(data, index=zones, columns=zones)
+
+        return matrix.stack() if tall else matrix
+
+
+def to_emx(matrix, file, emmebank_zones):
+    """
+    Writes an "internal" Emme matrix (found in `/Database/emmemat`); with an '.emx' extension. The number
+    of zones that the Emmebank is dimensioned for must be known in order for the file to be written correctly.
+
+    Args:
+        matrix (Union[pandas.DataFrame, pandas.Series, numpy.ndarray]): The matrix to write to disk. If a Series is
+            given, it MUST have a MultiIndex with exactly 2 levels to unstack.
+        file (Union[basestring, File]): The path or file handler to write to.
+        emmebank_zones (int): The number of zones the target Emmebank is dimensioned for.
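+
+    Example:
+        A minimal sketch (the matrix path and dimensions below are hypothetical)::
+
+            >>> zones = pd.Index(range(1, 2001))
+            >>> demand = pd.DataFrame(0.0, index=zones, columns=zones)
+            >>> to_emx(demand, "Database/emmemat/mf10.emx", emmebank_zones=4000)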
+ """ + assert emmebank_zones > 0 + + with open_file(file, mode='wb') as writer: + data = coerce_matrix(matrix) + n = data.shape[0] + if n > emmebank_zones: + out = data[:emmebank_zones, :emmebank_zones].astype(np.float32) + else: + out = np.zeros([emmebank_zones, emmebank_zones], dtype=np.float32) + out[:n, :n] = data + + out.tofile(writer) diff --git a/balsa/routines/io/inro.pyi b/balsa/routines/io/inro.pyi new file mode 100644 index 0000000..73bd7a6 --- /dev/null +++ b/balsa/routines/io/inro.pyi @@ -0,0 +1,17 @@ +from typing import Union, List, Any, Iterable, Dict, Tuple + +import pandas as pd +import numpy as np + +from .common import file_type + +def read_mdf(file: file_type, raw: bool=False, tall: bool=False) -> Union[pd.DataFrame, np.ndarray]: pass + +def to_mdf(matrix: Union[pd.Series, pd.DataFrame], file: file_type): pass + +def peek_mdf(file: file_type, as_index: bool=True) -> Union[List[pd.Index], List[List[int]]]: pass + +def read_emx(file: file_type, zones: pd.Index=None, tall: bool=False) -> Union[pd.Series, pd.DataFrame, np.ndarray]: + pass + +def to_emx(matrix: Union[pd.Series, pd.DataFrame, np.ndarray], file: file_type, emmebank_zones: int): pass diff --git a/balsa/routines/io/omx.py b/balsa/routines/io/omx.py new file mode 100644 index 0000000..ab0c775 --- /dev/null +++ b/balsa/routines/io/omx.py @@ -0,0 +1,221 @@ +""" +OMX format +========== + +The Python library openmatrix already exists, but doesn't provide a lot of interoperability with Pandas. Balsa provides +wrapper functions that produce Pandas DataFrames and Series directly from OMX files. +""" + +import numpy as np +import pandas as pd +from six import iteritems, itervalues, iterkeys + +from ..matrices import fast_unstack + +try: + import openmatrix as omx +except ImportError: + omx = None + +if omx is not None: + + def read_omx(file, matrices=None, mapping=None, raw=False, tall=False, squeeze=True): + """ + Reads Open Matrix (OMX) files. An OMX file can contain multiple matrices, so this function + typically returns a Dict. + + Args: + file: OMX file from which to read. Cannot be an open file handler. + matrices: List of matrices to read from the file. If None, all matrices will be read. + mapping: The zone number mapping to use, if known in advance. If None, and the OMX file only contains + one mapping, then that one is used. No mapping is read if raw is False. + raw: If True, matrices will be returned as raw Numpy arrays. Otherwise, Pandas objects are returned + tall: If True, matrices will be returned in 1D format (pd.Series if raw is False). Otherwise, a 2D object + is returned. + squeeze: If True, and the file contains exactly one matrix, return that matrix instead of a Dict. + + Returns: + The matrix, or matrices contained in the OMX file. 
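+
+        Example:
+            A minimal sketch (the file and matrix names below are hypothetical)::
+
+                >>> skims = read_omx("skims.omx", matrices=['auto_time', 'auto_toll'])
+                >>> skims['auto_time'].shape
+                (2000, 2000)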
+ + """ + file = str(file) + with omx.open_file(file, mode='r') as omx_file: + if mapping is None and not raw: + all_mappings = omx_file.list_mappings() + assert len(all_mappings) == 1 + mapping = all_mappings[0] + + if matrices is None: + matrices = sorted(omx_file.list_matrices()) + else: + matrices = sorted(matrices) + + if not raw: + labels = pd.Index(omx_file.mapping(mapping).keys()) + if tall: + labels = pd.MultiIndex.from_product([labels, labels], names=['o', 'd']) + + return_value = {} + for matrix_name in matrices: + wrapper = omx_file[matrix_name] + matrix = wrapper.read() + + if tall: + n = matrix.shape[0] * matrix.shape[1] + matrix.shape = n + + if not raw: + if tall: matrix = pd.Series(matrix, index=labels) + else: matrix = pd.DataFrame(matrix, index=labels, columns=labels) + + matrix.name = matrix_name + + return_value[matrix_name] = matrix + + if len(matrices) == 1 and squeeze: + return return_value[matrices[0]] + return return_value + + + def to_omx(file, matrices, zone_index=None, title='', descriptions=None, attrs=None, mapping='zone_numbers'): + """ + Creates a new (or overwrites an old) OMX file with a collection of matrices. + + Args: + file: OMX to write. + matrices: Collection of matrices to write. MUST be a dict, to permit the encoding of matrix metadata, + and must contain the same types: all Numpy arrays, all Series, or all DataFrames. Checking is done to + ensure that all items have the same shape and labels. + zone_index: Override zone labels to use. Generally only useful if writing a dict of raw Numpy arrays. + title: The title saved in the OMX file. + descriptions: A dict of descriptions (one for each given matrix), or None to not use. + attrs: A dict of dicts (one for each given matrix), or None to not use + mapping: Name of the mapping internal to the OMX file + """ + + matrices, zone_index = _prep_matrix_dict(matrices, zone_index) + + if descriptions is None: + descriptions = {name: '' for name in iterkeys(matrices)} + if attrs is None: + attrs = {name: None for name in iterkeys(matrices)} + + file = str(file) # Converts from Path + with omx.open_file(file, mode='w', title=title) as omx_file: + omx_file.create_mapping(mapping, zone_index.tolist()) + + for name, array in iteritems(matrices): + description = descriptions[name] + attr = attrs[name] + + omx_file.create_matrix(name, obj=np.ascontiguousarray(array), title=description, attrs=attr) + + return + + def _prep_matrix_dict(matrices, desired_zone_index): + collection_type = _check_types(matrices) + + if collection_type == 'RAW': + checked, n = _check_raw_matrices(matrices) + zone_index = pd.Index(range(n)) + elif collection_type == 'SERIES': + checked, zone_index = _check_matrix_series(matrices) + elif collection_type == 'FRAME': + checked, zone_index = _check_matrix_frames(matrices) + else: + raise NotImplementedError(collection_type) + + if desired_zone_index is not None: + assert desired_zone_index.equals(zone_index) + + return checked, zone_index + + def _check_types(matrices): + gen = iter(itervalues(matrices)) + first = next(gen) + + item_type = 'RAW' + if isinstance(first, pd.Series): item_type = 'SERIES' + elif isinstance(first, pd.DataFrame): item_type = 'FRAME' + + msg = "All items must be the same type" + + for item in gen: + if item_type == 'FRAME': + assert isinstance(item, pd.DataFrame), msg + elif item_type == 'SERIES': + assert isinstance(item, pd.Series), msg + else: + assert isinstance(item, np.ndarray), msg + + return item_type + + def _check_raw_matrices(matrices): + gen = 
iter(iteritems(matrices)) + name, matrix = next(gen) + + n_dim = len(matrix.shape) + if n_dim == 1: + shape = matrix.shape[0] + n = int(shape ** 0.5) + assert n * n == shape, "Only tall matrices that decompose to square shapes are permitted." + matrix = matrix[...] + matrix.shape = n, n + elif n_dim == 2: + n, cols = matrix.shape + assert n == cols, "Only square matrices are permitted" + else: + raise TypeError("Only 1D and 2D arrays can be saved to OMX files") + + retval = {name: matrix} + for name, matrix in gen: + assert len(matrix.shape) == n_dim + + if n_dim == 1: + assert matrix.shape[0] == (n * n) + matrix = matrix[...] + matrix.shape = n, n + else: + assert matrix.shape == (n, n) + + retval[name] = matrix + + return retval, n + + def _check_matrix_series(matrices): + gen = iter(iteritems(matrices)) + name, matrix = next(gen) + + tall_index = matrix.index + assert tall_index.nlevels == 2 + matrix = matrix.unstack() + zone_index = matrix.index + assert zone_index.equals(matrix.columns) + + retval = {name: matrix.values} + for name, matrix in gen: + assert tall_index.equals(matrix.index) + matrix = fast_unstack(matrix, zone_index, zone_index).values + retval[name] = matrix + return retval, zone_index + + def _check_matrix_frames(matrices): + gen = iter(iteritems(matrices)) + name, matrix = next(gen) + + zone_index = matrix.index + assert zone_index.equals(matrix.columns) + + retval = {name: matrix.values} + for name, matrix in gen: + assert zone_index.equals(matrix.index) + assert zone_index.equals(matrix.columns) + retval[name] = matrix.values + return retval, zone_index + +else: + def read_omx(*args, **kwargs): + raise NotImplementedError() + + def to_omx(*args, **kwargs): + raise NotImplementedError() diff --git a/balsa/routines/io/omx.pyi b/balsa/routines/io/omx.pyi new file mode 100644 index 0000000..9988f9d --- /dev/null +++ b/balsa/routines/io/omx.pyi @@ -0,0 +1,14 @@ +from typing import Union, Iterable, Dict + +import pandas as pd +import numpy as np + +from .common import file_type + +MATRIX_TYPES = Union[pd.DataFrame, pd.Series, np.ndarray] + +def read_omx(file: file_type, matrices: Iterable[str]=None, mapping: str=None, raw=False, tall=False, + squeeze=True) -> Union[MATRIX_TYPES, Dict[str, MATRIX_TYPES]]: pass + +def to_omx(file: str, matrices: Dict[str, MATRIX_TYPES], zone_index: pd.Index=None, title: str='', + descriptions: Dict[str, str]=None, attrs: Dict[str, dict]=None, mapping: str='zone_numbers'): pass \ No newline at end of file diff --git a/balsa/routines/matrices.py b/balsa/routines/matrices.py new file mode 100644 index 0000000..d78e3ca --- /dev/null +++ b/balsa/routines/matrices.py @@ -0,0 +1,606 @@ +from __future__ import division as _division + +import multiprocessing as _mp +import numba as _nb +import numpy as _np +import pandas as _pd +from pandas import Series, DataFrame, Index +import numexpr as _ne + +EPS = 1.0e-7 + + +def matrix_balancing_1d(m, a, axis): + """ + Balances a matrix using a single constraint. + + Args: + m (numpy.ndarray): The matrix (a 2-dimensional ndarray) to be balanced + a (numpy.ndarray): The totals vector (a 1-dimensional ndarray) constraint + axis (int): Direction to constrain (0 = along columns, 1 = along rows) + + Return: + numpy.ndarray: A balanced matrix + + """ + + assert axis in [0, 1], "axis must be either 0 or 1" + assert m.ndim == 2, "`m` must be a two-dimensional matrix" + assert a.ndim == 1, "`a` must be an one-dimensional vector" + assert m.shape[axis] == a.shape[0], "axis %d of matrices 'm' and 'a' must be the same." 
% axis + + return _balance(m, a, axis) + + +def matrix_balancing_2d(m, a, b, totals_to_use='raise', max_iterations=1000, rel_error=0.0001, n_procs=1): + """ + Balances a two-dimensional matrix using iterative proportional fitting. + + Args: + m (numpy.ndarray): The matrix (a 2-dimensional ndarray) to be balanced + a (numpy.ndarray): The row totals (a 1-dimensional ndarray) to use for balancing + b (numpy.ndarray): The column totals (a 1-dimensional ndarray) to use for balancing + totals_to_use (str, optional): Defaults to ``'raise'``. Describes how to scale the row and column totals if + their sums do not match. Must be one of ['rows', 'columns', 'average', 'raise']. + - rows: scales the columns totals so that their sums matches the row totals + - columns: scales the row totals so that their sums matches the column totals + - average: scales both row and column totals to the average value of their sums + - raise: raises an Exception if the sums of the row and column totals do not match + max_iterations (int, optional): Defaults to ``1000``. Maximum number of iterations + rel_error (float, optional): Defaults to ``1.0E-4``. Relative error stopping criteria + n_procs (int, optional): Defaults to ``1``. Number of processors for parallel computation. (Not used) + + Return: + Tuple[numpy.ndarray, float, int]: The balanced matrix, residual, and n_iterations + """ + max_iterations = int(max_iterations) + n_procs = int(n_procs) + + # Test if matrix is Pandas DataFrame + data_type = '' + if isinstance(m, _pd.DataFrame): + data_type = 'pd' + m_pd = m + m = m_pd.values + + if isinstance(a, _pd.Series) or isinstance(a, _pd.DataFrame): + a = a.values + if isinstance(b, _pd.Series) or isinstance(b, _pd.DataFrame): + b = b.values + + # ################################################################################## + # Validations: + # - m is an MxM square matrix, a and b are vectors of size M + # - totals_to_use is one of ['rows', 'columns', 'average'] + # - the max_iterations is a +'ve integer + # - rel_error is a +'ve float between 0 and 1 + # - the n_procs is a +'ve integer between 1 and the number of available processors + # ################################################################################## + valid_totals_to_use = ['rows', 'columns', 'average', 'raise'] + assert m.ndim == 2 and m.shape[0] == m.shape[1], "m must be a two-dimensional square matrix" + assert a.ndim == 1 and a.shape[0] == m.shape[0], \ + "'a' must be a one-dimensional array, whose size matches that of 'm'" + assert b.ndim == 1 and b.shape[0] == m.shape[0], \ + "'a' must be a one-dimensional array, whose size matches that of 'm'" + assert totals_to_use in valid_totals_to_use, "totals_to_use must be one of %s" % valid_totals_to_use + assert max_iterations >= 1, "max_iterations must be integer >= 1" + assert 0 < rel_error < 1.0, "rel_error must be float between 0.0 and 1.0" + assert 1 <= n_procs <= _mp.cpu_count(), \ + "n_procs must be integer between 1 and the number of processors (%d) " % _mp.cpu_count() + if n_procs > 1: + raise NotImplementedError("Multiprocessing capability is not implemented yet.") + + # Scale row and column totals, if required + a_sum = a.sum() + b_sum = b.sum() + if not _np.isclose(a_sum, b_sum): + if totals_to_use == 'rows': + b = _np.multiply(b, a_sum / b_sum) + elif totals_to_use == 'columns': + a = _np.multiply(a, b_sum / a_sum) + elif totals_to_use == 'average': + avg_sum = 0.5 * (a_sum + b_sum) + a = _np.multiply(a, avg_sum / a_sum) + b = _np.multiply(b, avg_sum / b_sum) + else: + raise 
RuntimeError("a and b vector totals do not match.") + + initial_error = _calc_error(m, a, b) + err = 1.0 + i = 0 + while err > rel_error: + if i > max_iterations: + # todo: convert to logger, if possible + print("Matrix balancing did not converge") + break + m = _balance(m, a, 1) + m = _balance(m, b, 0) + err = _calc_error(m, a, b) / initial_error + i += 1 + + if data_type == 'pd': + new_df = _pd.DataFrame(m, index=m_pd.index, columns=m_pd.columns) + return new_df, err, i + else: + return m, err, i + + +def _balance(matrix, tot, axis): + """ + Balances a matrix using a single constraint. + + Args: + matrix (numpy.ndarray): The matrix to be balanced + tot (numpy.ndarray): The totals constraint + axis (int): Direction to constrain (0 = along columns, 1 = along rows) + + Return: + numpy.ndarray: The balanced matrix + + """ + sc = tot / (matrix.sum(axis) + EPS) + sc = _np.nan_to_num(sc) # replace divide by 0 errors from the prev. line + if axis: # along rows + matrix = _np.multiply(matrix.T, sc).T + else: # along columns + matrix = _np.multiply(matrix, sc) + return matrix + + +def _calc_error(m, a, b): + row_sum = _np.absolute(a - m.sum(1)).sum() + col_sum = _np.absolute(b - m.sum(0)).sum() + return row_sum + col_sum + + +@_nb.jit(_nb.float64[:, :](_nb.float64[:, :], _nb.int64)) +def _nbf_bucket_round(a_, decimals=0): + a = a_.ravel() + b = _np.copy(a) + + residual = 0 + for i in range(0, len(b)): + b[i] = _np.round(a[i] + residual, decimals) + residual += a[i] - b[i] + + return b.reshape(a_.shape) + + +def matrix_bucket_rounding(m, decimals=0): + """ + Bucket rounds to the given number of decimals. + + Args: + m (Union[numpy.ndarray, pandas.DataFrame]): The matrix to be rounded + decimals (int, optional): Defaults to ``0``. Number of decimal places to round to. If decimals is negative, it + specifies the number of positions to the left of the decimal point. + + Return: + numpy.ndarray: The rounded matrix + + """ + + # Test if matrix is Pandas DataFrame + data_type = '' + if isinstance(m, _pd.DataFrame): + data_type = 'pd' + m_pd = m + m = m_pd.values + + decimals = int(decimals) + + # I really can't think of a way to vectorize bucket rounding, + # so here goes the slow for loop + b = _nbf_bucket_round(m, decimals) + + if decimals <= 0: + b = b.astype(_np.int32) + + if data_type == 'pd': + new_df = _pd.DataFrame(b.reshape(m.shape), index=m_pd.index, columns=m_pd.columns) + return new_df + else: + return b.reshape(m.shape) + + +def split_zone_in_matrix(base_matrix, old_zone, new_zones, proportions): + """ + Takes a zone in a matrix (as a DataFrame) and splits it into several new zones, prorating affected cells by a vector + of proportions (one value for each new zone). The old zone is removed. + + Args: + base_matrix (pandas.DataFrame): The matrix to re-shape + old_zone (int): The original zone to split + new_zones (List[int]): The list of new zones to add + proportions (List[float]): The proportions to split the original zone to. 
The list must be the same length as + ``new_zones`` and sum to 1.0 + + Returns: + pandas.DataFrame: The re-shaped matrix + + """ + + assert isinstance(base_matrix, _pd.DataFrame), "Base matrix must be a DataFrame" + + old_zone = int(old_zone) + new_zones = _np.array(new_zones, dtype=_np.int32) + proportions = _np.array(proportions, dtype=_np.float64) + + assert len(new_zones) == len(proportions), "Proportion array must be the same length as the new zone array" + assert len(new_zones.shape) == 1, "New zones must be a vector" + assert base_matrix.index.equals(base_matrix.columns), "DataFrame is not a matrix" + assert _np.isclose(proportions.sum(), 1.0), "Proportions must sum to 1.0 " + + n_new_zones = len(new_zones) + + intersection_index = base_matrix.index.drop(old_zone) + new_index = intersection_index + for z in new_zones: new_index = new_index.insert(-1, z) + new_index = _pd.Index(sorted(new_index)) + + new_matrix = _pd.DataFrame(0, index=new_index, columns=new_index, dtype=base_matrix.dtypes.iat[0]) + + # 1. Copy over the values from the regions of the matrix not being updated + new_matrix.loc[intersection_index, intersection_index] = base_matrix + + # 2. Prorate the row corresponding to the dropped zone + # This section (and the next) works with the underlying Numpy arrays, since they handle + # broadcasting better than Pandas does + original_row = base_matrix.loc[old_zone, intersection_index] + original_row = original_row.values[:] # Make a shallow copy to preserve shape of the original data + original_row.shape = 1, len(intersection_index) + proportions.shape = n_new_zones, 1 + result = _pd.DataFrame(original_row * proportions, index=new_zones, columns=intersection_index) + new_matrix.loc[result.index, result.columns] = result + + # 3. Proprate the column corresponding to the dropped zone + original_column = base_matrix.loc[intersection_index, old_zone] + original_column = original_column.values[:] + original_column.shape = len(intersection_index), 1 + proportions.shape = 1, n_new_zones + result = _pd.DataFrame(original_column * proportions, index=intersection_index, columns=new_zones) + new_matrix.loc[result.index, result.columns] = result + + # 4. Expand the old intrazonal + proportions_copy = proportions[:,:] + proportions_copy.shape = 1, n_new_zones + proportions.shape = n_new_zones, 1 + + intrzonal_matrix = proportions * proportions_copy + intrazonal_scalar = base_matrix.at[old_zone, old_zone] + + result = _pd.DataFrame(intrazonal_scalar * intrzonal_matrix, index=new_zones, columns=new_zones) + new_matrix.loc[result.index, result.columns] = result + + return new_matrix + + +def aggregate_matrix(matrix, groups=None, row_groups=None, col_groups=None, aggfunc=_np.sum): + """ + Aggregates a matrix based on mappings provided for each axis, using a specified aggregation function. + + Args: + matrix (Union[pandas.DataFrame, pandas.Series]): Matrix data to aggregate. DataFrames and Series with 2-level + indices are supported + groups: Syntactic sugar to specify both row_groups and col_groups to use the same grouping series. + row_groups: Groups for the rows. If aggregating a DataFrame, this must match the index of the matrix. For a + "tall" matrix, this series can match either the "full" index of the series, or it can match the first level + of the matrix (it would be the same as if aggregating a DataFrame). Alternatively, an array can be provided, + but it must be the same length as the DataFrame's index, or the full length of the Series. + col_groups: Groups for the columns. 
If aggregating a DataFrame, this must match the columns of the matrix. For a + "tall" matrix, this series can match either the "full" index of the series, or it can match the second level + of the matrix (it would be the same as if aggregating a DataFrame). Alternatively, an array can be provided, + but it must be the same length as the DataFrame's columns, or the full length of the Series. + aggfunc: The aggregation function to use. Default is sum. + + Returns: + pandas.Series or pandas.DataFrame: + The aggregated matrix, in the same type as was provided, e.g. Series -> Series, DataFrame -> DataFrame. + + Example: + + matrix: + + +-------+---+---+---+---+---+---+---+ + | | 1 | 2 | 3 | 4 | 5 | 6 | 7 | + +=======+===+===+===+===+===+===+===+ + | **1** | 2 | 1 | 9 | 6 | 7 | 8 | 5 | + +-------+---+---+---+---+---+---+---+ + | **2** | 4 | 1 | 1 | 4 | 8 | 7 | 6 | + +-------+---+---+---+---+---+---+---+ + | **3** | 5 | 8 | 5 | 3 | 5 | 9 | 4 | + +-------+---+---+---+---+---+---+---+ + | **4** | 1 | 1 | 2 | 9 | 4 | 9 | 9 | + +-------+---+---+---+---+---+---+---+ + | **5** | 6 | 3 | 4 | 6 | 9 | 9 | 3 | + +-------+---+---+---+---+---+---+---+ + | **6** | 7 | 2 | 5 | 8 | 2 | 5 | 9 | + +-------+---+---+---+---+---+---+---+ + | **7** | 3 | 1 | 8 | 6 | 3 | 5 | 6 | + +-------+---+---+---+---+---+---+---+ + + groups: + + +-------+---+ + | **1** | A | + +-------+---+ + | **2** | B | + +-------+---+ + | **3** | A | + +-------+---+ + | **4** | A | + +-------+---+ + | **5** | C | + +-------+---+ + | **6** | C | + +-------+---+ + | **7** | B | + +-------+---+ + + ``new_matrix = aggregate_matrix(matrix, groups=groups)`` + + new_matrix: + + +-------+----+----+----+ + | | A | B | C | + +=======+====+====+====+ + | **A** | 42 | 28 | 42 | + +-------+----+----+----+ + | **B** | 26 | 14 | 23 | + +-------+----+----+----+ + | **C** | 36 | 17 | 25 | + +-------+----+----+----+ + + """ + if groups is not None: + row_groups = groups + col_groups = groups + + assert row_groups is not None, "Row groups must be specified" + assert col_groups is not None, "Column groups must be specified" + + if isinstance(matrix, _pd.DataFrame): + row_groups = _prep_square_index(matrix.index, row_groups) + col_groups = _prep_square_index(matrix.columns, col_groups) + + return _aggregate_frame(matrix, row_groups, col_groups, aggfunc) + elif isinstance(matrix, _pd.Series): + assert matrix.index.nlevels == 2 + + row_groups, col_groups = _prep_tall_index(matrix.index, row_groups, col_groups) + return _aggregate_series(matrix, row_groups, col_groups, aggfunc) + else: + raise NotImplementedError() + + +def _prep_tall_index(target_index, row_aggregator, col_aggregator): + + if isinstance(row_aggregator, _pd.Series): + if row_aggregator.index.equals(target_index): + row_aggregator = row_aggregator.values + else: + assert target_index.levels[0].equals(row_aggregator.index) + reindexed = row_aggregator.reindex(target_index, level=0) + row_aggregator = reindexed.values + else: + assert len(row_aggregator) == len(target_index) + row_aggregator = _np.array(row_aggregator) + + if isinstance(col_aggregator, _pd.Series): + if col_aggregator.index.equals(target_index): + col_aggregator = col_aggregator.values + else: + assert target_index.levels[1].equals(col_aggregator.index) + reindexed = col_aggregator.reindex(target_index, level=1) + col_aggregator = reindexed.values + else: + assert len(col_aggregator) == len(target_index) + col_aggregator = _np.array(col_aggregator) + + return row_aggregator, col_aggregator + + +def _prep_square_index(index, 
aggregator): + if isinstance(aggregator, _pd.Series): + assert aggregator.index.equals(index) + return aggregator.values + else: + assert len(aggregator) == len(index) + return _np.array(aggregator) + + +def _aggregate_frame(matrix, row_aggregator, col_aggregator, aggfunc): + return matrix.groupby(row_aggregator, axis=0).aggregate(aggfunc).groupby(col_aggregator, axis=1).aggregate(aggfunc) + + +def _aggregate_series(matrix, row_aggregator, col_aggregator, aggfunc): + return matrix.groupby([row_aggregator, col_aggregator]).aggregate(aggfunc) + + +def fast_stack(frame, multi_index, deep_copy=True): + """ + Performs the same action as ``DataFrame.stack()``, but provides better performance when the target stacked index is + known before hand. Useful in converting a lot of matrices from "wide" to "tall" format. The inverse of + ``fast_unstack()``. + + Notes: + This function does not check that the entries in the multi_index are compatible with the index and columns of + the source DataFrame, only that the lengths are compatible. It can therefore be used to assign a whole new set + of labels to the result. + + Args: + frame (pandas.DataFrame): The DataFrame to stack. + multi_index (pandas.Index): The 2-level MultiIndex known ahead-of-time. + deep_copy (bool, optional): Defaults to ``True``. A flag indicating if the returned Series should be a view of + the underlying data (deep_copy=False) or a copy of it (deep_copy=True). A deep copy takes a little longer to + convert and takes up more memory but preserves the original data of the DataFrame. The default value of True + is recommended for most uses. + + Returns: + pandas.Series: The stacked data. + + """ + + assert multi_index.nlevels == 2, "Target index must be a MultiIndex with exactly 2 levels" + assert len(multi_index) == len(frame.index) * len(frame.columns), "Target index and source index and columns do " \ + "not have compatible lengths" + + array = _np.ascontiguousarray(frame.values) + array = array.copy() if deep_copy else array[:, :] + array.shape = len(frame.index) * len(frame.columns) + + return Series(array, index=multi_index) + + +def fast_unstack(series, index, columns, deep_copy=True): + """ + Performs the same action as ``DataFrame.unstack()``, but provides better performance when the target unstacked index + and columns are known before hand. Useful in converting a lot of matrices from "tall" to "wide" format. The inverse + of ``fast_stack()``. + + Notes: + This function does not check that the entries in index and columns are compatible with the MultiIndex of the + source Series, only that the lengths are compatible. It can therefore be used to assign a whole new set of + labels to the result. + + Args: + series (pandas.Series): The Series with 2-level MultiIndex to unstack + index (pandas.Index): The row index known ahead-of-time + columns (pandas.Index): The columns index known ahead-of-time. + deep_copy (bool): Defaults to ``True``. A flag indicating if the returned DataFrame should be a view of the + underlying data (deep_copy=False) or a copy of it (deep_copy=True). A deep copy takes a little longer to + convert and takes up more memory but preserves the original data of the Series. The default value of True is + recommended for most uses. 
+ + Returns: + pandas.DataFrame: The unstacked data + + """ + + assert series.index.nlevels == 2, "Source Series must have an index with exactly 2 levels" + assert len(series) == len(index) * len(columns), "Source index and target index and columns do not have " \ + "compatible lengths" + + array = series.values.copy() if deep_copy else series.values[:] + array.shape = len(index), len(columns) + + return DataFrame(array, index=index, columns=columns) + + +def _check_disaggregation_input(mapping: Series, proportions: Series) -> _np.ndarray: + assert mapping is not None + assert proportions is not None + assert mapping.index.equals(proportions.index) + + # Force proportions to sum to 1 by dividing by the total in each parent + parent_totals = ( + proportions.groupby(mapping) # Group the proportions by parent zones + .sum() # Sum the total for each parent + .reindex(mapping) # Reindex for all child zones + .values # Get the ndarray to avoid index alignment problems + ) + + return proportions.values / parent_totals + + +def disaggregate_matrix(matrix, mapping=None, proportions=None, row_mapping=None, row_proportions=None, + col_mapping=None, col_proportions=None): + """ + Split multiple rows and columns in a matrix all at once. The cells in the matrix MUST be numeric, but the row and + column labels do not. + + Args: + matrix: The input matrix to disaggregate + mapping: Dict-like Series of "New label" : "Old label". Sets both the row_mapping and col_mapping variables if + provided (resulting in a square matrix). + proportions: Dict-like Series of "New label": "Proportion of old label". Its index must match the index of + the mapping argument. Sets both the row_proportions and col_proportions arguments if provided. + row_mapping: Same as mapping, except applied only to the rows. + row_proportions: Same as proportions, except applied only to the rows + col_mapping: Same as mapping, except applied only to the columns. + col_proportions: Same as proportions, except applied only to the columns + + Returns: + An expanded DataFrame with the new indices. The new matrix will sum to the same total as the original. 
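+        Proportions are normalized to sum to 1.0 within each parent label before the split is applied, so they do not
+        need to be provided as exact fractions.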
+ + Examples: + + df: + + +---+----+----+----+ + | | A | B | C | + +===+====+====+====+ + | A | 10 | 30 | 20 | + +---+----+----+----+ + | B | 20 | 10 | 10 | + +---+----+----+----+ + | C | 30 | 20 | 20 | + +---+----+----+----+ + + correspondence: + + +-----+-----+------+ + | new | old | prop | + +=====+=====+======+ + | A1 | A | 0.25 | + +-----+-----+------+ + | A2 | A | 0.75 | + +-----+-----+------+ + | B1 | B | 0.55 | + +-----+-----+------+ + | B2 | B | 0.45 | + +-----+-----+------+ + | C1 | C | 0.62 | + +-----+-----+------+ + | C2 | C | 0.38 | + +-----+-----+------+ + + ``new_matrix = disaggregate_matrix(df, mapping=correspondence['old'], proportions=correspondence['prop'])`` + + new_matrix: + + +-----+-------+-------+--------+--------+-------+-------+ + | new | A1 | A2 | B1 | B2 | C1 | C2 | + +=====+=======+=======+========+========+=======+=======+ + | A1 | 0.625 | 1.875 | 4.125 | 3.375 | 3.100 | 1.900 | + +-----+-------+-------+--------+--------+-------+-------+ + | A2 | 1.875 | 5.625 | 12.375 | 10.125 | 9.300 | 5.700 | + +-----+-------+-------+--------+--------+-------+-------+ + | B1 | 2.750 | 8.250 | 3.025 | 2.475 | 3.410 | 2.090 | + +-----+-------+-------+--------+--------+-------+-------+ + | B2 | 2.250 | 6.750 | 2.475 | 2.025 | 2.790 | 1.710 | + +-----+-------+-------+--------+--------+-------+-------+ + | C1 | 4.650 | 13.95 | 6.820 | 5.580 | 7.688 | 4.712 | + +-----+-------+-------+--------+--------+-------+-------+ + | C2 | 2.850 | 8.55 | 4.180 | 3.420 | 4.712 | 2.888 | + +-----+-------+-------+--------+--------+-------+-------+ + + """ + + # Check that all inputs are specified + if mapping is not None: + row_mapping, col_mapping = mapping, mapping + if proportions is not None: + row_proportions, col_proportions = proportions, proportions + + row_proportions = _check_disaggregation_input(row_mapping, row_proportions) + col_proportions = _check_disaggregation_input(col_mapping, col_proportions) + + # Validate inputs + new_rows = row_mapping.index + new_cols = col_mapping.index + + # Get raw indexers for NumPy & lookup the value in each parent cell + row_indexer = matrix.index.get_indexer(row_mapping)[:, _np.newaxis] + col_indexer = matrix.columns.get_indexer(col_mapping)[_np.newaxis, :] + parent_cells = matrix.values[row_indexer, col_indexer] + + # Convert proportions to 2D vectors + row_proportions = row_proportions[:, _np.newaxis] + col_proportions = col_proportions[_np.newaxis, :] + + # Multiply each parent cell by its disaggregation proportion & return + result_matrix = _ne.evaluate("parent_cells * row_proportions * col_proportions") + + result_matrix = DataFrame(result_matrix, index=new_rows, columns=new_cols) + return result_matrix diff --git a/balsa/matrices/routines.pyi b/balsa/routines/matrices.pyi similarity index 57% rename from balsa/matrices/routines.pyi rename to balsa/routines/matrices.pyi index e3ddc74..6c81f37 100644 --- a/balsa/matrices/routines.pyi +++ b/balsa/routines/matrices.pyi @@ -23,5 +23,18 @@ Vector = Union[pd.Series, np.ndarray] def aggregate_matrix(matrix: Union[pd.DataFrame, pd.Series], groups: Vector=None, row_groups: Vector=None, col_groups: Vector=None, - aggfunc: Callable[List[Iterable[Num]], Num]=np.sum) -> Union[pd.DataFrame, pd.Series]: + aggfunc: Callable[[Iterable[Num]], Num]=np.sum) -> Union[pd.DataFrame, pd.Series]: + pass + + +def fast_stack(frame: pd.DataFrame, multi_index: pd.MultiIndex, deep_copy: bool=True) -> pd.Series: + pass + + +def fast_unstack(series: pd.Series, index: pd.Index, columns: pd.Index, deep_copy: bool=True) -> 
pd.DataFrame:
+    pass
+
+def disaggregate_matrix(matrix: pd.DataFrame, mapping: pd.Series = None, proportions: pd.Series = None,
+                        row_mapping: pd.Series = None, row_proportions: pd.Series = None, col_mapping: pd.Series = None,
+                        col_proportions: pd.Series = None) -> pd.DataFrame: pass
\ No newline at end of file
diff --git a/balsa/routines/modelling.py b/balsa/routines/modelling.py
new file mode 100644
index 0000000..9cafeba
--- /dev/null
+++ b/balsa/routines/modelling.py
@@ -0,0 +1,356 @@
+from typing import Tuple, Optional, Union
+
+import pandas as pd
+import numpy as np
+import numexpr as ne
+from six import iteritems
+
+
+def tlfd(values, bin_start=0, bin_end=200, bin_step=2, weights=None, intrazonal=None, label_type='MULTI',
+         include_top=False):
+    """
+    Generates a Trip Length Frequency Distribution (i.e. a histogram) from given data. Produces a "pretty" Pandas object
+    suitable for charting.
+
+    Args:
+        values (Union[numpy.ndarray, pandas.Series]): A vector of trip lengths, with a length of "N". Can be provided
+            from a table of trips, or from a matrix (in "tall" format).
+        bin_start (int): Defaults to ``0``. The minimum bin value, in the same units as ``values``.
+        bin_end (int): Defaults to ``200``. The maximum bin value, in the same units as ``values``. Values over this
+            limit are either ignored, or counted under a separate category (see ``include_top``).
+        bin_step (int): Defaults to ``2``. The size of each bin, in the same units as ``values``.
+        weights (Union[numpy.ndarray, pandas.Series], optional): Defaults to ``None``. A vector of weights of length
+            "N", used to produce a weighted histogram.
+        intrazonal (Union[numpy.ndarray, pandas.Series], optional): Defaults to ``None``. A boolean vector indicating
+            which values are considered "intrazonal". When specified, prepends an ``intrazonal`` category to the front
+            of the histogram.
+        label_type (str, optional): Defaults to ``'MULTI'``. The format of the returned index. Options are:
+            - ``MULTI``: The returned index will be a 2-level MultiIndex ['from', 'to'];
+            - ``TEXT``: The returned index will be text-based: "0 to 2";
+            - ``BOTTOM``: The returned index will be the bottom of each bin; and
+            - ``TOP``: The returned index will be the top of each bin.
+        include_top (bool, optional): Defaults to ``False``. If True, the function will count all values (and weights,
+            if provided) above ``bin_end``, and add them to the returned Series. This bin is described as going from
+            ``bin_end`` to ``inf``.
+
+    Returns:
+        pandas.Series:
+            The weighted or unweighted histogram, depending on the options configured above.
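+
+    Example:
+        A minimal sketch (the ``trips`` table below is hypothetical)::
+
+            >>> lengths = trips['distance']            # Trip lengths, in km
+            >>> factors = trips['expansion_factor']    # Optional expansion weights
+            >>> fd = tlfd(lengths, bin_start=0, bin_end=50, bin_step=5, weights=factors, label_type='TEXT')
+            >>> fd.index[:2]
+            Index(['0 to 5', '5 to 10'], dtype='object')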
+ + """ + bins = list(range(bin_start, bin_end + bin_step, bin_step)) + + if intrazonal is not None: + if weights is not None: + iz_total = weights.loc[intrazonal].sum() + weights = weights.loc[~intrazonal] + else: + iz_total = intrazonal.sum() + + values = values.loc[~intrazonal] + + if weights is not None: + hist, _ = np.histogram(values, weights=weights, bins=bins) + else: + hist, _ = np.histogram(values, bins=bins) + + new_len = len(hist) + if intrazonal is not None: new_len += 1 + if include_top: new_len += 1 + new_hist = np.zeros(shape=new_len, dtype=hist.dtype) + lower_index = 0 + upper_index = new_len + + if intrazonal is not None: + new_hist[0] = iz_total + bins.insert(0, 'intrazonal') + lower_index += 1 + if include_top: + if weights is not None: + top = weights.loc[values >= bin_end].sum() + else: + top = (values >= bin_end).sum() + + new_hist[-1] = top + bins.append(np.inf) + upper_index -= 1 + new_hist[lower_index: upper_index] = hist + + label_type = label_type.upper() + if label_type == 'MULTI': + index = pd.MultiIndex.from_arrays([bins[:-1], bins[1:]], names=['from', 'to']) + elif label_type == 'TOP': + index = pd.Index(bins[1:]) + elif label_type == 'BOTTOM': + index = pd.Index(bins[:-1]) + elif label_type == 'TEXT': + s0 = pd.Series(bins[:-1], dtype=str).astype(str) + s1 = pd.Series(bins[1:], dtype=str).astype(str) + index = pd.Index(s0 + ' to ' + s1) + else: + raise NotImplementedError(label_type) + + new_hist = pd.Series(new_hist, index=index) + + return new_hist + + +def _get_distance_equation(method): + if method.lower() == 'euclidean': + expr = "sqrt((x0 - x1)**2 + (y0 - y1) ** 2) * coord_unit" + elif method.lower() == 'manhattan': + expr = "(abs(x0 - x1) + abs(y0 - y1)) * coord_unit" + elif method.lower() == 'haversine': + y0 = "(y0 * pi / 180)" + y1 = "(y1 * pi / 180)" + delta_lon = "((x1 - x0) * pi / 180)" + delta_lat = "((y1 - y0) * pi / 180)" + part1 = "sin({delta_lat} / 2.0)**2 + cos({y0}) * cos({y1}) * (sin({delta_lon} / 2.0))**2"\ + .format(delta_lat=delta_lat, delta_lon=delta_lon, y0=y0, y1=y1) + expr = "6371.0 * earth_radius_factor* 2.0 * arctan2(sqrt({part1}), sqrt(1.0 - {part1})) * coord_unit".\ + format(part1=part1) + else: + raise NotImplementedError(method.lower()) + return expr + + +def _prepare_distance_kwargs(kwargs): + defaults = {'coord_unit': 1.0, 'earth_radius_factor': 1.0, 'pi': np.pi} + for key, val in iteritems(defaults): + if key not in kwargs: + kwargs[key] = val + + +def _check_vectors(description: str, *vectors): + if len(vectors) < 1: return [] + + first = vectors[0] + retval = [] + common_index, common_length = None, None + if isinstance(first, pd.Series): + common_index = first.index + common_length = len(common_index) + retval.append(first.values[...]) + else: + retval.append(first[...]) + + for vector in vectors[1:]: + if isinstance(vector, pd.Series): + assert vector.index.equals(common_index), "All %s Series must have the same index" % description + retval.append(vector.values[...]) + else: + assert len(vector) == common_length, "All %s vectors must have the same length" % description + retval.append(vector[...]) + + return common_index, retval + + +def distance_matrix(x0, y0, tall=False, method='EUCLIDEAN', labels0=None, x1=None, y1=None, labels1=None, **kwargs): + """ + Fastest method of computing a distance matrix from vectors of coordinates, using the NumExpr package. Supports + several equations for computing distances. + + Accepts two or four vectors of x-y coordinates. 
If only two vectors are provided (x0, y0), the result will be the + 2D product of this vector with itself (vector0 * vector0). If all four are provided (x0, y0, x1, y1), the result + will be the 2D product of the first and second vector (vector0 * vector1). + + Args: + x0 (Union[numpy.ndarray, pandas.Series]): Vector of x-coordinates, of length N0. Can be a Series to specify labels. + y0 (Union[numpy.ndarray, pandas.Series]): Vector of y-coordinates, of length N0. Can be a Series to specify labels. + tall (bool, optional): Defaults to ``False``. If True, returns a vector whose shape is N0 x N1. Otherwise, + returns a matrix whose shape is (N0, N1). + method (str, optional): Defaults to ``'EUCLIDEAN'``. Specifies the method by which to compute distance. Valid + options are: + ``'EUCLIDEAN'``: Computes straight-line, 'as-the-crow flies' distance. + ``'MANHATTAN'``: Computes the Manhattan distance + ``'HAVERSINE'``: Computes distance based on lon/lat. + labels0 (pandas.Index-like, optional): Defaults to ``None``. Override set of labels to use if x0 and y0 are both raw + Numpy arrays + x1 (Union[numpy.ndarray, pandas.Series], optional): Defaults to ``None``. A second vector of x-coordinates, of length + N1. Can be a Series to specify labels + y1 (Union[numpy.ndarray, pandas.Series], optional): Defaults to ``None``. A second vector of y-coordinates, of length + N1. Can be a Series to specify labels + labels1 (pandas.Index-like): Override set of labels to use if x1 and y1 are both raw Numpy arrays + **kwargs: Additional scalars to pass into the evaluation context + + Kwargs: + coord_unit (float): + Factor applies directly to the result, defaulting to 1.0 (no conversion). Useful when the coordinates are + provided in one unit (e.g. m) and the desired result is in a different unit (e.g. km). Only used for + Euclidean or Manhattan distance + earth_radius_factor (float): + Factor to convert from km to other units when using Haversine distance + + Returns: + pandas.Series, pandas.DataFrame or numpy.ndarray: + A *Series* will be returned when ``tall=True``, and labels can be inferred and will always have 2-level + MultiIndex. A *DataFrame* will be returned when ``tall=False`` and labels can be inferred. A *ndarray* will + be returned when labels could not be inferred; if ``tall=True`` the array will be 1-dimensional, with shape + (N x N,). Otherwise, it will 2-dimensional with shape (N, N) + + Note: + The type of the returned object depends on whether labels can be inferred from the arguments. This is always + true when the `labels` argument is specified, and the returned value will use cross-product of the `labels` + vector. + + Otherwise, the function will try and infer the labels from the `x` and `y` objects, if one or both of them are + provided as Series. + + """ + + second_coords = x1 is not None and y1 is not None + + descr = "first coordinate" if second_coords else "coordinate" + temp_labels, (x_array0, y_array0) = _check_vectors(descr, x0, y0) + if labels0 is None: labels0 = temp_labels + + if second_coords: + temp_labels, (x_array1, y_array1) = _check_vectors("second coordinate", x1, y1) + if labels1 is None: labels1 = temp_labels + else: + x_array1 = x_array0[...] + y_array1 = y_array0[...] 
+ labels1 = labels0 + + n0, n1 = len(x_array0), len(x_array1) + + x_array0.shape = n0, 1 + y_array0.shape = n0, 1 + x_array1.shape = 1, n1 + y_array1.shape = 1, n1 + + expr = _get_distance_equation(method) + kwargs = kwargs.copy() + _prepare_distance_kwargs(kwargs) + kwargs['x0'] = x_array0 + kwargs['x1'] = x_array1 + kwargs['y0'] = y_array0 + kwargs['y1'] = y_array1 + + raw_matrix = ne.evaluate(expr, local_dict=kwargs) + labelled_result = labels0 is not None and labels1 is not None + + if tall: + raw_matrix.shape = n0 * n1 + if not labelled_result: return raw_matrix + + mi = pd.MultiIndex.from_product([labels0, labels1]) + return pd.Series(raw_matrix, index=mi) + elif not labelled_result: + return raw_matrix + + return pd.DataFrame(raw_matrix, index=labels0, columns=labels1) + + +def distance_array(x0, y0, x1, y1, method='euclidean', **kwargs): + """ + Fast method to compute distance between 2 (x, y) points, represented by 4 separate arrays, using the NumExpr + package. Supports several equations for computing distances + + Args: + x0: X or Lon coordinate of first point + y0: Y or Lat coordinate of first point + x1: X or Lon coordinate of second point + y1: Y or Lat coordinate of second point + method (str, optional): Defaults to ``'EUCLIDEAN'``. Specifies the method by which to compute distance. Valid + options are: + ``'EUCLIDEAN'``: Computes straight-line, 'as-the-crow flies' distance. + ``'MANHATTAN'``: Computes the Manhattan distance + ``'HAVERSINE'``: Computes distance based on lon/lat. + **kwargs: Additional scalars to pass into the evaluation context + + Kwargs: + coord_unit (float): + Factor applies directly to the result, defaulting to 1.0 (no conversion). Useful when the + coordinates are provided in one unit (e.g. m) and the desired result is in a different unit (e.g. km). + Only used for Euclidean or Manhattan distance + earth_radius_factor (float): + Factor to convert from km to other units when using Haversine distance + + Returns: + numpy.ndarray or pandas.Series: + Distance from the vectors of first points to the vectors of second points. 
A Series is returned when one or + more coordinate arrays are given as a Series object + + """ + + labels, (x0, y0, x1, y1) = _check_vectors("coordinate", x0, y0, x1, y1) + + expr = _get_distance_equation(method) + kwargs = kwargs.copy() + _prepare_distance_kwargs(kwargs) + kwargs['x0'] = x0 + kwargs['x1'] = x1 + kwargs['y0'] = y0 + kwargs['y1'] = y1 + + result_array = ne.evaluate(expr, local_dict=kwargs) + + if labels is not None: + return pd.Series(result_array, index=labels) + + return result_array + + +def indexers_for_map_matrix(row_labels, col_labels, superset, check=True): + if check: + assert np.all(row_labels.isin(superset)) + assert np.all(col_labels.isin(superset)) + + row_offsets = superset.get_indexer(row_labels) + col_offsets = superset.get_indexer(col_labels) + + return row_offsets, col_offsets + + +def map_to_matrix(values, super_labels, fill_value=0, row_col_labels=None, row_col_offsets=None, out=None, + grouper_func='sum', out_operand='+'): + + # TODO: Check that `values` dtype is numeric, or at least, add-able + + if row_col_labels is not None: + row_labels, col_labels = row_col_labels + assert len(row_labels) == len(values) + assert len(col_labels) == len(values) + else: + assert values.index.nlevels == 2 + row_labels = values.index.get_level_values(0) + col_labels = values.index.get_level_values(1) + + if row_col_offsets is None: + row_offsets, col_offsets = indexers_for_map_matrix(row_labels, col_labels, super_labels) + else: + row_offsets, col_offsets = row_col_offsets + assert row_offsets.min() >= 0 + assert row_offsets.max() < len(super_labels) + assert col_offsets.min() >= 0 + assert col_offsets.max() < len(super_labels) + + if out is not None: + if isinstance(out, pd.DataFrame): + assert out.index.equals(super_labels) + assert out.columns.equals(super_labels) + out = out.values # Get the raw array from inside the frame + elif isinstance(out, np.ndarray): + nrows, ncols = out.shape + assert nrows >= (row_offsets.max() - 1) + assert ncols >= (col_offsets.max() - 1) + else: + raise TypeError(type(out)) + out_is_new = False + else: + out = np.full([len(super_labels)] * 2, fill_value=fill_value, dtype=values.dtype) + out_is_new = True + + aggregated: pd.Series = values.groupby([row_offsets, col_offsets]).aggregate(func=grouper_func) + xs, ys = aggregated.index.get_level_values(0), aggregated.index.get_level_values(1) + if out_is_new: + out[xs, ys] = aggregated.values + else: + out_name = '__OUT__' + other_name = '__OTHER__' + ld = {out_name: out[xs, ys], other_name: aggregated.values} + ne.evaluate("{0} = {0} {1} {2}".format(out_name, out_operand, other_name), local_dict=ld) + + out = pd.DataFrame(out, index=super_labels, columns=super_labels) + return out diff --git a/balsa/routines/modelling.pyi b/balsa/routines/modelling.pyi new file mode 100644 index 0000000..5d06524 --- /dev/null +++ b/balsa/routines/modelling.pyi @@ -0,0 +1,37 @@ +from typing import Iterable, Union, Tuple, Optional +from pandas import DataFrame, Series, Index +from numpy import ndarray + + +_vector_type = Union[ndarray, Series] + + +def tlfd(values: _vector_type, bin_start: int=0, bin_end: int=200, bin_step: int=2, weights: _vector_type=None, + intrazonal: _vector_type=None, label_type: str='MULTI', include_top: bool=False + ) -> Series: + pass + + +def distance_matrix(x: _vector_type, y: _vector_type, tall: bool=False, method: str='euclidean', + labels0: Union[Iterable, Index]=None, x1: _vector_type=None, y1: _vector_type=None, + labels1: Union[Iterable, Index]=None, earth_radius_factor: 
float=1.0, coord_unit: float=1.0 + ) -> Union[ndarray, Series, DataFrame]: + pass + + +def distance_array(x0: _vector_type, y0: _vector_type, x1: _vector_type, y1: _vector_type, method: str='euclidean', + earth_radius_factor: float=1.0, coord_unit: float=1.0): + pass + +def indexers_for_map_matrix(row_labels: Series, col_labels: Series, superset: Index, check=True + ) -> Tuple[ndarray, ndarray]: + pass + + +def map_to_matrix(values: Series, super_labels: Index, fill_value=0, *, + row_col_labels: Optional[Tuple[Series, Series]] = None, + row_col_offsets: Optional[Tuple[ndarray, ndarray]] = None, + out: Optional[Union[DataFrame, ndarray]] = None, + grouper_func='sum', out_operand: str='+' + ) -> DataFrame: + pass diff --git a/balsa/models/plots.py b/balsa/routines/plotting.py similarity index 67% rename from balsa/models/plots.py rename to balsa/routines/plotting.py index a8089d6..d10db06 100644 --- a/balsa/models/plots.py +++ b/balsa/routines/plotting.py @@ -1,6 +1,3 @@ -from typing import Callable, Tuple -from pathlib import Path - import pandas as pd import numpy as np from matplotlib import pyplot as plt @@ -8,8 +5,7 @@ from matplotlib.ticker import FormatStrFormatter, StrMethodFormatter, FuncFormatter -def convergence_boxplot(targets, results, filter_func, - adjust_target=True, percentage=True, band=None, +def convergence_boxplot(targets, results, filter_func, adjust_target=True, percentage=True, band=None, simple_labels=True, ax=None, fp=None, title=None): """ Measures convergence of constrained location-choice models (such as work-location choice). Can be used to @@ -31,24 +27,6 @@ def convergence_boxplot(targets, results, filter_func, """ - ''' - # Type-hinting signature - - def convergence_boxplot( - targets: pd.DataFrame, - results: pd.DataFrame, - filter_func: Callable[[pd.Series], pd.Series], - adjust_target: bool=True, - percentage: bool=True, - band: Tuple[float, float]=None, - simple_labels: bool=True, - ax=None, - fp: str=None, - title: str=None - ) -> Axes: - - ''' - assert results.columns.equals(targets.columns) columns, filters, n = [], [], 0 @@ -114,8 +92,7 @@ def convergence_boxplot( return ax -def location_summary(model, target, ensemble_names, title='', fp=None, - dpi=150, district_name='Ensemble'): +def location_summary(model, target, ensemble_names, title='', fp=None, dpi=150, district_name='Ensemble'): """ Creates a compound plot showing total attractions to specified locations @@ -129,22 +106,9 @@ def location_summary(model, target, ensemble_names, title='', fp=None, district_name: Returns: - + matplotlib.Axes """ - ''' - # Type-hinting signature: - def location_summary( - model: pd.DataFrame, - target: pd.DataFrame, - ensemble_names: pd.Series, - title: str='', - fp: Path=None, - dpi: int=150, - district_name: str='Ensemble' - ) -> Axes: - ''' - fig, ax = plt.subplots(1, 3, figsize=[16, 8], gridspec_kw={'width_ratios': [4, 2, 2]}) model_col = model.reindex(ensemble_names.index, fill_value=0) @@ -208,52 +172,36 @@ def trumpet_diagram(counts, model_volume, categories=None, category_colours=None on FHWA guidelines. Can be used to plot different categories of count locations. Args: - counts (Series): Target counts. Each item represents a different count location. Index does not need to be + counts (pandas.Series): Target counts. Each item represents a different count location. Index does not need to be unique. - model_volume (Series): Modelled volumes for each location. The index must match the counts Series. 
- categories (None or Series or List[Series]): Optional classification of each count location. Must match the - index of the count Series. Can be provided as a List of Series (which all must match the count index) to - enable tuple-based categorization. - category_colours (None or Dict[Union[str, tuple], str]): Mapping of each category to a colour, specified as a - hex string. Only used when categories are provided. Missing categories revert to None, using the default - colour for the current style. - category_markers (None or Dict[Union[str, tuple], str]): Mapping of each category to a matplotlib marker string - (see https://matplotlib.org/api/markers_api.html for options). Only used when categories are provided. - Missing categories revert to None, using the default marker for the current style. - label_format (str): Used to convert category values (especially tuples) into readable strings for the plot - legend. The relevant line of code is `current_label = label_format % category_key`. Only used when - categories are provided. - title (str): Optional title to set on the plot. - y_bounds (tuple): Limit of the Y-Axis. This is needed because relative errors can be very high close to the - y-intercept of the plot. Defaults to (-2, 2), or (-200%, 200%). - ax (None or Axes): Sub-Axes to add this plot to, if using subplots(). - x_label (str): Label to use for the X-axis. The Y-axis is always "Relative Error" - legend (bool): Flag to add a legend. - kwargs: Additional kwargs to pass to DataFrame.plot.scatter() + model_volume (pandas.Series): Modelled volumes for each location. The index must match the counts Series. + categories (Union[pandas.Series, List[pandas.Series]], optional): Defaults to ``None``. Optional classification of each + count location. Must match the index of the count Series. Can be provided as a List of Series (which all + must match the count index) to enable tuple-based categorization. + category_colours (Dict[Union[str, tuple], str], optional): Defaults to ``None``. Mapping of each category to a + colour, specified as a hex string. Only used when categories are provided. Missing categories revert to + ``None``, using the default colour for the style. + category_markers (Dict[Union[str, tuple], str], optional): Defaults to ``None``. Mapping of each category to a + matplotlib marker string (see https://matplotlib.org/api/markers_api.html for options). Only used when + categories are provided. Missing categories revert to ``None``, using the default marker for the style. + label_format (str, optional): Defaults to ``None``. Used to convert category values (especially tuples) into + readable strings for the plot legend. The relevant line of code is + ``current_label = label_format % category_key``. Only used when categories are provided. + title (str, optional): The title to set on the plot. + y_bounds (tuple, optional): Defaults to ``(-2, 2)``, or (-200%, 200%). Limit of the Y-Axis. This is needed + because relative errors can be very high close to the y-intercept of the plot. + ax (matplotlib.Axes, optional): Defaults to ``None``. Sub-Axes to add this plot to, if using ``subplots()``. + x_label (str, optional): Defaults to ``'Count volume'``. Label to use for the X-axis. The Y-axis is always + "Relative Error" + legend (bool, optional): Defaults to ``True``. Flag to add a legend. + **kwargs: Additional kwargs to pass to ``DataFrame.plot.scatter()`` Returns: - Axes: The Axes object generated from the plot. For most use cases, this is not really needed. 
+ matplotlib.Axes: + The Axes object generated from the plot. For most use cases, this is not really needed. """ - ''' - # Type-hinting signature - - def trumpet_diagram( - counts: Series, - model_volume: Series, - categories: Union[Series, List[Series]]=None, - category_colours: Dict[Union[Any, tuple]]=None, - category_markers: Dict[Union[Any, tuple]]=None, - label_format: str=None, - title: str='', - y_bounds: Tuple[float, float]=(-2, 2), - ax: Optional[Axes]=None, - alpha: float=1.0, - x_label: str="Count volume" - ) -> Axes: - ''' - assert model_volume.index.equals(counts.index) n_categories = 0 diff --git a/balsa/routines/plotting.pyi b/balsa/routines/plotting.pyi new file mode 100644 index 0000000..38e4422 --- /dev/null +++ b/balsa/routines/plotting.pyi @@ -0,0 +1,53 @@ +from typing import List, Union, Dict, Callable, Tuple, Any, Optional + +from pandas import DataFrame, Series +from matplotlib.axes import Axes + +import six +if six.PY3: + from pathlib import Path as PathType +else: + PathType = str + + +def convergence_boxplot( + targets: DataFrame, + results: DataFrame, + filter_func: Callable[[Series], Series], + adjust_target: bool=True, + percentage: bool=True, + band: Tuple[float, float]=None, + simple_labels: bool=True, + ax=None, + fp: str=None, + title: str=None + ) -> Axes: + pass + + +def location_summary( + model: DataFrame, + target: DataFrame, + ensemble_names: Series, + title: str='', + fp: PathType=None, + dpi: int=150, + district_name: str='Ensemble' + ) -> Axes: + pass + + +def trumpet_diagram( + counts: Series, + model_volume: Series, + categories: Union[Series, List[Series]]=None, + category_colours: Dict[Union[Any, tuple]]=None, + category_markers: Dict[Union[Any, tuple]]=None, + label_format: str=None, + title: str='', + y_bounds: Tuple[float, float]=(-2, 2), + ax: Optional[Axes]=None, + alpha: float=1.0, + x_label: str="Count volume" + ) -> Axes: + pass diff --git a/balsa/test/test_cheval/__init__.py b/balsa/test/test_cheval/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/balsa/test/test_cheval/test_api.py b/balsa/test/test_cheval/test_api.py deleted file mode 100644 index b2b60aa..0000000 --- a/balsa/test/test_cheval/test_api.py +++ /dev/null @@ -1,68 +0,0 @@ -import unittest - -import pandas as pd -import numpy as np - -from balsa.cheval.api import sample_from_weights, ChoiceTree - - -class RandomStateProxy(object): - def __init__(self, fixed_draws): - self.data = np.array(fixed_draws, dtype=np.float64) - - def uniform(self, size, *args, **kwargs): - n = np.prod(size) - assert n <= len(self.data) - - ret = self.data[: n] - ret.shape = size - - return ret - - -class TestAPI(unittest.TestCase): - - def test_weighted_probabilities(self): - weights = [ - [0, 0, 1, 0, 0, 2, 3, 0, 0, 1], - [0, 0, 1, 0, 0, 2, 3, 0, 0, 1], - [0, 0, 1, 0, 0, 2, 3, 0, 0, 1], - [0, 0, 1, 0, 0, 2, 3, 0, 0, 1], - [0, 0, 1, 0, 0, 2, 3, 0, 0, 1], - [0, 0, 1, 0, 0, 2, 3, 0, 0, 1] - ] - weights = pd.DataFrame(weights, index=[101, 102, 103, 104, 105, 106], columns="A B C D E F G H I J".split()) - randomizer = RandomStateProxy([0, 0.05, 0.2, 0.5, 0.8, 0.9]) - - expected_result = [2, 2, 5, 6, 6, 9] - - actual_result = sample_from_weights(weights, randomizer, astype='index').values - assert np.all(actual_result == expected_result) - - def test_flatten(self): - tree = ChoiceTree(None) - a1 = tree.add_node('A1', 0.3) - a2 = tree.add_node('A2', 0.6) - - a1.add_node('A3') - a1.add_node('A4') - - a2.add_node('A5') - a6 = a2.add_node('A6', 0.5) - - a6.add_node('A7') - 
a6.add_node('A8') - - hierarchy, levels, ls_scales = tree.flatten() - - expected_hierarchy = np.int64([-1, -1, 0, 0, 1, 1, 5, 5]) - expected_levels = np.int64([0, 0, 1, 1, 1, 1, 2, 2]) - expected_scales = np.float64([0.3, 0.6, 1, 1, 1, 0.5, 1, 1]) - - assert np.all(expected_hierarchy == hierarchy) - assert np.all(expected_levels == levels) - assert np.allclose(expected_scales, ls_scales) - - -if __name__ == '__main__': - unittest.main() diff --git a/balsa/test/test_cheval/test_core.py b/balsa/test/test_cheval/test_core.py deleted file mode 100644 index c9c12af..0000000 --- a/balsa/test/test_cheval/test_core.py +++ /dev/null @@ -1,166 +0,0 @@ -import unittest -from bisect import bisect_right - -import numpy as np -from numpy.testing import assert_allclose - -from balsa.cheval.core import (sample_multinomial_worker, sample_nested_worker, stochastic_multinomial_worker, - nested_probabilities, logarithmic_search) -from balsa.cheval.tree import ChoiceTree - - -class RandomStateProxy(object): - def __init__(self, fixed_draws): - self.data = np.array(fixed_draws, dtype=np.float64) - - def uniform(self, shape, *args, **kwargs): - n = np.prod(shape) - assert n < len(self.data) - - ret = self.data[: n] - ret.shape = shape - - return ret - - -class TestCore(unittest.TestCase): - - def _get_utility_row(self): - ret = np.array( - [1.678, 1.689, 1.348, 0.903, 1.845, 0.877, 0.704, 0.482], - dtype=np.float64 - ) - n = len(ret) - ret.shape = 1, n - - return ret - - def _get_flattened_tree(self): - tree = ChoiceTree(None) - a1 = tree.add_node('A1', 0.3) - a2 = tree.add_node('A2', 0.6) - - a1.add_node('A3') - a1.add_node('A4') - - a2.add_node('A5') - a6 = a2.add_node('A6', 0.5) - - a6.add_node('A7') - a6.add_node('A8') - - return tree.flatten() - - def test_multinomial_probabilities(self): - utilities = self._get_utility_row() - - expected_result = np.array( - [0.181775432, 0.183785999, 0.130682672, 0.083744629, 0.214813892, 0.08159533, 0.068632901, 0.054969145], - dtype=np.float64 - ) - n = len(expected_result) - expected_result.shape = 1, n - - test_result = np.zeros(utilities.shape) - stochastic_multinomial_worker(utilities, test_result) - - assert_allclose(test_result, expected_result) - - def test_nested_probabilities(self): - utilities = np.float64([0, -1.63364, -0.73754, -0.05488, -0.66127, -1.17165, 0, 0]) - - expected_result = np.float64([0, 0.06527, 0.17893, 0.47448, 0.23791, 0.04341, 0, 0]) - - tree = ChoiceTree(None) - auto = tree.add_node("Auto", logsum_scale=0.7) - auto.add_node("Carpool") - auto.add_node("Drive") - pt = tree.add_node("Transit", logsum_scale=0.7) - pt.add_node("Bus") - train = pt.add_node("Train", logsum_scale=0.3) - train.add_node("T Access Drive") - train.add_node("T Access Walk") - - hierarchy, levels, logsum_scales = tree.flatten() - test_result = nested_probabilities(utilities, hierarchy, levels, logsum_scales) - - assert_allclose(test_result, expected_result, rtol=1e-4) - - def test_multinomial_sampling(self): - utilities = self._get_utility_row() - - expected_samples = [ - (0.0, 0), - (0.1, 0), - (0.2, 1), - (0.4, 2), - (0.5, 3), - (0.6, 4), - (0.8, 5), - (0.9, 6), - (0.98, 7), - (1.0, 7) - ] - for row_number, (random_draw, expected_index) in enumerate(expected_samples): - random_numbers = np.array([[random_draw]], dtype=np.float64) - test_result = np.zeros([1, 1], dtype=np.int64) - sample_multinomial_worker(utilities, random_numbers, test_result) - - assert test_result[0, 0] == expected_index, "Bad result for row %s" % row_number - - def test_nested_sampling(self): - - 
utilities = np.float64([[0, -1.63364, -0.73754, -0.05488, -0.66127, -1.17165, 0, 0]]) - - tree = ChoiceTree(None) - auto = tree.add_node("Auto", logsum_scale=0.7) - auto.add_node("Carpool") - auto.add_node("Drive") - pt = tree.add_node("Transit", logsum_scale=0.7) - pt.add_node("Bus") - train = pt.add_node("Train", logsum_scale=0.3) - train.add_node("T Access Drive") - train.add_node("T Access Walk") - - hierarchy, levels, logsum_scales = tree.flatten() - - expected_samples = [ - (0, 1), - (0.04, 1), - (0.2, 2), - (0.5, 3), - (0.75, 4), - (0.99, 5) - ] - - for row_number, (random_draw, expected_index) in enumerate(expected_samples): - random_numbers = np.array([[random_draw]], dtype=np.float64) - test_result = np.zeros([1, 1], dtype=np.int64) - - sample_nested_worker(utilities, random_numbers, hierarchy, levels, logsum_scales, test_result) - result_cell = test_result[0, 0] - - assert result_cell == expected_index, "Bad result for row %s. Expected %s got %s" % (row_number, expected_index, result_cell) - - def test_bisection_search(self): - cumsums = np.array([0, 0, 0.25, 0.25, 0.25, 0.25, 0.25, 0.5, 0.75, 1.0, 1.0, 1.0], dtype=np.float64) - - expected_samples = [ - (0.0, 2), - (0.2, 2), - (0.4, 7), - (0.6, 8), - (0.8, 9), - (0.99, 9) - ] - - for random_draw, expected_index in expected_samples: - test_result = logarithmic_search(np.float64(random_draw), cumsums) - assert test_result == expected_index - - standard_result = bisect_right(cumsums, random_draw) - assert test_result == standard_result - - -if __name__ == '__main__': - unittest.main() diff --git a/balsa/test/test_cheval/test_expressions.py b/balsa/test/test_cheval/test_expressions.py deleted file mode 100644 index 01e4205..0000000 --- a/balsa/test/test_cheval/test_expressions.py +++ /dev/null @@ -1,130 +0,0 @@ -import unittest - -import pandas as pd -import six - -from balsa.cheval.scope.expressions import Expression -from balsa.cheval.scope import SimpleUsage, AttributedUsage, DictLiteral, LinkedFrameUsage, UnsupportedSyntaxError - - -class TestExpressionParsing(unittest.TestCase): - - # TODO: Test Expression constructor with bad inputs - - def test_literals(self): - exprs_to_pass = [ - "'bob'", - "1.0", - "2", - "1E6", - "True", - "False", - "None", - "u'pi'" - ] - for e in exprs_to_pass: - # Just test to ensure that there are no errors - _ = Expression(e) - - def test_simple_usage(self): - expr = Expression("a + b") - - assert 'a' in expr._symbols - assert 'b' in expr._symbols - - assert all(isinstance(use, SimpleUsage) for use in expr._symbols['a']) - assert all(isinstance(use, SimpleUsage) for use in expr._symbols['b']) - - expr = Expression("a") - assert 'a' in expr._symbols - - def test_attributed_usage(self): - expr = Expression("a.b") - # Expression("abc.def") - # Expression("'a bc'.d) - # Fail: # Expression("'a'.upper) - assert 'a' in expr._symbols - assert len(expr._symbols['a']) == 1 - assert isinstance(expr._symbols['a'][0], AttributedUsage) - assert expr._symbols['a'][0].attribute == 'b' - - def test_dict_literal(self): - - expr_pass = [ - "{abc: 1, ghi: 1.0}", - "{'choice 1': 5.0, 'choice 2': 6.0}", - "{'jane & finch': 0.25}" - ] - - for e in expr_pass: - expr = Expression(e) - assert '__dict0' in expr._symbols - assert isinstance(expr._symbols['__dict0'], DictLiteral) - assert isinstance(expr._symbols['__dict0'].series, pd.Series) - - expr_fail = [ - "{'a': 'b'}", - "{'c': None}", - "{'d': NaN}", - "{@a: 1}", - "{a b: 2}" - ] - with self.assertRaises(Exception): - for e in expr_fail: _ = Expression(e) - - def 
test_top_level_func(self): - expr = Expression('log(a.b)') - - assert expr._parsed_expr.startswith('log') - - assert 'a' in expr._symbols - assert len(expr._symbols['a']) == 1 - assert isinstance(expr._symbols['a'][0], AttributedUsage) - assert expr._symbols['a'][0].attribute == 'b' - - with self.assertRaises(UnsupportedSyntaxError): - _ = Expression("bob(1 + 2)") - - def test_linked_usage(self): - expr1 = Expression("a.b.c") - - assert 'a' in expr1._symbols - assert len(expr1._symbols['a']) == 1 - assert isinstance(expr1._symbols['a'][0], LinkedFrameUsage) - assert len(expr1._symbols['a'][0].stack) == 2 - - expr2 = Expression("a.b.sum(c > d)") - - assert 'a' in expr2._symbols - assert len(expr2._symbols['a']) == 1 - usage = expr2._symbols['a'][0] - assert isinstance(usage, LinkedFrameUsage) - assert usage.func == 'sum' - assert usage.func_expr == "(c > d)" - - def test_rejected_syntax(self): - - with self.assertRaises(UnsupportedSyntaxError) as context: - Expression("[a + 1 for a in range(1,6)]") # List comprehension - Expression("{a: a + 1 for a in range(5)}") # Dict comprehension - Expression("{1,2,3}") # Set literal - Expression("q + a if 1 == 2 else b") # Block If/Else - Expression("a = b") # Store - Expression("del a") # Del - Expression("a[1]") # Subscript - Expression("a[0:5]") # Subscript w slice - Expression("a + {") # Malformed syntax - - def test_replacements(self): - expression = Expression("a and not b or c == 'notandor'") - expected = "((a & (~ b)) | (c == b'notandor'))" if six.PY3 else "((a & (~ b)) | (c == 'notandor'))" - assert expression._parsed_expr == expected - - def test_py3_string_replacement(self): - if six.PY3: - expression = Expression("where(my_array == 'some_value', 1, 0)") - actual = expression._parsed_expr - assert actual == "where((my_array == b'some_value'), 1, 0)" - -if __name__ == '__main__': - unittest.main() diff --git a/balsa/test/test_cheval/test_ldf.py b/balsa/test/test_cheval/test_ldf.py deleted file mode 100644 index 5857b3a..0000000 --- a/balsa/test/test_cheval/test_ldf.py +++ /dev/null @@ -1,183 +0,0 @@ -import unittest -import numpy as np -import pandas as pd -from pandas.util.testing import assert_series_equal - -from balsa.cheval import LinkedDataFrame - - -class TestA(unittest.TestCase): - - def testGetItem(self): - sa = pd.Series([1, 2, 3]) - df = LinkedDataFrame({'a': sa}) - - result = df['a'] - - assert_series_equal(sa, result, check_names=False) - - def testGetAttr(self): - sa = pd.Series([1, 2, 3]) - df = LinkedDataFrame({'ldf_test_column': sa}) - - result = df.ldf_test_column - - assert_series_equal(sa, result, check_names=False) - - -def generate_counter(reps): - new_array = np.zeros(reps.sum()) - - i = 0 - for rep in reps: - val = 0 - for j in range(rep): - new_array[i] = val - val += 1 - i += 1 - return new_array - - -class TestLookups(unittest.TestCase): - - # TODO: Write test for __dir__ - # TODO: Write test for refresh_links - # TODO: Write test for __getitem__ - # TODO: Write test for remove_link - - def setUp(self): - randomizer = np.random.RandomState(12345) - - n_zones = 3 - n_hh = 10 - max_hh_size = 4 - trips_probability = pd.Series({1: 0.1, 2: 0.7, 3: 0.2}) - - zone_index = np.arange(0, n_zones) - zone_area = randomizer.uniform(10.0, 100.0, size=n_zones) - zones = pd.DataFrame({'area': zone_area}, - index=pd.Index(zone_index, name='zone_id')) - - hh_index = np.arange(0, n_hh) - hh_dwellings = randomizer.choice(['house', 'apartment'], size=n_hh, replace=True, p=[0.3, 0.7]) - hh_zone = randomizer.choice(zone_index, size=n_hh, 
replace=True) - hh_vehicles = randomizer.choice([0, 1, 2, 3], size=n_hh, replace=True) - households = LinkedDataFrame({'dwelling_type': hh_dwellings, 'zone_id': hh_zone, 'vehicles': hh_vehicles}, - index=pd.Index(hh_index, name='hhid')) - - hh_repetitions = randomizer.randint(1, max_hh_size, n_hh) - person_hh = np.repeat(hh_index, hh_repetitions) - person_index = generate_counter(hh_repetitions) - person_ages = randomizer.randint(14, 50, len(person_index)) - person_sex = randomizer.choice(['M', 'F'], size=len(person_index), replace=True) - persons = LinkedDataFrame({'age': person_ages, 'sex': person_sex}, - index=pd.MultiIndex.from_arrays([person_hh, person_index], names=['hhid', 'pid'])) - - person_repetitions = randomizer.choice(trips_probability.index.values, size=len(persons), replace=True, - p=trips_probability.values) - trip_hh = np.repeat(persons.index.get_level_values(0).values, person_repetitions) - trip_persons = np.repeat(persons.index.get_level_values(1).values, person_repetitions) - trip_index = generate_counter(person_repetitions) - trips = LinkedDataFrame(index=pd.MultiIndex.from_arrays([trip_hh, trip_persons, trip_index], names=['hhid', 'pid', - 'trip_id'])) - trips['km'] = randomizer.uniform(2.0, 50.0, size=len(trips)) - - households.link_to(zones, 'zone', on_self='zone_id') - households.link_to(persons, 'persons', levels='hhid') - - persons.link_to(households, 'household', levels='hhid') - persons.link_to(trips, 'trips', levels=['hhid', 'pid']) - - trips.link_to(persons, 'person', levels=['hhid', 'pid']) - trips.link_to(households, 'household', levels='hhid') - - self.zones = zones - self.hh = households - self.persons = persons - self.trips = trips - - def tearDown(self): - del self.zones, self.trips, self.hh, self.persons - - def testStandardLookup(self): - oracle = pd.DataFrame(self.hh.to_dict()).zone_id - result = self.hh.zone_id - - assert_series_equal(oracle, result, check_names=False, check_dtype=False) - - def testSingleLookup(self): - oracle = self.hh.dwelling_type - oracle = oracle.reindex(self.persons.index.get_level_values('hhid')) - oracle.index = self.persons.index - - result = self.persons.household.dwelling_type - - assert_series_equal(oracle, result, check_names=False) - - def testDoubleLookup(self): - result = self.persons.household.zone.area - - oracle = self.zones.area.reindex(self.hh.zone_id) - oracle.index = self.hh.index - oracle = oracle.reindex(self.persons.index.get_level_values('hhid')) - oracle.index = self.persons.index - - assert_series_equal(oracle, result, check_names=False) - - def testAggregateCount(self): - oracle = self.persons.age.groupby(level=['hhid']).count() - - result = self.hh.persons.sum() - - assert_series_equal(oracle, result, check_names=False, check_dtype=False) - - def testAggregateCountWithExpression(self): - oracle = self.persons.age.loc[self.persons.age > 20].groupby( - level=['hhid'] - ).count().reindex(self.hh.index, fill_value=0) - - result = self.hh.persons.sum("age > 20") - - assert_series_equal(oracle, result, check_names=False, check_dtype=False) - - def testMultiIndexLookup(self): - oracle = self.persons.sex - oracle = oracle.reindex(self.trips.index.droplevel('trip_id')) - oracle.index = self.trips.index - - result = self.trips.person.sex - - assert_series_equal(oracle, result, check_names=False) - - def testSliceSubset(self): - persons_raw = pd.DataFrame(self.persons) # Make a copy as a plain ol' DataFrame - person_subset = persons_raw.sample(5, replace=False, random_state=12345) - - oracle = 
self.hh.dwelling_type - oracle = oracle.reindex(person_subset.index.get_level_values('hhid')) - oracle.index = person_subset.index - - person_subset_2 = self.persons.sample(5, replace=False, random_state=12345) - assert person_subset.index.equals(person_subset_2.index) - - result = person_subset_2.household.dwelling_type - - assert_series_equal(oracle, result, check_names=False) - - def testSliceSuperset(self): - persons_raw = pd.DataFrame(self.persons) # Make a copy as a plain ol' DataFrame - person_superset = persons_raw.sample(100, replace=True, random_state=12345) - - oracle = self.hh.dwelling_type - oracle = oracle.reindex(person_superset.index.get_level_values('hhid')) - oracle.index = person_superset.index - - person_superset_2 = self.persons.sample(100, replace=True, random_state=12345) - assert person_superset.index.equals(person_superset_2.index) - - result = person_superset_2.household.dwelling_type - - assert_series_equal(oracle, result, check_names=False) - -if __name__ == '__main__': - unittest.main() diff --git a/balsa/test/test_cheval/test_scope.py b/balsa/test/test_cheval/test_scope.py deleted file mode 100644 index a1dc093..0000000 --- a/balsa/test/test_cheval/test_scope.py +++ /dev/null @@ -1,185 +0,0 @@ -import unittest - -from numpy.random import uniform, choice -import numpy as np -import pandas as pd - -from balsa.cheval import ChoiceModel, ScopeOrientationError -from balsa.cheval.scope.scope import (ScalarSymbol, Array1DSymbol, Array2DSymbol, FrameSymbol, LinkedFrameSymbol, - PanelSymbol) -from balsa.cheval.scope import SimpleUsage, AttributedUsage, LinkedFrameUsage -from balsa.cheval.ldf import LinkedDataFrame - - -class TestScope(unittest.TestCase): - - def setUp(self): - self.__record_index = pd.Index(range(10)) - self.__alts_index = pd.Index(list('abcdefg')) - - model = ChoiceModel() - for char in 'abcdefg': - model.tree.add_node(char) - - self.__model = model - - def __prep_for_testing(self, expression): - self.__model.expressions.clear() - self.__model.expressions.append(expression) - self.__model.scope.set_record_index(self.__record_index) - - def test_scalar_symbol(self): - self.__prep_for_testing('a * 2') - model = self.__model - - model.scope.fill_symbol('a', 0.5) - assert 'a' in model.scope._filled_symbols - symbol = model.scope._filled_symbols['a'] - assert isinstance(symbol, ScalarSymbol) - assert symbol.get_value(None) == 0.5 - - def test_array_1d_symbol(self): - self.__prep_for_testing('a + b + c + d') - model = self.__model - - data1 = uniform(low=-1.0, high=1.0, size=10) - model.scope.fill_symbol('a', data1) - symbol_a = model.scope._filled_symbols['a'] - assert isinstance(symbol_a, Array1DSymbol) - assert symbol_a._orientation == 0 - value = symbol_a.get_value(None) - assert value.shape == (10, 1) - - data2 = uniform(low=-1.0, high=1.0, size=7) - model.scope.fill_symbol('b', data2) - symbol_b = model.scope._filled_symbols['b'] - assert isinstance(symbol_b, Array1DSymbol) - assert symbol_b._orientation == 1 - value = symbol_b.get_value(None) - assert value.shape == (1, 7) - - model.scope.fill_symbol('c', pd.Series(data1, index=self.__record_index)) - symbol_c = model.scope._filled_symbols['c'] - assert isinstance(symbol_c, Array1DSymbol) - assert symbol_c._orientation == 0 - value = symbol_c.get_value(None) - assert value.shape == (10, 1) - - model.scope.fill_symbol('d', pd.Series(data2, self.__alts_index)) - symbol_d = model.scope._filled_symbols['d'] - assert isinstance(symbol_d, Array1DSymbol) - assert symbol_d._orientation == 1 - value = 
symbol_d.get_value(None) - assert value.shape == (1, 7) - - def test_array_2d_symbol(self): - self.__prep_for_testing('d + e + f + g') - model = self.__model - - # 3. Test Array2DSymbol from unlabelled arrays - data3 = uniform(size=(10, 7)) - model.scope.fill_symbol('d', data3) - symbol_d = model.scope._filled_symbols['d'] - assert isinstance(symbol_d, Array2DSymbol) - assert symbol_d._data.shape == (10, 7) - value = symbol_d.get_value(SimpleUsage()) - assert value.shape == (10, 7) - - data4 = uniform(size=(7, 10)) - model.scope.fill_symbol('e', data4) - symbol_e = model.scope._filled_symbols['e'] - assert isinstance(symbol_e, Array2DSymbol) - assert symbol_e._data.shape == (10, 7) - value = symbol_e.get_value(SimpleUsage()) - assert value.shape == (10, 7) - - # 4. Test Array2DSymbol from labelled arrays - data5 = pd.DataFrame(uniform(size=(10, 7)), index=self.__record_index, columns=self.__alts_index) - model.scope.fill_symbol('f', data5) - symbol_f = model.scope._filled_symbols['f'] - assert isinstance(symbol_f, Array2DSymbol) - assert symbol_f._data.shape == (10, 7) - value = symbol_f.get_value(SimpleUsage()) - assert value.shape == (10, 7) - - model.scope.fill_symbol('g', data5.transpose()) - symbol_g = model.scope._filled_symbols['g'] - assert isinstance(symbol_g, Array2DSymbol) - assert symbol_g._data.shape == (10, 7) - value = symbol_g.get_value(SimpleUsage()) - assert value.shape == (10, 7) - - def test_frame_symbol(self): - self.__prep_for_testing('a.b + c.d') - model = self.__model - - # 1. Fill with record-oriented DataFrame - data1 = pd.DataFrame({'b': uniform(size=10), 'colour': choice(['red', 'blue'], size=10, replace=True)}, - index=self.__record_index) - data1['colour'] = data1['colour'].astype('category') - model.scope.fill_symbol('a', data1) - symbol_a = model.scope._filled_symbols['a'] - assert isinstance(symbol_a, FrameSymbol) - assert symbol_a._orientation == 0 - numeric_value = symbol_a.get_value(AttributedUsage('', 'b')) - assert numeric_value.shape == (10, 1) - cat_value = symbol_a.get_value(AttributedUsage('', 'colour')) - assert cat_value.dtype == np.dtype('a4') - - # 2. 
Fill with alternatives-oriented DataFrame - data2 = pd.DataFrame({'d': uniform(size=7)}, index=self.__alts_index) - model.scope.fill_symbol('c', data2) - symbol_c = model.scope._filled_symbols['c'] - assert isinstance(symbol_c, FrameSymbol) - assert symbol_c._orientation == 1 - value = symbol_c.get_value(AttributedUsage('', 'd')) - assert value.shape == (1, 7) - - def test_linked_frame_symbol(self): - self.__prep_for_testing('record.height + record.zone.area + record.hats.count(colour == "red")') - model = self.__model - - df = LinkedDataFrame({'height': uniform(high=3, size=10), - 'zone': choice(range(101, 104), size=10, replace=True)}, - index=self.__record_index) - zones = LinkedDataFrame({'area': uniform(100, 500, size=3)}, index=range(101, 104)) - hats = LinkedDataFrame({'colour': choice(['red', 'blue'], size=15, replace=True), - 'record': [0, 0, 1, 2, 2, 2, 3, 4, 5, 5, 6, 7, 8, 9, 10]}) - - df.link_to(zones, 'zone', on_self='zone') - df.link_to(hats, 'hats', on_other='record') - - hats.link_to(df, 'record', on_self='record') - - model.scope.fill_symbol('record', df) - symbol = model.scope._filled_symbols['record'] - assert isinstance(symbol, LinkedFrameSymbol) - assert symbol._orientation == 0 - - for expr in model.expressions: - assert 'record' in expr._symbols - for usage in expr._symbols['record']: - value = symbol.get_value(usage) - assert value.shape == (10, 1) - - def test_panel_symbol(self): - self.__prep_for_testing('e.f + 2') - model = self.__model - - data3 = pd.Panel({'f': uniform(size=[10, 7])}, major_axis=self.__record_index, minor_axis=self.__alts_index) - model.scope.fill_symbol('e', data3) - symbol_e = model.scope._filled_symbols['e'] - assert isinstance(symbol_e, PanelSymbol) - - def test_dict_literal(self): - self.__prep_for_testing('{a: 1.2, c: 2.5, d: 0.5}') - model = self.__model - - dict_symbol = model.scope._filled_symbols['__dict0'] - assert isinstance(dict_symbol, Array1DSymbol) - value = dict_symbol.get_value(None) - assert value.shape == (1, 7) - - -if __name__ == '__main__': - unittest.main() diff --git a/balsa/utils/__init__.py b/balsa/utils/__init__.py deleted file mode 100644 index dcfd171..0000000 --- a/balsa/utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .utils import is_identifier, open_file diff --git a/balsa/utils/utils.py b/balsa/utils/utils.py deleted file mode 100644 index 6ee871f..0000000 --- a/balsa/utils/utils.py +++ /dev/null @@ -1,75 +0,0 @@ -from __future__ import division, absolute_import, print_function, unicode_literals - -from contextlib import contextmanager - -import six -import re -import tokenize -from keyword import kwlist - -from six import string_types - -try: - from pathlib import Path -except ImportError: - Path = None - -if six.PY3: - def is_identifier(name): - """ - Tests that the name is a valid Python variable name and does not collide with reserved keywords - - Args: - name (str): Name to test - - Returns: - bool: If the name is 'Pythonic' - - """ - - return name.isidentifier() and name not in kwlist -else: - def is_identifier(name): - """ - Tests that the name is a valid Python variable name and does not collide with reserved keywords - - Args: - name (str): Name to test - - Returns: - bool: If the name is 'Pythonic' - - """ - - return bool(re.match(tokenize.Name + '$', name)) and name not in kwlist - - -@contextmanager -def open_file(file_handle, **kwargs): - """ - Context manager for opening files provided as several different types. 
Supports a file handler as a str, unicode, - pathlib.Path, or an already-opened handler. - - Args: - file_handle (str or unicode or Path or File): The item to be opened or is already open. - **kwargs: Keyword args passed to open. Usually mode='w'. - - Yields: - File: The opened file handler. Automatically closed once out of context. - - """ - opened = False - if isinstance(file_handle, string_types): - f = open(file_handle, **kwargs) - opened = True - elif Path is not None and isinstance(file_handle, Path): - f = file_handle.open(**kwargs) - opened = True - else: - f = file_handle - - try: - yield f - finally: - if opened: - f.close() diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 0000000..756da45 --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,63 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys +sys.path.insert(0, os.path.abspath('..')) + + +# -- Project information ----------------------------------------------------- + +project = 'balsa' +copyright = '2019, WSP, Peter Kucirek' +author = 'WSP, Peter Kucirek' + +# The short X.Y version +version = '1.0' + +# The full version, including alpha/beta/rc tags +release = '1.0' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.napoleon', + 'sphinx.ext.autodoc' +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. 
They are copied after the builtin static files, + so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] diff --git a/doc/index.rst b/doc/index.rst new file mode 100644 index 0000000..5c85276 --- /dev/null +++ b/doc/index.rst @@ -0,0 +1,21 @@ +Balsa: Common modelling tools +============================= + +Balsa is a collection of functions and tools for Python to facilitate travel demand forecasting applications and analyses. It is designed to work with the "scientific stack" of Python, namely NumPy, Pandas, and Matplotlib, which are optimized for speed and usability. Most of `balsa` consists of standalone functions - for input/output, for analysis, etc. - as well as a few lightweight class-based data structures for specific applications. + +Balsa is published by the Systems Analytics for Policy group inside WSP Canada. + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + modules/balsa.routines + modules/balsa.configuration + modules/balsa.logging + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/doc/make.bat b/doc/make.bat new file mode 100644 index 0000000..922152e --- /dev/null +++ b/doc/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/doc/modules/balsa.configuration.rst b/doc/modules/balsa.configuration.rst new file mode 100644 index 0000000..c54e97c --- /dev/null +++ b/doc/modules/balsa.configuration.rst @@ -0,0 +1,10 @@ +Configuration +============= + +Contains the `Config` class for working with JSON model configuration files. Designed to allow modellers to use code to specify the expected contents of a JSON file, such as the names and types of variables, raising errors if the JSON file is not formatted correctly. Also allows such files to include comments. + +Contents +-------- + +.. automodule:: balsa.configuration + :members: diff --git a/doc/modules/balsa.logging.rst b/doc/modules/balsa.logging.rst new file mode 100644 index 0000000..7dbcc0d --- /dev/null +++ b/doc/modules/balsa.logging.rst @@ -0,0 +1,10 @@ +Logging +======= + +Collection of classes and functions for configuring Python Logger objects, re-purposed to provide detailed status messages during model program execution. + +Contents +-------- + +.. automodule:: balsa.logging + :members: diff --git a/doc/modules/balsa.routines.rst b/doc/modules/balsa.routines.rst new file mode 100644 index 0000000..6d856f9 --- /dev/null +++ b/doc/modules/balsa.routines.rst @@ -0,0 +1,44 @@ +Routines +======== + +A collection of standalone functions for performing various modelling tasks. + +General utilities +----------------- + +.. automodule:: balsa.routines.general + :members: + +Modelling analyses and applications +----------------------------------- + +..
automodule:: balsa.routines.modelling + :members: + +Routines for reading binary matrix files +---------------------------------------- + +.. automodule:: balsa.routines.io.inro + :members: + +.. automodule:: balsa.routines.io.omx + :members: + +.. automodule:: balsa.routines.io.fortran + :members: + +.. automodule:: balsa.routines.io.common + :members: + +Routines for working with matrices +---------------------------------- + +.. automodule:: balsa.routines.matrices + :members: + +Routines for plotting +--------------------- + +.. automodule:: balsa.routines.plotting + :members: + diff --git a/setup.py b/setup.py index fb81b5d..81ff187 100644 --- a/setup.py +++ b/setup.py @@ -2,14 +2,14 @@ setup( name='balsa', - version='0.6.1', + version='1.0', packages=find_packages(), - requires=[ - 'pandas', - 'numpy', - 'astor', - 'numba', - 'numexpr', - 'six' + install_requires=[ + 'pandas>=0.21, <0.24', + 'numpy>=1.15', + 'numba>=0.35', + 'numexpr>=2.6', + 'six>=1.10', + 'matplotlib>=3.0' ] )
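For reference, here is a minimal usage sketch of the `map_to_matrix` routine introduced above. It is illustrative only: it assumes `balsa` is installed and that the function is importable from `balsa.routines.modelling` (the module path implied by the new stub file and the Sphinx `automodule` directive); the zone labels and trip values are made-up example data.

```python
import pandas as pd

# Assumed import path, based on the new modelling.pyi stub and the doc layout above
from balsa.routines.modelling import map_to_matrix

# Hypothetical trip totals keyed by a two-level (origin, destination) index
trips = pd.Series(
    [10.0, 5.0, 2.0, 7.0],
    index=pd.MultiIndex.from_tuples([(1, 2), (1, 3), (2, 3), (3, 1)],
                                    names=['origin', 'destination'])
)

# Full set of zone labels, including zone 4, which has no observations
zones = pd.Index([1, 2, 3, 4])

# Aggregates the values by (origin, destination), maps them into a
# zones-by-zones DataFrame, and leaves `fill_value` everywhere else
matrix = map_to_matrix(trips, zones, fill_value=0)
print(matrix.shape)  # (4, 4)
```

The `out` and `out_operand` arguments shown in the function body above can then be used to accumulate several such Series into the same matrix.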