Merge pull request #62 from iterait/dev
v0.1.2
FloopCZ authored Feb 12, 2019
2 parents 726ac55 + a758b5e commit aaa2b69
Showing 29 changed files with 522 additions and 94 deletions.
31 changes: 16 additions & 15 deletions .circleci/config.yml
@@ -7,31 +7,32 @@ references:
       name: Install dependencies on Ubuntu.
       command: |
         apt-get update -qy
-        apt-get install -y git python3-dev python3-pip curl
+        apt-get install -y git python3-dev python3-numpy python3-pip curl
         pip3 install coveralls coverage
   arch_deps: &arch_deps
     run:
       name: Install dependencies on Arch Linux.
       command: |
-        pacman -Syu --noconfirm python python-pip git base-devel curl
+        pacman -Syu --noconfirm python python-numpy python-pip git curl
         pip install coveralls coverage
-  ubuntu_deps_opencv: &ubuntu_deps_opencv
+  ubuntu_deps_extra_pkgs: &ubuntu_deps_extra_pkgs
     run:
-      name: Install opencv dependencies on Ubuntu.
+      name: Install opencv & scikit dependencies on Ubuntu.
       command: |
         export DEBIAN_FRONTEND=noninteractive
         apt-get install -y libopencv-dev python3-opencv
         ldconfig
-        echo 'export OPENCV=true' >> $BASH_ENV
+        pip3 install scikit-learn
+        echo 'export EXTRA_PKGS=true' >> $BASH_ENV
-  arch_deps_opencv: &arch_deps_opencv
+  arch_deps_extra_pkgs: &arch_deps_extra_pkgs
     run:
-      name: Install opencv dependencies on Arch Linux.
+      name: Install opencv & scikit dependencies on Arch Linux.
       command: |
-        pacman -Syu --noconfirm opencv hdf5 gtk3
-        echo 'export OPENCV=true' >> $BASH_ENV
+        pacman -Syu --noconfirm opencv hdf5 gtk3 python-scikit-learn
+        echo 'export EXTRA_PKGS=true' >> $BASH_ENV
   install: &install
     run:
@@ -83,7 +84,7 @@ jobs:
       - checkout
       - *install
       - *test
-      - *ubuntu_deps_opencv
+      - *ubuntu_deps_extra_pkgs
       - *test

   test_ubuntu_rolling:
@@ -95,24 +96,24 @@
       - checkout
       - *install
       - *test
-      - *ubuntu_deps_opencv
+      - *ubuntu_deps_extra_pkgs
       - *test

   test_archlinux:
     docker:
-      - image: archimg/base-devel
+      - image: archlinux/base
     working_directory: ~/emloop
     steps:
       - *arch_deps
       - checkout
       - *install
       - *test
-      - *arch_deps_opencv
+      - *arch_deps_extra_pkgs
       - *test

   coverage:
     docker:
-      - image: archimg/base-devel
+      - image: archlinux/base
     working_directory: ~/emloop
     steps:
       - *arch_deps
@@ -133,7 +134,7 @@ jobs:

   deploy:
     docker:
-      - image: archimg/base-devel
+      - image: archlinux/base
     working_directory: ~/emloop
     steps:
       - *arch_deps
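A note on the renamed flag: once the optional packages are installed, the CI now exports EXTRA_PKGS=true instead of OPENCV=true. A minimal sketch of how a test suite might gate optional tests on such a flag (hypothetical test names; emloop's actual tests may be organized differently):

    # hypothetical pytest gate on the EXTRA_PKGS flag exported by the CI steps above
    import os
    import pytest

    requires_extra_pkgs = pytest.mark.skipif(
        os.environ.get('EXTRA_PKGS') != 'true',
        reason='optional opencv/scikit-learn packages are not installed',
    )

    @requires_extra_pkgs
    def test_optional_imports():
        import cv2      # available only when the opencv extras are installed
        import sklearn  # installed by the *_extra_pkgs steps above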
1 change: 0 additions & 1 deletion README.md
@@ -32,7 +32,6 @@ For that reason the whole functionality is divided into various extensions with
 At the moment we support the following extensions:

 - [emloop-tensorflow](https://github.com/iterait/emloop-tensorflow) - TensorFlow support
-- [emloop-scikit](https://github.com/iterait/emloop-scikit) - scientific computations and statistics
 - [emloop-rethinkdb](https://github.com/iterait/emloop-rethinkdb) - RethinkDB hook for training management with NoSQL (experimental)

 ## Contributions
2 changes: 1 addition & 1 deletion docs/_base
Submodule _base updated 3 files
+1 −0 LICENSE
+3 −3 _templates/related.html
+5 −5 conf.py
2 changes: 1 addition & 1 deletion emloop/__init__.py
@@ -15,4 +15,4 @@

 __all__ = ['MainLoop']

-__version__ = '0.1.1'
+__version__ = '0.1.2'
4 changes: 2 additions & 2 deletions emloop/cli/common.py
@@ -13,7 +13,7 @@
 from ..hooks import AbstractHook
 from ..constants import EL_LOG_FILE, EL_HOOKS_MODULE, EL_CONFIG_FILE, EL_LOG_DATE_FORMAT, EL_LOG_FORMAT
 from ..utils.reflection import get_class_module, parse_fully_qualified_name, create_object
-from ..utils.yaml import yaml_to_str, yaml_to_file, make_simple
+from ..utils.yaml import yaml_to_str, yaml_to_file
 from ..utils import get_random_name
 from ..utils.training_trace import TrainingTrace, TrainingTraceKeys
 from ..main_loop import MainLoop
@@ -76,7 +76,7 @@ def create_dataset(config: dict, output_dir: Optional[str]=None) -> AbstractDataset:
     """
     logging.info('Creating dataset')

-    dataset_config = make_simple(config)['dataset']
+    dataset_config = dict(config)['dataset']
     assert 'class' in dataset_config, '`dataset.class` not present in the config'
     dataset_module, dataset_class = parse_fully_qualified_name(dataset_config['class'])
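Note that the replacement drops the removed make_simple helper in favour of a plain shallow copy: dict() turns the top-level mapping into a regular dict while nested sections keep their original type. A small illustration (made-up config values):

    import ruamel.yaml

    config = ruamel.yaml.load('dataset: {class: datasets.MyDataset}', ruamel.yaml.RoundTripLoader)
    top = dict(config)           # shallow copy: plain dict at the top level
    print(type(top))             # <class 'dict'>
    print(type(top['dataset']))  # nested section keeps its ruamel type, shared with `config`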
14 changes: 1 addition & 13 deletions emloop/cli/eval.py
@@ -29,25 +29,13 @@ def evaluate(model_path: str, stream_name: str, config_path: Optional[str], cl_arguments
     try:
         model_dir = path.dirname(model_path) if not path.isdir(model_path) else model_path
         config_path = find_config(model_dir if config_path is None else config_path)
-        config = load_config(config_file=config_path, additional_args=cl_arguments)
+        config = load_config(config_file=config_path, additional_args=cl_arguments, override_stream=stream_name)

         if stream_name == EL_PREDICT_STREAM and stream_name in config:  # old style ``emloop predict ...``
             logging.warning('Old style ``predict`` configuration section is deprecated and will not be supported, '
                             'use ``eval.predict`` section instead.')
             config['eval'] = {'predict': config['predict']}

-        if 'eval' in config and stream_name in config['eval']:
-            update_section = config['eval'][stream_name]
-            for subsection in ['dataset', 'model', 'main_loop']:
-                if subsection in update_section:
-                    config[subsection].update(update_section[subsection])
-            if 'hooks' in update_section:
-                config['hooks'] = update_section['hooks']
-            else:
-                logging.warning('Config does not contain `eval.%s.hooks` section. '
-                                'No hook will be employed during the evaluation.', stream_name)
-                config['hooks'] = []
-
         validate_config(config)

         logging.debug('\tLoaded config: %s', config)
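The stream-override logic deleted above evidently moved behind load_config's new override_stream parameter. Judging from the removed code, the override merges the eval.<stream> section into the root config roughly as follows (a sketch of the presumed semantics, not load_config's actual implementation):

    def apply_stream_override(config: dict, stream_name: str) -> dict:
        """Merge the ``eval.<stream_name>`` config section into the root config (presumed semantics)."""
        if 'eval' in config and stream_name in config['eval']:
            update_section = config['eval'][stream_name]
            for subsection in ['dataset', 'model', 'main_loop']:
                if subsection in update_section:
                    config[subsection].update(update_section[subsection])
            # hooks are replaced wholesale rather than merged; a missing section means no hooks run
            config['hooks'] = update_section.get('hooks', [])
        return config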
4 changes: 2 additions & 2 deletions emloop/datasets/base_dataset.py
@@ -8,7 +8,7 @@
 from collections import namedtuple
 import traceback

-import yaml
+import ruamel.yaml
 import tabulate
 import numpy as np

@@ -36,7 +36,7 @@ def __init__(self, config_str: str):
         """
         super().__init__(config_str)

-        config = yaml.load(config_str)
+        config = ruamel.yaml.load(config_str, ruamel.yaml.RoundTripLoader)
         self._configure_dataset(**config)

     @abstractmethod
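The switch from PyYAML to ruamel.yaml's round-trip loader preserves key order and comments in the parsed config (and avoids PyYAML's unsafe yaml.load default). A minimal illustration with a made-up config string:

    import ruamel.yaml

    config_str = 'dataset: {width: 32, height: 32}  # input resolution'
    config = ruamel.yaml.load(config_str, ruamel.yaml.RoundTripLoader)
    assert config['dataset']['width'] == 32
    # a round-trip dump keeps the trailing comment, which plain yaml.load/yaml.dump would lose
    print(ruamel.yaml.dump(config, Dumper=ruamel.yaml.RoundTripDumper))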
34 changes: 18 additions & 16 deletions emloop/hooks/__init__.py
@@ -13,31 +13,33 @@
 """
 from .abstract_hook import AbstractHook, TrainingTerminated
-from .every_n_epoch import EveryNEpoch
 from .accumulate_variables import AccumulateVariables
-from .write_csv import WriteCSV
-from .stop_after import StopAfter
-from .log_variables import LogVariables
-from .log_profile import LogProfile
-from .log_dir import LogDir
-from .save import SaveEvery, SaveBest, SaveLatest
-from .compute_stats import ComputeStats
+from .benchmark import Benchmark
 from .check import Check
-from .show_progress import ShowProgress
-from .on_plateau import OnPlateau
-from .stop_on_plateau import StopOnPlateau
-from .stop_on_nan import StopOnNaN
-from .save_cm import SaveConfusionMatrix
+from .classification_metrics import ClassificationMetrics
+from .compute_stats import ComputeStats
+from .every_n_epoch import EveryNEpoch
 from .flatten import Flatten
-from .plot_lines import PlotLines
+from .log_dir import LogDir
+from .log_profile import LogProfile
+from .log_variables import LogVariables
 from .logits_to_csv import LogitsToCsv
-from .sequence_to_csv import SequenceToCsv
+from .on_plateau import OnPlateau
+from .plot_lines import PlotLines
+from .save import SaveEvery, SaveBest, SaveLatest
+from .save_cm import SaveConfusionMatrix
 from .save_file import SaveFile
+from .sequence_to_csv import SequenceToCsv
+from .show_progress import ShowProgress
+from .stop_after import StopAfter
+from .stop_on_nan import StopOnNaN
+from .stop_on_plateau import StopOnPlateau
+from .write_csv import WriteCSV

 AbstractHook.__module__ = '.hooks'

 __all__ = ['AbstractHook', 'TrainingTerminated', 'AccumulateVariables', 'WriteCSV', 'StopAfter', 'LogVariables',
            'LogProfile', 'LogDir', 'SaveEvery', 'SaveBest', 'SaveLatest', 'ComputeStats', 'Check', 'ShowProgress',
            'EveryNEpoch', 'OnPlateau', 'StopOnPlateau', 'StopOnNaN', 'SaveConfusionMatrix', 'Flatten', 'PlotLines',
-           'LogitsToCsv', 'SequenceToCsv', 'SaveFile']
+           'LogitsToCsv', 'SequenceToCsv', 'SaveFile', 'Benchmark', 'ClassificationMetrics']

14 changes: 14 additions & 0 deletions emloop/hooks/abstract_hook.py
@@ -46,6 +46,8 @@ def __init__(self, **kwargs):
                 logging.warning('Argument `%s` was not recognized by `%s`. Recognized arguments are `%s`.',
                                 key, type(self).__name__, list(inspect.signature(type(self)).parameters.keys()))

+        self._main_loop = None
+
     def before_training(self) -> None:
         """
         Before training event.
@@ -102,3 +104,15 @@ def after_training(self) -> None:
         This method is called exactly once during the training.
         """
         pass
+
+    def register_mainloop(self, main_loop: 'emloop.MainLoop') -> None:
+        """
+        Pass :py:class:`emloop.MainLoop` to hook. Raise :py:class:`ValueError` if MainLoop was already passed before.
+
+        :param main_loop: **emloop** main loop for training
+        :raise ValueError: if MainLoop was already passed before
+        """
+        if self._main_loop is not None:
+            raise ValueError('A MainLoop was already registered with this hook.')
+
+        self._main_loop = main_loop
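With register_mainloop in place, custom hooks can reach back into the running loop. A hypothetical user hook relying on the mechanism (this assumes the main loop calls register_mainloop() during setup, as the guard above suggests):

    import logging

    from emloop.hooks import AbstractHook

    class MainLoopReporter(AbstractHook):
        """Hypothetical hook: log which main loop drives the training."""

        def before_training(self) -> None:
            # self._main_loop was set by register_mainloop() before training started
            logging.info('Training driven by `%s`', type(self._main_loop).__name__)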
44 changes: 44 additions & 0 deletions emloop/hooks/benchmark.py
@@ -0,0 +1,44 @@
+"""
+Hook for benchmarking models and logging average example times.
+"""
+import logging
+import numpy as np
+from typing import List
+
+from . import AbstractHook
+from ..types import TimeProfile
+
+
+class Benchmark(AbstractHook):
+    """
+    Log mean and median example times via standard :py:mod:`logging`.
+
+    .. code-block:: yaml
+        :caption: log mean and median example times after each epoch
+
+        hooks:
+          - Benchmark
+    """
+
+    def __init__(self, batch_size: int, **kwargs):
+        super().__init__(**kwargs)
+        self._batch_size = batch_size
+
+    def after_epoch_profile(self, epoch_id: int, profile: TimeProfile, streams: List[str]):
+        """
+        Log average example times after each epoch.
+
+        The profile is expected to contain at least `eval_batch_{stream}` entry for each logged stream.
+
+        :param epoch_id: number of the processed epoch
+        :param profile: epoch timings profile
+        :param streams: streams for which example times will be logged
+        """
+        for stream_name in streams:
+            batch_times = profile.get('eval_batch_' + stream_name, [])
+            # last batch may be smaller than the other ones, so we drop it to not skew the measurement
+            example_times = list(map(lambda x: x / float(self._batch_size), batch_times[:-1]))
+            logging.info('{} - time per example: mean={:.5f}s, median={:.5f}s'.format(stream_name,
+                                                                                      np.mean(example_times),
+                                                                                      np.median(example_times)))
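For a quick sense of the arithmetic: the hook drops the last (possibly smaller) batch and divides the remaining batch times by the configured batch size. Illustrative numbers only:

    profile = {'eval_batch_train': [0.40, 0.44, 0.10]}  # three batches; the last one was smaller
    batch_size = 4
    example_times = [t / batch_size for t in profile['eval_batch_train'][:-1]]
    print(example_times)  # [0.1, 0.11]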
89 changes: 89 additions & 0 deletions emloop/hooks/classification_metrics.py
@@ -0,0 +1,89 @@
+"""
+Hook computing epoch statistics for classification tasks.
+"""
+
+from typing import Mapping, List, Union, Optional
+import logging
+
+try:
+    import sklearn.metrics as sk
+except ImportError:
+    logging.info('This hook requires SciKit.')
+
+from . import AccumulateVariables
+from ..types import EpochData
+
+
+class ClassificationMetrics(AccumulateVariables):
+    """
+    Accumulate the specified prediction and gt variables and compute their classification statistics after each epoch.
+
+    In particular, accuracy, precisions, recalls, f1s and sometimes specificity (if f1_average is set to 'binary') are
+    computed and saved to epoch data.
+
+    .. warning::
+        Specificity will be computed only if `f1_average` is set to `binary`.
+
+    .. code-block:: yaml
+        :caption: Compute and save classification statistics between model output
+                  `prediction` and stream source `labels`.
+
+        hooks:
+          - ClassificationMetrics:
+              predicted_variable: prediction
+              gt_variable: labels
+    """
+
+    def __init__(self, predicted_variable: str, gt_variable: str, f1_average: Optional[str]=None,
+                 var_prefix: str='', **kwargs):
+        """
+        :param predicted_variable: name of the predicted variable.
+        :param gt_variable: name of the ground truth variable
+        :param f1_average: averaging type {binary, micro, macro, weighted, samples} defined by
+               `sklearn.metrics.precision_recall_fscore_support
+               <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html>`_
+        :param var_prefix: prefix for the output variables to avoid name conflicts; e.g. `classification_`
+        """
+        super().__init__(variables=[predicted_variable, gt_variable], **kwargs)
+
+        self._predicted_variable = predicted_variable
+        self._gt_variable = gt_variable
+        self._f1_average = f1_average
+        self._var_prefix = var_prefix
+
+    def _get_metrics(self, gt: List[float], predicted: List[float]) -> Mapping[str, Union[float, List[float]]]:
+        """Compute accuracy, precision, recall, f1 and sometimes specificity (if f1_average is set to 'binary')."""
+        metrics = {}
+        metrics[self._var_prefix+'precision'], metrics[self._var_prefix+'recall'], metrics[self._var_prefix+'f1'], _ = \
+            sk.precision_recall_fscore_support(gt, predicted, average=self._f1_average)
+        metrics[self._var_prefix+'accuracy'] = sk.accuracy_score(gt, predicted, normalize=True)
+        if self._f1_average == 'binary':
+            tn, fp, fn, tp = sk.confusion_matrix(gt, predicted).ravel()
+            metrics[self._var_prefix+'specificity'] = tn / (tn + fp)
+        return metrics
+
+    def _save_metrics(self, epoch_data: EpochData) -> None:
+        """
+        Compute the classification statistics from the accumulator and save the results to the given epoch data.
+        Set up 'accuracy', 'precision', 'recall', 'f1' and sometimes 'specificity' (if f1_average is set to 'binary')
+        epoch data variables prefixed with self._var_prefix.
+        :param epoch_data: epoch data to save the results to
+        :raise ValueError: if the output variables are already set
+        """
+        for stream_name, stream_data in epoch_data.items():
+            # variables are already checked in the AccumulatingHook; hence, we do not check them here
+            metrics = self._get_metrics(self._accumulator[stream_name][self._gt_variable],
+                                        self._accumulator[stream_name][self._predicted_variable])
+
+            for var_name, var_data in metrics.items():
+                if var_name in stream_data:
+                    raise ValueError('Variable `{}` is set more than once for stream `{}` in epoch data. '
+                                     'Use `var_prefix` parameter to avoid name conflicts.'
+                                     .format(var_name, stream_name))
+
+                stream_data[var_name] = var_data
+
+    def after_epoch(self, epoch_data: EpochData, **kwargs) -> None:
+        """Compute and save the classification statistics and reset the accumulator."""
+        self._save_metrics(epoch_data)
+        super().after_epoch(**kwargs)
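The statistics this hook stores can be reproduced with scikit-learn directly; a standalone sketch for the binary case (made-up labels), where the extra specificity metric applies:

    import sklearn.metrics as sk

    gt = [0, 1, 1, 0, 1]         # ground truth labels
    predicted = [0, 1, 0, 0, 1]  # model predictions

    precision, recall, f1, _ = sk.precision_recall_fscore_support(gt, predicted, average='binary')
    accuracy = sk.accuracy_score(gt, predicted, normalize=True)
    tn, fp, fn, tp = sk.confusion_matrix(gt, predicted).ravel()
    specificity = tn / (tn + fp)
    print(accuracy, precision, recall, f1, specificity)  # ≈ 0.8 1.0 0.667 0.8 1.0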
2 changes: 1 addition & 1 deletion emloop/hooks/log_profile.py
@@ -1,5 +1,5 @@
 """
-Module with a hook which reports the time profile data in the stanard logging.
+Module with a hook which reports the time profile data in the standard logging.
 """
 import logging
 from itertools import chain