Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Runtime error handling #135

Merged
merged 30 commits into from
Dec 4, 2021
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
f456afb
- Initial enablement of RefPort and VarPorts
PhilippPlank Nov 12, 2021
2d0ec74
- Initial enablement of RefPort and VarPorts
PhilippPlank Nov 12, 2021
25a3a68
- Initial enablement of RefPort and VarPorts
PhilippPlank Nov 12, 2021
a2d1765
- Initial enablement of RefPort and VarPorts
PhilippPlank Nov 12, 2021
74dfdf6
Merge branch 'lava-nc:main' into main
PhilippPlank Nov 15, 2021
1daa9af
- Enablement of RefPorts and VarPorts - addressed change requests fro…
PhilippPlank Nov 16, 2021
5908401
- Enablement of RefPorts and VarPorts - addressed change requests fro…
PhilippPlank Nov 16, 2021
3df2847
- Enablement of RefPorts and VarPorts - addressed change requests fro…
PhilippPlank Nov 16, 2021
9581fae
Merge branch 'lava-nc:main' into main
PhilippPlank Nov 16, 2021
6e4716a
- Enablement of RefPorts and VarPorts - addressed change requests fro…
PhilippPlank Nov 16, 2021
e22def6
Merge branch 'lava-nc:main' into main
PhilippPlank Nov 17, 2021
bd25a80
Merge branch 'lava-nc:main' into main
PhilippPlank Nov 17, 2021
7edeb1f
Merge branch 'lava-nc:main' into main
PhilippPlank Nov 18, 2021
23fb8d7
Merge branch 'lava-nc:main' into main
PhilippPlank Nov 18, 2021
f6686b4
Merge branch 'lava-nc:main' into main
PhilippPlank Nov 23, 2021
86867c2
Merge branch 'lava-nc:main' into main
PhilippPlank Nov 23, 2021
859a195
Merge branch 'lava-nc:main' into main
PhilippPlank Nov 24, 2021
f7007f8
Merge branch 'lava-nc:main' into main
PhilippPlank Nov 25, 2021
0163202
Merge branch 'lava-nc:main' into main
PhilippPlank Nov 25, 2021
bf934ef
modified connection tutorial for release 0.2.0
PhilippPlank Nov 26, 2021
0eef67e
Merge branch 'lava-nc:main' into main
PhilippPlank Nov 26, 2021
f08a35c
fixed typos
PhilippPlank Nov 26, 2021
ceddf2a
Merge branch 'lava-nc:main' into main
PhilippPlank Nov 26, 2021
f798b0a
Merge branch 'lava-nc:main' into main
PhilippPlank Nov 30, 2021
080118a
- Initial commit on passing exception from ProcessModels up to the ru…
PhilippPlank Nov 30, 2021
e0477fb
- Initial commit on passing exception from ProcessModels up to the ru…
PhilippPlank Nov 30, 2021
b011e4f
- Updated commit on passing exception from ProcessModels up to the ru…
PhilippPlank Dec 1, 2021
ad47ff7
- Updated commit on passing exception from ProcessModels up to the ru…
PhilippPlank Dec 3, 2021
eb61ba6
Merge branch 'lava-nc:main' into runtime_error_handling
PhilippPlank Dec 3, 2021
60ef89b
Merge branch 'main' into runtime_error_handling
PhilippPlank Dec 4, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 35 additions & 28 deletions src/lava/magma/core/model/py/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,34 +125,41 @@ def run(self):
self.process_to_service_ack.send(MGMT_RESPONSE.TERMINATED)
self.join()
return
# Spiking phase - increase time step
if enum_equal(phase, PyLoihiProcessModel.Phase.SPK):
self.current_ts += 1
self.run_spk()
self.process_to_service_ack.send(MGMT_RESPONSE.DONE)
# Pre-management phase
elif enum_equal(phase, PyLoihiProcessModel.Phase.PRE_MGMT):
# Enable via guard method
if self.pre_guard():
self.run_pre_mgmt()
self.process_to_service_ack.send(MGMT_RESPONSE.DONE)
# Learning phase
elif enum_equal(phase, PyLoihiProcessModel.Phase.LRN):
# Enable via guard method
if self.lrn_guard():
self.run_lrn()
self.process_to_service_ack.send(MGMT_RESPONSE.DONE)
# Post-management phase
elif enum_equal(phase, PyLoihiProcessModel.Phase.POST_MGMT):
# Enable via guard method
if self.post_guard():
self.run_post_mgmt()
self.process_to_service_ack.send(MGMT_RESPONSE.DONE)
# Host phase - called at the last time step before STOP
elif enum_equal(phase, PyLoihiProcessModel.Phase.HOST):
pass
else:
raise ValueError(f"Wrong Phase Info Received : {phase}")
try:
# Spiking phase - increase time step
if enum_equal(phase, PyLoihiProcessModel.Phase.SPK):
self.current_ts += 1
self.run_spk()
self.process_to_service_ack.send(MGMT_RESPONSE.DONE)
# Pre-management phase
elif enum_equal(phase, PyLoihiProcessModel.Phase.PRE_MGMT):
# Enable via guard method
if self.pre_guard():
self.run_pre_mgmt()
self.process_to_service_ack.send(MGMT_RESPONSE.DONE)
# Learning phase
elif enum_equal(phase, PyLoihiProcessModel.Phase.LRN):
# Enable via guard method
if self.lrn_guard():
self.run_lrn()
self.process_to_service_ack.send(MGMT_RESPONSE.DONE)
# Post-management phase
elif enum_equal(phase, PyLoihiProcessModel.Phase.POST_MGMT):
# Enable via guard method
if self.post_guard():
self.run_post_mgmt()
self.process_to_service_ack.send(MGMT_RESPONSE.DONE)
# Host phase - called at the last time step before STOP
elif enum_equal(phase, PyLoihiProcessModel.Phase.HOST):
pass
else:
raise ValueError(f"Wrong Phase Info Received : {phase}")
except Exception as inst:
# Inform runtime service about termination
self.process_to_service_ack.send(MGMT_RESPONSE.ERROR)
self.join()
raise inst

elif action == 'req':
# Handle get/set Var requests from runtime service
self._handle_get_set_var()
Expand Down
24 changes: 23 additions & 1 deletion src/lava/magma/runtime/message_infrastructure/multiprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
from lava.magma.compiler.builder import PyProcessBuilder, \
AbstractRuntimeServiceBuilder

from multiprocessing import Process as SystemProcess
import multiprocessing as mp
from multiprocessing.managers import SharedMemoryManager
import traceback

from lava.magma.compiler.channels.interfaces import ChannelType, Channel
from lava.magma.compiler.channels.pypychannel import PyPyChannel
Expand All @@ -18,6 +19,27 @@
import MessageInfrastructureInterface


class SystemProcess(mp.Process):
    """Process subclass that reports the outcome of ``run`` through a pipe.

    A pipe is created in the constructor. ``run`` sends ``None`` through it
    when the target finishes normally, or an ``(exception, traceback_text)``
    tuple when the target raises. The outcome is read back through the
    ``exception`` property.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``mp.Process`` and create the pipe."""
        super().__init__(*args, **kwargs)
        # _pconn is polled by ``exception``; _cconn is written to by ``run``.
        self._pconn, self._cconn = mp.Pipe()
        self._exception = None

    def run(self):
        """Run the process target and send its outcome through the pipe.

        Exceptions derived from ``Exception`` are not re-raised here; they
        are sent together with their formatted traceback instead.
        """
        try:
            super().run()
            self._cconn.send(None)
        except Exception as e:
            tb = traceback.format_exc()
            try:
                self._cconn.send((e, tb))
            except Exception:
                # send() pickles its argument. If the exception object
                # cannot be pickled, the report would be lost entirely, so
                # send a plain RuntimeError carrying the original type name
                # and message alongside the unchanged traceback text.
                self._cconn.send(
                    (RuntimeError(f"{type(e).__name__}: {e}"), tb))

    @property
    def exception(self):
        """Return the latest outcome received from ``run``.

        Returns ``None`` if nothing has been received yet or the target
        finished without raising, otherwise an
        ``(exception, traceback_text)`` tuple. The last received value is
        cached, so repeated reads return the same result.
        """
        if self._pconn.poll():
            self._exception = self._pconn.recv()
        return self._exception


class MultiProcessing(MessageInfrastructureInterface):
"""Implements message passing using shared memory and multiprocessing"""
def __init__(self):
Expand Down
4 changes: 3 additions & 1 deletion src/lava/magma/runtime/mgmt_token_enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,5 +65,7 @@ class MGMT_RESPONSE:
"""Signifies Ack or Finished with the Command"""
TERMINATED = enum_to_np(-1)
"""Signifies Termination"""
PAUSED = enum_to_np(-2)
ERROR = enum_to_np(-2)
"""Signifies Error raised"""
PAUSED = enum_to_np(-3)
"""Signifies Execution State to be Paused"""
17 changes: 16 additions & 1 deletion src/lava/magma/runtime/runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,22 @@ def _run(self, run_condition):
for recv_port in self.service_to_runtime_ack:
data = recv_port.recv()
if not enum_equal(data, MGMT_RESPONSE.DONE):
raise RuntimeError(f"Runtime Received {data}")
if enum_equal(data, MGMT_RESPONSE.ERROR):
# Receive all errors from the ProcessModels
error_cnt = 0
for actors in \
self._messaging_infrastructure.actors:
actors.join()
if actors.exception:
_, traceback = actors.exception
print(traceback)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this actually identify the problem from which the error came? Perhaps attach this to the Exception object.
You should print the class name, Process name and id.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be clear: not only the processes but also the RuntimeServices should identify themselves, in case the error happened in a RuntimeService.
Perhaps you even want to distinguish them clearly in the command line prints.

Perhaps you even want to first print a summary of everything that has thrown any errors at the top of the command line to orient the user because otherwise there could be quite a messy stack trace.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The printed traceback leads you to the line of code where the exception occurred, e.g., "line 50, in run_spk
raise AssertionError("All the error info")" -> line 50 is within PyProcModel1. So you know exactly which exception happened and on which code line.

These tracebacks are printed for every exception that occurred in any of the ProcessModels.

error_cnt += 1

raise RuntimeError(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could define your own exceptions: RuntimeError for actual errors in the Runtime and ProcessModelError for ... you know what.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This exception is a RuntimeError, raised because other exceptions occurred in the ProcessModels. The details have already been printed; this is just to stop the Runtime and tell the user that there have been other exceptions (plus the number of them).

For all the details the user only has to scroll upwards in the console.

f"{error_cnt} Exception(s) occurred. See "
f"output above for details.")
else:
raise RuntimeError(f"Runtime Received {data}")
if run_condition.blocking:
self.current_ts += self.num_steps
self._is_running = False
Expand Down
16 changes: 12 additions & 4 deletions src/lava/magma/runtime/runtime_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,11 +228,19 @@ def run(self):
# ProcessModels respond with DONE if not HOST phase
if not enum_equal(
phase, LoihiPyRuntimeService.Phase.HOST):
rsps = self._get_pm_resp()
for rsp in rsps:

for rsp in self._get_pm_resp():
if not enum_equal(rsp, MGMT_RESPONSE.DONE):
raise ValueError(
f"Wrong Response Received : {rsp}")
if enum_equal(rsp, MGMT_RESPONSE.ERROR):
# Forward error to runtime
self.service_to_runtime_ack.send(
MGMT_RESPONSE.ERROR)
# stop all other pm
self._send_pm_cmd(MGMT_COMMAND.STOP)
return
else:
raise ValueError(
f"Wrong Response Received : {rsp}")

# If HOST phase (last time step ended) break the loop
if enum_equal(
Expand Down
146 changes: 146 additions & 0 deletions tests/lava/magma/runtime/test_exception_handling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
# Copyright (C) 2021 Intel Corporation
# SPDX-License-Identifier: BSD-3-Clause
# See: https://spdx.org/licenses/

import unittest

from lava.magma.core.decorator import implements, requires, tag
from lava.magma.core.model.py.model import PyLoihiProcessModel
from lava.magma.core.model.py.ports import PyOutPort, PyInPort
from lava.magma.core.model.py.type import LavaPyType
from lava.magma.core.process.ports.ports import OutPort, InPort
from lava.magma.core.process.process import AbstractProcess
from lava.magma.core.resources import CPU
from lava.magma.core.run_configs import Loihi1SimCfg
from lava.magma.core.sync.protocols.loihi_protocol import LoihiProtocol
from lava.magma.core.run_conditions import RunSteps


# A minimal process with an OutPort
class P1(AbstractProcess):
    """Process with a single OutPort named ``out`` of shape ``(2,)``."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        port_shape = (2,)
        self.out = OutPort(shape=port_shape)


# A minimal process with an InPort
class P2(AbstractProcess):
    """Process with a single InPort named ``inp`` of shape ``(2,)``."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        port_shape = (2,)
        self.inp = InPort(shape=port_shape)


# A minimal process with an InPort
class P3(AbstractProcess):
    """Second process type with a single InPort ``inp`` of shape ``(2,)``."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        port_shape = (2,)
        self.inp = InPort(shape=port_shape)


# A minimal PyProcModel implementing P1
@implements(proc=P1, protocol=LoihiProtocol)
@requires(CPU)
@tag('floating_pt')
class PyProcModel1(PyLoihiProcessModel):
    """ProcessModel for P1 that raises once ``current_ts`` exceeds 1."""
    out: PyOutPort = LavaPyType(PyOutPort.VEC_DENSE, int)

    def run_spk(self):
        """Raise an AssertionError when ``current_ts`` is greater than 1."""
        if self.current_ts <= 1:
            # Nothing to do for the first time step
            return
        raise AssertionError("All the error info")


# A minimal PyProcModel implementing P2
@implements(proc=P2, protocol=LoihiProtocol)
@requires(CPU)
@tag('floating_pt')
class PyProcModel2(PyLoihiProcessModel):
    """ProcessModel for P2 that raises once ``current_ts`` exceeds 1."""
    inp: PyInPort = LavaPyType(PyInPort.VEC_DENSE, int)

    def run_spk(self):
        """Raise a TypeError when ``current_ts`` is greater than 1."""
        if self.current_ts <= 1:
            # Nothing to do for the first time step
            return
        raise TypeError("All the error info")


# A minimal PyProcModel implementing P3
@implements(proc=P3, protocol=LoihiProtocol)
@requires(CPU)
@tag('floating_pt')
class PyProcModel3(PyLoihiProcessModel):
    """ProcessModel for P3 whose ``run_spk`` does nothing."""
    inp: PyInPort = LavaPyType(PyInPort.VEC_DENSE, int)

    def run_spk(self):
        """Intentionally empty: this model never raises."""


class TestExceptionHandling(unittest.TestCase):
    """Checks that exceptions raised in ProcessModels reach the runtime."""

    def _assert_second_step_fails(self, proc, num_errors):
        """Run ``proc`` for two single steps and check the second one fails.

        The first step must complete without an exception. The second step
        must raise exactly a RuntimeError whose message reports
        ``num_errors`` exceptions.
        """
        # Run the network for 1 time step -> no exception
        proc.run(condition=RunSteps(num_steps=1), run_cfg=Loihi1SimCfg())

        # Run the network for another time step -> expect exception
        with self.assertRaises(RuntimeError) as context:
            proc.run(condition=RunSteps(num_steps=1), run_cfg=Loihi1SimCfg())

        exception = context.exception
        # assertRaises also accepts subclasses; require the exact type
        self.assertIs(type(exception), RuntimeError)
        self.assertIn(f'{num_errors} Exception(s) occurred', str(exception))

    def test_one_pm(self):
        """Checks the forwarding of exceptions within a ProcessModel to the
        runtime."""

        # Create an instance of P1
        proc = P1()

        # 1 exception in the ProcessModel expected
        self._assert_second_step_fails(proc, num_errors=1)

    def test_two_pm(self):
        """Checks the forwarding of exceptions within two ProcessModels to
        the runtime."""

        # Create a sender instance of P1 and a receiver instance of P2
        sender = P1()
        recv = P2()

        # Connect sender with receiver
        sender.out.connect(recv.inp)

        # 2 Exceptions in the ProcessModels expected
        self._assert_second_step_fails(sender, num_errors=2)

    def test_three_pm(self):
        """Checks the forwarding of exceptions within three ProcessModels to
        the runtime."""

        # Create a sender instance of P1 and receiver instances of P2 and P3
        sender = P1()
        recv1 = P2()
        recv2 = P3()

        # Connect sender with both receivers
        sender.out.connect([recv1.inp, recv2.inp])

        # 2 Exceptions in the ProcessModels expected (P3's model never raises)
        self._assert_second_step_fails(sender, num_errors=2)


# Run the tests of this module when the file is executed as a script.
# buffer=True makes unittest buffer stdout/stderr while the tests run.
if __name__ == '__main__':
    unittest.main(buffer=True)
3 changes: 0 additions & 3 deletions tests/lava/magma/runtime/test_loihi_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,6 @@ def pre_guard(self):
def lrn_guard(self):
return False

def host_guard(self):
return True


class TestProcess(unittest.TestCase):
def test_synchronization_single_process_model(self):
Expand Down