Skip to content

Commit

Permalink
Legacy checkpoint read, fix checkpoint re-training issue (IBM#434)
Browse files Browse the repository at this point in the history
Signed-off-by: Henry Ye <yehenry11@gmail.com>
  • Loading branch information
maljoras authored and HCY-11 committed Dec 7, 2022
1 parent be7c5a0 commit c1ccc01
Show file tree
Hide file tree
Showing 17 changed files with 266 additions and 392 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ src/aihwkit/simulator/*.so
# Temporary folder for the example downloads.
data/

# Folder for cibuildwheel.
wheelhouse/

## From https://github.com/github/gitignore/blob/master/Python.gitignore

# Byte-compiled / optimized / DLL files
Expand Down
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ The format is based on [Keep a Changelog], and this project adheres to
* `WeightModifiers` of the `InferenceRPUConfig` are no longer called
in the forward pass, but instead in the `post_update_step`
method to avoid issues with repeated forward calls. (\#423)
* Fix training `learn_out_scales` issue after checkpoint load. (\#434)

### Changed

Expand All @@ -81,7 +82,7 @@ The format is based on [Keep a Changelog], and this project adheres to
### Removed

* The `_scaled` versions of the weight getter and setter methods are
removed (\#423)
removed (\#423)


## [0.6.0] - 2022/05/16
Expand Down
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@ project(aihwkit C CXX)

# Project options.
option(BUILD_TEST "Build C++ test binaries" OFF)
option(USE_CUDA "Build with CUDA support" OFF)
option(USE_CUDA "Build with CUDA support" $ENV{USE_CUDA})
option(RPU_DEBUG "Enable debug printing" OFF)
option(RPU_USE_FASTMOD "Use fast mod" ON)
option(RPU_USE_FASTRAND "Use fastrand" OFF)

set(RPU_BLAS "OpenBLAS" CACHE STRING "BLAS backend of choice (OpenBLAS, MKL)")
set(RPU_CUDA_ARCHITECTURES "60" CACHE STRING "Target CUDA architectures")
set(RPU_CUDA_ARCHITECTURES "60;70;75;80" CACHE STRING "Target CUDA architectures")

# Internal variables.
set(CUDA_TARGET_PROPERTIES POSITION_INDEPENDENT_CODE ON
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Build dependencies.
cmake>=3.18
setuptools<=46.0; python_version < '3.9'
scikit-build>=0.11.1
pybind11>=2.6.2
# Runtime dependencies.
Expand Down
2 changes: 1 addition & 1 deletion src/aihwkit/VERSION.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.6.0
0.6.2
19 changes: 13 additions & 6 deletions src/aihwkit/cloud/client/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@

from aihwkit.cloud.client.exceptions import ExperimentStatusError
from aihwkit.cloud.converter.definitions.input_file_pb2 import TrainingInput
from aihwkit.cloud.converter.definitions.i_input_file_pb2 import InferenceInput
from aihwkit.cloud.converter.definitions.output_file_pb2 import TrainingOutput
from aihwkit.cloud.converter.v1.training import BasicTrainingConverter, BasicTrainingResultConverter
from aihwkit.experiments import BasicTraining
from aihwkit.cloud.converter.v1.inferencing import BasicInferencingConverter
# from aihwkit.experiments import BasicTraining, BasicInferencing


class CloudJobStatus(Enum):
Expand Down Expand Up @@ -62,7 +64,7 @@ class CloudExperiment:
input_id: Optional[str] = field(repr=False)
job: Optional[CloudJob] = field(repr=False)

def get_experiment(self) -> BasicTraining:
def get_experiment(self) -> Any:
"""Return a data Experiment.
Returns:
Expand All @@ -76,11 +78,16 @@ def get_experiment(self) -> BasicTraining:

input_ = self._api_client.input_get(self.input_id)

input_proto = TrainingInput()
input_proto.ParseFromString(input_)
if 'InferenceRPUConfig' in str(input_):
input_proto = InferenceInput()
input_proto.ParseFromString(input_)
proto = BasicInferencingConverter().from_proto(input_proto)
else:
input_proto = TrainingInput()
input_proto.ParseFromString(input_)
proto = BasicTrainingConverter().from_proto(input_proto)

converter = BasicTrainingConverter()
return converter.from_proto(input_proto)
return proto

def get_result(self) -> list:
"""Return the result of an Experiment.
Expand Down
33 changes: 27 additions & 6 deletions src/aihwkit/cloud/converter/v1/i_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,13 @@

from aihwkit.simulator.configs import InferenceRPUConfig
from aihwkit.simulator.presets.web import (
WebComposerInferenceRPUConfig, OldWebComposerInferenceRPUConfig
)
WebComposerInferenceRPUConfig, OldWebComposerInferenceRPUConfig)
from aihwkit.cloud.converter.definitions.i_onnx_common_pb2 import AttributeProto
from aihwkit.cloud.converter.exceptions import ConversionError
from aihwkit.nn import AnalogConv2d, AnalogLinear
from aihwkit.nn import (
AnalogConv2d, AnalogConv2dMapped,
AnalogLinear, AnalogLinearMapped
)
from aihwkit.optim import AnalogSGD
from aihwkit.cloud.converter.v1.rpu_config_info import RPUconfigInfo

Expand Down Expand Up @@ -139,10 +141,13 @@ def get_field_value_to_proto(self, source: Any, field: str, default: Any = None)
if field == 'bias':
return getattr(source, 'bias', None) is not None
if field == 'rpu_config':
preset_cls = type(source.analog_tile.rpu_config)
# preset_cls = type(source.analog_tile.rpu_config)
analog_tile = next(source.analog_tiles())
preset_cls = type(analog_tile.rpu_config)
if preset_cls not in Mappings.presets:
raise ConversionError('Invalid rpu_config in layer: {} not '
'among the presets'.format(preset_cls))
raise ConversionError('Invalid rpu_config in layer: '
f'{preset_cls} not '
'among the presets')
return Mappings.presets[preset_cls]
return super().get_field_value_to_proto(source, field, default)

Expand Down Expand Up @@ -182,12 +187,28 @@ class Mappings:
'bias': bool,
'rpu_config': str,
}),
AnalogConv2dMapped: LayerFunction('AnalogConv2dMapped', {
'in_channels': int,
'out_channels': int,
'kernel_size': [int],
'stride': [int],
'padding': [int],
'dilation': [int],
'bias': bool,
'rpu_config': str,
}),
AnalogLinear: LayerFunction('AnalogLinear', {
'in_features': int,
'out_features': int,
'bias': bool,
'rpu_config': str,
}),
AnalogLinearMapped: LayerFunction('AnalogLinearMapped', {
'in_features': int,
'out_features': int,
'bias': bool,
'rpu_config': str,
}),
BatchNorm2d: LayerFunction('BatchNorm2d', {
'num_features': int
}),
Expand Down
16 changes: 9 additions & 7 deletions src/aihwkit/cloud/converter/v1/inferencing.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,11 @@ def from_proto(self, protobuf: Any) -> BasicInferencing:
"""Convert a protobuf representation to an `Experiment`."""

dataset = InverseMappings.datasets[protobuf.dataset.dataset_id]

layers = protobuf.network.layers
# build RPUconfig_info to be used when it is instantiated dynamically
alog_info = AnalogInfo(protobuf.inferencing.analog_info)
nm_info = NoiseModelInfo(protobuf.inferencing.noise_model_info)
rc_info = RPUconfigInfo(nm_info, alog_info)
rc_info = RPUconfigInfo(nm_info, alog_info, layers)

model = self._model_from_proto(protobuf.network, rc_info)

Expand Down Expand Up @@ -104,8 +104,8 @@ def _version_to_proto() -> Any:

@staticmethod
def _dataset_to_proto(dataset: type, batch_size: int) -> Any:
if dataset not in Mappings.datasets.keys():
raise ConversionError('Unsupported dataset: {}'.format(dataset))
if dataset not in Mappings.datasets:
raise ConversionError(f'Unsupported dataset: {dataset}')

return Dataset(
dataset_id=Mappings.datasets[dataset],
Expand All @@ -121,7 +121,8 @@ def _model_to_proto(model: Module, weight_template_id: str) -> Any:
children_types = {type(layer) for layer in model.children()}
valid_types = set(Mappings.layers.keys()) | set(Mappings.activation_functions.keys())
if children_types - valid_types:
raise ConversionError('Unsupported layers: {}'.format(children_types - valid_types))
raise ConversionError('Unsupported layers: '
f'{children_types - valid_types}')

# Create a new input_file pb Network object with weight_template_id
network = Network(weight_template_id=weight_template_id)
Expand Down Expand Up @@ -189,7 +190,7 @@ def rpu_config_info_from_info(analog_info: Dict,
nm_info = NoiseModelInfo(BasicInferencingConverter._noise_model_to_proto(
noise_model_info)) # type: ignore[name-defined]
a_info = AnalogInfo(AnalogProto(**analog_info))
return RPUconfigInfo(nm_info, a_info)
return RPUconfigInfo(nm_info, a_info, None)

@staticmethod
def rpu_config_from_info(analog_info: Dict,
Expand All @@ -200,7 +201,8 @@ def rpu_config_from_info(analog_info: Dict,
nm_info = NoiseModelInfo(BasicInferencingConverter._noise_model_to_proto(
noise_model_info)) # type: ignore[name-defined]
a_info = AnalogInfo(AnalogProto(**analog_info))
return RPUconfigInfo(nm_info, a_info).create_inference_rpu_config(func_id)
return RPUconfigInfo(nm_info,
a_info, None).create_inference_rpu_config(func_id)

@staticmethod
def _inferencing_to_proto(
Expand Down
76 changes: 68 additions & 8 deletions src/aihwkit/cloud/converter/v1/rpu_config_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,34 +11,81 @@
# that they have been altered from the originals.

"""Creates InferenceRPUConfig to add to nn model"""

from typing import Dict, Any
from collections import OrderedDict

from aihwkit.simulator.configs.configs import InferenceRPUConfig
from aihwkit.simulator.presets.web import OldWebComposerInferenceRPUConfig
from aihwkit.simulator.presets.web import (
WebComposerInferenceRPUConfig,
OldWebComposerInferenceRPUConfig
)
from aihwkit.inference.noise.pcm import PCMLikeNoiseModel
from aihwkit.inference.noise.custom import StateIndependentNoiseModel
from aihwkit.inference.compensation.drift import GlobalDriftCompensation
from aihwkit.cloud.converter.v1.analog_info import AnalogInfo
from aihwkit.cloud.converter.v1.noise_model_info import NoiseModelInfo

# pylint: disable=too-few-public-methods
RPU_CLASSES = {
'InferenceRPUConfig': InferenceRPUConfig,
'WebComposerInferenceRPUConfig': WebComposerInferenceRPUConfig,
'OldWebComposerInferenceRPUConfig': OldWebComposerInferenceRPUConfig
}


# pylint: disable=too-few-public-methods
class NoiseModelDeviceIDException(Exception):
"""Exception raised if noise model device id is not correct"""


class RPUconfigInfo:
"""Data only class for RPUConfig fields"""

def __init__(self, nm_info: NoiseModelInfo, a_info: AnalogInfo):
""""Constructor for this class"""

def __init__(self, nm_info: NoiseModelInfo,
a_info: AnalogInfo,
layers: Any = None):
"""
The only constructor for this class
"""
self._noise_model_info = nm_info
self._analog_info = a_info
self._layers = layers
self._device_id = ''

@staticmethod
def _get_common_rpucfg_name(layers: Any) -> Any:
    """Return the rpu_config class name shared by all analog layers.

    Args:
        layers: iterable of protobuf layer entries, or ``None``.

    Returns:
        ``'WebComposerInferenceRPUConfig'`` when ``layers`` is ``None``;
        the UTF-8 decoded name when exactly one distinct ``rpu_config``
        value is found among the analog layers; ``None`` (after printing
        an error) when more than one distinct value is found; an empty
        string (after printing a notice) when no value is found.
    """
    # No layers given: use the default RPU config for the Composer.
    if layers is None:
        return 'WebComposerInferenceRPUConfig'

    # Number of occurrences of every rpu_config value seen in the
    # analog layers. Keys are the raw values of the ``s`` attribute.
    counts: Dict[str, int] = {}
    for entry in layers:  # type: ignore[attr-defined]
        # Only 'layer' items are inspected.
        if entry.WhichOneof('item') != 'layer':
            continue
        candidate = entry.layer
        if not candidate.id.startswith('Analog'):
            continue
        # Loop through all AttributeProto objects in layer.arguments.
        for attribute in candidate.arguments:
            if attribute.name != 'rpu_config':
                continue
            # The value is stored as a UTF-8 byte string in attribute ``s``.
            key = attribute.s
            counts[key] = counts.get(key, 0) + 1

    # There should be exactly one distinct rpu_config in ``counts``.
    distinct = len(counts)
    if distinct == 1:
        (only_name,) = counts
        return only_name.decode('UTF-8')  # type: ignore[attr-defined]
    if distinct > 1:
        print(f'>>> ERROR: more than one rpu_config: {counts}')
        return None
    print('>>> INFO: experiment has not analog layers')
    return ''

def _print_rpu_config(
self,
rpu_config: InferenceRPUConfig,
Expand Down Expand Up @@ -92,9 +139,22 @@ def _print_rpu_config(
def create_inference_rpu_config(self, func_id: str,
verbose: bool = False) -> InferenceRPUConfig:
"""Creates a InferenceRPUConfig class using noise and analog info"""
# Need to find name of 'common-rpu-conf-class-name' in protobuf
# This should be consistent across all layers.
# The Composer Validator should have already caught this but
# it is checked here for testcases and other unknown environments
rpu_class_name = self._get_common_rpucfg_name(self._layers)
print(f'>>> INFO: rpu_class_name={rpu_class_name}')
if rpu_class_name is None or len(rpu_class_name) == 0:
raise Exception('class name error. see previous messages')
rpu_config_class = None
if rpu_class_name in RPU_CLASSES:
rpu_config_class = RPU_CLASSES[rpu_class_name]
else:
raise Exception(f"rpu class name '{rpu_class_name}' not one of '{RPU_CLASSES.keys()}'")

rpu_config = OldWebComposerInferenceRPUConfig()

# Dynamically create the right InferenceRPUConfig class
rpu_config = rpu_config_class()
# Assign values from AnalogProto
rpu_config.forward.out_noise = self._analog_info.output_noise_strength

Expand Down
6 changes: 3 additions & 3 deletions src/aihwkit/experiments/experiments/inferencing.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def get_model(
if weight_template_id.startswith('http'):
template_url = weight_template_id
else:
print('weights_template_id: ', weight_template_id)
# print('weights_template_id: ', weight_template_id)
template_path = template_dir + "/" + weight_template_id + ".pth"
template_url = WEIGHT_TEMPLATE_URL + weight_template_id + ".pth"
# check if the file exists
Expand All @@ -194,7 +194,7 @@ def get_model(
if not path.exists(template_path):
download(template_url, template_path)

print('template_path: ', template_path)
# print('template_path: ', template_path)
if path.exists(template_path):
model.load_state_dict(load(template_path, map_location=device),
load_rpu_config=False)
Expand All @@ -203,7 +203,7 @@ def get_model(

if self.remap_weights:
for module in model.analog_modules():
module.remap_weights()
module.remap_weights(1.0)

return model.to(device)

Expand Down
Loading

0 comments on commit c1ccc01

Please sign in to comment.