From de817c2c973c9464908009af722ecdec7999ba4a Mon Sep 17 00:00:00 2001 From: quzha Date: Wed, 1 Jan 2020 12:03:48 +0800 Subject: [PATCH 01/33] update --- .../compression/speedup/torch/compressor.py | 406 ++++++++++++++++++ 1 file changed, 406 insertions(+) create mode 100644 src/sdk/pynni/nni/compression/speedup/torch/compressor.py diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py new file mode 100644 index 0000000000..b3711302e2 --- /dev/null +++ b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py @@ -0,0 +1,406 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import logging +import torch +from . import default_layers + +_logger = logging.getLogger(__name__) + + +class LayerInfo: + def __init__(self, name, module): + self.module = module + self.name = name + self.type = type(module).__name__ + + self._forward = None + + +class ModelSpeedup: + """ + Abstract base PyTorch ModelSpeedup + """ + + def __init__(self, model, masks): + """ + Record necessary info in class members + + Parameters + ---------- + model : pytorch model + the model user wants to compress + masks : dict + the generated masks for modules, + key is module name, + value is a dict including key `weight`, or also key `bias` + """ + self.bound_model = model + self.masks = masks + + def expand_masks(self): + """ + detect all modules should be compressed, and save the result in `self.modules_to_compress`. + The model will be instrumented and user should never edit it after calling this method. + """ + if self.modules_to_compress is None: + self.modules_to_compress = [] + for name, module in self.bound_model.named_modules(): + layer = LayerInfo(name, module) + config = self.select_config(layer) + if config is not None: + self.modules_to_compress.append((layer, config)) + return self.modules_to_compress + + def compress_modules(self): + """ + Compress the model with algorithm implemented by subclass. + + The model will be instrumented and user should never edit it after calling this method. + `self.modules_to_compress` records all the to-be-compressed layers + """ + modules_to_compress = self.detect_modules_to_compress() + for layer, config in modules_to_compress: + self._instrument_layer(layer, config) + return self.bound_model + + def get_modules_to_compress(self): + """ + To obtain all the to-be-compressed layers. + + Returns + ------- + list + a list of the layers, each of which is a tuple (`layer`, `config`), + `layer` is `LayerInfo`, `config` is a `dict` + """ + return self.modules_to_compress + + def select_config(self, layer): + """ + Find the configuration for `layer` by parsing `self.config_list` + + Parameters + ---------- + layer : LayerInfo + one layer + + Returns + ------- + config or None + the retrieved configuration for this layer, if None, this layer should + not be compressed + """ + ret = None + for config in self.config_list: + config = config.copy() + config['op_types'] = self._expand_config_op_types(config) + if layer.type not in config['op_types']: + continue + if config.get('op_names') and layer.name not in config['op_names']: + continue + ret = config + if ret is None or ret.get('exclude'): + return None + return ret + + def update_epoch(self, epoch): + """ + If user want to update model every epoch, user can override this method. 
+ This method should be called at the beginning of each epoch + + Parameters + ---------- + epoch : num + the current epoch number + """ + + def step(self): + """ + If user want to update model every step, user can override this method + """ + + def _instrument_layer(self, layer, config): + """ + This method is implemented in the subclasses, i.e., `Pruner` and `Quantizer` + + Parameters + ---------- + layer : LayerInfo + the layer to instrument the compression operation + config : dict + the configuration for compressing this layer + """ + raise NotImplementedError() + + def _expand_config_op_types(self, config): + if config is None: + return [] + expanded_op_types = [] + for op_type in config.get('op_types', []): + if op_type == 'default': + expanded_op_types.extend(default_layers.weighted_modules) + else: + expanded_op_types.append(op_type) + return expanded_op_types + + +class Pruner(Compressor): + """ + Prune to an exact pruning level specification + + Attributes + ---------- + mask_dict : dict + Dictionary for saving masks, `key` should be layer name and + `value` should be a tensor which has the same shape with layer's weight + + """ + + def __init__(self, model, config_list): + super().__init__(model, config_list) + self.mask_dict = {} + + def calc_mask(self, layer, config): + """ + Pruners should overload this method to provide mask for weight tensors. + The mask must have the same shape and type comparing to the weight. + It will be applied with `mul()` operation on the weight. + This method is effectively hooked to `forward()` method of the model. + + Parameters + ---------- + layer : LayerInfo + calculate mask for `layer`'s weight + config : dict + the configuration for generating the mask + """ + raise NotImplementedError("Pruners must overload calc_mask()") + + def _instrument_layer(self, layer, config): + """ + Create a wrapper forward function to replace the original one. 
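+        The wrapper calls `calc_mask` on each forward pass and multiplies the
+        resulting masks into the module's weight (and bias, when present) before
+        delegating to the original forward function.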
+ + Parameters + ---------- + layer : LayerInfo + the layer to instrument the mask + config : dict + the configuration for generating the mask + """ + assert layer._forward is None, 'Each model can only be compressed once' + if not _check_weight(layer.module): + _logger.warning('Module %s does not have parameter "weight"', layer.name) + return + layer._forward = layer.module.forward + + def new_forward(*inputs): + mask = self.calc_mask(layer, config) + # apply mask to weight + old_weight = layer.module.weight.data + mask_weight = mask['weight'] + layer.module.weight.data = old_weight.mul(mask_weight) + # apply mask to bias + if mask.__contains__('bias') and hasattr(layer.module, 'bias') and layer.module.bias is not None: + old_bias = layer.module.bias.data + mask_bias = mask['bias'] + layer.module.bias.data = old_bias.mul(mask_bias) + # calculate forward + ret = layer._forward(*inputs) + return ret + + layer.module.forward = new_forward + + def export_model(self, model_path, mask_path=None, onnx_path=None, input_shape=None): + """ + Export pruned model weights, masks and onnx model(optional) + + Parameters + ---------- + model_path : str + path to save pruned model state_dict + mask_path : str + (optional) path to save mask dict + onnx_path : str + (optional) path to save onnx model + input_shape : list or tuple + input shape to onnx model + """ + if self.detect_modules_to_compress() and not self.mask_dict: + _logger.warning('You may not use self.mask_dict in base Pruner class to record masks') + assert model_path is not None, 'model_path must be specified' + for name, m in self.bound_model.named_modules(): + if name == "": + continue + masks = self.mask_dict.get(name) + if masks is not None: + mask_sum = masks['weight'].sum().item() + mask_num = masks['weight'].numel() + _logger.info('Layer: %s Sparsity: %.2f', name, 1 - mask_sum / mask_num) + m.weight.data = m.weight.data.mul(masks['weight']) + if masks.__contains__('bias') and hasattr(m, 'bias') and m.bias is not None: + m.bias.data = m.bias.data.mul(masks['bias']) + else: + _logger.info('Layer: %s NOT compressed', name) + torch.save(self.bound_model.state_dict(), model_path) + _logger.info('Model state_dict saved to %s', model_path) + if mask_path is not None: + torch.save(self.mask_dict, mask_path) + _logger.info('Mask dict saved to %s', mask_path) + if onnx_path is not None: + assert input_shape is not None, 'input_shape must be specified to export onnx model' + # input info needed + input_data = torch.Tensor(*input_shape) + torch.onnx.export(self.bound_model, input_data, onnx_path) + _logger.info('Model in onnx with input shape %s saved to %s', input_data.shape, onnx_path) + + +class Quantizer(Compressor): + """ + Base quantizer for pytorch quantizer + """ + + def __init__(self, model, config_list): + super().__init__(model, config_list) + self.quant_grad = QuantGrad + + def quantize_weight(self, weight, config, op, op_type, op_name): + """ + quantize should overload this method to quantize weight. + This method is effectively hooked to :meth:`forward` of the model. + Parameters + ---------- + weight : Tensor + weight that needs to be quantized + config : dict + the configuration for weight quantization + """ + raise NotImplementedError('Quantizer must overload quantize_weight()') + + def quantize_output(self, output, config, op, op_type, op_name): + """ + quantize should overload this method to quantize output. + This method is effectively hooked to :meth:`forward` of the model. 
+ Parameters + ---------- + output : Tensor + output that needs to be quantized + config : dict + the configuration for output quantization + """ + raise NotImplementedError('Quantizer must overload quantize_output()') + + def quantize_input(self, *inputs, config, op, op_type, op_name): + """ + quantize should overload this method to quantize input. + This method is effectively hooked to :meth:`forward` of the model. + Parameters + ---------- + inputs : Tensor + inputs that needs to be quantized + config : dict + the configuration for inputs quantization + """ + raise NotImplementedError('Quantizer must overload quantize_input()') + + + def _instrument_layer(self, layer, config): + """ + Create a wrapper forward function to replace the original one. + Parameters + ---------- + layer : LayerInfo + the layer to instrument the mask + config : dict + the configuration for quantization + """ + assert layer._forward is None, 'Each model can only be compressed once' + assert 'quant_types' in config, 'must provide quant_types in config' + assert isinstance(config['quant_types'], list), 'quant_types must be list type' + assert 'quant_bits' in config, 'must provide quant_bits in config' + assert isinstance(config['quant_bits'], int) or isinstance(config['quant_bits'], dict), 'quant_bits must be dict type or int type' + + if isinstance(config['quant_bits'], dict): + for quant_type in config['quant_types']: + assert quant_type in config['quant_bits'], 'bits length for %s must be specified in quant_bits dict' % quant_type + + if 'weight' in config['quant_types']: + if not _check_weight(layer.module): + _logger.warning('Module %s does not have parameter "weight"', layer.name) + else: + # old_weight is used to store origin weight and weight is used to store quantized weight + # the reason why weight is buffer instead of parameter is because in pytorch parameter is used as leaf + # if weight is leaf , then old_weight can not be updated. + layer.module.register_parameter('old_weight', torch.nn.Parameter(layer.module.weight)) + delattr(layer.module, 'weight') + layer.module.register_buffer('weight', layer.module.old_weight) + + layer._forward = layer.module.forward + + def new_forward(*inputs): + if 'input' in config['quant_types']: + inputs = self.quant_grad.apply(inputs, QuantType.QUANT_INPUT, self.quantize_input, config, layer) + + if 'weight' in config['quant_types'] and _check_weight(layer.module): + new_weight = self.quant_grad.apply(layer.module.old_weight, QuantType.QUANT_WEIGHT, self.quantize_weight, config, layer) + layer.module.weight = new_weight + result = layer._forward(*inputs) + else: + result = layer._forward(*inputs) + + if 'output' in config['quant_types']: + result = self.quant_grad.apply(result, QuantType.QUANT_OUTPUT, self.quantize_output, config, layer) + return result + + layer.module.forward = new_forward + +class QuantType: + """ + Enum class for quantization type. + """ + QUANT_INPUT = 0 + QUANT_WEIGHT = 1 + QUANT_OUTPUT = 2 + +class QuantGrad(torch.autograd.Function): + """ + Base class for overriding backward function of quantization operation. 
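+    Subclasses customize the gradient by overriding `quant_backward`; the
+    default implementation below acts as a straight-through estimator.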
+ """ + @staticmethod + def quant_backward(tensor, grad_output, quant_type): + """ + This method should be overrided by subclass to provide customized backward function, + default implementation is Straight-Through Estimator + Parameters + ---------- + tensor : Tensor + input of quantization operation + grad_output : Tensor + gradient of the output of quantization operation + quant_type : QuantType + the type of quantization, it can be `QuantType.QUANT_INPUT`, `QuantType.QUANT_WEIGHT`, `QuantType.QUANT_OUTPUT`, + you can define different behavior for different types. + Returns + ------- + tensor + gradient of the input of quantization operation + """ + return grad_output + + @staticmethod + def forward(ctx, tensor, quant_type, quant_func, config, layer): + ctx.save_for_backward(tensor, torch.Tensor([quant_type])) + return quant_func(tensor, config, op=layer.module, op_type=layer.type, op_name=layer.name) + + @classmethod + def backward(cls, ctx, grad_output): + tensor, quant_type = ctx.saved_variables + output = cls.quant_backward(tensor, grad_output, quant_type) + return output, None, None, None, None + +def _check_weight(module): + try: + return isinstance(module.weight.data, torch.Tensor) + except AttributeError: + return False + \ No newline at end of file From 9e4a3d9c3ea1b837fc7c72e9c118706d0d6c1379 Mon Sep 17 00:00:00 2001 From: quzha Date: Fri, 10 Jan 2020 09:22:02 +0800 Subject: [PATCH 02/33] update --- .../compression/speedup/torch/compressor.py | 164 +----------------- .../pynni/nni/compression/torch/compressor.py | 8 +- 2 files changed, 15 insertions(+), 157 deletions(-) diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py index b3711302e2..29e66c0920 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py @@ -2,6 +2,7 @@ # Licensed under the MIT license. import logging +import onnx import torch from . import default_layers @@ -22,7 +23,7 @@ class ModelSpeedup: Abstract base PyTorch ModelSpeedup """ - def __init__(self, model, masks): + def __init__(self, model, dummy_input, masks_file): """ Record necessary info in class members @@ -34,15 +35,19 @@ def __init__(self, model, masks): the generated masks for modules, key is module name, value is a dict including key `weight`, or also key `bias` + onnx_graph : xxx + it is used to parse dependencies between modules """ self.bound_model = model - self.masks = masks + self.masks = torch.load(masks_file) + self.model_graph = torch.jit.trace(model, dummy_input) def expand_masks(self): """ - detect all modules should be compressed, and save the result in `self.modules_to_compress`. - The model will be instrumented and user should never edit it after calling this method. 
""" + for name, mask in self.masks: + print(name) + if self.modules_to_compress is None: self.modules_to_compress = [] for name, module in self.bound_model.named_modules(): @@ -253,154 +258,3 @@ def export_model(self, model_path, mask_path=None, onnx_path=None, input_shape=N input_data = torch.Tensor(*input_shape) torch.onnx.export(self.bound_model, input_data, onnx_path) _logger.info('Model in onnx with input shape %s saved to %s', input_data.shape, onnx_path) - - -class Quantizer(Compressor): - """ - Base quantizer for pytorch quantizer - """ - - def __init__(self, model, config_list): - super().__init__(model, config_list) - self.quant_grad = QuantGrad - - def quantize_weight(self, weight, config, op, op_type, op_name): - """ - quantize should overload this method to quantize weight. - This method is effectively hooked to :meth:`forward` of the model. - Parameters - ---------- - weight : Tensor - weight that needs to be quantized - config : dict - the configuration for weight quantization - """ - raise NotImplementedError('Quantizer must overload quantize_weight()') - - def quantize_output(self, output, config, op, op_type, op_name): - """ - quantize should overload this method to quantize output. - This method is effectively hooked to :meth:`forward` of the model. - Parameters - ---------- - output : Tensor - output that needs to be quantized - config : dict - the configuration for output quantization - """ - raise NotImplementedError('Quantizer must overload quantize_output()') - - def quantize_input(self, *inputs, config, op, op_type, op_name): - """ - quantize should overload this method to quantize input. - This method is effectively hooked to :meth:`forward` of the model. - Parameters - ---------- - inputs : Tensor - inputs that needs to be quantized - config : dict - the configuration for inputs quantization - """ - raise NotImplementedError('Quantizer must overload quantize_input()') - - - def _instrument_layer(self, layer, config): - """ - Create a wrapper forward function to replace the original one. - Parameters - ---------- - layer : LayerInfo - the layer to instrument the mask - config : dict - the configuration for quantization - """ - assert layer._forward is None, 'Each model can only be compressed once' - assert 'quant_types' in config, 'must provide quant_types in config' - assert isinstance(config['quant_types'], list), 'quant_types must be list type' - assert 'quant_bits' in config, 'must provide quant_bits in config' - assert isinstance(config['quant_bits'], int) or isinstance(config['quant_bits'], dict), 'quant_bits must be dict type or int type' - - if isinstance(config['quant_bits'], dict): - for quant_type in config['quant_types']: - assert quant_type in config['quant_bits'], 'bits length for %s must be specified in quant_bits dict' % quant_type - - if 'weight' in config['quant_types']: - if not _check_weight(layer.module): - _logger.warning('Module %s does not have parameter "weight"', layer.name) - else: - # old_weight is used to store origin weight and weight is used to store quantized weight - # the reason why weight is buffer instead of parameter is because in pytorch parameter is used as leaf - # if weight is leaf , then old_weight can not be updated. 
- layer.module.register_parameter('old_weight', torch.nn.Parameter(layer.module.weight)) - delattr(layer.module, 'weight') - layer.module.register_buffer('weight', layer.module.old_weight) - - layer._forward = layer.module.forward - - def new_forward(*inputs): - if 'input' in config['quant_types']: - inputs = self.quant_grad.apply(inputs, QuantType.QUANT_INPUT, self.quantize_input, config, layer) - - if 'weight' in config['quant_types'] and _check_weight(layer.module): - new_weight = self.quant_grad.apply(layer.module.old_weight, QuantType.QUANT_WEIGHT, self.quantize_weight, config, layer) - layer.module.weight = new_weight - result = layer._forward(*inputs) - else: - result = layer._forward(*inputs) - - if 'output' in config['quant_types']: - result = self.quant_grad.apply(result, QuantType.QUANT_OUTPUT, self.quantize_output, config, layer) - return result - - layer.module.forward = new_forward - -class QuantType: - """ - Enum class for quantization type. - """ - QUANT_INPUT = 0 - QUANT_WEIGHT = 1 - QUANT_OUTPUT = 2 - -class QuantGrad(torch.autograd.Function): - """ - Base class for overriding backward function of quantization operation. - """ - @staticmethod - def quant_backward(tensor, grad_output, quant_type): - """ - This method should be overrided by subclass to provide customized backward function, - default implementation is Straight-Through Estimator - Parameters - ---------- - tensor : Tensor - input of quantization operation - grad_output : Tensor - gradient of the output of quantization operation - quant_type : QuantType - the type of quantization, it can be `QuantType.QUANT_INPUT`, `QuantType.QUANT_WEIGHT`, `QuantType.QUANT_OUTPUT`, - you can define different behavior for different types. - Returns - ------- - tensor - gradient of the input of quantization operation - """ - return grad_output - - @staticmethod - def forward(ctx, tensor, quant_type, quant_func, config, layer): - ctx.save_for_backward(tensor, torch.Tensor([quant_type])) - return quant_func(tensor, config, op=layer.module, op_type=layer.type, op_name=layer.name) - - @classmethod - def backward(cls, ctx, grad_output): - tensor, quant_type = ctx.saved_variables - output = cls.quant_backward(tensor, grad_output, quant_type) - return output, None, None, None, None - -def _check_weight(module): - try: - return isinstance(module.weight.data, torch.Tensor) - except AttributeError: - return False - \ No newline at end of file diff --git a/src/sdk/pynni/nni/compression/torch/compressor.py b/src/sdk/pynni/nni/compression/torch/compressor.py index d8ae199d43..cdb1da3477 100644 --- a/src/sdk/pynni/nni/compression/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/torch/compressor.py @@ -194,7 +194,9 @@ def _instrument_layer(self, layer, config): layer._forward = layer.module.forward def new_forward(*inputs): - mask = self.calc_mask(layer, config) + '''mask = self.calc_mask(layer, config) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + mask['weight'].to(device) # apply mask to weight old_weight = layer.module.weight.data mask_weight = mask['weight'] @@ -203,7 +205,8 @@ def new_forward(*inputs): if mask.__contains__('bias') and hasattr(layer.module, 'bias') and layer.module.bias is not None: old_bias = layer.module.bias.data mask_bias = mask['bias'] - layer.module.bias.data = old_bias.mul(mask_bias) + mask_bias.to(device) + layer.module.bias.data = old_bias.mul(mask_bias)''' # calculate forward ret = layer._forward(*inputs) return ret @@ -250,6 +253,7 @@ def export_model(self, model_path, 
mask_path=None, onnx_path=None, input_shape=N assert input_shape is not None, 'input_shape must be specified to export onnx model' # input info needed input_data = torch.Tensor(*input_shape) + input_data = input_data.to('cuda') torch.onnx.export(self.bound_model, input_data, onnx_path) _logger.info('Model in onnx with input shape %s saved to %s', input_data.shape, onnx_path) From e401f2b8cae4509654a14c5b056ec565ff7ca60b Mon Sep 17 00:00:00 2001 From: quzha Date: Wed, 15 Jan 2020 15:29:19 +0800 Subject: [PATCH 03/33] update --- .../compression/speedup/torch/compressor.py | 318 +++++------------- 1 file changed, 93 insertions(+), 225 deletions(-) diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py index 29e66c0920..3de1decfa2 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py @@ -2,22 +2,13 @@ # Licensed under the MIT license. import logging -import onnx +import re import torch from . import default_layers _logger = logging.getLogger(__name__) -class LayerInfo: - def __init__(self, name, module): - self.module = module - self.name = name - self.type = type(module).__name__ - - self._forward = None - - class ModelSpeedup: """ Abstract base PyTorch ModelSpeedup @@ -40,221 +31,98 @@ def __init__(self, model, dummy_input, masks_file): """ self.bound_model = model self.masks = torch.load(masks_file) - self.model_graph = torch.jit.trace(model, dummy_input) - - def expand_masks(self): - """ - """ - for name, mask in self.masks: - print(name) - - if self.modules_to_compress is None: - self.modules_to_compress = [] - for name, module in self.bound_model.named_modules(): - layer = LayerInfo(name, module) - config = self.select_config(layer) - if config is not None: - self.modules_to_compress.append((layer, config)) - return self.modules_to_compress - - def compress_modules(self): - """ - Compress the model with algorithm implemented by subclass. - - The model will be instrumented and user should never edit it after calling this method. - `self.modules_to_compress` records all the to-be-compressed layers - """ - modules_to_compress = self.detect_modules_to_compress() - for layer, config in modules_to_compress: - self._instrument_layer(layer, config) - return self.bound_model - - def get_modules_to_compress(self): - """ - To obtain all the to-be-compressed layers. - - Returns - ------- - list - a list of the layers, each of which is a tuple (`layer`, `config`), - `layer` is `LayerInfo`, `config` is a `dict` - """ - return self.modules_to_compress - - def select_config(self, layer): - """ - Find the configuration for `layer` by parsing `self.config_list` - - Parameters - ---------- - layer : LayerInfo - one layer - - Returns - ------- - config or None - the retrieved configuration for this layer, if None, this layer should - not be compressed - """ - ret = None - for config in self.config_list: - config = config.copy() - config['op_types'] = self._expand_config_op_types(config) - if layer.type not in config['op_types']: - continue - if config.get('op_names') and layer.name not in config['op_names']: - continue - ret = config - if ret is None or ret.get('exclude'): - return None - return ret - - def update_epoch(self, epoch): - """ - If user want to update model every epoch, user can override this method. 
- This method should be called at the beginning of each epoch - - Parameters - ---------- - epoch : num - the current epoch number - """ - - def step(self): - """ - If user want to update model every step, user can override this method - """ - - def _instrument_layer(self, layer, config): - """ - This method is implemented in the subclasses, i.e., `Pruner` and `Quantizer` - - Parameters - ---------- - layer : LayerInfo - the layer to instrument the compression operation - config : dict - the configuration for compressing this layer - """ - raise NotImplementedError() - - def _expand_config_op_types(self, config): - if config is None: - return [] - expanded_op_types = [] - for op_type in config.get('op_types', []): - if op_type == 'default': - expanded_op_types.extend(default_layers.weighted_modules) + self.trace_graph = torch.jit.trace(model, dummy_input) + self.output_to_node, self.input_to_node, self.module_inputs, self.module_outputs = self._build_graph() + + def _build_graph(self): + """ + """ + graph = self.trace_graph.graph + # build output mapping, from output debugName to its node + output_to_node = dict() + # build input mapping, from input debugName to its node + input_to_node = dict() + #build module mapping, from module name to all nodes (as list) under this module scope + module_to_nodes = dict() + for node in graph.nodes(): + for output in node.outputs(): + output_name = output.debugName() + output_to_node[output_name] = node + for _input in node.inputs(): + input_name = _input.debugName() + input_to_node[input_name] = node + scope_name = node.scopeName() # example: scope_name, 'MyCell/Linear[linear]' + module_name_slices = re.findall(r'\[(.*?)\]', scope_name) + module_name = '.'.join(module_name_slices) + if module_name in module_to_nodes: + module_to_nodes[module_name].append(node) else: - expanded_op_types.append(op_type) - return expanded_op_types - - -class Pruner(Compressor): - """ - Prune to an exact pruning level specification - - Attributes - ---------- - mask_dict : dict - Dictionary for saving masks, `key` should be layer name and - `value` should be a tensor which has the same shape with layer's weight - - """ - - def __init__(self, model, config_list): - super().__init__(model, config_list) - self.mask_dict = {} - - def calc_mask(self, layer, config): + module_to_nodes[module_name] = [node] + # for each module, find its inputs and outputs + # build module mapping, from module name to its inputs debugName and outputs debugName, + module_inputs = dict() + module_outputs = dict() + for module_name, nodes in module_to_nodes.items(): + inputs = set() + outputs = set() + for node in nodes: + for output in node.outputs(): + outputs.add(output.debugName()) + for _input in node.inputs(): + inputs.add(_input.debugName()) + m_inputs = list() + m_outputs = list() + for output in outputs: + # TODO: one input could be the input of multiple nodes + if not input_to_node[output] in nodes: + m_outputs.append(output) + for _input in inputs: + if not output_to_node[_input] in nodes: + m_inputs.append(_input) + module_inputs[module_name] = m_inputs + module_outputs[module_name] = m_outputs + return output_to_node, input_to_node, module_inputs, module_outputs + + def _do_module_replace(self, module_name, mask=None, in_shape=None, out_shape=None): + """ + """ + changed_in_shape = changed_out_shape = None + assert module_name in self.module_inputs, "module does not exist in trace graph" + if mask is not None: + assert in_shape is None and out_shape is None + # fine-grained tensor sparse 
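+            # a minimal sketch, assuming a `leaf_module` handle to the wrapped
+            # module: a fine-grained mask keeps every tensor shape unchanged,
+            # so it could simply be folded into the parameters in place, e.g.
+            #     leaf_module.weight.data.mul_(mask['weight'])
+            # leaving changed_in_shape and changed_out_shape as None.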
+ #... + # coarse-grained shape sparse + #... + if in_shape is not None: + #... + if out_shape is not None: + #... + return changed_in_shape, changed_out_shape + + def _find_predecessors(self): + """ + """ + + def _find_successors(self): + """ + """ + + def replace_module(self, module_name, mask=None, in_shape=None, out_shape=None): + """ + """ + changed_in_shape, changed_out_shape = self._do_module_replace(module_name, mask, in_shape, out_shape) + if changed_in_shape: + predecessors = self._find_predecessors() + for module_name in predecessors: + self.replace_module(module_name, out_shape=changed_in_shape) + if changed_out_shape: + successors = self._find_successors() + for module_name in successors: + self.replace_module(module_name, in_shape=changed_out_shape) + + def speedup_model(self): """ - Pruners should overload this method to provide mask for weight tensors. - The mask must have the same shape and type comparing to the weight. - It will be applied with `mul()` operation on the weight. - This method is effectively hooked to `forward()` method of the model. - - Parameters - ---------- - layer : LayerInfo - calculate mask for `layer`'s weight - config : dict - the configuration for generating the mask """ - raise NotImplementedError("Pruners must overload calc_mask()") - - def _instrument_layer(self, layer, config): - """ - Create a wrapper forward function to replace the original one. - - Parameters - ---------- - layer : LayerInfo - the layer to instrument the mask - config : dict - the configuration for generating the mask - """ - assert layer._forward is None, 'Each model can only be compressed once' - if not _check_weight(layer.module): - _logger.warning('Module %s does not have parameter "weight"', layer.name) - return - layer._forward = layer.module.forward - - def new_forward(*inputs): - mask = self.calc_mask(layer, config) - # apply mask to weight - old_weight = layer.module.weight.data - mask_weight = mask['weight'] - layer.module.weight.data = old_weight.mul(mask_weight) - # apply mask to bias - if mask.__contains__('bias') and hasattr(layer.module, 'bias') and layer.module.bias is not None: - old_bias = layer.module.bias.data - mask_bias = mask['bias'] - layer.module.bias.data = old_bias.mul(mask_bias) - # calculate forward - ret = layer._forward(*inputs) - return ret - - layer.module.forward = new_forward - - def export_model(self, model_path, mask_path=None, onnx_path=None, input_shape=None): - """ - Export pruned model weights, masks and onnx model(optional) - - Parameters - ---------- - model_path : str - path to save pruned model state_dict - mask_path : str - (optional) path to save mask dict - onnx_path : str - (optional) path to save onnx model - input_shape : list or tuple - input shape to onnx model - """ - if self.detect_modules_to_compress() and not self.mask_dict: - _logger.warning('You may not use self.mask_dict in base Pruner class to record masks') - assert model_path is not None, 'model_path must be specified' - for name, m in self.bound_model.named_modules(): - if name == "": - continue - masks = self.mask_dict.get(name) - if masks is not None: - mask_sum = masks['weight'].sum().item() - mask_num = masks['weight'].numel() - _logger.info('Layer: %s Sparsity: %.2f', name, 1 - mask_sum / mask_num) - m.weight.data = m.weight.data.mul(masks['weight']) - if masks.__contains__('bias') and hasattr(m, 'bias') and m.bias is not None: - m.bias.data = m.bias.data.mul(masks['bias']) - else: - _logger.info('Layer: %s NOT compressed', name) - 
torch.save(self.bound_model.state_dict(), model_path) - _logger.info('Model state_dict saved to %s', model_path) - if mask_path is not None: - torch.save(self.mask_dict, mask_path) - _logger.info('Mask dict saved to %s', mask_path) - if onnx_path is not None: - assert input_shape is not None, 'input_shape must be specified to export onnx model' - # input info needed - input_data = torch.Tensor(*input_shape) - torch.onnx.export(self.bound_model, input_data, onnx_path) - _logger.info('Model in onnx with input shape %s saved to %s', input_data.shape, onnx_path) + for name, mask in self.masks: + self.replace_module(name, mask=mask) From 7ba30c303db5f546c898b75c6e276b9e1603023d Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 20 Jan 2020 19:45:43 +0800 Subject: [PATCH 04/33] update --- .../nni/compression/speedup/torch/__init__.py | 0 .../speedup/torch/compress_modules.py | 58 ++++++++++ .../compression/speedup/torch/compressor.py | 101 +++++++++++++----- .../compression/speedup/torch/infer_shape.py | 86 +++++++++++++++ 4 files changed, 218 insertions(+), 27 deletions(-) create mode 100644 src/sdk/pynni/nni/compression/speedup/torch/__init__.py create mode 100644 src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py create mode 100644 src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py diff --git a/src/sdk/pynni/nni/compression/speedup/torch/__init__.py b/src/sdk/pynni/nni/compression/speedup/torch/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py b/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py new file mode 100644 index 0000000000..59262d0230 --- /dev/null +++ b/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py @@ -0,0 +1,58 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
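+# Helpers that rebuild torch.nn modules (BatchNorm2d, Conv2d) with shapes
+# reduced according to the coarse masks produced by pruning.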
+ +import torch +from .infer_shape import CoarseMask + +compress_modules = { + 'BatchNorm2d': lambda module, mask: compress_batchnorm2d(module, mask), + 'Conv2d': lambda module, mask: compress_conv2d(module, mask) +} + +cms_output = { + 'BatchNorm2d': lambda module, output_cmask: compress_batchnorm2d_output(module, output_cmask), + 'Conv2d': lambda module, output_cmask: compress_conv2d_output(module, output_cmask) +} + + +def compress_batchnorm2d_output(module, output_cmask): + """ + """ + +def compress_batchnorm2d(norm, mask): + """ + """ + assert 'weight' in mask and 'bias' in mask + sum_mask = mask['weight'] + mask['bias'] + nonzero_index = torch.nonzero(sum_mask, as_tuple=True)[0] + new_norm = torch.nn.BatchNorm2d(num_features=nonzero_index.size()[0], + eps=norm.eps, + momentum=norm.momentum, + affine=norm.affine, + track_running_stats=norm.track_running_stats) + # assign weights + new_norm.weight.data = torch.index_select(norm.weight.data, 0, nonzero_index) + new_norm.bias.data = torch.index_select(norm.bias.data, 0, nonzero_index) + if norm.track_running_stats: + new_norm.running_mean.data = torch.index_select(norm.running_mean.data, 0, nonzero_index) + new_norm.running_var.data = torch.index_select(norm.running_var.data, 0, nonzero_index) + # infer shape of input tensor + input_cmask = CoarseMask(num_dim=4) + input_cmask.add_index_mask(dim=1, + index=torch.nonzero(mask['weight'], as_tuple=True)[0]) + # infer shape of output tensor + output_cmask = CoarseMask(num_dim=4) + output_cmask.add_index_mask(dim=1, index=nonzero_index) + return new_norm, input_cmask, output_cmask + +def compress_conv2d_output(module, output_cmask): + """ + """ + +def compress_conv2d(conv, mask): + """ + """ + # fine-grained tensor sparse + #... + # coarse-grained shape sparse + #... diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py index 3de1decfa2..74b17d1f8b 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py @@ -4,11 +4,19 @@ import logging import re import torch -from . 
import default_layers +from .compress_modules import compress_modules as cms +from .infer_shape import ModuleMasks, infer_from_mask, infer_from_inshape, infer_from_outshape _logger = logging.getLogger(__name__) +def get_module_by_name(model, module_name): + name_list = module_name.split(".") + for name in name_list[:-1]: + model = getattr(model, name) + leaf_module = getattr(model, name_list[-1]) + return model, leaf_module + class ModelSpeedup: """ Abstract base PyTorch ModelSpeedup @@ -32,7 +40,10 @@ def __init__(self, model, dummy_input, masks_file): self.bound_model = model self.masks = torch.load(masks_file) self.trace_graph = torch.jit.trace(model, dummy_input) - self.output_to_node, self.input_to_node, self.module_inputs, self.module_outputs = self._build_graph() + self.output_to_node, self.input_to_node, self.module_to_inputs, self.module_to_outputs, self.module_to_type = self._build_graph() + + #self.replaced_modules = dict() + self.inferred_masks = dict() # key: module_name, value: ModuleMasks def _build_graph(self): """ @@ -54,14 +65,17 @@ def _build_graph(self): scope_name = node.scopeName() # example: scope_name, 'MyCell/Linear[linear]' module_name_slices = re.findall(r'\[(.*?)\]', scope_name) module_name = '.'.join(module_name_slices) + # TODO: check module_name is not empty if module_name in module_to_nodes: module_to_nodes[module_name].append(node) else: module_to_nodes[module_name] = [node] # for each module, find its inputs and outputs # build module mapping, from module name to its inputs debugName and outputs debugName, - module_inputs = dict() - module_outputs = dict() + module_to_inputs = dict() + module_to_outputs = dict() + # TODO: fullfill modules_type + module_to_type = dict() for module_name, nodes in module_to_nodes.items(): inputs = set() outputs = set() @@ -79,50 +93,83 @@ def _build_graph(self): for _input in inputs: if not output_to_node[_input] in nodes: m_inputs.append(_input) - module_inputs[module_name] = m_inputs - module_outputs[module_name] = m_outputs - return output_to_node, input_to_node, module_inputs, module_outputs + module_to_inputs[module_name] = m_inputs + module_to_outputs[module_name] = m_outputs + return output_to_node, input_to_node, module_to_inputs, module_to_outputs, module_to_type def _do_module_replace(self, module_name, mask=None, in_shape=None, out_shape=None): """ """ - changed_in_shape = changed_out_shape = None + assert not module_name in self.replaced_modules + input_cmask = output_cmask = None assert module_name in self.module_inputs, "module does not exist in trace graph" if mask is not None: assert in_shape is None and out_shape is None - # fine-grained tensor sparse - #... - # coarse-grained shape sparse - #... + super_module, leaf_module = get_module_by_name(self.bound_model, module_name) + m_type = self.module_to_type[module_name] + compressed_module, input_cmask, output_cmask = cms[m_type](leaf_module, mask) + setattr(super_module, module_name, compressed_module) + if in_shape is not None: - #... + assert not module_name in self.masks + super_module, leaf_module = get_module_by_name(self.bound_model, module_name) + m_type = self.module_to_type[module_name] + compressed_module, input_cmask, output_cmask = cms_input[m_type](leaf_module, in_shape) + if out_shape is not None: + assert not module_name in self.masks #... 
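+            # a handling symmetric to the in_shape branch above could dispatch
+            # through the cms_output table defined in compress_modules.py, e.g.
+            #     compressed_module, input_cmask, output_cmask = \
+            #         cms_output[m_type](leaf_module, out_shape)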
- return changed_in_shape, changed_out_shape + return input_cmask, output_cmask - def _find_predecessors(self): + def _find_predecessors(self, module_name): """ """ + predecessors = [] + for _input in self.module_to_inputs[module_name]: + assert _input in self.input_to_node + node = self.input_to_node[_input] + scope_name = node.scopeName() # example: scope_name, 'MyCell/Linear[linear]' + module_name_slices = re.findall(r'\[(.*?)\]', scope_name) + module_name = '.'.join(module_name_slices) + if module_name == '': + raise RuntimeError("_find_predecessors: cannot handle non-module node!") + else: + predecessors.append(module_name) + return predecessors - def _find_successors(self): + def _find_successors(self, module_name): """ """ - def replace_module(self, module_name, mask=None, in_shape=None, out_shape=None): + def infer_module_mask(self, module_name, mask=None, in_shape=None, out_shape=None): """ """ - changed_in_shape, changed_out_shape = self._do_module_replace(module_name, mask, in_shape, out_shape) - if changed_in_shape: - predecessors = self._find_predecessors() + input_cmask = output_cmask = None + if module_name in self.inferred_masks: + module_masks = self.inferred_masks[module_name] + else: + module_masks = ModuleMasks(module_name) + self.inferred_masks[module_name] = module_masks + + m_type = self.module_to_type[module_name] + if mask is not None: + input_cmask, output_cmask = infer_from_mask[m_type](module_masks, mask) + if in_shape is not None: + infer_from_inshape[m_type](module_masks, in_shape) + if out_shape is not None: + infer_from_outshape[m_type](module_masks, out_shape) + + if input_cmask: + predecessors = self._find_predecessors(module_name) for module_name in predecessors: - self.replace_module(module_name, out_shape=changed_in_shape) - if changed_out_shape: - successors = self._find_successors() + self.infer_module_mask(module_name, out_shape=input_cmask) + if output_cmask: + successors = self._find_successors(module_name) for module_name in successors: - self.replace_module(module_name, in_shape=changed_out_shape) + self.infer_module_mask(module_name, in_shape=output_cmask) - def speedup_model(self): + def infer_modules_masks(self): """ """ - for name, mask in self.masks: - self.replace_module(name, mask=mask) + for module_name, mask in self.masks: + self.infer_module_mask(module_name, mask=mask) diff --git a/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py b/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py new file mode 100644 index 0000000000..b753a25f3d --- /dev/null +++ b/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py @@ -0,0 +1,86 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. +""" +For each operation or module, there are two functions. 
+One is given output shape, infer its input shape and initialization parameters (e.g., weight's shape) +The other is given input shape, infer its output shape and initialization parameters (e.g., weight's shape) +""" + +import torch + +class CoarseMask: + def __init__(self, num_dim): + self.mask_index = [None for _ in range(num_dim)] + + def add_index_mask(self, dim, index): + self.mask_index[dim] = index + +class ModuleMasks: + def __init__(self, module_name): + """ + """ + self.module_name = module_name + self.param_masks = dict() + self.input_mask = None + self.output_mask = None + + def set_param_masks(self, name, mask): + self.param_masks[name] = mask + + def set_input_mask(self, mask): + self.input_mask = mask + + def set_output_mask(self, mask): + self.output_mask = mask + + +infer_from_mask = { + 'BatchNorm2d': lambda module_masks, mask: batchnorm2d_mask(module_masks, mask), + 'Conv2d': lambda module_masks, mask: conv2d_mask(module_masks, mask) +} + +infer_from_inshape = {} + +infer_from_outshape = { + 'Conv2d': lambda module_masks, mask: conv2d_outshape(module_masks, mask) +} + +def batchnorm2d_mask(module_masks, mask): + """ + """ + assert 'weight' in mask and 'bias' in mask + sum_mask = mask['weight'] + mask['bias'] + nonzero_index = torch.nonzero(sum_mask, as_tuple=True)[0] + # infer shape of parameters + param_cmask = CoarseMask(num_dim=0) + param_cmask.add_index_mask(dim=0, index=nonzero_index) + module_masks.set_param_masks('weight', param_cmask) + module_masks.set_param_masks('bias', param_cmask) + # infer shape of input tensor + input_cmask = CoarseMask(num_dim=4) + input_cmask.add_index_mask(dim=1, + index=torch.nonzero(mask['weight'], as_tuple=True)[0]) + module_masks.set_input_mask(input_cmask) + # infer shape of output tensor + output_cmask = CoarseMask(num_dim=4) + output_cmask.add_index_mask(dim=1, index=nonzero_index) + module_masks.set_output_mask(output_cmask) + return input_cmask, output_cmask + +def conv2d_mask(module_masks, mask): + """ + """ + +def conv2d_outshape(module_masks, mask): + """ + """ + assert isinstance(mask, CoarseMask) + assert mask.mask_index[1] is not None + assert mask.mask_index[0] is None + assert mask.mask_index[2] is None + assert mask.mask_index[3] is None + if module_masks.output_mask is not None: + # ... + return + #... 
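+    # a plausible completion: merge `mask` into module_masks.output_mask, then
+    # derive parameter masks from the retained output channels, e.g.
+    #     weight_cmask = CoarseMask(num_dim=4)
+    #     weight_cmask.add_index_mask(dim=0, index=mask.mask_index[1])
+    # and return None, since the convolution's input shape is unaffected.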
+ \ No newline at end of file From dc865fe422013173939dce5aef2c9549c944a3ce Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 21 Jan 2020 16:00:17 +0800 Subject: [PATCH 05/33] update --- .../nni/compression/speedup/torch/__init__.py | 1 + .../compression/speedup/torch/compressor.py | 72 +++++++++++++++---- .../compression/speedup/torch/infer_shape.py | 50 +++++++++++-- .../pynni/nni/compression/torch/compressor.py | 4 +- 4 files changed, 106 insertions(+), 21 deletions(-) diff --git a/src/sdk/pynni/nni/compression/speedup/torch/__init__.py b/src/sdk/pynni/nni/compression/speedup/torch/__init__.py index e69de29bb2..cef8ebd76c 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/__init__.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/__init__.py @@ -0,0 +1 @@ +from .compressor import ModelSpeedup \ No newline at end of file diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py index 74b17d1f8b..a3d48967ed 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py @@ -49,12 +49,23 @@ def _build_graph(self): """ """ graph = self.trace_graph.graph + print(graph) # build output mapping, from output debugName to its node output_to_node = dict() # build input mapping, from input debugName to its node input_to_node = dict() #build module mapping, from module name to all nodes (as list) under this module scope module_to_nodes = dict() + # module name to its type + module_to_type = dict() + + graph_inputs = list() + graph_outputs = list() + for _input in graph.inputs(): + graph_inputs.append(_input.debugName()) + for output in graph.outputs(): + graph_outputs.append(output.debugName()) + for node in graph.nodes(): for output in node.outputs(): output_name = output.debugName() @@ -65,7 +76,12 @@ def _build_graph(self): scope_name = node.scopeName() # example: scope_name, 'MyCell/Linear[linear]' module_name_slices = re.findall(r'\[(.*?)\]', scope_name) module_name = '.'.join(module_name_slices) - # TODO: check module_name is not empty + # if module_name is empty, it is not a module + if module_name == '': + continue + scope_slice = scope_name.split('/')[-1] + module_type = scope_slice.split('[')[0] + module_to_type[module_name] = module_type if module_name in module_to_nodes: module_to_nodes[module_name].append(node) else: @@ -74,8 +90,6 @@ def _build_graph(self): # build module mapping, from module name to its inputs debugName and outputs debugName, module_to_inputs = dict() module_to_outputs = dict() - # TODO: fullfill modules_type - module_to_type = dict() for module_name, nodes in module_to_nodes.items(): inputs = set() outputs = set() @@ -88,10 +102,14 @@ def _build_graph(self): m_outputs = list() for output in outputs: # TODO: one input could be the input of multiple nodes - if not input_to_node[output] in nodes: + if not output in input_to_node and output in graph_outputs: + m_outputs.append(output) + elif not input_to_node[output] in nodes: m_outputs.append(output) for _input in inputs: - if not output_to_node[_input] in nodes: + if not _input in output_to_node and _input in graph_inputs: + m_inputs.append(_input) + elif not output_to_node[_input] in nodes: m_inputs.append(_input) module_to_inputs[module_name] = m_inputs module_to_outputs[module_name] = m_outputs @@ -126,9 +144,13 @@ def _find_predecessors(self, module_name): """ predecessors = [] for _input in self.module_to_inputs[module_name]: - assert _input in self.input_to_node - 
node = self.input_to_node[_input] + assert _input in self.output_to_node + node = self.output_to_node[_input] + #print("node: ", node) scope_name = node.scopeName() # example: scope_name, 'MyCell/Linear[linear]' + #print("scope name: ", scope_name) + if scope_name == '': + continue module_name_slices = re.findall(r'\[(.*?)\]', scope_name) module_name = '.'.join(module_name_slices) if module_name == '': @@ -140,6 +162,20 @@ def _find_predecessors(self, module_name): def _find_successors(self, module_name): """ """ + successors = [] + for output in self.module_to_outputs[module_name]: + assert output in self.input_to_node + node = self.input_to_node[output] + scope_name = node.scopeName() + if scope_name == '': + continue + module_name_slices = re.findall(r'\[(.*?)\]', scope_name) + module_name = '.'.join(module_name_slices) + if module_name == '': + raise RuntimeError("_find_successors: cannot handle non-module node!") + else: + successors.append(module_name) + return successors def infer_module_mask(self, module_name, mask=None, in_shape=None, out_shape=None): """ @@ -152,24 +188,32 @@ def infer_module_mask(self, module_name, mask=None, in_shape=None, out_shape=Non self.inferred_masks[module_name] = module_masks m_type = self.module_to_type[module_name] + print("infer_module_mask: {}, module type: {}".format(module_name, m_type)) if mask is not None: + print("mask is not None") input_cmask, output_cmask = infer_from_mask[m_type](module_masks, mask) if in_shape is not None: - infer_from_inshape[m_type](module_masks, in_shape) + print("in_shape is not None") + output_cmask = infer_from_inshape[m_type](module_masks, in_shape) if out_shape is not None: - infer_from_outshape[m_type](module_masks, out_shape) + print("out_shape is not None") + input_cmask = infer_from_outshape[m_type](module_masks, out_shape) if input_cmask: + print("input_cmask is not None") predecessors = self._find_predecessors(module_name) - for module_name in predecessors: - self.infer_module_mask(module_name, out_shape=input_cmask) + for _module_name in predecessors: + print("input_cmask, module_name: ", _module_name) + self.infer_module_mask(_module_name, out_shape=input_cmask) if output_cmask: + print("output_cmask is not None") successors = self._find_successors(module_name) - for module_name in successors: - self.infer_module_mask(module_name, in_shape=output_cmask) + for _module_name in successors: + print("output_cmask, module_name: ", _module_name) + self.infer_module_mask(_module_name, in_shape=output_cmask) def infer_modules_masks(self): """ """ - for module_name, mask in self.masks: + for module_name, mask in self.masks.items(): self.infer_module_mask(module_name, mask=mask) diff --git a/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py b/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py index b753a25f3d..7ba6936515 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py @@ -15,6 +15,26 @@ def __init__(self, num_dim): def add_index_mask(self, dim, index): self.mask_index[dim] = index + @staticmethod + def merge_index(index_a, index_b): + s = set() + for num in index_a: + s.add(num) + for num in index_b: + s.add(num) + return torch.tensor(sorted(s)) + + def merge(self, cmask): + assert isinstance(cmask, CoarseMask) + assert len(self.mask_index) == len(cmask.mask_index) + for i, index in enumerate(self.mask_index): + if index is None: + self.mask_index[i] = cmask.mask_index[i] + elif cmask.mask_index[i] is not None: + 
self.mask_index[i] = CoarseMask.merge_index(self.mask_index[i], + cmask.mask_index[i]) + return self.mask_index + class ModuleMasks: def __init__(self, module_name): """ @@ -39,12 +59,20 @@ def set_output_mask(self, mask): 'Conv2d': lambda module_masks, mask: conv2d_mask(module_masks, mask) } -infer_from_inshape = {} +infer_from_inshape = { + 'ReLU': lambda module_masks, mask: relu_inshape(module_masks, mask) +} infer_from_outshape = { 'Conv2d': lambda module_masks, mask: conv2d_outshape(module_masks, mask) } +def relu_inshape(module_masks, mask): + """ + """ + module_masks + return None # return shape of output tensor + def batchnorm2d_mask(module_masks, mask): """ """ @@ -52,7 +80,7 @@ def batchnorm2d_mask(module_masks, mask): sum_mask = mask['weight'] + mask['bias'] nonzero_index = torch.nonzero(sum_mask, as_tuple=True)[0] # infer shape of parameters - param_cmask = CoarseMask(num_dim=0) + param_cmask = CoarseMask(num_dim=1) param_cmask.add_index_mask(dim=0, index=nonzero_index) module_masks.set_param_masks('weight', param_cmask) module_masks.set_param_masks('bias', param_cmask) @@ -79,8 +107,20 @@ def conv2d_outshape(module_masks, mask): assert mask.mask_index[0] is None assert mask.mask_index[2] is None assert mask.mask_index[3] is None + if module_masks.output_mask is not None: - # ... - return - #... + assert isinstance(module_masks.output_mask, CoarseMask) + # set shape of output + mask = module_masks.output_mask.merge(mask) + else: + module_masks.output_mask = mask + # infer shape of parameters + weight_cmask = CoarseMask(num_dim=4) + weight_cmask.add_index_mask(dim=0, index=mask.mask_index[1]) + bias_cmask = CoarseMask(num_dim=1) + bias_cmask.add_index_mask(dim=0, index=mask.mask_index[1]) + module_masks.set_param_masks('weight', weight_cmask) + module_masks.set_param_masks('bias', bias_cmask) + # input shape is not changed + return None # return shape of input tensor \ No newline at end of file diff --git a/src/sdk/pynni/nni/compression/torch/compressor.py b/src/sdk/pynni/nni/compression/torch/compressor.py index cdb1da3477..74bc3cb8a3 100644 --- a/src/sdk/pynni/nni/compression/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/torch/compressor.py @@ -194,7 +194,7 @@ def _instrument_layer(self, layer, config): layer._forward = layer.module.forward def new_forward(*inputs): - '''mask = self.calc_mask(layer, config) + mask = self.calc_mask(layer, config) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") mask['weight'].to(device) # apply mask to weight @@ -206,7 +206,7 @@ def new_forward(*inputs): old_bias = layer.module.bias.data mask_bias = mask['bias'] mask_bias.to(device) - layer.module.bias.data = old_bias.mul(mask_bias)''' + layer.module.bias.data = old_bias.mul(mask_bias) # calculate forward ret = layer._forward(*inputs) return ret From 10c05100419888d09e567a337bc4b27b69f5fc5c Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 21 Jan 2020 19:22:09 +0800 Subject: [PATCH 06/33] update --- .../compression/speedup/torch/infer_shape.py | 33 +++++++++++++++++-- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py b/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py index 7ba6936515..a4a76da491 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py @@ -60,18 +60,37 @@ def set_output_mask(self, mask): } infer_from_inshape = { - 'ReLU': lambda module_masks, mask: relu_inshape(module_masks, mask) + 'ReLU': 
lambda module_masks, mask: relu_inshape(module_masks, mask), + 'Conv2d': lambda module_masks, mask: conv2d_inshape(module_masks, mask), + 'MaxPool2d': lambda module_masks, mask: maxpool2d_inshape(module_masks, mask) } infer_from_outshape = { 'Conv2d': lambda module_masks, mask: conv2d_outshape(module_masks, mask) } +def maxpool2d_inshape(module_masks, mask): + """ + """ + assert isinstance(mask, CoarseMask) + assert mask.mask_index[1] is not None + assert mask.mask_index[0] is None + assert mask.mask_index[2] is None + assert mask.mask_index[3] is None + assert module_masks.input_mask is None + module_masks.set_input_mask(mask) + module_masks.set_output_mask(mask) + return mask + def relu_inshape(module_masks, mask): """ """ - module_masks - return None # return shape of output tensor + assert isinstance(mask, CoarseMask) + # TODO: double check this assert, is it possible that a module is passed twice + assert module_masks.input_mask is None + module_masks.set_input_mask(mask) + module_masks.set_output_mask(mask) + return mask # return shape of output tensor def batchnorm2d_mask(module_masks, mask): """ @@ -99,6 +118,14 @@ def conv2d_mask(module_masks, mask): """ """ +def conv2d_inshape(module_masks, mask): + """ + """ + assert isinstance(mask, CoarseMask) + assert module_masks.input_mask is None + module_masks.set_input_mask(mask) + return None + def conv2d_outshape(module_masks, mask): """ """ From df1dda746b93eff9bc04551f429920cc8828e3c9 Mon Sep 17 00:00:00 2001 From: quzha Date: Wed, 22 Jan 2020 18:47:58 +0800 Subject: [PATCH 07/33] update --- .../speedup/torch/compress_modules.py | 81 ++++--- .../compression/speedup/torch/compressor.py | 226 ++++++++++++++---- .../compression/speedup/torch/infer_shape.py | 43 +++- 3 files changed, 269 insertions(+), 81 deletions(-) diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py b/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py index 59262d0230..09b06670ca 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py @@ -2,57 +2,68 @@ # Licensed under the MIT license. 
import torch
-from .infer_shape import CoarseMask
+from .infer_shape import CoarseMask, ModuleMasks
 
-compress_modules = {
-    'BatchNorm2d': lambda module, mask: compress_batchnorm2d(module, mask),
-    'Conv2d': lambda module, mask: compress_conv2d(module, mask)
+replace_module = {
+    'BatchNorm2d': lambda module, mask: replace_batchnorm2d(module, mask),
+    'Conv2d': lambda module, mask: replace_conv2d(module, mask)
 }
 
-cms_output = {
-    'BatchNorm2d': lambda module, output_cmask: compress_batchnorm2d_output(module, output_cmask),
-    'Conv2d': lambda module, output_cmask: compress_conv2d_output(module, output_cmask)
-}
-
-
-def compress_batchnorm2d_output(module, output_cmask):
-    """
-    """
-def compress_batchnorm2d(norm, mask):
+def replace_batchnorm2d(norm, mask):
     """
+    Build a narrower BatchNorm2d that keeps only the features selected by
+    the inferred weight mask, copying the surviving affine parameters and
+    running statistics from the original module.
     """
-    assert 'weight' in mask and 'bias' in mask
-    sum_mask = mask['weight'] + mask['bias']
-    nonzero_index = torch.nonzero(sum_mask, as_tuple=True)[0]
-    new_norm = torch.nn.BatchNorm2d(num_features=nonzero_index.size()[0],
+    assert isinstance(mask, ModuleMasks)
+    assert 'weight' in mask.param_masks and 'bias' in mask.param_masks
+    index = mask.param_masks['weight'].mask_index[0]
+    num_features = index.size()[0]
+    new_norm = torch.nn.BatchNorm2d(num_features=num_features,
                                     eps=norm.eps,
                                     momentum=norm.momentum,
                                     affine=norm.affine,
                                     track_running_stats=norm.track_running_stats)
     # assign weights
-    new_norm.weight.data = torch.index_select(norm.weight.data, 0, nonzero_index)
-    new_norm.bias.data = torch.index_select(norm.bias.data, 0, nonzero_index)
+    new_norm.weight.data = torch.index_select(norm.weight.data, 0, index)
+    new_norm.bias.data = torch.index_select(norm.bias.data, 0, index)
     if norm.track_running_stats:
-        new_norm.running_mean.data = torch.index_select(norm.running_mean.data, 0, nonzero_index)
-        new_norm.running_var.data = torch.index_select(norm.running_var.data, 0, nonzero_index)
-    # infer shape of input tensor
-    input_cmask = CoarseMask(num_dim=4)
-    input_cmask.add_index_mask(dim=1,
-                               index=torch.nonzero(mask['weight'], as_tuple=True)[0])
-    # infer shape of output tensor
-    output_cmask = CoarseMask(num_dim=4)
-    output_cmask.add_index_mask(dim=1, index=nonzero_index)
-    return new_norm, input_cmask, output_cmask
-
-def compress_conv2d_output(module, output_cmask):
-    """
-    """
+        new_norm.running_mean.data = torch.index_select(norm.running_mean.data, 0, index)
+        new_norm.running_var.data = torch.index_select(norm.running_var.data, 0, index)
+    return new_norm
 
-def compress_conv2d(conv, mask):
+def replace_conv2d(conv, mask):
     """
+    Build a new Conv2d whose in/out channel counts follow the inferred
+    input/output masks, copying the surviving filters over.
     """
     # fine-grained tensor sparse
     #...
     # coarse-grained shape sparse
     #...
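    # A sketch of the shapes involved below (illustrative numbers, not part of
    # this patch): for conv = Conv2d(8, 16, 3), conv.weight has shape
    # (16, 8, 3, 3). An output mask keeping channels [1, 3] selects filters
    # along dim 0,
    #     torch.index_select(conv.weight.data, 0, torch.tensor([1, 3]))  # (2, 8, 3, 3)
    # and an input mask keeping channels [0, 2] then selects along dim 1,
    # giving a (2, 2, 3, 3) weight for the rebuilt Conv2d(2, 2, 3).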
+ assert isinstance(mask, ModuleMasks) + if mask.input_mask is None: + in_channels = conv.in_channels + else: + in_channels_index = mask.input_mask.mask_index[1] + in_channels = in_channels_index.size()[0] + if mask.output_mask is None: + out_channels = conv.out_channels + else: + out_channels_index = mask.output_mask.mask_index[1] + out_channels = out_channels_index.size()[0] + new_conv = torch.nn.Conv2d(in_channels=in_channels, + out_channels=out_channels, + kernel_size=conv.kernel_size, + stride=conv.stride, + padding=conv.padding, + dilation=conv.dilation) + tmp_weight_data = tmp_bias_data = None + if mask.output_mask is not None: + tmp_weight_data = torch.index_select(conv.weight.data, 0, out_channels_index) + if conv.bias is not None: + tmp_bias_data = torch.index_select(conv.bias.data, 0, out_channels_index) + # NOTE: does not support group + if mask.input_mask is not None: + tmp_weight_data = torch.index_select(tmp_weight_data, 1, in_channels_index) + if tmp_weight_data is not None: + new_conv.weight.data = tmp_weight_data + if tmp_bias_data is not None: + new_conv.bias.data = tmp_bias_data + return new_conv diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py index a3d48967ed..f935e01c56 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py @@ -2,9 +2,10 @@ # Licensed under the MIT license. import logging +import queue import re import torch -from .compress_modules import compress_modules as cms +from .compress_modules import replace_module from .infer_shape import ModuleMasks, infer_from_mask, infer_from_inshape, infer_from_outshape _logger = logging.getLogger(__name__) @@ -17,6 +18,15 @@ def get_module_by_name(model, module_name): leaf_module = getattr(model, name_list[-1]) return model, leaf_module +class GNode: + def __init__(self, node_name, node_type, op_type, inputs, outputs, nodes): + self.name = node_name # module name if is module, scope name + seq if is func + self.type = node_type # module or func + self.op_type = op_type + self.inputs = inputs + self.outputs = outputs + self.nodes = nodes + class ModelSpeedup: """ Abstract base PyTorch ModelSpeedup @@ -40,10 +50,74 @@ def __init__(self, model, dummy_input, masks_file): self.bound_model = model self.masks = torch.load(masks_file) self.trace_graph = torch.jit.trace(model, dummy_input) - self.output_to_node, self.input_to_node, self.module_to_inputs, self.module_to_outputs, self.module_to_type = self._build_graph() - - #self.replaced_modules = dict() self.inferred_masks = dict() # key: module_name, value: ModuleMasks + self.g_nodes = list() + self.global_count = 0 + self.name_to_gnode, self.input_to_gnode, self.output_to_gnode = self._build_graph() + #self.replaced_modules = dict() + + def _build_index_for_gnodes(self, g_nodes): + """ + """ + name_to_gnode = dict() + input_to_gnode = dict() + output_to_gnode = dict() + for node in g_nodes: + name_to_gnode[node.name] = node + for _input in node.inputs: + if _input in input_to_gnode: + input_to_gnode[_input].append(node) + else: + input_to_gnode[_input] = [node] + for output in node.outputs: + if output in output_to_gnode: + print("output: ", output) + print("gnode: ", output_to_gnode[output].name) + assert not output in output_to_gnode, "One output cannot be generated by multiple nodes" + output_to_gnode[output] = node + return name_to_gnode, input_to_gnode, output_to_gnode + + def _expand_non_prim_node(self, node, 
nodes, input_to_node, output_to_node): + """ + """ + #print('^=' * 30) + #for n in nodes: + # print(n) + #print('v=' * 30) + # TODO: scope name could be empty + node_name = '.'.join([node.scopeName(), node.kind(), str(self.global_count)]) + print('node_name: ', node_name) + self.global_count += 1 + op_type = node.kind() + + node_group = [node] + inputs = list() + outputs = list() + node_queue = queue.Queue() + node_queue.put(node) + while not node_queue.empty(): + curr_node = node_queue.get() + for _input in curr_node.inputs(): + print('_input: ', _input) + input_name = _input.debugName() + if input_name in output_to_node and output_to_node[input_name] in nodes: + predecessor_node = output_to_node[input_name] + print("predecessor_node: ", predecessor_node) + if predecessor_node.kind().startswith('prim::'): + node_group.append(predecessor_node) + node_queue.put(predecessor_node) + else: + inputs.append(input_name) + else: + inputs.append(input_name) + for output in node.outputs(): + outputs.append(output.debugName()) + g_node = GNode(node_name, 'func', op_type, inputs, outputs, node_group) + print('^' * 30) + for n in g_node.nodes: + print(n) + print('v' * 30) + return g_node def _build_graph(self): """ @@ -54,10 +128,12 @@ def _build_graph(self): output_to_node = dict() # build input mapping, from input debugName to its node input_to_node = dict() - #build module mapping, from module name to all nodes (as list) under this module scope + # build module mapping, from module name to all nodes (as list) under this module scope module_to_nodes = dict() # module name to its type module_to_type = dict() + # the mapping of function (non-module in forward) to nodes, key is scope name + func_to_nodes = dict() graph_inputs = list() graph_outputs = list() @@ -66,6 +142,9 @@ def _build_graph(self): for output in graph.outputs(): graph_outputs.append(output.debugName()) + #print("graph_inputs: ", graph_inputs) + #print("graph_outputs: ", graph_outputs) + for node in graph.nodes(): for output in node.outputs(): output_name = output.debugName() @@ -78,18 +157,32 @@ def _build_graph(self): module_name = '.'.join(module_name_slices) # if module_name is empty, it is not a module if module_name == '': - continue - scope_slice = scope_name.split('/')[-1] - module_type = scope_slice.split('[')[0] - module_to_type[module_name] = module_type - if module_name in module_to_nodes: - module_to_nodes[module_name].append(node) + if scope_name == '': + continue + else: + # TODO: there might be more than one funcs in scope_name + if scope_name in func_to_nodes: + func_to_nodes[scope_name].append(node) + else: + func_to_nodes[scope_name] = [node] else: - module_to_nodes[module_name] = [node] + scope_slice = scope_name.split('/')[-1] + module_type = scope_slice.split('[')[0] + module_to_type[module_name] = module_type + if module_name in module_to_nodes: + module_to_nodes[module_name].append(node) + else: + module_to_nodes[module_name] = [node] + + print('xx' * 30) + for k in output_to_node: + print(k) + print('yy' * 30) + # for each module, find its inputs and outputs # build module mapping, from module name to its inputs debugName and outputs debugName, - module_to_inputs = dict() - module_to_outputs = dict() + #module_to_inputs = dict() + #module_to_outputs = dict() for module_name, nodes in module_to_nodes.items(): inputs = set() outputs = set() @@ -111,11 +204,33 @@ def _build_graph(self): m_inputs.append(_input) elif not output_to_node[_input] in nodes: m_inputs.append(_input) - module_to_inputs[module_name] = 
m_inputs - module_to_outputs[module_name] = m_outputs - return output_to_node, input_to_node, module_to_inputs, module_to_outputs, module_to_type + #module_to_inputs[module_name] = m_inputs + #module_to_outputs[module_name] = m_outputs + print("module node_name: ", module_name) + if module_name == '': + for n in nodes: + print(n) + g_node = GNode(module_name, 'module', module_to_type[module_name], m_inputs, m_outputs, nodes) + self.g_nodes.append(g_node) + + # each scope_name may have multiple funcs, we split them and create GNode for each of them + for scope_name, nodes in func_to_nodes.items(): + # extract non prim:: nodes + non_prim_nodes = list() + for node in nodes: + if not node.kind().startswith('prim::'): + non_prim_nodes.append(node) + # for each non prim node, expand it has a GNode + for node in non_prim_nodes: + g_node = self._expand_non_prim_node(node, nodes, input_to_node, output_to_node) + self.g_nodes.append(g_node) - def _do_module_replace(self, module_name, mask=None, in_shape=None, out_shape=None): + # build index for g_nodes + name_to_gnode, input_to_gnode, output_to_gnode = self._build_index_for_gnodes(self.g_nodes) + + return name_to_gnode, input_to_gnode, output_to_gnode #output_to_node, input_to_node + + '''def _do_module_replace(self, module_name, mask=None, in_shape=None, out_shape=None): """ """ assert not module_name in self.replaced_modules @@ -137,44 +252,34 @@ def _do_module_replace(self, module_name, mask=None, in_shape=None, out_shape=No if out_shape is not None: assert not module_name in self.masks #... - return input_cmask, output_cmask + return input_cmask, output_cmask''' def _find_predecessors(self, module_name): """ """ predecessors = [] - for _input in self.module_to_inputs[module_name]: - assert _input in self.output_to_node - node = self.output_to_node[_input] - #print("node: ", node) - scope_name = node.scopeName() # example: scope_name, 'MyCell/Linear[linear]' - #print("scope name: ", scope_name) - if scope_name == '': - continue - module_name_slices = re.findall(r'\[(.*?)\]', scope_name) - module_name = '.'.join(module_name_slices) - if module_name == '': - raise RuntimeError("_find_predecessors: cannot handle non-module node!") + for _input in self.name_to_gnode[module_name].inputs: + if not _input in self.output_to_gnode: + print(_input) + if not _input in self.output_to_gnode: + # TODO: check _input which does not have node + print("output with no gnode: ", _input) else: - predecessors.append(module_name) + g_node = self.output_to_gnode[_input] + predecessors.append(g_node.name) return predecessors def _find_successors(self, module_name): """ """ successors = [] - for output in self.module_to_outputs[module_name]: - assert output in self.input_to_node - node = self.input_to_node[output] - scope_name = node.scopeName() - if scope_name == '': - continue - module_name_slices = re.findall(r'\[(.*?)\]', scope_name) - module_name = '.'.join(module_name_slices) - if module_name == '': - raise RuntimeError("_find_successors: cannot handle non-module node!") - else: - successors.append(module_name) + for output in self.name_to_gnode[module_name].outputs: + if not output in self.input_to_gnode: + print(output) + assert output in self.input_to_gnode + g_nodes = self.input_to_gnode[output] + for g_node in g_nodes: + successors.append(g_node.name) return successors def infer_module_mask(self, module_name, mask=None, in_shape=None, out_shape=None): @@ -187,7 +292,11 @@ def infer_module_mask(self, module_name, mask=None, in_shape=None, out_shape=Non 
module_masks = ModuleMasks(module_name) self.inferred_masks[module_name] = module_masks - m_type = self.module_to_type[module_name] + m_type = self.name_to_gnode[module_name].op_type + if m_type == 'VGG': + print("VGG module name: ", module_name) + for node in self.name_to_gnode[module_name].nodes: + print(node) print("infer_module_mask: {}, module type: {}".format(module_name, m_type)) if mask is not None: print("mask is not None") @@ -217,3 +326,30 @@ def infer_modules_masks(self): """ for module_name, mask in self.masks.items(): self.infer_module_mask(module_name, mask=mask) + + def replace_compressed_modules(self): + """ + """ + print('*' * 30) + for module_name in self.inferred_masks: + #module_masks = self.inferred_masks[module_name] + #print(module_masks.param_masks) + #print(module_masks.input_mask) + #print(module_masks.output_mask) + g_node = self.name_to_gnode[module_name] + print(module_name, g_node.op_type) + if g_node.type == 'module': + super_module, leaf_module = get_module_by_name(self.bound_model, module_name) + m_type = g_node.op_type + compressed_module = replace_module[m_type](leaf_module, self.inferred_masks[module_name]) + setattr(super_module, module_name.split('.')[-1], compressed_module) + elif g_node.type == 'func': + ... + else: + raise RuntimeError("Unsupported GNode type: {}".format(g_node.type)) + + def speedup_model(self): + """ + """ + self.infer_modules_masks() + self.replace_compressed_modules() \ No newline at end of file diff --git a/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py b/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py index a4a76da491..713ee0f7bd 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py @@ -10,6 +10,7 @@ class CoarseMask: def __init__(self, num_dim): + # index existing ones self.mask_index = [None for _ in range(num_dim)] def add_index_mask(self, dim, index): @@ -62,13 +63,53 @@ def set_output_mask(self, mask): infer_from_inshape = { 'ReLU': lambda module_masks, mask: relu_inshape(module_masks, mask), 'Conv2d': lambda module_masks, mask: conv2d_inshape(module_masks, mask), - 'MaxPool2d': lambda module_masks, mask: maxpool2d_inshape(module_masks, mask) + 'MaxPool2d': lambda module_masks, mask: maxpool2d_inshape(module_masks, mask), + 'aten::avg_pool2d': lambda module_masks, mask: maxpool2d_inshape(module_masks, mask), + 'AvgPool2d': lambda module_masks, mask: maxpool2d_inshape(module_masks, mask), + 'aten::size': lambda module_masks, mask: size_inshape(module_masks, mask), + 'aten::view': lambda module_masks, mask: view_inshape(module_masks, mask), + 'Linear': lambda module_masks, mask: linear_inshape(module_masks, mask) } infer_from_outshape = { 'Conv2d': lambda module_masks, mask: conv2d_outshape(module_masks, mask) } +def linear_inshape(module_masks, mask): + """ + """ + assert isinstance(mask, CoarseMask) + assert mask.mask_index[0] is None + assert module_masks.input_mask is None + module_masks.set_input_mask(mask) + return None + +def view_inshape(module_masks, mask): + """ + """ + # TODO: currently hard code view(N, -1) + assert isinstance(mask, CoarseMask) + assert mask.mask_index[1] is not None + assert mask.mask_index[0] is None + assert mask.mask_index[2] is None + assert mask.mask_index[3] is None + assert module_masks.input_mask is None + module_masks.set_input_mask(mask) + output_cmask = CoarseMask(num_dim=2) + # TODO: hard code for this case, %x : Float(64, 512, 1, 1) + index = [] + for loc in 
mask.mask_index[1]: + index.append(loc * 1) + output_cmask.mask_index[1] = torch.tensor(index) + module_masks.set_output_mask(output_cmask) + return output_cmask + + +def size_inshape(module_masks, mask): + """ + """ + return None + def maxpool2d_inshape(module_masks, mask): """ """ From 9680f3effaafbc6265f952594b50cf3b1b3305de Mon Sep 17 00:00:00 2001 From: quzha Date: Fri, 24 Jan 2020 17:41:31 +0800 Subject: [PATCH 08/33] update --- .../speedup/torch/compress_modules.py | 47 ++++++++++++++++++- .../compression/speedup/torch/compressor.py | 11 ++++- 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py b/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py index 09b06670ca..c208975b46 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py @@ -6,9 +6,43 @@ replace_module = { 'BatchNorm2d': lambda module, mask: replace_batchnorm2d(module, mask), - 'Conv2d': lambda module, mask: replace_conv2d(module, mask) + 'Conv2d': lambda module, mask: replace_conv2d(module, mask), + 'MaxPool2d': lambda module, mask: no_replace(module, mask), + 'ReLU': lambda module, mask: no_replace(module, mask), + 'Linear': lambda module, mask: replace_linear(module, mask) } +def no_replace(module, mask): + """ + """ + return module + +def replace_linear(linear, mask): + """ + """ + assert isinstance(mask, ModuleMasks) + assert mask.input_mask is not None + assert mask.output_mask is None + assert not mask.param_masks + index = mask.input_mask.mask_index[-1] + print(mask.input_mask.mask_index) + in_features = index.size()[0] + print('linear: ', in_features) + new_linear = torch.nn.Linear(in_features=in_features, + out_features=linear.out_features, + bias=linear.bias is not None) + print(linear.weight.data.size()) + print(new_linear.weight.data.size()) + print(linear.weight.t().size()) + print(new_linear.weight.t().size()) + new_linear.weight.data = torch.index_select(linear.weight.data, -1, index.to('cuda:0')) + print(new_linear.weight.data.size()) + if linear.bias is not None: + print(linear.bias.data.size()) + new_linear.bias.data = torch.index_select(linear.bias.data, 0, index.to('cuda:0')) + print(new_linear.bias.data.size()) + print("last print: ", new_linear.weight.t().size()) + return new_linear def replace_batchnorm2d(norm, mask): """ @@ -53,11 +87,17 @@ def replace_conv2d(conv, mask): kernel_size=conv.kernel_size, stride=conv.stride, padding=conv.padding, - dilation=conv.dilation) + dilation=conv.dilation, + groups=1, # currently only support groups is 1 + bias=conv.bias, + padding_mode=conv.padding_mode) + #print('weight: ', conv.weight.get_device()) + #print('bias', conv.bias.get_device()) tmp_weight_data = tmp_bias_data = None if mask.output_mask is not None: tmp_weight_data = torch.index_select(conv.weight.data, 0, out_channels_index) if conv.bias is not None: + print('bias is not None') tmp_bias_data = torch.index_select(conv.bias.data, 0, out_channels_index) # NOTE: does not support group if mask.input_mask is not None: @@ -66,4 +106,7 @@ def replace_conv2d(conv, mask): new_conv.weight.data = tmp_weight_data if tmp_bias_data is not None: new_conv.bias.data = tmp_bias_data + #new_conv.weight.to('cuda:0') + #new_conv.bias.to('cuda:0') + #print(new_conv.weight.get_device(), new_conv.bias.data, new_conv.bias.get_device()) return new_conv diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py 
b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py index f935e01c56..cff997dec5 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py @@ -48,6 +48,7 @@ def __init__(self, model, dummy_input, masks_file): it is used to parse dependencies between modules """ self.bound_model = model + self.dummy_input = dummy_input self.masks = torch.load(masks_file) self.trace_graph = torch.jit.trace(model, dummy_input) self.inferred_masks = dict() # key: module_name, value: ModuleMasks @@ -344,12 +345,18 @@ def replace_compressed_modules(self): compressed_module = replace_module[m_type](leaf_module, self.inferred_masks[module_name]) setattr(super_module, module_name.split('.')[-1], compressed_module) elif g_node.type == 'func': - ... + print("Cannot replace func...") else: raise RuntimeError("Unsupported GNode type: {}".format(g_node.type)) def speedup_model(self): """ """ + self.bound_model(self.dummy_input) + print("start to compress") self.infer_modules_masks() - self.replace_compressed_modules() \ No newline at end of file + self.replace_compressed_modules() + print("finished compressing") + for name, module in self.bound_model.named_modules(): + print(name, module) + self.bound_model(self.dummy_input) \ No newline at end of file From e51f28814320077af8348d1911a8838f83883de6 Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Tue, 28 Jan 2020 04:09:45 +0000 Subject: [PATCH 09/33] update --- examples/model_compress/slim_torch_cifar10.py | 81 ++++++++++--------- examples/model_compress/test.py | 26 ++++++ .../speedup/torch/compress_modules.py | 18 +++-- .../compression/speedup/torch/compressor.py | 12 ++- 4 files changed, 92 insertions(+), 45 deletions(-) create mode 100644 examples/model_compress/test.py diff --git a/examples/model_compress/slim_torch_cifar10.py b/examples/model_compress/slim_torch_cifar10.py index ebd36f44d4..36c89fb65c 100644 --- a/examples/model_compress/slim_torch_cifar10.py +++ b/examples/model_compress/slim_torch_cifar10.py @@ -5,6 +5,7 @@ from torchvision import datasets, transforms from nni.compression.torch import SlimPruner from models.cifar10.vgg import VGG +from nni.compression.speedup.torch import ModelSpeedup def updateBN(model): @@ -17,6 +18,7 @@ def train(model, device, train_loader, optimizer, sparse_bn=False): model.train() for batch_idx, (data, target) in enumerate(train_loader): data, target = data.to(device), target.to(device) + #print('data: ', data.size()) optimizer.zero_grad() output = model(data) loss = F.cross_entropy(output, target) @@ -73,7 +75,7 @@ def main(): # Train the base VGG-19 model print('=' * 10 + 'Train the unpruned base model' + '=' * 10) - epochs = 160 + epochs = 0 optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4) for epoch in range(epochs): if epoch in [epochs * 0.5, epochs * 0.75]: @@ -89,41 +91,48 @@ def main(): test(model, device, test_loader) # top1 = 93.60% - # Pruning Configuration, in paper 'Learning efficient convolutional networks through network slimming', - configure_list = [{ - 'sparsity': 0.7, - 'op_types': ['BatchNorm2d'], - }] - - # Prune model and test accuracy without fine tuning. 
- print('=' * 10 + 'Test the pruned model before fine tune' + '=' * 10) - pruner = SlimPruner(model, configure_list) - model = pruner.compress() - test(model, device, test_loader) - # top1 = 93.55% - - # Fine tune the pruned model for 40 epochs and test accuracy - print('=' * 10 + 'Fine tuning' + '=' * 10) - optimizer_finetune = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=1e-4) - best_top1 = 0 - for epoch in range(40): - pruner.update_epoch(epoch) - print('# Epoch {} #'.format(epoch)) - train(model, device, train_loader, optimizer_finetune) - top1 = test(model, device, test_loader) - if top1 > best_top1: - best_top1 = top1 - # Export the best model, 'model_path' stores state_dict of the pruned model, - # mask_path stores mask_dict of the pruned model - pruner.export_model(model_path='pruned_vgg19_cifar10.pth', mask_path='mask_vgg19_cifar10.pth') - - # Test the exported model - print('=' * 10 + 'Test the export pruned model after fine tune' + '=' * 10) - new_model = VGG(depth=19) - new_model.to(device) - new_model.load_state_dict(torch.load('pruned_vgg19_cifar10.pth')) - test(new_model, device, test_loader) - # top1 = 93.74% + speedup = True + if speedup == True: + #print(model) + dummy_input = torch.randn(64, 3, 32, 32) + m_speedup = ModelSpeedup(model, dummy_input.to(device), 'mask_vgg19_cifar10.pth') + m_speedup.speedup_model() + else: + # Pruning Configuration, in paper 'Learning efficient convolutional networks through network slimming', + configure_list = [{ + 'sparsity': 0.7, + 'op_types': ['BatchNorm2d'], + }] + + # Prune model and test accuracy without fine tuning. + print('=' * 10 + 'Test the pruned model before fine tune' + '=' * 10) + pruner = SlimPruner(model, configure_list) + model = pruner.compress() + test(model, device, test_loader) + # top1 = 93.55% + + # Fine tune the pruned model for 40 epochs and test accuracy + print('=' * 10 + 'Fine tuning' + '=' * 10) + optimizer_finetune = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=1e-4) + best_top1 = 0 + for epoch in range(4): + pruner.update_epoch(epoch) + print('# Epoch {} #'.format(epoch)) + train(model, device, train_loader, optimizer_finetune) + top1 = test(model, device, test_loader) + if top1 > best_top1: + best_top1 = top1 + # Export the best model, 'model_path' stores state_dict of the pruned model, + # mask_path stores mask_dict of the pruned model + pruner.export_model(model_path='pruned_vgg19_cifar10.pth', mask_path='mask_vgg19_cifar10.pth') + + # Test the exported model + print('=' * 10 + 'Test the export pruned model after fine tune' + '=' * 10) + new_model = VGG(depth=19) + new_model.to(device) + new_model.load_state_dict(torch.load('pruned_vgg19_cifar10.pth')) + test(new_model, device, test_loader) + # top1 = 93.74% if __name__ == '__main__': diff --git a/examples/model_compress/test.py b/examples/model_compress/test.py new file mode 100644 index 0000000000..09cd1bd5fd --- /dev/null +++ b/examples/model_compress/test.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv = nn.Conv2d(1, 1, 3) + + def forward(self, x): + return self.conv(x) + +if __name__ == '__main__': + n = Net() + example_weight = torch.rand(1, 1, 3, 3) + example_forward_input = torch.rand(1, 1, 3, 3) + + # Trace a specific method and construct `ScriptModule` with + # a single `forward` method + module = torch.jit.trace(n.forward, example_forward_input) + + # Trace a module (implicitly traces 
`forward`) and construct a + # `ScriptModule` with a single `forward` method + module = torch.jit.trace(n, example_forward_input) + print(module.graph) + print(torch._C._jit_pass_inline(module.graph)) + print(module.graph) diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py b/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py index c208975b46..01a6ab8569 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py @@ -74,14 +74,20 @@ def replace_conv2d(conv, mask): assert isinstance(mask, ModuleMasks) if mask.input_mask is None: in_channels = conv.in_channels + print('in_channels: ', in_channels) else: in_channels_index = mask.input_mask.mask_index[1] + #print('in_channels_index: ', in_channels_index) in_channels = in_channels_index.size()[0] + #print('in_channels: ', in_channels) if mask.output_mask is None: out_channels = conv.out_channels + #print('out_channels: ', out_channels) else: out_channels_index = mask.output_mask.mask_index[1] + #print('out_channels_index: ', out_channels_index) out_channels = out_channels_index.size()[0] + #print('out_channels: ', out_channels) new_conv = torch.nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=conv.kernel_size, @@ -93,6 +99,7 @@ def replace_conv2d(conv, mask): padding_mode=conv.padding_mode) #print('weight: ', conv.weight.get_device()) #print('bias', conv.bias.get_device()) + new_conv.to(conv.weight.device) tmp_weight_data = tmp_bias_data = None if mask.output_mask is not None: tmp_weight_data = torch.index_select(conv.weight.data, 0, out_channels_index) @@ -101,11 +108,12 @@ def replace_conv2d(conv, mask): tmp_bias_data = torch.index_select(conv.bias.data, 0, out_channels_index) # NOTE: does not support group if mask.input_mask is not None: - tmp_weight_data = torch.index_select(tmp_weight_data, 1, in_channels_index) - if tmp_weight_data is not None: - new_conv.weight.data = tmp_weight_data - if tmp_bias_data is not None: - new_conv.bias.data = tmp_bias_data + tmp_weight_data = torch.index_select(conv.weight.data if tmp_weight_data is None else tmp_weight_data, + 1, in_channels_index) + assert tmp_weight_data is not None + new_conv.weight.data.copy_(tmp_weight_data) + if conv.bias is not None: + new_conv.bias.data.copy_(conv.bias.data if tmp_bias_data is None else tmp_bias_data) #new_conv.weight.to('cuda:0') #new_conv.bias.to('cuda:0') #print(new_conv.weight.get_device(), new_conv.bias.data, new_conv.bias.get_device()) diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py index cff997dec5..08e4ee2bf1 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py @@ -51,6 +51,9 @@ def __init__(self, model, dummy_input, masks_file): self.dummy_input = dummy_input self.masks = torch.load(masks_file) self.trace_graph = torch.jit.trace(model, dummy_input) + #print("masks: ", self.masks) + #print(self.trace_graph) + #print(self.trace_graph.graph) self.inferred_masks = dict() # key: module_name, value: ModuleMasks self.g_nodes = list() self.global_count = 0 @@ -124,6 +127,7 @@ def _build_graph(self): """ """ graph = self.trace_graph.graph + #torch._C._jit_pass_inline(graph) print(graph) # build output mapping, from output debugName to its node output_to_node = dict() @@ -352,11 +356,11 @@ def replace_compressed_modules(self): def speedup_model(self): """ """ 
- self.bound_model(self.dummy_input) + #self.bound_model(self.dummy_input) print("start to compress") self.infer_modules_masks() self.replace_compressed_modules() print("finished compressing") - for name, module in self.bound_model.named_modules(): - print(name, module) - self.bound_model(self.dummy_input) \ No newline at end of file + #for name, module in self.bound_model.named_modules(): + # print(name, module) + #self.bound_model(self.dummy_input) \ No newline at end of file From f8304303f029ee69e226aba84c7e08ee89860112 Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Tue, 28 Jan 2020 07:42:58 +0000 Subject: [PATCH 10/33] update --- examples/model_compress/slim_torch_cifar10.py | 18 ++++++++++---- .../speedup/torch/compress_modules.py | 24 ++++++++++--------- .../compression/speedup/torch/compressor.py | 3 ++- .../pynni/nni/compression/torch/__init__.py | 1 + 4 files changed, 30 insertions(+), 16 deletions(-) diff --git a/examples/model_compress/slim_torch_cifar10.py b/examples/model_compress/slim_torch_cifar10.py index 36c89fb65c..49b6bc6aaa 100644 --- a/examples/model_compress/slim_torch_cifar10.py +++ b/examples/model_compress/slim_torch_cifar10.py @@ -6,6 +6,7 @@ from nni.compression.torch import SlimPruner from models.cifar10.vgg import VGG from nni.compression.speedup.torch import ModelSpeedup +from nni.compression.torch import apply_compression_results def updateBN(model): @@ -92,11 +93,20 @@ def main(): # top1 = 93.60% speedup = True + mask_flag = True if speedup == True: - #print(model) - dummy_input = torch.randn(64, 3, 32, 32) - m_speedup = ModelSpeedup(model, dummy_input.to(device), 'mask_vgg19_cifar10.pth') - m_speedup.speedup_model() + dummy_input = torch.ones([64, 3, 32, 32]) + if mask_flag: + apply_compression_results(model, 'mask_vgg19_cifar10.pth') + out = model(dummy_input.to(device)) + print(out.size(), out) + return + else: + m_speedup = ModelSpeedup(model, dummy_input.to(device), 'mask_vgg19_cifar10.pth') + m_speedup.speedup_model() + out = model(dummy_input.to(device)) + print(out.size(), out) + return else: # Pruning Configuration, in paper 'Learning efficient convolutional networks through network slimming', configure_list = [{ diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py b/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py index 01a6ab8569..cd3df0cc2d 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py @@ -25,23 +25,25 @@ def replace_linear(linear, mask): assert mask.output_mask is None assert not mask.param_masks index = mask.input_mask.mask_index[-1] - print(mask.input_mask.mask_index) + #print(mask.input_mask.mask_index) in_features = index.size()[0] - print('linear: ', in_features) + #print('linear: ', in_features) new_linear = torch.nn.Linear(in_features=in_features, out_features=linear.out_features, bias=linear.bias is not None) - print(linear.weight.data.size()) - print(new_linear.weight.data.size()) - print(linear.weight.t().size()) - print(new_linear.weight.t().size()) + new_linear.to(linear.weight.device) + #print(linear.weight.data.size()) + #print(new_linear.weight.data.size()) + #print(linear.weight.t().size()) + #print(new_linear.weight.t().size()) new_linear.weight.data = torch.index_select(linear.weight.data, -1, index.to('cuda:0')) - print(new_linear.weight.data.size()) + #print(new_linear.weight.data.size()) if linear.bias is not None: - print(linear.bias.data.size()) - new_linear.bias.data = 
torch.index_select(linear.bias.data, 0, index.to('cuda:0')) - print(new_linear.bias.data.size()) - print("last print: ", new_linear.weight.t().size()) + #print(linear.bias.data.size()) + #new_linear.bias.data = torch.index_select(linear.bias.data, 0, index.to('cuda:0')) + new_linear.bias.data.copy_(linear.bias.data) + #print(new_linear.bias.data.size()) + #print("last print: ", new_linear.weight.t().size()) return new_linear def replace_batchnorm2d(norm, mask): diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py index 08e4ee2bf1..adce001098 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py @@ -49,7 +49,8 @@ def __init__(self, model, dummy_input, masks_file): """ self.bound_model = model self.dummy_input = dummy_input - self.masks = torch.load(masks_file) + ori_masks = torch.load(masks_file) + self.masks = {'feature.1': ori_masks['feature.1']} self.trace_graph = torch.jit.trace(model, dummy_input) #print("masks: ", self.masks) #print(self.trace_graph) diff --git a/src/sdk/pynni/nni/compression/torch/__init__.py b/src/sdk/pynni/nni/compression/torch/__init__.py index d79a8f76c4..432cdf1529 100644 --- a/src/sdk/pynni/nni/compression/torch/__init__.py +++ b/src/sdk/pynni/nni/compression/torch/__init__.py @@ -6,3 +6,4 @@ from .weight_rank_filter_pruners import * from .activation_rank_filter_pruners import * from .quantizers import * +from .apply_compression import apply_compression_results From ab7f23dfd74022aa8f716b925e167379901f6d06 Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Tue, 28 Jan 2020 14:35:16 +0000 Subject: [PATCH 11/33] update --- examples/model_compress/slim_torch_cifar10.py | 14 ++- examples/model_compress/test2.py | 118 ++++++++++++++++++ examples/model_compress/test3.py | 40 ++++++ .../speedup/torch/compress_modules.py | 20 ++- .../compression/speedup/torch/compressor.py | 2 + .../pynni/nni/compression/torch/compressor.py | 2 + 6 files changed, 185 insertions(+), 11 deletions(-) create mode 100644 examples/model_compress/test2.py create mode 100644 examples/model_compress/test3.py diff --git a/examples/model_compress/slim_torch_cifar10.py b/examples/model_compress/slim_torch_cifar10.py index 49b6bc6aaa..6e2850b49b 100644 --- a/examples/model_compress/slim_torch_cifar10.py +++ b/examples/model_compress/slim_torch_cifar10.py @@ -8,6 +8,7 @@ from nni.compression.speedup.torch import ModelSpeedup from nni.compression.torch import apply_compression_results +torch.manual_seed(0) def updateBN(model): for m in model.modules(): @@ -84,13 +85,14 @@ def main(): param_group['lr'] *= 0.1 train(model, device, train_loader, optimizer, True) test(model, device, test_loader) - torch.save(model.state_dict(), 'vgg19_cifar10.pth') + #torch.save(model.state_dict(), 'vgg19_cifar10.pth') # Test base model accuracy print('=' * 10 + 'Test the original model' + '=' * 10) - model.load_state_dict(torch.load('vgg19_cifar10.pth')) - test(model, device, test_loader) + #model.load_state_dict(torch.load('vgg19_cifar10.pth')) + #test(model, device, test_loader) # top1 = 93.60% + model.train() speedup = True mask_flag = True @@ -99,13 +101,15 @@ def main(): if mask_flag: apply_compression_results(model, 'mask_vgg19_cifar10.pth') out = model(dummy_input.to(device)) - print(out.size(), out) + #print(out.size(), out) return else: + #print("model before: ", model) m_speedup = ModelSpeedup(model, dummy_input.to(device), 'mask_vgg19_cifar10.pth') 
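            # Both branches start from the same checkpoint and mask file. The mask
            # branch above only zeroes weights inside forward, leaving tensor shapes
            # unchanged, while ModelSpeedup physically replaces the pruned modules.
            # For truly channel-level masks the two outputs should agree up to small
            # numeric drift, e.g. (hypothetical check; out_masked and out_speedup
            # are the logits produced by the two branches):
            #   torch.allclose(out_masked, out_speedup, atol=1e-4)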
m_speedup.speedup_model() + #print("model after: ", model) out = model(dummy_input.to(device)) - print(out.size(), out) + #print(out.size(), out) return else: # Pruning Configuration, in paper 'Learning efficient convolutional networks through network slimming', diff --git a/examples/model_compress/test2.py b/examples/model_compress/test2.py new file mode 100644 index 0000000000..ebc1e233d7 --- /dev/null +++ b/examples/model_compress/test2.py @@ -0,0 +1,118 @@ +import torch +import torch.nn as nn + +torch.manual_seed(0) + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv = nn.Conv2d(4, 8, 3) + self.bn = nn.BatchNorm2d(8) + self.relu = nn.ReLU() + self.conv2 = nn.Conv2d(8, 4, 5) + + def forward(self, x): + out = self.conv(x) + out = self.bn(out) + #print(out) + out = self.relu(out) + #print(out) + out = self.conv2(out) + #print(out) + return out + +def add_masks(model): + bn = getattr(model, 'bn') + print(bn) + #print('before: ', bn.weight.data) + bn.weight.data = bn.weight.data * torch.tensor([0,1,0,1,0,1,0,1]) + #print('after', bn.weight.data) + bn.bias.data = bn.bias.data * torch.tensor([0,1,0,1,0,1,0,1]) + +def model_speedup(model): + index = torch.tensor([1,3,5,7]) + #----- + conv = getattr(model, 'conv') + new_conv = torch.nn.Conv2d(in_channels=conv.in_channels, + out_channels=4, + kernel_size=conv.kernel_size, + stride=conv.stride, + padding=conv.padding, + dilation=conv.dilation, + groups=1, # currently only support groups is 1 + bias=conv.bias is not None, + padding_mode=conv.padding_mode) + tmp_weight_data = tmp_bias_data = None + tmp_weight_data = torch.index_select(conv.weight.data, 0, index) + if conv.bias is not None: + tmp_bias_data = torch.index_select(conv.bias.data, 0, index) + new_conv.weight.data.copy_(tmp_weight_data) + if conv.bias is not None: + new_conv.bias.data.copy_(conv.bias.data if tmp_bias_data is None else tmp_bias_data) + setattr(model, 'conv', new_conv) + #------- + norm = getattr(model, 'bn') + new_norm = torch.nn.BatchNorm2d(num_features=4, + eps=norm.eps, + momentum=norm.momentum, + affine=norm.affine, + track_running_stats=norm.track_running_stats) + # assign weights + new_norm.weight.data = torch.index_select(norm.weight.data, 0, index) + new_norm.bias.data = torch.index_select(norm.bias.data, 0, index) + if norm.track_running_stats: + new_norm.running_mean.data = torch.index_select(norm.running_mean.data, 0, index) + new_norm.running_var.data = torch.index_select(norm.running_var.data, 0, index) + setattr(model, 'bn', new_norm) + #--------- + conv2 = getattr(model, 'conv2') + new_conv2 = torch.nn.Conv2d(in_channels=4, + out_channels=conv2.out_channels, + kernel_size=conv2.kernel_size, + stride=conv2.stride, + padding=conv2.padding, + dilation=conv2.dilation, + groups=1, # currently only support groups is 1 + bias=conv2.bias is not None, + padding_mode=conv2.padding_mode) + tmp_weight_data = tmp_bias_data = None + #print('before select: ', conv2.weight.data.size(), conv2.weight.data) + tmp_weight_data = torch.index_select(conv2.weight.data, 1, index) + #print('after select: ', tmp_weight_data.size(), tmp_weight_data) + new_conv2.weight.data.copy_(tmp_weight_data) + if conv2.bias is not None: + new_conv2.bias.data.copy_(conv2.bias.data) + setattr(model, 'conv2', new_conv2) + #---------- + +if __name__ == '__main__': + n = Net() + n.train() + dummy_input = torch.randn(6, 4, 16, 16) + mask_flag = True + if mask_flag: + add_masks(n) + else: + conv = getattr(n, 'conv') + #print('conv before: ', conv.weight.data.size(), 
conv.weight.data) + model_speedup(n) + bn = getattr(n, 'bn') + #print('bn: ', bn.weight.data) + conv = getattr(n, 'conv') + #print('conv after: ', conv.weight.data.size(), conv.weight.data) + out = n(dummy_input) + print(out.size(), out) + + '''example_weight = torch.rand(1, 1, 3, 3) + example_forward_input = torch.rand(1, 1, 3, 3) + + # Trace a specific method and construct `ScriptModule` with + # a single `forward` method + module = torch.jit.trace(n.forward, example_forward_input) + + # Trace a module (implicitly traces `forward`) and construct a + # `ScriptModule` with a single `forward` method + module = torch.jit.trace(n, example_forward_input) + print(module.graph) + print(torch._C._jit_pass_inline(module.graph)) + print(module.graph)''' diff --git a/examples/model_compress/test3.py b/examples/model_compress/test3.py new file mode 100644 index 0000000000..1fe508e5a5 --- /dev/null +++ b/examples/model_compress/test3.py @@ -0,0 +1,40 @@ +import torch +import torch.nn as nn + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv = nn.Conv2d(2, 1, 3) + + def forward(self, x): + return self.conv(x) + +if __name__ == '__main__': + n = Net() + conv = getattr(n, 'conv') + conv.weight.data.fill_(0.5) + conv.bias.data.fill_(0.1) + #--------------- + index = torch.tensor([1]) + conv2 = getattr(n, 'conv') + new_conv2 = torch.nn.Conv2d(in_channels=1, + out_channels=conv2.out_channels, + kernel_size=conv2.kernel_size, + stride=conv2.stride, + padding=conv2.padding, + dilation=conv2.dilation, + groups=1, # currently only support groups is 1 + bias=conv2.bias is not None, + padding_mode=conv2.padding_mode) + tmp_weight_data = tmp_bias_data = None + print('before select: ', conv2.weight.data.size(), conv2.weight.data) + tmp_weight_data = torch.index_select(conv2.weight.data, 1, index) + print('after select: ', tmp_weight_data.size(), tmp_weight_data) + new_conv2.weight.data.copy_(tmp_weight_data) + if conv2.bias is not None: + new_conv2.bias.data.copy_(conv2.bias.data) + setattr(n, 'conv', new_conv2) + #---------------- + dummy_input = torch.zeros([1, 1, 3, 3]) + out = n(dummy_input) + print(out) diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py b/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py index cd3df0cc2d..bb38782849 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py @@ -53,6 +53,7 @@ def replace_batchnorm2d(norm, mask): assert 'weight' in mask.param_masks and 'bias' in mask.param_masks index = mask.param_masks['weight'].mask_index[0] num_features = index.size()[0] + print("replace batchnorm2d: ", num_features, index) new_norm = torch.nn.BatchNorm2d(num_features=num_features, eps=norm.eps, momentum=norm.momentum, @@ -61,6 +62,8 @@ def replace_batchnorm2d(norm, mask): # assign weights new_norm.weight.data = torch.index_select(norm.weight.data, 0, index) new_norm.bias.data = torch.index_select(norm.bias.data, 0, index) + #print('new_norm weight data: ', new_norm.weight.data) + #print('new_norm bias data: ', new_norm.bias.data) if norm.track_running_stats: new_norm.running_mean.data = torch.index_select(norm.running_mean.data, 0, index) new_norm.running_var.data = torch.index_select(norm.running_var.data, 0, index) @@ -79,17 +82,17 @@ def replace_conv2d(conv, mask): print('in_channels: ', in_channels) else: in_channels_index = mask.input_mask.mask_index[1] - #print('in_channels_index: ', in_channels_index) + 
print('in_channels_index: ', in_channels_index) in_channels = in_channels_index.size()[0] - #print('in_channels: ', in_channels) + print('in_channels: ', in_channels) if mask.output_mask is None: out_channels = conv.out_channels - #print('out_channels: ', out_channels) + print('out_channels: ', out_channels) else: out_channels_index = mask.output_mask.mask_index[1] - #print('out_channels_index: ', out_channels_index) + print('out_channels_index: ', out_channels_index) out_channels = out_channels_index.size()[0] - #print('out_channels: ', out_channels) + print('out_channels: ', out_channels) new_conv = torch.nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=conv.kernel_size, @@ -97,26 +100,31 @@ def replace_conv2d(conv, mask): padding=conv.padding, dilation=conv.dilation, groups=1, # currently only support groups is 1 - bias=conv.bias, + bias=conv.bias is not None, padding_mode=conv.padding_mode) #print('weight: ', conv.weight.get_device()) #print('bias', conv.bias.get_device()) + #print('conv2d weight: ', conv.weight.data.size(), conv.weight.data) new_conv.to(conv.weight.device) tmp_weight_data = tmp_bias_data = None if mask.output_mask is not None: + print('mask output_mask is not None') tmp_weight_data = torch.index_select(conv.weight.data, 0, out_channels_index) if conv.bias is not None: print('bias is not None') tmp_bias_data = torch.index_select(conv.bias.data, 0, out_channels_index) # NOTE: does not support group if mask.input_mask is not None: + print('mask input_mask is not None') tmp_weight_data = torch.index_select(conv.weight.data if tmp_weight_data is None else tmp_weight_data, 1, in_channels_index) assert tmp_weight_data is not None new_conv.weight.data.copy_(tmp_weight_data) if conv.bias is not None: + print('final conv.bias is not None') new_conv.bias.data.copy_(conv.bias.data if tmp_bias_data is None else tmp_bias_data) #new_conv.weight.to('cuda:0') #new_conv.bias.to('cuda:0') #print(new_conv.weight.get_device(), new_conv.bias.data, new_conv.bias.get_device()) + #print('new conv2d weight: ', new_conv.weight.data.size(), new_conv.weight.data) return new_conv diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py index adce001098..2156afbf91 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py @@ -51,7 +51,9 @@ def __init__(self, model, dummy_input, masks_file): self.dummy_input = dummy_input ori_masks = torch.load(masks_file) self.masks = {'feature.1': ori_masks['feature.1']} + model.eval() self.trace_graph = torch.jit.trace(model, dummy_input) + model.train() #print("masks: ", self.masks) #print(self.trace_graph) #print(self.trace_graph.graph) diff --git a/src/sdk/pynni/nni/compression/torch/compressor.py b/src/sdk/pynni/nni/compression/torch/compressor.py index 74bc3cb8a3..7b29e45027 100644 --- a/src/sdk/pynni/nni/compression/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/torch/compressor.py @@ -201,12 +201,14 @@ def new_forward(*inputs): old_weight = layer.module.weight.data mask_weight = mask['weight'] layer.module.weight.data = old_weight.mul(mask_weight) + #print('instrument weight: ', layer.module.weight.data) # apply mask to bias if mask.__contains__('bias') and hasattr(layer.module, 'bias') and layer.module.bias is not None: old_bias = layer.module.bias.data mask_bias = mask['bias'] mask_bias.to(device) layer.module.bias.data = old_bias.mul(mask_bias) + #print('instrument bias: 
', layer.module.bias.data) # calculate forward ret = layer._forward(*inputs) return ret From 98e75c215fc18c14ef57bea30616aaf7ca9b36cb Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Wed, 29 Jan 2020 03:08:14 +0000 Subject: [PATCH 12/33] pass eval result validate, but has very small difference --- examples/model_compress/models/cifar10/vgg.py | 1 + examples/model_compress/slim_torch_cifar10.py | 10 ++-- examples/model_compress/test2.py | 10 +++- .../compression/speedup/torch/compressor.py | 18 +++++-- .../compression/torch/apply_compression.py | 51 +++++++++++++++++++ 5 files changed, 79 insertions(+), 11 deletions(-) create mode 100644 src/sdk/pynni/nni/compression/torch/apply_compression.py diff --git a/examples/model_compress/models/cifar10/vgg.py b/examples/model_compress/models/cifar10/vgg.py index f293770c72..44222d1fdb 100644 --- a/examples/model_compress/models/cifar10/vgg.py +++ b/examples/model_compress/models/cifar10/vgg.py @@ -43,6 +43,7 @@ def make_layers(self, cfg, batch_norm=False): def forward(self, x): x = self.feature(x) + #print('x'*10, x) x = nn.AvgPool2d(2)(x) x = x.view(x.size(0), -1) y = self.classifier(x) diff --git a/examples/model_compress/slim_torch_cifar10.py b/examples/model_compress/slim_torch_cifar10.py index 6e2850b49b..563afa59e9 100644 --- a/examples/model_compress/slim_torch_cifar10.py +++ b/examples/model_compress/slim_torch_cifar10.py @@ -92,16 +92,16 @@ def main(): #model.load_state_dict(torch.load('vgg19_cifar10.pth')) #test(model, device, test_loader) # top1 = 93.60% - model.train() + model.eval() speedup = True - mask_flag = True + mask_flag = False if speedup == True: - dummy_input = torch.ones([64, 3, 32, 32]) + dummy_input = torch.randn(64, 3, 32, 32) if mask_flag: apply_compression_results(model, 'mask_vgg19_cifar10.pth') out = model(dummy_input.to(device)) - #print(out.size(), out) + print(out.size(), out) return else: #print("model before: ", model) @@ -109,7 +109,7 @@ def main(): m_speedup.speedup_model() #print("model after: ", model) out = model(dummy_input.to(device)) - #print(out.size(), out) + print(out.size(), out) return else: # Pruning Configuration, in paper 'Learning efficient convolutional networks through network slimming', diff --git a/examples/model_compress/test2.py b/examples/model_compress/test2.py index ebc1e233d7..93d0582208 100644 --- a/examples/model_compress/test2.py +++ b/examples/model_compress/test2.py @@ -61,6 +61,7 @@ def model_speedup(model): new_norm.weight.data = torch.index_select(norm.weight.data, 0, index) new_norm.bias.data = torch.index_select(norm.bias.data, 0, index) if norm.track_running_stats: + print('*'*30, norm.track_running_stats) new_norm.running_mean.data = torch.index_select(norm.running_mean.data, 0, index) new_norm.running_var.data = torch.index_select(norm.running_var.data, 0, index) setattr(model, 'bn', new_norm) @@ -87,15 +88,22 @@ def model_speedup(model): if __name__ == '__main__': n = Net() - n.train() + n.eval() + bn = getattr(n, 'bn') + print('bn track_running_stats: ', bn.track_running_stats) + print('bn running mean: ', bn.running_mean) + print('bn running var: ', bn.running_var) + print('bn momentum: ', bn.momentum) dummy_input = torch.randn(6, 4, 16, 16) mask_flag = True if mask_flag: add_masks(n) + #print('abc') else: conv = getattr(n, 'conv') #print('conv before: ', conv.weight.data.size(), conv.weight.data) model_speedup(n) + n.eval() bn = getattr(n, 'bn') #print('bn: ', bn.weight.data) conv = getattr(n, 'conv') diff --git 
a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py index 2156afbf91..3a925815e3 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py @@ -49,11 +49,15 @@ def __init__(self, model, dummy_input, masks_file): """ self.bound_model = model self.dummy_input = dummy_input - ori_masks = torch.load(masks_file) - self.masks = {'feature.1': ori_masks['feature.1']} - model.eval() + self.masks = torch.load(masks_file) + #ori_masks = torch.load(masks_file) + #self.masks = {'feature.1': ori_masks['feature.1']} + self.is_training = model.training + if self.is_training: + model.eval() self.trace_graph = torch.jit.trace(model, dummy_input) - model.train() + if self.is_training: + model.train() #print("masks: ", self.masks) #print(self.trace_graph) #print(self.trace_graph.graph) @@ -366,4 +370,8 @@ def speedup_model(self): print("finished compressing") #for name, module in self.bound_model.named_modules(): # print(name, module) - #self.bound_model(self.dummy_input) \ No newline at end of file + #self.bound_model(self.dummy_input) + if self.is_training: + self.bound_model.train() + else: + self.bound_model.eval() \ No newline at end of file diff --git a/src/sdk/pynni/nni/compression/torch/apply_compression.py b/src/sdk/pynni/nni/compression/torch/apply_compression.py new file mode 100644 index 0000000000..416f95d33e --- /dev/null +++ b/src/sdk/pynni/nni/compression/torch/apply_compression.py @@ -0,0 +1,51 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import logging +import torch +from .compressor import Pruner + +logger = logging.getLogger('torch apply compression') + +def apply_compression_results(model, masks_file): + """ + """ + apply_comp = ApplyCompression(model, masks_file) + apply_comp.compress() + +class ApplyCompression(Pruner): + """ + Prune to an exact pruning level specification + """ + + def __init__(self, model, masks_file): + """ + Parameters + ---------- + model : torch.nn.module + Model to be pruned + config_list : list + List on pruning configs + """ + self.bound_model = model + self.masks = torch.load(masks_file) + #ori_masks = torch.load(masks_file) + #self.masks = {'feature.1': ori_masks['feature.1']} + for module_name in self.masks: + print('module_name: ', module_name) + config_list = self._build_config() + super().__init__(model, config_list) + + def _build_config(self): + op_names = [] + for module_name in self.masks: + op_names.append(module_name) + return [{'sparsity': 1, 'op_types': ['BatchNorm2d'], 'op_names': op_names}] + + def calc_mask(self, layer, config): + """ + """ + assert layer.name in self.masks + #print('calc_mask: ', layer.name, self.masks[layer.name]) + print('calc_mask: ', layer.name, layer.type) + return self.masks[layer.name] From ff413d1061f50109c2cd56eecf69cb72f13e606d Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Wed, 29 Jan 2020 08:15:56 +0000 Subject: [PATCH 13/33] add model_speedup.py --- examples/model_compress/model_speedup.py | 95 ++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 examples/model_compress/model_speedup.py diff --git a/examples/model_compress/model_speedup.py b/examples/model_compress/model_speedup.py new file mode 100644 index 0000000000..007a95ce4c --- /dev/null +++ b/examples/model_compress/model_speedup.py @@ -0,0 +1,95 @@ +import argparse +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchvision 
import datasets, transforms +from models.cifar10.vgg import VGG +from nni.compression.speedup.torch import ModelSpeedup +from nni.compression.torch import apply_compression_results + +torch.manual_seed(0) +use_mask = True + +def fpgm_speedup(masks_file, model_checkpoint): + from fpgm_torch_mnist import Mnist + device = torch.device('cpu') + trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) + train_loader = torch.utils.data.DataLoader( + datasets.MNIST('data', train=True, download=True, transform=trans), + batch_size=64, shuffle=True) + test_loader = torch.utils.data.DataLoader( + datasets.MNIST('data', train=False, transform=trans), + batch_size=1000, shuffle=True) + + model = Mnist() + model.to(device) + model.print_conv_filter_sparsity() + + dummy_input = torch.randn(64, 1, 28, 28) + if use_mask: + apply_compression_results(model, masks_file) + out = model(dummy_input.to(device)) + print(out.size(), out) + return + else: + m_speedup = ModelSpeedup(model, dummy_input.to(device), masks_file) + m_speedup.speedup_model() + out = model(dummy_input.to(device)) + print(out.size(), out) + return + +def slim_speedup(masks_file, model_checkpoint): + device = torch.device('cuda') + train_loader = torch.utils.data.DataLoader( + datasets.CIFAR10('./data.cifar10', train=True, download=True, + transform=transforms.Compose([ + transforms.Pad(4), + transforms.RandomCrop(32), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) + ])), + batch_size=64, shuffle=True) + test_loader = torch.utils.data.DataLoader( + datasets.CIFAR10('./data.cifar10', train=False, transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) + ])), + batch_size=200, shuffle=False) + + model = VGG(depth=19) + model.to(device) + model.eval() + + dummy_input = torch.randn(64, 3, 32, 32) + if use_mask: + apply_compression_results(model, masks_file) + out = model(dummy_input.to(device)) + print(out.size(), out) + return + else: + #print("model before: ", model) + m_speedup = ModelSpeedup(model, dummy_input.to(device), masks_file) + m_speedup.speedup_model() + #print("model after: ", model) + out = model(dummy_input.to(device)) + print(out.size(), out) + return + +if __name__ == '__main__': + parser = argparse.ArgumentParser("speedup") + parser.add_argument("--example_name", type=str, default="fpgm", help="the name of pruning example") + parser.add_argument("--masks_file", type=str, default=None, help="the path of the masks file") + parser.add_argument("--model_checkpoint", type=str, default=None, help="the path of checkpointed model") + args = parser.parse_args() + + if args.example_name == 'slim': + if args.masks_file is None: + args.masks_file = 'mask_vgg19_cifar10.pth' + slim_speedup(args.masks_file, args.model_checkpoint) + elif args.example_name == 'fpgm': + if args.masks_file is None: + args.masks_file = '' + fpgm_speedup(args.masks_file, args.model_checkpoint) + else: + raise ValueError('unsupported example_name: {}'.format(args.example_name)) From d83f1908099ae48f0889e87f3ef0a4e7b3a896d8 Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Thu, 30 Jan 2020 15:28:40 +0000 Subject: [PATCH 14/33] update --- examples/model_compress/model_speedup.py | 4 +- .../speedup/torch/compress_modules.py | 2 +- .../compression/speedup/torch/infer_shape.py | 50 +++++++++++++++++++ 3 files changed, 53 insertions(+), 3 deletions(-) diff --git 
a/examples/model_compress/model_speedup.py b/examples/model_compress/model_speedup.py index 007a95ce4c..afae585f95 100644 --- a/examples/model_compress/model_speedup.py +++ b/examples/model_compress/model_speedup.py @@ -8,7 +8,7 @@ from nni.compression.torch import apply_compression_results torch.manual_seed(0) -use_mask = True +use_mask = False def fpgm_speedup(masks_file, model_checkpoint): from fpgm_torch_mnist import Mnist @@ -89,7 +89,7 @@ def slim_speedup(masks_file, model_checkpoint): slim_speedup(args.masks_file, args.model_checkpoint) elif args.example_name == 'fpgm': if args.masks_file is None: - args.masks_file = '' + args.masks_file = 'mask.pth' fpgm_speedup(args.masks_file, args.model_checkpoint) else: raise ValueError('unsupported example_name: {}'.format(args.example_name)) diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py b/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py index bb38782849..6b42d475d0 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py @@ -36,7 +36,7 @@ def replace_linear(linear, mask): #print(new_linear.weight.data.size()) #print(linear.weight.t().size()) #print(new_linear.weight.t().size()) - new_linear.weight.data = torch.index_select(linear.weight.data, -1, index.to('cuda:0')) + new_linear.weight.data = torch.index_select(linear.weight.data, -1, index.to(linear.weight.device)) #print(new_linear.weight.data.size()) if linear.bias is not None: #print(linear.bias.data.size()) diff --git a/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py b/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py index 713ee0f7bd..fb01e62578 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py @@ -62,8 +62,10 @@ def set_output_mask(self, mask): infer_from_inshape = { 'ReLU': lambda module_masks, mask: relu_inshape(module_masks, mask), + 'aten::relu': lambda module_masks, mask: relu_inshape(module_masks, mask), 'Conv2d': lambda module_masks, mask: conv2d_inshape(module_masks, mask), 'MaxPool2d': lambda module_masks, mask: maxpool2d_inshape(module_masks, mask), + 'aten::max_pool2d': lambda module_masks, mask: maxpool2d_inshape(module_masks, mask), 'aten::avg_pool2d': lambda module_masks, mask: maxpool2d_inshape(module_masks, mask), 'AvgPool2d': lambda module_masks, mask: maxpool2d_inshape(module_masks, mask), 'aten::size': lambda module_masks, mask: size_inshape(module_masks, mask), @@ -158,6 +160,54 @@ def batchnorm2d_mask(module_masks, mask): def conv2d_mask(module_masks, mask): """ """ + def convert_to_coarse_mask(mask): + assert 'weight' in mask + assert isinstance(mask['weight'], torch.Tensor) + cmask = None + weight_mask = mask['weight'] + shape = weight_mask.size() + ones = torch.ones(shape[1:]) + zeros = torch.zeros(shape[1:]) + index = [] + for i in range(shape[0]): + if torch.all(torch.eq(weight_mask[i], ones)): + index.append(i) + elif torch.all(torch.eq(weight_mask[i], zeros)): + continue + else: + index = None + break + if index is None: + return None, None, None + else: + index = torch.LongTensor(index) + weight_cmask = CoarseMask(num_dim=4) + weight_cmask.add_index_mask(dim=0, index=index) + bias_cmask = None + if 'bias' in mask: + bias_index = torch.nonzero(mask['bias'], as_tuple=True)[0] + assert torch.all(torch.eq(index, bias_index)) + bias_cmask = CoarseMask(num_dim=1) + bias_cmask.add_index_mask(dim=0, index=bias_index) + return 
index, weight_cmask, bias_cmask + index, weight_cmask, bias_cmask = convert_to_coarse_mask(mask) + if index is None: + # TODO: fine grained mask speedup + return None, None + # deal with coarse grain mask + if 'weight' in module_masks.param_masks: + module_masks.param_masks['weight'].merge(weight_cmask) + module_masks.param_masks['bias'].merge(bias_cmask) + else: + module_masks.set_param_masks('weight', weight_cmask) + module_masks.set_param_masks('bias', bias_cmask) + output_cmask = CoarseMask(num_dim=4) + output_cmask.add_index_mask(dim=1, index=index) + if module_masks.output_mask is None: + module_masks.set_output_mask(output_cmask) + else: + module_masks.output_mask.merge(output_cmask) + return None, module_masks.output_mask def conv2d_inshape(module_masks, mask): """ From ff7e79dbbae63002bdb33e7441feb43234fbc521 Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Fri, 31 Jan 2020 10:09:02 +0000 Subject: [PATCH 15/33] pass fpgm test --- examples/model_compress/fpgm_torch_mnist.py | 3 +- examples/model_compress/model_speedup.py | 2 +- .../speedup/torch/compress_modules.py | 4 +-- .../compression/speedup/torch/compressor.py | 34 ++++++++++++++++--- .../compression/speedup/torch/infer_shape.py | 19 +++++++---- .../compression/torch/apply_compression.py | 2 +- .../pynni/nni/compression/torch/compressor.py | 1 + 7 files changed, 49 insertions(+), 16 deletions(-) diff --git a/examples/model_compress/fpgm_torch_mnist.py b/examples/model_compress/fpgm_torch_mnist.py index e9c70be56c..ae925af842 100644 --- a/examples/model_compress/fpgm_torch_mnist.py +++ b/examples/model_compress/fpgm_torch_mnist.py @@ -16,7 +16,8 @@ def forward(self, x): x = F.max_pool2d(x, 2, 2) x = F.relu(self.conv2(x)) x = F.max_pool2d(x, 2, 2) - x = x.view(-1, 4 * 4 * 50) + #x = x.view(-1, 4 * 4 * 50) + x = x.view(64, -1) x = F.relu(self.fc1(x)) x = self.fc2(x) return F.log_softmax(x, dim=1) diff --git a/examples/model_compress/model_speedup.py b/examples/model_compress/model_speedup.py index afae585f95..889b1550fc 100644 --- a/examples/model_compress/model_speedup.py +++ b/examples/model_compress/model_speedup.py @@ -8,7 +8,7 @@ from nni.compression.torch import apply_compression_results torch.manual_seed(0) -use_mask = False +use_mask = True def fpgm_speedup(masks_file, model_checkpoint): from fpgm_torch_mnist import Mnist diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py b/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py index 6b42d475d0..cd421b6e0e 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py @@ -25,9 +25,9 @@ def replace_linear(linear, mask): assert mask.output_mask is None assert not mask.param_masks index = mask.input_mask.mask_index[-1] - #print(mask.input_mask.mask_index) + print(mask.input_mask.mask_index) in_features = index.size()[0] - #print('linear: ', in_features) + print('linear: ', in_features) new_linear = torch.nn.Linear(in_features=in_features, out_features=linear.out_features, bias=linear.bias is not None) diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py index 3a925815e3..fe2c7cf64e 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py @@ -26,6 +26,7 @@ def __init__(self, node_name, node_type, op_type, inputs, outputs, nodes): self.inputs = inputs self.outputs = outputs self.nodes = nodes + 
self.auxiliary = None # store supplementary information for different op types class ModelSpeedup: """ @@ -130,6 +131,20 @@ def _expand_non_prim_node(self, node, nodes, input_to_node, output_to_node): print('v' * 30) return g_node + def _extract_shape_info(self, node): + """ + """ + t_input = None + for _input in node.inputs(): + t_input = _input + break + t_output = node.output() + assert isinstance(t_input.type(), torch._C.TensorType) + assert isinstance(t_output.type(), torch._C.TensorType) + in_shape = t_input.type().sizes() + out_shape = t_output.type().sizes() + return {'in_shape': in_shape, 'out_shape': out_shape} + def _build_graph(self): """ """ @@ -158,6 +173,7 @@ def _build_graph(self): #print("graph_outputs: ", graph_outputs) for node in graph.nodes(): + # populate output_to_node and input_to_node for output in node.outputs(): output_name = output.debugName() output_to_node[output_name] = node @@ -236,6 +252,9 @@ def _build_graph(self): for node in non_prim_nodes: g_node = self._expand_non_prim_node(node, nodes, input_to_node, output_to_node) self.g_nodes.append(g_node) + # get shape infor for view (aten::view) func + if g_node.op_type == 'aten::view': + g_node.auxiliary = self._extract_shape_info(node) # build index for g_nodes name_to_gnode, input_to_gnode, output_to_gnode = self._build_index_for_gnodes(self.g_nodes) @@ -305,17 +324,22 @@ def infer_module_mask(self, module_name, mask=None, in_shape=None, out_shape=Non self.inferred_masks[module_name] = module_masks m_type = self.name_to_gnode[module_name].op_type - if m_type == 'VGG': - print("VGG module name: ", module_name) - for node in self.name_to_gnode[module_name].nodes: - print(node) + #if m_type == 'VGG': + # print("VGG module name: ", module_name) + # for node in self.name_to_gnode[module_name].nodes: + # print(node) print("infer_module_mask: {}, module type: {}".format(module_name, m_type)) if mask is not None: print("mask is not None") input_cmask, output_cmask = infer_from_mask[m_type](module_masks, mask) if in_shape is not None: print("in_shape is not None") - output_cmask = infer_from_inshape[m_type](module_masks, in_shape) + if m_type == 'aten::view': + output_cmask = infer_from_inshape[m_type](module_masks, + in_shape, + self.name_to_gnode[module_name].auxiliary) + else: + output_cmask = infer_from_inshape[m_type](module_masks, in_shape) if out_shape is not None: print("out_shape is not None") input_cmask = infer_from_outshape[m_type](module_masks, out_shape) diff --git a/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py b/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py index fb01e62578..ee53ca57e4 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py @@ -69,7 +69,7 @@ def set_output_mask(self, mask): 'aten::avg_pool2d': lambda module_masks, mask: maxpool2d_inshape(module_masks, mask), 'AvgPool2d': lambda module_masks, mask: maxpool2d_inshape(module_masks, mask), 'aten::size': lambda module_masks, mask: size_inshape(module_masks, mask), - 'aten::view': lambda module_masks, mask: view_inshape(module_masks, mask), + 'aten::view': lambda module_masks, mask, shape: view_inshape(module_masks, mask, shape), 'Linear': lambda module_masks, mask: linear_inshape(module_masks, mask) } @@ -86,10 +86,17 @@ def linear_inshape(module_masks, mask): module_masks.set_input_mask(mask) return None -def view_inshape(module_masks, mask): +def view_inshape(module_masks, mask, shape): """ + TODO: consider replace tensor.view with 
nn.Flatten, because tensor.view is not + included in module, thus, cannot be replaced by our framework. """ - # TODO: currently hard code view(N, -1) + # NOTE: the case constrained by the following four asserts + assert shape['in_shape'][0] == shape['out_shape'][0] + assert len(shape['in_shape']) == 4 + assert len(shape['out_shape']) == 2 + assert shape['out_shape'][1] == shape['in_shape'][1]*shape['in_shape'][2]*shape['in_shape'][3] + assert isinstance(mask, CoarseMask) assert mask.mask_index[1] is not None assert mask.mask_index[0] is None @@ -98,11 +105,11 @@ def view_inshape(module_masks, mask): assert module_masks.input_mask is None module_masks.set_input_mask(mask) output_cmask = CoarseMask(num_dim=2) - # TODO: hard code for this case, %x : Float(64, 512, 1, 1) index = [] + step_size = shape['in_shape'][2] * shape['in_shape'][3] for loc in mask.mask_index[1]: - index.append(loc * 1) - output_cmask.mask_index[1] = torch.tensor(index) + index.extend([loc * step_size + i for i in range(step_size)]) + output_cmask.add_index_mask(dim=1, index=torch.tensor(index)) module_masks.set_output_mask(output_cmask) return output_cmask diff --git a/src/sdk/pynni/nni/compression/torch/apply_compression.py b/src/sdk/pynni/nni/compression/torch/apply_compression.py index 416f95d33e..e94e20c4f4 100644 --- a/src/sdk/pynni/nni/compression/torch/apply_compression.py +++ b/src/sdk/pynni/nni/compression/torch/apply_compression.py @@ -40,7 +40,7 @@ def _build_config(self): op_names = [] for module_name in self.masks: op_names.append(module_name) - return [{'sparsity': 1, 'op_types': ['BatchNorm2d'], 'op_names': op_names}] + return [{'sparsity': 1, 'op_types': ['default', 'BatchNorm2d'], 'op_names': op_names}] def calc_mask(self, layer, config): """ diff --git a/src/sdk/pynni/nni/compression/torch/compressor.py b/src/sdk/pynni/nni/compression/torch/compressor.py index 7b29e45027..2d3354e3e3 100644 --- a/src/sdk/pynni/nni/compression/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/torch/compressor.py @@ -196,6 +196,7 @@ def _instrument_layer(self, layer, config): def new_forward(*inputs): mask = self.calc_mask(layer, config) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + #device = torch.device("cpu") mask['weight'].to(device) # apply mask to weight old_weight = layer.module.weight.data From e1240fe0d05813126621cdf78dbe22e7455633df Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Fri, 31 Jan 2020 13:00:16 +0000 Subject: [PATCH 16/33] add doc for speedup --- examples/model_compress/model_speedup.py | 83 +++++++++++++++++-- examples/model_compress/speedup.md | 31 +++++++ .../compression/torch/apply_compression.py | 2 +- 3 files changed, 107 insertions(+), 9 deletions(-) create mode 100644 examples/model_compress/speedup.md diff --git a/examples/model_compress/model_speedup.py b/examples/model_compress/model_speedup.py index 889b1550fc..ed220d9ca2 100644 --- a/examples/model_compress/model_speedup.py +++ b/examples/model_compress/model_speedup.py @@ -1,4 +1,5 @@ import argparse +import time import torch import torch.nn as nn import torch.nn.functional as F @@ -10,6 +11,52 @@ torch.manual_seed(0) use_mask = True +def l1filter_speedup(masks_file, model_checkpoint): + device = torch.device('cuda') + train_loader = torch.utils.data.DataLoader( + datasets.CIFAR10('./data.cifar10', train=True, download=True, + transform=transforms.Compose([ + transforms.Pad(4), + transforms.RandomCrop(32), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.4914, 
0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) + ])), + batch_size=64, shuffle=True) + test_loader = torch.utils.data.DataLoader( + datasets.CIFAR10('./data.cifar10', train=False, transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) + ])), + batch_size=200, shuffle=False) + + model = VGG(depth=16) + model.to(device) + model.eval() + + dummy_input = torch.randn(64, 3, 32, 32) + if use_mask: + apply_compression_results(model, masks_file) + dummy_input = dummy_input.to(device) + start = time.time() + for _ in range(100): + out = model(dummy_input) + #print(out.size(), out) + print('mask elapsed time: ', time.time() - start) + return + else: + #print("model before: ", model) + m_speedup = ModelSpeedup(model, dummy_input.to(device), masks_file) + m_speedup.speedup_model() + #print("model after: ", model) + dummy_input = dummy_input.to(device) + start = time.time() + for _ in range(100): + out = model(dummy_input) + #print(out.size(), out) + print('speedup elapsed time: ', time.time() - start) + return + def fpgm_speedup(masks_file, model_checkpoint): from fpgm_torch_mnist import Mnist device = torch.device('cpu') @@ -28,14 +75,22 @@ def fpgm_speedup(masks_file, model_checkpoint): dummy_input = torch.randn(64, 1, 28, 28) if use_mask: apply_compression_results(model, masks_file) - out = model(dummy_input.to(device)) - print(out.size(), out) + dummy_input = dummy_input.to(device) + start = time.time() + for _ in range(40): + out = model(dummy_input) + print('mask elapsed time: ', time.time() - start) + #print(out.size(), out) return else: m_speedup = ModelSpeedup(model, dummy_input.to(device), masks_file) m_speedup.speedup_model() - out = model(dummy_input.to(device)) - print(out.size(), out) + dummy_input = dummy_input.to(device) + start = time.time() + for _ in range(40): + out = model(dummy_input) + print('speedup elapsed time: ', time.time() - start) + #print(out.size(), out) return def slim_speedup(masks_file, model_checkpoint): @@ -64,16 +119,24 @@ def slim_speedup(masks_file, model_checkpoint): dummy_input = torch.randn(64, 3, 32, 32) if use_mask: apply_compression_results(model, masks_file) - out = model(dummy_input.to(device)) - print(out.size(), out) + dummy_input = dummy_input.to(device) + start = time.time() + for _ in range(32): + out = model(dummy_input) + #print(out.size(), out) + print('mask elapsed time: ', time.time() - start) return else: #print("model before: ", model) m_speedup = ModelSpeedup(model, dummy_input.to(device), masks_file) m_speedup.speedup_model() #print("model after: ", model) - out = model(dummy_input.to(device)) - print(out.size(), out) + dummy_input = dummy_input.to(device) + start = time.time() + for _ in range(32): + out = model(dummy_input) + #print(out.size(), out) + print('speedup elapsed time: ', time.time() - start) return if __name__ == '__main__': @@ -91,5 +154,9 @@ def slim_speedup(masks_file, model_checkpoint): if args.masks_file is None: args.masks_file = 'mask.pth' fpgm_speedup(args.masks_file, args.model_checkpoint) + elif args.example_name == 'l1filter': + if args.masks_file is None: + args.masks_file = 'mask_vgg16_cifar10.pth' + l1filter_speedup(args.masks_file, args.model_checkpoint) else: raise ValueError('unsupported example_name: {}'.format(args.example_name)) diff --git a/examples/model_compress/speedup.md b/examples/model_compress/speedup.md new file mode 100644 index 0000000000..92147c3e56 --- /dev/null +++ b/examples/model_compress/speedup.md @@ -0,0 +1,31 @@ +# 
Speedup Results + +## slim pruner example + +on one V100 GPU, +input tensor: `torch.randn(64, 3, 32, 32)` + +|Times| Mask Latency| Speedup Latency | +|---|---|---| +| 1 | 0.011968851089477539 | 0.005106925964355469 | +| 2 | 0.020199298858642578 | 0.008769512176513672 | +| 4 | 0.027331113815307617 | 0.014809131622314453 | +| 8 | 0.043100595474243164 | 0.02744126319885254 | +| 16 | 0.07731318473815918 | 0.05007791519165039 | +| 32 | 0.14464616775512695 | 0.10027527809143066 | + +## fpgm pruner example + +on cpu, +input tensor: `torch.randn(64, 1, 28, 28)`, +too large variance + +|Times| Mask Latency| Speedup Latency | +|---|---|---| +| 1 | 0.013831615447998047 | 0.018393278121948242 | +| 2 | 0.011675357818603516 | 0.0035581588745117188 | +| 4 | 0.016363859176635742 | 0.01088404655456543 | +| 40 | 0.14412355422973633 | 0.08268260955810547 | +| 40 | 1.2938556671142578 | 0.1440880298614502 | +| 40 | 0.4103574752807617 | 0.4616250991821289 | +| 400 | 6.290201425552368 | 5.821432113647461 | \ No newline at end of file diff --git a/src/sdk/pynni/nni/compression/torch/apply_compression.py b/src/sdk/pynni/nni/compression/torch/apply_compression.py index e94e20c4f4..13a0366525 100644 --- a/src/sdk/pynni/nni/compression/torch/apply_compression.py +++ b/src/sdk/pynni/nni/compression/torch/apply_compression.py @@ -47,5 +47,5 @@ def calc_mask(self, layer, config): """ assert layer.name in self.masks #print('calc_mask: ', layer.name, self.masks[layer.name]) - print('calc_mask: ', layer.name, layer.type) + #print('calc_mask: ', layer.name, layer.type) return self.masks[layer.name] From 8d333f2a21f13741aec9968b76dcd13f7eef36a0 Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Fri, 31 Jan 2020 13:42:42 +0000 Subject: [PATCH 17/33] pass l1filter --- examples/model_compress/model_speedup.py | 16 ++++++------ .../compression/speedup/torch/infer_shape.py | 25 ++++++++++++++++--- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/examples/model_compress/model_speedup.py b/examples/model_compress/model_speedup.py index ed220d9ca2..2c3d3ee3e8 100644 --- a/examples/model_compress/model_speedup.py +++ b/examples/model_compress/model_speedup.py @@ -9,7 +9,7 @@ from nni.compression.torch import apply_compression_results torch.manual_seed(0) -use_mask = True +use_mask = False def l1filter_speedup(masks_file, model_checkpoint): device = torch.device('cuda') @@ -39,10 +39,10 @@ def l1filter_speedup(masks_file, model_checkpoint): apply_compression_results(model, masks_file) dummy_input = dummy_input.to(device) start = time.time() - for _ in range(100): + for _ in range(1): out = model(dummy_input) - #print(out.size(), out) - print('mask elapsed time: ', time.time() - start) + print(out.size(), out) + #print('mask elapsed time: ', time.time() - start) return else: #print("model before: ", model) @@ -51,10 +51,10 @@ def l1filter_speedup(masks_file, model_checkpoint): #print("model after: ", model) dummy_input = dummy_input.to(device) start = time.time() - for _ in range(100): + for _ in range(1): out = model(dummy_input) - #print(out.size(), out) - print('speedup elapsed time: ', time.time() - start) + print(out.size(), out) + #print('speedup elapsed time: ', time.time() - start) return def fpgm_speedup(masks_file, model_checkpoint): @@ -141,7 +141,7 @@ def slim_speedup(masks_file, model_checkpoint): if __name__ == '__main__': parser = argparse.ArgumentParser("speedup") - parser.add_argument("--example_name", type=str, default="fpgm", help="the name of pruning example") + parser.add_argument("--example_name", 
type=str, default="l1filter", help="the name of pruning example") parser.add_argument("--masks_file", type=str, default=None, help="the path of the masks file") parser.add_argument("--model_checkpoint", type=str, default=None, help="the path of checkpointed model") args = parser.parse_args() diff --git a/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py b/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py index ee53ca57e4..5fca6ba5fb 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py @@ -70,13 +70,30 @@ def set_output_mask(self, mask): 'AvgPool2d': lambda module_masks, mask: maxpool2d_inshape(module_masks, mask), 'aten::size': lambda module_masks, mask: size_inshape(module_masks, mask), 'aten::view': lambda module_masks, mask, shape: view_inshape(module_masks, mask, shape), - 'Linear': lambda module_masks, mask: linear_inshape(module_masks, mask) + 'Linear': lambda module_masks, mask: linear_inshape(module_masks, mask), + 'BatchNorm2d': lambda module_masks, mask: batchnorm2d_inshape(module_masks, mask) } infer_from_outshape = { 'Conv2d': lambda module_masks, mask: conv2d_outshape(module_masks, mask) } +def batchnorm2d_inshape(module_masks, mask): + """ + """ + assert isinstance(mask, CoarseMask) + assert mask.mask_index[1] is not None + assert mask.mask_index[0] is None + assert mask.mask_index[2] is None + assert mask.mask_index[3] is None + module_masks.set_input_mask(mask) + module_masks.set_output_mask(mask) + weight_cmask = CoarseMask(num_dim=1) + weight_cmask.add_index_mask(dim=0, index=mask.mask_index[1]) + module_masks.set_param_masks('weight', weight_cmask) + module_masks.set_param_masks('bias', weight_cmask) + return mask + def linear_inshape(module_masks, mask): """ """ @@ -173,8 +190,8 @@ def convert_to_coarse_mask(mask): cmask = None weight_mask = mask['weight'] shape = weight_mask.size() - ones = torch.ones(shape[1:]) - zeros = torch.zeros(shape[1:]) + ones = torch.ones(shape[1:]).to(weight_mask.device) + zeros = torch.zeros(shape[1:]).to(weight_mask.device) index = [] for i in range(shape[0]): if torch.all(torch.eq(weight_mask[i], ones)): @@ -187,7 +204,7 @@ def convert_to_coarse_mask(mask): if index is None: return None, None, None else: - index = torch.LongTensor(index) + index = torch.LongTensor(index).to(weight_mask.device) weight_cmask = CoarseMask(num_dim=4) weight_cmask.add_index_mask(dim=0, index=index) bias_cmask = None From b1b2b14cb7138980c078fbd0f956a1a959805830 Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Fri, 31 Jan 2020 13:58:10 +0000 Subject: [PATCH 18/33] update --- examples/model_compress/model_speedup.py | 12 ++++++------ examples/model_compress/speedup.md | 16 +++++++++++++++- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/examples/model_compress/model_speedup.py b/examples/model_compress/model_speedup.py index 2c3d3ee3e8..21b6e2dec0 100644 --- a/examples/model_compress/model_speedup.py +++ b/examples/model_compress/model_speedup.py @@ -39,10 +39,10 @@ def l1filter_speedup(masks_file, model_checkpoint): apply_compression_results(model, masks_file) dummy_input = dummy_input.to(device) start = time.time() - for _ in range(1): + for _ in range(32): out = model(dummy_input) - print(out.size(), out) - #print('mask elapsed time: ', time.time() - start) + #print(out.size(), out) + print('mask elapsed time: ', time.time() - start) return else: #print("model before: ", model) @@ -51,10 +51,10 @@ def l1filter_speedup(masks_file, 
model_checkpoint): #print("model after: ", model) dummy_input = dummy_input.to(device) start = time.time() - for _ in range(1): + for _ in range(32): out = model(dummy_input) - print(out.size(), out) - #print('speedup elapsed time: ', time.time() - start) + #print(out.size(), out) + print('speedup elapsed time: ', time.time() - start) return def fpgm_speedup(masks_file, model_checkpoint): diff --git a/examples/model_compress/speedup.md b/examples/model_compress/speedup.md index 92147c3e56..7c3aadbe2a 100644 --- a/examples/model_compress/speedup.md +++ b/examples/model_compress/speedup.md @@ -28,4 +28,18 @@ too large variance | 40 | 0.14412355422973633 | 0.08268260955810547 | | 40 | 1.2938556671142578 | 0.1440880298614502 | | 40 | 0.4103574752807617 | 0.4616250991821289 | -| 400 | 6.290201425552368 | 5.821432113647461 | \ No newline at end of file +| 400 | 6.290201425552368 | 5.821432113647461 | + +## l1filter pruner example + +on one V100 GPU, +input tensor: `torch.randn(64, 3, 32, 32)` + +|Times| Mask Latency| Speedup Latency | +|---|---|---| +| 1 | 0.010260343551635742 | 0.0036773681640625 | +| 2 | 0.016577482223510742 | 0.008161306381225586 | +| 4 | 0.0245821475982666 | 0.02001810073852539 | +| 8 | 0.034986257553100586 | 0.025504589080810547 | +| 16 | 0.06757736206054688 | 0.04752326011657715 | +| 32 | 0.10487151145935059 | 0.08644247055053711 | \ No newline at end of file From e988f19cbffb3cbc831b05db08167e599a6143db Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Sat, 1 Feb 2020 04:22:30 +0000 Subject: [PATCH 19/33] update --- examples/model_compress/model_speedup.py | 54 ++++++++++++++++++- examples/model_compress/speedup.md | 16 +++++- .../compression/speedup/torch/infer_shape.py | 2 +- 3 files changed, 68 insertions(+), 4 deletions(-) diff --git a/examples/model_compress/model_speedup.py b/examples/model_compress/model_speedup.py index 21b6e2dec0..477089f129 100644 --- a/examples/model_compress/model_speedup.py +++ b/examples/model_compress/model_speedup.py @@ -9,7 +9,53 @@ from nni.compression.torch import apply_compression_results torch.manual_seed(0) -use_mask = False +use_mask = True + +def apoz_speedup(masks_file, model_checkpoint): + device = torch.device('cuda') + train_loader = torch.utils.data.DataLoader( + datasets.CIFAR10('./data.cifar10', train=True, download=True, + transform=transforms.Compose([ + transforms.Pad(4), + transforms.RandomCrop(32), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) + ])), + batch_size=64, shuffle=True) + test_loader = torch.utils.data.DataLoader( + datasets.CIFAR10('./data.cifar10', train=False, transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) + ])), + batch_size=200, shuffle=False) + + model = VGG(depth=16) + model.to(device) + model.eval() + + dummy_input = torch.randn(64, 3, 32, 32) + if use_mask: + apply_compression_results(model, masks_file) + dummy_input = dummy_input.to(device) + start = time.time() + for _ in range(32): + out = model(dummy_input) + #print(out.size(), out) + print('mask elapsed time: ', time.time() - start) + return + else: + #print("model before: ", model) + m_speedup = ModelSpeedup(model, dummy_input.to(device), masks_file) + m_speedup.speedup_model() + #print("model after: ", model) + dummy_input = dummy_input.to(device) + start = time.time() + for _ in range(32): + out = model(dummy_input) + #print(out.size(), out) + print('speedup elapsed time: 
', time.time() - start) + return def l1filter_speedup(masks_file, model_checkpoint): device = torch.device('cuda') @@ -141,7 +187,7 @@ def slim_speedup(masks_file, model_checkpoint): if __name__ == '__main__': parser = argparse.ArgumentParser("speedup") - parser.add_argument("--example_name", type=str, default="l1filter", help="the name of pruning example") + parser.add_argument("--example_name", type=str, default="apoz", help="the name of pruning example") parser.add_argument("--masks_file", type=str, default=None, help="the path of the masks file") parser.add_argument("--model_checkpoint", type=str, default=None, help="the path of checkpointed model") args = parser.parse_args() @@ -158,5 +204,9 @@ def slim_speedup(masks_file, model_checkpoint): if args.masks_file is None: args.masks_file = 'mask_vgg16_cifar10.pth' l1filter_speedup(args.masks_file, args.model_checkpoint) + elif args.example_name == 'apoz': + if args.masks_file is None: + args.masks_file = 'mask_vgg16_cifar10.pth' + apoz_speedup(args.masks_file, args.model_checkpoint) else: raise ValueError('unsupported example_name: {}'.format(args.example_name)) diff --git a/examples/model_compress/speedup.md b/examples/model_compress/speedup.md index 7c3aadbe2a..b629d27820 100644 --- a/examples/model_compress/speedup.md +++ b/examples/model_compress/speedup.md @@ -42,4 +42,18 @@ input tensor: `torch.randn(64, 3, 32, 32)` | 4 | 0.0245821475982666 | 0.02001810073852539 | | 8 | 0.034986257553100586 | 0.025504589080810547 | | 16 | 0.06757736206054688 | 0.04752326011657715 | -| 32 | 0.10487151145935059 | 0.08644247055053711 | \ No newline at end of file +| 32 | 0.10487151145935059 | 0.08644247055053711 | + +## APoZ pruner example + +on one V100 GPU, +input tensor: `torch.randn(64, 3, 32, 32)` + +|Times| Mask Latency| Speedup Latency | +|---|---|---| +| 1 | 0.013897180557250977 | 0.004208564758300781 | +| 2 | 0.016284465789794922 | 0.008310556411743164 | +| 4 | 0.02521061897277832 | 0.01400899887084961 | +| 8 | 0.03386855125427246 | 0.023923158645629883 | +| 16 | 0.060423851013183594 | 0.046183109283447266 | +| 32 | 0.12421965599060059 | 0.0871133804321289 | \ No newline at end of file diff --git a/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py b/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py index 5fca6ba5fb..8e08f2fca8 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py @@ -208,7 +208,7 @@ def convert_to_coarse_mask(mask): weight_cmask = CoarseMask(num_dim=4) weight_cmask.add_index_mask(dim=0, index=index) bias_cmask = None - if 'bias' in mask: + if 'bias' in mask and mask['bias'] is not None: bias_index = torch.nonzero(mask['bias'], as_tuple=True)[0] assert torch.all(torch.eq(index, bias_index)) bias_cmask = CoarseMask(num_dim=1) From fbb6d485fe016e9a891bb26a22206f78679a7205 Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Wed, 5 Feb 2020 12:17:36 +0000 Subject: [PATCH 20/33] remove test files --- examples/model_compress/test.py | 26 ------- examples/model_compress/test2.py | 126 ------------------------------- examples/model_compress/test3.py | 40 ---------- 3 files changed, 192 deletions(-) delete mode 100644 examples/model_compress/test.py delete mode 100644 examples/model_compress/test2.py delete mode 100644 examples/model_compress/test3.py diff --git a/examples/model_compress/test.py b/examples/model_compress/test.py deleted file mode 100644 index 09cd1bd5fd..0000000000 --- a/examples/model_compress/test.py +++ /dev/null @@ 
-1,26 +0,0 @@ -import torch -import torch.nn as nn - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv = nn.Conv2d(1, 1, 3) - - def forward(self, x): - return self.conv(x) - -if __name__ == '__main__': - n = Net() - example_weight = torch.rand(1, 1, 3, 3) - example_forward_input = torch.rand(1, 1, 3, 3) - - # Trace a specific method and construct `ScriptModule` with - # a single `forward` method - module = torch.jit.trace(n.forward, example_forward_input) - - # Trace a module (implicitly traces `forward`) and construct a - # `ScriptModule` with a single `forward` method - module = torch.jit.trace(n, example_forward_input) - print(module.graph) - print(torch._C._jit_pass_inline(module.graph)) - print(module.graph) diff --git a/examples/model_compress/test2.py b/examples/model_compress/test2.py deleted file mode 100644 index 93d0582208..0000000000 --- a/examples/model_compress/test2.py +++ /dev/null @@ -1,126 +0,0 @@ -import torch -import torch.nn as nn - -torch.manual_seed(0) - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv = nn.Conv2d(4, 8, 3) - self.bn = nn.BatchNorm2d(8) - self.relu = nn.ReLU() - self.conv2 = nn.Conv2d(8, 4, 5) - - def forward(self, x): - out = self.conv(x) - out = self.bn(out) - #print(out) - out = self.relu(out) - #print(out) - out = self.conv2(out) - #print(out) - return out - -def add_masks(model): - bn = getattr(model, 'bn') - print(bn) - #print('before: ', bn.weight.data) - bn.weight.data = bn.weight.data * torch.tensor([0,1,0,1,0,1,0,1]) - #print('after', bn.weight.data) - bn.bias.data = bn.bias.data * torch.tensor([0,1,0,1,0,1,0,1]) - -def model_speedup(model): - index = torch.tensor([1,3,5,7]) - #----- - conv = getattr(model, 'conv') - new_conv = torch.nn.Conv2d(in_channels=conv.in_channels, - out_channels=4, - kernel_size=conv.kernel_size, - stride=conv.stride, - padding=conv.padding, - dilation=conv.dilation, - groups=1, # currently only support groups is 1 - bias=conv.bias is not None, - padding_mode=conv.padding_mode) - tmp_weight_data = tmp_bias_data = None - tmp_weight_data = torch.index_select(conv.weight.data, 0, index) - if conv.bias is not None: - tmp_bias_data = torch.index_select(conv.bias.data, 0, index) - new_conv.weight.data.copy_(tmp_weight_data) - if conv.bias is not None: - new_conv.bias.data.copy_(conv.bias.data if tmp_bias_data is None else tmp_bias_data) - setattr(model, 'conv', new_conv) - #------- - norm = getattr(model, 'bn') - new_norm = torch.nn.BatchNorm2d(num_features=4, - eps=norm.eps, - momentum=norm.momentum, - affine=norm.affine, - track_running_stats=norm.track_running_stats) - # assign weights - new_norm.weight.data = torch.index_select(norm.weight.data, 0, index) - new_norm.bias.data = torch.index_select(norm.bias.data, 0, index) - if norm.track_running_stats: - print('*'*30, norm.track_running_stats) - new_norm.running_mean.data = torch.index_select(norm.running_mean.data, 0, index) - new_norm.running_var.data = torch.index_select(norm.running_var.data, 0, index) - setattr(model, 'bn', new_norm) - #--------- - conv2 = getattr(model, 'conv2') - new_conv2 = torch.nn.Conv2d(in_channels=4, - out_channels=conv2.out_channels, - kernel_size=conv2.kernel_size, - stride=conv2.stride, - padding=conv2.padding, - dilation=conv2.dilation, - groups=1, # currently only support groups is 1 - bias=conv2.bias is not None, - padding_mode=conv2.padding_mode) - tmp_weight_data = tmp_bias_data = None - #print('before select: ', conv2.weight.data.size(), conv2.weight.data) 
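# A minimal standalone sketch (sizes invented for illustration) of the
# input-channel selection idea the surrounding code performs: keep a subset
# of a Conv2d's *input* channels by slicing dim 1 of its weight with
# torch.index_select, then build a smaller Conv2d around the result.
import torch

conv = torch.nn.Conv2d(in_channels=8, out_channels=4, kernel_size=3)
keep = torch.tensor([1, 3, 5, 7])  # indices of the input channels to keep
new_conv = torch.nn.Conv2d(in_channels=len(keep), out_channels=4, kernel_size=3,
                           bias=conv.bias is not None)
# weight layout is (out_channels, in_channels, kH, kW), so select along dim 1
new_conv.weight.data.copy_(torch.index_select(conv.weight.data, 1, keep))
if conv.bias is not None:
    new_conv.bias.data.copy_(conv.bias.data)  # bias depends only on out_channels
x = torch.randn(2, 8, 16, 16)
out = new_conv(x[:, keep])  # feed only the kept channels to the smaller conv
print(out.size())           # torch.Size([2, 4, 14, 14])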
- tmp_weight_data = torch.index_select(conv2.weight.data, 1, index) - #print('after select: ', tmp_weight_data.size(), tmp_weight_data) - new_conv2.weight.data.copy_(tmp_weight_data) - if conv2.bias is not None: - new_conv2.bias.data.copy_(conv2.bias.data) - setattr(model, 'conv2', new_conv2) - #---------- - -if __name__ == '__main__': - n = Net() - n.eval() - bn = getattr(n, 'bn') - print('bn track_running_stats: ', bn.track_running_stats) - print('bn running mean: ', bn.running_mean) - print('bn running var: ', bn.running_var) - print('bn momentum: ', bn.momentum) - dummy_input = torch.randn(6, 4, 16, 16) - mask_flag = True - if mask_flag: - add_masks(n) - #print('abc') - else: - conv = getattr(n, 'conv') - #print('conv before: ', conv.weight.data.size(), conv.weight.data) - model_speedup(n) - n.eval() - bn = getattr(n, 'bn') - #print('bn: ', bn.weight.data) - conv = getattr(n, 'conv') - #print('conv after: ', conv.weight.data.size(), conv.weight.data) - out = n(dummy_input) - print(out.size(), out) - - '''example_weight = torch.rand(1, 1, 3, 3) - example_forward_input = torch.rand(1, 1, 3, 3) - - # Trace a specific method and construct `ScriptModule` with - # a single `forward` method - module = torch.jit.trace(n.forward, example_forward_input) - - # Trace a module (implicitly traces `forward`) and construct a - # `ScriptModule` with a single `forward` method - module = torch.jit.trace(n, example_forward_input) - print(module.graph) - print(torch._C._jit_pass_inline(module.graph)) - print(module.graph)''' diff --git a/examples/model_compress/test3.py b/examples/model_compress/test3.py deleted file mode 100644 index 1fe508e5a5..0000000000 --- a/examples/model_compress/test3.py +++ /dev/null @@ -1,40 +0,0 @@ -import torch -import torch.nn as nn - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv = nn.Conv2d(2, 1, 3) - - def forward(self, x): - return self.conv(x) - -if __name__ == '__main__': - n = Net() - conv = getattr(n, 'conv') - conv.weight.data.fill_(0.5) - conv.bias.data.fill_(0.1) - #--------------- - index = torch.tensor([1]) - conv2 = getattr(n, 'conv') - new_conv2 = torch.nn.Conv2d(in_channels=1, - out_channels=conv2.out_channels, - kernel_size=conv2.kernel_size, - stride=conv2.stride, - padding=conv2.padding, - dilation=conv2.dilation, - groups=1, # currently only support groups is 1 - bias=conv2.bias is not None, - padding_mode=conv2.padding_mode) - tmp_weight_data = tmp_bias_data = None - print('before select: ', conv2.weight.data.size(), conv2.weight.data) - tmp_weight_data = torch.index_select(conv2.weight.data, 1, index) - print('after select: ', tmp_weight_data.size(), tmp_weight_data) - new_conv2.weight.data.copy_(tmp_weight_data) - if conv2.bias is not None: - new_conv2.bias.data.copy_(conv2.bias.data) - setattr(n, 'conv', new_conv2) - #---------------- - dummy_input = torch.zeros([1, 1, 3, 3]) - out = n(dummy_input) - print(out) From 1ce3c727a0237dfa24523680df5224cffbd573cd Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Wed, 5 Feb 2020 12:19:54 +0000 Subject: [PATCH 21/33] update --- examples/model_compress/models/cifar10/vgg.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/model_compress/models/cifar10/vgg.py b/examples/model_compress/models/cifar10/vgg.py index 44222d1fdb..f293770c72 100644 --- a/examples/model_compress/models/cifar10/vgg.py +++ b/examples/model_compress/models/cifar10/vgg.py @@ -43,7 +43,6 @@ def make_layers(self, cfg, batch_norm=False): def forward(self, x): x = self.feature(x) - #print('x'*10, x) 
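# Why view_inshape expands a channel mask with step_size = H * W: after the
# x.view(x.size(0), -1) flatten just below, each kept channel c of an
# (N, C, H, W) tensor owns the flat positions [c*H*W, (c+1)*H*W). A small
# sketch with invented sizes:
import torch

N, C, H, W = 2, 4, 3, 3
kept_channels = [1, 3]
step_size = H * W
flat_index = torch.tensor([c * step_size + i
                           for c in kept_channels for i in range(step_size)])
x = torch.zeros(N, C, H, W)
x[:, kept_channels] = 1.0      # pretend only these channels survive the mask
flat = x.view(N, -1)
assert bool((flat[:, flat_index] == 1).all())          # the surviving positions
assert int(flat.sum()) == N * len(kept_channels) * step_size  # and nothing else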
x = nn.AvgPool2d(2)(x) x = x.view(x.size(0), -1) y = self.classifier(x) From 4db78f7d76a33b6aba8e46fbdf74dce387e119e0 Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Wed, 5 Feb 2020 12:21:30 +0000 Subject: [PATCH 22/33] update --- examples/model_compress/slim_torch_cifar10.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/examples/model_compress/slim_torch_cifar10.py b/examples/model_compress/slim_torch_cifar10.py index ce9ce4b81f..5ef11454fa 100644 --- a/examples/model_compress/slim_torch_cifar10.py +++ b/examples/model_compress/slim_torch_cifar10.py @@ -6,8 +6,6 @@ from torchvision import datasets, transforms from nni.compression.torch import SlimPruner from models.cifar10.vgg import VGG -from nni.compression.speedup.torch import ModelSpeedup -from nni.compression.torch import apply_compression_results def updateBN(model): for m in model.modules(): @@ -19,7 +17,6 @@ def train(model, device, train_loader, optimizer, sparse_bn=False): model.train() for batch_idx, (data, target) in enumerate(train_loader): data, target = data.to(device), target.to(device) - #print('data: ', data.size()) optimizer.zero_grad() output = model(data) loss = F.cross_entropy(output, target) @@ -96,10 +93,9 @@ def main(): # Test base model accuracy print('=' * 10 + 'Test the original model' + '=' * 10) - #model.load_state_dict(torch.load('vgg19_cifar10.pth')) - #test(model, device, test_loader) + model.load_state_dict(torch.load('vgg19_cifar10.pth')) + test(model, device, test_loader) # top1 = 93.60% - model.eval() # Pruning Configuration, in paper 'Learning efficient convolutional networks through network slimming', configure_list = [{ From 3d517272fdac130d7c71621a40d35a1c98ae7801 Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Wed, 5 Feb 2020 12:41:46 +0000 Subject: [PATCH 23/33] update --- examples/model_compress/model_speedup.py | 4 ++-- examples/model_compress/speedup.md | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/model_compress/model_speedup.py b/examples/model_compress/model_speedup.py index 477089f129..4a2632389b 100644 --- a/examples/model_compress/model_speedup.py +++ b/examples/model_compress/model_speedup.py @@ -9,7 +9,7 @@ from nni.compression.torch import apply_compression_results torch.manual_seed(0) -use_mask = True +use_mask = False def apoz_speedup(masks_file, model_checkpoint): device = torch.device('cuda') @@ -187,7 +187,7 @@ def slim_speedup(masks_file, model_checkpoint): if __name__ == '__main__': parser = argparse.ArgumentParser("speedup") - parser.add_argument("--example_name", type=str, default="apoz", help="the name of pruning example") + parser.add_argument("--example_name", type=str, default="slim", help="the name of pruning example") parser.add_argument("--masks_file", type=str, default=None, help="the path of the masks file") parser.add_argument("--model_checkpoint", type=str, default=None, help="the path of checkpointed model") args = parser.parse_args() diff --git a/examples/model_compress/speedup.md b/examples/model_compress/speedup.md index b629d27820..af85ea2937 100644 --- a/examples/model_compress/speedup.md +++ b/examples/model_compress/speedup.md @@ -1,5 +1,7 @@ # Speedup Results +This code only works on torch 1.3.1 and torchvision 0.4.2 + ## slim pruner example on one V100 GPU, From 70d3b1ef7467571e785e60ac0017534d2031eca9 Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Wed, 5 Feb 2020 14:33:40 +0000 Subject: [PATCH 24/33] add comments --- .../compression/speedup/torch/compressor.py | 242 +++++++++++------- 1 file 
changed, 155 insertions(+), 87 deletions(-) diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py index fe2c7cf64e..45a4702e09 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py @@ -12,6 +12,21 @@ def get_module_by_name(model, module_name): + """ + Get a module specified by its module name + + Parameters + ---------- + model : pytorch model + the pytorch model from which to get the module + module_name : str + the name of the required module + + Returns + ------- + module, module + the parent module of the required module, and the required module itself + """ name_list = module_name.split(".") for name in name_list[:-1]: model = getattr(model, name) @@ -19,57 +34,87 @@ def get_module_by_name(model, module_name): return model, leaf_module class GNode: + """ + Represents a node in the model graph: a module is a node, and a function + called outside of any module (i.e., in the ```forward``` function) can also be a node. + """ def __init__(self, node_name, node_type, op_type, inputs, outputs, nodes): - self.name = node_name # module name if is module, scope name + seq if is func - self.type = node_type # module or func + """ + Parameters + ---------- + node_name : str + The module name if the node is a module, or ```scope_name.node_kind.seq``` if it is a func + node_type : str + It only has two options: `module` or `func` + op_type : str + The operation type of the module or func + inputs : list of str + All the inputs of this node, each element is the debugName of one input + outputs : list of str + All the outputs of this node, each element is the debugName of one output + nodes : list of node + All the trace graph nodes included in this module or func + """ + self.name = node_name + self.type = node_type self.op_type = op_type self.inputs = inputs self.outputs = outputs self.nodes = nodes - self.auxiliary = None # store supplementary information for different op types + # store supplementary information for different op types + # for example, for ```view``` it stores the shape of its input and output + self.auxiliary = None class ModelSpeedup: """ - Abstract base PyTorch ModelSpeedup + This class speeds up a model based on the provided weight masks """ def __init__(self, model, dummy_input, masks_file): """ - Record necessary info in class members - Parameters ---------- model : pytorch model - the model user wants to compress - masks : dict - the generated masks for modules, - key is module name, - value is a dict including key `weight`, or also key `bias` - onnx_graph : xxx - it is used to parse dependencies between modules + The model user wants to speed up + dummy_input : pytorch tensor + The dummy input for ```jit.trace```, users should put it on the right device before passing it in + masks_file : str + The path of the user-provided masks file """ self.bound_model = model self.dummy_input = dummy_input self.masks = torch.load(masks_file) - #ori_masks = torch.load(masks_file) - #self.masks = {'feature.1': ori_masks['feature.1']} self.is_training = model.training + # to obtain the forward graph, the model should be in ```eval``` mode if self.is_training: model.eval() self.trace_graph = torch.jit.trace(model, dummy_input) if self.is_training: model.train() - #print("masks: ", self.masks) - #print(self.trace_graph) - #print(self.trace_graph.graph) self.inferred_masks = dict() # key: module_name, value: ModuleMasks self.g_nodes = list() self.global_count = 0
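# A rough sketch of where the trace graph above comes from (written against
# torch 1.3.x, which these examples pin; node and scope naming differ in later
# torch versions): trace the model in eval mode and walk graph.nodes(), reading
# the two fields _build_graph relies on, kind() and scopeName().
import torch

net = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.ReLU())
net.eval()
traced = torch.jit.trace(net, torch.randn(1, 3, 8, 8))
for node in traced.graph.nodes():
    # e.g. kind() == 'aten::_convolution' with a scopeName() ending in 'Conv2d[0]'
    print(node.kind(), '|', node.scopeName())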
self.name_to_gnode, self.input_to_gnode, self.output_to_gnode = self._build_graph() - #self.replaced_modules = dict() def _build_index_for_gnodes(self, g_nodes): """ + Build indexes for quick lookup + + Parameters + ---------- + g_nodes : list of GNode + All the GNodes in the processed model graph + + Returns + ------- + dict + indexes g_nodes by name, key: node name, value: g_node + dict + indexes g_nodes by input (its name), + key: input, value: list of g_nodes that take this input + dict + indexes g_nodes by output (its name), + key: output, value: the g_node that generates this output """ name_to_gnode = dict() input_to_gnode = dict() @@ -82,23 +127,38 @@ def _build_index_for_gnodes(self, g_nodes): else: input_to_gnode[_input] = [node] for output in node.outputs: - if output in output_to_gnode: - print("output: ", output) - print("gnode: ", output_to_gnode[output].name) - assert not output in output_to_gnode, "One output cannot be generated by multiple nodes" + assert not output in output_to_gnode, \ + "One output cannot be generated by multiple nodes" output_to_gnode[output] = node return name_to_gnode, input_to_gnode, output_to_gnode def _expand_non_prim_node(self, node, nodes, input_to_node, output_to_node): """ + Some trace graph nodes do not belong to any module; they are usually generated by + functions called directly in a module's ```forward```. Among such nodes, the trivial + ops are labeled with ```prim::```, and the remaining ones are called non-prim ops. + This function merges the neighboring prim ops into a non-prim op to construct + a GNode. + + Parameters + ---------- + node : trace graph node + The non-prim node to expand + nodes : list of trace graph node + All the trace graph nodes within the same scope as the non-prim node + input_to_node : dict + key: input name, value: a node that uses this input + output_to_node : dict + key: output name, value: the node that generates this output + + Returns + ------- + GNode + the expanded non-prim node in GNode format """ - #print('^=' * 30) - #for n in nodes: - # print(n) - #print('v=' * 30) # TODO: scope name could be empty node_name = '.'.join([node.scopeName(), node.kind(), str(self.global_count)]) - print('node_name: ', node_name) + #print('node_name: ', node_name) self.global_count += 1 op_type = node.kind() @@ -110,11 +170,10 @@ def _expand_non_prim_node(self, node, nodes, input_to_node, output_to_node): while not node_queue.empty(): curr_node = node_queue.get() for _input in curr_node.inputs(): - print('_input: ', _input) input_name = _input.debugName() if input_name in output_to_node and output_to_node[input_name] in nodes: predecessor_node = output_to_node[input_name] - print("predecessor_node: ", predecessor_node) + #print("predecessor_node: ", predecessor_node) if predecessor_node.kind().startswith('prim::'): node_group.append(predecessor_node) node_queue.put(predecessor_node) @@ -125,14 +184,21 @@ def _expand_non_prim_node(self, node, nodes, input_to_node, output_to_node): for output in node.outputs(): outputs.append(output.debugName()) g_node = GNode(node_name, 'func', op_type, inputs, outputs, node_group) - print('^' * 30) - for n in g_node.nodes: - print(n) - print('v' * 30) return g_node def _extract_shape_info(self, node): """ + Extract the shape information of an ```aten::view``` node + + Parameters + ---------- + node : trace graph node + It should be an ```aten::view``` node + + Returns + ------- + dict + Includes the shape of the input tensor and the shape of the output tensor """ t_input = None for
_input in node.inputs(): @@ -147,10 +213,25 @@ def _extract_shape_info(self, node): def _build_graph(self): """ + Build graph using our defined format from jit trace. + There are basically three steps: first, construct necessary information (data structures), + second, extract all the modules to convert to GNode, Third, extract all functions to convert + to GNode. + + Returns + ------- + dict + use name to index g_nodes, key: node name, value: g_node + dict + use input (its name) to index g_nodes, + key: input, value: list of g_nodes that take this input + dict + use output (its name) to index g_nodes, + key: output, value: g_node that generates this output """ graph = self.trace_graph.graph - #torch._C._jit_pass_inline(graph) - print(graph) + # if torch 1.4.0 is used, consider run torch._C._jit_pass_inline(graph) here + #print(graph) # build output mapping, from output debugName to its node output_to_node = dict() # build input mapping, from input debugName to its node @@ -169,9 +250,6 @@ def _build_graph(self): for output in graph.outputs(): graph_outputs.append(output.debugName()) - #print("graph_inputs: ", graph_inputs) - #print("graph_outputs: ", graph_outputs) - for node in graph.nodes(): # populate output_to_node and input_to_node for output in node.outputs(): @@ -188,7 +266,6 @@ def _build_graph(self): if scope_name == '': continue else: - # TODO: there might be more than one funcs in scope_name if scope_name in func_to_nodes: func_to_nodes[scope_name].append(node) else: @@ -202,15 +279,7 @@ def _build_graph(self): else: module_to_nodes[module_name] = [node] - print('xx' * 30) - for k in output_to_node: - print(k) - print('yy' * 30) - - # for each module, find its inputs and outputs - # build module mapping, from module name to its inputs debugName and outputs debugName, - #module_to_inputs = dict() - #module_to_outputs = dict() + # construct GNode from module for module_name, nodes in module_to_nodes.items(): inputs = set() outputs = set() @@ -232,8 +301,6 @@ def _build_graph(self): m_inputs.append(_input) elif not output_to_node[_input] in nodes: m_inputs.append(_input) - #module_to_inputs[module_name] = m_inputs - #module_to_outputs[module_name] = m_outputs print("module node_name: ", module_name) if module_name == '': for n in nodes: @@ -259,34 +326,21 @@ def _build_graph(self): # build index for g_nodes name_to_gnode, input_to_gnode, output_to_gnode = self._build_index_for_gnodes(self.g_nodes) - return name_to_gnode, input_to_gnode, output_to_gnode #output_to_node, input_to_node + return name_to_gnode, input_to_gnode, output_to_gnode - '''def _do_module_replace(self, module_name, mask=None, in_shape=None, out_shape=None): - """ + def _find_predecessors(self, module_name): """ - assert not module_name in self.replaced_modules - input_cmask = output_cmask = None - assert module_name in self.module_inputs, "module does not exist in trace graph" - if mask is not None: - assert in_shape is None and out_shape is None - super_module, leaf_module = get_module_by_name(self.bound_model, module_name) - m_type = self.module_to_type[module_name] - compressed_module, input_cmask, output_cmask = cms[m_type](leaf_module, mask) - setattr(super_module, module_name, compressed_module) - - if in_shape is not None: - assert not module_name in self.masks - super_module, leaf_module = get_module_by_name(self.bound_model, module_name) - m_type = self.module_to_type[module_name] - compressed_module, input_cmask, output_cmask = cms_input[m_type](leaf_module, in_shape) + Find predecessor GNode of the 
given GNode - if out_shape is not None: - assert not module_name in self.masks - #... - return input_cmask, output_cmask''' + Parameters + ---------- + module_name : str + The name of the GNode - def _find_predecessors(self, module_name): - """ + Returns + ------- + list + a list of GNodes who are the given GNode's predecessor """ predecessors = [] for _input in self.name_to_gnode[module_name].inputs: @@ -302,6 +356,17 @@ def _find_predecessors(self, module_name): def _find_successors(self, module_name): """ + Find successor GNodes of the given GNode + + Parameters + ---------- + module_name : str + The name of the GNode + + Returns + ------- + list + a list of GNodes who are the given GNode's successor """ successors = [] for output in self.name_to_gnode[module_name].outputs: @@ -315,6 +380,16 @@ def _find_successors(self, module_name): def infer_module_mask(self, module_name, mask=None, in_shape=None, out_shape=None): """ + Parameters + ---------- + module_name : str + The name of the GNode + mask : tensor of mask or ModuleMasks + Mask of the weights in this GNode (i.e., module) + in_shape : ModuleMasks + Input shape of this GNode + out_shape : ModuleMasks + Output shape of this GNode """ input_cmask = output_cmask = None if module_name in self.inferred_masks: @@ -324,10 +399,6 @@ def infer_module_mask(self, module_name, mask=None, in_shape=None, out_shape=Non self.inferred_masks[module_name] = module_masks m_type = self.name_to_gnode[module_name].op_type - #if m_type == 'VGG': - # print("VGG module name: ", module_name) - # for node in self.name_to_gnode[module_name].nodes: - # print(node) print("infer_module_mask: {}, module type: {}".format(module_name, m_type)) if mask is not None: print("mask is not None") @@ -359,19 +430,17 @@ def infer_module_mask(self, module_name, mask=None, in_shape=None, out_shape=Non def infer_modules_masks(self): """ + Do mask and shape inference """ for module_name, mask in self.masks.items(): self.infer_module_mask(module_name, mask=mask) def replace_compressed_modules(self): """ + Replace all the modules that are compressed """ print('*' * 30) for module_name in self.inferred_masks: - #module_masks = self.inferred_masks[module_name] - #print(module_masks.param_masks) - #print(module_masks.input_mask) - #print(module_masks.output_mask) g_node = self.name_to_gnode[module_name] print(module_name, g_node.op_type) if g_node.type == 'module': @@ -386,15 +455,14 @@ def replace_compressed_modules(self): def speedup_model(self): """ + There are basically two steps: first, do mask/shape inference, + second, replace modules """ - #self.bound_model(self.dummy_input) print("start to compress") self.infer_modules_masks() self.replace_compressed_modules() print("finished compressing") - #for name, module in self.bound_model.named_modules(): - # print(name, module) - #self.bound_model(self.dummy_input) + # resume the model mode to that before the model is speed up if self.is_training: self.bound_model.train() else: From c80c7a9d1a60161fc62eb6f79720354c435bf030 Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Thu, 6 Feb 2020 02:18:19 +0000 Subject: [PATCH 25/33] add comments --- .../compression/speedup/torch/compressor.py | 39 +++++++++++++------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py index 45a4702e09..d23a84d9b6 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py +++ 
b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py @@ -380,6 +380,16 @@ def infer_module_mask(self, module_name, mask=None, in_shape=None, out_shape=Non """ + Infer the input shape / output shape based on the module's weight mask / input shape / output shape. + + For a module: + Infer its input and output shape from its weight mask + Infer its output shape from its input shape + Infer its input shape from its output shape + + If its input shape is changed, continue inferring its predecessors + If its output shape is changed, continue inferring its successors + Parameters ---------- module_name : str @@ -401,10 +411,10 @@ def infer_module_mask(self, module_name, mask=None, in_shape=None, out_shape=Non m_type = self.name_to_gnode[module_name].op_type print("infer_module_mask: {}, module type: {}".format(module_name, m_type)) if mask is not None: - print("mask is not None") + #print("mask is not None") input_cmask, output_cmask = infer_from_mask[m_type](module_masks, mask) if in_shape is not None: - print("in_shape is not None") + #print("in_shape is not None") if m_type == 'aten::view': output_cmask = infer_from_inshape[m_type](module_masks, in_shape, @@ -412,17 +422,17 @@ def infer_module_mask(self, module_name, mask=None, in_shape=None, out_shape=Non else: output_cmask = infer_from_inshape[m_type](module_masks, in_shape) if out_shape is not None: - print("out_shape is not None") + #print("out_shape is not None") input_cmask = infer_from_outshape[m_type](module_masks, out_shape) if input_cmask: - print("input_cmask is not None") + #print("input_cmask is not None") predecessors = self._find_predecessors(module_name) for _module_name in predecessors: print("input_cmask, module_name: ", _module_name) self.infer_module_mask(_module_name, out_shape=input_cmask) if output_cmask: - print("output_cmask is not None") + #print("output_cmask is not None") successors = self._find_successors(module_name) for _module_name in successors: print("output_cmask, module_name: ", _module_name) @@ -430,16 +440,20 @@ def infer_module_mask(self, module_name, mask=None, in_shape=None, out_shape=Non def infer_modules_masks(self): """ - Do mask and shape inference + Do shape inference for the involved modules, including the shapes of weights, inputs, and outputs """ for module_name, mask in self.masks.items(): self.infer_module_mask(module_name, mask=mask) def replace_compressed_modules(self): """ - Replace all the modules that are compressed + Replace all the modules whose shape (weights/inputs/outputs) has changed. + The new module is created with the same arguments as the to-be-replaced module, + and correctly inherits its weights. + + NOTE: a ```func``` cannot be replaced because it is not a module, thus one limitation + is that a ```func``` must not require replacement.
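# A toy illustration of the propagation rule described above, with invented
# sizes: a coarse mask on conv1's output channels fixes conv1's out_channels,
# the following BatchNorm2d's num_features, and conv2's in_channels, so a
# single weight mask is enough to infer shapes along the whole chain.
import torch

kept = torch.tensor([0, 2, 5])            # conv1 filters that survive the mask
conv1 = torch.nn.Conv2d(3, 8, 3, padding=1)
bn = torch.nn.BatchNorm2d(8)
conv2 = torch.nn.Conv2d(8, 4, 3, padding=1)

small_conv1 = torch.nn.Conv2d(3, len(kept), 3, padding=1)
small_conv1.weight.data.copy_(conv1.weight.data[kept])     # dim 0: out channels
small_bn = torch.nn.BatchNorm2d(len(kept))                 # inferred from conv1
small_conv2 = torch.nn.Conv2d(len(kept), 4, 3, padding=1)
small_conv2.weight.data.copy_(conv2.weight.data[:, kept])  # dim 1: in channels

x = torch.randn(1, 3, 8, 8)
print(small_conv2(small_bn(small_conv1(x))).size())        # torch.Size([1, 4, 8, 8])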
""" - print('*' * 30) for module_name in self.inferred_masks: g_node = self.name_to_gnode[module_name] print(module_name, g_node.op_type) @@ -449,19 +463,20 @@ def replace_compressed_modules(self): compressed_module = replace_module[m_type](leaf_module, self.inferred_masks[module_name]) setattr(super_module, module_name.split('.')[-1], compressed_module) elif g_node.type == 'func': - print("Cannot replace func...") + print("Warning: Cannot replace func...") else: raise RuntimeError("Unsupported GNode type: {}".format(g_node.type)) def speedup_model(self): """ - There are basically two steps: first, do mask/shape inference, + There are basically two steps: + first, do mask/shape inference, second, replace modules """ - print("start to compress") + #print("start to compress") self.infer_modules_masks() self.replace_compressed_modules() - print("finished compressing") + #print("finished compressing") # resume the model mode to that before the model is speed up if self.is_training: self.bound_model.train() From 005a664ae16dee6328ae5fb426687f42f402f812 Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Thu, 6 Feb 2020 06:49:06 +0000 Subject: [PATCH 26/33] add comments --- .../compression/speedup/torch/compressor.py | 9 + .../compression/speedup/torch/infer_shape.py | 219 +++++++++++++++++- 2 files changed, 224 insertions(+), 4 deletions(-) diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py index d23a84d9b6..783036c7ab 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py @@ -412,9 +412,15 @@ def infer_module_mask(self, module_name, mask=None, in_shape=None, out_shape=Non print("infer_module_mask: {}, module type: {}".format(module_name, m_type)) if mask is not None: #print("mask is not None") + if not m_type in infer_from_mask: + raise RuntimeError("Has not supported infering \ + input/output shape from mask for module/function: `{}`".format(m_type)) input_cmask, output_cmask = infer_from_mask[m_type](module_masks, mask) if in_shape is not None: #print("in_shape is not None") + if not m_type in infer_from_inshape: + raise RuntimeError("Has not supported infering \ + output shape from input shape for module/function: `{}`".format(m_type)) if m_type == 'aten::view': output_cmask = infer_from_inshape[m_type](module_masks, in_shape, @@ -423,6 +429,9 @@ def infer_module_mask(self, module_name, mask=None, in_shape=None, out_shape=Non output_cmask = infer_from_inshape[m_type](module_masks, in_shape) if out_shape is not None: #print("out_shape is not None") + if not m_type in infer_from_outshape: + raise RuntimeError("Has not supported infering \ + input shape from output shape for module/function: `{}`".format(m_type)) input_cmask = infer_from_outshape[m_type](module_masks, out_shape) if input_cmask: diff --git a/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py b/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py index 8e08f2fca8..735a357ba8 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py @@ -9,15 +9,48 @@ import torch class CoarseMask: + """ + Coarse grained mask for a given tensor, here tensor could be weights, + input tensor, or output tensor + """ def __init__(self, num_dim): - # index existing ones + """ + Parameters + ---------- + num_dim : int + The number of dimensions of the tensor that will be masked + """ self.mask_index = [None 
for _ in range(num_dim)] def add_index_mask(self, dim, index): + """ + Add mask for the specified dimension + + Parameters + ---------- + dim : int + The dimension to add mask + index : tensor + The mask for this dimension, its a 1 dimension tensor which specifies + the index of the elements that are not pruned + """ self.mask_index[dim] = index @staticmethod def merge_index(index_a, index_b): + """ + Parameters + ---------- + index_a : tensor + One index (1-dimension) tensor + index_b : tensor + The other index (1-dimension) tensor + + Returns + ------- + tensor + The merged index (1-dimension) tensor + """ s = set() for num in index_a: s.add(num) @@ -26,6 +59,19 @@ def merge_index(index_a, index_b): return torch.tensor(sorted(s)) def merge(self, cmask): + """ + Merge another CoarseMask + + Parameters + ---------- + cmask : CoarseMask + Another CoarseMask to merge + + Returns + ------- + list + The member variable ```mask_index``` + """ assert isinstance(cmask, CoarseMask) assert len(self.mask_index) == len(cmask.mask_index) for i, index in enumerate(self.mask_index): @@ -37,8 +83,15 @@ def merge(self, cmask): return self.mask_index class ModuleMasks: + """ + The masks of a module, including the masks for weights, inputs, output + """ def __init__(self, module_name): """ + Parameters + ---------- + module_name : str + The name of the module or function """ self.module_name = module_name self.param_masks = dict() @@ -46,20 +99,45 @@ def __init__(self, module_name): self.output_mask = None def set_param_masks(self, name, mask): + """ + Parameters + ---------- + name : str + The name of the weight + mask : CoarseMask + The mask for this weight + """ self.param_masks[name] = mask def set_input_mask(self, mask): + """ + Parameters + ---------- + mask : CoarseMask + The mask for input + """ self.input_mask = mask def set_output_mask(self, mask): + """ + Parameters + ---------- + mask : CoarseMask + The mask for output + """ self.output_mask = mask - +""" +Infer input and output shape of a module/function from its weight mask +""" infer_from_mask = { 'BatchNorm2d': lambda module_masks, mask: batchnorm2d_mask(module_masks, mask), 'Conv2d': lambda module_masks, mask: conv2d_mask(module_masks, mask) } +""" +Infer output and weight shape of a module/function from its input shape +""" infer_from_inshape = { 'ReLU': lambda module_masks, mask: relu_inshape(module_masks, mask), 'aten::relu': lambda module_masks, mask: relu_inshape(module_masks, mask), @@ -74,12 +152,28 @@ def set_output_mask(self, mask): 'BatchNorm2d': lambda module_masks, mask: batchnorm2d_inshape(module_masks, mask) } +""" +Infer input and weight shape of a module/function from its output shape +""" infer_from_outshape = { 'Conv2d': lambda module_masks, mask: conv2d_outshape(module_masks, mask) } def batchnorm2d_inshape(module_masks, mask): """ + We assume only the second dimension has coarse grained mask + + Parameters + ---------- + module_masks : ModuleMasks + The ModuleMasks instance of the batchnorm2d + mask : CoarseMask + The mask of its input tensor + + Returns + ------- + CoarseMask + The mask of its output tensor """ assert isinstance(mask, CoarseMask) assert mask.mask_index[1] is not None @@ -96,6 +190,19 @@ def batchnorm2d_inshape(module_masks, mask): def linear_inshape(module_masks, mask): """ + Coarse grained input mask does not change the shape of weights and output tensor + + Parameters + ---------- + module_masks : ModuleMasks + The ModuleMasks instance of the linear + mask : CoarseMask + The mask of its input tensor + 
+ Returns + ------- + CoarseMask + The mask of its output tensor, ```None``` means shape of output tensor is not changed """ assert isinstance(mask, CoarseMask) assert mask.mask_index[0] is None @@ -105,8 +212,24 @@ def linear_inshape(module_masks, mask): def view_inshape(module_masks, mask, shape): """ + This is a limited support + TODO: consider replace tensor.view with nn.Flatten, because tensor.view is not included in module, thus, cannot be replaced by our framework. + + Parameters + ---------- + module_masks : ModuleMasks + The ModuleMasks instance of the ```view``` op + mask : CoarseMask + The mask of its input tensor + shape : dict + Original shape of its input and output tensors + + Returns + ------- + CoarseMask + The mask of its output tensor """ # NOTE: the case constrained by the following four asserts assert shape['in_shape'][0] == shape['out_shape'][0] @@ -133,11 +256,25 @@ def view_inshape(module_masks, mask, shape): def size_inshape(module_masks, mask): """ + No need to do anything for this ```size``` op """ return None def maxpool2d_inshape(module_masks, mask): """ + Assume only the second dimension is masked + + Parameters + ---------- + module_masks : ModuleMasks + The ModuleMasks instance of the maxpool2d + mask : CoarseMask + The mask of its input tensor + + Returns + ------- + CoarseMask + The mask of its output tensor """ assert isinstance(mask, CoarseMask) assert mask.mask_index[1] is not None @@ -151,16 +288,40 @@ def maxpool2d_inshape(module_masks, mask): def relu_inshape(module_masks, mask): """ + Parameters + ---------- + module_masks : ModuleMasks + The ModuleMasks instance of the relu + mask : CoarseMask + The mask of its input tensor + + Returns + ------- + CoarseMask + The mask of its output tensor """ assert isinstance(mask, CoarseMask) # TODO: double check this assert, is it possible that a module is passed twice assert module_masks.input_mask is None module_masks.set_input_mask(mask) module_masks.set_output_mask(mask) - return mask # return shape of output tensor + return mask def batchnorm2d_mask(module_masks, mask): """ + Infer input and output shape from weight mask + + Parameters + ---------- + module_masks : ModuleMasks + The ModuleMasks instance of the batchnorm2d + mask : dict + The mask of its weights, from the user provided mask file + + Returns + ------- + CoarseMask, CoarseMask + The mask of its input tensor, the mask of its output tensor """ assert 'weight' in mask and 'bias' in mask sum_mask = mask['weight'] + mask['bias'] @@ -183,8 +344,32 @@ def batchnorm2d_mask(module_masks, mask): def conv2d_mask(module_masks, mask): """ + Infer input and output shape from weight mask + + Parameters + ---------- + module_masks : ModuleMasks + The ModuleMasks instance of the conv2d + mask : dict + The mask of its weights, from the user provided mask file + + Returns + ------- + CoarseMask, CoarseMask + The mask of its input tensor, the mask of its output tensor """ def convert_to_coarse_mask(mask): + """ + Parameters + ---------- + mask : dict + Weight mask from user provided mask file + + Returns + ------- + LongTensor, CoarseMask, CoarseMask + Index of the masked dimension, weight mask, bias mask + """ assert 'weight' in mask assert isinstance(mask['weight'], torch.Tensor) cmask = None @@ -235,6 +420,19 @@ def convert_to_coarse_mask(mask): def conv2d_inshape(module_masks, mask): """ + Shape change of input tensor does not affect the shape of its output tensor + + Parameters + ---------- + module_masks : ModuleMasks + The ModuleMasks instance of the 
conv2d + mask : CoarseMask + The mask of its input tensor + + Returns + ------- + CoarseMask + The mask of its output tensor """ assert isinstance(mask, CoarseMask) assert module_masks.input_mask is None @@ -243,6 +441,19 @@ def conv2d_inshape(module_masks, mask): def conv2d_outshape(module_masks, mask): """ + Assume only the second dimension is masked + + Parameters + ---------- + module_masks : ModuleMasks + The ModuleMasks instance of the conv2d + mask : CoarseMask + The mask of its output tensor + + Returns + ------- + CoarseMask + The mask of its input tensor """ assert isinstance(mask, CoarseMask) assert mask.mask_index[1] is not None @@ -264,5 +475,5 @@ def conv2d_outshape(module_masks, mask): module_masks.set_param_masks('weight', weight_cmask) module_masks.set_param_masks('bias', bias_cmask) # input shape is not changed - return None # return shape of input tensor + return None \ No newline at end of file From d11a54aa5a26379b895b08e2ca819c2a0f051506 Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Thu, 6 Feb 2020 06:58:12 +0000 Subject: [PATCH 27/33] add comments --- .../speedup/torch/compress_modules.py | 65 ++++++++++--------- .../compression/speedup/torch/compressor.py | 2 + 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py b/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py index cd421b6e0e..90ce8e8218 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py @@ -14,11 +14,23 @@ def no_replace(module, mask): """ + No need to replace """ return module def replace_linear(linear, mask): """ + Parameters + ---------- + linear : torch.nn.Linear + The linear module to be replace + mask : ModuleMasks + The masks of this module + + Returns + ------- + torch.nn.Linear + The new linear module """ assert isinstance(mask, ModuleMasks) assert mask.input_mask is not None @@ -32,22 +44,24 @@ def replace_linear(linear, mask): out_features=linear.out_features, bias=linear.bias is not None) new_linear.to(linear.weight.device) - #print(linear.weight.data.size()) - #print(new_linear.weight.data.size()) - #print(linear.weight.t().size()) - #print(new_linear.weight.t().size()) new_linear.weight.data = torch.index_select(linear.weight.data, -1, index.to(linear.weight.device)) - #print(new_linear.weight.data.size()) if linear.bias is not None: - #print(linear.bias.data.size()) - #new_linear.bias.data = torch.index_select(linear.bias.data, 0, index.to('cuda:0')) new_linear.bias.data.copy_(linear.bias.data) - #print(new_linear.bias.data.size()) - #print("last print: ", new_linear.weight.t().size()) return new_linear def replace_batchnorm2d(norm, mask): """ + Parameters + ---------- + norm : torch.nn.BatchNorm2d + The batchnorm module to be replace + mask : ModuleMasks + The masks of this module + + Returns + ------- + torch.nn.BatchNorm2d + The new batchnorm module """ assert isinstance(mask, ModuleMasks) assert 'weight' in mask.param_masks and 'bias' in mask.param_masks @@ -62,8 +76,6 @@ def replace_batchnorm2d(norm, mask): # assign weights new_norm.weight.data = torch.index_select(norm.weight.data, 0, index) new_norm.bias.data = torch.index_select(norm.bias.data, 0, index) - #print('new_norm weight data: ', new_norm.weight.data) - #print('new_norm bias data: ', new_norm.bias.data) if norm.track_running_stats: new_norm.running_mean.data = torch.index_select(norm.running_mean.data, 0, index) new_norm.running_var.data = 
torch.index_select(norm.running_var.data, 0, index) @@ -71,28 +83,29 @@ def replace_batchnorm2d(norm, mask): def replace_conv2d(conv, mask): """ + Parameters + ---------- + conv : torch.nn.Conv2d + The conv2d module to be replaced + mask : ModuleMasks + The masks of this module + + Returns + ------- + torch.nn.Conv2d + The new conv2d module """ - # fine-grained tensor sparse - #... - # coarse-grained shape sparse - #... assert isinstance(mask, ModuleMasks) if mask.input_mask is None: in_channels = conv.in_channels - print('in_channels: ', in_channels) else: in_channels_index = mask.input_mask.mask_index[1] - print('in_channels_index: ', in_channels_index) in_channels = in_channels_index.size()[0] - print('in_channels: ', in_channels) if mask.output_mask is None: out_channels = conv.out_channels - print('out_channels: ', out_channels) else: out_channels_index = mask.output_mask.mask_index[1] - print('out_channels_index: ', out_channels_index) out_channels = out_channels_index.size()[0] - print('out_channels: ', out_channels) new_conv = torch.nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=conv.kernel_size, @@ -102,20 +115,14 @@ def replace_conv2d(conv, mask): groups=1, # currently only support groups is 1 bias=conv.bias is not None, padding_mode=conv.padding_mode) - #print('weight: ', conv.weight.get_device()) - #print('bias', conv.bias.get_device()) - #print('conv2d weight: ', conv.weight.data.size(), conv.weight.data) new_conv.to(conv.weight.device) tmp_weight_data = tmp_bias_data = None if mask.output_mask is not None: - print('mask output_mask is not None') tmp_weight_data = torch.index_select(conv.weight.data, 0, out_channels_index) if conv.bias is not None: - print('bias is not None') tmp_bias_data = torch.index_select(conv.bias.data, 0, out_channels_index) # NOTE: does not support group if mask.input_mask is not None: - print('mask input_mask is not None') tmp_weight_data = torch.index_select(conv.weight.data if tmp_weight_data is None else tmp_weight_data, 1, in_channels_index) assert tmp_weight_data is not None @@ -123,8 +130,4 @@ def replace_conv2d(conv, mask): if conv.bias is not None: print('final conv.bias is not None') new_conv.bias.data.copy_(conv.bias.data if tmp_bias_data is None else tmp_bias_data) - #new_conv.weight.to('cuda:0') - #new_conv.bias.to('cuda:0') - #print(new_conv.weight.get_device(), new_conv.bias.data, new_conv.bias.get_device()) - #print('new conv2d weight: ', new_conv.weight.data.size(), new_conv.weight.data) return new_conv diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py index 783036c7ab..25959edcbf 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py @@ -469,6 +469,8 @@ def replace_compressed_modules(self): if g_node.type == 'module': super_module, leaf_module = get_module_by_name(self.bound_model, module_name) m_type = g_node.op_type + if not m_type in replace_module: + raise RuntimeError("Has not supported replacing the module: `{}`".format(m_type)) compressed_module = replace_module[m_type](leaf_module, self.inferred_masks[module_name]) setattr(super_module, module_name.split('.')[-1], compressed_module) elif g_node.type == 'func': From 49e0de1564fa2335b70a879a7ba24ba6d4412fbe Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Thu, 6 Feb 2020 07:14:29 +0000 Subject: [PATCH 28/33] update --- examples/model_compress/speedup.md | 2 + 
.../compression/torch/apply_compression.py | 37 ++++++++++++++----- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/examples/model_compress/speedup.md b/examples/model_compress/speedup.md index af85ea2937..58569af4f6 100644 --- a/examples/model_compress/speedup.md +++ b/examples/model_compress/speedup.md @@ -1,5 +1,7 @@ # Speedup Results +*This feature is still in Alpha version.* + This code only works on torch 1.3.1 and torchvision 0.4.2 ## slim pruner example diff --git a/src/sdk/pynni/nni/compression/torch/apply_compression.py b/src/sdk/pynni/nni/compression/torch/apply_compression.py index 13a0366525..2531da5039 100644 --- a/src/sdk/pynni/nni/compression/torch/apply_compression.py +++ b/src/sdk/pynni/nni/compression/torch/apply_compression.py @@ -9,13 +9,21 @@ def apply_compression_results(model, masks_file): """ + Apply the masks from ```masks_file``` to the model + + Parameters + ---------- + model : torch.nn.module + The model to be compressed + masks_file : str + The path of the mask file """ apply_comp = ApplyCompression(model, masks_file) apply_comp.compress() class ApplyCompression(Pruner): """ - Prune to an exact pruning level specification + This class is not to generate masks, but applying existing masks """ def __init__(self, model, masks_file): @@ -23,14 +31,12 @@ def __init__(self, model, masks_file): Parameters ---------- model : torch.nn.module - Model to be pruned - config_list : list - List on pruning configs + Model to be masked + masks_file : str + The path of user provided mask file """ self.bound_model = model self.masks = torch.load(masks_file) - #ori_masks = torch.load(masks_file) - #self.masks = {'feature.1': ori_masks['feature.1']} for module_name in self.masks: print('module_name: ', module_name) config_list = self._build_config() @@ -42,10 +48,23 @@ def _build_config(self): op_names.append(module_name) return [{'sparsity': 1, 'op_types': ['default', 'BatchNorm2d'], 'op_names': op_names}] - def calc_mask(self, layer, config): + def calc_mask(self, layer, config, **kwargs): """ + Directly return the corresponding mask + + Parameters + ---------- + layer : LayerInfo + The layer to be pruned + config : dict + Pruning configurations for this weight + kwargs : dict + Auxiliary information + + Returns + ------- + dict + Mask of the layer """ assert layer.name in self.masks - #print('calc_mask: ', layer.name, self.masks[layer.name]) - #print('calc_mask: ', layer.name, layer.type) return self.masks[layer.name] From 951b014dcc9a986d2a7f8c8df93e6e36f41219bb Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Sat, 8 Feb 2020 13:42:20 +0000 Subject: [PATCH 29/33] resolve comments --- examples/model_compress/fpgm_torch_mnist.py | 3 +- examples/model_compress/model_speedup.py | 59 ------------------- examples/model_compress/speedup.md | 50 ++++++++-------- .../speedup/torch/compress_modules.py | 2 +- .../compression/speedup/torch/compressor.py | 4 +- .../compression/speedup/torch/infer_shape.py | 8 ++- 6 files changed, 33 insertions(+), 93 deletions(-) diff --git a/examples/model_compress/fpgm_torch_mnist.py b/examples/model_compress/fpgm_torch_mnist.py index ae925af842..82fc329d9b 100644 --- a/examples/model_compress/fpgm_torch_mnist.py +++ b/examples/model_compress/fpgm_torch_mnist.py @@ -16,8 +16,7 @@ def forward(self, x): x = F.max_pool2d(x, 2, 2) x = F.relu(self.conv2(x)) x = F.max_pool2d(x, 2, 2) - #x = x.view(-1, 4 * 4 * 50) - x = x.view(64, -1) + x = x.view(x.size(0), -1) x = F.relu(self.fc1(x)) x = self.fc2(x) return F.log_softmax(x, dim=1) diff --git 
a/examples/model_compress/model_speedup.py b/examples/model_compress/model_speedup.py index 4a2632389b..9d27d98da9 100644 --- a/examples/model_compress/model_speedup.py +++ b/examples/model_compress/model_speedup.py @@ -13,23 +13,6 @@ def apoz_speedup(masks_file, model_checkpoint): device = torch.device('cuda') - train_loader = torch.utils.data.DataLoader( - datasets.CIFAR10('./data.cifar10', train=True, download=True, - transform=transforms.Compose([ - transforms.Pad(4), - transforms.RandomCrop(32), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) - ])), - batch_size=64, shuffle=True) - test_loader = torch.utils.data.DataLoader( - datasets.CIFAR10('./data.cifar10', train=False, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) - ])), - batch_size=200, shuffle=False) - model = VGG(depth=16) model.to(device) model.eval() @@ -59,23 +42,6 @@ def apoz_speedup(masks_file, model_checkpoint): def l1filter_speedup(masks_file, model_checkpoint): device = torch.device('cuda') - train_loader = torch.utils.data.DataLoader( - datasets.CIFAR10('./data.cifar10', train=True, download=True, - transform=transforms.Compose([ - transforms.Pad(4), - transforms.RandomCrop(32), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) - ])), - batch_size=64, shuffle=True) - test_loader = torch.utils.data.DataLoader( - datasets.CIFAR10('./data.cifar10', train=False, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) - ])), - batch_size=200, shuffle=False) - model = VGG(depth=16) model.to(device) model.eval() @@ -106,14 +72,6 @@ def l1filter_speedup(masks_file, model_checkpoint): def fpgm_speedup(masks_file, model_checkpoint): from fpgm_torch_mnist import Mnist device = torch.device('cpu') - trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) - train_loader = torch.utils.data.DataLoader( - datasets.MNIST('data', train=True, download=True, transform=trans), - batch_size=64, shuffle=True) - test_loader = torch.utils.data.DataLoader( - datasets.MNIST('data', train=False, transform=trans), - batch_size=1000, shuffle=True) - model = Mnist() model.to(device) model.print_conv_filter_sparsity() @@ -141,23 +99,6 @@ def fpgm_speedup(masks_file, model_checkpoint): def slim_speedup(masks_file, model_checkpoint): device = torch.device('cuda') - train_loader = torch.utils.data.DataLoader( - datasets.CIFAR10('./data.cifar10', train=True, download=True, - transform=transforms.Compose([ - transforms.Pad(4), - transforms.RandomCrop(32), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) - ])), - batch_size=64, shuffle=True) - test_loader = torch.utils.data.DataLoader( - datasets.CIFAR10('./data.cifar10', train=False, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) - ])), - batch_size=200, shuffle=False) - model = VGG(depth=19) model.to(device) model.eval() diff --git a/examples/model_compress/speedup.md b/examples/model_compress/speedup.md index 58569af4f6..e5a094b1de 100644 --- a/examples/model_compress/speedup.md +++ b/examples/model_compress/speedup.md @@ -11,12 +11,12 @@ input tensor: 
`torch.randn(64, 3, 32, 32)` |Times| Mask Latency| Speedup Latency | |---|---|---| -| 1 | 0.011968851089477539 | 0.005106925964355469 | -| 2 | 0.020199298858642578 | 0.008769512176513672 | -| 4 | 0.027331113815307617 | 0.014809131622314453 | -| 8 | 0.043100595474243164 | 0.02744126319885254 | -| 16 | 0.07731318473815918 | 0.05007791519165039 | -| 32 | 0.14464616775512695 | 0.10027527809143066 | +| 1 | 0.01197 | 0.005107 | +| 2 | 0.02019 | 0.008769 | +| 4 | 0.02733 | 0.014809 | +| 8 | 0.04310 | 0.027441 | +| 16 | 0.07731 | 0.05008 | +| 32 | 0.14464 | 0.10027 | ## fpgm pruner example @@ -26,13 +26,13 @@ too large variance |Times| Mask Latency| Speedup Latency | |---|---|---| -| 1 | 0.013831615447998047 | 0.018393278121948242 | -| 2 | 0.011675357818603516 | 0.0035581588745117188 | -| 4 | 0.016363859176635742 | 0.01088404655456543 | -| 40 | 0.14412355422973633 | 0.08268260955810547 | -| 40 | 1.2938556671142578 | 0.1440880298614502 | -| 40 | 0.4103574752807617 | 0.4616250991821289 | -| 400 | 6.290201425552368 | 5.821432113647461 | +| 1 | 0.01383 | 0.01839 | +| 2 | 0.01167 | 0.003558 | +| 4 | 0.01636 | 0.01088 | +| 40 | 0.14412 | 0.08268 | +| 40 | 1.29385 | 0.14408 | +| 40 | 0.41035 | 0.46162 | +| 400 | 6.29020 | 5.82143 | ## l1filter pruner example @@ -41,12 +41,12 @@ input tensor: `torch.randn(64, 3, 32, 32)` |Times| Mask Latency| Speedup Latency | |---|---|---| -| 1 | 0.010260343551635742 | 0.0036773681640625 | -| 2 | 0.016577482223510742 | 0.008161306381225586 | -| 4 | 0.0245821475982666 | 0.02001810073852539 | -| 8 | 0.034986257553100586 | 0.025504589080810547 | -| 16 | 0.06757736206054688 | 0.04752326011657715 | -| 32 | 0.10487151145935059 | 0.08644247055053711 | +| 1 | 0.01026 | 0.003677 | +| 2 | 0.01657 | 0.008161 | +| 4 | 0.02458 | 0.020018 | +| 8 | 0.03498 | 0.025504 | +| 16 | 0.06757 | 0.047523 | +| 32 | 0.10487 | 0.086442 | ## APoZ pruner example @@ -55,9 +55,9 @@ input tensor: `torch.randn(64, 3, 32, 32)` |Times| Mask Latency| Speedup Latency | |---|---|---| -| 1 | 0.013897180557250977 | 0.004208564758300781 | -| 2 | 0.016284465789794922 | 0.008310556411743164 | -| 4 | 0.02521061897277832 | 0.01400899887084961 | -| 8 | 0.03386855125427246 | 0.023923158645629883 | -| 16 | 0.060423851013183594 | 0.046183109283447266 | -| 32 | 0.12421965599060059 | 0.0871133804321289 | \ No newline at end of file +| 1 | 0.01389 | 0.004208 | +| 2 | 0.01628 | 0.008310 | +| 4 | 0.02521 | 0.014008 | +| 8 | 0.03386 | 0.023923 | +| 16 | 0.06042 | 0.046183 | +| 32 | 0.12421 | 0.087113 | \ No newline at end of file diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py b/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py index 90ce8e8218..540fe115cf 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py @@ -125,7 +125,7 @@ def replace_conv2d(conv, mask): if mask.input_mask is not None: tmp_weight_data = torch.index_select(conv.weight.data if tmp_weight_data is None else tmp_weight_data, 1, in_channels_index) - assert tmp_weight_data is not None + assert tmp_weight_data is not None, "Conv2d weight should be updated based on masks" new_conv.weight.data.copy_(tmp_weight_data) if conv.bias is not None: print('final conv.bias is not None') diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py index 25959edcbf..1686a5c209 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py +++ 
b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py
@@ -370,9 +370,7 @@ def _find_successors(self, module_name):
         """
         successors = []
         for output in self.name_to_gnode[module_name].outputs:
-            if not output in self.input_to_gnode:
-                print(output)
-            assert output in self.input_to_gnode
+            assert output in self.input_to_gnode, "No gnode with input {}".format(output)
             g_nodes = self.input_to_gnode[output]
             for g_node in g_nodes:
                 successors.append(g_node.name)
diff --git a/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py b/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py
index 735a357ba8..995dcf997f 100644
--- a/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py
+++ b/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py
@@ -73,7 +73,8 @@ def merge(self, cmask):
             The member variable ```mask_index```
         """
         assert isinstance(cmask, CoarseMask)
-        assert len(self.mask_index) == len(cmask.mask_index)
+        assert len(self.mask_index) == len(cmask.mask_index), \
+            "Only masks with the same number of dimensions can be merged"
         for i, index in enumerate(self.mask_index):
             if index is None:
                 self.mask_index[i] = cmask.mask_index[i]
@@ -302,7 +303,7 @@ def relu_inshape(module_masks, mask):
     """
     assert isinstance(mask, CoarseMask)
     # TODO: double check this assert, is it possible that a module is passed twice
-    assert module_masks.input_mask is None
+    assert module_masks.input_mask is None, "A relu op can only be processed once"
     module_masks.set_input_mask(mask)
     module_masks.set_output_mask(mask)
     return mask
@@ -395,7 +396,8 @@ def convert_to_coarse_mask(mask):
         bias_cmask = None
         if 'bias' in mask and mask['bias'] is not None:
             bias_index = torch.nonzero(mask['bias'], as_tuple=True)[0]
-            assert torch.all(torch.eq(index, bias_index))
+            assert torch.all(torch.eq(index, bias_index)), \
+                "bias mask should be consistent with weight mask"
             bias_cmask = CoarseMask(num_dim=1)
             bias_cmask.add_index_mask(dim=0, index=bias_index)
         return index, weight_cmask, bias_cmask

From 280fb1b47d4419bce7e75ed8ef638b6d6ae546ad Mon Sep 17 00:00:00 2001
From: QuanluZhang
Date: Mon, 10 Feb 2020 04:20:46 +0000
Subject: [PATCH 30/33] update doc

---
 examples/model_compress/speedup.md | 54 ++++++++++++++++++++++++++----
 1 file changed, 48 insertions(+), 6 deletions(-)

diff --git a/examples/model_compress/speedup.md b/examples/model_compress/speedup.md
index e5a094b1de..06f21688c5 100644
--- a/examples/model_compress/speedup.md
+++ b/examples/model_compress/speedup.md
@@ -1,10 +1,52 @@
-# Speedup Results
+# Speed up Masked Model
 
 *This feature is still in Alpha version.*
 
-This code only works on torch 1.3.1 and torchvision 0.4.2
+## Introduction
 
-## slim pruner example
+Pruning algorithms usually use weight masks to simulate the real pruning. Masks can be used
+to check the model performance of a specific pruning (or sparsity) level, but there is no real speedup.
+Since model speedup is the ultimate goal of model pruning, we provide a tool that converts a
+model into a smaller one based on the user-provided masks (the masks come from the
+pruning algorithms).
+
+There are two types of pruning. One is fine-grained pruning, which does not change the shape of weights or input/output tensors, so a sparse kernel is required to speed up a fine-grained pruned layer. The other is coarse-grained pruning (e.g., of channels), under which the shapes of weights and input/output tensors usually change. To speed up this kind of pruning there is no need for a sparse kernel; the pruned layer can simply be replaced with a smaller one.
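+
+To make the distinction concrete, here is a toy sketch (the tensor shape and the pruned
+channels are made up for illustration):
+
+```python
+import torch
+
+# A hypothetical conv weight: 4 filters, 2 input channels, 3x3 kernels.
+weight = torch.randn(4, 2, 3, 3)
+
+# Fine-grained mask: individual entries are zeroed but the tensor shape is
+# unchanged, so only a sparse kernel could turn the zeros into real speedup.
+fine_mask = (torch.rand_like(weight) > 0.5).float()
+
+# Coarse-grained (channel) mask: whole filters are zeroed, so the layer
+# can be rebuilt as a Conv2d with 2 instead of 4 output channels.
+coarse_mask = torch.ones_like(weight)
+coarse_mask[2:] = 0.
+```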
+Since support for sparse kernels in the community is limited, we currently only support the speedup
+of coarse-grained pruning and leave the support of fine-grained pruning for the future.
+
+## Design and Implementation
+
+To speed up a model, the pruned layers should be replaced, either with a smaller layer for a coarse-grained mask, or with a sparse kernel for a fine-grained mask. A coarse-grained mask usually changes the shape of weights or input/output tensors, so we must do shape inference to check whether other, unpruned layers also need to be replaced because of the shape change. Therefore, in our design, there are two main steps: first, do shape inference to find out all the modules that should be replaced; second, replace the modules. The first step requires the topology (i.e., connections) of the model; we use `jit.trace` to obtain the model graph for PyTorch.
+
+For each module, we should prepare four functions: three for shape inference and one for module replacement. The three shape inference functions are: infer the input/output shape from the weight shape, infer the weight/output shape from the input shape, and infer the weight/input shape from the output shape. The module replacement function returns a newly created module, which is smaller.
+
+## Usage
+
+```python
+from nni.compression.speedup.torch import ModelSpeedup
+# model: the model you want to speed up
+# dummy_input: dummy input of the model, given to `jit.trace`
+# masks_file: the mask file created by pruning algorithms
+m_speedup = ModelSpeedup(model, dummy_input.to(device), masks_file)
+m_speedup.speedup_model()
+dummy_input = dummy_input.to(device)
+start = time.time()
+out = model(dummy_input)
+print('elapsed time: ', time.time() - start)
+```
+For complete examples, please refer to [the code](https://github.com/microsoft/nni/tree/master/examples/model_compress/model_speedup.py).
+
+NOTE: The current implementation only works on torch 1.3.1 and torchvision 0.4.2.
+
+## Limitations
+
+Since every module requires four functions for shape inference and module replacement, this is a large amount of work, and we have only implemented the ones required by the examples. If you want to speed up your own model and it is not supported by the current implementation, you are welcome to contribute.
+
+For PyTorch, we can only replace modules; if functions in `forward` need to be replaced, our current implementation does not work. One workaround is to make the function a PyTorch module.
+
+## Speedup Results of Examples
+
+The code of these experiments can be found [here](https://github.com/microsoft/nni/tree/master/examples/model_compress/model_speedup.py).
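+
+The latency numbers below were collected with a simple wall-clock loop of roughly the
+following shape (a sketch for orientation only; `measure_latency` is a hypothetical helper,
+and the authoritative script is the linked `model_speedup.py`):
+
+```python
+import time
+import torch
+
+def measure_latency(model, dummy_input, times):
+    # Warm up once so one-time initialization does not pollute the timing.
+    model(dummy_input)
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    start = time.time()
+    for _ in range(times):
+        model(dummy_input)
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    return time.time() - start
+```
+
+"Mask Latency" is measured on the masked model and "Speedup Latency" on the same model
+after `speedup_model()`; `Times` appears to be the number of forward passes per measurement.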
+ +### slim pruner example on one V100 GPU, input tensor: `torch.randn(64, 3, 32, 32)` @@ -18,7 +60,7 @@ input tensor: `torch.randn(64, 3, 32, 32)` | 16 | 0.07731 | 0.05008 | | 32 | 0.14464 | 0.10027 | -## fpgm pruner example +### fpgm pruner example on cpu, input tensor: `torch.randn(64, 1, 28, 28)`, @@ -34,7 +76,7 @@ too large variance | 40 | 0.41035 | 0.46162 | | 400 | 6.29020 | 5.82143 | -## l1filter pruner example +### l1filter pruner example on one V100 GPU, input tensor: `torch.randn(64, 3, 32, 32)` @@ -48,7 +90,7 @@ input tensor: `torch.randn(64, 3, 32, 32)` | 16 | 0.06757 | 0.047523 | | 32 | 0.10487 | 0.086442 | -## APoZ pruner example +### APoZ pruner example on one V100 GPU, input tensor: `torch.randn(64, 3, 32, 32)` From 4c47da7a4404d7968cc96d68fad9c33e031c5b86 Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Sat, 15 Feb 2020 09:32:30 +0000 Subject: [PATCH 31/33] add init file --- src/sdk/pynni/nni/compression/speedup/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/sdk/pynni/nni/compression/speedup/__init__.py diff --git a/src/sdk/pynni/nni/compression/speedup/__init__.py b/src/sdk/pynni/nni/compression/speedup/__init__.py new file mode 100644 index 0000000000..e69de29bb2 From 553879b91bd4a17cab23e6de1936b742500f1a10 Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Sat, 15 Feb 2020 09:34:16 +0000 Subject: [PATCH 32/33] remove doc --- examples/model_compress/speedup.md | 105 ----------------------------- 1 file changed, 105 deletions(-) delete mode 100644 examples/model_compress/speedup.md diff --git a/examples/model_compress/speedup.md b/examples/model_compress/speedup.md deleted file mode 100644 index 06f21688c5..0000000000 --- a/examples/model_compress/speedup.md +++ /dev/null @@ -1,105 +0,0 @@ -# Speed up Masked Model - -*This feature is still in Alpha version.* - -## Introduction - -Pruning algorithms usually use weight masks to simulate the real pruning. Masks can be used -to check model performance of a specific pruning (or sparsity), but there is no real speedup. -Since model speedup is the ultimate goal of model pruning, we try to provide a tool to users -to convert a model to a smaller one based on user provided masks (the masks come from the -pruning algorithms). - -There are two types of pruning. One is fine-grained pruning, it does not change the shape of weights, and input/output tensors. Sparse kernel is required to speed up a fine-grained pruned layer. The other is coarse-grained pruning (e.g., channels), shape of weights and input/output tensors usually change due to such pruning. To speed up this kind of pruning, there is no need to use sparse kernel, just replace the pruned layer with smaller one. Since the support of sparse kernels in community is limited, we only support the speedup of coarse-grained pruning and leave the support of fine-grained pruning in future. - -## Design and Implementation - -To speed up a model, the pruned layers should be replaced, either replaced with smaller layer for coarse-grained mask, or replaced with sparse kernel for fine-grained mask. Coarse-grained mask usually changes the shape of weights or input/output tensors, thus, we should do shape inference to check are there other unpruned layers should be replaced as well due to shape change. Therefore, in our design, there are two main steps: first, do shape inference to find out all the modules that should be replaced; second, replace the modules. 
The first step requires topology (i.e., connections) of the model, we use `jit.trace` to obtain the model grpah for PyTorch. - -For each module, we should prepare four functions, three for shape inference and one for module replacement. The three shape inference functions are: given weight shape infer input/output shape, given input shape infer weight/output shape, given output shape infer weight/input shape. The module replacement function returns a newly created module which is smaller. - -## Usage - -```python -from nni.compression.speedup.torch import ModelSpeedup -# model: the model you want to speed up -# dummy_input: dummy input of the model, given to `jit.trace` -# masks_file: the mask file created by pruning algorithms -m_speedup = ModelSpeedup(model, dummy_input.to(device), masks_file) -m_speedup.speedup_model() -dummy_input = dummy_input.to(device) -start = time.time() -out = model(dummy_input) -print('elapsed time: ', time.time() - start) -``` -For complete examples please refer to [the code](https://github.com/microsoft/nni/tree/master/examples/model_compress/model_speedup.py) - -NOTE: The current implementation only works on torch 1.3.1 and torchvision 0.4.2 - -## Limitations - -Since every module requires four functions for shape inference and module replacement, this is a large amount of work, we only implemented the ones that are required by the examples. If you want to speed up your own model which cannot supported by the current implementation, you are welcome to contribute. - -For PyTorch we can only replace modules, if functions in `forward` should be replaced, our current implementation does not work. One workaround is make the function a PyTorch module. - -## Speedup Results of Examples - -The code of these experiments can be found [here](https://github.com/microsoft/nni/tree/master/examples/model_compress/model_speedup.py). 
- -### slim pruner example - -on one V100 GPU, -input tensor: `torch.randn(64, 3, 32, 32)` - -|Times| Mask Latency| Speedup Latency | -|---|---|---| -| 1 | 0.01197 | 0.005107 | -| 2 | 0.02019 | 0.008769 | -| 4 | 0.02733 | 0.014809 | -| 8 | 0.04310 | 0.027441 | -| 16 | 0.07731 | 0.05008 | -| 32 | 0.14464 | 0.10027 | - -### fpgm pruner example - -on cpu, -input tensor: `torch.randn(64, 1, 28, 28)`, -too large variance - -|Times| Mask Latency| Speedup Latency | -|---|---|---| -| 1 | 0.01383 | 0.01839 | -| 2 | 0.01167 | 0.003558 | -| 4 | 0.01636 | 0.01088 | -| 40 | 0.14412 | 0.08268 | -| 40 | 1.29385 | 0.14408 | -| 40 | 0.41035 | 0.46162 | -| 400 | 6.29020 | 5.82143 | - -### l1filter pruner example - -on one V100 GPU, -input tensor: `torch.randn(64, 3, 32, 32)` - -|Times| Mask Latency| Speedup Latency | -|---|---|---| -| 1 | 0.01026 | 0.003677 | -| 2 | 0.01657 | 0.008161 | -| 4 | 0.02458 | 0.020018 | -| 8 | 0.03498 | 0.025504 | -| 16 | 0.06757 | 0.047523 | -| 32 | 0.10487 | 0.086442 | - -### APoZ pruner example - -on one V100 GPU, -input tensor: `torch.randn(64, 3, 32, 32)` - -|Times| Mask Latency| Speedup Latency | -|---|---|---| -| 1 | 0.01389 | 0.004208 | -| 2 | 0.01628 | 0.008310 | -| 4 | 0.02521 | 0.014008 | -| 8 | 0.03386 | 0.023923 | -| 16 | 0.06042 | 0.046183 | -| 32 | 0.12421 | 0.087113 | \ No newline at end of file From 61be34063ad3c4dc6a0bf2f1a7dc6dab502eb18c Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Sat, 15 Feb 2020 10:07:53 +0000 Subject: [PATCH 33/33] fix pylint --- .../nni/compression/speedup/torch/compress_modules.py | 2 +- .../pynni/nni/compression/speedup/torch/compressor.py | 2 +- .../nni/compression/speedup/torch/infer_shape.py | 11 +++++------ 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py b/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py index 540fe115cf..5bfcc16804 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compress_modules.py @@ -2,7 +2,7 @@ # Licensed under the MIT license. import torch -from .infer_shape import CoarseMask, ModuleMasks +from .infer_shape import ModuleMasks replace_module = { 'BatchNorm2d': lambda module, mask: replace_batchnorm2d(module, mask), diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py index 1686a5c209..ae6b7ce015 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py @@ -379,7 +379,7 @@ def _find_successors(self, module_name): def infer_module_mask(self, module_name, mask=None, in_shape=None, out_shape=None): """ Infer input shape / output shape based on the module's weight mask / input shape / output shape. 
- + For a module: Infer its input and output shape from its weight mask Infer its output shape from its input shape diff --git a/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py b/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py index 995dcf997f..701d1f58e6 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/infer_shape.py @@ -56,7 +56,7 @@ def merge_index(index_a, index_b): s.add(num) for num in index_b: s.add(num) - return torch.tensor(sorted(s)) + return torch.tensor(sorted(s)) # pylint: disable=not-callable def merge(self, cmask): """ @@ -98,7 +98,7 @@ def __init__(self, module_name): self.param_masks = dict() self.input_mask = None self.output_mask = None - + def set_param_masks(self, name, mask): """ Parameters @@ -217,7 +217,7 @@ def view_inshape(module_masks, mask, shape): TODO: consider replace tensor.view with nn.Flatten, because tensor.view is not included in module, thus, cannot be replaced by our framework. - + Parameters ---------- module_masks : ModuleMasks @@ -250,7 +250,7 @@ def view_inshape(module_masks, mask, shape): step_size = shape['in_shape'][2] * shape['in_shape'][3] for loc in mask.mask_index[1]: index.extend([loc * step_size + i for i in range(step_size)]) - output_cmask.add_index_mask(dim=1, index=torch.tensor(index)) + output_cmask.add_index_mask(dim=1, index=torch.tensor(index)) # pylint: disable=not-callable module_masks.set_output_mask(output_cmask) return output_cmask @@ -373,7 +373,6 @@ def convert_to_coarse_mask(mask): """ assert 'weight' in mask assert isinstance(mask['weight'], torch.Tensor) - cmask = None weight_mask = mask['weight'] shape = weight_mask.size() ones = torch.ones(shape[1:]).to(weight_mask.device) @@ -451,7 +450,7 @@ def conv2d_outshape(module_masks, mask): The ModuleMasks instance of the conv2d mask : CoarseMask The mask of its output tensor - + Returns ------- CoarseMask
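For intuition on the `merge_index` change in the hunk above: the pylint-suppressed
`torch.tensor` call simply builds the sorted union of two kept-index sets, i.e., an element
survives pruning only if at least one mask keeps it. A standalone sketch of that semantics
(the sample indices are made up):

```python
import torch

def merge_index(index_a, index_b):
    # Sorted union of the two kept-index sets: an element survives
    # pruning if either mask keeps it.
    merged = set(index_a.tolist()) | set(index_b.tolist())
    return torch.tensor(sorted(merged))  # pylint: disable=not-callable

print(merge_index(torch.tensor([0, 2, 5]), torch.tensor([2, 3])))
# tensor([0, 2, 3, 5])
```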