This repository has been archived by the owner on Dec 1, 2021. It is now read-only.

Code for optical flow estimation (Summer internship project @ 2019) #516

Draft · wants to merge 132 commits into base: master

Changes from all commits (132 commits)
34ccc65
Update .gitignore
Aug 27, 2019
f77be7e
Update .gitignore
Aug 27, 2019
331129e
Add flownet_s_v1
Aug 28, 2019
4a5f0fe
Add _deconv _predict_flow
Aug 29, 2019
39bfa4b
Finish refinement layer
Aug 29, 2019
ff0373f
Add more network structure
Aug 29, 2019
3601dac
flownet_s_v1.py: Add summary for each losses, split base function int…
Sep 2, 2019
a482b91
Add more losses
Sep 2, 2019
0cc652a
Add flowlib.py
Sep 2, 2019
3056d6e
Add summary
Sep 2, 2019
04ff7e0
Create test file for flownet s v1
Sep 2, 2019
898a87a
Create test file
Sep 2, 2019
a3cfcb1
Rename test directory
Sep 2, 2019
2b530fb
Resolve .so path import error
Sep 2, 2019
9e74172
Add unit test
Sep 3, 2019
b994cab
Add test_deconv
Sep 3, 2019
23290f4
Set bias_initializer to None in deconv
Sep 3, 2019
d944846
Add a test for downsample
Sep 3, 2019
48a5d38
Fix keep_dims to keepdims
Sep 3, 2019
77b886b
Add a test for epe
Sep 3, 2019
2386182
Fix conv3_1 not taking in conv2
Sep 3, 2019
548f539
Add a test for contractive_block
Sep 3, 2019
aceb354
Fix conv_2 to conv2
Sep 3, 2019
7bb8ef9
Add a test for refinement layer
Sep 3, 2019
62b8040
Add config
Sep 3, 2019
df03aa5
2018 to 2019
Sep 3, 2019
81474be
Skeleton for test_training
Sep 3, 2019
605dd75
Add data augmentation
Sep 4, 2019
5a33e09
return empty dict and no_op in metrics()
Sep 4, 2019
86efe70
Improve comments
Sep 4, 2019
6d95fe9
[create] dataloader for optical flow estimation
Sep 5, 2019
d752a48
Make average EPE the average of all pixels in an image
Sep 5, 2019
70742cd
TODO: move the third party lib functions into our code
Sep 5, 2019
3f380b9
Merge pull request #1 from ki-lm/master
OscarWang114 Sep 5, 2019
70da84b
[modify] keywords
Sep 5, 2019
a3da860
[modify] ratio => rate
Sep 5, 2019
e7b400e
Change validation ratio to validation rate
Sep 5, 2019
f9214bd
Merge pull request #2 from ki-lm/master
OscarWang114 Sep 5, 2019
8cf5686
[modify] OpticalFlowEstimationBase.classes
Sep 5, 2019
64907ce
Rename NHCW to NHWC
Sep 5, 2019
f3ac840
Bug fixes for test training
Sep 5, 2019
b90359d
Force add downsample.so
Sep 5, 2019
f8716fe
Add dataset in config file
Sep 5, 2019
b031e5e
Remove weight decay rate from config file
Sep 5, 2019
b0d16fe
Temporarily unimport png in flowlib
Sep 5, 2019
59d233c
Change momentum to beta for Adam in config file
Sep 5, 2019
b4afe1e
Change validation_rate to 0.1 in config file
Sep 5, 2019
e6bd68d
[modify] params
Sep 5, 2019
2f87684
[create] dataset_augmentator
Sep 12, 2019
71d94f5
Create quantized network for optical flow estimation
Sep 12, 2019
123211c
[modify] optical_flow_estimation/data_loader
Sep 12, 2019
dad3c89
Merge branch 'master' of https://github.com/blue-oil/blueoil
Sep 12, 2019
1962374
Add a unit test for quantized network
Sep 12, 2019
c214b03
Add quantize config file
Sep 12, 2019
1d20178
Remove v2, change config process num
Sep 12, 2019
44e4ce2
[modify] solve conflict
Sep 12, 2019
cea8de3
[modify] optical_flow_estimation/predict.py
Sep 17, 2019
67cdd1c
Merge branch 'master' of https://github.com/blue-oil/blueoil
Sep 17, 2019
7eb7e3f
Add option to disable quantization of activation before last layer
Sep 17, 2019
6f0ece4
Merge branch 'master' of https://github.com/blue-oil/blueoil
Sep 17, 2019
3b0e411
Add quantize_activation_before_last_layer option
Sep 17, 2019
5826d9e
Create flownet s v2
Sep 17, 2019
4b88a43
Rename env in unit test to v2
Sep 17, 2019
3dd0237
Change process numbers
Sep 17, 2019
ca442d0
Merge branch 'master' of https://github.com/OscarWang114/blueoil
Sep 17, 2019
51e58cd
[modify] dataset_loader
Sep 17, 2019
10fbaeb
Replace conv2d transpose with depth_to_space and conv2d
Sep 17, 2019
fd6025c
Merge branch 'master' of https://github.com/ki-lm/blueoil
Sep 17, 2019
190abcb
Fix placeholders typo
Sep 17, 2019
ee85462
Restore conv2d_transpose temporarily
Sep 17, 2019
acbf6af
Change train process num to 20
Sep 18, 2019
2c7a05b
flownet s v3
Sep 18, 2019
e1ba294
Merge branch 'master' of https://github.com/blue-oil/blueoil
Sep 18, 2019
951c404
Process num 20 -> 10
Sep 18, 2019
ab0f08e
Merge branch 'master' of https://github.com/blue-oil/blueoil
Sep 19, 2019
d45b8d6
[modify] lmnet demo
Sep 19, 2019
b9939d3
Merge branch 'master' of https://github.com/OscarWang114/blueoil
Sep 19, 2019
752647a
Create v2 aee quantize
Sep 19, 2019
41441d7
Fix nearest neighbor resize
Sep 19, 2019
b5b5b61
Merge branch 'master' of https://github.com/OscarWang114/blueoil
Sep 20, 2019
00b1f2d
make Quantized format ChHWBCl
Sep 20, 2019
24712a6
[create] discretized_flow_vector
Sep 20, 2019
ab76166
s_v3_q config: Learning rate, quantize activation before last layer
Sep 20, 2019
59386d9
Create flownet q
Sep 20, 2019
8bb4b03
Update s v3 network
Sep 20, 2019
b72f03e
Add test config and test for s v3
Sep 20, 2019
95d7433
Merge branch 'fix/layout' of https://github.com/primenumber/blueoil
Sep 20, 2019
424c536
Fix import
Sep 20, 2019
53cabba
Merge branch 'master' of https://github.com/OscarWang114/blueoil
Sep 20, 2019
b115337
Merge branch 'fix/layout' of https://github.com/primenumber/blueoil
Sep 20, 2019
23a0eea
Fix dimension order
Sep 20, 2019
c6e4a5d
Merge branch 'master' into fix/layout
primenumber Sep 20, 2019
7d50e9f
Merge branch 'master' into fix/layout
primenumber Sep 20, 2019
b646b9c
Add FlyingChair images
Sep 20, 2019
9af091d
Change save checkpoint steps
Sep 20, 2019
b48e00a
Restore shape for QTZ_linear_mid_tread_half
Sep 20, 2019
5a83bb3
Do pass_propagate_output_type_backward always
Sep 24, 2019
a8ad2e0
Merge branch 'master' into fix/layout
primenumber Sep 24, 2019
6cde282
Create v4, forcing depth in space to depth to be a multiple of 32
Sep 24, 2019
e8ae2b4
Fix depth calculation for concat_on_depth
Sep 24, 2019
7c4fae0
Fix dimension
Sep 24, 2019
afe440b
Upsample flow filter numbers 2 -> 32
Sep 24, 2019
20ef5ff
Fix merge conflicts
Sep 24, 2019
5d78ce8
Merge branch 'fix/layout' of https://github.com/primenumber/blueoil
Sep 24, 2019
be1477d
Merge branch 'master' of https://github.com/OscarWang114/blueoil
Sep 24, 2019
a6e644a
[create] remote server demo
Sep 24, 2019
2fee76e
V4
Sep 24, 2019
0fa258c
Merge branch 'master' of https://github.com/ki-lm/blueoil
Sep 24, 2019
ff6443b
Save 20 checkpoints
Sep 25, 2019
3fc60d9
Fix wrong command in note.md
Sep 25, 2019
f19b340
Fix mistake in channel numbers
Sep 25, 2019
1d34844
Add make file section
Sep 25, 2019
3808a40
[create] demolib
Sep 26, 2019
30fd7f9
[modify] camera_id
Sep 26, 2019
11e7899
Merge branch 'master' of https://github.com/OscarWang114/blueoil
Sep 26, 2019
e852ec8
Remove Default network test .diff
Sep 26, 2019
e418a28
Update v4 structure
Sep 26, 2019
aab0aa1
[modify] rename class name
Sep 26, 2019
f1835b6
Rename network file and class names
Sep 26, 2019
0b0eba6
Merge branch 'master' of https://github.com/blue-oil/blueoil
Oct 1, 2019
497543f
[modify] class name of optical flow
Oct 1, 2019
7749d0d
[create] v2 and half
Oct 2, 2019
dce7ea2
[create] generalized version of lmflownet
Oct 3, 2019
81ec9d7
[modify] bugfix at quantizer
Oct 3, 2019
2a0cfab
[create] new config files
Oct 4, 2019
9e82b94
[modify] config parameters
Oct 4, 2019
40a759c
Merge branch 'master' of https://github.com/blue-oil/blueoil
Oct 8, 2019
77a9897
[modify] quantized network layer
Oct 8, 2019
4512966
[modify] bug fix at flownet
Oct 8, 2019
6aafda7
[create] 3a 3b 3c
Oct 9, 2019
79033d4
[create] add scripts
Oct 11, 2019
5f7a17e
[modify] fix typo
Oct 11, 2019
2 changes: 2 additions & 0 deletions .gitignore
@@ -105,3 +105,5 @@ ENV/
/dataset/
/config/
/tmp/
.DS_Store
.idea
4 changes: 4 additions & 0 deletions dlk/python/dlk/core/operators.py
@@ -1428,6 +1428,7 @@ def __init__(self,
                 dimension_format: str = 'NHWC') -> None:
        """Init this quantization operator."""
        super().__init__(name, shape, dtype, input_ops, dimension_format=dimension_format)
        self._original_shape = shape

    def _check_consistency(self) -> None:
        super()._check_consistency()
@@ -1499,6 +1500,9 @@ def binarizer(self, data: np.ndarray) -> np.ndarray:
"""Maps the quantized values into >= 0 integer values."""
return data

def restore_shape(self):
self.update_shape(self._original_shape, 'NHWC')


class Add(Operator):
"""Add operator.
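
The hunk above records the operator's float-time NHWC shape at construction so that restore_shape() can later undo the packed layout. A minimal sketch of that save/restore pattern, using a hypothetical stand-in class (not the real Operator API in dlk/python/dlk/core/operators.py):

class FakeQuantizerOp:
    def __init__(self, shape):
        self.shape, self.dimension = list(shape), 'NHWC'
        self._original_shape = list(shape)  # recorded once, at init, as in the hunk

    def update_shape(self, shape, fmt):  # what the layout passes call
        self.shape, self.dimension = list(shape), fmt

    def restore_shape(self):  # the method this PR adds
        self.update_shape(self._original_shape, 'NHWC')

op = FakeQuantizerOp([1, 384, 512, 64])
op.update_shape([2, 384, 512, 2, 32], 'ChHWBCl')  # repacked by the optimizer pass
op.restore_shape()
assert op.shape == [1, 384, 512, 64] and op.dimension == 'NHWC'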
22 changes: 20 additions & 2 deletions dlk/python/dlk/core/optimizer.py
@@ -505,7 +505,25 @@ def pass_quantize_convolutions(graph: Graph) -> None:
        width = qtz.width
        depth = qtz.channel
        depth_upper = (depth + b - 1) // b
        qtz.update_shape([height, width, depth_upper, 2, b], "HWChBCl")
        qtz.update_shape([depth_upper, height, width, 2, b], "ChHWBCl")


def pass_fix_qtz_types_and_format(graph) -> None:
    """Fix the output of QTZ_linear_mid_tread_half to PackedUint32 type
    with ChHWBCl layout.

    Parameters
    ----------
    graph : Graph
        The input graph. It will be modified in-place.
    """
    exec_list = sort_graph(graph)
    for m in exec_list:
        if m.op_type == 'QTZ_linear_mid_tread_half' and m.dimension != 'ChHWBCl':
            m.dtype = PackedUint32()
            b = 32
            shape = [(m.channel + b - 1) // b, m.height, m.width, 2, b]
            m.update_shape(shape, 'ChHWBCl')


def pass_propagate_datatypes(graph) -> None:
@@ -561,7 +579,7 @@ def pass_propagate_output_type_backward(graph: Graph) -> None:

    def output_dtype_changer(node, otype):
        for n in node.input_nodes:
            if n.op_type == 'Conv' and n.is_quantized:
            if (n.op_type == 'Conv' and n.is_quantized) or n.op_type == 'QTZ_linear_mid_tread_half':
                n.restore_shape()
                n.dtype = otype
                return
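
For reference, both passes above use the same packed-shape rule: channels are grouped into blocks of b = 32 bits, and the constant 2 appears to correspond to the 2-bit activation quantization (one 32-bit word per bit plane; an assumption from the surrounding code, since the C++ side sizes this dimension with nbit()). A quick sanity check in plain Python with illustrative sizes:

def chhwbcl_shape(height, width, channel, b=32):
    # Mirrors the arithmetic in pass_quantize_convolutions and
    # pass_fix_qtz_types_and_format above.
    depth_upper = (channel + b - 1) // b  # ceil(channel / b) channel blocks
    return [depth_upper, height, width, 2, b]

print(chhwbcl_shape(384, 512, 64))  # [2, 384, 512, 2, 32]
print(chhwbcl_shape(384, 512, 3))   # [1, 384, 512, 2, 32] -- 3 channels still occupy one block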
10 changes: 5 additions & 5 deletions dlk/python/dlk/scripts/generate_project.py
@@ -31,9 +31,9 @@
from frontend import TensorFlowIO
from core.optimizer import pass_remove_identities, pass_transpose, pass_constant_folding, \
    pass_propagate_quantization_details_into_conv, pass_compute_thresholds, pass_pack_weights, \
    pass_quantize_convolutions, pass_propagate_datatypes, \
    pass_propagate_format, pass_propagate_output_type_backward, \
    pass_lookup
    pass_quantize_convolutions, pass_fix_qtz_types_and_format, \
    pass_propagate_datatypes, pass_propagate_format, \
    pass_propagate_output_type_backward, pass_lookup

SCRITPS_DIR = path.abspath(path.dirname(__file__))
DLK_ROOT_DIR = path.abspath(path.join(SCRITPS_DIR, '..'))
@@ -63,9 +63,9 @@ def optimize_graph_step(model: Model, config: Config) -> None:
    pass_compute_thresholds(graph)
    pass_pack_weights(graph)
    pass_quantize_convolutions(graph)
    pass_fix_qtz_types_and_format(graph)

    if config.threshold_skipping:
        pass_propagate_output_type_backward(graph)
    pass_propagate_output_type_backward(graph)
    pass_propagate_datatypes(graph)
    pass_propagate_format(graph)

19 changes: 15 additions & 4 deletions dlk/python/dlk/templates/Makefile.tpl
@@ -126,25 +126,36 @@ clean:
-$(RM) *.so
-$(RM) $(LIB_OBJ)
-$(RM) $(LIB_X86_OBJ)
-$(RM) $(LIB_X86_AVX_OBJ)
-$(RM) $(LIB_ARM_OBJ)
-$(RM) $(LIB_FPGA_OBJ)
-$(RM) $(LIB_AARCH64_OBJ)
-$(RM) $(OBJ)

.PHONY: clear
clear:
-$(RM) $(LIB_OBJ)
-$(RM) $(LIB_X86_OBJ)
-$(RM) $(LIB_X86_AVX_OBJ)
-$(RM) $(LIB_ARM_OBJ)
-$(RM) $(LIB_FPGA_OBJ)
-$(RM) $(LIB_AARCH64_OBJ)
-$(RM) $(OBJ)

lm_x86: CXX = g++
lm_x86: FLAGS += $(INCLUDES) -O3 -std=c++14 -DUSE_PNG -pthread -g
lm_x86: FLAGS += $(INCLUDES) -O3 -std=c++14 -DUSE_PNG -pthread -g -DFUNC_TIME_MEASUREMENT
lm_x86: CXXFLAGS +=

lm_x86_avx: CXX = g++
lm_x86_avx: FLAGS += $(INCLUDES) -O3 -std=c++14 -mavx2 -mfma -DUSE_AVX -DUSE_PNG -pthread -g -fopenmp
lm_x86_avx: FLAGS += $(INCLUDES) -O3 -std=c++14 -mavx2 -mfma -DUSE_AVX -DUSE_PNG -pthread -g -fopenmp -DFUNC_TIME_MEASUREMENT
lm_x86_avx: CXXFLAGS +=

lm_aarch64: CXX = aarch64-linux-gnu-g++
lm_aarch64: FLAGS += $(INCLUDES) -std=c++14 -O3 -DUSE_NEON -DUSE_PNG -pthread -g -fopenmp
lm_aarch64: FLAGS += $(INCLUDES) -std=c++14 -O3 -DUSE_NEON -DUSE_PNG -pthread -g -fopenmp -DFUNC_TIME_MEASUREMENT
lm_aarch64: CXXFLAGS +=

lm_arm: CXX = arm-linux-gnueabihf-g++
lm_arm: FLAGS += $(INCLUDES) -std=c++14 -O3 -DUSE_NEON -DUSE_PNG -DAARCH32 -mcpu=cortex-a9 -mfpu=neon -mthumb -s -pthread -g -fopenmp
lm_arm: FLAGS += $(INCLUDES) -std=c++14 -O3 -DUSE_NEON -DUSE_PNG -DAARCH32 -mcpu=cortex-a9 -mfpu=neon -mthumb -s -pthread -g -fopenmp -DFUNC_TIME_MEASUREMENT
lm_arm: CXXFLAGS +=

lm_fpga: CXX = arm-linux-gnueabihf-g++
6 changes: 6 additions & 0 deletions dlk/python/dlk/templates/include/quantizer.h
@@ -52,6 +52,12 @@ void func_QTZ_linear_mid_tread_half(
    const TensorView<T_FLOAT, MemoryLayout::Atom>& max_value,
    const TensorView<QUANTIZED_PACKED, MemoryLayout::HWChBCl>& output);

void func_QTZ_linear_mid_tread_half(
    const TensorView<T_FLOAT, MemoryLayout::NHWC>& input,
    const TensorView<T_INT, MemoryLayout::Atom>& nbit,
    const TensorView<T_FLOAT, MemoryLayout::Atom>& max_value,
    const TensorView<QUANTIZED_PACKED, MemoryLayout::ChHWBCl>& output);

void func_QTZ_linear_mid_tread_half(
    const TensorView<T_FLOAT, MemoryLayout::NHWC>& input,
    const TensorView<T_INT, MemoryLayout::Atom>& nbit,
19 changes: 19 additions & 0 deletions dlk/python/dlk/templates/src/quantizer.cpp
@@ -28,6 +28,7 @@ limitations under the License.
#include "quantizer.h"
#include "pack_input_to_qwords.h"
#include "time_measurement.h"
#include "tensor_convert.h"
#ifdef USE_NEON
#include <arm_neon.h>
#endif
@@ -152,6 +153,7 @@ void func_QTZ_linear_mid_tread_half_body(
}

static const auto output_not_packed = std::make_unique<QUANTIZED_NOT_PACKED[]>(MAX_SIZE_INPUTS_PER_LAYER);
static const auto output_packed = std::make_unique<QUANTIZED_PACKED[]>(MAX_SIZE_QINPUTS_PER_LAYER);

void func_QTZ_linear_mid_tread_half(
    const TensorView<T_FLOAT, MemoryLayout::NHWC>& input,
@@ -184,6 +186,23 @@ void func_QTZ_linear_mid_tread_half(
  Measurement::Stop();
}

void func_QTZ_linear_mid_tread_half(
    const TensorView<T_FLOAT, MemoryLayout::NHWC>& input,
    const TensorView<T_INT, MemoryLayout::Atom>& nbit,
    const TensorView<T_FLOAT, MemoryLayout::Atom>& max_value,
    const TensorView<QUANTIZED_PACKED, MemoryLayout::ChHWBCl>& output) {
  const auto out_shape = output.get_shape();
  const auto out_height = out_shape[1];
  const auto out_width = out_shape[2];
  const auto out_depth = out_shape[0];
  TensorView<QUANTIZED_PACKED, MemoryLayout::HWChBCl>::tensor_info_t<std::size_t> shape = {
      out_height, out_width, out_depth, nbit(), QUANTIZED_PACKED::BitCount
  };
  TensorView<QUANTIZED_PACKED, MemoryLayout::HWChBCl> tmp(output_packed.get(), shape);
  func_QTZ_linear_mid_tread_half(input, nbit, max_value, tmp);
  convert_tensor(tmp, output);
}

void func_QTZ_linear_mid_tread_half(
    const TensorView<T_FLOAT, MemoryLayout::NHWC>& input,
    const TensorView<T_INT, MemoryLayout::Atom>& nbit,
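
The new overload reuses the existing HWChBCl kernel into a scratch buffer and then reorders the result, so convert_tensor here amounts to moving the channel-block axis to the front. Sketched with NumPy for intuition (illustrative shapes; the real code operates on packed QUANTIZED_PACKED words):

import numpy as np

# H=48, W=64, Ch=2 channel blocks, B=2 words per block, Cl=32 bits per word
hwchbcl = np.arange(48 * 64 * 2 * 2 * 32, dtype=np.uint32).reshape(48, 64, 2, 2, 32)
chhwbcl = hwchbcl.transpose(2, 0, 1, 3, 4)  # (H, W, Ch, B, Cl) -> (Ch, H, W, B, Cl)
assert chhwbcl.shape == (2, 48, 64, 2, 32)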
149 changes: 149 additions & 0 deletions lmnet/configs/core/optical_flow_estimation/backup/flownet_q_v1.py
@@ -0,0 +1,149 @@
# -*- coding: utf-8 -*-
# Copyright 2019 The Blueoil Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================

import numpy as np
import tensorflow as tf
from easydict import EasyDict

from lmnet.common import Tasks
from lmnet.data_processor import Sequence
from lmnet.networks.optical_flow_estimation.flownet_q_v1 import (
    FlowNetQV1
)
from lmnet.datasets.optical_flow_estimation import (
    FlyingChairs, ChairsSDHom
)
from lmnet.networks.optical_flow_estimation.data_augmentor import (
    Brightness, Color, Contrast, Gamma, GaussianBlur, GaussianNoise, Hue,
    FlipLeftRight, FlipTopBottom, Identity, Scale, Rotate, Translate
)
from lmnet.networks.optical_flow_estimation.pre_processor import (
    DevideBy255, DiscretizeFlow
)

NETWORK_CLASS = FlowNetQV1
DATASET_CLASS = FlyingChairs

IMAGE_SIZE = [384, 512]
DATA_FORMAT = "NHWC"
TASK = Tasks.OPTICAL_FLOW_ESTIMATION

# NOTE (by ki-42) the number of labels should be SPLIT_NUM + 1
THRESHOLD_RADIUS = 10.0
SPLIT_NUM = 10
DATASET_CLASS.classes = [_ for _ in range(SPLIT_NUM + 1)]
NETWORK_CLASS.split_num = SPLIT_NUM
NETWORK_CLASS.threshold_radius = THRESHOLD_RADIUS
CLASSES = DATASET_CLASS.classes

IS_DEBUG = False
MAX_STEPS = 1200000
SAVE_CHECKPOINT_STEPS = 5000
KEEP_CHECKPOINT_MAX = 5
TEST_STEPS = 250
SUMMARISE_STEPS = 250
BATCH_SIZE = 8

# for debugging
# IS_DEBUG = True
# MAX_STEPS = 10
# BATCH_SIZE = 31
# SAVE_CHECKPOINT_STEPS = 2
# KEEP_CHECKPOINT_MAX = 5
# TEST_STEPS = 10
# SUMMARISE_STEPS = 2

# pretrain
IS_PRETRAIN = False
PRETRAIN_VARS = []
PRETRAIN_DIR = ""
PRETRAIN_FILE = ""

# distributed training
IS_DISTRIBUTION = False

PRE_PROCESSOR = Sequence([
    DevideBy255(),
    DiscretizeFlow(THRESHOLD_RADIUS, SPLIT_NUM)
])
POST_PROCESSOR = None

NETWORK = EasyDict()
NETWORK.OPTIMIZER_CLASS = tf.train.AdamOptimizer
NETWORK.OPTIMIZER_KWARGS = {"beta1": 0.9, "beta2": 0.999}
NETWORK.LEARNING_RATE_FUNC = tf.train.piecewise_constant
NETWORK.LEARNING_RATE_KWARGS = {
    # "values": [0.0001, 0.00005, 0.000025, 0.0000125, 0.00000625],
    # "boundaries": [400000, 600000, 800000, 1000000],
    "values": [0.001, 0.0005, 0.00025, 0.000125, 0.0000625],
    "boundaries": [400000, 600000, 800000, 1000000],
}
NETWORK.WEIGHT_DECAY_RATE = 0.0004
NETWORK.IMAGE_SIZE = IMAGE_SIZE
NETWORK.BATCH_SIZE = BATCH_SIZE
NETWORK.DATA_FORMAT = DATA_FORMAT

# dataset
DATASET = EasyDict()
DATASET.BATCH_SIZE = BATCH_SIZE
DATASET.DATA_FORMAT = DATA_FORMAT
DATASET.TRAIN_ENABLE_PREFETCH = True
DATASET.TRAIN_PROCESS_NUM = 10
DATASET.TRAIN_QUEUE_SIZE = 1000
DATASET.VALIDATION_ENABLE_PREFETCH = True
DATASET.VALIDATION_PRE_LOAD = False
DATASET.VALIDATION_PROCESS_NUM = 1
DATASET.VALIDATION_QUEUE_SIZE = 250
DATASET.VALIDATION_RATE = 0.1
DATASET.VALIDATION_SEED = 2019

# TODO: using default values here because the metrics used in the paper are different.
# Gaussian blur is not added because it differs from Gaussian noise.
# Augmentation is not available in the PyTorch repo.

# NOTE (by KI-42) in the FlowNetS paper, the following augmentations were used:
# Geometric transformation
#     translation  U([-20%, +20%])
#     rotation     U([-17 deg, +17 deg])
#     scaling      U([0.9, 2.0])
# Pixel-wise transformation
#     Gaussian noise  N(0, 1) * U([0.0, 0.04 * 255])
#     contrast        U([0.2, 1.4])
#     color           U([0.5, 2.0])
#     gamma           U([0.7, 1.5])
#     brightness      1 + 0.2 * N(0, 1)

# NOTE (by KI-42) this setup modifies the augmentation described above slightly:
#     hue         U([-128 deg, +128 deg])
#     brightness  U([0.6, 1.4])

DATASET.AUGMENTOR = Sequence([
    # Geometric transformation
    # FlipLeftRight(0.5),
    # FlipTopBottom(0.5),
    Translate(-0.1, 0.1),
    Rotate(-5, +5),
    # Scale(1.0, 2.0),
    # Pixel-wise augmentation
    Brightness(0.6, 1.4),
    Contrast(0.2, 1.4),
    Color(0.5, 2.0),
    Gamma(0.7, 1.5),
    # Hue(-128.0, 128.0),
    GaussianNoise(10.0)
    # GaussianBlur(0.0, 2.0)
])
DATASET.PRE_PROCESSOR = PRE_PROCESSOR
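
As a reading aid, here is a minimal TF1 sketch of how LEARNING_RATE_FUNC and LEARNING_RATE_KWARGS above resolve into a stepwise schedule; the global_step wiring is illustrative, since the actual hookup happens inside lmnet's training loop:

import tensorflow as tf

global_step = tf.train.get_or_create_global_step()
learning_rate = tf.train.piecewise_constant(
    global_step,
    boundaries=[400000, 600000, 800000, 1000000],
    values=[0.001, 0.0005, 0.00025, 0.000125, 0.0000625],
)
optimizer = tf.train.AdamOptimizer(learning_rate, beta1=0.9, beta2=0.999)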