dmlc · sxjscience · Oct 24, 2020 · Oct 19, 2020 · Oct 19, 2020 · Oct 19, 2020
@@ -26,11 +26,24 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v2
 
-      # Install OS specific dependencies
+      - name: Compilation cache
+        uses: actions/cache@v2
+        with:
+          path: ~/.ccache
+          # We include the commit sha in the cache key, as new cache entries are
+          # only created if there is no existing entry for the key yet.
+          key: ${{ runner.os }}-ccache-${{ github.sha }}
+          # Restore any ccache cache entry, if none for
+          # ${{ runner.os }}-ccache-${{ github.sha }} exists
+          restore-keys: |
+            ${{ runner.os }}-ccache
+
+      # Install Linux specific dependencies
       - name: Install Linux dependencies
         if: matrix.os == 'ubuntu-latest'
         # TODO https://github.com/apache/incubator-mxnet/issues/18293
-        run: sudo apt-get install libopenblas-dev
+        run: |
+          sudo apt-get install -y libopenblas-dev ninja-build libedit-dev libxml2-dev
 
       - name: Setup python
         uses: actions/setup-python@v2
@@ -44,6 +57,21 @@ jobs:
           python -m pip install --upgrade cython
           python -m pip install --pre "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python
           python -m pip install -U -e .[extras]
+      - name: Build and Install TVM
+        if: matrix.os == 'ubuntu-latest'
+        run: |
+          # Route the compilers through ccache so that the ~/.ccache directory
+          # restored by actions/cache is actually read and populated by the build.
+          sudo apt-get install -y ccache
+          export CCACHE_DIR="$HOME/.ccache"
+          git clone https://github.com/apache/incubator-tvm tvm --recursive
+          cd tvm
+          mkdir -p build
+          cp cmake/config.cmake build
+          echo set\(USE_LLVM ON\) >> build/config.cmake
+          echo set\(USE_GRAPH_RUNTIME ON\) >> build/config.cmake
+          echo set\(USE_BLAS openblas\) >> build/config.cmake
+          cd build
+          cmake .. -G Ninja -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+          ninja
+          cd ../python
+          python -m pip install -U -e .
       - name: Run Unittests
         run: |
           python -m pytest --cov=./ --cov-report=xml --device="cpu" --durations=50 tests/

@@ -16,16 +16,10 @@ process the text data, and train models.
 
 # Features
 
-For NLP Practitioners
-- Easy-to-use Text Processing Tools
-- Automatically Train Models via AutoNLP (TODO)
-
-For Researchers
+- Easy-to-use Text Processing Tools and APIs
 - Pretrained Model Zoo
 - Write Models with Numpy-like API
-
-For Engineers
-- Fast Inference via [TVM](https://tvm.apache.org/) (TODO)
+- Fast Inference via [TVM](https://tvm.apache.org/)
 - AWS Integration via [SageMaker](https://aws.amazon.com/sagemaker/)
 
 
@@ -98,7 +92,7 @@ docker run --gpus all --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size
 
 # CPU Instance
 docker pull gluonai/gluon-nlp:cpu-latest
-docker run --gpus all --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size=2g gluonai/gluon-nlp:cpu-latest
+docker run --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size=2g gluonai/gluon-nlp:cpu-latest
 ``` 
 
 For more details, you can refer to the guidance in [tools/docker](tools/docker).
@@ -39,7 +39,21 @@ It will generate csv files with `gluonnlp_` as the prefix
 ├── gluonnlp_train_fp32_NT_NT.csv
 ├── gluonnlp_train_fp32_NT_TN.csv
 ├── gluonnlp_train_fp32_TN_TN.csv
-├── gluonnlp_infer_fp32_NT_NT.csv
-├── gluonnlp_infer_fp32_NT_TN.csv
-├── gluonnlp_infer_fp32_TN_TN.csv
+├── gluonnlp_infer_fp32_NT_NT_tvm0.csv
+├── gluonnlp_infer_fp32_NT_TN_tvm0.csv
+├── gluonnlp_infer_fp32_TN_TN_tvm0.csv
+```
+
+## GluonNLP + TVM for Inference
+
+Install TVM as described in the [TVM installation guide](https://tvm.apache.org/docs/install/index.html), then run:
+
+```bash
+bash benchmark_gluonnlp_tvm.sh
+```
+
+It will generate csv files with `_tvm1` as the suffix
+
+```
+├── gluonnlp_infer_fp32_NT_NT_tvm1.csv
+├── gluonnlp_infer_fp32_NT_TN_tvm1.csv
+├── gluonnlp_infer_fp32_TN_TN_tvm1.csv
 ```
@@ -54,12 +54,17 @@ def get_parser():
                         help='The layout of the computation')
     parser.add_argument('--compute_layout', type=str, default=None,
                         help='The compute layout of the computation')
+    parser.add_argument('--use_tvm', action='store_true',
+                        help='Whether to use TVM for inference/training')
+    parser.add_argument('--instance_type', choices=['c4', 'c5', 'g4', 'p3'], default='g4',
+                        help='The instance type that the profiling script will be run on.')
     parser.add_argument('--mode', type=str, default='train',
                         choices=['train', 'inference'])
     return parser
 
 
-def run_benchmark(workload, model_name, out_file_name, is_train):
+def run_benchmark(workload, model_name, out_file_name, is_train,
+                  use_tvm, instance_type):
     if is_train:
         benchmark = GluonNLPBackboneBenchmark(
             workloads=workload,
@@ -75,6 +80,8 @@ def run_benchmark(workload, model_name, out_file_name, is_train):
             model_names=model_name,
             profile_inference=True,
             profile_train=False,
+            use_tvm=use_tvm,
+            instance_type=instance_type,
             to_csv=True,
             inference_out_csv_file=out_file_name)
         benchmark.run()
@@ -93,7 +100,7 @@ def run_benchmark(workload, model_name, out_file_name, is_train):
         else:
             profile_models = [ele for ele in MODELS]
         if args.mode == 'inference':
-            out_dir = 'infer_fp32_{}_{}'.format(layout, compute_layout)
+            out_dir = 'infer_fp32_{}_{}_tvm{}'.format(layout, compute_layout, int(args.use_tvm))
             df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length',
                                        'latency', 'memory'])
             os.makedirs(out_dir, exist_ok=True)
@@ -103,12 +110,15 @@ def run_benchmark(workload, model_name, out_file_name, is_train):
                                                                            workload[1]))
                     process = Process(
                         target=run_benchmark,
-                        args=(workload, model_name, out_path, False))
+                        args=(workload, model_name, out_path, False,
+                              args.use_tvm, args.instance_type))
                     process.start()
                     process.join()
                     new_df = pd.read_csv(out_path)
                     df = df.append(new_df, ignore_index=True)
-                    df.to_csv('gluonnlp_infer_fp32_{}_{}.csv'.format(layout, compute_layout))
+                    df.to_csv('gluonnlp_infer_fp32_{}_{}_tvm{}.csv'.format(layout,
+                                                                           compute_layout,
+                                                                           int(args.use_tvm)))
         elif args.mode == 'train':
             out_dir = 'train_fp32_{}_{}'.format(layout, compute_layout)
             df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length',

@@ -0,0 +1,3 @@
+python3 benchmark_gluonnlp.py --layout NT --compute_layout NT --mode inference --use_tvm --instance_type g4
+python3 benchmark_gluonnlp.py --layout NT --compute_layout TN --mode inference --use_tvm --instance_type g4
+python3 benchmark_gluonnlp.py --layout TN --compute_layout TN --mode inference --use_tvm --instance_type g4
@@ -17,7 +17,8 @@
 import numpy as np
 import gluonnlp
 from gluonnlp.models import get_backbone
-from gluonnlp.utils.misc import logging_config
+from gluonnlp.utils.misc import logging_config, get_ec2_tvm_flags
+from gluonnlp.utils.lazy_imports import try_import_tvm
 from collections import defaultdict, namedtuple
 from datetime import datetime
 import multiprocessing as mp
@@ -603,22 +604,127 @@ def bytes_to_mega_bytes(memory_amount: int) -> int:
     return memory_amount >> 20
 
 
+_TVM_RT_CACHE = dict()
+
+
+def compile_tvm_graph_runtime(model, model_name, layout, compute_layout,
+                              batch_size, seq_length, dtype, instance_type):
+    key = (model_name, layout, compute_layout, batch_size, seq_length, dtype, instance_type)
+    if key in _TVM_RT_CACHE:
+        return _TVM_RT_CACHE[key]
+    flags = get_ec2_tvm_flags()[instance_type]
+    tvm = try_import_tvm()
+    from tvm import relay
+    from tvm.contrib import graph_runtime
+    token_ids_shape = (batch_size, seq_length) if layout == 'NT' else (seq_length, batch_size)
+    valid_length_shape = (batch_size,)
+    if 'bart' in model_name:
+        shape_dict = {
+            'data0': token_ids_shape,
+            'data1': valid_length_shape,
+            'data2': token_ids_shape,
+            'data3': valid_length_shape,
+        }
+        dtype_dict = {
+            'data0': 'int32',
+            'data1': 'int32',
+            'data2': 'int32',
+            'data3': 'int32',
+        }
+    elif 'roberta' in model_name or 'xlmr' in model_name:
+        shape_dict = {
+            'data0': token_ids_shape,
+            'data1': valid_length_shape,
+        }
+        dtype_dict = {
+            'data0': 'int32',
+            'data1': 'int32',
+        }
+    else:
+        shape_dict = {
+            'data0': token_ids_shape,
+            'data1': token_ids_shape,
+            'data2': valid_length_shape,
+        }
+        dtype_dict = {
+            'data0': 'int32',
+            'data1': 'int32',
+            'data2': 'int32'
+        }
+    sym = model._cached_graph[1]
+    params = {}
+    for k, v in model.collect_params().items():
+        params[v._var_name] = tvm.nd.array(v.data().asnumpy())
+    mod, params = relay.frontend.from_mxnet(sym, shape=shape_dict, dtype=dtype_dict, arg_params=params)
+    target = flags['target']
+    use_gpu = flags['use_gpu']
+    opt_level = flags['opt_level']
+    required_pass = flags['required_pass']
+    with tvm.transform.PassContext(opt_level=opt_level, required_pass=required_pass):
+        lib = relay.build(mod, target, params=params)
+    if use_gpu:
+        ctx = tvm.gpu()
+    else:
+        ctx = tvm.cpu()
+    rt = graph_runtime.GraphModule(lib["default"](ctx))
+    _TVM_RT_CACHE[key] = rt
+    return rt
+
+
 class GluonNLPBackboneBenchmark:
-    """
-    Benchmarks is a simple but feature-complete benchmarking script
+    """Benchmarks is a simple but feature-complete benchmarking script
     to compare memory and time performance of models in Transformers.
     """
     def __init__(self, workloads, model_names, use_fp16=False,
-                 repeat=3, use_gpu=True, device_idx=0,
+                 repeat=3, use_gpu=True,
+                 device_idx=0,
                  profile_inference=True,
                  profile_train=True,
                  env_print=True,
                  to_csv=False,
+                 use_tvm=False,
+                 instance_type=None,
                  layout='NT',
                  compute_layout='auto',
                  inference_out_csv_file='inference_time_memory.csv',
                  train_out_csv_file='train_time_memory.csv',
                  env_info_file='env_info.csv'):
+        """
+
+        Parameters
+        ----------
+        workloads
+            List of workloads to profile
+        model_names
+            List of model names to profile
+        use_fp16
+            Whether to use fp16
+        repeat
+            The number of repeat
+        use_gpu
+            Whether to use GPU
+        device_idx
+            The GPU ID
+        profile_inference
+            Whether to profile inference
+        profile_train
+            Whether to profile training
+        env_print
+            Whether to print the environment
+        to_csv
+            Whether to dump to csv file
+        use_tvm
+            Whether to use TVM to accelerate the inference
+        instance_type
+            Type of the instance. This will only be used to set the
+            TVM compilation flags when `use_tvm` is enabled
+        layout
+            The input + output layout
+        compute_layout
+            The computation layout
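+        inference_out_csv_file
+            Path of the csv file that stores the inference profiling results
+        train_out_csv_file
+            Path of the csv file that stores the training profiling results
+        env_info_file
+            Path of the csv file that stores the environment information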
+        """
         self._workloads = workloads
         if not isinstance(workloads, list):
             workloads = [workloads]
@@ -635,6 +741,8 @@ def __init__(self, workloads, model_names, use_fp16=False,
         self._profile_train = profile_train
         self._env_print = env_print
         self._to_csv = to_csv
+        self._use_tvm = use_tvm
+        self._instance_type = instance_type
         self._layout = layout
         self._compute_layout = compute_layout
         self._inference_out_csv_file = inference_out_csv_file
@@ -699,9 +807,40 @@ def run_forward():
             else:
                 out.wait_to_read()
 
-        timeit.repeat(run_forward, repeat=1, number=3)
-        runtimes = timeit.repeat(run_forward, repeat=self._repeat, number=3)
-        mxnet.npx.waitall()
+        if self._use_tvm:
+            tvm = try_import_tvm()
+            run_forward()
+            if self._use_gpu:
+                ctx = tvm.gpu()
+            else:
+                ctx = tvm.cpu()
+            rt = compile_tvm_graph_runtime(model=model, model_name=model_name,
+                                           layout=self._layout, compute_layout=self._compute_layout,
+                                           batch_size=batch_size, seq_length=sequence_length,
+                                           instance_type=self._instance_type,
+                                           dtype='float32' if not self._use_fp16 else 'float16')
+            tvm_input_ids = tvm.nd.array(input_ids.asnumpy(), ctx=ctx)
+            tvm_token_types = tvm.nd.array(token_types.asnumpy(), ctx=ctx)
+            tvm_valid_length = tvm.nd.array(valid_length.asnumpy(), ctx=ctx)
+
+            def run_tvm_forward():
+                if 'roberta' in model_name or 'xlmr' in model_name:
+                    rt.set_input(data0=tvm_input_ids, data1=tvm_valid_length)
+                elif 'bart' in model_name:
+                    rt.set_input(data0=tvm_input_ids, data1=tvm_valid_length)
+                else:
+                    rt.set_input(data0=tvm_input_ids, data1=tvm_token_types,
+                                 data2=tvm_valid_length)
+                rt.run()
+                for i in range(rt.get_num_outputs()):
+                    out = rt.get_output(i)
+            # Warmup
+            timeit.repeat(run_tvm_forward, repeat=1, number=2)
+            runtimes = timeit.repeat(run_tvm_forward, repeat=self._repeat, number=3)
+        else:
+            timeit.repeat(run_forward, repeat=1, number=3)
+            runtimes = timeit.repeat(run_forward, repeat=self._repeat, number=3)
+            mxnet.npx.waitall()
         # Profile memory
         if self._use_gpu:
             nvml.nvmlInit()

@@ -328,7 +328,6 @@ def __init__(self,
             dtype=dtype,
             layout=self._compute_layout
         )
-        self.encoder.hybridize()
         # Construct word embedding
         self.word_embed = nn.Embedding(input_dim=vocab_size,
                                        output_dim=embed_size,
@@ -578,7 +577,6 @@ def __init__(self, backbone_cfg,
                                       flatten=False,
                                       bias_initializer=bias_initializer))
         self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight
-        self.mlm_decoder.hybridize()
 
     @property
     def layout(self):
@@ -674,7 +672,6 @@ def __init__(self, backbone_cfg,
                                       flatten=False,
                                       bias_initializer=bias_initializer))
         self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight
-        self.mlm_decoder.hybridize()
 
     @property
     def layout(self):