diff --git a/.pylintrc b/.pylintrc index a913db5e..2884d8e6 100644 --- a/.pylintrc +++ b/.pylintrc @@ -13,7 +13,7 @@ ignore=CVS # Add files or directories matching the regex patterns to the blacklist. The # regex matches against base names, not paths. -ignore-patterns= +ignore-patterns=.*_test.py # Pickle collected data for later comparisons. persistent=yes @@ -65,7 +65,20 @@ confidence= # --enable=similarities". If you want to run only the classes checker, but have # no Warning level messages displayed, use"--disable=all --enable=classes # --disable=W" -disable=print-statement,parameter-unpacking,unpacking-in-except,old-raise-syntax,backtick,import-star-module-level,apply-builtin,basestring-builtin,buffer-builtin,cmp-builtin,coerce-builtin,execfile-builtin,file-builtin,long-builtin,raw_input-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,no-absolute-import,old-division,dict-iter-method,dict-view-method,next-method-called,metaclass-assignment,indexing-exception,raising-string,reload-builtin,oct-method,hex-method,nonzero-method,cmp-method,input-builtin,round-builtin,intern-builtin,unichr-builtin,map-builtin-not-iterating,zip-builtin-not-iterating,range-builtin-not-iterating,filter-builtin-not-iterating,using-cmp-argument,long-suffix,old-ne-operator,old-octal-literal,suppressed-message,useless-suppression +disable=invalid-name,too-many-branches,too-many-statements,too-many-arguments, # Unnecessarily strict checks + too-many-instance-attributes,too-few-public-methods,too-many-locals, + too-many-lines,too-many-return-statements,too-many-boolean-expressions, + too-many-ancestors, + fixme, + no-else-return,no-else-raise,len-as-condition,unnecessary-pass, # Not exactly good conventions + bad-continuation,inconsistent-return-statements,stop-iteration-return, + no-member,not-callable,invalid-unary-operand-type,arguments-differ, # We have mypy for this + no-name-in-module,unsubscriptable-object,import-error, + access-member-before-definition, + redefined-builtin,abstract-method,missing-docstring, # Too many false positives + no-self-use, # (cannot ignore overridden methods) + unused-wildcard-import, # (https://github.com/rogalski/astroid/commit/82c6ef644a2efb77217a23d9b8a6cfb5caffb4ba) + duplicate-code, # (will be fixed in next release) [REPORTS] @@ -82,7 +95,7 @@ output-format=text files-output=no # Tells whether to display a full report or only the messages -reports=yes +reports=no # Python expression which should return a note less than 10 (10 is the highest # note). You have access to the variables errors warning, statement which @@ -253,7 +266,7 @@ ignore-comments=yes ignore-docstrings=yes # Ignore imports when computing similarities. -ignore-imports=no +ignore-imports=yes [VARIABLES] @@ -263,7 +276,7 @@ init-import=no # A regular expression matching the name of dummy variables (i.e. expectedly # not used). -dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy +dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy|unused # List of additional names supposed to be defined in builtins. Remember that # you should avoid to define new builtins when possible. @@ -291,7 +304,8 @@ logging-modules=logging max-line-length=80 # Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=^\s*(# )?<?https?://\S+>?$ +# This regex matches URLs and link anchors. +ignore-long-lines=^\s*((# )?`?<?https?://\S+>?|\.\. _`.*`:|`.*`_)$ # Allow the body of an if to be on the same line as the test if there is no # else. 
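The `ignore-long-lines` patterns above are reconstructed from context (the URL part follows pylint's stock default, which the extraction had garbled). A quick way to sanity-check the new pattern is to run it over a few representative lines; the sample lines below are made up for illustration:

```python
import re

# A long line is exempt from the 80-character limit when it is a (possibly
# commented) URL, or a reST link anchor/reference. Pattern as reconstructed
# above; treat it as an assumption, not the repo's verbatim config.
IGNORE_LONG_LINES = re.compile(r"^\s*((# )?`?<?https?://\S+>?|\.\. _`.*`:|`.*`_)$")

samples = [
    "# https://github.com/asyml/texar/pull/225",   # commented URL  -> exempt
    ".. _`a rather long section anchor name`:",    # link anchor    -> exempt
    "`a rather long inline link reference`_",      # link reference -> exempt
    "result = some_function(argument_one, arg2)",  # ordinary code  -> flagged
]
for line in samples:
    print(bool(IGNORE_LONG_LINES.match(line)), line)
```

Any line the pattern matches is skipped by the line-length check; everything else is still held to `max-line-length=80`.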
@@ -314,7 +328,7 @@ indent-string=' ' indent-after-paren=4 # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. -expected-line-ending-format= +expected-line-ending-format=LF [DESIGN] @@ -367,7 +381,7 @@ valid-metaclass-classmethod-first-arg=mcs # List of member names, which should be excluded from the protected access # warning. -exclude-protected=_asdict,_fields,_replace,_source,_make +exclude-protected=_asdict,_fields,_replace,_source,_make,_get_name [IMPORTS] @@ -399,6 +413,8 @@ known-third-party=enchant # only in one or another interpreter, leading to false positives when analysed. analyse-fallback-blocks=no +allow-wildcard-with-all=yes + [EXCEPTIONS] diff --git a/.readthedocs.yml b/.readthedocs.yml index 5ab57270..023b56fc 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,6 +1,6 @@ version: 2 python: - version: 3.6 + version: 3.7 install: - requirements: docs/requirements.txt diff --git a/.travis.yml b/.travis.yml index 00618763..592f0b6e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,26 +10,35 @@ python: install: - pip install --upgrade pip - pip install --progress-bar off .[tensorflow-cpu] - - pip install flake8==3.7.7 + - pip install pylint==2.3.1 flake8==3.7.7 - pip install pytest + - pip install coverage codecov script: # Linting + - pylint texar/ examples/ - flake8 texar/ examples/ # Unit tests - pytest + # Test coverage + - coverage run -m pytest -jobs: - include: - - stage: docs - python: "3.7" - install: - - pip install --upgrade pip - - pip install --progress-bar off -r docs/requirements.txt - script: - - cd docs - # Build documents - - sphinx-build -b html -d _build/doctrees . _build/html +after_success: + - codecov + +# jobs: +# include: +# - stage: docs +# python: "3.7" +# install: +# - pip install --upgrade pip +# - pip install --progress-bar off -r docs/requirements.txt +# script: +# - cd docs +# # Build documents +# - sphinx-build -W -b html -d _build/doctrees . _build/html +# # Check for typos +# - sphinx-build -W -b spelling -d _build/doctrees . _build/spelling notifications: email: false diff --git a/CHANGELOG.md b/CHANGELOG.md index 5bfaf33f..87473041 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,12 +2,32 @@ ### New features -* Support only Python 3.6 and 3.7. Drop support of older Python versions. +* Support TensorFlow 2.0. ### Feature improvements ### Fixes +## [v0.2.4](https://github.com/asyml/texar/releases/tag/v0.2.4) (2019-11-18) + +### New features + +* Support only Python 3.6 and 3.7. Drop support of older Python versions. ([#211](https://github.com/asyml/texar/pull/211)) +* Add Tokenizers including tokenizers for pretrained models (`BERTTokenizer`, `XLNetTokenizer`, etc). ([#225](https://github.com/asyml/texar/pull/225)) +* Add GPT2 modules (`GPT2Encoder`, `GPT2Decoder`, `GPT2Classifier`, etc). ([#228](https://github.com/asyml/texar/pull/228)) + +### Feature improvements + +* Update embedder modules `dropout_strategy=='item'` to support TensorFlow v1.15. ([#231](https://github.com/asyml/texar/pull/231)) +* Update `.gitignore` and add `.gitignore` files to all examples. ([#233](https://github.com/asyml/texar/pull/233)) +* Polish code style according to flake8. ([#234](https://github.com/asyml/texar/pull/234)) +* Add GPT2 XL pretrained checkpoint. ([#243](https://github.com/asyml/texar/pull/243)) + +### Fixes + +* Fix `examples/transformer/scripts/wmt14_en_de.sh` to create output dir automatically. 
([#238](https://github.com/asyml/texar/pull/238)) +* Fix variable scope issue in `texar.tf.modules.decoders.dynamic_decode`. ([#246](https://github.com/asyml/texar/pull/246)) + ## [v0.2.3](https://github.com/asyml/texar/releases/tag/v0.2.3) (2019-09-22) ### New features diff --git a/LICENSE b/LICENSE index 8dada3ed..261eeb9e 100644 --- a/LICENSE +++ b/LICENSE @@ -178,7 +178,7 @@ APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "{}" + boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright {yyyy} {name of copyright owner} + Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/README.md b/README.md index 421235e3..3d3e6fb0 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,8 @@ [![Build Status](https://travis-ci.org/asyml/texar.svg?branch=master)](https://travis-ci.org/asyml/texar) [![Documentation Status](https://readthedocs.org/projects/texar/badge/?version=latest)](https://texar.readthedocs.io/en/latest/?badge=latest) [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/asyml/texar/blob/master/LICENSE) + +# Texar TF2.0 is currently under incubation; please check back soon. **Texar** is a toolkit aiming to support a broad set of machine learning, especially natural language processing and text generation tasks. Texar provides a library of easy-to-use ML modules and functionalities for composing whatever models and algorithms. The tool is designed for both researchers and practitioners for fast prototyping and experimentation. @@ -18,13 +20,14 @@ * **Two Versions, (Mostly) Same Interfaces**. Texar-TensorFlow (this repo) and **[Texar-PyTorch](https://github.com/asyml/texar-pytorch)** have mostly the same interfaces. Both further combine the best design of TF and PyTorch: - Interfaces and variable sharing in *PyTorch convention* - Excellent factorization and rich functionalities in *TF convention*. -* **Rich Pre-trained Models, Rich Usage with Uniform Interfaces**. BERT, GPT2, XLNet, etc, for encoding, classification, generation, and composing complex models with other Texar components! -* **Fully Customizable** at multiple abstraction level -- both novice-friendly and expert-friendly. - - Free to plug in whatever external modules, since Texar is fully compatible with the native TF/PyTorch APIs. -* **Versatile** to support broad tasks, models, algorithms, data processing, evaluation, etc. +* **Versatile** to support broad needs: + - data processing, model architectures, loss functions, training and inference algorithms, evaluation, ... - encoder(s) to decoder(s), sequential- and self-attentions, memory, hierarchical models, classifiers... - maximum likelihood learning, reinforcement learning, adversarial learning, probabilistic modeling, ... +* **Fully Customizable** at multiple abstraction levels -- both novice-friendly and expert-friendly. + - Free to plug in whatever external modules, since Texar is fully compatible with the native TensorFlow/PyTorch APIs. 
* **Modularized** for maximal re-use and clean APIs, based on principled decomposition of *Learning-Inference-Model Architecture*. +* **Rich Pre-trained Models, Rich Usage with Uniform Interfaces**. BERT, GPT2, XLNet, etc, for encoding, classification, generation, and composing complex models with other Texar components! * **Distributed** model training with multiple GPUs. * Clean, detailed [documentation](https://texar.readthedocs.io) and rich [examples](./examples). @@ -33,8 +36,13 @@
 <div align="center">
+  <img src="./docs/_static/img/texar_modules_big.png"><br><br>
+</div>
+
+ ### Library API Example Builds an encoder-decoder model, with maximum likelihood learning: + ```python import texar.tf as tx @@ -72,6 +80,7 @@ outputs_bs, _, _ = tx.modules.beam_search_decode( end_token=data.target_vocab.eos_token_id) ``` The same model, but with adversarial learning: + ```python helper = tx.modules.GumbelSoftmaxTraingHelper( # Gumbel-softmax decoding start_tokens=[BOS]*batch_size, end_token=EOS, embedding=embedder) @@ -85,6 +94,7 @@ G_loss, D_loss = tx.losses.binary_adversarial_losses( discriminator_fn=discriminator) ``` The same model, but with RL policy gradient learning: + ```python agent = tx.agents.SeqPGAgent(samples=outputs.sample_id, logits=outputs.logits, @@ -99,16 +109,18 @@ Many more examples are available [here](./examples) Texar requires: -* `tensorflow >= 1.10.0 (but < 2.0.0)`. Follow the [tensorflow official instructions](https://www.tensorflow.org/install) to install the appropriate version -* `tensorflow_probability >= 0.3.0 (but < 0.8.0)`. Follow the [tensorflow_probability official instractions](https://www.tensorflow.org/probability/install) to install. +* `tensorflow >= 2.0.0`. Follow the [tensorflow official instructions](https://www.tensorflow.org/install) to install the appropriate version +* `tensorflow_probability >= 0.3.0`. Follow the [tensorflow_probability official instructions](https://www.tensorflow.org/probability/install) to install. After `tensorflow` and `tensorflow_probability` are installed, install Texar from PyPI: + ```bash pip install texar ``` To use cutting-edge features or develop locally, install from source: - ``` + +```bash git clone https://github.com/asyml/texar.git cd texar pip install . @@ -120,6 +132,7 @@ pip install . ### Reference If you use Texar, please cite the [tech report](https://arxiv.org/abs/1809.00794) with the following BibTex entry: + ``` Texar: A Modularized, Versatile, and Extensible Toolkit for Text Generation Zhiting Hu, Haoran Shi, Bowen Tan, Wentao Wang, Zichao Yang, Tiancheng Zhao, Junxian He, Lianhui Qin, Di Wang, Xuezhe Ma, Zhengzhong Liu, Xiaodan Liang, Wanrong Zhu, Devendra Sachan and Eric Xing diff --git a/docs/_static/img/texar_modules_big.png b/docs/_static/img/texar_modules_big.png new file mode 100644 index 00000000..3404d28c Binary files /dev/null and b/docs/_static/img/texar_modules_big.png differ diff --git a/examples/README.md b/examples/README.md index 5779e249..3ed27c73 100644 --- a/examples/README.md +++ b/examples/README.md @@ -6,104 +6,24 @@ More examples are continuously added... 
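As a quick check after following the installation steps in the README diff above, the snippet below verifies that the stack imports and that modules mentioned in this changeset are present. It is a minimal sketch, not part of the repo; the probed names are taken from the examples and changelog above, and `hasattr` keeps it safe if a name is absent:

```python
# Post-install smoke test; assumes `tensorflow` and `texar` are installed as above.
import tensorflow as tf
import texar.tf as tx

print("TensorFlow version:", tf.__version__)
# Modules referenced elsewhere in this changeset should be importable:
for name in ("BERTEncoder", "GPT2Decoder", "beam_search_decode"):
    print(name, "available:", hasattr(tx.modules, name))
```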
## Examples by Models/Algorithms ## -### RNN / Seq2seq ### - -* [language_model_ptb](./language_model_ptb): Basic RNN language model -* [distributed_gpu](./distributed_gpu): Basic RNN language model with distributed training -* [seq2seq_attn](./seq2seq_attn): Attentional seq2seq -* [seq2seq_configs](./seq2seq_configs): Seq2seq implemented with Texar model template -* [seq2seq_rl](./seq2seq_rl): Attentional seq2seq trained with policy gradient -* [seq2seq_exposure_bias](./seq2seq_exposure_bias): Various algorithms tackling exposure bias in sequence generation -* [hierarchical_dialog](./hierarchical_dialog): Hierarchical recurrent encoder-decoder model for conversation response generation -* [torchtext](./torchtext): Use of torchtext data loader - ### Transformer (Self-attention) ### -* [transformer](./transformer): Transformer for machine translation * [bert](./bert): Pre-trained BERT model for text representation -* [gpt-2](./gpt-2): Pre-trained OpenAI GPT-2 language model -* [vae_text](./vae_text): VAE with a transformer decoder for improved language modeling - -### Variational Autoencoder (VAE) ### - -* [vae_text](./vae_text): VAE language model - -### GANs / Discriminiator-supervision ### - -* [seqGAN](./seqgan): GANs for text generation -* [text_style_transfer](./text_style_transfer): Discriminator supervision for controlled text generation - -### Reinforcement Learning ### - -* [seq2seq_rl](./seq2seq_rl): Attentional seq2seq trained with policy gradient. -* [seqGAN](./seqgan): Policy gradient for sequence generation -* [rl_gym](./rl_gym): Various RL algoritms for games on OpenAI Gym -### Memory Network ### - -* [memory_network_lm](./memory_network_lm): End-to-end memory network for language modeling - -### Classifier / Sequence Prediction ### +### Classifier / Sequence Prediction ### * [bert](./bert): Pre-trained BERT model for text representation -* [sentence_classifier](./sentence_classifier): Basic CNN-based sentence classifier -* [sequence_tagging](./sequence_tagging): BiLSTM-CNN model for Named Entity Recognition (NER) - -### Reward Augmented Maximum Likelihood (RAML) ### - -* [seq2seq_exposure_bias](./seq2seq_exposure_bias): RAML and other learning algorithms for sequence generation --- ## Examples by Tasks -### Language Modeling ### - -* [gpt-2](./gpt-2): Pre-trained OpenAI GPT-2 language model -* [language_model_ptb](./language_model_ptb): Basic RNN language model -* [vae_text](./vae_text): VAE language model -* [seqGAN](./seqgan): GAN + policy gradient -* [memory_network_lm](./memory_network_lm): End-to-end memory network for language modeling - -### Machine Translation ### - -* [seq2seq_attn](./seq2seq_attn): Attentional seq2seq -* [seq2seq_configs](./seq2seq_configs): Seq2seq implemented with Texar model template. -* [seq2seq_rl](./seq2seq_rl): Attentional seq2seq trained with policy gradient. -* [seq2seq_exposure_bias](./seq2seq_exposure_bias): Various algorithms tackling exposure bias in sequence generation (MT and summarization as examples). -* [transformer](./transformer): Transformer for machine translation - -### Dialog ### - -* [hierarchical_dialog](./hierarchical_dialog): Hierarchical recurrent encoder-decoder model for conversation response generation. - -### Text Summarization ### - -* [seq2seq_exposure_bias](./seq2seq_exposure_bias): Various algorithms tackling exposure bias in sequence generation (MT and summarization as examples). 
- -### Text Style Transfer ### - -* [text_style_transfer](./text_style_transfer): Discriminator supervision for controlled text generation - ### Classification ### * [bert](./bert): Pre-trained BERT model for text representation -* [sentence_classifier](./sentence_classifier): Basic CNN-based sentence classifier - -### Sequence Tagging ### - -* [sequence_tagging](./sequence_tagging): BiLSTM-CNN model for Named Entity Recognition (NER) - -### Games ### - -* [rl_gym](./rl_gym): Various RL algoritms for games on OpenAI Gym - ---- ## MISC ## ### Distributed training ### -* [distributed_gpu](./distributed_gpu): Basic example of distributed training. -* [bert](./bert): Distributed training of BERT. - +* [bert](./bert): Distributed training of BERT. \ No newline at end of file diff --git a/examples/bert/README.md b/examples/bert/README.md deleted file mode 100644 index 7c097b65..00000000 --- a/examples/bert/README.md +++ /dev/null @@ -1,131 +0,0 @@ -# BERT: Pre-trained models and downstream applications - -This is a Texar implementation of Google's BERT model, which allows to load pre-trained model parameters downloaded from the [official release](https://github.com/google-research/bert) and build/fine-tune arbitrary downstream applications with **distributed training** (This example showcases BERT for sentence classification). - -Texar provides ready-to-use modules including -[`BERTEncoder`](https://texar.readthedocs.io/en/latest/code/modules.html#bertencoder), -and [`BERTClassifier`](https://texar.readthedocs.io/en/latest/code/modules.html#bertclassifier). -This example shows the use of `BERTClassifier` for sentence classification tasks. - -In sum, this example showcases: - -* Use of pre-trained Google BERT models in Texar -* Building and fine-tuning on downstream tasks -* Distributed training of the models -* Use of Texar `TFRecordData` module for data loading and processing - -## Quick Start - -### Download Dataset - -We explain the use of the example code based on the Microsoft Research Paraphrase Corpus (MRPC) corpus for sentence classification. - -Download the data with the following command: - -``` -python data/download_glue_data.py --tasks=MRPC -``` - -By default, it will download the MRPC dataset into the `data` directory. FYI, the MRPC dataset is part of the [GLUE](https://gluebenchmark.com/tasks) dataset collection. - -### Prepare data - -We first preprocess the downloaded raw data into [TFRecord](https://www.tensorflow.org/tutorials/load_data/tf_records) files. The preprocessing tokenizes raw text with BPE encoding, truncates sequences, adds special tokens, etc. -Run the following command to this end: - -``` - python prepare_data.py --task=MRPC - [--max_seq_length=128] - [--pretrained_model_name=bert-base-uncased] - [--tfrecord_output_dir=data/MRPC] -``` - -- `--task`: Specifies the dataset name to preprocess. BERT provides default support for `{'CoLA', 'MNLI', 'MRPC', 'XNLI', 'SST'}` data. -- `--max_seq_length`: The maxium length of sequence. This includes BERT special tokens that will be automatically added. Longer sequence will be trimmed. -- `--pretrained_model_name`: The name of pre-trained BERT model. See the [doc](https://texar.readthedocs.io/en/latest/code/modules.html#texar.tf.modules.PretrainedBERTMixin) for all supported models. -- `--tfrecord_output_dir`: The output path where the resulting TFRecord files will be put in. Be default, it is set to `data/{task}` where `{task}` is the (upper-cased) dataset name specified in `--task` above. 
So in the above cmd, the TFRecord files are output to `data/MRPC`. - -**Outcome of the Preprocessing**: - -- The preprocessing will output 3 TFRecord data files `{train.tf_record, eval.tf_record, test.tf_record}` in the specified output directory. - -- The command also prints logs as follows: - - ``` - INFO:tensorflow:Loading data - INFO:tensorflow:num_classes:2; num_train_data:3668 - INFO:tensorflow:config_data.py has been updated - INFO:tensorflow:Data preparation finished - ``` - **Note that** the data info `num_classes` and `num_train_data`, as well as `max_seq_length` specified in the cmd, are required for BERT training in the following. They should be specified in the data configuration file passed to BERT training (see below). -- For convenience, the above cmd automatically writes `num_classes`, `num_train_data` and `max_seq_length` to `config_data.py`. - -### Train and Evaluate - -For **single-GPU** training (and evaluation), run the following cmd. The training updates the classification layer and fine-tunes the pre-trained BERT parameters. - -``` - python bert_classifier_main.py --do_train --do_eval - [--config_downstream=config_classifier] - [--config_data=config_data] - [--output_dir=output] -``` -Here: - -- `config_downstream`: Configuration of the downstream part. In this example, [`config_classifier`](./config_classifier.py) configures the classification layer and the optimization method. -- `config_data`: The data configuration. See the default [`config_data.py`](./config_data.py) for example. Make sure to specify `num_classes`, `num_train_data`, `max_seq_length`, and `tfrecord_data_dir` as used or output in the above [data preparation](#prepare-data) step. -- `output_dir`: The output path where checkpoints and TensorBoard summaries are saved. -- `pretrained_model_name`: The name of pre-trained BERT model. See the [doc](https://texar.readthedocs.io/en/latest/code/modules.html#texar.tf.modules.PretrainedBERTMixin) for all supported models. - - -For **Multi-GPU training** on one or multiple machines, you may first install the prerequisite OpenMPI and Hovorod packages, as detailed in the [distributed_gpu](https://github.com/asyml/texar/tree/master/examples/distributed_gpu) example. - -Then run the following cmd for training and evaluation. The cmd trains the model on local with 2 GPUs. Evaluation is performed with the single rank-0 GPU. - -``` -mpirun -np 2 \ - -H localhost:2\ - -bind-to none -map-by slot \ - -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH \ - -mca pml ob1 -mca btl tcp,self \ - -mca btl_tcp_if_include ens3 \ - python bert_classifier_main.py --do_train --do_eval --distributed - [--config_downstream=config_classifier] - [--config_data=config_data] - [--output_dir=output] -``` -The key configurations of multi-gpu training: - -* `-np`: total number of processes -* `-H`: IP addresses of different servers and the number of processes used in each server. For example, `-H 192.168.11.22:1,192.168.33.44:1` - -Please refer to [distributed_gpu](https://github.com/asyml/texar/tree/master/examples/distributed_gpu) example for more details of the other multi-gpu configurations. - -Make sure to specifiy the `--distributed` flag as above for multi-gpu training. - -After convergence, the evaluation performance is around the following. Due to certain randomness (e.g., random initialization of the classification layer), the evaluation accuracy is reasonable as long as it's `>0.84`. 
- -``` -INFO:tensorflow:dev accu: 0.8676470588235294 -``` - -### Restore and Test - -`` -python bert_classifier_main.py --do_test --checkpoint=output/model.ckpt -`` - -The output is by default saved in `output/test_results.tsv`, where each line contains the predicted label for each sample. - - -## Use other datasets/tasks - -`bert_classifier_main.py` also support other datasets/tasks. To do this, specify a different value to the `--task` flag when running [data preparation](#prepare-data). - -For example, use the following commands to download the SST (Stanford Sentiment Treebank) dataset and run for sentence classification. Make sure to specify the correct data path and other info in the data configuration file. - -``` -python data/download_glue_data.py --tasks=SST -python prepare_data.py --task=SST -python bert_classifier_main.py --do_train --do_eval --config_data=config_data -``` diff --git a/examples/bert/bert_classifier_main.py b/examples/bert/bert_classifier_main.py deleted file mode 100644 index 040c8360..00000000 --- a/examples/bert/bert_classifier_main.py +++ /dev/null @@ -1,258 +0,0 @@ -# Copyright 2019 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Example of building a sentence classifier based on pre-trained BERT model. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import importlib -import tensorflow as tf -import texar.tf as tx - -from utils import model_utils - -# pylint: disable=invalid-name, too-many-locals, too-many-statements - -flags = tf.flags - -FLAGS = flags.FLAGS - -flags.DEFINE_string( - "config_downstream", "config_classifier", - "Configuration of the downstream part of the model.") -flags.DEFINE_string( - "pretrained_model_name", 'bert-base-uncased', - "The name of pre-trained BERT model. See the doc of " - "`texar.tf.modules.PretrainedBERTMixin for all supported models.`") -flags.DEFINE_string( - "config_data", "config_data", - "The dataset config.") -flags.DEFINE_string( - "output_dir", "output/", - "The output directory where the model checkpoints will be written.") -flags.DEFINE_string( - "checkpoint", None, - "Path to a model checkpoint (including bert modules) to restore from.") -flags.DEFINE_bool("do_train", False, "Whether to run training.") -flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") -flags.DEFINE_bool("do_test", False, "Whether to run test on the test set.") -flags.DEFINE_bool("distributed", False, "Whether to run in distributed mode.") - -config_data = importlib.import_module(FLAGS.config_data) -config_downstream = importlib.import_module(FLAGS.config_downstream) - - -def main(_): - """ - Builds the model and runs. 
- """ - if FLAGS.distributed: - import horovod.tensorflow as hvd - hvd.init() - - tf.logging.set_verbosity(tf.logging.INFO) - - tx.utils.maybe_create_dir(FLAGS.output_dir) - - # Loads data - num_train_data = config_data.num_train_data - - # Configures distribued mode - if FLAGS.distributed: - config_data.train_hparam["dataset"]["num_shards"] = hvd.size() - config_data.train_hparam["dataset"]["shard_id"] = hvd.rank() - config_data.train_hparam["batch_size"] //= hvd.size() - - train_dataset = tx.data.TFRecordData(hparams=config_data.train_hparam) - eval_dataset = tx.data.TFRecordData(hparams=config_data.eval_hparam) - test_dataset = tx.data.TFRecordData(hparams=config_data.test_hparam) - - iterator = tx.data.FeedableDataIterator({ - 'train': train_dataset, 'eval': eval_dataset, 'test': test_dataset}) - batch = iterator.get_next() - input_ids = batch["input_ids"] - segment_ids = batch["segment_ids"] - batch_size = tf.shape(input_ids)[0] - input_length = tf.reduce_sum(1 - tf.cast(tf.equal(input_ids, 0), tf.int32), - axis=1) - # Builds BERT - hparams = { - 'clas_strategy': 'cls_time' - } - model = tx.modules.BERTClassifier( - pretrained_model_name=FLAGS.pretrained_model_name, - hparams=hparams) - logits, preds = model(input_ids, input_length, segment_ids) - - accu = tx.evals.accuracy(batch['label_ids'], preds) - - # Optimization - loss = tf.losses.sparse_softmax_cross_entropy( - labels=batch["label_ids"], logits=logits) - global_step = tf.Variable(0, trainable=False) - - # Builds learning rate decay scheduler - static_lr = config_downstream.lr['static_lr'] - num_train_steps = int(num_train_data / config_data.train_batch_size - * config_data.max_train_epoch) - num_warmup_steps = int(num_train_steps * config_data.warmup_proportion) - lr = model_utils.get_lr(global_step, num_train_steps, # lr is a Tensor - num_warmup_steps, static_lr) - - opt = tx.core.get_optimizer( - global_step=global_step, - learning_rate=lr, - hparams=config_downstream.opt - ) - - if FLAGS.distributed: - opt = hvd.DistributedOptimizer(opt) - - train_op = tf.contrib.layers.optimize_loss( - loss=loss, - global_step=global_step, - learning_rate=None, - optimizer=opt) - - # Train/eval/test routine - - def _is_head(): - if not FLAGS.distributed: - return True - return hvd.rank() == 0 - - def _train_epoch(sess): - """Trains on the training set, and evaluates on the dev set - periodically. - """ - iterator.restart_dataset(sess, 'train') - - fetches = { - 'train_op': train_op, - 'loss': loss, - 'batch_size': batch_size, - 'step': global_step - } - - while True: - try: - feed_dict = { - iterator.handle: iterator.get_handle(sess, 'train'), - tx.global_mode(): tf.estimator.ModeKeys.TRAIN, - } - rets = sess.run(fetches, feed_dict) - step = rets['step'] - - dis_steps = config_data.display_steps - if _is_head() and dis_steps > 0 and step % dis_steps == 0: - tf.logging.info('step:%d; loss:%f;' % (step, rets['loss'])) - - eval_steps = config_data.eval_steps - if _is_head() and eval_steps > 0 and step % eval_steps == 0: - _eval_epoch(sess) - - except tf.errors.OutOfRangeError: - break - - def _eval_epoch(sess): - """Evaluates on the dev set. 
- """ - iterator.restart_dataset(sess, 'eval') - - cum_acc = 0.0 - cum_loss = 0.0 - nsamples = 0 - fetches = { - 'accu': accu, - 'loss': loss, - 'batch_size': batch_size, - } - while True: - try: - feed_dict = { - iterator.handle: iterator.get_handle(sess, 'eval'), - tx.context.global_mode(): tf.estimator.ModeKeys.EVAL, - } - rets = sess.run(fetches, feed_dict) - - cum_acc += rets['accu'] * rets['batch_size'] - cum_loss += rets['loss'] * rets['batch_size'] - nsamples += rets['batch_size'] - except tf.errors.OutOfRangeError: - break - - tf.logging.info('eval accu: {}; loss: {}; nsamples: {}'.format( - cum_acc / nsamples, cum_loss / nsamples, nsamples)) - - def _test_epoch(sess): - """Does predictions on the test set. - """ - iterator.restart_dataset(sess, 'test') - - _all_preds = [] - while True: - try: - feed_dict = { - iterator.handle: iterator.get_handle(sess, 'test'), - tx.context.global_mode(): tf.estimator.ModeKeys.PREDICT, - } - _preds = sess.run(preds, feed_dict=feed_dict) - _all_preds.extend(_preds.tolist()) - except tf.errors.OutOfRangeError: - break - - output_file = os.path.join(FLAGS.output_dir, "test_results.tsv") - with tf.gfile.GFile(output_file, "w") as writer: - writer.write('\n'.join(str(p) for p in _all_preds)) - - # Broadcasts global variables from rank-0 process - if FLAGS.distributed: - bcast = hvd.broadcast_global_variables(0) - - session_config = tf.ConfigProto() - if FLAGS.distributed: - session_config.gpu_options.visible_device_list = str(hvd.local_rank()) - - with tf.Session(config=session_config) as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - - if FLAGS.distributed: - bcast.run() - - # Restores trained model if specified - saver = tf.train.Saver() - if FLAGS.checkpoint: - saver.restore(sess, FLAGS.checkpoint) - - iterator.initialize_dataset(sess) - - if FLAGS.do_train: - for i in range(config_data.max_train_epoch): - _train_epoch(sess) - saver.save(sess, FLAGS.output_dir + '/model.ckpt') - - if FLAGS.do_eval: - _eval_epoch(sess) - - if FLAGS.do_test: - _test_epoch(sess) - - -if __name__ == "__main__": - tf.app.run() diff --git a/examples/bert/config_classifier.py b/examples/bert/config_classifier.py deleted file mode 100644 index 0bd0bbc6..00000000 --- a/examples/bert/config_classifier.py +++ /dev/null @@ -1,25 +0,0 @@ -hidden_dim = 768 - -opt = { - 'optimizer': { - 'type': 'AdamWeightDecayOptimizer', - 'kwargs': { - 'weight_decay_rate': 0.01, - 'beta_1': 0.9, - 'beta_2': 0.999, - 'epsilon': 1e-6, - 'exclude_from_weight_decay': ['LayerNorm', 'layer_norm', 'bias'] - } - }, - 'gradient_clip': { - 'type': 'clip_by_global_norm', - 'kwargs': { - 'clip_norm': 1.0, - } - } -} - -# By default, we use warmup and linear decay for learinng rate -lr = { - 'static_lr': 2e-5, -} diff --git a/examples/bert/config_data.py b/examples/bert/config_data.py deleted file mode 100644 index 596e4b46..00000000 --- a/examples/bert/config_data.py +++ /dev/null @@ -1,75 +0,0 @@ -tfrecord_data_dir = "data/MRPC" -max_seq_length = 128 -num_classes = 2 -num_train_data = 3668 - -train_batch_size = 32 -max_train_epoch = 3 -display_steps = 50 # Print training loss every display_steps; -1 to disable -eval_steps = -1 # Eval on the dev set every eval_steps; -1 to disable -# Proportion of training to perform linear learning -# rate warmup for. E.g., 0.1 = 10% of training. 
-warmup_proportion = 0.1 - -eval_batch_size = 8 -test_batch_size = 8 - - -feature_original_types = { - # Reading features from TFRecord data file. - # E.g., Reading feature "input_ids" as dtype `tf.int64`; - # "FixedLenFeature" indicates its length is fixed for all data instances; - # and the sequence length is limited by `max_seq_length`. - "input_ids": ["tf.int64", "FixedLenFeature", max_seq_length], - "input_mask": ["tf.int64", "FixedLenFeature", max_seq_length], - "segment_ids": ["tf.int64", "FixedLenFeature", max_seq_length], - "label_ids": ["tf.int64", "FixedLenFeature"] -} - -feature_convert_types = { - # Converting feature dtype after reading. E.g., - # Converting the dtype of feature "input_ids" from `tf.int64` (as above) - # to `tf.int32` - "input_ids": "tf.int32", - "input_mask": "tf.int32", - "label_ids": "tf.int32", - "segment_ids": "tf.int32" -} - -train_hparam = { - "allow_smaller_final_batch": False, - "batch_size": train_batch_size, - "dataset": { - "data_name": "data", - "feature_convert_types": feature_convert_types, - "feature_original_types": feature_original_types, - "files": "{}/train.tf_record".format(tfrecord_data_dir) - }, - "shuffle": True, - "shuffle_buffer_size": 100 -} - -eval_hparam = { - "allow_smaller_final_batch": True, - "batch_size": eval_batch_size, - "dataset": { - "data_name": "data", - "feature_convert_types": feature_convert_types, - "feature_original_types": feature_original_types, - "files": "{}/eval.tf_record".format(tfrecord_data_dir) - }, - "shuffle": False -} - -test_hparam = { - "allow_smaller_final_batch": True, - "batch_size": test_batch_size, - "dataset": { - "data_name": "data", - "feature_convert_types": feature_convert_types, - "feature_original_types": feature_original_types, - "files": "{}/predict.tf_record".format(tfrecord_data_dir) - }, - - "shuffle": False -} diff --git a/examples/bert/data/download_glue_data.py b/examples/bert/data/download_glue_data.py index 94444c36..f8ff6f7d 100644 --- a/examples/bert/data/download_glue_data.py +++ b/examples/bert/data/download_glue_data.py @@ -93,7 +93,6 @@ def download_diagnostic(data_dir): data_file = os.path.join(data_dir, "diagnostic", "diagnostic.tsv") urllib.request.urlretrieve(TASK2PATH["diagnostic"], data_file) print("\tCompleted!") - return def get_tasks(task_names): diff --git a/examples/bert/prepare_data.py b/examples/bert/prepare_data.py index 4a39273b..32ad802d 100644 --- a/examples/bert/prepare_data.py +++ b/examples/bert/prepare_data.py @@ -14,40 +14,37 @@ """Produces TFRecord files and modifies data configuration file """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - +import argparse +import logging import os -import tensorflow as tf + import texar.tf as tx -# pylint: disable=no-name-in-module from utils import data_utils -# pylint: disable=invalid-name, too-many-locals, too-many-statements - -flags = tf.flags - -FLAGS = flags.FLAGS -flags.DEFINE_string( - "task", "MRPC", - "The task to run experiment on. One of " - "{'COLA', 'MNLI', 'MRPC', 'XNLI', 'SST'}.") -flags.DEFINE_string( - "pretrained_model_name", 'bert-base-uncased', - "The name of pre-trained BERT model. See the doc of " - "`texar.tf.modules.PretrainedBERTMixin for all supported models.`") -flags.DEFINE_integer( - "max_seq_length", 128, - "The maximum length of sequence, longer sequence will be trimmed.") -flags.DEFINE_string( - "tfrecord_output_dir", None, - "The output directory where the TFRecord files will be generated. 
" "By default it will be set to 'data/{task}'. E.g.: if " "task is 'MRPC', it will be set as 'data/MRPC'") - -tf.logging.set_verbosity(tf.logging.INFO) +parser = argparse.ArgumentParser() +parser.add_argument( + "--task", type=str, default="MRPC", + choices=['COLA', 'MNLI', 'MRPC', 'XNLI', 'SST'], + help="The task to run experiment on.") +parser.add_argument( + '--pretrained-model-name', type=str, default='bert-base-uncased', + help="The name of a pre-trained model to load, selected from the " + "list of: `bert-base-uncased`, `bert-large-uncased`, " + "`bert-base-cased`, `bert-large-cased`, " + "`bert-base-multilingual-uncased`, `bert-base-multilingual-cased`, " + "and `bert-base-chinese`.") +parser.add_argument( + "--max-seq-length", type=int, default=128, + help="The maximum length of sequence; longer sequences will be trimmed.") +parser.add_argument( + "--output-dir", type=str, default=None, + help="The output directory where the TFRecord files will be generated. " + "By default it will be set to 'data/{task}'. E.g.: if " + "task is 'MRPC', it will be set as 'data/MRPC'") +args = parser.parse_args() + +logging.root.setLevel(logging.INFO) def _modify_config_data(max_seq_length, num_train_data, num_classes): @@ -85,34 +82,34 @@ _modify_config_data(max_seq_length, num_train_data, num_classes): with open("./config_data.py", 'w') as file: file.write('\n'.join(filedata_lines)) - tf.logging.info("config_data.py has been updated") + logging.info("config_data.py has been updated") else: - tf.logging.info("config_data.py cannot be found") + logging.info("config_data.py cannot be found") - tf.logging.info("Data preparation finished") + logging.info("Data preparation finished") def main(): - """Prepares data. + """Starts the data preparation. """ # Loads data - tf.logging.info("Loading data") + logging.info("Loading data") task_datasets_rename = { "COLA": "CoLA", "SST": "SST-2", } - data_dir = 'data/{}'.format(FLAGS.task) - if FLAGS.task.upper() in task_datasets_rename: + data_dir = 'data/{}'.format(args.task) + if args.task.upper() in task_datasets_rename: data_dir = 'data/{}'.format( - task_datasets_rename[FLAGS.task]) + task_datasets_rename[args.task]) - if FLAGS.tfrecord_output_dir is None: - tfrecord_output_dir = data_dir + if args.output_dir is None: + output_dir = data_dir else: - tfrecord_output_dir = FLAGS.tfrecord_output_dir - tx.utils.maybe_create_dir(tfrecord_output_dir) + output_dir = args.output_dir + tx.utils.maybe_create_dir(output_dir) processors = { "COLA": data_utils.ColaProcessor, @@ -121,25 +118,25 @@ "XNLI": data_utils.XnliProcessor, 'SST': data_utils.SSTProcessor } - processor = processors[FLAGS.task]() + processor = processors[args.task]() num_classes = len(processor.get_labels()) num_train_data = len(processor.get_train_examples(data_dir)) - tf.logging.info( - 'num_classes:%d; num_train_data:%d' % (num_classes, num_train_data)) + logging.info("num_classes: %d; num_train_data: %d", + num_classes, num_train_data) tokenizer = tx.data.BERTTokenizer( - pretrained_model_name=FLAGS.pretrained_model_name) + pretrained_model_name=args.pretrained_model_name) # Produces TFRecord files data_utils.prepare_TFRecord_data( processor=processor, tokenizer=tokenizer, data_dir=data_dir, - max_seq_length=FLAGS.max_seq_length, - output_dir=tfrecord_output_dir) + output_dir=output_dir) + max_seq_length=args.max_seq_length, - _modify_config_data(FLAGS.max_seq_length, num_train_data, num_classes) + _modify_config_data(args.max_seq_length, num_train_data, num_classes) if __name__ == 
"__main__": diff --git a/examples/bert/utils/__init__.py b/examples/bert/utils/__init__.py index e69de29b..54624eba 100644 --- a/examples/bert/utils/__init__.py +++ b/examples/bert/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/bert/utils/data_utils.py b/examples/bert/utils/data_utils.py index d07430e6..d3fed4ae 100644 --- a/examples/bert/utils/data_utils.py +++ b/examples/bert/utils/data_utils.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2018 The Google AI Language Team Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -This is the Data Loading Pipeline for Sentence Classifier Task from: +This is the Data Loading Pipeline for Sentence Classifier Task adapted from: `https://github.com/google-research/bert/blob/master/run_classifier.py` """ -import os import csv +import logging +import os import collections import tensorflow as tf @@ -27,10 +27,11 @@ class InputExample(): - """A single training/test example for simple sequence classification.""" + r"""A single training/test example for simple sequence classification.""" def __init__(self, guid, text_a, text_b=None, label=None): - """Constructs a InputExample. + r"""Constructs a InputExample. + Args: guid: Unique id for the example. text_a: string. The untokenized text of the first sequence. 
@@ -47,7 +48,7 @@ def __init__(self, guid, text_a, text_b=None, label=None): class InputFeatures: - """A single set of features of data.""" + r"""A single set of features of data.""" def __init__(self, input_ids, input_mask, segment_ids, label_id): self.input_ids = input_ids @@ -56,29 +57,29 @@ def __init__(self, input_ids, input_mask, segment_ids, label_id): self.label_id = label_id -class DataProcessor(object): - """Base class for data converters for sequence classification data sets.""" +class DataProcessor: + r"""Base class for data converters for sequence classification data sets.""" def get_train_examples(self, data_dir): - """Gets a collection of `InputExample`s for the train set.""" + r"""Gets a collection of `InputExample`s for the train set.""" raise NotImplementedError() def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" + r"""Gets a collection of `InputExample`s for the dev set.""" raise NotImplementedError() def get_test_examples(self, data_dir): - """Gets a collection of `InputExample`s for prediction.""" + r"""Gets a collection of `InputExample`s for prediction.""" raise NotImplementedError() def get_labels(self): - """Gets the list of labels for this data set.""" + r"""Gets the list of labels for this data set.""" raise NotImplementedError() @classmethod def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with tf.gfile.Open(input_file, "r") as f: + r"""Reads a tab separated value file.""" + with tf.io.gfile.GFile(input_file, "r") as f: reader = csv.reader(f, delimiter="\t", quotechar=quotechar) lines = [] for line in reader: @@ -87,32 +88,32 @@ def _read_tsv(cls, input_file, quotechar=None): class SSTProcessor(DataProcessor): - """Processor for the MRPC data set (GLUE version).""" + r"""Processor for the MRPC data set (GLUE version).""" def get_train_examples(self, data_dir): - """See base class.""" + r"""See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): - """See base class.""" + r"""See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_test_examples(self, data_dir): - """See base class.""" + r"""See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") def get_labels(self): - """See base class.""" + r"""See base class.""" return ["0", "1"] @staticmethod def _create_examples(lines, set_type): - """Creates examples for the training and dev sets.""" + r"""Creates examples for the training and dev sets.""" examples = [] - if set_type == 'train' or set_type == 'dev': + if set_type in ('train', 'dev'): for (i, line) in enumerate(lines): if i == 0: continue @@ -138,13 +139,13 @@ def _create_examples(lines, set_type): class XnliProcessor(DataProcessor): - """Processor for the XNLI data set.""" + r"""Processor for the XNLI data set.""" def __init__(self): self.language = "zh" def get_train_examples(self, data_dir): - """See base class.""" + r"""See base class.""" lines = self._read_tsv( os.path.join(data_dir, "multinli", "multinli.train.%s.tsv" % self.language)) @@ -163,7 +164,7 @@ def get_train_examples(self, data_dir): return examples def get_dev_examples(self, data_dir): - """See base class.""" + r"""See base class.""" lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) examples = [] for (i, line) in enumerate(lines): @@ -181,37 +182,37 @@ def get_dev_examples(self, 
data_dir): return examples def get_labels(self): - """See base class.""" + r"""See base class.""" return ["contradiction", "entailment", "neutral"] class MnliProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" + r"""Processor for the MultiNLI data set (GLUE version).""" def get_train_examples(self, data_dir): - """See base class.""" + r"""See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): - """See base class.""" + r"""See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched") def get_test_examples(self, data_dir): - """See base class.""" + r"""See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test") def get_labels(self): - """See base class.""" + r"""See base class.""" return ["contradiction", "entailment", "neutral"] @staticmethod def _create_examples(lines, set_type): - """Creates examples for the training and dev sets.""" + r"""Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: @@ -230,33 +231,33 @@ def _create_examples(lines, set_type): class MrpcProcessor(DataProcessor): - """Processor for the MRPC data set (GLUE version).""" + r"""Processor for the MRPC data set (GLUE version).""" def get_train_examples(self, data_dir): - """See base class.""" + r"""See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): - """See base class.""" + r"""See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_test_examples(self, data_dir): - """See base class.""" + r"""See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") def get_labels(self): - """See base class.""" + r"""See base class.""" return ["0", "1"] @staticmethod def _create_examples(lines, set_type): - """Creates examples for the training and dev sets.""" + r"""Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: @@ -274,33 +275,33 @@ def _create_examples(lines, set_type): class ColaProcessor(DataProcessor): - """Processor for the CoLA data set (GLUE version).""" + r"""Processor for the CoLA data set (GLUE version).""" def get_train_examples(self, data_dir): - """See base class.""" + r"""See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): - """See base class.""" + r"""See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_test_examples(self, data_dir): - """See base class.""" + r"""See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") def get_labels(self): - """See base class.""" + r"""See base class.""" return ["0", "1"] @staticmethod def _create_examples(lines, set_type): - """Creates examples for the training and dev sets.""" + r"""Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): # Only the test set has a header @@ -320,7 +321,7 @@ def _create_examples(lines, set_type): def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer): - """Converts a single `InputExample` into a single 
`InputFeatures`.""" + r"""Converts a single `InputExample` into a single `InputFeatures`.""" label_map = {} for (i, label) in enumerate(label_list): label_map[label] = i @@ -334,15 +335,13 @@ def convert_single_example(ex_index, example, label_list, max_seq_length, # here we disable the verbose printing of the data if ex_index < 0: - tf.logging.info("*** Example ***") - tf.logging.info("guid: %s" % example.guid) - tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - tf.logging.info("input_ids length: %d" % len(input_ids)) - tf.logging.info("input_mask: %s" % - " ".join([str(x) for x in input_mask])) - tf.logging.info("segment_ids: %s" % - " ".join([str(x) for x in segment_ids])) - tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) + logging.info("*** Example ***") + logging.info("guid: %s", example.guid) + logging.info("input_ids: %s", " ".join([str(x) for x in input_ids])) + logging.info("input_ids length: %d", len(input_ids)) + logging.info("input_mask: %s", " ".join([str(x) for x in input_mask])) + logging.info("segment_ids: %s", " ".join([str(x) for x in segment_ids])) + logging.info("label: %s (id = %d)", example.label, label_id) feature = InputFeatures(input_ids=input_ids, input_mask=input_mask, @@ -353,9 +352,9 @@ def convert_single_example(ex_index, example, label_list, max_seq_length, def convert_examples_to_features_and_output_to_files( examples, label_list, max_seq_length, tokenizer, output_file): - """Convert a set of `InputExample`s to a TFRecord file.""" + r"""Convert a set of `InputExample`s to a TFRecord file.""" - writer = tf.python_io.TFRecordWriter(output_file) + writer = tf.io.TFRecordWriter(output_file) for (ex_index, example) in enumerate(examples): @@ -379,7 +378,8 @@ def create_int_feature(values): def prepare_TFRecord_data(processor, tokenizer, data_dir, max_seq_length, output_dir): - """ + """Prepare record data. + Args: processor: Data Preprocessor, which must have get_lables, get_train/dev/test/examples methods defined. diff --git a/examples/bert/utils/model_utils.py b/examples/bert/utils/model_utils.py deleted file mode 100644 index 3dd2ceba..00000000 --- a/examples/bert/utils/model_utils.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -Model utility functions -""" - -import tensorflow as tf - - -def get_lr(global_step, num_train_steps, num_warmup_steps, static_lr): - """ - Calculate the learinng rate given global step and warmup steps. - The learinng rate is following a linear warmup and linear decay. 
- """ - learning_rate = tf.constant(value=static_lr, - shape=[], dtype=tf.float32) - - learning_rate = tf.train.polynomial_decay( - learning_rate, - global_step, - num_train_steps, - end_learning_rate=0.0, - power=1.0, - cycle=False) - - if num_warmup_steps: - global_steps_int = tf.cast(global_step, tf.int32) - warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) - - global_steps_float = tf.cast(global_steps_int, tf.float32) - warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) - - warmup_percent_done = global_steps_float / warmup_steps_float - warmup_learning_rate = static_lr * warmup_percent_done - - is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) - learning_rate = ((1.0 - is_warmup) * learning_rate - + is_warmup * warmup_learning_rate) - - return learning_rate diff --git a/examples/distributed_gpu/README.md b/examples/distributed_gpu/README.md deleted file mode 100644 index 6c3a8c3c..00000000 --- a/examples/distributed_gpu/README.md +++ /dev/null @@ -1,90 +0,0 @@ -# Model Training with Multi/Distributed GPUs - -This example shows how models built with Texar can be trained with multiple GPUs on single or multiple machines. Multi/Distributed-GPU training is based on the third-party library [Horovod](https://github.com/uber/horovod). - -Here we take language model for example, adapting the [single-GPU language model example](https://github.com/asyml/texar/tree/master/examples/language_model_ptb) by adding a few lines of Horovod-related code to enable distributed training (more details below). - -## Prerequisites - -Two third-party packages are required: - -* `openmpi >= 3.0.0` -* `horovod` - -The following commands install [OpenMPI](https://www.open-mpi.org) 4.0.0 to the path `/usr/local/openmpi`. Run `mpirun --version` to check the version of installed OpenNMT. -``` -# Download and install OpenMPI -wget https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.0.tar.gz -tar xvf openmpi-4.0.0.tar.gz -cd openmpi-4.0.0/ -./configure --prefix=/usr/local/openmpi -sudo make all install - -# Add path of the installed OpenMPI to your system path -export PATH=/usr/local/openmpi/bin:$PATH -export LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH -``` - -Then install Horovod with the cmd: -``` -pip install horovod -``` - -## Adapting Single-GPU Code for distributed Training - -Based on the [single-GPU code](https://github.com/asyml/texar/tree/master/examples/language_model_ptb), we made the following adaptions. Note that one processor is created for each GPU. - -- Setting up Horovod in the code (click the links below to see the corresponding actual code in `lm_ptb_distributed.py`): - 1. [`hvd.init()`](https://github.com/asyml/texar/blob/master/examples/distributed_gpu/lm_ptb_distributed.py#L76): initialize Horovod - 2. [`hvd.DistributedOptimizer`](https://github.com/asyml/texar/blob/master/examples/distributed_gpu/lm_ptb_distributed.py#L131): wrap your optimizer. - 3. [`hvd.broadcast_global_variables(0)`](https://github.com/asyml/texar/blob/master/examples/distributed_gpu/lm_ptb_distributed.py#L191): set the operator to broadcast your global variables to different processes from rank-0 process. - 4. [set visible GPU list](https://github.com/asyml/texar/blob/master/examples/distributed_gpu/lm_ptb_distributed.py#L194) by `config.gpu_options.visible_device_list = str(hvd.local_rank())`, to make each process see the attached single GPU. - 5. 
[run the broadcast node](https://github.com/asyml/texar/blob/master/examples/distributed_gpu/lm_ptb_distributed.py#L203): run the broadcast operator before training -- Data sharding: - 1. To make sure different GPUs (processors) receive different data batches in each iteration, we [shard the training data](https://github.com/asyml/texar/blob/master/examples/distributed_gpu/ptb_reader.py#L52) into `N` parts, where `N` is the number of GPUs (processors). - 2. In this example, `batch_size` in the config files denotes the total batch size in each iteration of all processors. That is, in each iteration, each processor receives `batch_size`/`N` data instances. This replicates the gradients in the single-GPU setting, and we use the same `learning_rate` as in single-GPU. - -## Usage ## - -Run the following command to train the model with multiple GPUs on multiple machines: -``` -mpirun -np 2 \ - -H [IP-adress-of-server1]:1,[IP-address-of-server2]:1\ - -bind-to none -map-by slot \ - -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH \ - -mca pml ob1 -mca btl tcp,self \ - -mca btl_tcp_if_include ens3 \ - python lm_ptb_distributed.py --config config_small --data_path ./ -``` - -Here: - * The key configurations for ordinary users: - - - `-np`: total number of processes - - `-H`: IP addresses of different servers and the number of processes used in each server. For example, `-H 192.168.11.22:1,192.168.33.44:1`. To run on local machines, set, e.g., `-H localhost:2`. - - * Other advanced configurations: - - - `--bind-to none`: specifies OpenMPI to not bind a training process to a single CPU core (which would hurt performance). - - `-map-by slot`: allows you to have a mixture of different NUMA configurations because the default behavior is to bind to the socket. - - `-x`: specifies (`-x NCCL_DEBUG=INFO`) or copies (`-x LD_LIBRARY_PATH`) an environment variable to all the workers. - - `-mca`: sets the MPI communication interface. Use the setting specified above to avoid possible multiprocessing and network communication issues. - - * The above configuration uses the `ens3` network interface. If this interface does not work in your environment (e.g., yielding error message `Unknown interfance name`), you may want to use a different interface (Run cmd `ifconfig` to see alternative interfaces in your environment.) - - * Language model configurations: - - `--config`: specifies the config file to use. E.g., the above use the configuration defined in config_small.py - - `--data_path`: specifies the directory containing PTB raw data (e.g., ptb.train.txt). If the data files do not exist, the program will automatically download, extract, and pre-process the data. - -The model will begin training on the specified GPUs, and evaluate on the validation data periodically. Evaluation on the test data is performed after the training is done. Note that both validation and test are performed only on the rank-0 GPU (i.e., they are not distributed). - -## Results ## - -We did simple test on two AWS p2.xlarge instances. -Since the language model is small and the communication cost is considerable, as expected, the example here doesn't scale very well on 2-GPU 2-machine in terms of speedup rate. The perplexity results of multi-GPU are the same with those of single-GPU. 
- -| config | epochs | train | valid | test | time/epoch (2-gpu) | time/epoch (single-gpu) | -| -------| -------| ------| -------| ------| -----| -----| -| small | 13 | 40.81 | 118.99 | 114.72| 207s | 137s | -| medium | 39 | 44.18 | 87.63 | 84.42| 461s | 311s | -| large | 55 | 36.54 | 82.55 | 78.72| 1765s | 931s | diff --git a/examples/distributed_gpu/config_large.py b/examples/distributed_gpu/config_large.py deleted file mode 100644 index 58c7a55a..00000000 --- a/examples/distributed_gpu/config_large.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PTB LM large size config. -""" - -# pylint: disable=invalid-name, too-few-public-methods, missing-docstring - -init_scale = 0.04 -num_epochs = 55 -hidden_size = 1500 -keep_prob = 0.35 -batch_size = 20 -num_steps = 35 - -cell = { - "type": "LSTMBlockCell", - "kwargs": { - "num_units": hidden_size, - "forget_bias": 0. - }, - "dropout": {"output_keep_prob": keep_prob}, - "num_layers": 2 -} -emb = { - "dim": hidden_size -} -opt = { - "optimizer": { - "type": "GradientDescentOptimizer", - "kwargs": {"learning_rate": 1.0} - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": {"clip_norm": 10.} - }, - "learning_rate_decay": { - "type": "exponential_decay", - "kwargs": { - "decay_steps": 1, - "decay_rate": 1. / 1.15, - "staircase": True - }, - "start_decay_step": 14 - } -} diff --git a/examples/distributed_gpu/config_medium.py b/examples/distributed_gpu/config_medium.py deleted file mode 100644 index ae8d0f73..00000000 --- a/examples/distributed_gpu/config_medium.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PTB LM medium size config. -""" - -# pylint: disable=invalid-name, too-few-public-methods, missing-docstring - -init_scale = 0.05 -num_epochs = 39 -hidden_size = 650 -keep_prob = 0.5 -batch_size = 20 -num_steps = 35 - -cell = { - "type": "LSTMBlockCell", - "kwargs": { - "num_units": hidden_size, - "forget_bias": 0. 
-    },
-    "dropout": {"output_keep_prob": keep_prob},
-    "num_layers": 2
-}
-emb = {
-    "dim": hidden_size
-}
-opt = {
-    "optimizer": {
-        "type": "GradientDescentOptimizer",
-        "kwargs": {"learning_rate": 1.0}
-    },
-    "gradient_clip": {
-        "type": "clip_by_global_norm",
-        "kwargs": {"clip_norm": 5.}
-    },
-    "learning_rate_decay": {
-        "type": "exponential_decay",
-        "kwargs": {
-            "decay_steps": 1,
-            "decay_rate": 0.8,
-            "staircase": True
-        },
-        "start_decay_step": 5
-    }
-}
diff --git a/examples/distributed_gpu/config_small.py b/examples/distributed_gpu/config_small.py
deleted file mode 100644
index 7cfebc7c..00000000
--- a/examples/distributed_gpu/config_small.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright 2018 The Texar Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PTB LM small size config.
-"""
-
-# pylint: disable=invalid-name, too-few-public-methods, missing-docstring
-
-init_scale = 0.1
-num_epochs = 13
-hidden_size = 200
-keep_prob = 1.0
-batch_size = 20
-num_steps = 20
-
-cell = {
-    "type": "LSTMBlockCell",
-    "kwargs": {
-        "num_units": hidden_size,
-        "forget_bias": 0.
-    },
-    "dropout": {"output_keep_prob": keep_prob},
-    "num_layers": 2
-}
-emb = {
-    "dim": hidden_size
-}
-opt = {
-    "optimizer": {
-        "type": "GradientDescentOptimizer",
-        "kwargs": {"learning_rate": 1.0}
-    },
-    "gradient_clip": {
-        "type": "clip_by_global_norm",
-        "kwargs": {"clip_norm": 5.}
-    },
-    "learning_rate_decay": {
-        "type": "exponential_decay",
-        "kwargs": {
-            "decay_steps": 1,
-            "decay_rate": 0.5,
-            "staircase": True
-        },
-        "start_decay_step": 3
-    }
-}
diff --git a/examples/distributed_gpu/lm_ptb_distributed.py b/examples/distributed_gpu/lm_ptb_distributed.py
deleted file mode 100644
index 6f7cc517..00000000
--- a/examples/distributed_gpu/lm_ptb_distributed.py
+++ /dev/null
@@ -1,237 +0,0 @@
-# Copyright 2018 The Texar Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Example for building the language model.
-
-This is a reimplementation of the TensorFlow official PTB example in:
-tensorflow/models/rnn/ptb
-
-Model and training are described in:
-(Zaremba, et al.)
Recurrent Neural Network Regularization
- http://arxiv.org/abs/1409.2329
-
-There are 3 provided model configurations:
-===========================================
-| config | epochs | train | valid  | test
-===========================================
-| small  | 13     | 37.99 | 121.39 | 115.91
-| medium | 39     | 48.45 |  86.16 |  82.07
-| large  | 55     | 37.87 |  82.62 |  78.29
-The exact results may vary depending on the random initialization.
-
-The data required for this example is in the `data/` dir of the
-PTB dataset from Tomas Mikolov's webpage:
-
-$ wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
-$ tar xvf simple-examples.tgz
-
-If data is not provided, the program will download from above automatically.
-
-To run:
-
-$ python lm_ptb_distributed.py --data_path=simple-examples/data --config=config_small
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# pylint: disable=invalid-name, no-member, too-many-locals
-
-import time
-import importlib
-import numpy as np
-import tensorflow as tf
-import texar.tf as tx
-import horovod.tensorflow as hvd
-
-from ptb_reader import prepare_data, ptb_iterator
-
-flags = tf.flags
-
-flags.DEFINE_string("data_path", "./",
-                    "Directory containing PTB raw data (e.g., ptb.train.txt). "
-                    "E.g., ./simple-examples/data. If it does not exist, "
-                    "the directory will be created and PTB raw data will "
-                    "be downloaded.")
-flags.DEFINE_string("config", "config_small", "The config to use.")
-
-FLAGS = flags.FLAGS
-
-config = importlib.import_module(FLAGS.config)
-
-
-def _main(_):
-    # Data
-    tf.logging.set_verbosity(tf.logging.INFO)
-
-    # 1. Initialize Horovod
-    hvd.init()
-
-    batch_size = config.batch_size
-    num_steps = config.num_steps
-    data = prepare_data(FLAGS.data_path)
-    vocab_size = data["vocab_size"]
-
-    inputs = tf.placeholder(tf.int32, [None, num_steps],
-                            name='inputs')
-    targets = tf.placeholder(tf.int32, [None, num_steps],
-                             name='targets')
-
-    # Model architecture
-    initializer = tf.random_uniform_initializer(
-        -config.init_scale, config.init_scale)
-    with tf.variable_scope("model", initializer=initializer):
-        embedder = tx.modules.WordEmbedder(
-            vocab_size=vocab_size, hparams=config.emb)
-        emb_inputs = embedder(inputs)
-        if config.keep_prob < 1:
-            emb_inputs = tf.nn.dropout(
-                emb_inputs, tx.utils.switch_dropout(config.keep_prob))
-
-        decoder = tx.modules.BasicRNNDecoder(
-            vocab_size=vocab_size, hparams={"rnn_cell": config.cell})
-
-    # This _batch_size equals batch_size // hvd.size() in distributed
-    # training, because the mini-batch is distributed across multiple GPUs.
-    _batch_size = tf.shape(inputs)[0]
-    initial_state = decoder.zero_state(_batch_size,
-                                       tf.float32)
-    seq_length = tf.broadcast_to([num_steps], (_batch_size, ))
-    outputs, final_state, seq_lengths = decoder(
-        decoding_strategy="train_greedy",
-        impute_finished=True,
-        inputs=emb_inputs,
-        sequence_length=seq_length,
-        initial_state=initial_state)
-    # Losses & train ops
-    mle_loss = tx.losses.sequence_sparse_softmax_cross_entropy(
-        labels=targets,
-        logits=outputs.logits,
-        sequence_length=seq_lengths)
-
-    # Use global_step to pass epoch, for lr decay
-    global_step = tf.placeholder(tf.int32)
-
-    opt = tx.core.get_optimizer(
-        global_step=global_step,
-        hparams=config.opt
-    )
-
-    # 2. Wrap the optimizer
-    opt = hvd.DistributedOptimizer(opt)
-
-    train_op = tx.core.get_train_op(
-        loss=mle_loss,
-        optimizer=opt,
-        global_step=global_step,
-        learning_rate=None,
-        increment_global_step=False,
-        hparams=config.opt
-    )
-
-    def _run_epoch(sess, data_iter, epoch, is_train=False, verbose=False):
-        start_time = time.time()
-        loss = 0.
-        iters = 0
-
-        fetches = {
-            "mle_loss": mle_loss,
-            "final_state": final_state,
-        }
-        if is_train:
-            fetches["train_op"] = train_op
-            epoch_size = (len(data["train_text_id"]) // batch_size - 1)\
-                // num_steps
-
-        mode = (tf.estimator.ModeKeys.TRAIN
-                if is_train
-                else tf.estimator.ModeKeys.EVAL)
-
-        for step, (x, y) in enumerate(data_iter):
-            if step == 0:
-                state = sess.run(initial_state,
-                                 feed_dict={inputs: x})
-
-            feed_dict = {
-                inputs: x, targets: y, global_step: epoch,
-                tx.global_mode(): mode,
-            }
-            for i, (c, h) in enumerate(initial_state):
-                feed_dict[c] = state[i].c
-                feed_dict[h] = state[i].h
-
-            rets = sess.run(fetches, feed_dict)
-            loss += rets["mle_loss"]
-            state = rets["final_state"]
-            iters += num_steps
-
-            ppl = np.exp(loss / iters)
-            if verbose and is_train and hvd.rank() == 0 \
-                    and (step + 1) % (epoch_size // 10) == 0:
-                tf.logging.info("%.3f perplexity: %.3f speed: %.0f wps" %
-                                ((step + 1) * 1.0 / epoch_size, ppl,
-                                 iters * batch_size / (
-                                     time.time() - start_time)))
-        _elapsed_time = time.time() - start_time
-        tf.logging.info("epoch time elapsed: %f" % (_elapsed_time))
-        ppl = np.exp(loss / iters)
-        return ppl, _elapsed_time
-
-    # 3. Set the op that broadcasts global variables from the rank-0 process
-    bcast = hvd.broadcast_global_variables(0)
-
-    # 4. Set the visible GPU
-    session_config = tf.ConfigProto()
-    session_config.gpu_options.visible_device_list = str(hvd.local_rank())
-
-    with tf.Session(config=session_config) as sess:
-        sess.run(tf.global_variables_initializer())
-        sess.run(tf.local_variables_initializer())
-        sess.run(tf.tables_initializer())
-
-        # 5. Run the broadcast_global_variables node before training
-        bcast.run()
-
-        _times = []
-        for epoch in range(config.num_epochs):
-            # Train
-            train_data_iter = ptb_iterator(
-                data["train_text_id"], config.batch_size, num_steps,
-                is_train=True)
-            train_ppl, train_time = _run_epoch(
-                sess, train_data_iter, epoch, is_train=True, verbose=True)
-            _times.append(train_time)
-            tf.logging.info("Epoch: %d Train Perplexity: %.3f"
-                            % (epoch, train_ppl))
-            # Valid in the main process
-            if hvd.rank() == 0:
-                valid_data_iter = ptb_iterator(
-                    data["valid_text_id"], config.batch_size, num_steps)
-                valid_ppl, _ = _run_epoch(sess, valid_data_iter, epoch)
-                tf.logging.info("Epoch: %d Valid Perplexity: %.3f"
-                                % (epoch, valid_ppl))
-
-        tf.logging.info('train times: %s' % (_times))
-        tf.logging.info('average train time/epoch %f'
-                        % np.mean(np.array(_times)))
-        # Test in the main process
-        if hvd.rank() == 0:
-            test_data_iter = ptb_iterator(
-                data["test_text_id"], batch_size, num_steps)
-            test_ppl, _ = _run_epoch(sess, test_data_iter, 0)
-            tf.logging.info("Test Perplexity: %.3f" % (test_ppl))
-
-
-if __name__ == '__main__':
-    tf.app.run(main=_main)
diff --git a/examples/distributed_gpu/ptb_reader.py b/examples/distributed_gpu/ptb_reader.py
deleted file mode 100644
index a675a62f..00000000
--- a/examples/distributed_gpu/ptb_reader.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright 2018 The Texar Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Utilities for preprocessing and iterating over the PTB data.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# pylint: disable=invalid-name, too-many-locals
-
-import os
-import numpy as np
-
-import tensorflow as tf
-import horovod.tensorflow as hvd
-import texar.tf as tx
-
-
-def ptb_iterator(data, batch_size, num_steps, is_train=False):
-    """Iterates through the PTB data.
-    """
-
-    data_length = len(data)
-
-    batch_length = data_length // batch_size
-    data = np.asarray(data[:batch_size * batch_length])
-    data = data.reshape([batch_size, batch_length])
-
-    epoch_size = (batch_length - 1) // num_steps
-    if epoch_size == 0:
-        raise ValueError("epoch_size == 0, decrease batch_size or num_steps")
-
-    def _sharded_data(data):
-        _batch_size = len(data)
-        _shard_size = _batch_size // hvd.size()
-        # Split the batch dimension into hvd.size() shards, one per process,
-        # and keep the shard belonging to this process.
-        data = [data[i * _shard_size: (i + 1) * _shard_size]
-                for i in range(hvd.size())]
-        data = data[hvd.rank()]
-        return data
-
-    if is_train:
-        # Split the dataset into shards to make sure
-        # different processes are loaded with different training data
-        data = _sharded_data(data)
-
-    for i in range(epoch_size):
-        x = data[:, i * num_steps: (i + 1) * num_steps]
-        y = data[:, i * num_steps + 1: (i + 1) * num_steps + 1]
-        yield (x, y)
-
-
-def prepare_data(data_path):
-    """Preprocess PTB data.
- """ - train_path = os.path.join(data_path, "ptb.train.txt") - if not tf.gfile.Exists(train_path): - url = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz' - tx.data.maybe_download(url, data_path, extract=True) - data_path = os.path.join(data_path, 'simple-examples', 'data') - - train_path = os.path.join(data_path, "ptb.train.txt") - valid_path = os.path.join(data_path, "ptb.valid.txt") - test_path = os.path.join(data_path, "ptb.test.txt") - - word_to_id = tx.data.make_vocab( - train_path, newline_token="", return_type="dict") - assert len(word_to_id) == 10000 - - train_text = tx.data.read_words( - train_path, newline_token="") - train_text_id = [word_to_id[w] for w in train_text if w in word_to_id] - - valid_text = tx.data.read_words( - valid_path, newline_token="") - valid_text_id = [word_to_id[w] for w in valid_text if w in word_to_id] - - test_text = tx.data.read_words( - test_path, newline_token="") - test_text_id = [word_to_id[w] for w in test_text if w in word_to_id] - - data = { - "train_text": train_text, - "valid_text": valid_text, - "test_text": test_text, - "train_text_id": train_text_id, - "valid_text_id": valid_text_id, - "test_text_id": test_text_id, - "vocab": word_to_id, - "vocab_size": len(word_to_id) - } - return data diff --git a/examples/gpt-2/.gitignore b/examples/gpt-2/.gitignore deleted file mode 100644 index baefa8e0..00000000 --- a/examples/gpt-2/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -/data/toy/*.pkl -/output/ diff --git a/examples/gpt-2/README.md b/examples/gpt-2/README.md deleted file mode 100644 index c9caf36c..00000000 --- a/examples/gpt-2/README.md +++ /dev/null @@ -1,275 +0,0 @@ -# GPT-2: Pre-trained Langauge Model - -This is a Texar implementation of [OpenAI GPT-2 (Generative Pre-Trainning)](https://github.com/openai/gpt-2) language model, which allows to load official pre-trained model parameters, generate samples, and fine-tune the model, etc. - -With Texar, building the GPT-2 model is as simple as creating a [`TransformerDecoder`](https://texar.readthedocs.io/en/latest/code/modules.html#transformerdecoder) instance. We can initialize the parameters of the TransformerDecoder using a pre-trained GPT-2 checkpoint by calling `init_gpt2_checkpoint(path_to_gpt2_checkpoint)` . - -In sum, this example showcases: - -* Contructing and using pre-trained GPT-2 models in Texar -* Using GPT-2 to generate text samples with or without context -* **Train or fine-tune** the model with **distributed GPU** -* Examples of other use cases - -## Quick Start (I) - Generation with the Pre-trained Model - -### Download GPT-2 Pre-trained Model - -Download the GPT-2 `117M` model checkpoint with the following command: -``` -sh gpt2_pretrained_models/download_model.sh model_117M -``` -By default, it will download a pretrained model named `model_117M` to `gpt2_pretrained_models/`. - -To download the GPT-2 `345M` model checkpoint, use: -``` -sh gpt2_pretrained_models/download_model.sh model_345M -``` - -### Usage -| WARNING: Samples are unfiltered and may contain offensive content. | -| --- | - -#### Interactive mode (to generate samples with context) - -This mode will initialize an interactive interface, which allows users to type in the context sentence. The model then generates continuation of the context. Top-K sample decoding is used. By default, the GPT-2 `117M` model is used. - -``` -python gpt2_generate_main.py --is_interactive \ ---max_decoding_length=100 \ ---temperature=0.7 \ ---top_k=40 -``` - -Here: - -- `is_interactive`: Specifies interactive mode. 
-
-To use the GPT-2 `345M` model, specify `--pretrain_checkpoint` and `--config_model`:
-
-```
-python gpt2_generate_main.py --is_interactive \
---max_decoding_length=100 \
---temperature=0.7 \
---top_k=40 \
---config_model=configs.config_model_345M \
---pretrain_checkpoint=gpt2_pretrained_models/model_345M/model.ckpt \
---pretrain_model_dir=gpt2_pretrained_models/model_345M
-```
-
-Here:
-
-- `pretrain_checkpoint`: Path to the model checkpoints. Defaults to `gpt2_pretrained_models/model_117M/model.ckpt`.
-- `config_model`: Model configuration file. Defaults to `configs.config_model_117M`.
-- `pretrain_model_dir`: The directory of the pretrained model, for loading the vocabulary, etc. Defaults to `gpt2_pretrained_models/model_117M`.
-
-**Example input:**
-```
-Model input >>> Micheal Jordan is the greatest player in history !
-```
-**Example output:**
-```
-======================================== SAMPLE 1 ========================================
-
-He's the one who has made all the difference. He's a true legend. He's a great athlete,
-a great athlete. He's a great athlete. I'm so happy for him. I'm so happy for his family,
-the family, and I'm so happy for him. I'm so happy for his teammates, his teammates, and
-I'm so happy for him.
-
-The last time we saw him on stage, he
-
-================================================================================
-```
-
-#### Non-interactive mode (to generate samples from scratch)
-
-This mode generates a batch of samples from scratch.
-
-```
-python gpt2_generate_main.py \
---nsamples=1 \
---batch_size=1 \
---max_decoding_length=100 \
---temperature=0.7 \
---top_k=40
-```
-
-Here:
-
-- `nsamples`: Total number of samples to generate; must be divisible by the `batch_size`.
-- `batch_size`: Each iteration generates `batch_size` number of samples.
-
-To use the GPT-2 `345M` model, specify `--pretrain_checkpoint`, `--config_model` and `--pretrain_model_dir` as above.
-
-**Example output:**
-
-```
-"A new government and a healthy economy have a chance to take this up."
-
-After he said the election's outcome in the House was important and had helped to build
-confidence in the House, former Ukip leader Nigel Farage spoke about working to boost
-the economy, saying the vote for the "lefties" and others "were bad optics for Labour
-in this way".
-```
-
-## Quick Start (II) - Fine-tune the Pre-trained Model
-
-This section shows how we can fine-tune the pre-trained GPT-2 model and use the resulting model for generation.
-
-First of all, **download** the pre-trained model [as above](https://github.com/asyml/texar/tree/master/examples/gpt-2#download-gpt-2-pre-trained-model).
-
-### Prepare data
-
-We first preprocess data with the GPT-2 BPE encoding; a short sketch of the processor is shown below.
-
-A toy dataset is provided under [`data/toy/`](data/toy) which includes `train.txt`, `dev.txt`, and `test.txt`. This example will fit the GPT-2 model on `train.txt`, evaluate perplexity on `dev.txt`, and do continuation generation using `test.txt` as the context.
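For reference, the BPE processor used throughout this example is created and used as follows. This is a minimal sketch mirroring the calls in `gpt2_generate_main.py` and `prepare_data.py`; it assumes you run it from the example directory with the `117M` model already downloaded:

```python
from utils import processor

# Builds the BPE encoder from the downloaded pretrained-model directory
# (which contains encoder.json and vocab.bpe).
proc = processor.get_encoder("gpt2_pretrained_models/model_117M")

ids = proc.encode("A toy example sentence.")   # text -> BPE token ids
text = proc.decode(ids)                        # BPE token ids -> text
```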
-
-Run the following cmd to transform the data into [TFRecord](https://www.tensorflow.org/tutorials/load_data/tf_records) format, and to perform processing such as truncation, BPE encoding, adding special tokens, etc:
-
-```
-    python prepare_data.py --data_dir data/toy
-        [--max_seq_length=128]
-        [--tfrecord_output_dir=data/toy]
-        [--pretrain_model_dir=gpt2_pretrained_models/model_117M]
-```
-- `data_dir`: The directory of raw data, wherein the data files must be named 'train.txt', 'dev.txt', or 'test.txt'. It is *not* necessary to provide all three files.
-- `max_seq_length`: The maximum length of a sequence after BPE encoding. This includes GPT-2 special tokens that will be automatically added. Longer sequences will be trimmed.
-- `tfrecord_output_dir`: The output path where the resulting TFRecord files will be placed. By default, it is set to be the same as `data_dir`.
-- `pretrain_model_dir`: The downloaded pretrained model directory, wherein the vocabulary files are used for data processing.
-
-The above cmd will output TFRecord files in the specified output directory. E.g., if `train.txt` is provided under `data_dir`, the output file `train.tf_record` will be produced under `tfrecord_output_dir`.
-
-### Train and Evaluate
-
-For **single-GPU** training (and evaluation), run the following cmd. The cmd fine-tunes the pre-trained GPT-2 parameters, and evaluates perplexity on the dev set.
-```
-    python gpt2_train_main.py --do_train --do_eval
-        [--config_train=configs.config_train]
-        [--output_dir=output]
-```
-Here:
-
-- `config_train`: Configurations of GPT-2 training, including data and optimization hyperparameters. By default, the config file [`configs/config_train.py`](configs/config_train.py) is used. Remember to specify the correct data path if you are using your own data.
-- `output_dir`: The output path where checkpoints are saved.
-
-By default, the GPT-2 `117M` model is used. To use the GPT-2 `345M` model instead, specify the relevant FLAGS as below:
-```
-    python gpt2_train_main.py --do_train --do_eval \
-        --config_model=configs.config_model_345M \
-        --pretrain_model_dir=gpt2_pretrained_models/model_345M \
-        --pretrain_checkpoint=gpt2_pretrained_models/model_345M/model.ckpt \
-        [--config_train=configs.config_train]
-        [--output_dir=output]
-```
-where `--pretrain_checkpoint` is necessary only when you want to load the pretrained checkpoint, and is ignored if `--checkpoint` is specified.
-
-Please see the FLAGS in the code for more options.
-
-For **multi-GPU training** on one or multiple machines, you may first install the prerequisite OpenMPI and Horovod packages, as detailed in the [distributed_gpu](https://github.com/asyml/texar/tree/master/examples/distributed_gpu) example.
-
-Then run the following cmd for training and evaluation. The cmd trains the model locally with 2 GPUs. Evaluation is performed with the single rank-0 GPU.
-```
-mpirun -np 2 \
-    -H localhost:2 \
-    -bind-to none -map-by slot \
-    -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH \
-    -mca pml ob1 -mca btl tcp,self \
-    -mca btl_tcp_if_include ens3 \
-    python gpt2_train_main.py --do_train --do_eval --distributed
-        [--config_train=configs.config_train]
-        [--output_dir=output]
-```
-The key configurations of multi-GPU training:
-
-* `-np`: total number of processes
-* `-H`: IP addresses of different servers and the number of processes used on each server. For example, `-H 192.168.11.22:1,192.168.33.44:1`
-* `-mca`: sets the MPI communication interface.
Use the setting specified above to avoid possible multiprocessing and network communication issues.
-
-  - The above configuration uses the `ens3` network interface. If this interface does not work in your environment (e.g., yielding the error message `Unknown interface name`), you may want to use a different interface (run `ifconfig` to see the alternative interfaces in your environment).
-
-Please refer to the [distributed_gpu](https://github.com/asyml/texar/tree/master/examples/distributed_gpu) example for more details on the other multi-GPU configurations.
-
-Make sure to specify the `--distributed` flag as above for multi-GPU training.
-
-
-### Restore and Test
-
-```
-python gpt2_train_main.py --do_test --checkpoint=output/model.ckpt
-[--config_train=configs.config_train]
-[--output_dir=output]
-```
-
-The output is by default saved in `output/test_samples.tsv`, where each line contains the context text and the generated continuation (separated by a TAB).
-
-
-## Other Use Cases
-
-Texar's `TransformerDecoder` (and other decoders, e.g., RNN-based ones) easily supports common, advanced, or customized use, such as:
-
-* Sample or continuation generation
-* Greedy / (top-k) sample / Gumbel-softmax / beam-search / ... / your-customized decoding
-* Training / fine-tuning in (un)conditional settings
-* Perplexity evaluation
-
-**For example**, after creating the language model
-```python
-def _embedding_fn(ids, times):
-    return word_embedder(ids) + pos_embedder(times)
-
-decoder = TransformerDecoder(
-    output_layer=tf.transpose(word_embedder.embedding),
-    hparams=gpt2_hparams)
-```
-we can do:
-
-**Ex. Use 1): Continuation generation w/ greedy decoding**
-
-```python
-output, output_length = decoder(
-    context=ctx,
-    context_sequence_length=ctx_len,
-    decoding_strategy='infer_greedy',
-    end_token=end_token,
-    embedding=_embedding_fn)
-
-sample_id = output.sample_id
-logits = output.logits
-```
-
-**Ex. Use 2): Top-k sample decoding**
-
-```python
-topk_helper = tx.modules.TopKSampleEmbeddingHelper(
-    embedding=_embedding_fn,
-    start_tokens=ctx[:, 0],
-    end_token=end_token,
-    top_k=20,
-    softmax_temperature=0.7)
-
-output, output_length = decoder(
-    context=ctx,
-    context_sequence_length=ctx_len,
-    helper=topk_helper)
-```
-
-**Ex. Use 3): Fine-tuning for conditional generation**
-
-```python
-tgt_embed = word_embedder(truth_target[:, :-1]) + pos_embedder(sequence_length=tgt_len - 1)
-
-output = decoder(
-    memory=source_hidden_states,
-    memory_sequence_length=src_len,
-    inputs=tgt_embed,
-    decoding_strategy='train_greedy')  # teacher-forcing decoding
-
-loss = tx.losses.sequence_sparse_softmax_cross_entropy(
-    labels=truth_target[:, 1:],
-    logits=output.logits,
-    sequence_length=tgt_len - 1)
-```
diff --git a/examples/gpt-2/configs/README.md b/examples/gpt-2/configs/README.md
deleted file mode 100644
index 5ac452cc..00000000
--- a/examples/gpt-2/configs/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-### Configuration files of GPT-2 models in Texar style.
-
-For example, `config_model_117M.py` and `config_model_345M.py` are the Texar configuration files corresponding to the `model_117M` and `model_345M` models downloaded from the [GPT-2 official release](https://github.com/openai/gpt-2).
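As a reference for how such a config module is consumed, here is a minimal sketch mirroring the model-building code in `gpt2_generate_main.py` (assuming it is run from the example directory):

```python
import importlib
import tensorflow as tf
import texar.tf as tx

# Load a Texar-style config module, e.g. configs/config_model_117M.py
gpt2_config = importlib.import_module("configs.config_model_117M")

word_embedder = tx.modules.WordEmbedder(
    vocab_size=gpt2_config.vocab_size, hparams=gpt2_config.embed)
pos_embedder = tx.modules.PositionEmbedder(
    position_size=gpt2_config.position_size, hparams=gpt2_config.pos_embed)

# The output layer is tied to the (transposed) input word embedding.
decoder = tx.modules.TransformerDecoder(
    vocab_size=gpt2_config.vocab_size,
    output_layer=tf.transpose(word_embedder.embedding, (1, 0)),
    hparams=gpt2_config.decoder)
```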
diff --git a/examples/gpt-2/configs/__init__.py b/examples/gpt-2/configs/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/gpt-2/configs/config_model_117M.py b/examples/gpt-2/configs/config_model_117M.py deleted file mode 100644 index 6407b37f..00000000 --- a/examples/gpt-2/configs/config_model_117M.py +++ /dev/null @@ -1,55 +0,0 @@ -"""Texar config file of the GPT-2 model_117M model. -""" - -vocab_size = 50257 -dim = 768 - -embed = { - "dim": dim, -} - -pos_embed = { - "dim": dim -} -position_size = 1024 - -decoder = { - "dim": dim, - "num_blocks": 12, - "multihead_attention": { - "use_bias": True, - "num_units": dim, - "num_heads": 12, - "output_dim": dim, - }, - "initializer": { - "type": "variance_scaling_initializer", - "kwargs": { - "scale": 1.0, - "mode": "fan_avg", - "distribution": "uniform", - }, - }, - "poswise_feedforward": { - "layers": [ - { - "type": "Dense", - "kwargs": { - "name": "conv1", - "units": dim * 4, - "activation": "gelu", - "use_bias": True, - } - }, - { - "type": "Dense", - "kwargs": { - "name": "conv2", - "units": dim, - "use_bias": True, - } - } - ], - "name": "ffn", - }, -} diff --git a/examples/gpt-2/configs/config_model_345M.py b/examples/gpt-2/configs/config_model_345M.py deleted file mode 100644 index 449fcee7..00000000 --- a/examples/gpt-2/configs/config_model_345M.py +++ /dev/null @@ -1,55 +0,0 @@ -"""Texar config file of the GPT-2 model_345M model. -""" - -vocab_size = 50257 -dim = 1024 - -embed = { - "dim": dim, -} - -pos_embed = { - "dim": dim -} -position_size = 1024 - -decoder = { - "dim": dim, - "num_blocks": 24, - "multihead_attention": { - "use_bias": True, - "num_units": dim, - "num_heads": 16, - "output_dim": dim, - }, - "initializer": { - "type": "variance_scaling_initializer", - "kwargs": { - "scale": 1.0, - "mode": "fan_avg", - "distribution": "uniform", - }, - }, - "poswise_feedforward": { - "layers": [ - { - "type": "Dense", - "kwargs": { - "name": "conv1", - "units": dim * 4, - "activation": "gelu", - "use_bias": True, - } - }, - { - "type": "Dense", - "kwargs": { - "name": "conv2", - "units": dim, - "use_bias": True, - } - } - ], - "name": "ffn", - }, -} diff --git a/examples/gpt-2/configs/config_train.py b/examples/gpt-2/configs/config_train.py deleted file mode 100644 index 3e81b85e..00000000 --- a/examples/gpt-2/configs/config_train.py +++ /dev/null @@ -1,85 +0,0 @@ -"""Config file for GPT2 training. -""" -# pylint: disable=invalid-name - -tfrecord_data_dir = "data/toy" -max_seq_length = 128 -max_decoding_length = max_seq_length - -train_batch_size = 32 -max_train_epoch = 100 -display_steps = 10 # Print training loss every display_steps; -1 to disable -eval_steps = -1 # Eval on the dev set every eval_steps; -1 to disable -# Checkpoint model parameters every checkpoint_steps; -1 to disable -checkpoint_steps = -1 - -eval_batch_size = 8 -test_batch_size = 8 - -# Optimization configs - -opt = { - 'optimizer': { - 'type': 'AdamOptimizer', - 'kwargs': { - 'learning_rate': 0.001 - } - } -} - -# Data configs - -feature_original_types = { - # Reading features from TFRecord data file. - # E.g., Reading feature "text_ids" as dtype `tf.int64`; - # "FixedLenFeature" indicates its length is fixed for all data instances; - # and the sequence length is limited by `max_seq_length`. - "text_ids": ["tf.int64", "FixedLenFeature", max_seq_length], - "length": ["tf.int64", "FixedLenFeature"] -} -feature_convert_types = { - # Converting feature dtype after reading. 
E.g.,
-    # Converting the dtype of feature "text_ids" from `tf.int64` (as above)
-    # to `tf.int32`
-    "text_ids": "tf.int32",
-    "length": "tf.int32"
-}
-
-train_hparam = {
-    "allow_smaller_final_batch": False,
-    "batch_size": train_batch_size,
-    "dataset": {
-        "data_name": "data",
-        "feature_original_types": feature_original_types,
-        "feature_convert_types": feature_convert_types,
-        "files": "{}/train.tf_record".format(tfrecord_data_dir)
-    },
-    "shuffle": True,
-    "shuffle_buffer_size": 1000
-}
-
-dev_hparam = {
-    "allow_smaller_final_batch": True,
-    "batch_size": eval_batch_size,
-    "dataset": {
-        "data_name": "data",
-        "feature_original_types": feature_original_types,
-        "feature_convert_types": feature_convert_types,
-        "files": "{}/dev.tf_record".format(tfrecord_data_dir)
-    },
-    "shuffle": False
-}
-
-# Set `test_hparam` to `None` if generating from scratch
-# (instead of generating continuations) at test time
-test_hparam = {
-    "allow_smaller_final_batch": True,
-    "batch_size": test_batch_size,
-    "dataset": {
-        "data_name": "data",
-        "feature_original_types": feature_original_types,
-        "feature_convert_types": feature_convert_types,
-        "files": "{}/test.tf_record".format(tfrecord_data_dir)
-    },
-    "shuffle": False
-}
diff --git a/examples/gpt-2/data/toy/dev.txt b/examples/gpt-2/data/toy/dev.txt
deleted file mode 100644
index 8f20749e..00000000
--- a/examples/gpt-2/data/toy/dev.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-it 's a charming and often affecting journey .
-unflinchingly bleak and desperate
-allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker .
-the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales .
-it 's slow -- very , very slow .
-although laced with humor and a few fanciful touches , the film is a refreshingly serious look at young women .
-a sometimes tedious film .
-or doing last year 's taxes with your ex-wife .
-you do n't have to know about music to appreciate the film 's easygoing blend of comedy and romance .
-in exactly 89 minutes , most of which passed as slowly as if i 'd been sitting naked on an igloo , formula 51 sank from quirky to jerky to utter turkey .
-the mesmerizing performances of the leads keep the film grounded and keep the audience riveted .
-it takes a strange kind of laziness to waste the talents of robert forster , anne meara , eugene levy , and reginald veljohnson all in the same movie .
-... the film suffers from a lack of humor ( something needed to balance out the violence ) ...
-we root for ( clara and paul ) , even like them , though perhaps it 's an emotion closer to pity .
-even horror fans will most likely not find what they 're seeking with trouble every day ; the movie lacks both thrills and humor .
-a gorgeous , high-spirited musical from india that exquisitely blends music , dance , song , and high drama .
-the emotions are raw and will strike a nerve with anyone who 's ever had family trauma .
-audrey tatou has a knack for picking roles that magnify her outrageous charm , and in this literate french comedy , she 's as morning-glory exuberant as she was in amélie .
-... the movie is just a plain old monster .
-in its best moments , resembles a bad high school production of grease , without benefit of song .
diff --git a/examples/gpt-2/data/toy/test.txt b/examples/gpt-2/data/toy/test.txt deleted file mode 100644 index dfb5455f..00000000 --- a/examples/gpt-2/data/toy/test.txt +++ /dev/null @@ -1,20 +0,0 @@ -uneasy mishmash of styles and genres . -this film 's relationship to actual tension is the same as what christmas-tree flocking in a spray can is to actual snow : a poor -- if durable -- imitation . -by the end of no such thing the audience , like beatrice , has a watchful affection for the monster . -director rob marshall went out gunning to make a great one . -lathan and diggs have considerable personal charm , and their screen rapport makes the old story seem new . -a well-made and often lovely depiction of the mysteries of friendship . -none of this violates the letter of behan 's book , but missing is its spirit , its ribald , full-throated humor . -although it bangs a very cliched drum at times , this crowd-pleaser 's fresh dialogue , energetic music , and good-natured spunk are often infectious . -it is not a mass-market entertainment but an uncompromising attempt by one artist to think about another . -this is junk food cinema at its greasiest . -it 's also heavy-handed and devotes too much time to bigoted views . -it helps that lil bow wow ... tones down his pint-sized gangsta act to play someone who resembles a real kid . -watching the film is like reading a times portrait of grief that keeps shifting focus to the journalist who wrote it . -moore 's performance impresses almost as much as her work with haynes in 1995 's safe . -reinforces the talents of screenwriter charlie kaufman , creator of adaptation and being john malkovich . -now trimmed by about 20 minutes , this lavish three-year-old production has enough grandeur and scale to satisfy as grown-up escapism . -a journey through memory , a celebration of living , and a sobering rumination on fatality , classism , and ignorance . -a remarkable 179-minute meditation on the nature of revolution . -waydowntown is by no means a perfect film , but its boasts a huge charm factor and smacks of originality . -it 's just incredibly dull . diff --git a/examples/gpt-2/data/toy/train.txt b/examples/gpt-2/data/toy/train.txt deleted file mode 100644 index a77223c2..00000000 --- a/examples/gpt-2/data/toy/train.txt +++ /dev/null @@ -1,50 +0,0 @@ -hide new secretions from the parental units -contains no wit , only labored gags -that loves its characters and communicates something rather beautiful about human nature -remains utterly satisfied to remain the same throughout -on the worst revenge-of-the-nerds clichés the filmmakers could dredge up -that 's far too tragic to merit such superficial treatment -demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small , personal film with an emotional wallop . -of saucy -a depressed fifteen-year-old 's suicidal poetry -are more deeply thought through than in most ` right-thinking ' films -goes to absurd lengths -for those moviegoers who complain that ` they do n't make movies like they used to anymore -the part where nothing 's happening , -saw how bad this movie was -lend some dignity to a dumb story -the greatest musicians -cold movie -with his usual intelligence and subtlety -redundant concept -swimming is above all about a young woman 's face , and by casting an actress whose face projects that woman 's doubts and yearnings , it succeeds . 
-equals the original and in some ways even betters it -if anything , see it for karen black , who camps up a storm as a fringe feminist conspiracy theorist named dirty dick . -a smile on your face -comes from the brave , uninhibited performances -excruciatingly unfunny and pitifully unromantic -enriched by an imaginatively mixed cast of antic spirits -which half of dragonfly is worse : the part where nothing 's happening , or the part where something 's happening -in world cinema -very good viewing alternative -the plot is nothing but boilerplate clichés from start to finish , -the action is stilted -on all cylinders -will find little of interest in this film , which is often preachy and poorly acted -by far the worst movie of the year -sit through , -more than another `` best man '' clone by weaving a theme throughout this funny film -it 's about issues most adults have to face in marriage and i think that 's what i liked about it -- the real issues tucked between the silly and crude storyline -heroes -oblivious to the existence of this film -sharply -the entire point of a shaggy dog story , of course , is that it goes nowhere , and this is classic nowheresville in every sense . -sometimes dry -as they come , already having been recycled more times than i 'd care to count -covers this territory with wit and originality , suggesting that with his fourth feature -a $ 40 million version of a game -gorgeous and deceptively minimalist -cross swords with the best of them and -as a fringe feminist conspiracy theorist -proves once again he has n't lost his touch , bringing off a superb performance in an admittedly middling film . -disappointments diff --git a/examples/gpt-2/gpt2_generate_main.py b/examples/gpt-2/gpt2_generate_main.py deleted file mode 100644 index 011a2e80..00000000 --- a/examples/gpt-2/gpt2_generate_main.py +++ /dev/null @@ -1,227 +0,0 @@ -# Copyright 2019 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Example of building OpenAI GPT-2 language model for sample generation. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import importlib -import numpy as np -import tensorflow as tf -import texar.tf as tx - -from utils import model_utils, processor - -# pylint: disable=invalid-name, too-many-locals, too-many-statements, no-member -# pylint: disable=too-many-branches - -flags = tf.flags - -FLAGS = flags.FLAGS - -flags.DEFINE_string("checkpoint", None, - "Model checkpoint to load model weights from. Use " - "`--pretrain_checkpoint` instead if loading OpenAI " - "pretrained checkpoint.") -flags.DEFINE_string("pretrain_checkpoint", - "gpt2_pretrained_models/model_117M/model.ckpt", - "OpenAI pretrained model checkpoint. 
Ignored if " - "'--checkpoint' is specified.") -flags.DEFINE_string("pretrain_model_dir", "gpt2_pretrained_models/model_117M", - "The directory of pretrained model, for loading " - "vocabuary, etc.") -flags.DEFINE_integer("seed", None, "Random seed.") -flags.DEFINE_integer("nsamples", 1, "The number of samples per input.") -flags.DEFINE_integer("batch_size", 1, "The batch size of input.") -flags.DEFINE_integer("max_decoding_length", 100, - "The maximun length of generated text.") -flags.DEFINE_float("temperature", 0.7, - "Softmax temperature for top-k sample decoding. Must be " - "strictly greater than 0. Defaults to 0.7.") -flags.DEFINE_integer("top_k", 40, - "The number of top most likely candidates from a vocab " - "distribution.") -flags.DEFINE_boolean("is_interactive", False, "Interactive mode or not.") -flags.DEFINE_string("config_type", "texar", - "The configuration file type. Set to 'json' if the GPT-2 " - "config file is in the same type of the official GPT-2 " - "config file. Set to 'texar' if GPT-2 config file is in " - "Texar type.") -flags.DEFINE_string("config_model", "configs.config_model_117M", - "The model configuration file to configure the model. " - "The config file type is define by the 'config_type'," - "it be of texar type or json type." - "For '--config_type=json', set the json config file path" - "like: '--config_model gpt2_pretrained_models/model_117M/" - "hparams.json';" - "For '--config_type=texar', set the texar config file " - "like: '--config_model configs.config_model_117M'.") - - -def main(_): - """ - Builds the model and runs - """ - np.random.seed(FLAGS.seed) - tf.set_random_seed(FLAGS.seed) - - nsamples = FLAGS.nsamples - batch_size = FLAGS.batch_size - max_decoding_length = FLAGS.max_decoding_length - - # Load GPT-2 model configuration - if FLAGS.config_type == "json": - gpt2_config = model_utils.transform_gpt2_to_texar_config( - FLAGS.config_model) - elif FLAGS.config_type == "texar": - gpt2_config = importlib.import_module( - FLAGS.config_model) - else: - raise ValueError("Unknown config_type.") - - assert max_decoding_length <= gpt2_config.position_size, ( - "max_decoding_length should not be greater than position size") - assert nsamples % batch_size == 0, ( - "nsamples must be dividable by batch_size") - - # Create a data pre-processor for, e.g., BPE encoding - proc = processor.get_encoder( - FLAGS.pretrain_model_dir) - - context = tf.placeholder(tf.int32, [batch_size, None]) - context_length = tf.placeholder(tf.int32, [batch_size]) - - end_token = proc.encoder["<|endoftext|>"] - if FLAGS.is_interactive: - start_tokens = context[:, 0] - else: - start_tokens = tf.fill([batch_size], end_token) - - # Build the GPT-2 model - word_embedder = tx.modules.WordEmbedder( - vocab_size=gpt2_config.vocab_size, - hparams=gpt2_config.embed) - - pos_embedder = tx.modules.PositionEmbedder( - position_size=gpt2_config.position_size, - hparams=gpt2_config.pos_embed) - - def _embedding_fn(x, y): - # `x` is token ids, `y` is time steps - return word_embedder(x) + pos_embedder(y) - - helper = tx.modules.TopKSampleEmbeddingHelper( - embedding=_embedding_fn, - start_tokens=start_tokens, - end_token=end_token, - top_k=FLAGS.top_k, - softmax_temperature=FLAGS.temperature) - output_layer = tf.transpose(word_embedder.embedding, (1, 0)) - - decoder = tx.modules.TransformerDecoder( - vocab_size=gpt2_config.vocab_size, - output_layer=output_layer, - hparams=gpt2_config.decoder) - - with tf.Session() as sess: - - if FLAGS.is_interactive: - # Generate continuations of context - 
-            lm_output, _ = decoder(
-                context=context,
-                context_sequence_length=context_length,
-                max_decoding_length=max_decoding_length,
-                helper=helper,
-                mode=tf.estimator.ModeKeys.PREDICT)
-
-            # Load model checkpoint
-            if FLAGS.checkpoint:
-                tf.logging.info("Restore from {}".format(FLAGS.checkpoint))
-                saver = tf.train.Saver()
-                saver.restore(sess, FLAGS.checkpoint)
-            elif FLAGS.pretrain_checkpoint:
-                model_utils.init_gpt2_checkpoint(
-                    sess, FLAGS.pretrain_checkpoint)
-                print("\nFinished loading\n")
-
-            # Enter interactive mode
-            while True:
-
-                raw_text = input("Model input >>> ")
-
-                while not raw_text:
-                    print("Input should not be empty!")
-                    raw_text = input("Model input >>> ")
-
-                context_tokens = proc.encode(raw_text)
-
-                feed_dict = {
-                    context: [context_tokens for _ in range(batch_size)],
-                    context_length:
-                        [len(context_tokens) for _ in range(batch_size)],
-                    tx.context.global_mode(): tf.estimator.ModeKeys.PREDICT
-                }
-                generated = 0
-                for _ in range(nsamples // batch_size):
-
-                    output = sess.run(lm_output, feed_dict=feed_dict)
-
-                    sample_id = output.sample_id
-                    for i in range(batch_size):
-
-                        generated += 1
-                        print("=" * 40 +
-                              " SAMPLE " + str(generated) + " " + "=" * 40)
-                        si = sample_id[i][len(context_tokens):]
-                        print(proc.decode(si))
-                print("=" * 80)
-        else:
-            # Generate samples from scratch
-            lm_output, _ = decoder(
-                max_decoding_length=max_decoding_length,
-                helper=helper,
-                mode=tf.estimator.ModeKeys.PREDICT)
-
-            # Load model checkpoint
-            if FLAGS.checkpoint:
-                tf.logging.info("Restore from {}".format(FLAGS.checkpoint))
-                saver = tf.train.Saver()
-                saver.restore(sess, FLAGS.checkpoint)
-            elif FLAGS.pretrain_checkpoint:
-                model_utils.init_gpt2_checkpoint(
-                    sess, FLAGS.pretrain_checkpoint)
-                print("\nFinished loading\n")
-
-            feed_dict = {
-                tx.context.global_mode(): tf.estimator.ModeKeys.PREDICT
-            }
-            generated = 0
-            while nsamples == 0 or generated < nsamples:
-
-                output = sess.run(lm_output, feed_dict=feed_dict)
-
-                sample_id = output.sample_id
-                for i in range(batch_size):
-
-                    # Count each generated sample once
-                    generated += 1
-                    text = proc.decode(sample_id[i])
-                    print("=" * 40 +
-                          " SAMPLE " + str(generated) + " " + "=" * 40)
-                    print(text)
-
-
-if __name__ == "__main__":
-    tf.app.run()
diff --git a/examples/gpt-2/gpt2_pretrained_models/download_model.sh b/examples/gpt-2/gpt2_pretrained_models/download_model.sh
deleted file mode 100644
index c7dfa07c..00000000
--- a/examples/gpt-2/gpt2_pretrained_models/download_model.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/sh
-
-if [ "$#" -ne 1 ]; then
-    echo "You must enter the model name as a parameter, e.g.: sh gpt2_pretrained_models/download_model.sh model_117M"
-    exit 1
-fi
-
-model=$1
-model_name=${model#*_}
-mkdir -p gpt2_pretrained_models/$model
-for filename in checkpoint encoder.json hparams.json model.ckpt.data-00000-of-00001 model.ckpt.index model.ckpt.meta vocab.bpe; do
-    fetch=$model_name/$filename
-    sub_path=$model/$filename
-    echo "Fetching $fetch"
-    curl --output gpt2_pretrained_models/$sub_path https://storage.googleapis.com/gpt-2/models/$fetch
-done
diff --git a/examples/gpt-2/gpt2_train_main.py b/examples/gpt-2/gpt2_train_main.py
deleted file mode 100644
index c877acce..00000000
--- a/examples/gpt-2/gpt2_train_main.py
+++ /dev/null
@@ -1,387 +0,0 @@
-# Copyright 2019 The Texar Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Example of fine-tuning OpenAI GPT-2 language model.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import importlib
-import tensorflow as tf
-import texar.tf as tx
-
-from utils import model_utils, processor
-
-# pylint: disable=invalid-name, too-many-locals, too-many-statements, no-member
-# pylint: disable=too-many-branches
-
-flags = tf.flags
-
-FLAGS = flags.FLAGS
-
-flags.DEFINE_string("checkpoint", None,
-                    "Model checkpoint to resume training or for test.")
-flags.DEFINE_string("pretrain_checkpoint",
-                    "gpt2_pretrained_models/model_117M/model.ckpt",
-                    "OpenAI pretrained model checkpoint. Ignored if "
-                    "'--checkpoint' is specified.")
-flags.DEFINE_string("pretrain_model_dir", "gpt2_pretrained_models/model_117M",
-                    "The directory of the pretrained model, for loading the "
-                    "vocabulary, etc.")
-flags.DEFINE_float("temperature", 0.7,
-                   "Softmax temperature for top-k sample decoding. Must be "
-                   "strictly greater than 0. Defaults to 0.7.")
-flags.DEFINE_integer("top_k", 40,
-                     "The number of top most likely candidates from a vocab "
-                     "distribution.")
-flags.DEFINE_string("config_train", "configs.config_train",
-                    "Configurations of GPT-2 training, including data and "
-                    "optimization hyperparameters.")
-flags.DEFINE_string("config_type", "texar",
-                    "The configuration file type. Set to 'json' if the GPT-2 "
-                    "config file is in the same format as the official GPT-2 "
-                    "config file. Set to 'texar' if the GPT-2 config file is "
-                    "in Texar format.")
-flags.DEFINE_string("config_model", "configs.config_model_117M",
-                    "The model configuration file to configure the model. "
-                    "The config file type is defined by 'config_type'; "
-                    "it can be of texar type or json type. "
- "For '--config_type=json', set the json config file path" - "like: '--config_model gpt2_pretrained_models/model_117M/" - "hparams.json';" - "For '--config_type=texar', set the texar config file " - "like: '--config_model configs.config_model_117M'.") -flags.DEFINE_string("output_dir", "output/", - "The output directory where the model checkpoints will be " - "written.") -flags.DEFINE_bool("do_train", False, "Whether to run training.") -flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") -flags.DEFINE_bool("do_test", False, "Whether to run test on the test set.") -flags.DEFINE_bool("distributed", False, "Whether to run in distributed mode.") - -config_train = importlib.import_module(FLAGS.config_train) - - -def main(_): - """ - Builds the model and runs - """ - if FLAGS.distributed: - import horovod.tensorflow as hvd - hvd.init() - - tf.logging.set_verbosity(tf.logging.INFO) - - # Loads GPT-2 model configuration - - if FLAGS.config_type == "json": - gpt2_config = model_utils.transform_gpt2_to_texar_config( - FLAGS.config_model) - elif FLAGS.config_type == 'texar': - gpt2_config = importlib.import_module( - FLAGS.config_model) - else: - raise ValueError('Unknown config_type.') - - # Creates a data pre-processor for, e.g., BPE encoding - proc = processor.get_encoder(FLAGS.pretrain_model_dir) - - max_decoding_length = config_train.max_decoding_length - assert max_decoding_length <= gpt2_config.position_size, ( - "max_decoding_length should not be greater than position_size. " - "{}>{}".format(max_decoding_length, gpt2_config.position_size)) - - # Loads data - - # Configures training data shard in distributed mode - if FLAGS.distributed: - config_train.train_hparam["dataset"]["num_shards"] = hvd.size() - config_train.train_hparam["dataset"]["shard_id"] = hvd.rank() - config_train.train_hparam["batch_size"] //= hvd.size() - - datasets = {} - if FLAGS.do_train: - train_dataset = tx.data.TFRecordData(hparams=config_train.train_hparam) - datasets['train'] = train_dataset - if FLAGS.do_eval: - dev_dataset = tx.data.TFRecordData(hparams=config_train.dev_hparam) - datasets['dev'] = dev_dataset - if FLAGS.do_test: - test_dataset = tx.data.TFRecordData(hparams=config_train.test_hparam) - datasets['test'] = test_dataset - iterator = tx.data.FeedableDataIterator(datasets) - batch = iterator.get_next() - batch_size = tf.shape(batch['text_ids'])[0] - - # Builds the GPT-2 model - - word_embedder = tx.modules.WordEmbedder( - vocab_size=gpt2_config.vocab_size, - hparams=gpt2_config.embed) - - pos_embedder = tx.modules.PositionEmbedder( - position_size=gpt2_config.position_size, - hparams=gpt2_config.pos_embed) - - # Ties output layer with input word embedding - output_layer = tf.transpose(word_embedder.embedding, (1, 0)) - - decoder = tx.modules.TransformerDecoder( - vocab_size=gpt2_config.vocab_size, - output_layer=output_layer, - hparams=gpt2_config.decoder) - - # For training - seq_len = tf.fill([batch_size], tf.shape(batch['text_ids'])[1]) - pos_embeds = pos_embedder(sequence_length=seq_len) - input_embeds = word_embedder(batch['text_ids']) + pos_embeds - - outputs = decoder(inputs=input_embeds, decoding_strategy='train_greedy') - - loss = tx.losses.sequence_sparse_softmax_cross_entropy( - labels=batch['text_ids'][:, 1:], - logits=outputs.logits[:, :-1, :], - sequence_length=batch['length'] - 1, - average_across_timesteps=True, - sum_over_timesteps=False) - ppl = tf.exp(loss) - - global_step = tf.Variable(0, trainable=False) - opt = tx.core.get_optimizer( - global_step=global_step, 
- hparams=config_train.opt) - - if FLAGS.distributed: - opt = hvd.DistributedOptimizer(opt) - - train_op = tf.contrib.layers.optimize_loss( - loss=loss, - global_step=global_step, - learning_rate=None, - optimizer=opt) - - # For generation: generates continuations of test text - def _embedding_fn(x, y): - # `x` is token ids, `y` is time steps - return word_embedder(x) + pos_embedder(y) - - end_token = proc.encoder['<|endoftext|>'] - start_tokens = batch['text_ids'][:, 0] - helper = tx.modules.TopKSampleEmbeddingHelper( - embedding=_embedding_fn, - start_tokens=start_tokens, - end_token=end_token, - top_k=FLAGS.top_k, - softmax_temperature=FLAGS.temperature) - - outputs_infer, _ = decoder( - context=batch['text_ids'], - context_sequence_length=batch['length'], - max_decoding_length=max_decoding_length, - helper=helper) - sample_id = outputs_infer.sample_id - - # Train/eval/test routine - saver = tf.train.Saver() - saver_best = tf.train.Saver(max_to_keep=1) - dev_best = {'loss': 1e8, 'ppl': 1e8} - - def _is_head(): - if not FLAGS.distributed: - return True - else: - return hvd.rank() == 0 - - def _train_epoch(sess): - """Trains on the training set, and evaluates on the dev set - periodically. - """ - iterator.restart_dataset(sess, 'train') - - fetches = { - 'loss': train_op, - 'step': global_step - } - - while True: - try: - feed_dict = { - iterator.handle: iterator.get_handle(sess, 'train'), - tx.global_mode(): tf.estimator.ModeKeys.TRAIN, - } - rets = sess.run(fetches, feed_dict) - step = rets['step'] - - dis_steps = config_train.display_steps - if _is_head() and dis_steps > 0 and step % dis_steps == 0: - tf.logging.info('step:%d; loss:%f' % (step, rets['loss'])) - - eval_steps = config_train.eval_steps - if _is_head() and eval_steps > 0 and step % eval_steps == 0: - _dev_epoch(sess) - - ckpt_steps = config_train.checkpoint_steps - if _is_head() and ckpt_steps > 0 and step % ckpt_steps == 0: - ckpt_fn = os.path.join(FLAGS.output_dir, 'model.ckpt') - ckpt_fn = saver.save(sess, ckpt_fn, global_step=step) - tf.logging.info('Checkpoint to {}'.format(ckpt_fn)) - - except tf.errors.OutOfRangeError: - break - - def _dev_epoch(sess): - """Evaluates on the dev set. - """ - iterator.restart_dataset(sess, 'dev') - - cum_loss = 0. - cum_ppl = 0. - nsamples = 0 - fetches = { - 'loss': loss, - 'ppl': ppl, - 'batch_size': batch_size, - } - while True: - try: - feed_dict = { - iterator.handle: iterator.get_handle(sess, 'dev'), - tx.context.global_mode(): tf.estimator.ModeKeys.EVAL, - } - rets = sess.run(fetches, feed_dict) - - cum_loss += rets['loss'] * rets['batch_size'] - cum_ppl += rets['ppl'] * rets['batch_size'] - nsamples += rets['batch_size'] - except tf.errors.OutOfRangeError: - break - - avg_loss = cum_loss / nsamples - avg_ppl = cum_ppl / nsamples - tf.logging.info('dev loss: {}; ppl: {}; nsamples: {}'.format( - avg_loss, avg_ppl, nsamples)) - - if FLAGS.do_train and avg_loss < dev_best['loss']: - dev_best['loss'] = avg_loss - dev_best['ppl'] = avg_ppl - ckpt_fn = os.path.join(FLAGS.output_dir, 'model_best.ckpt') - ckpt_fn = saver_best.save(sess, ckpt_fn) - tf.logging.info('Checkpoint best to {}'.format(ckpt_fn)) - - def _test_epoch(sess): - """Generates samples on the test set. 
- """ - iterator.restart_dataset(sess, 'test') - - _all_inputs = [] - _all_samples = [] - fetches = { - 'inputs': batch['text_ids'], - 'length': batch['length'], - 'samples': sample_id - } - while True: - try: - feed_dict = { - iterator.handle: iterator.get_handle(sess, 'test'), - tx.context.global_mode(): tf.estimator.ModeKeys.PREDICT, - } - rets = sess.run(fetches, feed_dict=feed_dict) - - _inputs = [] - for i, l in zip(rets['inputs'], rets['length']): - # Delete padding - _inputs.append(i[:l].tolist()) - _all_inputs.extend(_inputs) - - _samples = [] - for s, l in zip(rets['samples'], rets['length']): - # Delete inputs from samples - _samples.append(s[l:].tolist()) - _all_samples.extend(_samples) - - except tf.errors.OutOfRangeError: - break - - # Parse samples and write to file - - eos_token_id = proc.encoder['<|endoftext|>'] - - _all_input_text = [] - for i in _all_inputs: - if i[0] == eos_token_id: - # '<|endoftext|>' is used as the BOS token. Delete it here - i = i[1:] - i_text = proc.decode(i) - _all_input_text.append(i_text) - # '<|endoftext|>' is used as the PAD token. Delete them here - _all_input_text = tx.utils.strip_eos(_all_input_text, - eos_token='<|endoftext|>') - - _all_samples_text = [] - for i, s in zip(_all_inputs, _all_samples): - s_text = proc.decode(s) - s_text = s_text.replace('\n', ' ') - _all_samples_text.append(s_text) - _all_samples_text = tx.utils.strip_eos(_all_samples_text, - eos_token='<|endoftext|>') - - output_file = os.path.join(FLAGS.output_dir, "test_samples.tsv") - tf.logging.info('Write samples to {}'.format(output_file)) - tx.utils.write_paired_text( - _all_input_text, _all_samples_text, output_file) - - # Broadcasts global variables from rank-0 process - if FLAGS.distributed: - bcast = hvd.broadcast_global_variables(0) - - session_config = tf.ConfigProto() - if FLAGS.distributed: - session_config.gpu_options.visible_device_list = str(hvd.local_rank()) - - with tf.Session(config=session_config) as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - - if FLAGS.distributed: - bcast.run() - - # Restores trained model if specified - if FLAGS.checkpoint: - tf.logging.info('Restore from {}'.format(FLAGS.checkpoint)) - saver.restore(sess, FLAGS.checkpoint) - elif FLAGS.pretrain_checkpoint: - tf.logging.info('Restore from {}'.format(FLAGS.pretrain_checkpoint)) - model_utils.init_gpt2_checkpoint(sess, FLAGS.pretrain_checkpoint) - print("\nFinished loading\n") - - iterator.initialize_dataset(sess) - - if FLAGS.do_train: - for _ in range(config_train.max_train_epoch): - _train_epoch(sess) - saver.save(sess, FLAGS.output_dir + '/model.ckpt') - - if FLAGS.do_eval: - _dev_epoch(sess) - - if FLAGS.do_test: - _test_epoch(sess) - - -if __name__ == "__main__": - tf.app.run() diff --git a/examples/gpt-2/prepare_data.py b/examples/gpt-2/prepare_data.py deleted file mode 100644 index 9ce6fd7f..00000000 --- a/examples/gpt-2/prepare_data.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright 2019 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Preprocesses raw data and produces TFRecord files.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-import texar.tf as tx
-
-from utils import data_utils, processor
-
-# pylint: disable=invalid-name, too-many-locals, too-many-statements
-
-flags = tf.flags
-
-FLAGS = flags.FLAGS
-flags.DEFINE_string(
-    "data_dir", 'data/toy',
-    "The directory of raw data, wherein data files must be named as "
-    "'train.txt', 'dev.txt', or 'test.txt'.")
-flags.DEFINE_integer(
-    "max_seq_length", 128,
-    "The maximum sequence length; longer sequences will be truncated.")
-flags.DEFINE_string(
-    "tfrecord_output_dir", None,
-    "The output directory where the TFRecord files will be generated. "
-    "By default it is set to be the same as `--data_dir`.")
-flags.DEFINE_string(
-    "pretrain_model_dir", "gpt2_pretrained_models/model_117M",
-    "The directory of pretrained model.")
-
-
-tf.logging.set_verbosity(tf.logging.INFO)
-
-
-def prepare_data():
-    """
-    Preprocesses raw data and produces the TFRecord files.
-    """
-    data_dir = FLAGS.data_dir
-    if FLAGS.tfrecord_output_dir is None:
-        tfrecord_output_dir = data_dir
-    else:
-        tfrecord_output_dir = FLAGS.tfrecord_output_dir
-    tx.utils.maybe_create_dir(tfrecord_output_dir)
-
-    # Creates a data pre-processor for, e.g., BPE encoding
-    proc = processor.get_encoder(FLAGS.pretrain_model_dir)
-
-    # Produces TFRecord files
-    data_utils.prepare_TFRecord_data(
-        data_dir=data_dir,
-        max_seq_length=FLAGS.max_seq_length,
-        encoder=proc,
-        output_dir=tfrecord_output_dir)
-
-
-def main():
-    """Data preparation.
-    """
-    prepare_data()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/gpt-2/requirements.txt b/examples/gpt-2/requirements.txt
deleted file mode 100644
index be95e5af..00000000
--- a/examples/gpt-2/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-tensorflow>=1.12
-regex==2017.4.5
diff --git a/examples/gpt-2/utils/__init__.py b/examples/gpt-2/utils/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/examples/gpt-2/utils/data_utils.py b/examples/gpt-2/utils/data_utils.py
deleted file mode 100644
index 416e00f6..00000000
--- a/examples/gpt-2/utils/data_utils.py
+++ /dev/null
@@ -1,125 +0,0 @@
-# Copyright 2019 The Texar Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Utils of data preprocessing for GPT2 training.
-"""
-
-import os
-import collections
-import tensorflow as tf
-
-# pylint: disable=invalid-name, too-many-arguments
-
-
-def process_single_text(raw_text, max_seq_length, encoder,
-                        BOS_token, EOS_token, PAD_token):
-    """Processes a single piece of text. Performs BPE encoding,
-    converting to indexes, truncation, and padding, etc.
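-
-    Returns:
-        A `(token_ids, length)` pair, where `length` is the number of
-        tokens before padding.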
- """ - # BPE - tokens = encoder.encode(raw_text) - - # Truncate - max_len = max_seq_length - if BOS_token is not None and len(BOS_token) > 0: - max_len -= 1 - if EOS_token is not None and len(EOS_token) > 0: - max_len -= 1 - tokens = tokens[:max_len] - - # Append special tokens - if BOS_token is not None and len(BOS_token) > 0: - tokens = [encoder.encoder[BOS_token]] + tokens - if EOS_token is not None and len(EOS_token) > 0: - tokens = tokens + [encoder.encoder[EOS_token]] - - token_length = len(tokens) - - # Pad - PAD_token_id = encoder.encoder[PAD_token] - while len(tokens) < max_seq_length: - tokens.append(PAD_token_id) - - assert len(tokens) == max_seq_length - - return tokens, token_length - - -def read_raw_data(data_fn): - """ - Reads raw data from a file. Each line contains one example. - """ - examples = [] - with open(data_fn, "r") as fin: - for line in fin: - examples.append(line.strip()) - return examples - - -def file_based_convert_examples_to_features( - examples, max_seq_length, encoder, output_file, - BOS_token="<|endoftext|>", EOS_token="<|endoftext|>", - PAD_token="<|endoftext|>"): - """Converts a set of examples to a TFRecord file.""" - - writer = tf.python_io.TFRecordWriter(output_file) - - for (_, example) in enumerate(examples): - - text_ids, length = process_single_text( - example, max_seq_length, encoder, BOS_token, EOS_token, PAD_token) - - def _create_int_feature(values): - return tf.train.Feature( - int64_list=tf.train.Int64List(value=list(values))) - - features = collections.OrderedDict() - features["text_ids"] = _create_int_feature(text_ids) - features["length"] = _create_int_feature([length]) - - tf_example = tf.train.Example( - features=tf.train.Features(feature=features)) - writer.write(tf_example.SerializeToString()) - - -def prepare_TFRecord_data(data_dir, max_seq_length, encoder, output_dir): - """ - Args: - data_dir: The input data directory. - max_seq_length: Max sequence length. - output_dir: The directory to save the TFRecord files in. 
- """ - train_fn = os.path.join(data_dir, "train.txt") - if os.path.isfile(train_fn): - tf.logging.info("Processing %s" % train_fn) - train_examples = read_raw_data(train_fn) - train_file = os.path.join(output_dir, "train.tf_record") - file_based_convert_examples_to_features( - train_examples, max_seq_length, encoder, train_file) - - dev_fn = os.path.join(data_dir, "dev.txt") - if os.path.isfile(dev_fn): - tf.logging.info("Processing %s" % dev_fn) - eval_examples = read_raw_data(dev_fn) - eval_file = os.path.join(output_dir, "dev.tf_record") - file_based_convert_examples_to_features( - eval_examples, max_seq_length, encoder, eval_file) - - test_fn = os.path.join(data_dir, "test.txt") - if os.path.isfile(test_fn): - tf.logging.info("Processing %s" % test_fn) - test_examples = read_raw_data(test_fn) - test_file = os.path.join(output_dir, "test.tf_record") - file_based_convert_examples_to_features( - test_examples, max_seq_length, encoder, test_file, EOS_token=None) diff --git a/examples/gpt-2/utils/model_utils.py b/examples/gpt-2/utils/model_utils.py deleted file mode 100644 index e0aef9c7..00000000 --- a/examples/gpt-2/utils/model_utils.py +++ /dev/null @@ -1,206 +0,0 @@ -""" -Model utility functions -""" -import sys -import json -import tensorflow as tf -import numpy as np -from texar.tf import HParams - - -def transform_gpt2_to_texar_config(input_json_path): - """ - Remap the config file - """ - config_gpt = json.loads(open(input_json_path).read()) - configs = dict() - configs["vocab_size"] = config_gpt["n_vocab"] - configs["context_size"] = config_gpt["n_ctx"] - configs["embedding_size"] = config_gpt["n_embd"] - hidden_dim = config_gpt["n_embd"] - configs["embed"] = { - "dim": hidden_dim, - } - configs["position_size"] = config_gpt["n_ctx"] - configs["pos_embed"] = { - "dim": hidden_dim - } - configs["decoder"] = { - "dim": hidden_dim, - "num_blocks": config_gpt["n_layer"], - "multihead_attention": { - "use_bias": True, - "num_units": hidden_dim, - "num_heads": config_gpt["n_head"], - "output_dim": hidden_dim, - }, - "initializer": { - "type": "variance_scaling_initializer", - "kwargs": { - "scale": 1.0, - "mode": "fan_avg", - "distribution": "uniform", - }, - }, - "poswise_feedforward": { - "layers": [ - { - "type": "Dense", - "kwargs": { - "name": "conv1", - "units": hidden_dim * 4, - "activation": "gelu", - "use_bias": True, - } - }, - { - "type": "Dense", - "kwargs": { - "name": "conv2", - "units": hidden_dim, - "use_bias": True, - } - } - ], - "name": "ffn", - }, - } - return HParams(configs, default_hparams=None) - - -def _map_tensor_names(original_tensor_name): - """ - Tensor name mapping - """ - global_tensor_map = { - "model/wte": "word_embedder/w", - "model/wpe": "position_embedder/w", - "model/ln_f/b": "transformer_decoder/beta", - "model/ln_f/g": "transformer_decoder/gamma", - } - if original_tensor_name in global_tensor_map: - return global_tensor_map[original_tensor_name] - original_tensor_name_split = original_tensor_name.split("/") - layer_tensor_map = { - "ln_1/b": "beta", - "ln_1/g": "gamma", - "ln_2/b": "past_poswise_ln/beta", - "ln_2/g": "past_poswise_ln/gamma", - "mlp/c_fc/b": "ffn/conv1/bias", - "mlp/c_fc/w": "ffn/conv1/kernel", - "mlp/c_proj/b": "ffn/conv2/bias", - "mlp/c_proj/w": "ffn/conv2/kernel", - "attn/c_proj/b": "self_attention/multihead_attention/output/bias", - "attn/c_proj/w": "self_attention/multihead_attention/output/kernel", - } - layer_num = int(original_tensor_name_split[1][1:]) - layer_feature = "/".join(original_tensor_name.split("/")[2:]) - # 
pylint: disable=no-else-return
-    if layer_feature in layer_tensor_map:
-        layer_feature_ = layer_tensor_map[layer_feature]
-        tensor_name_ = "/".join(
-            [
-                "transformer_decoder",
-                "layer_{}".format(layer_num),
-                layer_feature_
-            ])
-        return tensor_name_
-    else:
-        return original_tensor_name
-
-
-# pylint: disable=too-many-locals
-def _get_assignment_map_from_checkpoint(sess, all_variables, init_checkpoint):
-    """
-    Loads pretrained parameters into the Texar model.
-    """
-
-    assignment_map = {}
-
-    reader = tf.train.NewCheckpointReader(init_checkpoint)
-    var_names_list = reader.get_variable_to_shape_map().keys()
-    ckpt_names_vs_vals = {}
-    for var_name in var_names_list:
-        ckpt_names_vs_vals[var_name] = reader.get_tensor(var_name)
-
-    def _assign_by_name(sess, tensor_name, data):
-        local_tensor = [var for var in all_variables
-                        if tensor_name in var.name][0]
-        sess.run(tf.assign(local_tensor, data))
-
-    def _get_tensor_by_name(tensor_name):
-        local_tensor = [var for var in all_variables
-                        if tensor_name in var.name][0]
-        return local_tensor
-
-    for idx, ckpt_tensor_name in enumerate(ckpt_names_vs_vals):
-        processing = (idx + 1.0) / len(ckpt_names_vs_vals.keys())
-        sys.stdout.write("\rLoading checkpoint: {:.1%}".format(processing))
-        sys.stdout.flush()
-
-        ckpt_tensor_name_feature = ""
-        if len(ckpt_tensor_name.split("/")) > 2:
-            ckpt_tensor_name_feature = "/".join(
-                ckpt_tensor_name.split("/")[2:])
-        if ckpt_tensor_name_feature == "attn/c_attn/w":
-            layer_num = int(ckpt_tensor_name.split("/")[1][1:])
-            template = ("transformer_decoder/layer_{}/self_attention/"
-                        "multihead_attention/{}/kernel")
-            local_tensor_name_q_w = template.format(layer_num, "query")
-            local_tensor_name_k_w = template.format(layer_num, "key")
-            local_tensor_name_v_w = template.format(layer_num, "value")
-
-            data = ckpt_names_vs_vals[ckpt_tensor_name]
-            assert data.shape[2] % 3 == 0, ("tensor 'attn/c_attn/w' "
-                                            "shape is not divisible by 3")
-            index_w = data.shape[2] // 3
-            q_w = data[:, :, :index_w]
-            k_w = data[:, :, index_w: 2 * index_w]
-            v_w = data[:, :, 2 * index_w:]
-            _assign_by_name(sess, local_tensor_name_q_w, np.squeeze(q_w))
-            _assign_by_name(sess, local_tensor_name_k_w, np.squeeze(k_w))
-            _assign_by_name(sess, local_tensor_name_v_w, np.squeeze(v_w))
-
-        elif ckpt_tensor_name_feature == "attn/c_attn/b":
-            layer_num = int(ckpt_tensor_name.split("/")[1][1:])
-            template = ("transformer_decoder/layer_{}/self_attention/"
-                        "multihead_attention/{}/bias")
-            local_tensor_name_q_b = template.format(layer_num, "query")
-            local_tensor_name_k_b = template.format(layer_num, "key")
-            local_tensor_name_v_b = template.format(layer_num, "value")
-
-            data = ckpt_names_vs_vals[ckpt_tensor_name]
-            assert data.shape[0] % 3 == 0, ("tensor 'attn/c_attn/b' "
-                                            "shape is not divisible by 3")
-            index_b = data.shape[0] // 3
-            q_b = data[:index_b]
-            k_b = data[index_b: 2 * index_b]
-            v_b = data[2 * index_b:]
-            _assign_by_name(sess, local_tensor_name_q_b, q_b)
-            _assign_by_name(sess, local_tensor_name_k_b, k_b)
-            _assign_by_name(sess, local_tensor_name_v_b, v_b)
-
-        else:
-            local_tensor_name = _map_tensor_names(ckpt_tensor_name)
-            local_tensor = _get_tensor_by_name(local_tensor_name)
-            assignment_map[ckpt_tensor_name] = local_tensor
-
-    return assignment_map
-
-
-def init_gpt2_checkpoint(sess, init_checkpoint):
-    """
-    Initializes GPT-2 model parameters from a checkpoint.
-
-    Args:
-        init_checkpoint (str): Path to the checkpoint.
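-        sess (tf.Session): The session in which the parameters are
-            assigned.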
- """ - tvars = tf.trainable_variables() - if init_checkpoint: - assignment_map = _get_assignment_map_from_checkpoint( - sess, - tvars, - init_checkpoint) - init_fn = tf.contrib.framework.assign_from_checkpoint_fn( - init_checkpoint, assignment_map, reshape_variables=True) - init_fn(sess) diff --git a/examples/gpt-2/utils/processor.py b/examples/gpt-2/utils/processor.py deleted file mode 100644 index 092615ee..00000000 --- a/examples/gpt-2/utils/processor.py +++ /dev/null @@ -1,130 +0,0 @@ -# -*- coding: utf-8 -*- -# -""" -Byte pair encoding utilities - -Adapted from https://github.com/openai/gpt-2/blob/master/src/encoder.py -""" - -import os -import json -import regex as re -from functools import lru_cache - - -@lru_cache() -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a signficant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. - """ - bs = list(range(ord("!"), ord("~") + 1)) + list( - range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8 + n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - - -def get_pairs(word): - """Return set of symbol pairs in a word. - - Word is represented as tuple of symbols (symbols being variable-length strings). - """ - pairs = set() - prev_char = word[0] - for char in word[1:]: - pairs.add((prev_char, char)) - prev_char = char - return pairs - - -class Encoder: - def __init__(self, encoder, bpe_merges, errors="replace"): - self.encoder = encoder - self.decoder = {v: k for k, v in self.encoder.items()} - self.errors = errors # how to handle errors in decoding - self.byte_encoder = bytes_to_unicode() - self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} - self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) - self.cache = {} - - # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions - self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") - - def bpe(self, token): - if token in self.cache: - return self.cache[token] - word = tuple(token) - pairs = get_pairs(word) - - if not pairs: - return token - - while True: - bigram = min(pairs, key=lambda pair: self.bpe_ranks.get( - pair, float("inf"))) - if bigram not in self.bpe_ranks: - break - first, second = bigram - new_word = [] - i = 0 - while i < len(word): - try: - j = word.index(first, i) - new_word.extend(word[i:j]) - i = j - except BaseException: - new_word.extend(word[i:]) - break - - if word[i] == first and i < len(word) - 1 \ - and word[i + 1] == second: - new_word.append(first + second) - i += 2 - else: - new_word.append(word[i]) - i += 1 - new_word = tuple(new_word) - word = new_word - if len(word) == 1: - break - else: - pairs = get_pairs(word) - word = " ".join(word) - self.cache[token] = word - return word - - def encode(self, text): - bpe_tokens = [] - for token in re.findall(self.pat, text): - token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) - 
bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ")) - return bpe_tokens - - def decode(self, tokens): - text = "".join([self.decoder[token] for token in tokens]) - text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) - return text - - -def get_encoder(gpt2_pretrained_path): - with open(os.path.join(gpt2_pretrained_path, "encoder.json"), "r") as f: - encoder = json.load(f) - with open(os.path.join(gpt2_pretrained_path, "vocab.bpe"), "r", encoding="utf-8") as f: - bpe_data = f.read() - bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]] - return Encoder( - encoder=encoder, - bpe_merges=bpe_merges, - ) diff --git a/examples/hierarchical_dialog/.gitignore b/examples/hierarchical_dialog/.gitignore deleted file mode 100644 index 82f0c3ac..00000000 --- a/examples/hierarchical_dialog/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/data/ diff --git a/examples/hierarchical_dialog/README.md b/examples/hierarchical_dialog/README.md deleted file mode 100644 index bf5890e5..00000000 --- a/examples/hierarchical_dialog/README.md +++ /dev/null @@ -1,62 +0,0 @@ -# Hierarchical Recurrent Encoder-Decoder (HRED) Dialogue Model - -This example builds a HRED dialogue model described in [(Serban et al. 2016) Building End-To-End Dialogue Systems Using Generative Hierarchical Neural Network Models](https://arxiv.org/abs/1507.04808). - -The dataset used here is provided by [(Zhao et al. 2017) Learning Discourse-level Diversity for Neural Dialog Models using Conditional Variational Autoencoders](https://arxiv.org/abs/1703.10960), which adapts [switchboard-1 Release 2](https://catalog.ldc.upenn.edu/ldc97s62). In particular, for evaluation purpose, multiple reference responses for each dialog context in the test set are collected through manual annotations. - -This example demonstrates: -* Use of `MultiAlignedData` to read parallel data with multiple fields, e.g., (source, target, meta, ...) -* Use of the `'variable_utterance'` hyperparameter in TextData to read dialog history data. -* Use of the `'embedding_init'` hyperparameter in TextData to read pre-trained word embedding as initialization. -* Use of `HierarchicalRNNEncoder` to encode dialog history with utterance-level and word-level encoding. -* Use of *beam search decoding* and *random sample decoding* at inference time. -* Addition of speaker meta-data in the encoder-decoder model. - -## Usage - -### Dataset - -Download and preprocess the data with the following cmd: -``` -python sw_loader.py -``` -* Train/dev/test sets contain 200K, 5K, 5K examples, respectively. -* Vocab size is 10,000. -* `./data/switchboard/embedding.txt` contains word embeddings extracted from [glove.twitter.27B.200d](https://nlp.stanford.edu/projects/glove). You can also directly use the original glove.twitter.27B.200d file, and the Texar TextData module will automatically extract relevant embeddings for the vocabulary. - -### Train the model - -To train the model, run - -``` -python hred.py --config_data config_data --config_model config_model_biminor -``` -Evaluation will be performed after each epoch. - -Here: -* `--config_data` specifies the data configuration. -* `--config_model` specifies the model configuration. Note not to include the `.py` suffix. 
Two configs are provided:
- - [biminor.py](./config_model_biminor.py) uses a bi-directional RNN as the word-level (minor-level) encoder
- - [uniminor.py](./config_model_uniminor.py) uses a uni-directional RNN as the word-level (minor-level) encoder
-
-Both configs use a uni-directional RNN for the utterance-level (major-level) encoder.
-
-## Results
-
-The table shows results of perplexity and BLEU after 10 epochs, comparing with the results of [(Zhao et al. 2017)](https://arxiv.org/abs/1703.10960) (see "Baseline" in Table 1 of the paper). Note that:
-* We report results of random sample decoding, which performs slightly better than beam search decoding.
-* `num_samples` is the number of samples generated for each test instance (for computing precision and recall of BLEU). See Sec. 5.2 of the paper for the definition of the metrics.
-* (Zhao et al. 2017) uses more meta data besides the speaker meta-data used here.
-* Results may vary a bit due to randomness.
-
-| | biminor, num_samples=10 | biminor, num_samples=5 | Zhao et al., num_samples=5 |
-| --------------| ---------------| --------------| --------------|
-| Perplexity | 23.79 | 24.26 | 35.4 |
-| BLEU-1 recall | 0.478 | 0.386 | 0.405 |
-| BLEU-1 prec | 0.379 | 0.395 | 0.336 |
-| BLEU-2 recall | 0.391 | 0.319 | 0.300 |
-| BLEU-2 prec | 0.310 | 0.324 | 0.281 |
-| BLEU-3 recall | 0.330 | 0.270 | 0.272 |
-| BLEU-3 prec | 0.259 | 0.272 | 0.254 |
-| BLEU-4 recall | 0.262 | 0.216 | 0.226 |
-| BLEU-4 prec | 0.204 | 0.215 | 0.215 |
diff --git a/examples/hierarchical_dialog/config_data.py b/examples/hierarchical_dialog/config_data.py
deleted file mode 100644
index 4789bf52..00000000
--- a/examples/hierarchical_dialog/config_data.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import os
-
-data_root = './data'
-max_utterance_cnt = 9
-
-data_hparams = {
-    stage: {
-        "num_epochs": 1,
-        "shuffle": stage != 'test',
-        "batch_size": 30,
-        "datasets": [
-            {  # source
-                "variable_utterance": True,
-                "max_utterance_cnt": max_utterance_cnt,
-                "files": [
-                    os.path.join(data_root,
-                                 '{}-source.txt'.format(stage))],
-                "vocab_file": os.path.join(data_root, 'vocab.txt'),
-                "embedding_init": {
-                    "file": os.path.join(data_root, 'embedding.txt'),
-                    "dim": 200,
-                    "read_fn": "load_glove"
-                },
-                "data_name": "source"
-            },
-            {  # target
-                "files": [
-                    os.path.join(data_root, '{}-target.txt'.format(stage))],
-                "vocab_share_with": 0,
-                "data_name": "target"
-            },
-        ] + [{  # source speaker token
-            "files": os.path.join(data_root,
-                                  '{}-source-spk-{}.txt'.format(stage, i)),
-            "data_type": "float",
-            "data_name": "spk_{}".format(i)
-        } for i in range(max_utterance_cnt)
-        ] + [{  # target speaker token
-            "files": os.path.join(data_root,
-                                  '{}-target-spk.txt'.format(stage)),
-            "data_type": "float",
-            "data_name": "spk_tgt"
-        }
-        ] + [{  # target refs for BLEU evaluation
-            "variable_utterance": True,
-            "max_utterance_cnt": 10,
-            "files": [os.path.join(data_root,
-                                   '{}-target-refs.txt'.format(stage))],
-            "vocab_share_with": 0,
-            "data_name": "refs"
-        }]
-    }
-    for stage in ['train', 'val', 'test']
-}
diff --git a/examples/hierarchical_dialog/config_model_biminor.py b/examples/hierarchical_dialog/config_model_biminor.py
deleted file mode 100644
index f46d3c5a..00000000
--- a/examples/hierarchical_dialog/config_model_biminor.py
+++ /dev/null
@@ -1,60 +0,0 @@
-
-import tensorflow as tf
-
-num_samples = 10  # Number of samples generated for each test data instance
-beam_width = num_samples
-
-encoder_hparams = {
-    "encoder_minor_type": "BidirectionalRNNEncoder",
-    "encoder_minor_hparams": {
-        "rnn_cell_fw": {
-            "type": "GRUCell",
-            "kwargs": {
-                "num_units": 300,
-                "kernel_initializer": tf.initializers.random_uniform(-0.08, 0.08)
-            },
-            "dropout": {
-                "input_keep_prob": 0.5,
-            }
-        },
-        "rnn_cell_share_config": True
-    },
-    "encoder_major_type": "UnidirectionalRNNEncoder",
-    "encoder_major_hparams": {
-        "rnn_cell": {
-            "type": "GRUCell",
-            "kwargs": {
-                "num_units": 600,
-                "kernel_initializer": tf.initializers.random_uniform(-0.08, 0.08)
-            },
-            "dropout": {
-                "output_keep_prob": 0.3
-            }
-        }
-    }
-}
-decoder_hparams = {
-    "rnn_cell": {
-        "type": "GRUCell",
-        "kwargs": {
-            "num_units": 400,
-            "kernel_initializer": tf.initializers.random_uniform(-0.08, 0.08),
-        },
-        "dropout": {
-            "input_keep_prob": 0.3
-        }
-    }
-}
-opt_hparams = {
-    "optimizer": {
-        "type": "AdamOptimizer",
-        "kwargs": {
-            "learning_rate": 0.001,
-        }
-    },
-    # (It looks like gradient clipping does not affect the results a lot)
-    # "gradient_clip": {
-    #     "type": "clip_by_global_norm",
-    #     "kwargs": {"clip_norm": 5.}
-    # },
-}
diff --git
a/examples/hierarchical_dialog/config_model_uniminor.py b/examples/hierarchical_dialog/config_model_uniminor.py deleted file mode 100644 index bffeba73..00000000 --- a/examples/hierarchical_dialog/config_model_uniminor.py +++ /dev/null @@ -1,54 +0,0 @@ - -import tensorflow as tf - -num_samples = 10 # Number of samples generated for each test data instance -beam_width = num_samples - -encoder_hparams = { - "encoder_minor_type": "UnidirectionalRNNEncoder", - "encoder_minor_hparams": { - "rnn_cell": { - "type": "GRUCell", - "kwargs": { - "num_units": 300, - "kernel_initializer": tf.initializers.random_uniform(-0.08, 0.08) - }, - "dropout": { - "input_keep_prob": 0.5, - } - }, - }, - "encoder_major_type": "UnidirectionalRNNEncoder", - "encoder_major_hparams": { - "rnn_cell": { - "type": "GRUCell", - "kwargs": { - "num_units": 600, - "kernel_initializer": tf.initializers.random_uniform(-0.08, 0.08) - }, - "dropout": { - "input_keep_prob": 0.3, - } - } - } -} -decoder_hparams = { - "rnn_cell": { - "type": "GRUCell", - "kwargs": { - "num_units": 400, - "kernel_initializer": tf.initializers.random_uniform(-0.08, 0.08), - }, - "dropout": { - "output_keep_prob": 0.3, - } - } -} -opt_hparams = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 0.001, - } - } -} diff --git a/examples/hierarchical_dialog/hred.py b/examples/hierarchical_dialog/hred.py deleted file mode 100644 index 2692faef..00000000 --- a/examples/hierarchical_dialog/hred.py +++ /dev/null @@ -1,283 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Hierarchical Recurrent Encoder-Decoder (HRED) for dialog response -generation. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=invalid-name, too-many-locals - -import importlib -import numpy as np -import tensorflow as tf -import texar.tf as tx - -from nltk.translate.bleu_score import sentence_bleu -from nltk.translate.bleu_score import SmoothingFunction - -flags = tf.flags - -flags.DEFINE_string('config_data', 'config_data', 'The data config') -flags.DEFINE_string('config_model', 'config_model_biminor', 'The model config') - -FLAGS = flags.FLAGS - -config_data = importlib.import_module(FLAGS.config_data) -config_model = importlib.import_module(FLAGS.config_model) - -encoder_hparams = config_model.encoder_hparams -decoder_hparams = config_model.decoder_hparams -opt_hparams = config_model.opt_hparams - - -def main(): - """Entrypoint. 
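-
-    Builds the HRED model, then runs training, perplexity evaluation,
-    and BLEU evaluation against the multi-reference test responses.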
- """ - # Data - train_data = tx.data.MultiAlignedData(config_data.data_hparams['train']) - val_data = tx.data.MultiAlignedData(config_data.data_hparams['val']) - test_data = tx.data.MultiAlignedData(config_data.data_hparams['test']) - iterator = tx.data.TrainTestDataIterator(train=train_data, - val=val_data, - test=test_data) - data_batch = iterator.get_next() - - # (speaker's meta info) - spk_src = tf.stack([data_batch['spk_{}'.format(i)] - for i in range(config_data.max_utterance_cnt)], 1) - spk_tgt = data_batch['spk_tgt'] - - def _add_source_speaker_token(x): - return tf.concat([x, tf.reshape(spk_src, (-1, 1))], 1) - - def _add_target_speaker_token(x): - return (x, ) + (tf.reshape(spk_tgt, (-1, 1)), ) - - # HRED model - embedder = tx.modules.WordEmbedder( - init_value=train_data.embedding_init_value(0).word_vecs) - encoder = tx.modules.HierarchicalRNNEncoder(hparams=encoder_hparams) - - decoder = tx.modules.BasicRNNDecoder( - hparams=decoder_hparams, vocab_size=train_data.vocab(0).size) - - connector = tx.modules.connectors.MLPTransformConnector( - decoder.cell.state_size) - - context_embed = embedder(data_batch['source_text_ids']) - ecdr_states = encoder( - context_embed, - medium=['flatten', _add_source_speaker_token], - sequence_length_minor=data_batch['source_length'], - sequence_length_major=data_batch['source_utterance_cnt']) - ecdr_states = ecdr_states[1] - - ecdr_states = _add_target_speaker_token(ecdr_states) - dcdr_states = connector(ecdr_states) - - # (decoding for training) - target_embed = embedder(data_batch['target_text_ids']) - outputs, _, lengths = decoder( - initial_state=dcdr_states, - inputs=target_embed, - sequence_length=data_batch['target_length'] - 1) - - # Sentence level lld, for training - mle_loss = tx.losses.sequence_sparse_softmax_cross_entropy( - labels=data_batch['target_text_ids'][:, 1:], - logits=outputs.logits, - sequence_length=lengths) - # Token level lld, for perplexity evaluation - avg_mle_loss = tx.losses.sequence_sparse_softmax_cross_entropy( - labels=data_batch['target_text_ids'][:, 1:], - logits=outputs.logits, - sequence_length=lengths, - sum_over_timesteps=False, - average_across_timesteps=True) - perplexity = tf.exp(avg_mle_loss) - - global_step = tf.Variable(0, name='global_step', trainable=True) - train_op = tx.core.get_train_op( - mle_loss, global_step=global_step, hparams=opt_hparams) - - # Decoding - - target_bos_token_id = train_data.vocab(0).bos_token_id - target_eos_token_id = train_data.vocab(0).eos_token_id - start_tokens = \ - tf.ones_like(data_batch['target_length']) * target_bos_token_id - - # Random sample decoding - decoding_strategy = 'infer_' + 'sample' - infer_samples, lengths = [], [] - for _ in range(config_model.num_samples): - infer_outputs_i, _, lengths_i = decoder( - decoding_strategy=decoding_strategy, - initial_state=dcdr_states, - start_tokens=start_tokens, - end_token=target_eos_token_id, - embedding=embedder, - max_decoding_length=50) - infer_samples.append( - tf.expand_dims(infer_outputs_i.sample_id, axis=2)) - lengths.append(tf.expand_dims(lengths_i, axis=1)) - - infer_samples = tx.utils.pad_and_concat( - infer_samples, axis=2, pad_axis=1) - rand_sample_text = train_data.vocab(0).map_ids_to_tokens(infer_samples) - rand_lengths = tf.concat(lengths, axis=1) - - # Beam search decoding - beam_search_samples, beam_states, _ = tx.modules.beam_search_decode( - decoder, - initial_state=dcdr_states, - start_tokens=start_tokens, - end_token=target_eos_token_id, - embedding=embedder, - beam_width=config_model.beam_width, 
- max_decoding_length=50) - - beam_sample_text = train_data.vocab(0).map_ids_to_tokens( - beam_search_samples.predicted_ids) - beam_lengths = beam_states.lengths - - # Running procedures - - def _train_epoch(sess, epoch, display=1000): - iterator.switch_to_train_data(sess) - - while True: - try: - feed = {tx.global_mode(): tf.estimator.ModeKeys.TRAIN} - step, loss, _ = sess.run( - [global_step, mle_loss, train_op], feed_dict=feed) - - if step % display == 0: - print('step {} at epoch {}: loss={}'.format( - step, epoch, loss)) - - except tf.errors.OutOfRangeError: - break - - print('epoch {} train: loss={}'.format(epoch, loss)) - - def _test_epoch_ppl(sess, epoch): - iterator.switch_to_test_data(sess) - - pples = [] - while True: - try: - feed = {tx.global_mode(): tf.estimator.ModeKeys.EVAL} - ppl = sess.run(perplexity, feed_dict=feed) - pples.append(ppl) - - except tf.errors.OutOfRangeError: - avg_ppl = np.mean(pples) - print('epoch {} perplexity={}'.format(epoch, avg_ppl)) - break - - def _test_epoch_bleu(sess, epoch, sample_text, sample_lengths): - iterator.switch_to_test_data(sess) - - bleu_prec = [[] for i in range(1, 5)] - bleu_recall = [[] for i in range(1, 5)] - - def _bleus(ref, sample): - res = [] - for weight in [[1, 0, 0, 0], - [1, 0, 0, 0], - [1 / 2., 1 / 2., 0, 0], - [1 / 3., 1 / 3., 1 / 3., 0], - [1 / 4., 1 / 4., 1 / 4., 1 / 4.]]: - res.append(sentence_bleu( - [ref], - sample, - smoothing_function=SmoothingFunction().method7, - weights=weight)) - return res - - while True: - try: - feed = {tx.global_mode(): tf.estimator.ModeKeys.EVAL} - - samples_, sample_lengths_, references, refs_cnt = \ - sess.run([sample_text, - sample_lengths, - data_batch['refs_text'][:, :, 1:], - data_batch['refs_utterance_cnt']], - feed_dict=feed) - - samples_ = np.transpose(samples_, (0, 2, 1)) - samples_ = [ - [sample[:l] for sample, l in zip(beam, lens)] - for beam, lens in zip(samples_.tolist(), sample_lengths_) - ] - references = [ - [ref[:ref.index(b'')] for ref in refs[:cnt]] - for refs, cnt in zip(references.tolist(), refs_cnt) - ] - - for beam, refs in zip(samples_, references): - bleu_scores = [ - [_bleus(ref, sample) for ref in refs] - for sample in beam - ] - bleu_scores = np.transpose(np.array(bleu_scores), (2, 0, 1)) - - for i in range(1, 5): - bleu_i = bleu_scores[i] - bleu_i_precision = bleu_i.max(axis=1).mean() - bleu_i_recall = bleu_i.max(axis=0).mean() - - bleu_prec[i - 1].append(bleu_i_precision) - bleu_recall[i - 1].append(bleu_i_recall) - - except tf.errors.OutOfRangeError: - break - - bleu_prec = [np.mean(x) for x in bleu_prec] - bleu_recall = [np.mean(x) for x in bleu_recall] - - print('epoch {}:'.format(epoch)) - for i in range(1, 5): - print(' -- bleu-{} prec={}, recall={}'.format( - i, bleu_prec[i - 1], bleu_recall[i - 1])) - - with tf.Session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - - num_epochs = 10 - for epoch in range(1, num_epochs + 1): - _train_epoch(sess, epoch) - _test_epoch_ppl(sess, epoch) - - if epoch % 5 == 0: - print('random sample: ') - _test_epoch_bleu(sess, epoch, rand_sample_text, rand_lengths) - print('beam-search: ') - _test_epoch_bleu(sess, epoch, beam_sample_text, beam_lengths) - - if num_epochs % 5 != 0: - print('random sample: ') - _test_epoch_bleu(sess, num_epochs, rand_sample_text, rand_lengths) - print('beam-search: ') - _test_epoch_bleu(sess, num_epochs, beam_sample_text, beam_lengths) - - -if __name__ == "__main__": - main() diff --git 
a/examples/hierarchical_dialog/sw_loader.py b/examples/hierarchical_dialog/sw_loader.py deleted file mode 100644 index a5ec58e9..00000000 --- a/examples/hierarchical_dialog/sw_loader.py +++ /dev/null @@ -1,242 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" loader for switch board dataset. -""" -import os -import json -from json_lines import reader - -from nltk.tokenize import WordPunctTokenizer -from sklearn.feature_extraction.text import TfidfVectorizer - -import texar.tf as tx - -from config_data import data_root - -# pylint: disable=invalid-name, too-many-locals - -wnd_sz = 10 - - -class Dataset(object): - """Data preprocessor. - """ - - def __init__(self, jsonl_path, mode=None): - self.mode = mode - self.raw = [] - self.lst = [] - self.refs = [] - if mode == 'test': - lst = json.load(open(jsonl_path, 'r')) - for item in lst: - context = item['context'] - dialog = [] - for utts in context: - p = utts.find(':') - dialog.append(( - (utts[p - 1] == 'A') * 2 - 1, utts[p + 2:-1], 0)) - - if dialog[0][1][-1] == '>': - dialog = dialog[1:] - - if len(dialog) == 0: - continue - - responses = [] - for resp in item['responses']: - responses.append(resp) - - spk = (item['speaker'] == 'A') * 2 - 1 - dialog.append((spk, responses[0], 0)) - responses = responses[1:] - responses = [' '.join(WordPunctTokenizer().tokenize(resp)) - for resp in responses] - - if len(responses) == 0: - continue - - self.raw.append(dialog) - self.lst.append((len(self.raw) - 1, 0, len(dialog))) - self.refs.append(responses) - - return - - from collections import Counter - self.ct = Counter() - self.topics = [] - with open(jsonl_path, 'r') as f: - for idx, item in enumerate(reader(f)): - utts = item['utts'] - self.topics.append(item['topic']) - self.raw.append([(int(speaker == 'A') * 2 - 1, sentence, _) - for speaker, sentence, _ in utts]) - - lst = [(idx, start, start + wnd_sz) - for start in range(0, len(utts) - wnd_sz)] + \ - [(idx, 0, end) - for end in range(2, min(wnd_sz + 1, len(utts)))] - - self.lst += lst - - self.refs = [['none']] * len(self.lst) - - def __len__(self): - return len(self.lst) - - def __getitem__(self, idx): - idx, start, end = self.lst[idx] - dialog = self.raw[idx][start:end] - source, target = dialog[:-1], dialog[-1] - - spks, utts = list(zip(*[(speaker, WordPunctTokenizer().tokenize(uttr)) for speaker, uttr, _ in source])) - - spks = list(spks) - - while len(spks) < 10: - spks.append(0) - - source = '|||'.join([' '.join(uttr) for uttr in utts]) - target_test = ' '.join(WordPunctTokenizer().tokenize(target[1])) - - return spks, source, target_test, target[0] - - def get(self, idx): - idx, start, end = self.lst[idx] - source = self.raw[idx][start:end - 1] - target = self.raw[idx][end - 1] - source = ' '.join([b for a, b, c in source]) - cct = self.raw[idx][end - 2][0] == self.raw[idx][end - 1][0] - return self.topics[idx], cct, source, target - - -def sw1c2r(data_root): - dts_train = Dataset(os.path.join(data_root, 'train.jsonl')) - 
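# The 'test' mode below also loads the manually annotated multi-reference
-    # responses used for BLEU evaluation
-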
dts_valid = Dataset(os.path.join(data_root, 'valid.jsonl')) - dts_test = Dataset(os.path.join(data_root, 'test_multi_ref.json'), 'test') - datasets = { - 'train': dts_train, - 'val': dts_valid, - 'test': dts_test - } - return datasets - - -def generate_reference_for_test_dialog(dataset, data_root): - vocab = {} - with open(os.path.join(data_root, 'vocab.txt'), 'r') as f: - p = f.read().splitlines() - for i, x in enumerate(p): - vocab[x] = i - - dts_train = dataset['train'] - dts_val = dataset['val'] - dts_test = dataset['test'] - - vectorizer = TfidfVectorizer(tokenizer=WordPunctTokenizer().tokenize, - vocabulary=vocab) - - saved = [] - meta = [] - data = [] - tidx = {} - for i in range(len(dts_test)): - topic, cct, source, target = dts_test.get(i) - meta.append((topic, cct, target)) - data.append(source) - - for i in range(len(dts_train)): - topic, cct, source, target = dts_train.get(i) - saved.append((topic, cct, target)) - data.append(source) - - if topic not in tidx: - tidx[topic] = [] - tidx[topic].append(i) - - result = vectorizer.fit_transform(data) - x = result[:len(dts_test)] - y = result[len(dts_test):] - - from tqdm import tqdm - from sklearn.preprocessing import normalize - - y = normalize(y) - x = normalize(x) - - dts_test.refs = [] - for i in tqdm(range(len(dts_test))): - c = tidx[meta[i][0]] - p = (y * x[i].T).toarray().reshape(-1)[c] - d = p.argsort() - - cnt = 0 - refs = [] - for a in d[::-1]: - if saved[a][1] == meta[i][1]: - refs.append(' '.join( - WordPunctTokenizer().tokenize(saved[a][2][1]))) - cnt += 1 - if cnt == 10: - break - - dts_test.refs.append(refs) - - -def download_and_process(data_root): - if not os.path.isdir(data_root): - os.makedirs(data_root) - os.makedirs(os.path.join(data_root, 'raw')) - - tx.data.maybe_download( - urls='https://drive.google.com/file/d/1Gytd-SSetUkIY6aVVKNrBOxkHjAlSGeU/view?usp=sharing', - path='./', - filenames=os.path.join(data_root, 'sw1c2r.tar.gz'), - extract=True) - - os.system('mv {} {}'.format(os.path.join(data_root, 'sw1c2r.tar.gz'), - os.path.join(data_root, 'raw/sw1c2r.tar.gz'))) - os.system('mv {}/* {}'.format( - os.path.join(data_root, 'switchboard'), data_root)) - - datasets = sw1c2r(os.path.join(data_root, 'json_data')) - - for stage in ['train', 'val', 'test']: - dts = datasets[stage] - spk, src, tgt, meta = list(zip(*[dts[i] for i in range(len(dts))])) - src_txt = '\n'.join(src) - tgt_txt = '\n'.join(tgt) - - spk = list(zip(*spk)) - - for i in range(len(spk)): - with open(os.path.join(data_root, '{}-source-spk-{}.txt'.format(stage, i)), 'w') as f: - f.write('\n'.join([str(a) for a in spk[i]])) - - spk_tgt = meta - - with open(os.path.join(data_root, '{}-target-spk.txt'.format(stage)), 'w') as f: - f.write('\n'.join([str(a) for a in spk_tgt])) - - with open(os.path.join(data_root, '{}-source.txt'.format(stage)), 'w') as f: - f.write(src_txt) - with open(os.path.join(data_root, '{}-target.txt'.format(stage)), 'w') as f: - f.write(tgt_txt) - - with open(os.path.join(data_root, '{}-target-refs.txt'.format(stage)), 'w') as f: - f.write('\n'.join(['|||'.join(v) for v in dts.refs])) - - -if __name__ == '__main__': - download_and_process(data_root) diff --git a/examples/language_model_ptb/.gitignore b/examples/language_model_ptb/.gitignore deleted file mode 100644 index 1d584f9f..00000000 --- a/examples/language_model_ptb/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/simple-examples/ diff --git a/examples/language_model_ptb/README.md b/examples/language_model_ptb/README.md deleted file mode 100644 index bd70c87e..00000000 --- 
a/examples/language_model_ptb/README.md
+++ /dev/null
@@ -1,33 +0,0 @@
-# Language Model on PTB #
-
-This example builds an LSTM language model and trains it on PTB data. Model and training are described in
-[(Zaremba et al.) Recurrent Neural Network Regularization](https://arxiv.org/pdf/1409.2329.pdf). This is a reimplementation of the TensorFlow official PTB example in [tensorflow/models/rnn/ptb](https://github.com/tensorflow/models/tree/master/tutorials/rnn/ptb).
-
-The example shows:
- * Construction of a simple model, involving the `Embedder` and `RNN Decoder`.
- * Use of Texar with an external Python data pipeline ([ptb_reader.py](./ptb_reader.py)).
- * Specification of various features of train op, like *gradient clipping* and *lr decay*.
-
-## Usage ##
-
-The following command trains a small-size model:
-
-```
-python lm_ptb.py [--config config_small] [--data_path ./]
-```
-
-Here:
- * `--config` specifies the config file to use. E.g., the above uses the configuration defined in [config_small.py](./config_small.py).
- * `--data_path` specifies the directory containing PTB raw data (e.g., `ptb.train.txt`). If the data files do not exist, the program will automatically download, extract, and pre-process the data.
-
-The model will begin training, evaluate on the validation data periodically, and evaluate on the test data after training is done.
-
-## Results ##
-
-As per the TensorFlow official PTB example, the perplexity of different configs is:
-
-| config | epochs | train | valid | test |
-| -------| -------| ------| -------| ------|
-| small | 13 | 37.99 | 121.39 | 115.91|
-| medium | 39 | 48.45 | 86.16 | 82.07|
-| large | 55 | 37.87 | 82.62 | 78.29|
diff --git a/examples/language_model_ptb/config_large.py b/examples/language_model_ptb/config_large.py
deleted file mode 100644
index 58c7a55a..00000000
--- a/examples/language_model_ptb/config_large.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright 2018 The Texar Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PTB LM large size config.
-"""
-
-# pylint: disable=invalid-name, too-few-public-methods, missing-docstring
-
-init_scale = 0.04
-num_epochs = 55
-hidden_size = 1500
-keep_prob = 0.35
-batch_size = 20
-num_steps = 35
-
-cell = {
-    "type": "LSTMBlockCell",
-    "kwargs": {
-        "num_units": hidden_size,
-        "forget_bias": 0.
-    },
-    "dropout": {"output_keep_prob": keep_prob},
-    "num_layers": 2
-}
-emb = {
-    "dim": hidden_size
-}
-opt = {
-    "optimizer": {
-        "type": "GradientDescentOptimizer",
-        "kwargs": {"learning_rate": 1.0}
-    },
-    "gradient_clip": {
-        "type": "clip_by_global_norm",
-        "kwargs": {"clip_norm": 10.}
-    },
-    "learning_rate_decay": {
-        "type": "exponential_decay",
-        "kwargs": {
-            "decay_steps": 1,
-            "decay_rate": 1.
/ 1.15, - "staircase": True - }, - "start_decay_step": 14 - } -} diff --git a/examples/language_model_ptb/config_medium.py b/examples/language_model_ptb/config_medium.py deleted file mode 100644 index ae8d0f73..00000000 --- a/examples/language_model_ptb/config_medium.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PTB LM medium size config. -""" - -# pylint: disable=invalid-name, too-few-public-methods, missing-docstring - -init_scale = 0.05 -num_epochs = 39 -hidden_size = 650 -keep_prob = 0.5 -batch_size = 20 -num_steps = 35 - -cell = { - "type": "LSTMBlockCell", - "kwargs": { - "num_units": hidden_size, - "forget_bias": 0. - }, - "dropout": {"output_keep_prob": keep_prob}, - "num_layers": 2 -} -emb = { - "dim": hidden_size -} -opt = { - "optimizer": { - "type": "GradientDescentOptimizer", - "kwargs": {"learning_rate": 1.0} - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": {"clip_norm": 5.} - }, - "learning_rate_decay": { - "type": "exponential_decay", - "kwargs": { - "decay_steps": 1, - "decay_rate": 0.8, - "staircase": True - }, - "start_decay_step": 5 - } -} diff --git a/examples/language_model_ptb/config_small.py b/examples/language_model_ptb/config_small.py deleted file mode 100644 index 7cfebc7c..00000000 --- a/examples/language_model_ptb/config_small.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PTB LM small size config. -""" - -# pylint: disable=invalid-name, too-few-public-methods, missing-docstring - -init_scale = 0.1 -num_epochs = 13 -hidden_size = 200 -keep_prob = 1.0 -batch_size = 20 -num_steps = 20 - -cell = { - "type": "LSTMBlockCell", - "kwargs": { - "num_units": hidden_size, - "forget_bias": 0. 
- }, - "dropout": {"output_keep_prob": keep_prob}, - "num_layers": 2 -} -emb = { - "dim": hidden_size -} -opt = { - "optimizer": { - "type": "GradientDescentOptimizer", - "kwargs": {"learning_rate": 1.0} - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": {"clip_norm": 5.} - }, - "learning_rate_decay": { - "type": "exponential_decay", - "kwargs": { - "decay_steps": 1, - "decay_rate": 0.5, - "staircase": True - }, - "start_decay_step": 3 - } -} diff --git a/examples/language_model_ptb/lm_ptb.py b/examples/language_model_ptb/lm_ptb.py deleted file mode 100644 index c674207c..00000000 --- a/examples/language_model_ptb/lm_ptb.py +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Example for building the language model. - -This is a reimpmentation of the TensorFlow official PTB example in: -tensorflow/models/rnn/ptb - -Model and training are described in: -(Zaremba, et. al.) Recurrent Neural Network Regularization - http://arxiv.org/abs/1409.2329 - -There are 3 provided model configurations: -=========================================== -| config | epochs | train | valid | test -=========================================== -| small | 13 | 37.99 | 121.39 | 115.91 -| medium | 39 | 48.45 | 86.16 | 82.07 -| large | 55 | 37.87 | 82.62 | 78.29 -The exact results may vary depending on the random initialization. - -The data required for this example is in the `data/` dir of the -PTB dataset from Tomas Mikolov's webpage: - -$ wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz -$ tar xvf simple-examples.tgz - -If data is not provided, the program will download from above automatically. - -To run: - -$ python lm_ptb.py --data_path=simple-examples/data --config=config_small -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=invalid-name, no-member, too-many-locals - -import time -import importlib -import numpy as np -import tensorflow as tf -import texar.tf as tx - -from ptb_reader import prepare_data, ptb_iterator - -flags = tf.flags - -flags.DEFINE_string("data_path", "./", - "Directory containing PTB raw data (e.g., ptb.train.txt). " - "E.g., ./simple-examples/data. 
If not exists, " - "the directory will be created and PTB raw data will " - "be downloaded.") -flags.DEFINE_string("config", "config_small", "The config to use.") - -FLAGS = flags.FLAGS - -config = importlib.import_module(FLAGS.config) - - -def _main(_): - # Data - batch_size = config.batch_size - num_steps = config.num_steps - data = prepare_data(FLAGS.data_path) - vocab_size = data["vocab_size"] - - inputs = tf.placeholder(tf.int32, [batch_size, num_steps]) - targets = tf.placeholder(tf.int32, [batch_size, num_steps]) - - # Model architecture - initializer = tf.random_uniform_initializer( - -config.init_scale, config.init_scale) - with tf.variable_scope("model", initializer=initializer): - embedder = tx.modules.WordEmbedder( - vocab_size=vocab_size, hparams=config.emb) - emb_inputs = embedder(inputs) - if config.keep_prob < 1: - emb_inputs = tf.nn.dropout( - emb_inputs, tx.utils.switch_dropout(config.keep_prob)) - - decoder = tx.modules.BasicRNNDecoder( - vocab_size=vocab_size, hparams={"rnn_cell": config.cell}) - initial_state = decoder.zero_state(batch_size, tf.float32) - outputs, final_state, seq_lengths = decoder( - decoding_strategy="train_greedy", - impute_finished=True, - inputs=emb_inputs, - sequence_length=[num_steps] * batch_size, - initial_state=initial_state) - - # Losses & train ops - mle_loss = tx.losses.sequence_sparse_softmax_cross_entropy( - labels=targets, - logits=outputs.logits, - sequence_length=seq_lengths) - - # Use global_step to pass epoch, for lr decay - global_step = tf.placeholder(tf.int32) - train_op = tx.core.get_train_op( - mle_loss, global_step=global_step, increment_global_step=False, - hparams=config.opt) - - def _run_epoch(sess, data_iter, epoch, is_train=False, verbose=False): - start_time = time.time() - loss = 0. 
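-        # `loss` accumulates the summed cross-entropy over all timesteps and
-        # `iters` counts timesteps; perplexity below is exp(loss / iters)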
- iters = 0 - state = sess.run(initial_state) - - fetches = { - "mle_loss": mle_loss, - "final_state": final_state, - } - if is_train: - fetches["train_op"] = train_op - epoch_size = (len(data["train_text_id"]) // batch_size - 1)\ - // num_steps - - mode = (tf.estimator.ModeKeys.TRAIN - if is_train - else tf.estimator.ModeKeys.EVAL) - - for step, (x, y) in enumerate(data_iter): - feed_dict = { - inputs: x, targets: y, global_step: epoch, - tx.global_mode(): mode, - } - for i, (c, h) in enumerate(initial_state): - feed_dict[c] = state[i].c - feed_dict[h] = state[i].h - - rets = sess.run(fetches, feed_dict) - loss += rets["mle_loss"] - state = rets["final_state"] - iters += num_steps - - ppl = np.exp(loss / iters) - if verbose and is_train and step % (epoch_size // 10) == 10: - print("%.3f perplexity: %.3f speed: %.0f wps" % - ((step + 1) * 1.0 / epoch_size, ppl, - iters * batch_size / (time.time() - start_time))) - - ppl = np.exp(loss / iters) - return ppl - - with tf.Session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - - for epoch in range(config.num_epochs): - # Train - train_data_iter = ptb_iterator( - data["train_text_id"], config.batch_size, num_steps) - train_ppl = _run_epoch( - sess, train_data_iter, epoch, is_train=True, verbose=True) - print("Epoch: %d Train Perplexity: %.3f" % (epoch, train_ppl)) - # Valid - valid_data_iter = ptb_iterator( - data["valid_text_id"], config.batch_size, num_steps) - valid_ppl = _run_epoch(sess, valid_data_iter, epoch) - print("Epoch: %d Valid Perplexity: %.3f" % (epoch, valid_ppl)) - # Test - test_data_iter = ptb_iterator( - data["test_text_id"], batch_size, num_steps) - test_ppl = _run_epoch(sess, test_data_iter, 0) - print("Test Perplexity: %.3f" % (test_ppl)) - - -if __name__ == '__main__': - tf.app.run(main=_main) diff --git a/examples/language_model_ptb/ptb_reader.py b/examples/language_model_ptb/ptb_reader.py deleted file mode 100644 index f42a68e9..00000000 --- a/examples/language_model_ptb/ptb_reader.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Utilities for preprocessing and iterating over the PTB data. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=invalid-name, too-many-locals - -import os -import numpy as np - -import tensorflow as tf - -import texar.tf as tx - - -def ptb_iterator(data, batch_size, num_steps): - """Iterates through the ptb data. 
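-
-    Yields `(x, y)` batch pairs of shape `[batch_size, num_steps]`, where
-    `y` is `x` shifted one position ahead (the prediction targets).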
- """ - data_length = len(data) - batch_length = data_length // batch_size - - data = np.asarray(data[:batch_size * batch_length]) - data = data.reshape([batch_size, batch_length]) - - epoch_size = (batch_length - 1) // num_steps - if epoch_size == 0: - raise ValueError("epoch_size == 0, decrease batch_size or num_steps") - - for i in range(epoch_size): - x = data[:, i * num_steps: (i + 1) * num_steps] - y = data[:, i * num_steps + 1: (i + 1) * num_steps + 1] - yield (x, y) - - -def prepare_data(data_path): - """Preprocess PTB data. - """ - train_path = os.path.join(data_path, "ptb.train.txt") - if not tf.gfile.Exists(train_path): - url = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz' - tx.data.maybe_download(url, data_path, extract=True) - data_path = os.path.join(data_path, 'simple-examples', 'data') - - train_path = os.path.join(data_path, "ptb.train.txt") - valid_path = os.path.join(data_path, "ptb.valid.txt") - test_path = os.path.join(data_path, "ptb.test.txt") - - word_to_id = tx.data.make_vocab( - train_path, newline_token="", return_type="dict") - assert len(word_to_id) == 10000 - - train_text = tx.data.read_words( - train_path, newline_token="") - train_text_id = [word_to_id[w] for w in train_text if w in word_to_id] - - valid_text = tx.data.read_words( - valid_path, newline_token="") - valid_text_id = [word_to_id[w] for w in valid_text if w in word_to_id] - - test_text = tx.data.read_words( - test_path, newline_token="") - test_text_id = [word_to_id[w] for w in test_text if w in word_to_id] - - data = { - "train_text": train_text, - "valid_text": valid_text, - "test_text": test_text, - "train_text_id": train_text_id, - "valid_text_id": valid_text_id, - "test_text_id": test_text_id, - "vocab": word_to_id, - "vocab_size": len(word_to_id) - } - return data diff --git a/examples/memory_network_lm/README.md b/examples/memory_network_lm/README.md deleted file mode 100644 index d85e0a81..00000000 --- a/examples/memory_network_lm/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# End-to-End Memory Network for Language Modeling # - -This example builds a Memory Network language model, and trains on PTB data. Model and training are described in -[(Sukhbaatar, et. al.) End-To-End Memory Networks](https://arxiv.org/pdf/1503.08895v4.pdf). Model details are implemented in `texar.modules.memnet`. - -Though the example is for language modeling, it is easy to adapt to other tasks like Question Answering, etc, as described in the above paper. - -## Dataset ## - -The standard [Penn Treebank (PTB) dataset](http://www.fit.vutbr.cz/~imikolov/rnnlm/) is used. - -If data does not exist under `data_path`, the program will automatically download the data. - -## Usage ## - -The following cmd trains the model: - -```bash -python3 lm_ptb_memnet.py --config config --data_path ./ -``` - -Here: - * `--config` specifies the config file to use. E.g., the above use the configuration defined in [config.py](./config.py). - * `--data_path` specifies the directory containing PTB raw data (e.g., `ptb.train.txt`). If the data files do not exist, the program will automatically download, extract, and pre-process the data. - * `--lr` specifies the initial learning rate. If not specified, the program will use the learning rate in the config file. - -The model will begin training, and will evaluate on the validation data periodically, and evaluate on the test data after the training is done. Checkpoints are saved every 5 epochs. 
- -## Configurations ## - -[config.py](./config.py) implements the largest and best configuration, described in the last line of Table 2 in [(Sukhbaatar et al.) End-To-End Memory Networks](https://arxiv.org/pdf/1503.08895v4.pdf). It sets the number of hops to 7, the hidden dim to 150, and the memory size to 200. This model has 4,582,500 parameters in total. - -## Results ## - -The perplexities of different configs are: - -| config | epochs | train | valid | test | -| ------------- | -------| ------| -------| ------| -| config | 51 | 50.70 | 120.97 | 113.06| - -The result of `config.py` is slightly inferior to the result presented in the paper, since the paper reports the best result among 10 runs. diff --git a/examples/memory_network_lm/config.py b/examples/memory_network_lm/config.py deleted file mode 100644 index 95f38b85..00000000 --- a/examples/memory_network_lm/config.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# pylint: disable=invalid-name, too-few-public-methods, missing-docstring - -n_hops = 7 -dim = 150 -relu_dim = dim // 2 -batch_size = 128 -num_epochs = 200 -memory_size = 200 -initialize_stddev = 0.05 -query_constant = 0.1 -learning_rate_anneal_factor = 1.5 -terminating_learning_rate = 1e-5 - -opt = { - "optimizer": { - "type": "GradientDescentOptimizer", - "kwargs": {"learning_rate": 0.01} - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": {"clip_norm": 50.} - }, -} - -embed = { - "embedding": { - "dim": dim, - }, - "temporal_embedding": { - "dim": dim, - } -} - -memnet = { - "n_hops": n_hops, - "relu_dim": relu_dim, - "memory_size": memory_size, - "A": embed, - "C": embed, -} diff --git a/examples/memory_network_lm/lm_ptb_memnet.py b/examples/memory_network_lm/lm_ptb_memnet.py deleted file mode 100755 index 3c5d2c8f..00000000 --- a/examples/memory_network_lm/lm_ptb_memnet.py +++ /dev/null @@ -1,207 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Example for building the PTB language model with Memory Network. - -The Memory Network model is described in https://arxiv.org/abs/1503.08895v4 - -The data required for this example is in the `data/` dir of the -PTB dataset from Tomas Mikolov's webpage: - -$ wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz -$ tar xvf simple-examples.tgz - -If data is not provided, the program will download it from the above URL automatically.
- -To run: - -$ python lm_ptb_memnet.py --data_path=simple-examples/data \ - --config=config - -This code will automatically save and restore from directory `ckpt/`. -If the directory doesn't exist, it will be created automatically. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=invalid-name, no-member, too-many-locals - -import importlib -import numpy as np -import tensorflow as tf -import texar.tf as tx - -from ptb_reader import prepare_data -from ptb_reader import ptb_iterator_memnet as ptb_iterator - -flags = tf.flags - -flags.DEFINE_string("data_path", "./", - "Directory containing PTB raw data (e.g., ptb.train.txt). " - "E.g., ./simple-examples/data. If it does not exist, " - "the directory will be created and PTB raw data will " - "be downloaded.") -flags.DEFINE_string("config", "config", "The config to use.") - -FLAGS = flags.FLAGS - -config = importlib.import_module(FLAGS.config) - - -def _main(_): - # Data - batch_size = config.batch_size - memory_size = config.memory_size - terminating_learning_rate = config.terminating_learning_rate - data = prepare_data(FLAGS.data_path) - vocab_size = data["vocab_size"] - print('vocab_size = {}'.format(vocab_size)) - - inputs = tf.placeholder(tf.int32, [None, memory_size], name="inputs") - targets = tf.placeholder(tf.int32, [None], name="targets") - - # Model architecture - initializer = tf.random_normal_initializer( - stddev=config.initialize_stddev) - with tf.variable_scope("model", initializer=initializer): - memnet = tx.modules.MemNetRNNLike(raw_memory_dim=vocab_size, - hparams=config.memnet) - queries = tf.fill([tf.shape(inputs)[0], config.dim], - config.query_constant) - logits = memnet(inputs, queries) - - # Losses & train ops - mle_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=targets, logits=logits) - mle_loss = tf.reduce_sum(mle_loss) - - # Use global_step to pass epoch, for lr decay - lr = config.opt["optimizer"]["kwargs"]["learning_rate"] - learning_rate = tf.placeholder(tf.float32, [], name="learning_rate") - global_step = tf.Variable(0, dtype=tf.int32, name="global_step") - increment_global_step = tf.assign_add(global_step, 1) - train_op = tx.core.get_train_op( - mle_loss, - learning_rate=learning_rate, - global_step=global_step, - increment_global_step=False, - hparams=config.opt) - - def _run_epoch(sess, data_iter, epoch, is_train=False): - loss = 0.
- iters = 0 - - fetches = { - "mle_loss": mle_loss - } - if is_train: - fetches["train_op"] = train_op - - mode = (tf.estimator.ModeKeys.TRAIN - if is_train - else tf.estimator.ModeKeys.EVAL) - - for _, (x, y) in enumerate(data_iter): - batch_size = x.shape[0] - feed_dict = { - inputs: x, targets: y, learning_rate: lr, - tx.global_mode(): mode, - } - - rets = sess.run(fetches, feed_dict) - loss += rets["mle_loss"] - iters += batch_size - - ppl = np.exp(loss / iters) - return ppl - - saver = tf.train.Saver() - - with tf.Session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - - try: - saver.restore(sess, "ckpt/model.ckpt") - print('restored checkpoint.') - except BaseException: - print('restore checkpoint failed.') - - last_valid_ppl = None - heuristic_lr_decay = (hasattr(config, 'heuristic_lr_decay') - and config.heuristic_lr_decay) - while True: - if lr < terminating_learning_rate: - break - - epoch = sess.run(global_step) - if epoch >= config.num_epochs: - print('Too many epochs!') - break - - print('epoch: {} learning_rate: {:.6f}'.format(epoch, lr)) - - # Train - train_data_iter = ptb_iterator( - data["train_text_id"], batch_size, memory_size) - train_ppl = _run_epoch( - sess, train_data_iter, epoch, is_train=True) - print("Train Perplexity: {:.3f}".format(train_ppl)) - sess.run(increment_global_step) - - # checkpoint - if epoch % 5 == 0: - try: - saver.save(sess, "ckpt/model.ckpt") - print("saved checkpoint.") - except BaseException: - print("save checkpoint failed.") - - # Valid - valid_data_iter = ptb_iterator( - data["valid_text_id"], batch_size, memory_size) - valid_ppl = _run_epoch(sess, valid_data_iter, epoch) - print("Valid Perplexity: {:.3f}".format(valid_ppl)) - - # Learning rate decay - if last_valid_ppl: - if heuristic_lr_decay: - if valid_ppl > last_valid_ppl * config.heuristic_threshold: - lr /= 1. + (valid_ppl / last_valid_ppl - - config.heuristic_threshold) \ - * config.heuristic_rate - last_valid_ppl = last_valid_ppl \ - * (1 - config.heuristic_smooth_rate) \ - + valid_ppl * config.heuristic_smooth_rate - else: - if valid_ppl > last_valid_ppl: - lr /= config.learning_rate_anneal_factor - last_valid_ppl = valid_ppl - else: - last_valid_ppl = valid_ppl - print("last_valid_ppl: {:.6f}".format(last_valid_ppl)) - - epoch = sess.run(global_step) - print('Terminate after epoch ', epoch) - - # Test - test_data_iter = ptb_iterator(data["test_text_id"], 1, memory_size) - test_ppl = _run_epoch(sess, test_data_iter, 0) - print("Test Perplexity: {:.3f}".format(test_ppl)) - - -if __name__ == '__main__': - tf.app.run(main=_main) diff --git a/examples/memory_network_lm/ptb_reader.py b/examples/memory_network_lm/ptb_reader.py deleted file mode 100644 index 9728d5fc..00000000 --- a/examples/memory_network_lm/ptb_reader.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Utilities for preprocessing and iterating over the PTB data. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=invalid-name, too-many-locals - -import os -import numpy as np - -import tensorflow as tf - -import texar.tf as tx - - -def ptb_iterator(data, batch_size, num_steps): - """Iterates through the ptb data. - """ - data_length = len(data) - batch_length = data_length // batch_size - - data = np.asarray(data[:batch_size * batch_length]) - data = data.reshape([batch_size, batch_length]) - - epoch_size = (batch_length - 1) // num_steps - if epoch_size == 0: - raise ValueError("epoch_size == 0, decrease batch_size or num_steps") - - for i in range(epoch_size): - x = data[:, i * num_steps: (i + 1) * num_steps] - y = data[:, i * num_steps + 1: (i + 1) * num_steps + 1] - yield (x, y) - - -def ptb_iterator_memnet(data, batch_size, memory_size): - """Iterates through the ptb data. - """ - data_length = len(data) - length = data_length - memory_size - order = list(range(length)) - np.random.shuffle(order) - - data = np.asarray(data) - - for i in range(0, length, batch_size): - x, y = [], [] - for j in range(i, min(i + batch_size, length)): - idx = order[j] - x.append(data[idx: idx + memory_size]) - y.append(data[idx + memory_size]) - x, y = np.asarray(x), np.asarray(y) - yield (x, y) - - -def prepare_data(data_path): - """Preprocess PTB data. - """ - train_path = os.path.join(data_path, "ptb.train.txt") - if not tf.gfile.Exists(train_path): - url = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz' - tx.data.maybe_download(url, data_path, extract=True) - data_path = os.path.join(data_path, 'simple-examples', 'data') - - train_path = os.path.join(data_path, "ptb.train.txt") - valid_path = os.path.join(data_path, "ptb.valid.txt") - test_path = os.path.join(data_path, "ptb.test.txt") - - word_to_id = tx.data.make_vocab( - train_path, newline_token="", return_type="dict") - assert len(word_to_id) == 10000 - - train_text = tx.data.read_words( - train_path, newline_token="") - train_text_id = [word_to_id[w] for w in train_text if w in word_to_id] - - valid_text = tx.data.read_words( - valid_path, newline_token="") - valid_text_id = [word_to_id[w] for w in valid_text if w in word_to_id] - - test_text = tx.data.read_words( - test_path, newline_token="") - test_text_id = [word_to_id[w] for w in test_text if w in word_to_id] - - data = { - "train_text": train_text, - "valid_text": valid_text, - "test_text": test_text, - "train_text_id": train_text_id, - "valid_text_id": valid_text_id, - "test_text_id": test_text_id, - "vocab": word_to_id, - "vocab_size": len(word_to_id) - } - return data diff --git a/examples/rl_gym/README.md b/examples/rl_gym/README.md deleted file mode 100644 index e114e4da..00000000 --- a/examples/rl_gym/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# Reinforcement Learning for Games # - -This example implements three RL algorithms for the Cartpole game based on the OpenAI Gym environment: -* [pg_cartpole.py](./pg_cartpole.py) uses Policy Gradient -* [dqn_cartpole.py](./dqn_cartpole.py) uses Deep-Q -* [ac_cartpole.py](./ac_cartpole.py) uses Actor-critic - -The example is for demonstrating the Texar RL APIs (for games), and only implements the most basic versions of respective algorithms. 
- -## Usage ## - -Run the following cmd to start training: - -``` -python pg_cartpole.py --config config -python dqn_cartpole.py --config config -python ac_cartpole.py --config config -``` diff --git a/examples/rl_gym/ac_cartpole.py b/examples/rl_gym/ac_cartpole.py deleted file mode 100644 index 725f8618..00000000 --- a/examples/rl_gym/ac_cartpole.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Actor-Critic for the CartPole game in OpenAI gym. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=invalid-name - -import importlib -import gym -import tensorflow as tf -import texar.tf as tx - -flags = tf.flags - -flags.DEFINE_string("config", "config", "The config to use.") - -FLAGS = flags.FLAGS - -config = importlib.import_module(FLAGS.config) - -if __name__ == '__main__': - env = gym.make('CartPole-v0') - env = env.unwrapped - - env_config = tx.agents.get_gym_env_config(env) - - agent = tx.agents.ActorCriticAgent(env_config=env_config) - with tf.Session() as sess: - agent.sess = sess - - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - - feed_dict = {tx.global_mode(): tf.estimator.ModeKeys.TRAIN} - - for e in range(5000): - reward_sum = 0. - observ = env.reset() - agent.reset() - while True: - action = agent.get_action(observ, feed_dict=feed_dict) - - next_observ, reward, terminal, _ = env.step(action=action) - agent.observe(reward, terminal, feed_dict=feed_dict) - observ = next_observ - - reward_sum += reward - if terminal: - break - - if (e + 1) % 10 == 0: - print('episode {}: {}'.format(e + 1, reward_sum)) diff --git a/examples/rl_gym/config.py b/examples/rl_gym/config.py deleted file mode 100644 index 7b2a4a67..00000000 --- a/examples/rl_gym/config.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Cartpole config. -""" - -# pylint: disable=invalid-name - -policy_hparams = None  # Use default hyperparameters - -pg_agent_hparams = { - "policy_hparams": policy_hparams, - "normalize_reward": True -} diff --git a/examples/rl_gym/dqn_cartpole.py b/examples/rl_gym/dqn_cartpole.py deleted file mode 100644 index e7d89140..00000000 --- a/examples/rl_gym/dqn_cartpole.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright 2018 The Texar Authors.
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Deep Q-Network (DQN) for the CartPole game in OpenAI gym. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=invalid-name - -import importlib -import gym -import tensorflow as tf -import texar.tf as tx - - -flags = tf.flags - -flags.DEFINE_string("config", "config", "The config to use.") - -FLAGS = flags.FLAGS - -config = importlib.import_module(FLAGS.config) - - -if __name__ == '__main__': - env = gym.make('CartPole-v0') - env = env.unwrapped - - env_config = tx.agents.get_gym_env_config(env) - - with tf.Session() as sess: - agent = tx.agents.DQNAgent(sess=sess, env_config=env_config) - - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - - feed_dict = {tx.global_mode(): tf.estimator.ModeKeys.TRAIN} - - for e in range(500): - reward_sum = 0. - observ = env.reset() - agent.reset() - while True: - action = agent.get_action(observ, feed_dict=feed_dict) - - next_observ, reward, terminal, _ = env.step(action=action) - agent.observe(reward, terminal, feed_dict=feed_dict) - observ = next_observ - - reward_sum += reward - if terminal: - break - - if (e + 1) % 10 == 0: - print('episode {}: {}'.format(e + 1, reward_sum)) diff --git a/examples/rl_gym/pg_cartpole.py b/examples/rl_gym/pg_cartpole.py deleted file mode 100644 index c5cd99f1..00000000 --- a/examples/rl_gym/pg_cartpole.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Policy gradient for the CartPole game in OpenAI gym.
-""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=invalid-name - -import importlib -import gym -import tensorflow as tf -import texar.tf as tx -from texar.tf.agents import PGAgent - -flags = tf.flags - -flags.DEFINE_string("config", "config", "The config to use.") - -FLAGS = flags.FLAGS - -config = importlib.import_module(FLAGS.config) - - -def _main(_): - env = gym.make('CartPole-v0') - env = env.unwrapped - - env_config = tx.agents.get_gym_env_config(env) - agent = PGAgent( - env_config, - policy_kwargs={'action_space': env_config.action_space}, - hparams=config.pg_agent_hparams) - - sess = tf.Session() - agent.sess = sess - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - feed_dict = {tx.global_mode(): tf.estimator.ModeKeys.TRAIN} - - for e in range(300): - reward_sum = 0. - observ = env.reset() - agent.reset() - while True: - action = agent.get_action(observ, feed_dict=feed_dict) - - next_observ, reward, terminal, _ = env.step(action=action) - if terminal: - reward = 0. - agent.observe(reward, terminal, feed_dict=feed_dict) - observ = next_observ - - reward_sum += reward - if terminal: - break - - if (e + 1) % 10 == 0: - print('episode {}: {}'.format(e + 1, reward_sum)) - - sess.close() - - -if __name__ == '__main__': - tf.app.run(main=_main) diff --git a/examples/sentence_classifier/.gitignore b/examples/sentence_classifier/.gitignore deleted file mode 100644 index 82f0c3ac..00000000 --- a/examples/sentence_classifier/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/data/ diff --git a/examples/sentence_classifier/README.md b/examples/sentence_classifier/README.md deleted file mode 100644 index 28f1decb..00000000 --- a/examples/sentence_classifier/README.md +++ /dev/null @@ -1,34 +0,0 @@ -# Sentence Sentiment Classifier # - -This example builds sentence convolutional classifier, and trains on [SST data](https://nlp.stanford.edu/sentiment/index.html). The example config [config_kim.py](./config_kim.py) corresponds to the paper -[(Kim) Convolutional Neural Networks for Sentence Classification](https://arxiv.org/pdf/1408.5882.pdf). - -The example shows: - * Contruction of simple model, involving the `Embedder` and `Conv1DClassifier`. - * Use of Texar `MultiAlignedData` to read parallel text and label data. - -## Usage ## - -Use the following cmd to download and prepare the SST binary data: - -``` -python sst_data_preprocessor.py [--data_path ./data] -``` - -Here - * `--data_path` specifies the directory to store the SST data. If the data files do not exist, the program will automatically download, extract, and pre-process the data. - -The following cmd trains the model with Kim's config: - -``` -python clas_main.py --config config_kim -``` - -Here: - * `--config` specifies the config file to use. E.g., the above use the configuration defined in [config_kim.py](./config_kim.py) - -The model will begin training and evaluating on the validation data, and will evaluate on the test data after every epoch if a valid accuracy is obtained. - -## Results ## - -The model achieves around `83%` test set accuracy. diff --git a/examples/sentence_classifier/clas_main.py b/examples/sentence_classifier/clas_main.py deleted file mode 100644 index 0df94df5..00000000 --- a/examples/sentence_classifier/clas_main.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Example for building a sentence convolutional classifier. - -Use `./sst_data_preprocessor.py` to download and clean the SST binary data. - -To run: - -$ python clas_main.py --config=config_kim -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import importlib -import tensorflow as tf -import texar.tf as tx - -# pylint: disable=invalid-name, too-many-locals - -flags = tf.flags - -flags.DEFINE_string("config", "config_kim", "The config to use.") - -FLAGS = flags.FLAGS - -config = importlib.import_module(FLAGS.config) - - -def _main(_): - # Data - train_data = tx.data.MultiAlignedData(config.train_data) - val_data = tx.data.MultiAlignedData(config.val_data) - test_data = tx.data.MultiAlignedData(config.test_data) - iterator = tx.data.TrainTestDataIterator(train_data, val_data, test_data) - batch = iterator.get_next() - - # Model architecture - embedder = tx.modules.WordEmbedder( - vocab_size=train_data.vocab('x').size, hparams=config.emb) - classifier = tx.modules.Conv1DClassifier(config.clas) - logits, pred = classifier(embedder(batch['x_text_ids'])) - - # Losses & train ops - loss = tf.losses.sparse_softmax_cross_entropy( - labels=batch['y'], logits=logits) - accu = tx.evals.accuracy(batch['y'], pred) - - train_op = tx.core.get_train_op(loss, hparams=config.opt) - - def _run_epoch(sess, mode, epoch=0, verbose=False): - is_train = tx.utils.is_train_mode_py(mode) - - fetches = { - "accu": accu, - "batch_size": tx.utils.get_batch_size(batch['y']) - } - if is_train: - fetches["train_op"] = train_op - feed_dict = {tx.context.global_mode(): mode} - - cum_accu = 0. - nsamples = 0 - step = 0 - while True: - try: - rets = sess.run(fetches, feed_dict) - step += 1 - - accu_ = rets['accu'] - cum_accu += accu_ * rets['batch_size'] - nsamples += rets['batch_size'] - - if verbose and (step == 1 or step % 100 == 0): - tf.logging.info( - "epoch: {0:2} step: {1:4} accu: {2:.4f}" - .format(epoch, step, accu_)) - except tf.errors.OutOfRangeError: - break - return cum_accu / nsamples - - with tf.Session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - - best_val_accu = -1. 
- for epoch in range(config.num_epochs): - # Train - iterator.switch_to_train_data(sess) - train_accu = _run_epoch(sess, tf.estimator.ModeKeys.TRAIN, epoch) - # Val - iterator.switch_to_val_data(sess) - val_accu = _run_epoch(sess, tf.estimator.ModeKeys.EVAL, epoch) - tf.logging.info('epoch: {0:2} train accu: {1:.4f} val accu: {2:.4f}' - .format(epoch + 1, train_accu, val_accu)) - # Test - if val_accu > best_val_accu: - best_val_accu = val_accu - - iterator.switch_to_test_data(sess) - test_accu = _run_epoch(sess, tf.estimator.ModeKeys.EVAL) - tf.logging.info('test accu: {0:.4f}'.format(test_accu)) - - -if __name__ == '__main__': - tf.logging.set_verbosity(tf.logging.INFO) - tf.app.run(main=_main) diff --git a/examples/sentence_classifier/config_kim.py b/examples/sentence_classifier/config_kim.py deleted file mode 100644 index ce6b8693..00000000 --- a/examples/sentence_classifier/config_kim.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Sentence convolutional classifier config. - -This is (approximately) the config of the paper: -(Kim) Convolutional Neural Networks for Sentence Classification - https://arxiv.org/pdf/1408.5882.pdf -""" - -# pylint: disable=invalid-name, too-few-public-methods, missing-docstring - -import copy - -num_epochs = 15 - -train_data = { - "batch_size": 50, - "datasets": [ - { - "files": "./data/sst2.train.sentences.txt", - "vocab_file": "./data/sst2.vocab", - # Discards samples with length > 56 - "max_seq_length": 56, - "length_filter_mode": "discard", - # Do not append BOS/EOS tokens to the sentences - "bos_token": "", - "eos_token": "", - "data_name": "x" - }, - { - "files": "./data/sst2.train.labels.txt", - "data_type": "int", - "data_name": "y" - } - ] -} -# The val and test data have the same config as the train data, except -# for the file names -val_data = copy.deepcopy(train_data) -val_data["datasets"][0]["files"] = "./data/sst2.dev.sentences.txt" -val_data["datasets"][1]["files"] = "./data/sst2.dev.labels.txt" -test_data = copy.deepcopy(train_data) -test_data["datasets"][0]["files"] = "./data/sst2.test.sentences.txt" -test_data["datasets"][1]["files"] = "./data/sst2.test.labels.txt" - -# Word embedding -emb = { - "dim": 300 -} - -# Classifier -clas = { - "num_conv_layers": 1, - "filters": 100, - "kernel_size": [3, 4, 5], - "conv_activation": "relu", - "pooling": "MaxPooling1D", - "num_dense_layers": 0, - "dropout_conv": [1], - "dropout_rate": 0.5, - "num_classes": 2 -} - -# Optimization -# Just use the default config, e.g., Adam Optimizer -opt = {} diff --git a/examples/sentence_classifier/sst_data_preprocessor.py b/examples/sentence_classifier/sst_data_preprocessor.py deleted file mode 100644 index fda6e14f..00000000 --- a/examples/sentence_classifier/sst_data_preprocessor.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved.
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Preparing the SST2 dataset. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import re -from io import open  # pylint: disable=redefined-builtin -import tensorflow as tf -import texar.tf as tx - -# pylint: disable=invalid-name, too-many-locals - -flags = tf.flags - -flags.DEFINE_string("data_path", "./data", - "Directory containing SST data. " - "E.g., ./data/sst2.train.sentences.txt. If it does not exist, " - "the directory will be created and SST raw data will " - "be downloaded.") - -FLAGS = flags.FLAGS - - -def clean_sst_text(text): - """Cleans tokens in the SST data, which has already been tokenized. - """ - text = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", text) - text = re.sub(r"\s{2,}", " ", text) - return text.strip().lower() - - -def transform_raw_sst(data_path, raw_fn, new_fn): - """Transforms the raw data format to a new format. - """ - fout_x_name = os.path.join(data_path, new_fn + '.sentences.txt') - fout_x = open(fout_x_name, 'w', encoding='utf-8') - fout_y_name = os.path.join(data_path, new_fn + '.labels.txt') - fout_y = open(fout_y_name, 'w', encoding='utf-8') - - fin_name = os.path.join(data_path, raw_fn) - with open(fin_name, 'r', encoding='utf-8') as fin: - for line in fin: - parts = line.strip().split() - label = parts[0] - sent = ' '.join(parts[1:]) - sent = clean_sst_text(sent) - fout_x.write(sent + '\n') - fout_y.write(label + '\n') - - return fout_x_name, fout_y_name - - -def prepare_data(data_path): - """Preprocesses SST2 data.
- """ - train_path = os.path.join(data_path, "sst.train.sentences.txt") - if not tf.gfile.Exists(train_path): - url = ('https://raw.githubusercontent.com/ZhitingHu/' - 'logicnn/master/data/raw/') - files = ['stsa.binary.phrases.train', 'stsa.binary.dev', - 'stsa.binary.test'] - for fn in files: - tx.data.maybe_download(url + fn, data_path, extract=True) - - fn_train, _ = transform_raw_sst( - data_path, 'stsa.binary.phrases.train', 'sst2.train') - transform_raw_sst(data_path, 'stsa.binary.dev', 'sst2.dev') - transform_raw_sst(data_path, 'stsa.binary.test', 'sst2.test') - - vocab = tx.data.make_vocab(fn_train) - fn_vocab = os.path.join(data_path, 'sst2.vocab') - with open(fn_vocab, 'w', encoding='utf-8') as f_vocab: - for v in vocab: - f_vocab.write(v + '\n') - - tf.logging.info('Preprocessing done: {}'.format(data_path)) - - -def _main(_): - prepare_data(FLAGS.data_path) - - -if __name__ == '__main__': - tf.logging.set_verbosity(tf.logging.INFO) - tf.app.run(main=_main) diff --git a/examples/seq2seq_attn/.gitignore b/examples/seq2seq_attn/.gitignore deleted file mode 100644 index 9faa04f8..00000000 --- a/examples/seq2seq_attn/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -/data/ -/data.zip -/iwslt14.zip -/toy_copy.zip diff --git a/examples/seq2seq_attn/README.md b/examples/seq2seq_attn/README.md deleted file mode 100644 index 73ff0c56..00000000 --- a/examples/seq2seq_attn/README.md +++ /dev/null @@ -1,40 +0,0 @@ -# Seq2seq Model # - -This example builds an attentional seq2seq model for machine translation. - -## Usage ## - -### Dataset ### - -Two example datasets are provided: - - * toy_copy: A small toy autoencoding dataset from [TF Seq2seq toolkit](https://github.com/google/seq2seq/tree/2500c26add91b079ca00cf1f091db5a99ddab9ae). - * iwslt14: The benchmark [IWSLT2014](https://sites.google.com/site/iwsltevaluation2014/home) (de-en) machine translation dataset, following [(Ranzato et al., 2015)](https://arxiv.org/pdf/1511.06732.pdf) for data pre-processing. - -Download the data with the following cmds: - -``` -python prepare_data.py --data toy_copy -python prepare_data.py --data iwslt14 -``` - -### Train the model ### - -Train the model with the following cmd: - -``` -python seq2seq_attn.py --config_model config_model --config_data config_toy_copy -``` - -Here: - * `--config_model` specifies the model config. Note not to include the `.py` suffix. - * `--config_data` specifies the data config. - -[config_model.py](./config_model.py) specifies a single-layer seq2seq model with Luong attention and bi-directional RNN encoder. Hyperparameters taking default values can be omitted from the config file. - -For demonstration purpose, [config_model_full.py](./config_model_full.py) gives all possible hyperparameters for the model. The two config files will lead to the same model. - -## Results ## - -On the IWSLT14 dataset, using original target texts as reference(no `` in the reference), the model achieves `BLEU = 26.44 ± 0.18` . 
- diff --git a/examples/seq2seq_attn/config_iwslt14.py b/examples/seq2seq_attn/config_iwslt14.py deleted file mode 100644 index 0c36dc73..00000000 --- a/examples/seq2seq_attn/config_iwslt14.py +++ /dev/null @@ -1,45 +0,0 @@ - -num_epochs = 15 -display = 500 - -source_vocab_file = './data/iwslt14/vocab.de' -target_vocab_file = './data/iwslt14/vocab.en' - -train = { - 'batch_size': 32, - 'allow_smaller_final_batch': False, - 'source_dataset': { - "files": 'data/iwslt14/train.de', - 'vocab_file': source_vocab_file, - 'max_seq_length': 50 - }, - 'target_dataset': { - 'files': 'data/iwslt14/train.en', - 'vocab_file': target_vocab_file, - 'max_seq_length': 50 - } -} -val = { - 'batch_size': 32, - 'shuffle': False, - 'source_dataset': { - "files": 'data/iwslt14/valid.de', - 'vocab_file': source_vocab_file, - }, - 'target_dataset': { - 'files': 'data/iwslt14/valid.en', - 'vocab_file': target_vocab_file, - } -} -test = { - 'batch_size': 32, - 'shuffle': False, - 'source_dataset': { - "files": 'data/iwslt14/test.de', - 'vocab_file': source_vocab_file, - }, - 'target_dataset': { - 'files': 'data/iwslt14/test.en', - 'vocab_file': target_vocab_file, - } -} diff --git a/examples/seq2seq_attn/config_model.py b/examples/seq2seq_attn/config_model.py deleted file mode 100644 index 35758dab..00000000 --- a/examples/seq2seq_attn/config_model.py +++ /dev/null @@ -1,37 +0,0 @@ -# Attentional Seq2seq model. -# Hyperparameters not specified here will take the default values. - -num_units = 256 -beam_width = 10 - -embedder = { - 'dim': num_units -} -encoder = { - 'rnn_cell_fw': { - 'kwargs': { - 'num_units': num_units - } - } -} -decoder = { - 'rnn_cell': { - 'kwargs': { - 'num_units': num_units - }, - }, - 'attention': { - 'kwargs': { - 'num_units': num_units, - }, - 'attention_layer_size': num_units - } -} -opt = { - 'optimizer': { - 'type': 'AdamOptimizer', - 'kwargs': { - 'learning_rate': 0.001, - }, - }, -} diff --git a/examples/seq2seq_attn/config_model_full.py b/examples/seq2seq_attn/config_model_full.py deleted file mode 100644 index bfcebd80..00000000 --- a/examples/seq2seq_attn/config_model_full.py +++ /dev/null @@ -1,145 +0,0 @@ -# The full possible hyperparameters for the attentional seq2seq model. -# Most of the hyperparameters take the default values and are not necessary to -# specify explicitly. The config here results in the same model with the -# `config_model.py`. - -num_units = 256 -beam_width = 10 - -# --------------------- Embedder --------------------- # -embedder = { - 'dim': num_units, - 'initializer': { - 'type': 'random_uniform_initializer', - 'kwargs': { - 'minval': -0.1, - 'maxval': 0.1, - 'seed': None - }, - }, - 'regularizer': { - 'type': 'L1L2', - 'kwargs': { - 'l1': 0, - 'l2': 0 - } - }, - 'dropout_rate': 0, - 'dropout_strategy': 'element', - 'trainable': True, - 'name': 'word_embedder' -} - -# --------------------- Encoder --------------------- # -encoder = { - 'rnn_cell_fw': { - 'type': 'LSTMCell', - 'kwargs': { - 'num_units': num_units, - 'forget_bias': 1.0, - 'activation': None, - # Other arguments go here for tf.nn.rnn_cell.LSTMCell - # ... - }, - 'num_layers': 1, - 'dropout': { - 'input_keep_prob': 1.0, - 'output_keep_prob': 1.0, - 'state_keep_prob': 1.0, - 'variational_recurrent': False, - 'input_size': [], - }, - 'residual': False, - 'highway': False, - }, - 'rnn_cell_bw': { - # The same possible hyperparameters as with 'rnn_cell_fw' - # ... 
- }, - 'rnn_cell_share_config': True, - 'output_layer_fw': { - 'num_layers': 0, - 'layer_size': 128, - 'activation': 'identity', - 'final_layer_activation': None, - 'other_dense_kwargs': None, - 'dropout_layer_ids': [], - 'dropout_rate': 0.5, - 'variational_dropout': False - }, - 'output_layer_bw': { - # The same possible hyperparameters as with 'output_layer_fw' - # ... - }, - 'output_layer_share_config': True, - 'name': 'bidirectional_rnn_encoder' -} - -# --------------------- Decoder --------------------- # -decoder = { - 'rnn_cell': { - 'type': 'LSTMCell', - 'kwargs': { - 'num_units': num_units, - 'forget_bias': 1.0, - 'activation': None, - # Other arguments go here for tf.nn.rnn_cell.LSTMCell - # ... - }, - 'num_layers': 1, - 'dropout': { - 'input_keep_prob': 1.0, - 'output_keep_prob': 1.0, - 'state_keep_prob': 1.0, - 'variational_recurrent': False, - 'input_size': [], - }, - 'residual': False, - 'highway': False, - }, - 'attention': { - 'type': 'LuongAttention', - 'kwargs': { - 'num_units': num_units, - 'scale': False, - 'probability_fn': None, - 'score_mask_value': None, - # Other arguments go here for tf.contrib.seq2seq.LuongAttention - # ... - }, - 'attention_layer_size': num_units, - 'alignment_history': False, - 'output_attention': True, - }, - 'helper_train': { - 'type': 'TrainingHelper', - 'kwargs': { - # Arguments go here for tf.contrib.seq2seq.TrainingHelper - } - }, - 'helper_infer': { - # The same possible hyperparameters as with 'helper_train' - # ... - }, - 'max_decoding_length_train': None, - 'max_decoding_length_infer': None, - 'name': 'attention_rnn_decoder' -} -# --------------------- Optimization --------------------- # -opt = { - 'optimizer': { - 'type': 'AdamOptimizer', - 'kwargs': { - 'learning_rate': 0.001, - # Other keyword arguments for the optimizer class - }, - }, - 'learning_rate_decay': { - # Hyperparameters of learning rate decay - }, - 'gradient_clip': { - # Hyperparameters of gradient clipping - }, - 'gradient_noise_scale': None, - 'name': None -} diff --git a/examples/seq2seq_attn/config_toy_copy.py b/examples/seq2seq_attn/config_toy_copy.py deleted file mode 100644 index e937f079..00000000 --- a/examples/seq2seq_attn/config_toy_copy.py +++ /dev/null @@ -1,40 +0,0 @@ - -num_epochs = 4 -display = 50 - -source_vocab_file = './data/toy_copy/train/vocab.sources.txt' -target_vocab_file = './data/toy_copy/train/vocab.targets.txt' - -train = { - 'batch_size': 32, - 'source_dataset': { - "files": './data/toy_copy/train/sources.txt', - 'vocab_file': source_vocab_file - }, - 'target_dataset': { - 'files': './data/toy_copy/train/targets.txt', - 'vocab_file': target_vocab_file - } -} -val = { - 'batch_size': 32, - 'source_dataset': { - "files": './data/toy_copy/dev/sources.txt', - 'vocab_file': source_vocab_file - }, - 'target_dataset': { - "files": './data/toy_copy/dev/targets.txt', - 'vocab_file': target_vocab_file - } -} -test = { - 'batch_size': 32, - 'source_dataset': { - "files": './data/toy_copy/test/sources.txt', - 'vocab_file': source_vocab_file - }, - 'target_dataset': { - "files": './data/toy_copy/test/targets.txt', - 'vocab_file': target_vocab_file - } -} diff --git a/examples/seq2seq_attn/prepare_data.py b/examples/seq2seq_attn/prepare_data.py deleted file mode 100644 index 905e31a8..00000000 --- a/examples/seq2seq_attn/prepare_data.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Downloads data. -""" -import tensorflow as tf -import texar.tf as tx - -# pylint: disable=invalid-name - -flags = tf.flags - -flags.DEFINE_string("data", "iwslt14", "Data to download [iwslt14|toy_copy]") - -FLAGS = flags.FLAGS - - -def prepare_data(): - """Downloads data. - """ - if FLAGS.data == 'iwslt14': - tx.data.maybe_download( - urls='https://drive.google.com/file/d/' - '1y4mUWXRS2KstgHopCS9koZ42ENOh6Yb9/view?usp=sharing', - path='./', - filenames='iwslt14.zip', - extract=True) - elif FLAGS.data == 'toy_copy': - tx.data.maybe_download( - urls='https://drive.google.com/file/d/' - '1fENE2rakm8vJ8d3voWBgW4hGlS6-KORW/view?usp=sharing', - path='./', - filenames='toy_copy.zip', - extract=True) - else: - raise ValueError('Unknown data: {}'.format(FLAGS.data)) - - -def main(): - """Entrypoint. - """ - prepare_data() - - -if __name__ == '__main__': - main() diff --git a/examples/seq2seq_attn/seq2seq_attn.py b/examples/seq2seq_attn/seq2seq_attn.py deleted file mode 100644 index 979ea76f..00000000 --- a/examples/seq2seq_attn/seq2seq_attn.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Attentional Seq2seq. -""" -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -# pylint: disable=invalid-name, too-many-arguments, too-many-locals - -import importlib -import tensorflow as tf -import texar.tf as tx - -flags = tf.flags - -flags.DEFINE_string("config_model", "config_model", "The model config.") -flags.DEFINE_string("config_data", "config_iwslt14", "The dataset config.") - -FLAGS = flags.FLAGS - -config_model = importlib.import_module(FLAGS.config_model) -config_data = importlib.import_module(FLAGS.config_data) - - -def build_model(batch, train_data): - """Assembles the seq2seq model. 
- """ - source_embedder = tx.modules.WordEmbedder( - vocab_size=train_data.source_vocab.size, hparams=config_model.embedder) - - encoder = tx.modules.BidirectionalRNNEncoder( - hparams=config_model.encoder) - - enc_outputs, _ = encoder(source_embedder(batch['source_text_ids'])) - - target_embedder = tx.modules.WordEmbedder( - vocab_size=train_data.target_vocab.size, hparams=config_model.embedder) - - decoder = tx.modules.AttentionRNNDecoder( - memory=tf.concat(enc_outputs, axis=2), - memory_sequence_length=batch['source_length'], - vocab_size=train_data.target_vocab.size, - hparams=config_model.decoder) - - training_outputs, _, _ = decoder( - decoding_strategy='train_greedy', - inputs=target_embedder(batch['target_text_ids'][:, :-1]), - sequence_length=batch['target_length'] - 1) - - mle_loss = tx.losses.sequence_sparse_softmax_cross_entropy( - labels=batch['target_text_ids'][:, 1:], - logits=training_outputs.logits, - sequence_length=batch['target_length'] - 1) - - train_op = tx.core.get_train_op(mle_loss, hparams=config_model.opt) - - start_tokens = tf.ones_like(batch['target_length']) * \ - train_data.target_vocab.bos_token_id - beam_search_outputs, _, _ = \ - tx.modules.beam_search_decode( - decoder_or_cell=decoder, - embedding=target_embedder, - start_tokens=start_tokens, - end_token=train_data.target_vocab.eos_token_id, - beam_width=config_model.beam_width, - max_decoding_length=60) - - return train_op, beam_search_outputs - - -def main(): - """Entrypoint. - """ - train_data = tx.data.PairedTextData(hparams=config_data.train) - val_data = tx.data.PairedTextData(hparams=config_data.val) - test_data = tx.data.PairedTextData(hparams=config_data.test) - data_iterator = tx.data.TrainTestDataIterator( - train=train_data, val=val_data, test=test_data) - - batch = data_iterator.get_next() - - train_op, infer_outputs = build_model(batch, train_data) - - def _train_epoch(sess): - data_iterator.switch_to_train_data(sess) - - step = 0 - while True: - try: - loss = sess.run(train_op) - if step % config_data.display == 0: - print("step={}, loss={:.4f}".format(step, loss)) - step += 1 - except tf.errors.OutOfRangeError: - break - - def _eval_epoch(sess, mode): - if mode == 'val': - data_iterator.switch_to_val_data(sess) - else: - data_iterator.switch_to_test_data(sess) - - refs, hypos = [], [] - while True: - try: - fetches = [ - batch['target_text'][:, 1:], - infer_outputs.predicted_ids[:, :, 0] - ] - feed_dict = { - tx.global_mode(): tf.estimator.ModeKeys.EVAL - } - target_texts_ori, output_ids = \ - sess.run(fetches, feed_dict=feed_dict) - - target_texts = tx.utils.strip_special_tokens( - target_texts_ori, is_token_list=True) - output_texts = tx.utils.map_ids_to_strs( - ids=output_ids, vocab=val_data.target_vocab) - - for hypo, ref in zip(output_texts, target_texts): - hypos.append(hypo) - refs.append([ref]) - except tf.errors.OutOfRangeError: - break - - return tx.evals.corpus_bleu_moses(list_of_references=refs, - hypotheses=hypos) - - with tf.Session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - - best_val_bleu = -1. 
- for i in range(config_data.num_epochs): - _train_epoch(sess) - - val_bleu = _eval_epoch(sess, 'val') - best_val_bleu = max(best_val_bleu, val_bleu) - print('val epoch={}, BLEU={:.4f}; best-ever={:.4f}'.format( - i, val_bleu, best_val_bleu)) - - test_bleu = _eval_epoch(sess, 'test') - print('test epoch={}, BLEU={:.4f}'.format(i, test_bleu)) - - print('=' * 50) - - -if __name__ == '__main__': - main() diff --git a/examples/seq2seq_configs/.gitignore b/examples/seq2seq_configs/.gitignore deleted file mode 100644 index 9faa04f8..00000000 --- a/examples/seq2seq_configs/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -/data/ -/data.zip -/iwslt14.zip -/toy_copy.zip diff --git a/examples/seq2seq_configs/README.md b/examples/seq2seq_configs/README.md deleted file mode 100644 index 2a7bdfab..00000000 --- a/examples/seq2seq_configs/README.md +++ /dev/null @@ -1,29 +0,0 @@ -# Seq2seq Model # - -This example builds a (plain) seq2seq model with Texar's model template and TensorFlow Estimator. - -## Usage ## - -### Dataset ### - -Download the example dataset: - - * toy_copy: A small toy autoencoding dataset from [TF Seq2seq toolkit](https://github.com/google/seq2seq/tree/2500c26add91b079ca00cf1f091db5a99ddab9ae). - -``` -python [PATH_TEXAR]/examples/seq2seq_attn/prepare_data.py --data toy_copy -``` - -### Train the model ### - -Train the model with the following cmd: - -``` -python [PATH_TEXAR]/bin/train.py --config_paths config_model_small.yml,config_data_toy_copy.yml -``` - -See [train.py](../../bin/train.py) for other available configurations. - -[config_model_small.yml](./config_model_small.yml) specifies a small-size model with single-layer RNN encoder/decoder. [config_model_medium.yml](./config_model_medium.yml) specifies a medium-size one with 2-layer RNN encoder/decoder. - -The model will be trained/evaluated/checkpointed within the [TensorFlow Estimator](https://www.tensorflow.org/guide/estimators). diff --git a/examples/seq2seq_configs/config_data_toy_copy.yml b/examples/seq2seq_configs/config_data_toy_copy.yml deleted file mode 100644 index 3102fdd9..00000000 --- a/examples/seq2seq_configs/config_data_toy_copy.yml +++ /dev/null @@ -1,26 +0,0 @@ -# NMT data config. See `texar.tf.data.PairedTextData.default_hparams()` for -# hyperparameters of train/eval data. Hyperparameters not specified here will -# take the default values. -data_hparams_train: - num_epochs: 10 - batch_size: 32 - source_dataset: - files: ./data/toy_copy/train/sources.txt - vocab_file: ./data/toy_copy/train/vocab.sources.txt - max_seq_length: 30 - target_dataset: - files: ./data/toy_copy/train/targets.txt - vocab_file: ./data/toy_copy/train/vocab.targets.txt - max_seq_length: 30 - -data_hparams_eval: - batch_size: 32 - shuffle: False - source_dataset: - files: ./data/toy_copy/dev/sources.txt - vocab_file: ./data/toy_copy/train/vocab.sources.txt - max_seq_length: 50 - target_dataset: - files: ./data/toy_copy/dev/targets.txt - vocab_file: ./data/toy_copy/train/vocab.targets.txt - max_seq_length: 50 diff --git a/examples/seq2seq_configs/config_model_medium.yml b/examples/seq2seq_configs/config_model_medium.yml deleted file mode 100644 index 4e722987..00000000 --- a/examples/seq2seq_configs/config_model_medium.yml +++ /dev/null @@ -1,29 +0,0 @@ -# Basic Seq2seq model of medium size. See -# `texar.tf.models.BasicSeq2seq.default_hparams()` for possible hyperparameters -# and default values. Hyperparameters not specified here will take the default -# values.
-model: BasicSeq2seq -model_hparams: - source_embedder_hparams: - dim: 256 - encoder_hparams: - rnn_cell: - type: GRUCell - kwargs: - num_units: 256 - num_layers: 2 - dropout: - input_keep_prob: 0.8 - decoder_hparams: - rnn_cell: - type: GRUCell - kwargs: - num_units: 256 - num_layers: 2 - dropout: - input_keep_prob: 0.8 - optimization: - optimizer: - type: AdamOptimizer - kwargs: - learning_rate: 0.0001 diff --git a/examples/seq2seq_configs/config_model_small.yml b/examples/seq2seq_configs/config_model_small.yml deleted file mode 100644 index 36a700af..00000000 --- a/examples/seq2seq_configs/config_model_small.yml +++ /dev/null @@ -1,27 +0,0 @@ -# Basic Seq2seq model of small size. See -# `texar.tf.models.BasicSeq2seq.default_hparams()` for possible hyperparameters -# and default values. Hyperparameters not specified here will take the default -# values. -model: BasicSeq2seq -model_hparams: - source_embedder_hparams: - dim: 128 - encoder_hparams: - rnn_cell: - type: GRUCell - kwargs: - num_units: 128 - dropout: - input_keep_prob: 0.8 - decoder_hparams: - rnn_cell: - type: GRUCell - kwargs: - num_units: 128 - dropout: - input_keep_prob: 0.8 - optimization: - optimizer: - type: AdamOptimizer - kwargs: - learning_rate: 0.0001 diff --git a/examples/seq2seq_exposure_bias/.gitignore b/examples/seq2seq_exposure_bias/.gitignore deleted file mode 100644 index 82f0c3ac..00000000 --- a/examples/seq2seq_exposure_bias/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/data/ diff --git a/examples/seq2seq_exposure_bias/README.md b/examples/seq2seq_exposure_bias/README.md deleted file mode 100644 index 2c5c6330..00000000 --- a/examples/seq2seq_exposure_bias/README.md +++ /dev/null @@ -1,108 +0,0 @@ -# Sequence Generation Algorithms Tackling Exposure Bias # - -Despite its computational simplicity and efficiency, maximum likelihood training of sequence generation models (e.g., RNNs) suffers from the exposure bias [(Ranzato et al., 2015)](https://arxiv.org/pdf/1511.06732.pdf). That is, the model is trained to predict the next token given the previous ground-truth tokens; while at test time, since the resulting model does not have access to the ground truth, tokens generated by the model itself are instead used to make the next prediction. This discrepancy between training and test leads to the issue that mistakes in prediction can quickly accumulate. - -This example provides implementations of some classic and advanced training algorithms that tackle the exposure bias. The base model is an attentional seq2seq. - -* **Maximum Likelihood (MLE)**: attentional seq2seq model with maximum likelihood training. -* **Reward Augmented Maximum Likelihood (RAML)**: Described in [(Norouzi et al., 2016)](https://arxiv.org/pdf/1609.00150.pdf); we use the sampling approach (n-gram replacement) of [(Ma et al., 2017)](https://arxiv.org/abs/1705.07136). -* **Scheduled Sampling**: Described in [(Bengio et al., 2015)](https://arxiv.org/abs/1506.03099). -* **Interpolation Algorithm**: Described in [(Tan et al., 2018) Connecting the Dots Between MLE and RL for Sequence Generation](https://arxiv.org/abs/1811.09740). - -## Usage ## - -### Dataset ### - -Two example datasets are provided: - - * iwslt14: The benchmark [IWSLT2014](https://sites.google.com/site/iwsltevaluation2014/home) (de-en) machine translation dataset, following [(Ranzato et al., 2015)](https://arxiv.org/pdf/1511.06732.pdf) for data pre-processing. - * gigaword: The benchmark [GIGAWORD](https://catalog.ldc.upenn.edu/LDC2003T05) text summarization dataset.
We sampled 200K out of the 3.8M pre-processed training examples provided by [(Rush et al., 2015)](https://www.aclweb.org/anthology/D/D15/D15-1044.pdf) for the sake of training efficiency. We used the refined validation and test sets provided by [(Zhou et al., 2017)](https://arxiv.org/pdf/1704.07073.pdf).
-
-Download the data with the following commands:
-
-```
-python utils/prepare_data.py --data iwslt14
-python utils/prepare_data.py --data giga
-```
-
-### Train the models ###
-
-#### Baseline Attentional Seq2seq
-
-```
-python baseline_seq2seq_attn_main.py \
-    --config_model configs.config_model \
-    --config_data configs.config_iwslt14
-```
-
-Here:
- * `--config_model` specifies the model config. Note: do not include the `.py` suffix.
- * `--config_data` specifies the data config.
-
-[configs.config_model.py](./configs/config_model.py) specifies a single-layer seq2seq model with Luong attention and a bi-directional RNN encoder. Hyperparameters taking default values can be omitted from the config file.
-
-For demonstration purposes, [configs.config_model_full.py](./configs/config_model_full.py) gives all possible hyperparameters for the model. The two config files lead to the same model.
-
-#### Reward Augmented Maximum Likelihood (RAML)
-```
-python raml_main.py \
-    --config_model configs.config_model \
-    --config_data configs.config_iwslt14 \
-    --raml_file data/iwslt14/samples_iwslt14.txt \
-    --n_samples 10
-```
-Here:
- * `--raml_file` specifies the file containing the augmented samples and rewards.
- * `--n_samples` specifies the number of augmented samples for every target sentence.
- * `--tau` specifies the temperature of the exponentiated payoff distribution in RAML.
-
-In the downloaded datasets, we have provided example files for `--raml_file`, which include augmented samples for ```iwslt14``` and ```gigaword```, respectively. We also provide scripts for generating the augmented samples yourself. Please refer to [utils/raml_samples_generation](utils/raml_samples_generation).
-
-
-#### Scheduled Sampling
-```
-python scheduled_sampling_main.py \
-    --config_model configs.config_model \
-    --config_data configs.config_iwslt14 \
-    --decay_factor 500.
-```
-Here:
- * `--decay_factor` specifies the hyperparameter controlling how fast the probability of sampling from the model increases.
-
-
-#### Interpolation Algorithm
-```
-python interpolation_main.py \
-    --config_model configs.config_model \
-    --config_data configs.config_iwslt14 \
-    --lambdas_init [0.04,0.96,0.0] \
-    --delta_lambda_self 0.06 \
-    --delta_lambda_reward 0.06 \
-    --lambda_reward_steps 4
-```
-Here:
-
- * `--lambdas_init` specifies the initial values of the lambdas.
- * `--delta_lambda_reward` specifies the increment of lambda_reward at every annealing step.
- * `--delta_lambda_self` specifies the increment of lambda_self (and the matching decrement of lambda_truth) at every annealing step.
- * `--lambda_reward_steps` specifies the number of times lambda_reward is increased between consecutive increases of lambda_self.
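The following is a minimal, self-contained sketch (an editorial illustration, not part of the example code) of the annealing schedule these flags control. The variable names mirror the flags above, and the update rule mirrors `_anneal()` in `interpolation_main.py` later in this diff; `lambdas` is ordered `[lambda_self, lambda_truth, lambda_reward]`. In the actual training loop an annealing step is triggered only when the reward on the validation set drops.

```
# Sketch of the lambda annealing schedule of the interpolation algorithm.
# lambdas = [lambda_self, lambda_truth, lambda_reward].
lambdas = [0.04, 0.96, 0.0]   # --lambdas_init
delta_lambda_self = 0.06      # --delta_lambda_self
delta_lambda_reward = 0.06    # --delta_lambda_reward
lambda_reward_steps = 4       # --lambda_reward_steps

updates = ['rew'] * lambda_reward_steps

def anneal():
    # After `lambda_reward_steps` consecutive reward updates, move mass
    # from lambda_truth to lambda_self; otherwise move it to lambda_reward.
    if updates[-lambda_reward_steps:] == ['rew'] * lambda_reward_steps:
        lambdas[1] -= delta_lambda_self
        lambdas[0] += delta_lambda_self
        updates.append('self')
    else:
        lambdas[1] -= delta_lambda_reward
        lambdas[2] += delta_lambda_reward
        updates.append('rew')

for step in range(10):
    anneal()
    print(step, ['{:.2f}'.format(l) for l in lambdas])
```

With the default values, every annealing step moves probability mass out of lambda_truth, and lambda_reward is raised four times for every time lambda_self is raised.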
- -## Results ## - -### Machine Translation -| Model | BLEU Score | -| -----------| -------| -| MLE | 26.44 ± 0.18 | -| Scheduled Sampling | 26.76 ± 0.17 | -| RAML | 27.22 ± 0.14 | -| Interpolation | 27.82 ± 0.11 | - -### Text Summarization -| Model | Rouge-1 | Rouge-2 | Rouge-L | -| -----------| -------|-------|-------| -| MLE | 36.11 ± 0.21 | 16.39 ± 0.16 | 32.32 ± 0.19 | -| Scheduled Sampling | 36.59 ± 0.12 |16.79 ± 0.22|32.77 ± 0.17| -| RAML | 36.30 ± 0.24 | 16.69 ± 0.20 | 32.49 ± 0.17 | -| Interpolation | 36.72 ± 0.29 |16.99 ± 0.17 | 32.95 ± 0.33| - - diff --git a/examples/seq2seq_exposure_bias/baseline_seq2seq_attn_main.py b/examples/seq2seq_exposure_bias/baseline_seq2seq_attn_main.py deleted file mode 100644 index db5a9462..00000000 --- a/examples/seq2seq_exposure_bias/baseline_seq2seq_attn_main.py +++ /dev/null @@ -1,234 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Attentional Seq2seq. -same as examples/seq2seq_attn except that here Rouge is also supported. -""" -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division -from __future__ import unicode_literals - -# pylint: disable=invalid-name, too-many-arguments, too-many-locals - -from io import open -import importlib -import tensorflow as tf -import texar.tf as tx -from rouge import Rouge - -flags = tf.flags - -flags.DEFINE_string("config_model", "configs.config_model", "The model config.") -flags.DEFINE_string("config_data", "configs.config_iwslt14", - "The dataset config.") - -flags.DEFINE_string('output_dir', '.', 'where to keep training logs') - -FLAGS = flags.FLAGS - -config_model = importlib.import_module(FLAGS.config_model) -config_data = importlib.import_module(FLAGS.config_data) - -if not FLAGS.output_dir.endswith('/'): - FLAGS.output_dir += '/' -log_dir = FLAGS.output_dir + 'training_log_baseline/' -tx.utils.maybe_create_dir(log_dir) - - -def build_model(batch, train_data): - """Assembles the seq2seq model. 
- """ - source_embedder = tx.modules.WordEmbedder( - vocab_size=train_data.source_vocab.size, hparams=config_model.embedder) - - encoder = tx.modules.BidirectionalRNNEncoder( - hparams=config_model.encoder) - - enc_outputs, _ = encoder(source_embedder(batch['source_text_ids'])) - - target_embedder = tx.modules.WordEmbedder( - vocab_size=train_data.target_vocab.size, hparams=config_model.embedder) - - decoder = tx.modules.AttentionRNNDecoder( - memory=tf.concat(enc_outputs, axis=2), - memory_sequence_length=batch['source_length'], - vocab_size=train_data.target_vocab.size, - hparams=config_model.decoder) - - training_outputs, _, _ = decoder( - decoding_strategy='train_greedy', - inputs=target_embedder(batch['target_text_ids'][:, :-1]), - sequence_length=batch['target_length'] - 1) - - train_op = tx.core.get_train_op( - tx.losses.sequence_sparse_softmax_cross_entropy( - labels=batch['target_text_ids'][:, 1:], - logits=training_outputs.logits, - sequence_length=batch['target_length'] - 1), - hparams=config_model.opt) - - start_tokens = tf.ones_like(batch['target_length']) *\ - train_data.target_vocab.bos_token_id - beam_search_outputs, _, _ = \ - tx.modules.beam_search_decode( - decoder_or_cell=decoder, - embedding=target_embedder, - start_tokens=start_tokens, - end_token=train_data.target_vocab.eos_token_id, - beam_width=config_model.beam_width, - max_decoding_length=60) - - return train_op, beam_search_outputs - - -def print_stdout_and_file(content, file): - print(content) - print(content, file=file) - - -def main(): - """Entrypoint. - """ - train_data = tx.data.PairedTextData(hparams=config_data.train) - val_data = tx.data.PairedTextData(hparams=config_data.val) - test_data = tx.data.PairedTextData(hparams=config_data.test) - data_iterator = tx.data.TrainTestDataIterator( - train=train_data, val=val_data, test=test_data) - - batch = data_iterator.get_next() - - train_op, infer_outputs = build_model(batch, train_data) - - def _train_epoch(sess, epoch_no): - data_iterator.switch_to_train_data(sess) - training_log_file = \ - open(log_dir + 'training_log' + str(epoch_no) + '.txt', 'w', - encoding='utf-8') - - step = 0 - while True: - try: - loss = sess.run(train_op) - print("step={}, loss={:.4f}".format(step, loss), - file=training_log_file) - if step % config_data.observe_steps == 0: - print("step={}, loss={:.4f}".format(step, loss)) - training_log_file.flush() - step += 1 - except tf.errors.OutOfRangeError: - break - - def _eval_epoch(sess, mode, epoch_no): - if mode == 'val': - data_iterator.switch_to_val_data(sess) - else: - data_iterator.switch_to_test_data(sess) - - refs, hypos = [], [] - while True: - try: - fetches = [ - batch['target_text'][:, 1:], - infer_outputs.predicted_ids[:, :, 0] - ] - feed_dict = { - tx.global_mode(): tf.estimator.ModeKeys.EVAL - } - target_texts_ori, output_ids = \ - sess.run(fetches, feed_dict=feed_dict) - - target_texts = tx.utils.strip_special_tokens( - target_texts_ori.tolist(), is_token_list=True) - target_texts = tx.utils.str_join(target_texts) - output_texts = tx.utils.map_ids_to_strs( - ids=output_ids, vocab=val_data.target_vocab) - - tx.utils.write_paired_text( - target_texts, output_texts, - log_dir + mode + '_results' + str(epoch_no) + '.txt', - append=True, mode='h', sep=' ||| ') - - for hypo, ref in zip(output_texts, target_texts): - if config_data.eval_metric == 'bleu': - hypos.append(hypo) - refs.append([ref]) - elif config_data.eval_metric == 'rouge': - hypos.append(tx.utils.compat_as_text(hypo)) - refs.append(tx.utils.compat_as_text(ref)) - 
except tf.errors.OutOfRangeError: - break - - if config_data.eval_metric == 'bleu': - return tx.evals.corpus_bleu_moses( - list_of_references=refs, hypotheses=hypos) - elif config_data.eval_metric == 'rouge': - rouge = Rouge() - return rouge.get_scores(hyps=hypos, refs=refs, avg=True) - - def _calc_reward(score): - """ - Return the bleu score or the sum of (Rouge-1, Rouge-2, Rouge-L). - """ - if config_data.eval_metric == 'bleu': - return score - elif config_data.eval_metric == 'rouge': - return sum([value['f'] for key, value in score.items()]) - - with tf.Session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - - best_val_score = -1. - scores_file = open(log_dir + 'scores.txt', 'w', encoding='utf-8') - for i in range(config_data.num_epochs): - _train_epoch(sess, i) - - val_score = _eval_epoch(sess, 'val', i) - test_score = _eval_epoch(sess, 'test', i) - - best_val_score = max(best_val_score, _calc_reward(val_score)) - - if config_data.eval_metric == 'bleu': - print_stdout_and_file( - 'val epoch={}, BLEU={:.4f}; best-ever={:.4f}'.format( - i, val_score, best_val_score), file=scores_file) - - print_stdout_and_file( - 'test epoch={}, BLEU={:.4f}'.format(i, test_score), - file=scores_file) - print_stdout_and_file('=' * 50, file=scores_file) - - elif config_data.eval_metric == 'rouge': - print_stdout_and_file( - 'valid epoch {}:'.format(i), file=scores_file) - for key, value in val_score.items(): - print_stdout_and_file( - '{}: {}'.format(key, value), file=scores_file) - print_stdout_and_file('fsum: {}; best_val_fsum: {}'.format( - _calc_reward(val_score), best_val_score), file=scores_file) - - print_stdout_and_file( - 'test epoch {}:'.format(i), file=scores_file) - for key, value in test_score.items(): - print_stdout_and_file( - '{}: {}'.format(key, value), file=scores_file) - print_stdout_and_file('=' * 110, file=scores_file) - - scores_file.flush() - - -if __name__ == '__main__': - main() diff --git a/examples/seq2seq_exposure_bias/configs/__init__.py b/examples/seq2seq_exposure_bias/configs/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/seq2seq_exposure_bias/configs/config_giga.py b/examples/seq2seq_exposure_bias/configs/config_giga.py deleted file mode 100644 index 8a2079f4..00000000 --- a/examples/seq2seq_exposure_bias/configs/config_giga.py +++ /dev/null @@ -1,47 +0,0 @@ -num_epochs = 30 -observe_steps = 500 - -eval_metric = 'rouge' - -batch_size = 64 -source_vocab_file = './data/giga/vocab.article' -target_vocab_file = './data/giga/vocab.title' - -train = { - 'batch_size': batch_size, - 'allow_smaller_final_batch': False, - 'source_dataset': { - "files": 'data/giga/train.article', - 'vocab_file': source_vocab_file - }, - 'target_dataset': { - 'files': 'data/giga/train.title', - 'vocab_file': target_vocab_file - } -} -val = { - 'batch_size': batch_size, - 'shuffle': False, - 'allow_smaller_final_batch': True, - 'source_dataset': { - "files": 'data/giga/valid.article', - 'vocab_file': source_vocab_file, - }, - 'target_dataset': { - 'files': 'data/giga/valid.title', - 'vocab_file': target_vocab_file, - } -} -test = { - 'batch_size': batch_size, - 'shuffle': False, - 'allow_smaller_final_batch': True, - 'source_dataset': { - "files": 'data/giga/test.article', - 'vocab_file': source_vocab_file, - }, - 'target_dataset': { - 'files': 'data/giga/test.title', - 'vocab_file': target_vocab_file, - } -} diff --git a/examples/seq2seq_exposure_bias/configs/config_iwslt14.py 
b/examples/seq2seq_exposure_bias/configs/config_iwslt14.py deleted file mode 100644 index e27b88f5..00000000 --- a/examples/seq2seq_exposure_bias/configs/config_iwslt14.py +++ /dev/null @@ -1,50 +0,0 @@ -num_epochs = 50 # the best epoch occurs within 10 epochs in most cases -observe_steps = 500 - -eval_metric = 'bleu' - -batch_size = 64 -source_vocab_file = './data/iwslt14/vocab.de' -target_vocab_file = './data/iwslt14/vocab.en' - -train = { - 'batch_size': batch_size, - 'shuffle': True, - 'allow_smaller_final_batch': False, - 'source_dataset': { - "files": 'data/iwslt14/train.de', - 'vocab_file': source_vocab_file, - 'max_seq_length': 50 - }, - 'target_dataset': { - 'files': 'data/iwslt14/train.en', - 'vocab_file': target_vocab_file, - 'max_seq_length': 50 - } -} -val = { - 'batch_size': batch_size, - 'shuffle': False, - 'allow_smaller_final_batch': True, - 'source_dataset': { - "files": 'data/iwslt14/valid.de', - 'vocab_file': source_vocab_file, - }, - 'target_dataset': { - 'files': 'data/iwslt14/valid.en', - 'vocab_file': target_vocab_file, - } -} -test = { - 'batch_size': batch_size, - 'shuffle': False, - 'allow_smaller_final_batch': True, - 'source_dataset': { - "files": 'data/iwslt14/test.de', - 'vocab_file': source_vocab_file, - }, - 'target_dataset': { - 'files': 'data/iwslt14/test.en', - 'vocab_file': target_vocab_file, - } -} diff --git a/examples/seq2seq_exposure_bias/configs/config_model.py b/examples/seq2seq_exposure_bias/configs/config_model.py deleted file mode 100644 index c3d13f73..00000000 --- a/examples/seq2seq_exposure_bias/configs/config_model.py +++ /dev/null @@ -1,43 +0,0 @@ -num_units = 256 -beam_width = 5 -decoder_layers = 1 -dropout = 0.2 - -embedder = { - 'dim': num_units -} -encoder = { - 'rnn_cell_fw': { - 'kwargs': { - 'num_units': num_units - }, - 'dropout': { - 'input_keep_prob': 1. - dropout - } - } -} -decoder = { - 'rnn_cell': { - 'kwargs': { - 'num_units': num_units - }, - 'dropout': { - 'input_keep_prob': 1. - dropout - }, - 'num_layers': decoder_layers - }, - 'attention': { - 'kwargs': { - 'num_units': num_units, - }, - 'attention_layer_size': num_units - } -} -opt = { - 'optimizer': { - 'type': 'AdamOptimizer', - 'kwargs': { - 'learning_rate': 0.001, - }, - }, -} diff --git a/examples/seq2seq_exposure_bias/interpolation_decoder.py b/examples/seq2seq_exposure_bias/interpolation_decoder.py deleted file mode 100644 index eab06245..00000000 --- a/examples/seq2seq_exposure_bias/interpolation_decoder.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Interpolation Decoder is used for interpolation algorithm -which stores one more variable in 'state' recording the -decoded ids(state: [decoded_ids, rnn_state]). 
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# pylint: disable=no-name-in-module, too-many-arguments, too-many-locals
-# pylint: disable=not-context-manager, protected-access, invalid-name
-
-import tensorflow as tf
-
-from texar.tf.modules.decoders.rnn_decoders import \
-    AttentionRNNDecoder, AttentionRNNDecoderOutput
-
-
-class InterpolationDecoder(AttentionRNNDecoder):
-    """
-    Basically the same as AttentionRNNDecoder, except that 'state'
-    stores one more variable besides the RNN state, recording the
-    decoded ids (state: [decoded_ids, rnn_state]).
-
-    Args:
-        memory: The memory to query, e.g., the output of an RNN encoder. This
-            tensor should be shaped `[batch_size, max_time, dim]`.
-        memory_sequence_length (optional): A tensor of shape `[batch_size]`
-            containing the sequence lengths for the batch
-            entries in memory. If provided, the memory tensor rows are masked
-            with zeros for values past the respective sequence lengths.
-        cell (RNNCell, optional): An instance of `RNNCell`. If `None`, a cell
-            is created as specified in :attr:`hparams`.
-        cell_dropout_mode (optional): A Tensor taking value of
-            :tf_main:`tf.estimator.ModeKeys `, which
-            toggles dropout in the RNN cell (e.g., activates dropout in
-            TRAIN mode). If `None`, :func:`~texar.tf.global_mode` is used.
-            Ignored if :attr:`cell` is given.
-        vocab_size (int, optional): Vocabulary size. Required if
-            :attr:`output_layer` is `None`.
-        output_layer (optional): An instance of
-            :tf_main:`tf.layers.Layer `, or
-            :tf_main:`tf.identity `. Apply to the RNN cell
-            output to get logits. If `None`, a dense layer
-            is used with output dimension set to :attr:`vocab_size`.
-            Set `output_layer=tf.identity` if you do not want to have an
-            output layer after the RNN cell outputs.
-        cell_input_fn (callable, optional): A callable that produces RNN cell
-            inputs. If `None` (default), the default is used:
-            `lambda inputs, attention: tf.concat([inputs, attention], -1)`,
-            which concatenates regular RNN cell inputs with attentions.
-        hparams (dict, optional): Hyperparameters. Missing
-            hyperparameters will be set to default values. See
-            :meth:`default_hparams` for the hyperparameter structure and
-            default values.
-    """
-    def __init__(self,
-                 memory,
-                 memory_sequence_length=None,
-                 cell=None,
-                 cell_dropout_mode=None,
-                 vocab_size=None,
-                 output_layer=None,
-                 cell_input_fn=None,
-                 hparams=None):
-        AttentionRNNDecoder.__init__(
-            self, memory, memory_sequence_length, cell, cell_dropout_mode,
-            vocab_size, output_layer, cell_input_fn, hparams)
-
-    def initialize(self, name=None):
-        init = AttentionRNNDecoder.initialize(self, name)
-
-        batch_size = tf.shape(init[0])[0]
-
-        # decoded_ids can be initialized with any arbitrary value
-        # because it will be assigned later in decoding
-        initial_decoded_ids = tf.ones((batch_size, 60), dtype=tf.int32)
-
-        initial_rnn_state = init[2]
-        initial_state = [initial_decoded_ids, initial_rnn_state]
-        init[2] = initial_state
-
-        return init
-
-    def step(self, time, inputs, state, name=None):
-        # Basically the same as in AttentionRNNDecoder, except for
-        # the different form of 'state' (decoded_ids, rnn_state)
-
-        wrapper_outputs, wrapper_state = self._cell(inputs, state[1])
-        decoded_ids = state[0]
-
-        logits = self._output_layer(wrapper_outputs)
-
-        sample_ids = self._helper.sample(
-            time=time, outputs=logits, state=[decoded_ids, wrapper_state])
-
-        attention_scores = wrapper_state.alignments
-        attention_context = wrapper_state.attention
-        outputs = AttentionRNNDecoderOutput(
-            logits, sample_ids, wrapper_outputs,
-            attention_scores, attention_context)
-
-        return (outputs, wrapper_state)
-
-    def next_inputs(self, time, outputs, state):
-        (finished, next_inputs, next_state) = self._helper.next_inputs(
-            time=time,
-            outputs=outputs.logits,
-            state=[state[0], state],
-            sample_ids=outputs.sample_id)
-        return (finished, next_inputs, next_state)
diff --git a/examples/seq2seq_exposure_bias/interpolation_helper.py b/examples/seq2seq_exposure_bias/interpolation_helper.py
deleted file mode 100644
index 340f555e..00000000
--- a/examples/seq2seq_exposure_bias/interpolation_helper.py
+++ /dev/null
@@ -1,216 +0,0 @@
-# Copyright 2018 The Texar Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Helper for the interpolation algorithm.
-Each new token is sampled from the model, the ground truth, or by reward,
-according to the lambdas.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-import numpy as np
-
-from tensorflow_probability import distributions as tfpd
-from tensorflow.contrib.seq2seq import SampleEmbeddingHelper
-from texar.tf.evals.bleu import sentence_bleu
-from rouge import Rouge
-
-rouge = Rouge()
-
-
-def calc_reward(refs, hypo, unk_id, metric):
-    """
-    Calculate the reward given hypo and refs. Returns the BLEU score if
-    metric is 'bleu', or the sum of (Rouge-1, Rouge-2, Rouge-L) if metric
-    is 'rouge'.
-    """
-    if len(hypo) == 0 or len(refs[0]) == 0:
-        return 0.
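# The loop below remaps UNK ids in the hypothesis to -1, an id that never
# occurs in a reference, so unknown tokens cannot be counted as matches
# when the BLEU/ROUGE reward is computed.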
-
-    for i in range(len(hypo)):
-        assert isinstance(hypo[i], int)
-        if hypo[i] == unk_id:
-            hypo[i] = -1
-
-    if metric == 'bleu':
-        return 0.01 * sentence_bleu(
-            references=refs, hypothesis=hypo, smooth=True)
-    else:
-        ref_str = ' '.join([str(word) for word in refs[0]])
-        hypo_str = ' '.join([str(word) for word in hypo])
-        rouge_scores = \
-            rouge.get_scores(hyps=[hypo_str], refs=[ref_str], avg=True)
-        return sum([value['f'] for key, value in rouge_scores.items()])
-
-
-class InterpolationHelper(SampleEmbeddingHelper):
-    """
-    Helper for the interpolation algorithm.
-    Each new token is sampled from the model, the ground truth, or by
-    reward, according to the lambdas.
-
-    Args:
-        embedding: A callable that takes a vector tensor of `ids` (argmax ids),
-            or the `params` argument for `embedding_lookup`. The returned tensor
-            will be passed to the decoder input.
-        start_tokens: `int32` vector shaped `[batch_size]`, the start tokens.
-        end_token: `int32` scalar, the token that marks end of decoding.
-        vocab: texar.Vocab, the vocabulary of the training set
-        reward_metric: 'bleu' or 'rouge', the reward metric
-        ground_truth: the ground truth in the training set
-        ground_truth_length: the length of the ground truth sentences
-        lambdas: 'float32' vector of shape [3]; determines how the next
-            token is generated during training
-    """
-    def __init__(self,
-                 embedding,
-                 start_tokens,
-                 end_token,
-                 vocab,
-                 reward_metric,
-                 ground_truth,
-                 ground_truth_length,
-                 lambdas):
-        SampleEmbeddingHelper.__init__(self, embedding, start_tokens, end_token)
-
-        self._vocab = vocab
-        self._ground_truth = ground_truth
-        self._lambdas = lambdas
-        self._ground_truth_length = ground_truth_length
-        self._metric = reward_metric
-
-    def sample(self, time, outputs, state, name=None):
-        """
-        Sample tokens for the next step; note the special form
-        of 'state' ([decoded_ids, rnn_state]).
-        """
-        sample_method_sampler = \
-            tfpd.Categorical(probs=self._lambdas)
-        sample_method_id = sample_method_sampler.sample()
-
-        truth_feeding = lambda: tf.cond(
-            tf.less(time, tf.shape(self._ground_truth)[1]),
-            lambda: tf.cast(self._ground_truth[:, time], tf.int32),
-            lambda: tf.ones_like(self._ground_truth[:, 0],
-                                 dtype=tf.int32) * self._vocab.eos_token_id)
-
-        self_feeding = lambda: SampleEmbeddingHelper.sample(
-            self, time, outputs, state, name)
-
-        reward_feeding = lambda: self._sample_by_reward(time, state)
-
-        sample_ids = tf.cond(
-            tf.logical_or(tf.equal(time, 0), tf.equal(sample_method_id, 1)),
-            truth_feeding,
-            lambda: tf.cond(
-                tf.equal(sample_method_id, 2),
-                reward_feeding,
-                self_feeding))
-        return sample_ids
-
-    def next_inputs(self, time, outputs, state, sample_ids, name=None):
-        """
-        Note the special form of 'state' ([decoded_ids, rnn_state]).
-        """
-        finished, next_inputs, next_state = SampleEmbeddingHelper.next_inputs(
-            self, time, outputs, state[1], sample_ids, name)
-
-        next_state = [tf.concat(
-            [state[0][:, :time], tf.expand_dims(sample_ids, 1),
-             state[0][:, time + 1:]], axis=1), next_state]
-        next_state[0] = tf.reshape(next_state[0], (tf.shape(sample_ids)[0], 60))
-
-        return finished, next_inputs, next_state
-
-    def _sample_by_reward(self, time, state):
-        def _get_rewards(time, prefix_ids, target_ids, ground_truth_length):
-            batch_size = np.shape(target_ids)[0]
-            words_in_target = \
-                [np.unique(target_ids[i]) for i in range(batch_size)]
-            unk_id = self._vocab.unk_token_id
-            eos_id = self._vocab.eos_token_id
-
-            # baseline scores, before appending any new token
-            baseline_scores = []
-            baseline_ids = prefix_ids[:, :time]
-            for i in range(batch_size):
-                ref = 
target_ids[i].tolist() - if self._vocab.eos_token_id in ref: - ref = ref[:ref.index(self._vocab.eos_token_id)] - - hypo = baseline_ids[i].tolist() - if self._vocab.eos_token_id in hypo: - hypo = hypo[:hypo.index(self._vocab.eos_token_id)] - - baseline_scores.append(calc_reward( - refs=[ref], hypo=hypo, unk_id=unk_id, - metric=self._metric)) - - # append UNK - syn_ids = np.concatenate([ - prefix_ids[:, :time], - np.ones((batch_size, 1), dtype=np.int32) * unk_id], axis=1) - - reward_unk = [] - for i in range(batch_size): - ref = target_ids[i].tolist() - if self._vocab.eos_token_id in ref: - ref = ref[:ref.index(self._vocab.eos_token_id)] - - hypo = syn_ids[i].tolist() - if self._vocab.eos_token_id in hypo: - hypo = hypo[:hypo.index(self._vocab.eos_token_id)] - - reward = calc_reward(refs=[ref], hypo=hypo, unk_id=unk_id, - metric=self._metric) - reward_unk.append( - np.ones((1, self._vocab.size), dtype=np.float32) * - reward - baseline_scores[i]) - result = np.concatenate(reward_unk, axis=0) - - # append tokens - for i in range(batch_size): - for id in words_in_target[i]: - if id == unk_id: - continue - - syn_id = np.concatenate( - [prefix_ids[i:i + 1, :time], np.array([[id, ]])], - axis=1) - hypo = syn_id[0].tolist() - if self._vocab.eos_token_id in hypo: - hypo = hypo[:hypo.index(self._vocab.eos_token_id)] - - ref = target_ids[i].tolist() - if self._vocab.eos_token_id in ref: - ref = ref[:ref.index(self._vocab.eos_token_id)] - - dup = 1. if prefix_ids[i][time] == id and \ - id != unk_id else 0. - eos = 1. if time < ground_truth_length[i] - 1 and \ - id == eos_id else 0. - - reward = calc_reward( - refs=[ref], hypo=hypo, unk_id=unk_id, - metric=self._metric) - result[i][id] = reward - baseline_scores[i] - dup - eos - - return result - - sampler = tfpd.Categorical( - logits=tf.py_func(_get_rewards, [ - time, state[0], self._ground_truth, - self._ground_truth_length], tf.float32)) - return tf.reshape( - sampler.sample(), (tf.shape(self._ground_truth)[0],)) diff --git a/examples/seq2seq_exposure_bias/interpolation_main.py b/examples/seq2seq_exposure_bias/interpolation_main.py deleted file mode 100644 index 788d815a..00000000 --- a/examples/seq2seq_exposure_bias/interpolation_main.py +++ /dev/null @@ -1,321 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Interpolation Algorithm. 
-"""
-from __future__ import absolute_import
-from __future__ import print_function
-from __future__ import division
-from __future__ import unicode_literals
-
-import importlib
-from io import open
-
-import tensorflow as tf
-import texar.tf as tx
-import numpy as np
-
-from interpolation_decoder import InterpolationDecoder
-from interpolation_helper import InterpolationHelper
-from rouge import Rouge
-
-flags = tf.flags
-
-flags.DEFINE_string("config_model", "configs.config_model", "The model config.")
-flags.DEFINE_string("config_data", "configs.config_iwslt14",
-                    "The dataset config.")
-
-flags.DEFINE_string('lambdas_init', '[0.04,0.96,0.0]',
-                    'initial values of the lambdas')
-
-flags.DEFINE_float('delta_lambda_reward', 0.06,
-                   'increment of lambda_reward at every annealing step')
-flags.DEFINE_float('delta_lambda_self', 0.06,
-                   'increment of lambda_self at every annealing step')
-flags.DEFINE_integer('lambda_reward_steps', 4,
                     'times of increasing lambda_reward '
-                     'after increasing lambda_self once')
-
-flags.DEFINE_string('output_dir', '.', 'where to keep training logs')
-
-FLAGS = flags.FLAGS
-
-config_model = importlib.import_module(FLAGS.config_model)
-config_data = importlib.import_module(FLAGS.config_data)
-
-FLAGS.lambdas_init = eval(FLAGS.lambdas_init)
-
-if not FLAGS.output_dir.endswith('/'):
-    FLAGS.output_dir += '/'
-log_dir = FLAGS.output_dir + 'training_log_interpolation' +\
-    '_init' + '_' + str(FLAGS.lambdas_init[0]) +\
-    '_' + str(FLAGS.lambdas_init[1]) +\
-    '_' + str(FLAGS.lambdas_init[2]) +\
-    '_dr' + str(FLAGS.delta_lambda_reward) +\
-    '_ds' + str(FLAGS.delta_lambda_self) +\
-    '_rstep' + str(FLAGS.lambda_reward_steps) + '/'
-tx.utils.maybe_create_dir(log_dir)
-
-
-def build_model(batch, train_data, lambdas):
-    """
-    This function is basically the same as build_model() in
-    baseline_seq2seq_attn_main.py, except for the
-    InterpolationDecoder and InterpolationHelper.
- """ - batch_size = tf.shape(batch['target_length'])[0] - - source_embedder = tx.modules.WordEmbedder( - vocab_size=train_data.source_vocab.size, hparams=config_model.embedder) - - encoder = tx.modules.BidirectionalRNNEncoder( - hparams=config_model.encoder) - - enc_outputs, _ = encoder(source_embedder(batch['source_text_ids'])) - - target_embedder = tx.modules.WordEmbedder( - vocab_size=train_data.target_vocab.size, hparams=config_model.embedder) - - decoder = InterpolationDecoder( - memory=tf.concat(enc_outputs, axis=2), - memory_sequence_length=batch['source_length'], - vocab_size=train_data.target_vocab.size, - hparams=config_model.decoder) - - start_tokens = tf.ones_like( - batch['target_length']) * train_data.target_vocab.bos_token_id - helper = InterpolationHelper( - embedding=target_embedder, - start_tokens=start_tokens, - end_token=train_data.target_vocab.eos_token_id, - reward_metric=config_data.eval_metric, - vocab=train_data.target_vocab, - ground_truth=batch['target_text_ids'][:, 1:], - ground_truth_length=batch['target_length'] - 1, - lambdas=lambdas,) - - training_outputs, _, training_length = decoder( - helper=helper, - initial_state=decoder.zero_state( - batch_size=batch_size, dtype=tf.float32), - max_decoding_length=60) - - train_op = tx.core.get_train_op( - tx.losses.sequence_sparse_softmax_cross_entropy( - labels=training_outputs.sample_id, - logits=training_outputs.logits, - sequence_length=training_length), - hparams=config_model.opt) - - beam_search_outputs, _, _ = \ - tx.modules.beam_search_decode( - decoder_or_cell=decoder, - embedding=target_embedder, - start_tokens=start_tokens, - end_token=train_data.target_vocab.eos_token_id, - beam_width=config_model.beam_width, - max_decoding_length=60) - - return train_op, beam_search_outputs - - -def print_stdout_and_file(content, file): - print(content) - print(content, file=file) - - -def main(): - """Entrypoint. - """ - training_data = tx.data.PairedTextData(hparams=config_data.train) - val_data = tx.data.PairedTextData(hparams=config_data.val) - test_data = tx.data.PairedTextData(hparams=config_data.test) - data_iterator = tx.data.TrainTestDataIterator( - train=training_data, val=val_data, test=test_data) - - batch = data_iterator.get_next() - lambdas_ts = tf.placeholder(shape=[3], dtype=tf.float32) - - train_op, infer_outputs = build_model(batch, training_data, lambdas_ts) - - def _train_epoch(sess, epoch, lambdas): - data_iterator.switch_to_train_data(sess) - log_file = open(log_dir + 'training_log' + str(epoch) + '.txt', 'w', - encoding='utf-8') - - step = 0 - while True: - try: - loss = sess.run(train_op, feed_dict={ - lambdas_ts: np.array(lambdas)}) - print("step={}, loss={:.4f}, lambdas={}".format( - step, loss, lambdas), file=log_file) - if step % config_data.observe_steps == 0: - print("step={}, loss={:.4f}, lambdas={}".format( - step, loss, lambdas)) - log_file.flush() - step += 1 - - except tf.errors.OutOfRangeError: - break - - def _eval_epoch(sess, mode, epoch_no): - """ - This function is the same as _eval_epoch() in - baseline_seq2seq_attn_main.py. 
-        """
-        if mode == 'val':
-            data_iterator.switch_to_val_data(sess)
-        else:
-            data_iterator.switch_to_test_data(sess)
-
-        refs, hypos = [], []
-        while True:
-            try:
-                fetches = [
-                    batch['target_text'][:, 1:],
-                    infer_outputs.predicted_ids[:, :, 0]
-                ]
-                feed_dict = {
-                    tx.global_mode(): tf.estimator.ModeKeys.EVAL
-                }
-                target_texts_ori, output_ids = \
-                    sess.run(fetches, feed_dict=feed_dict)
-
-                target_texts = tx.utils.strip_special_tokens(
-                    target_texts_ori.tolist(), is_token_list=True)
-                target_texts = tx.utils.str_join(target_texts)
-                output_texts = tx.utils.map_ids_to_strs(
-                    ids=output_ids, vocab=val_data.target_vocab)
-
-                tx.utils.write_paired_text(
-                    target_texts, output_texts,
-                    log_dir + mode + '_results' + str(epoch_no) + '.txt',
-                    append=True, mode='h', sep=' ||| ')
-
-                for hypo, ref in zip(output_texts, target_texts):
-                    if config_data.eval_metric == 'bleu':
-                        hypos.append(hypo)
-                        refs.append([ref])
-                    elif config_data.eval_metric == 'rouge':
-                        hypos.append(tx.utils.compat_as_text(hypo))
-                        refs.append(tx.utils.compat_as_text(ref))
-            except tf.errors.OutOfRangeError:
-                break
-
-        if config_data.eval_metric == 'bleu':
-            return tx.evals.corpus_bleu_moses(
-                list_of_references=refs, hypotheses=hypos)
-        elif config_data.eval_metric == 'rouge':
-            rouge = Rouge()
-            return rouge.get_scores(hyps=hypos, refs=refs, avg=True)
-
-    def _calc_reward(score):
-        """
-        Return the bleu score or the sum of (Rouge-1, Rouge-2, Rouge-L).
-        """
-        if config_data.eval_metric == 'bleu':
-            return score
-        elif config_data.eval_metric == 'rouge':
-            return sum([value['f'] for key, value in score.items()])
-
-    def _anneal():
-        """
-        Adjust the lambdas when the reward on the validation set decreases.
-        """
-        def _update_self():
-            """
-            Decrease lambda_truth and increase lambda_self.
-            """
-            lambdas[1] -= FLAGS.delta_lambda_self
-            lambdas[0] += FLAGS.delta_lambda_self
-            updates.append('self')
-
-        def _update_rew():
-            """
-            Decrease lambda_truth and increase lambda_reward.
-            """
-            lambdas[1] -= FLAGS.delta_lambda_reward
-            lambdas[2] += FLAGS.delta_lambda_reward
-            updates.append('rew')
-
-        if updates[-FLAGS.lambda_reward_steps:] == \
-                ['rew'] * FLAGS.lambda_reward_steps:
-            _update_self()
-        else:
-            _update_rew()
-
-    saver = tf.train.Saver(max_to_keep=2)
-    with tf.Session() as sess:
-        sess.run(tf.global_variables_initializer())
-        sess.run(tf.local_variables_initializer())
-        sess.run(tf.tables_initializer())
-
-        lambdas = FLAGS.lambdas_init
-        updates = ['rew'] * FLAGS.lambda_reward_steps
-
-        best_val_score, best_val_score_current_lambdas = -1., -1.
-        scores_file = open(log_dir + 'scores.txt', 'w', encoding='utf-8')
-
-        for i in range(config_data.num_epochs):
-            print_stdout_and_file(
-                'training epoch={}, lambdas={}'.format(i, lambdas),
-                file=scores_file)
-            _train_epoch(sess, i, lambdas)
-            saver.save(sess, log_dir + 'models/model{}.ckpt'.format(i))
-
-            val_score = _eval_epoch(sess, 'val', i)
-            test_score = _eval_epoch(sess, 'test', i)
-
-            if _calc_reward(val_score) < best_val_score_current_lambdas:
-                _anneal()
-                best_val_score_current_lambdas = -1.
- saver.restore( - sess, log_dir + 'models/model{}.ckpt'.format(i - 1)) - else: - best_val_score_current_lambdas = _calc_reward(val_score) - - best_val_score = max(best_val_score, _calc_reward(val_score)) - - if config_data.eval_metric == 'bleu': - print_stdout_and_file( - 'val epoch={}, BLEU={:.4f}; best-ever={:.4f}'.format( - i, val_score, best_val_score), file=scores_file) - - print_stdout_and_file( - 'test epoch={}, BLEU={:.4f}'.format(i, test_score), - file=scores_file) - print_stdout_and_file('=' * 50, file=scores_file) - - elif config_data.eval_metric == 'rouge': - print_stdout_and_file( - 'valid epoch {}:'.format(i), file=scores_file) - for key, value in val_score.items(): - print_stdout_and_file( - '{}: {}'.format(key, value), file=scores_file) - print_stdout_and_file('fsum: {}; best_val_fsum: {}'.format( - _calc_reward(val_score), best_val_score), file=scores_file) - - print_stdout_and_file( - 'test epoch {}:'.format(i), file=scores_file) - for key, value in test_score.items(): - print_stdout_and_file( - '{}: {}'.format(key, value), file=scores_file) - print_stdout_and_file('=' * 110, file=scores_file) - - scores_file.flush() - - -if __name__ == '__main__': - main() diff --git a/examples/seq2seq_exposure_bias/raml_main.py b/examples/seq2seq_exposure_bias/raml_main.py deleted file mode 100644 index 2619b8fd..00000000 --- a/examples/seq2seq_exposure_bias/raml_main.py +++ /dev/null @@ -1,347 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Attentional Seq2seq with RAML algorithm. - -Read a pre-processed file containing the augmented samples and -corresponding rewards for every target sentence. 
-
-The RAML algorithm is described in https://arxiv.org/pdf/1609.00150.pdf; the
-sampling approach used here follows https://arxiv.org/pdf/1705.07136.pdf
-
-"""
-from __future__ import absolute_import
-from __future__ import print_function
-from __future__ import division
-from __future__ import unicode_literals
-
-from io import open
-import importlib
-import tensorflow as tf
-import texar.tf as tx
-import numpy as np
-import random
-from rouge import Rouge
-
-flags = tf.flags
-
-flags.DEFINE_string("config_model", "configs.config_model", "The model config.")
-flags.DEFINE_string("config_data", "configs.config_iwslt14",
                    "The dataset config.")
-
-flags.DEFINE_string('raml_file', 'data/iwslt14/samples_iwslt14.txt',
-                    'the samples and rewards described in RAML')
-flags.DEFINE_integer('n_samples', 10,
-                     'number of samples for every target sentence')
-flags.DEFINE_float('tau', 0.4, 'the temperature in RAML algorithm')
-
-flags.DEFINE_string('output_dir', '.', 'where to keep training logs')
-
-FLAGS = flags.FLAGS
-
-config_model = importlib.import_module(FLAGS.config_model)
-config_data = importlib.import_module(FLAGS.config_data)
-
-if not FLAGS.output_dir.endswith('/'):
-    FLAGS.output_dir += '/'
-log_dir = FLAGS.output_dir + 'training_log_raml' +\
-    '_' + str(FLAGS.n_samples) + 'samples' +\
-    '_tau' + str(FLAGS.tau) + '/'
-tx.utils.maybe_create_dir(log_dir)
-
-
-def read_raml_sample_file():
-    raml_file = open(FLAGS.raml_file, encoding='utf-8')
-
-    train_data = []
-    sample_num = -1
-    for line in raml_file.readlines():
-        line = line[:-1]
-        if line.startswith('***'):
-            continue
-        elif line.endswith('samples'):
-            sample_num = eval(line.split()[0])
-            assert sample_num == 1 or sample_num == FLAGS.n_samples
-        elif line.startswith('source:'):
-            train_data.append({'source': line[7:], 'targets': []})
-        else:
-            train_data[-1]['targets'].append(line.split('|||'))
-            if sample_num == 1:
-                for i in range(FLAGS.n_samples - 1):
-                    train_data[-1]['targets'].append(line.split('|||'))
-    return train_data
-
-
-def raml_loss(batch, output, training_rewards):
-    mle_loss = tx.losses.sequence_sparse_softmax_cross_entropy(
-        labels=batch['target_text_ids'][:, 1:],
-        logits=output.logits,
-        sequence_length=batch['target_length'] - 1,
-        average_across_batch=False)
-    return tf.reduce_sum(mle_loss * training_rewards) /\
-        tf.reduce_sum(training_rewards)
-
-
-def build_model(batch, train_data, rewards):
-    """
-    Assembles the seq2seq model.
-    Code in this function is basically the same as build_model() in
-    baseline_seq2seq_attn_main.py, except for the normalization in raml_loss().
- """ - source_embedder = tx.modules.WordEmbedder( - vocab_size=train_data.source_vocab.size, hparams=config_model.embedder) - - encoder = tx.modules.BidirectionalRNNEncoder( - hparams=config_model.encoder) - - enc_outputs, _ = encoder(source_embedder(batch['source_text_ids'])) - - target_embedder = tx.modules.WordEmbedder( - vocab_size=train_data.target_vocab.size, hparams=config_model.embedder) - - decoder = tx.modules.AttentionRNNDecoder( - memory=tf.concat(enc_outputs, axis=2), - memory_sequence_length=batch['source_length'], - vocab_size=train_data.target_vocab.size, - hparams=config_model.decoder) - - training_outputs, _, _ = decoder( - decoding_strategy='train_greedy', - inputs=target_embedder(batch['target_text_ids'][:, :-1]), - sequence_length=batch['target_length'] - 1) - - train_op = tx.core.get_train_op( - raml_loss(batch, training_outputs, rewards), - hparams=config_model.opt) - - start_tokens = tf.ones_like(batch['target_length']) *\ - train_data.target_vocab.bos_token_id - beam_search_outputs, _, _ = \ - tx.modules.beam_search_decode( - decoder_or_cell=decoder, - embedding=target_embedder, - start_tokens=start_tokens, - end_token=train_data.target_vocab.eos_token_id, - beam_width=config_model.beam_width, - max_decoding_length=60) - - return train_op, beam_search_outputs - - -def print_stdout_and_file(content, file): - print(content) - print(content, file=file) - - -def main(): - """Entrypoint. - """ - config_data.train['batch_size'] *= FLAGS.n_samples - config_data.val['batch_size'] *= FLAGS.n_samples - config_data.test['batch_size'] *= FLAGS.n_samples - - train_data = tx.data.PairedTextData(hparams=config_data.train) - val_data = tx.data.PairedTextData(hparams=config_data.val) - test_data = tx.data.PairedTextData(hparams=config_data.test) - data_iterator = tx.data.TrainTestDataIterator( - train=train_data, val=val_data, test=test_data) - - batch = data_iterator.get_next() - rewards_ts = tf.placeholder( - dtype=tf.float32, shape=[None, ], name='training_rewards') - - train_op, infer_outputs = build_model(batch, train_data, rewards_ts) - - raml_train_data = read_raml_sample_file() - - def _train_epoch(sess, epoch_no): - data_iterator.switch_to_train_data(sess) - training_log_file = \ - open(log_dir + 'training_log' + str(epoch_no) + '.txt', 'w', - encoding='utf-8') - - step = 0 - source_buffer, target_buffer = [], [] - random.shuffle(raml_train_data) - for training_pair in raml_train_data: - for target in training_pair['targets']: - source_buffer.append(training_pair['source']) - target_buffer.append(target) - - if len(target_buffer) != train_data.batch_size: - continue - - source_ids = [] - source_length = [] - target_ids = [] - target_length = [] - scores = [] - - trunc_len_src = train_data.hparams.source_dataset.max_seq_length - trunc_len_tgt = train_data.hparams.target_dataset.max_seq_length - - for sentence in source_buffer: - ids = [train_data.source_vocab.token_to_id_map_py[token] - for token in sentence.split()][:trunc_len_src] - ids = ids + [train_data.source_vocab.eos_token_id] - - source_ids.append(ids) - source_length.append(len(ids)) - - for sentence, score_str in target_buffer: - ids = [train_data.target_vocab.bos_token_id] - ids = ids + [train_data.target_vocab.token_to_id_map_py[token] - for token in sentence.split()][:trunc_len_tgt] - ids = ids + [train_data.target_vocab.eos_token_id] - - target_ids.append(ids) - scores.append(eval(score_str)) - target_length.append(len(ids)) - - rewards = [] - for i in range(0, train_data.batch_size, FLAGS.n_samples): - tmp 
= np.array(scores[i:i + FLAGS.n_samples]) - tmp = np.exp(tmp / FLAGS.tau) / np.sum(np.exp(tmp / FLAGS.tau)) - for j in range(0, FLAGS.n_samples): - rewards.append(tmp[j]) - - for value in source_ids: - while len(value) < max(source_length): - value.append(0) - for value in target_ids: - while len(value) < max(target_length): - value.append(0) - - feed_dict = { - batch['source_text_ids']: np.array(source_ids), - batch['target_text_ids']: np.array(target_ids), - batch['source_length']: np.array(source_length), - batch['target_length']: np.array(target_length), - rewards_ts: np.array(rewards) - } - source_buffer = [] - target_buffer = [] - - loss = sess.run(train_op, feed_dict=feed_dict) - print("step={}, loss={:.4f}".format(step, loss), - file=training_log_file) - if step % config_data.observe_steps == 0: - print("step={}, loss={:.4f}".format(step, loss)) - training_log_file.flush() - step += 1 - - # code below this line is exactly the same as baseline_seq2seq_attn_main.py - - def _eval_epoch(sess, mode, epoch_no): - if mode == 'val': - data_iterator.switch_to_val_data(sess) - else: - data_iterator.switch_to_test_data(sess) - - refs, hypos = [], [] - while True: - try: - fetches = [ - batch['target_text'][:, 1:], - infer_outputs.predicted_ids[:, :, 0] - ] - feed_dict = { - tx.global_mode(): tf.estimator.ModeKeys.EVAL - } - target_texts_ori, output_ids = \ - sess.run(fetches, feed_dict=feed_dict) - - target_texts = tx.utils.strip_special_tokens( - target_texts_ori.tolist(), is_token_list=True) - target_texts = tx.utils.str_join(target_texts) - output_texts = tx.utils.map_ids_to_strs( - ids=output_ids, vocab=val_data.target_vocab) - - tx.utils.write_paired_text( - target_texts, output_texts, - log_dir + mode + '_results' + str(epoch_no) + '.txt', - append=True, mode='h', sep=' ||| ') - - for hypo, ref in zip(output_texts, target_texts): - if config_data.eval_metric == 'bleu': - hypos.append(hypo) - refs.append([ref]) - elif config_data.eval_metric == 'rouge': - hypos.append(tx.utils.compat_as_text(hypo)) - refs.append(tx.utils.compat_as_text(ref)) - except tf.errors.OutOfRangeError: - break - - if config_data.eval_metric == 'bleu': - return tx.evals.corpus_bleu_moses( - list_of_references=refs, hypotheses=hypos) - elif config_data.eval_metric == 'rouge': - rouge = Rouge() - return rouge.get_scores(hyps=hypos, refs=refs, avg=True) - - def _calc_reward(score): - """ - Return the bleu score or the sum of (Rouge-1, Rouge-2, Rouge-L). - """ - if config_data.eval_metric == 'bleu': - return score - elif config_data.eval_metric == 'rouge': - return sum([value['f'] for key, value in score.items()]) - - with tf.Session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - - best_val_score = -1. 
- scores_file = open(log_dir + 'scores.txt', 'w', encoding='utf-8') - for i in range(config_data.num_epochs): - _train_epoch(sess, i) - - val_score = _eval_epoch(sess, 'val', i) - test_score = _eval_epoch(sess, 'test', i) - - best_val_score = max(best_val_score, _calc_reward(val_score)) - - if config_data.eval_metric == 'bleu': - print_stdout_and_file( - 'val epoch={}, BLEU={:.4f}; best-ever={:.4f}'.format( - i, val_score, best_val_score), file=scores_file) - - print_stdout_and_file( - 'test epoch={}, BLEU={:.4f}'.format(i, test_score), - file=scores_file) - print_stdout_and_file('=' * 50, file=scores_file) - - elif config_data.eval_metric == 'rouge': - print_stdout_and_file( - 'valid epoch {}:'.format(i), file=scores_file) - for key, value in val_score.items(): - print_stdout_and_file( - '{}: {}'.format(key, value), file=scores_file) - print_stdout_and_file('fsum: {}; best_val_fsum: {}'.format( - _calc_reward(val_score), best_val_score), file=scores_file) - - print_stdout_and_file( - 'test epoch {}:'.format(i), file=scores_file) - for key, value in test_score.items(): - print_stdout_and_file( - '{}: {}'.format(key, value), file=scores_file) - print_stdout_and_file('=' * 110, file=scores_file) - - scores_file.flush() - - -if __name__ == '__main__': - main() diff --git a/examples/seq2seq_exposure_bias/requirements.txt b/examples/seq2seq_exposure_bias/requirements.txt deleted file mode 100644 index 6255bcab..00000000 --- a/examples/seq2seq_exposure_bias/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -rouge==0.2.1 diff --git a/examples/seq2seq_exposure_bias/scheduled_sampling_main.py b/examples/seq2seq_exposure_bias/scheduled_sampling_main.py deleted file mode 100644 index 929f26a1..00000000 --- a/examples/seq2seq_exposure_bias/scheduled_sampling_main.py +++ /dev/null @@ -1,267 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Attentional Seq2seq using Scheduled sampling algorithm. - -This code is basically the same as baseline_seq2seq_attn_main.py, -except using ScheduledEmbeddingTrainingHelper. 
-
-The scheduled sampling algorithm is described in https://arxiv.org/abs/1506.03099
-"""
-from __future__ import absolute_import
-from __future__ import print_function
-from __future__ import division
-from __future__ import unicode_literals
-
-# pylint: disable=invalid-name, too-many-arguments, too-many-locals
-
-from io import open
-import math
-import importlib
-import tensorflow as tf
-import texar.tf as tx
-from rouge import Rouge
-
-flags = tf.flags
-
-flags.DEFINE_string("config_model", "configs.config_model", "The model config.")
-flags.DEFINE_string("config_data", "configs.config_iwslt14",
                    "The dataset config.")
-
-flags.DEFINE_float('decay_factor', 500.,
-                   'The hyperparameter controlling the speed of increasing '
-                   'the probability of sampling from the model')
-
-flags.DEFINE_string('output_dir', '.', 'where to keep training logs')
-
-FLAGS = flags.FLAGS
-
-config_model = importlib.import_module(FLAGS.config_model)
-config_data = importlib.import_module(FLAGS.config_data)
-
-if not FLAGS.output_dir.endswith('/'):
-    FLAGS.output_dir += '/'
-log_dir = FLAGS.output_dir + 'training_log_scheduled_sampling' +\
-    '_decayf' + str(FLAGS.decay_factor) + '/'
-tx.utils.maybe_create_dir(log_dir)
-
-
-def inverse_sigmoid(i):
-    return FLAGS.decay_factor / (
-        FLAGS.decay_factor + math.exp(i / FLAGS.decay_factor))
-
-
-def build_model(batch, train_data, self_sampling_proba):
-    """
-    Assembles the seq2seq model.
-    It is the same as build_model() in baseline_seq2seq_attn_main.py except
-    for using ScheduledEmbeddingTrainingHelper.
-    """
-    source_embedder = tx.modules.WordEmbedder(
-        vocab_size=train_data.source_vocab.size, hparams=config_model.embedder)
-
-    encoder = tx.modules.BidirectionalRNNEncoder(
-        hparams=config_model.encoder)
-
-    enc_outputs, _ = encoder(source_embedder(batch['source_text_ids']))
-
-    target_embedder = tx.modules.WordEmbedder(
-        vocab_size=train_data.target_vocab.size, hparams=config_model.embedder)
-
-    decoder = tx.modules.AttentionRNNDecoder(
-        memory=tf.concat(enc_outputs, axis=2),
-        memory_sequence_length=batch['source_length'],
-        vocab_size=train_data.target_vocab.size,
-        hparams=config_model.decoder)
-
-    helper = tx.modules.get_helper(
-        helper_type='ScheduledEmbeddingTrainingHelper',
-        inputs=target_embedder(batch['target_text_ids'][:, :-1]),
-        sequence_length=batch['target_length'] - 1,
-        embedding=target_embedder,
-        sampling_probability=self_sampling_proba)
-
-    training_outputs, _, _ = decoder(
-        helper=helper, initial_state=decoder.zero_state(
-            batch_size=tf.shape(batch['target_length'])[0], dtype=tf.float32))
-
-    train_op = tx.core.get_train_op(
-        tx.losses.sequence_sparse_softmax_cross_entropy(
-            labels=batch['target_text_ids'][:, 1:],
-            logits=training_outputs.logits,
-            sequence_length=batch['target_length'] - 1),
-        hparams=config_model.opt)
-
-    start_tokens = tf.ones_like(batch['target_length']) *\
-        train_data.target_vocab.bos_token_id
-    beam_search_outputs, _, _ = \
-        tx.modules.beam_search_decode(
-            decoder_or_cell=decoder,
-            embedding=target_embedder,
-            start_tokens=start_tokens,
-            end_token=train_data.target_vocab.eos_token_id,
-            beam_width=config_model.beam_width,
-            max_decoding_length=60)
-
-    return train_op, beam_search_outputs
-
-
-def print_stdout_and_file(content, file):
-    print(content)
-    print(content, file=file)
-
-
-def main():
-    """Entrypoint.
- """ - train_data = tx.data.PairedTextData(hparams=config_data.train) - val_data = tx.data.PairedTextData(hparams=config_data.val) - test_data = tx.data.PairedTextData(hparams=config_data.test) - data_iterator = tx.data.TrainTestDataIterator( - train=train_data, val=val_data, test=test_data) - - batch = data_iterator.get_next() - - self_sampling_proba = tf.placeholder(shape=[], dtype=tf.float32) - train_op, infer_outputs = \ - build_model(batch, train_data, self_sampling_proba) - - def _train_epoch(sess, epoch_no, total_step_counter): - data_iterator.switch_to_train_data(sess) - training_log_file = \ - open(log_dir + 'training_log' + str(epoch_no) + '.txt', 'w', - encoding='utf-8') - - step = 0 - while True: - try: - sampling_proba_ = 1. - inverse_sigmoid(total_step_counter) - loss = sess.run(train_op, feed_dict={ - self_sampling_proba: sampling_proba_}) - print("step={}, loss={:.4f}, self_proba={}".format( - step, loss, sampling_proba_), file=training_log_file) - if step % config_data.observe_steps == 0: - print("step={}, loss={:.4f}, self_proba={}".format( - step, loss, sampling_proba_)) - training_log_file.flush() - step += 1 - total_step_counter += 1 - except tf.errors.OutOfRangeError: - break - - # code below this line is exactly the same as baseline_seq2seq_attn_main.py - - def _eval_epoch(sess, mode, epoch_no): - if mode == 'val': - data_iterator.switch_to_val_data(sess) - else: - data_iterator.switch_to_test_data(sess) - - refs, hypos = [], [] - while True: - try: - fetches = [ - batch['target_text'][:, 1:], - infer_outputs.predicted_ids[:, :, 0] - ] - feed_dict = { - tx.global_mode(): tf.estimator.ModeKeys.EVAL - } - target_texts_ori, output_ids = \ - sess.run(fetches, feed_dict=feed_dict) - - target_texts = tx.utils.strip_special_tokens( - target_texts_ori.tolist(), is_token_list=True) - target_texts = tx.utils.str_join(target_texts) - output_texts = tx.utils.map_ids_to_strs( - ids=output_ids, vocab=val_data.target_vocab) - - tx.utils.write_paired_text( - target_texts, output_texts, - log_dir + mode + '_results' + str(epoch_no) + '.txt', - append=True, mode='h', sep=' ||| ') - - for hypo, ref in zip(output_texts, target_texts): - if config_data.eval_metric == 'bleu': - hypos.append(hypo) - refs.append([ref]) - elif config_data.eval_metric == 'rouge': - hypos.append(tx.utils.compat_as_text(hypo)) - refs.append(tx.utils.compat_as_text(ref)) - except tf.errors.OutOfRangeError: - break - - if config_data.eval_metric == 'bleu': - return tx.evals.corpus_bleu_moses( - list_of_references=refs, hypotheses=hypos) - elif config_data.eval_metric == 'rouge': - rouge = Rouge() - return rouge.get_scores(hyps=hypos, refs=refs, avg=True) - - def _calc_reward(score): - """ - Return the bleu score or the sum of (Rouge-1, Rouge-2, Rouge-L). - """ - if config_data.eval_metric == 'bleu': - return score - elif config_data.eval_metric == 'rouge': - return sum([value['f'] for key, value in score.items()]) - - with tf.Session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - - best_val_score = -1. 
- total_step_counter = 1 - scores_file = open(log_dir + 'scores.txt', 'w', encoding='utf-8') - for i in range(config_data.num_epochs): - _train_epoch(sess, i, total_step_counter) - - val_score = _eval_epoch(sess, 'val', i) - test_score = _eval_epoch(sess, 'test', i) - - best_val_score = max(best_val_score, _calc_reward(val_score)) - - if config_data.eval_metric == 'bleu': - print_stdout_and_file( - 'val epoch={}, BLEU={:.4f}; best-ever={:.4f}'.format( - i, val_score, best_val_score), file=scores_file) - - print_stdout_and_file( - 'test epoch={}, BLEU={:.4f}'.format(i, test_score), - file=scores_file) - print_stdout_and_file('=' * 50, file=scores_file) - - elif config_data.eval_metric == 'rouge': - print_stdout_and_file( - 'valid epoch {}:'.format(i), file=scores_file) - for key, value in val_score.items(): - print_stdout_and_file( - '{}: {}'.format(key, value), file=scores_file) - print_stdout_and_file('fsum: {}; best_val_fsum: {}'.format( - _calc_reward(val_score), best_val_score), file=scores_file) - - print_stdout_and_file( - 'test epoch {}:'.format(i), file=scores_file) - for key, value in test_score.items(): - print_stdout_and_file( - '{}: {}'.format(key, value), file=scores_file) - print_stdout_and_file('=' * 110, file=scores_file) - - scores_file.flush() - - -if __name__ == '__main__': - main() diff --git a/examples/seq2seq_exposure_bias/utils/prepare_data.py b/examples/seq2seq_exposure_bias/utils/prepare_data.py deleted file mode 100644 index b2eb6e74..00000000 --- a/examples/seq2seq_exposure_bias/utils/prepare_data.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Downloads data. -""" -import tensorflow as tf -import texar.tf as tx - -# pylint: disable=invalid-name - -flags = tf.flags - -flags.DEFINE_string("data", "iwslt14", "Data to download [iwslt14|toy_copy]") - -FLAGS = flags.FLAGS - - -def prepare_data(): - """Downloads data. - """ - if FLAGS.data == 'giga': - tx.data.maybe_download( - urls='https://drive.google.com/file/d/' - '12RZs7QFwjj6dfuYNQ_0Ah-ccH1xFDMD5/view?usp=sharing', - path='./', - filenames='giga.zip', - extract=True) - elif FLAGS.data == 'iwslt14': - tx.data.maybe_download( - urls='https://drive.google.com/file/d/' - '1y4mUWXRS2KstgHopCS9koZ42ENOh6Yb9/view?usp=sharing', - path='./', - filenames='iwslt14.zip', - extract=True) - else: - raise ValueError('Unknown data: {}'.format(FLAGS.data)) - - -def main(): - """Entrypoint. 
- """ - prepare_data() - - -if __name__ == '__main__': - main() diff --git a/examples/seq2seq_exposure_bias/utils/raml_samples_generation/README.md b/examples/seq2seq_exposure_bias/utils/raml_samples_generation/README.md deleted file mode 100644 index 12c7ccf4..00000000 --- a/examples/seq2seq_exposure_bias/utils/raml_samples_generation/README.md +++ /dev/null @@ -1,5 +0,0 @@ -## Augmented Data Generation for RAML Algorithm - -Codes here are mainly copied from [pcyin's github](https://github.com/pcyin/pytorch_nmt), with slightly change for supporting ```rouge``` as reward. Note that we have also provided generated samples in the datasets that you can download. - -You may tune hyperparameters in ```gen_samples_giga.sh``` or ```gen_samples_iwslt14.sh``` and use commands like ```bash gen_samples_giga.sh``` to begin your generation. diff --git a/examples/seq2seq_exposure_bias/utils/raml_samples_generation/gen_samples_giga.sh b/examples/seq2seq_exposure_bias/utils/raml_samples_generation/gen_samples_giga.sh deleted file mode 100644 index fa47f351..00000000 --- a/examples/seq2seq_exposure_bias/utils/raml_samples_generation/gen_samples_giga.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/sh - -train_src="../../data/giga/train.article" -train_tgt="../../data/giga/train.title" - -python vocab.py \ - --src_vocab_size 30424 \ - --tgt_vocab_size 23738 \ - --train_src ${train_src} \ - --train_tgt ${train_tgt} \ - --include_singleton \ - --output giga_vocab.bin - -python process_samples.py \ - --mode sample_ngram \ - --vocab giga_vocab.bin \ - --src ${train_src} \ - --tgt ${train_tgt} \ - --sample_size 10 \ - --reward rouge \ - --output samples_giga.txt diff --git a/examples/seq2seq_exposure_bias/utils/raml_samples_generation/gen_samples_iwslt14.sh b/examples/seq2seq_exposure_bias/utils/raml_samples_generation/gen_samples_iwslt14.sh deleted file mode 100644 index f44be993..00000000 --- a/examples/seq2seq_exposure_bias/utils/raml_samples_generation/gen_samples_iwslt14.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/sh - -train_src="../../data/iwslt14/train.de" -train_tgt="../../data/iwslt14/train.en" - -python vocab.py \ - --src_vocab_size 32007 \ - --tgt_vocab_size 22820 \ - --train_src ${train_src} \ - --train_tgt ${train_tgt} \ - --include_singleton \ - --output iwslt14_vocab.bin - -python process_samples.py \ - --mode sample_ngram \ - --vocab iwslt14_vocab.bin \ - --src ${train_src} \ - --tgt ${train_tgt} \ - --sample_size 10 \ - --reward bleu \ - --output samples_iwslt14.txt diff --git a/examples/seq2seq_exposure_bias/utils/raml_samples_generation/process_samples.py b/examples/seq2seq_exposure_bias/utils/raml_samples_generation/process_samples.py deleted file mode 100644 index 685df684..00000000 --- a/examples/seq2seq_exposure_bias/utils/raml_samples_generation/process_samples.py +++ /dev/null @@ -1,312 +0,0 @@ -from __future__ import print_function -from nltk.translate.bleu_score import sentence_bleu -from nltk.translate.bleu_score import SmoothingFunction -import sys -import re -import argparse -import torch -from util import read_corpus -import numpy as np -from scipy.misc import comb -from vocab import Vocab, VocabEntry -import math -from rouge import Rouge - - -def is_valid_sample(sent): - tokens = sent.split(' ') - return len(tokens) >= 1 and len(tokens) < 50 - - -def sample_from_model(args): - para_data = args.parallel_data - sample_file = args.sample_file - output = args.output - - tgt_sent_pattern = re.compile(r"^\[(\d+)\] (.*?)$") - para_data = [l.strip().split(' ||| ') for l in open(para_data)] - - 
f_out = open(output, 'w') - f = open(sample_file) - f.readline() - for src_sent, tgt_sent in para_data: - line = f.readline().strip() - assert line.startswith('****') - line = f.readline().strip() - print(line) - assert line.startswith('target:') - - tgt_sent2 = line[len('target:'):] - assert tgt_sent == tgt_sent2 - - line = f.readline().strip() # samples - - tgt_sent = ' '.join(tgt_sent.split(' ')[1:-1]) - tgt_samples = set() - for i in range(1, 101): - line = f.readline().rstrip('\n') - m = tgt_sent_pattern.match(line) - - assert m, line - assert int(m.group(1)) == i - - sampled_tgt_sent = m.group(2).strip() - - if is_valid_sample(sampled_tgt_sent): - tgt_samples.add(sampled_tgt_sent) - - line = f.readline().strip() - assert line.startswith('****') - - tgt_samples.add(tgt_sent) - tgt_samples = list(tgt_samples) - - assert len(tgt_samples) > 0 - - tgt_ref_tokens = tgt_sent.split(' ') - bleu_scores = [] - for tgt_sample in tgt_samples: - bleu_score = sentence_bleu([tgt_ref_tokens], tgt_sample.split(' ')) - bleu_scores.append(bleu_score) - - tgt_ranks = sorted(range(len(tgt_samples)), key=lambda i: bleu_scores[i], reverse=True) - - print('%d samples' % len(tgt_samples)) - - print('*' * 50, file=f_out) - print('source: ' + src_sent, file=f_out) - print('%d samples' % len(tgt_samples), file=f_out) - for i in tgt_ranks: - print('%s ||| %f' % (tgt_samples[i], bleu_scores[i]), file=f_out) - print('*' * 50, file=f_out) - - f_out.close() - - -def get_new_ngram(ngram, n, vocab): - """ - replace ngram `ngram` with a newly sampled ngram of the same length - """ - - new_ngram_wids = [np.random.randint(3, len(vocab)) for i in range(n)] - new_ngram = [vocab.id2word[wid] for wid in new_ngram_wids] - - return new_ngram - - -def sample_ngram(args): - src_sents = read_corpus(args.src, 'src') - tgt_sents = read_corpus(args.tgt, 'src') # do not read in <s> and </s> - f_out = open(args.output, 'w') - - vocab = torch.load(args.vocab) - tgt_vocab = vocab.tgt - - smooth_bleu = args.smooth_bleu - sm_func = None - if smooth_bleu: - sm_func = SmoothingFunction().method3 - - for src_sent, tgt_sent in zip(src_sents, tgt_sents): - src_sent = ' '.join(src_sent) - - tgt_len = len(tgt_sent) - tgt_samples = [] - tgt_samples_distort_rates = [] # how many unigrams are replaced - - # generate 100 samples - - # append itself - tgt_samples.append(tgt_sent) - tgt_samples_distort_rates.append(0) - - for sid in range(args.sample_size - 1): - n = np.random.randint(1, min(tgt_len, args.max_ngram_size + 1)) # we do not replace the last token: it must be a period! - - idx = np.random.randint(tgt_len - n) - ngram = tgt_sent[idx: idx + n] - new_ngram = get_new_ngram(ngram, n, tgt_vocab) - - sampled_tgt_sent = list(tgt_sent) - sampled_tgt_sent[idx: idx + n] = new_ngram - - # compute the probability of this sample - # prob = 1. / args.max_ngram_size * 1.
/ (tgt_len - 1 + n) * 1 / (len(tgt_vocab) ** n) - - tgt_samples.append(sampled_tgt_sent) - tgt_samples_distort_rates.append(n) - - # compute bleu scores or edit distances and rank the samples by bleu scores - rewards = [] - for tgt_sample, tgt_sample_distort_rate in zip(tgt_samples, tgt_samples_distort_rates): - if args.reward == 'bleu': - reward = sentence_bleu([tgt_sent], tgt_sample, smoothing_function=sm_func) - elif args.reward == 'rouge': - rouge = Rouge() - scores = rouge.get_scores(hyps=[' '.join(tgt_sample)], refs=[' '.join(tgt_sent)], avg=True) - reward = sum([value['f'] for key, value in scores.items()]) - else: - reward = -tgt_sample_distort_rate - - rewards.append(reward) - - tgt_ranks = sorted(range(len(tgt_samples)), key=lambda i: rewards[i], reverse=True) - # convert list of tokens into a string - tgt_samples = [' '.join(tgt_sample) for tgt_sample in tgt_samples] - - print('*' * 50, file=f_out) - print('source: ' + src_sent, file=f_out) - print('%d samples' % len(tgt_samples), file=f_out) - for i in tgt_ranks: - print('%s ||| %f' % (tgt_samples[i], rewards[i]), file=f_out) - print('*' * 50, file=f_out) - - f_out.close() - - -def sample_ngram_adapt(args): - src_sents = read_corpus(args.src, 'src') - tgt_sents = read_corpus(args.tgt, 'src') # do not read in <s> and </s> - f_out = open(args.output, 'w') - - vocab = torch.load(args.vocab) - tgt_vocab = vocab.tgt - - max_len = max([len(tgt_sent) for tgt_sent in tgt_sents]) + 1 - - for src_sent, tgt_sent in zip(src_sents, tgt_sents): - src_sent = ' '.join(src_sent) - - tgt_len = len(tgt_sent) - tgt_samples = [] - - # generate 100 samples - - # append itself - tgt_samples.append(tgt_sent) - - for sid in range(args.sample_size - 1): - max_n = min(tgt_len - 1, 4) - bias_n = int(max_n * tgt_len / max_len) + 1 - assert 1 <= bias_n <= 4, 'bias_n={}, not in [1,4], max_n={}, tgt_len={}, max_len={}'.format(bias_n, max_n, tgt_len, max_len) - - p = [1.0 / (max_n + 5)] * max_n - p[bias_n - 1] = 1 - p[0] * (max_n - 1) - assert abs(sum(p) - 1) < 1e-10, 'sum(p) != 1' - - n = np.random.choice(np.arange(1, int(max_n + 1)), p=p) # we do not replace the last token: it must be a period!
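# (Editor's note) A standalone restatement of the length-adaptive n-gram size
# distribution constructed just above, with purely illustrative lengths (the
# helper name is hypothetical):

def _ngram_size_probs(tgt_len, max_len):
    # Duplicates the arithmetic of `sample_ngram_adapt`: uniform mass
    # 1/(max_n + 5) on every size, with the remainder placed on the
    # length-biased size `bias_n`.
    max_n = min(tgt_len - 1, 4)
    bias_n = int(max_n * tgt_len / max_len) + 1
    p = [1.0 / (max_n + 5)] * max_n
    p[bias_n - 1] = 1 - p[0] * (max_n - 1)
    return p

# e.g. _ngram_size_probs(10, 50) -> [0.666..., 0.111..., 0.111..., 0.111...]:
# short sentences favor replacing short n-grams, longer ones larger n-grams.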
- assert n < tgt_len, 'n={}, tgt_len={}'.format(n, tgt_len) - - idx = np.random.randint(tgt_len - n) - ngram = tgt_sent[idx: idx + n] - new_ngram = get_new_ngram(ngram, n, tgt_vocab) - - sampled_tgt_sent = list(tgt_sent) - sampled_tgt_sent[idx: idx + n] = new_ngram - - tgt_samples.append(sampled_tgt_sent) - - # compute bleu scores and rank the samples by bleu scores - bleu_scores = [] - for tgt_sample in tgt_samples: - bleu_score = sentence_bleu([tgt_sent], tgt_sample) - bleu_scores.append(bleu_score) - - tgt_ranks = sorted(range(len(tgt_samples)), key=lambda i: bleu_scores[i], reverse=True) - # convert list of tokens into a string - tgt_samples = [' '.join(tgt_sample) for tgt_sample in tgt_samples] - - print('*' * 50, file=f_out) - print('source: ' + src_sent, file=f_out) - print('%d samples' % len(tgt_samples), file=f_out) - for i in tgt_ranks: - print('%s ||| %f' % (tgt_samples[i], bleu_scores[i]), file=f_out) - print('*' * 50, file=f_out) - - f_out.close() - - -def sample_from_hamming_distance_payoff_distribution(args): - src_sents = read_corpus(args.src, 'src') - tgt_sents = read_corpus(args.tgt, 'src') # do not read in <s> and </s> - f_out = open(args.output, 'w') - - vocab = torch.load(args.vocab) - tgt_vocab = vocab.tgt - - payoff_prob, Z_qs = generate_hamming_distance_payoff_distribution(max(len(sent) for sent in tgt_sents), - vocab_size=len(vocab.tgt), - tau=args.temp) - - for src_sent, tgt_sent in zip(src_sents, tgt_sents): - tgt_samples = [] # make sure the ground truth y* is in the samples - tgt_sent_len = len(tgt_sent) - 3 # remove <s> and </s> and ending period . - tgt_ref_tokens = tgt_sent[1:-1] - bleu_scores = [] - - # sample edit distances - e_samples = np.random.choice(range(tgt_sent_len + 1), p=payoff_prob[tgt_sent_len], size=args.sample_size, - replace=True) - - for i, e in enumerate(e_samples): - if e > 0: - # sample a new tgt_sent $y$ - old_word_pos = np.random.choice(range(1, tgt_sent_len + 1), size=e, replace=False) - new_words = [vocab.tgt.id2word[wid] for wid in np.random.randint(3, len(vocab.tgt), size=e)] - new_tgt_sent = list(tgt_sent) - for pos, word in zip(old_word_pos, new_words): - new_tgt_sent[pos] = word - - bleu_score = sentence_bleu([tgt_ref_tokens], new_tgt_sent[1:-1]) - bleu_scores.append(bleu_score) - else: - new_tgt_sent = list(tgt_sent) - bleu_scores.append(1.) - - # print('y: %s' % ' '.join(new_tgt_sent)) - tgt_samples.append(new_tgt_sent) - - -def generate_hamming_distance_payoff_distribution(max_sent_len, vocab_size, tau=1.): - """compute the q distribution for Hamming Distance (substitution only) as in the RAML paper""" - probs = dict() - Z_qs = dict() - for sent_len in range(1, max_sent_len + 1): - counts = [1.]
# e = 0, count = 1 - for e in range(1, sent_len + 1): - # apply the rescaling trick as in https://gist.github.com/norouzi/8c4d244922fa052fa8ec18d8af52d366 - count = comb(sent_len, e) * math.exp(-e / tau) * ((vocab_size - 1) ** (e - e / tau)) - counts.append(count) - - Z_qs[sent_len] = Z_q = sum(counts) - prob = [count / Z_q for count in counts] - probs[sent_len] = prob - - # print('sent_len=%d, %s' % (sent_len, prob)) - - return probs, Z_qs - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--mode', choices=['sample_from_model', 'sample_ngram_adapt', 'sample_ngram'], required=True) - parser.add_argument('--vocab', type=str) - parser.add_argument('--src', type=str) - parser.add_argument('--tgt', type=str) - parser.add_argument('--parallel_data', type=str) - parser.add_argument('--sample_file', type=str) - parser.add_argument('--output', type=str, required=True) - parser.add_argument('--sample_size', type=int, default=100) - parser.add_argument('--reward', choices=['bleu', 'edit_dist', 'rouge'], default='bleu') - parser.add_argument('--max_ngram_size', type=int, default=4) - parser.add_argument('--temp', type=float, default=0.5) - parser.add_argument('--smooth_bleu', action='store_true', default=False) - - args = parser.parse_args() - - if args.mode == 'sample_ngram': - sample_ngram(args) - elif args.mode == 'sample_from_model': - sample_from_model(args) - elif args.mode == 'sample_ngram_adapt': - sample_ngram_adapt(args) diff --git a/examples/seq2seq_exposure_bias/utils/raml_samples_generation/util.py b/examples/seq2seq_exposure_bias/utils/raml_samples_generation/util.py deleted file mode 100644 index adb294dc..00000000 --- a/examples/seq2seq_exposure_bias/utils/raml_samples_generation/util.py +++ /dev/null @@ -1,53 +0,0 @@ -from collections import defaultdict -import numpy as np - - -def read_corpus(file_path, source): - data = [] - for line in open(file_path): - sent = line.strip().split(' ') - # only append <s> and </s> to the target sentence - if source == 'tgt': - sent = ['<s>'] + sent + ['</s>'] - data.append(sent) - - return data - - -def batch_slice(data, batch_size, sort=True): - batch_num = int(np.ceil(len(data) / float(batch_size))) - for i in range(batch_num): - cur_batch_size = batch_size if i < batch_num - 1 else len(data) - batch_size * i - src_sents = [data[i * batch_size + b][0] for b in range(cur_batch_size)] - tgt_sents = [data[i * batch_size + b][1] for b in range(cur_batch_size)] - - if sort: - src_ids = sorted(range(cur_batch_size), key=lambda src_id: len(src_sents[src_id]), reverse=True) - src_sents = [src_sents[src_id] for src_id in src_ids] - tgt_sents = [tgt_sents[src_id] for src_id in src_ids] - - yield src_sents, tgt_sents - - -def data_iter(data, batch_size, shuffle=True): - """ - randomly permute data, then sort by source length, and partition into batches - ensure that the length of source sentences in each batch is decreasing - """ - - buckets = defaultdict(list) - for pair in data: - src_sent = pair[0] - buckets[len(src_sent)].append(pair) - - batched_data = [] - for src_len in buckets: - tuples = buckets[src_len] - if shuffle: - np.random.shuffle(tuples) - batched_data.extend(list(batch_slice(tuples, batch_size))) - - if shuffle: - np.random.shuffle(batched_data) - for batch in batched_data: - yield batch diff --git a/examples/seq2seq_exposure_bias/utils/raml_samples_generation/vocab.py b/examples/seq2seq_exposure_bias/utils/raml_samples_generation/vocab.py deleted file mode 100644 index c79790d2..00000000 ---
a/examples/seq2seq_exposure_bias/utils/raml_samples_generation/vocab.py +++ /dev/null @@ -1,105 +0,0 @@ -from __future__ import print_function -import argparse -from collections import Counter -from itertools import chain - -import torch - -from util import read_corpus - - -class VocabEntry(object): - def __init__(self): - self.word2id = dict() - self.unk_id = 3 - self.word2id['<pad>'] = 0 - self.word2id['<s>'] = 1 - self.word2id['</s>'] = 2 - self.word2id['<unk>'] = 3 - - self.id2word = {v: k for k, v in self.word2id.items()} - - def __getitem__(self, word): - return self.word2id.get(word, self.unk_id) - - def __contains__(self, word): - return word in self.word2id - - def __setitem__(self, key, value): - raise ValueError('vocabulary is readonly') - - def __len__(self): - return len(self.word2id) - - def __repr__(self): - return 'Vocabulary[size=%d]' % len(self) - - def id2word(self, wid): - return self.id2word[wid] - - def add(self, word): - if word not in self: - wid = self.word2id[word] = len(self) - self.id2word[wid] = word - return wid - else: - return self[word] - - @staticmethod - def from_corpus(corpus, size, remove_singleton=True): - vocab_entry = VocabEntry() - - word_freq = Counter(chain(*corpus)) - non_singletons = [w for w in word_freq if word_freq[w] > 1] - print('number of word types: %d, number of word types w/ frequency > 1: %d' % (len(word_freq), - len(non_singletons))) - - top_k_words = sorted(word_freq.keys(), reverse=True, key=word_freq.get)[:size] - - for word in top_k_words: - if len(vocab_entry) < size: - if not (word_freq[word] == 1 and remove_singleton): - vocab_entry.add(word) - - return vocab_entry - - -class Vocab(object): - def __init__(self, src_sents, tgt_sents, src_vocab_size, tgt_vocab_size, remove_singleton=True): - assert len(src_sents) == len(tgt_sents) - - print('initialize source vocabulary ..') - self.src = VocabEntry.from_corpus(src_sents, src_vocab_size, remove_singleton=remove_singleton) - - print('initialize target vocabulary ..') - self.tgt = VocabEntry.from_corpus(tgt_sents, tgt_vocab_size, remove_singleton=remove_singleton) - - def __repr__(self): - return 'Vocab(source %d words, target %d words)' % (len(self.src), len(self.tgt)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--src_vocab_size', default=50000, type=int, help='source vocabulary size') - parser.add_argument('--tgt_vocab_size', default=50000, type=int, help='target vocabulary size') - parser.add_argument('--include_singleton', action='store_true', default=False, help='whether to include singletons ' - 'in the vocabulary (default=False)') - - parser.add_argument('--train_src', type=str, required=True, help='file of source sentences') - parser.add_argument('--train_tgt', type=str, required=True, help='file of target sentences') - - parser.add_argument('--output', default='vocab.bin', type=str, help='output vocabulary file') - - args = parser.parse_args() - - print('read in source sentences: %s' % args.train_src) - print('read in target sentences: %s' % args.train_tgt) - - src_sents = read_corpus(args.train_src, source='src') - tgt_sents = read_corpus(args.train_tgt, source='tgt') - - vocab = Vocab(src_sents, tgt_sents, args.src_vocab_size, args.tgt_vocab_size, remove_singleton=not args.include_singleton) - print('generated vocabulary, source %d words, target %d words' % (len(vocab.src), len(vocab.tgt))) - - torch.save(vocab, args.output) - print('vocabulary saved to %s' % args.output) diff --git a/examples/seq2seq_rl/.gitignore
b/examples/seq2seq_rl/.gitignore deleted file mode 100644 index 9faa04f8..00000000 --- a/examples/seq2seq_rl/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -/data/ -/data.zip -/iwslt14.zip -/toy_copy.zip diff --git a/examples/seq2seq_rl/README.md b/examples/seq2seq_rl/README.md deleted file mode 100644 index 6a7a766b..00000000 --- a/examples/seq2seq_rl/README.md +++ /dev/null @@ -1,45 +0,0 @@ -# Seq2seq Model with Policy Gradient Training # - -This example builds an attentional seq2seq model that is trained with policy gradient and a BLEU reward. The example mainly demonstrates the Texar sequence Reinforcement Learning APIs. No MLE pre-training is included, so the model collapses very quickly. In practice one would usually pretrain the model with teacher-forcing MLE (e.g., see the example [seq2seq_attn](../seq2seq_attn)) and continue to fine-tune with policy gradient. - -The data and model configs are exactly the same as the [MLE seq2seq example](../seq2seq_attn). The only difference is that MLE cross-entropy minimization is replaced with policy gradient training. - -The example shows: - * Use of `texar.tf.agents.SeqPGAgent` for policy gradient sequence generation. - * Use of the Python-based `texar.tf.evals.sentence/corpus_bleu` for efficient reward computing, and the Moses `texar.tf.evals.sentence/corpus_bleu_moses` - for standard test set evaluation. - * Use of `texar.tf.data.FeedableDataIterator` for data feeding and resuming from a breakpoint. - -## Usage ## - -### Dataset ### - -Two example datasets are provided: - - * toy_copy: A small toy autoencoding dataset from the [TF Seq2seq toolkit](https://github.com/google/seq2seq/tree/2500c26add91b079ca00cf1f091db5a99ddab9ae). - * iwslt14: The benchmark [IWSLT2014](https://sites.google.com/site/iwsltevaluation2014/home) (de-en) machine translation dataset. - -Download the data with the following commands: - -``` -python prepare_data.py --data toy_copy -python prepare_data.py --data iwslt14 -``` - -### Train the model ### - -Train the model with the following command: - -``` -python seq2seq_attn_pg.py --config_model config_model --config_data config_toy_copy -``` - -Here: - * `--config_model` specifies the model config. Note: do not include the `.py` suffix. - * `--config_data` specifies the data config. - -All configs are (mostly) the same as those in the [seq2seq_attn example](../seq2seq_attn). - -## Results ## - -The code is for demonstrating the Texar API. With pure policy gradient and without MLE pretraining, the model collapses very quickly.
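For reference, the per-sample reward used during training is plain sentence BLEU between each sampled output and its reference, along the lines of the sketch below (a simplified restatement; the full training loop is in `seq2seq_attn_pg.py`, and the helper name here is hypothetical):

```python
import texar.tf as tx

def bleu_rewards(references, hypotheses):
    # One smoothed sentence-BLEU score per sampled sequence; these scores
    # are passed to SeqPGAgent.observe() as the policy-gradient reward.
    return [tx.evals.sentence_bleu([ref], hyp, smooth=True)
            for ref, hyp in zip(references, hypotheses)]
```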
diff --git a/examples/seq2seq_rl/config_iwslt14.py b/examples/seq2seq_rl/config_iwslt14.py deleted file mode 100644 index 4726e6d3..00000000 --- a/examples/seq2seq_rl/config_iwslt14.py +++ /dev/null @@ -1,46 +0,0 @@ - -display = 100 -display_eval = 5500 - -source_vocab_file = './data/iwslt14/vocab.de' -target_vocab_file = './data/iwslt14/vocab.en' - -train = { - 'num_epochs': 10, - 'batch_size': 32, - 'allow_smaller_final_batch': False, - 'source_dataset': { - "files": 'data/iwslt14/train.de', - 'vocab_file': source_vocab_file, - 'max_seq_length': 50 - }, - 'target_dataset': { - 'files': 'data/iwslt14/train.en', - 'vocab_file': target_vocab_file, - 'max_seq_length': 50 - } -} -val = { - 'batch_size': 32, - 'shuffle': False, - 'source_dataset': { - "files": 'data/iwslt14/valid.de', - 'vocab_file': source_vocab_file, - }, - 'target_dataset': { - 'files': 'data/iwslt14/valid.en', - 'vocab_file': target_vocab_file, - } -} -test = { - 'batch_size': 32, - 'shuffle': False, - 'source_dataset': { - "files": 'data/iwslt14/test.de', - 'vocab_file': source_vocab_file, - }, - 'target_dataset': { - 'files': 'data/iwslt14/test.en', - 'vocab_file': target_vocab_file, - } -} diff --git a/examples/seq2seq_rl/config_model.py b/examples/seq2seq_rl/config_model.py deleted file mode 100644 index 91dca58f..00000000 --- a/examples/seq2seq_rl/config_model.py +++ /dev/null @@ -1,33 +0,0 @@ -# Attentional Seq2seq model. -# Hyperparameters not specified here will take the default values. - -num_units = 256 -beam_width = 10 - -embedder = { - 'dim': num_units -} -encoder = { - 'rnn_cell_fw': { - 'kwargs': { - 'num_units': num_units - } - } -} -decoder = { - 'rnn_cell': { - 'kwargs': { - 'num_units': num_units - }, - }, - 'attention': { - 'kwargs': { - 'num_units': num_units, - }, - 'attention_layer_size': num_units - } -} -agent = { - 'discount_factor': 0., - 'entropy_weight': .5 -} diff --git a/examples/seq2seq_rl/config_toy_copy.py b/examples/seq2seq_rl/config_toy_copy.py deleted file mode 100644 index 6d69937b..00000000 --- a/examples/seq2seq_rl/config_toy_copy.py +++ /dev/null @@ -1,44 +0,0 @@ - -display = 10 -display_eval = 300 - -source_vocab_file = './data/toy_copy/train/vocab.sources.txt' -target_vocab_file = './data/toy_copy/train/vocab.targets.txt' - -train = { - 'num_epochs': 10, - 'batch_size': 32, - 'allow_smaller_final_batch': False, - 'source_dataset': { - "files": './data/toy_copy/train/sources.txt', - 'vocab_file': source_vocab_file - }, - 'target_dataset': { - 'files': './data/toy_copy/train/targets.txt', - 'vocab_file': target_vocab_file - } -} -val = { - 'batch_size': 32, - 'allow_smaller_final_batch': False, - 'source_dataset': { - "files": './data/toy_copy/dev/sources.txt', - 'vocab_file': source_vocab_file - }, - 'target_dataset': { - "files": './data/toy_copy/dev/targets.txt', - 'vocab_file': target_vocab_file - } -} -test = { - 'batch_size': 32, - 'allow_smaller_final_batch': False, - 'source_dataset': { - "files": './data/toy_copy/test/sources.txt', - 'vocab_file': source_vocab_file - }, - 'target_dataset': { - "files": './data/toy_copy/test/targets.txt', - 'vocab_file': target_vocab_file - } -} diff --git a/examples/seq2seq_rl/prepare_data.py b/examples/seq2seq_rl/prepare_data.py deleted file mode 100644 index e573a1e9..00000000 --- a/examples/seq2seq_rl/prepare_data.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Downloads data. -""" -import tensorflow as tf -import texar.tf as tx - -# pylint: disable=invalid-name - -flags = tf.flags - -flags.DEFINE_string("data", "iwslt14", "Data to download [iwslt14|toy_copy]") - -FLAGS = flags.FLAGS - - -def prepare_data(): - """Downloads data. - """ - if FLAGS.data == 'iwslt14': - tx.data.maybe_download( - urls='https://drive.google.com/file/d/' - '1Vuv3bed10qUxrpldHdYoiWLzPKa4pNXd/view?usp=sharing', - path='./', - filenames='iwslt14.zip', - extract=True) - elif FLAGS.data == 'toy_copy': - tx.data.maybe_download( - urls='https://drive.google.com/file/d/' - '1fENE2rakm8vJ8d3voWBgW4hGlS6-KORW/view?usp=sharing', - path='./', - filenames='toy_copy.zip', - extract=True) - else: - raise ValueError('Unknown data: {}'.format(FLAGS.data)) - - -def main(): - """Entrypoint. - """ - prepare_data() - - -if __name__ == '__main__': - main() diff --git a/examples/seq2seq_rl/seq2seq_attn_pg.py b/examples/seq2seq_rl/seq2seq_attn_pg.py deleted file mode 100644 index 7606456f..00000000 --- a/examples/seq2seq_rl/seq2seq_attn_pg.py +++ /dev/null @@ -1,206 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Attentional Seq2seq trained with policy gradient. -""" -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -# pylint: disable=invalid-name, too-many-arguments, too-many-locals - -import importlib -import numpy as np -import tensorflow as tf -import texar.tf as tx - -flags = tf.flags - -flags.DEFINE_string("config_model", "config_model", "The model config.") -flags.DEFINE_string("config_data", "config_iwslt14", "The dataset config.") - -FLAGS = flags.FLAGS - -config_model = importlib.import_module(FLAGS.config_model) -config_data = importlib.import_module(FLAGS.config_data) - -# A caveat of using `texar.tf.agents.SeqPGAgent`: -# The training data iterator should not be allowed to raise -# `OutOfRangeError`; otherwise the iterator cannot be re-initialized and may -# raise `CancelledError`. This is probably because the iterator is used by -# `tf.Session.partial_run` in `SeqPGAgent`. -# -# A simple workaround is to set `'num_epochs'` of the training data to a large -# number so that its iterator never runs into `OutOfRangeError`. Use -# `texar.tf.data.FeedableDataIterator` to periodically switch to dev/test data -# for evaluation and switch back to the training data to resume from the -# breakpoint.
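# (Editor's note) A minimal sketch of the workaround described above, assuming
# the dict-style data configs used by this example's config files; `10 ** 8`
# is an arbitrary stand-in for "large", not a value from this repository:
#
#     train_hparams = dict(config_data.train)
#     train_hparams['num_epochs'] = 10 ** 8   # iterator is never exhausted
#     train_data = tx.data.PairedTextData(hparams=train_hparams)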
- - -def build_model(batch, train_data): - """Assembles the seq2seq model. - """ - source_embedder = tx.modules.WordEmbedder( - vocab_size=train_data.source_vocab.size, hparams=config_model.embedder) - - encoder = tx.modules.BidirectionalRNNEncoder( - hparams=config_model.encoder) - - enc_outputs, _ = encoder(source_embedder(batch['source_text_ids'])) - - target_embedder = tx.modules.WordEmbedder( - vocab_size=train_data.target_vocab.size, hparams=config_model.embedder) - - decoder = tx.modules.AttentionRNNDecoder( - memory=tf.concat(enc_outputs, axis=2), - memory_sequence_length=batch['source_length'], - vocab_size=train_data.target_vocab.size, - hparams=config_model.decoder) - - start_tokens = tf.ones_like(batch['target_length']) * \ - train_data.target_vocab.bos_token_id - - outputs, _, sequence_length = decoder( - decoding_strategy='infer_sample', - start_tokens=start_tokens, - end_token=train_data.target_vocab.eos_token_id, - embedding=target_embedder, - max_decoding_length=30) - - beam_search_outputs, _, _ = \ - tx.modules.beam_search_decode( - decoder_or_cell=decoder, - embedding=target_embedder, - start_tokens=start_tokens, - end_token=train_data.target_vocab.eos_token_id, - beam_width=config_model.beam_width, - max_decoding_length=60) - - return outputs, sequence_length, beam_search_outputs - - -def main(): - """Entrypoint. - """ - train_data = tx.data.PairedTextData(hparams=config_data.train) - val_data = tx.data.PairedTextData(hparams=config_data.val) - test_data = tx.data.PairedTextData(hparams=config_data.test) - iterator = tx.data.FeedableDataIterator( - {'train': train_data, 'val': val_data, 'test': test_data}) - - batch = iterator.get_next() - - outputs, sequence_length, infer_outputs = build_model(batch, train_data) - - agent = tx.agents.SeqPGAgent( - samples=outputs.sample_id, - logits=outputs.logits, - sequence_length=sequence_length, - hparams=config_model.agent) - - def _train_and_eval(sess, agent): - iterator.restart_dataset(sess, 'train') - - best_val_bleu = -1. 
- step = 0 - while True: - try: - # Samples - extra_fetches = { - 'truth': batch['target_text_ids'], - } - feed_dict = { - iterator.handle: iterator.get_handle(sess, 'train') - } - fetches = agent.get_samples( - extra_fetches=extra_fetches, feed_dict=feed_dict) - - sample_text = tx.utils.map_ids_to_strs( - fetches['samples'], train_data.target_vocab, - strip_eos=False, join=False) - truth_text = tx.utils.map_ids_to_strs( - fetches['truth'], train_data.target_vocab, - strip_eos=False, join=False) - - # Computes rewards - reward = [] - for ref, hyp in zip(truth_text, sample_text): - r = tx.evals.sentence_bleu([ref], hyp, smooth=True) - reward.append(r) - - # Updates - loss = agent.observe(reward=reward) - - # Displays & evaluates - step += 1 - if step == 1 or step % config_data.display == 0: - print("step={}, loss={:.4f}, reward={:.4f}".format( - step, loss, np.mean(reward))) - - if step % config_data.display_eval == 0: - val_bleu = _eval_epoch(sess, 'val') - best_val_bleu = max(best_val_bleu, val_bleu) - print('val step={}, BLEU={:.4f}; best-ever={:.4f}'.format( - step, val_bleu, best_val_bleu)) - - test_bleu = _eval_epoch(sess, 'test') - print('test step={}, BLEU={:.4f}'.format(step, test_bleu)) - print('=' * 50) - - except tf.errors.OutOfRangeError: - break - - def _eval_epoch(sess, mode): - """`mode` is one of {'val', 'test'} - """ - iterator.restart_dataset(sess, mode) - - refs, hypos = [], [] - while True: - try: - fetches = [ - batch['target_text'][:, 1:], - infer_outputs.predicted_ids[:, :, 0] - ] - feed_dict = { - tx.global_mode(): tf.estimator.ModeKeys.PREDICT, - iterator.handle: iterator.get_handle(sess, mode) - } - target_texts, output_ids = \ - sess.run(fetches, feed_dict=feed_dict) - - target_texts = tx.utils.strip_special_tokens(target_texts) - output_texts = tx.utils.map_ids_to_strs( - ids=output_ids, vocab=val_data.target_vocab) - - for hypo, ref in zip(output_texts, target_texts): - hypos.append(hypo) - refs.append([ref]) - except tf.errors.OutOfRangeError: - break - - return tx.evals.corpus_bleu_moses(list_of_references=refs, - hypotheses=hypos) - - with tf.Session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - - agent.sess = sess - - _train_and_eval(sess, agent) - - -if __name__ == '__main__': - main() diff --git a/examples/seqgan/README.md b/examples/seqgan/README.md deleted file mode 100644 index bfc49a61..00000000 --- a/examples/seqgan/README.md +++ /dev/null @@ -1,111 +0,0 @@ -# SeqGAN for Text Generation - -This example is an implementation of [(Yu et al.) SeqGAN: Sequence Generative Adversarial Nets with Policy Gradient](https://arxiv.org/pdf/1609.05473.pdf), with a language model as the generator and an RNN-based classifier as the discriminator. - -Model architecture and parameter settings are in line with the [official implementation](https://github.com/geek-ai/Texygen) of SeqGAN, except that we replace the MC-Tree rollout strategy with token-level rewards from the RNN discriminator, which is simpler and provides competitive performance. - -Experiments are performed on two datasets: -* The standard [PTB dataset](https://corochann.com/penn-tree-bank-ptb-dataset-introduction-1456.html) for language modeling -* The [COCO Captions dataset](http://cocodataset.org/#download): with a 2K vocabulary and an average sentence length of 25.
We use the [data](https://github.com/geek-ai/Texygen/tree/master/data) provided in the official implementation, where the train and test sets each contain 10K sentences. - -## Usage - -### Dataset -Download the datasets with the following commands: -```shell -python data_utils.py --config config_ptb_small --data_path ./ --dataset ptb -python data_utils.py --config config_coco --data_path ./ --dataset coco -``` - -Here: -* `--config` specifies config parameters to use. Default is `config_ptb_small`. -* `--data_path` is the directory to store the downloaded dataset. Default is `./`. -* `--dataset` indicates the training dataset. Currently `ptb` (default) and `coco` are supported. - -### Train the model - -Training on the `coco` dataset can be performed with the following command: - -```shell -python seqgan_train.py --config config_coco --data_path ./ --dataset coco -``` - -Here: - -`--config`, `--data_path` and `--dataset` should be the same as the flag settings used to download the dataset. - -The model will start training and will evaluate perplexity and BLEU score every 10 epochs. - -## Results - -### COCO Caption - -We compare the results of SeqGAN and MLE (maximum likelihood training) provided by our implementation and the official one, using the default official parameter settings. Each cell below presents the BLEU scores on both the test set and the training set (in parentheses). - -We use the standard BLEU function [`texar.tf.evals.sentence_bleu_moses`](https://texar.readthedocs.io/en/latest/code/evals.html#sentence-bleu-moses) to evaluate BLEU scores for both the official and our implementations. - -| |Texar - SeqGAN | Official - SeqGAN | Texar - MLE | Official - MLE | -|---------------|-------------|----------------|-------------|----------------| -|BLEU-1 | 0.5670 (0.6850) | 0.6260 (0.7900) | 0.7130 (0.9360) | 0.6620 (0.8770) | -|BLEU-2 | 0.3490 (0.5330) | 0.3570 (0.5880) | 0.4510 (0.7590) | 0.3780 (0.6910) | -|BLEU-3 | 0.1940 (0.3480) | 0.1660 (0.3590) | 0.2490 (0.4990) | 0.1790 (0.4470) | -|BLEU-4 | 0.0940 (0.1890) | 0.0710 (0.1800) | 0.1170 (0.2680) | 0.0790 (0.2390)| - -### PTB - -On PTB data, we use three different hyperparameter configurations, which result in models of different sizes. -The perplexities on both the test set and the training set are listed in the following table. - -|config|train |Official - train |test | Official - test | -|--- |--- |--- |--- |--- | -|small |28.4790 |53.2289 |58.9798 | 55.7736 | -|medium|16.3243 |9.8919 |37.6558 | 20.8537 | -|large |14.5739 |4.7015 |52.0850 | 39.7949 | - -## Training Log - -During training, loss and BLEU scores are recorded in the log directory. Here, we provide sample log output from training on the `coco` dataset. - -### Training loss -Training loss will be recorded in coco_log/log.txt. -```text -G pretrain epoch 0, step 1: train_ppl: 1781.854030 -G pretrain epoch 1, step 201: train_ppl: 10.483647 -G pretrain epoch 2, step 401: train_ppl: 7.335757 -... -G pretrain epoch 77, step 12201: train_ppl: 3.372638 -G pretrain epoch 78, step 12401: train_ppl: 3.534658 -D pretrain epoch 0, step 0: dis_total_loss: 27.025223, r_loss: 13.822192, f_loss: 13.203032 -D pretrain epoch 1, step 0: dis_total_loss: 26.331108, r_loss: 13.592842, f_loss: 12.738266 -D pretrain epoch 2, step 0: dis_total_loss: 27.042515, r_loss: 13.592712, f_loss: 13.449802 -...
-D pretrain epoch 77, step 0: dis_total_loss: 25.134272, r_loss: 12.660420, f_loss: 12.473851 -D pretrain epoch 78, step 0: dis_total_loss: 23.727032, r_loss: 12.822734, f_loss: 10.904298 -D pretrain epoch 79, step 0: dis_total_loss: 24.769077, r_loss: 12.733292, f_loss: 12.035786 -G train epoch 80, step 12601: mean_reward: 0.027631, expect_reward_loss:-0.256241, update_loss: -20.670971 -D train epoch 80, step 0: dis_total_loss: 25.222481, r_loss: 12.671371, f_loss: 12.551109 -D train epoch 81, step 0: dis_total_loss: 25.695383, r_loss: 13.037079, f_loss: 12.658304 -... -G train epoch 178, step 22401: mean_reward: 3.409714, expect_reward_loss:-3.474687, update_loss: 733.247009 -D train epoch 178, step 0: dis_total_loss: 24.715553, r_loss: 13.181369, f_loss: 11.534184 -D train epoch 179, step 0: dis_total_loss: 24.572170, r_loss: 13.176209, f_loss: 11.395961 -``` - -### BLEU -BLEU1~BLEU4 scores will be calculated every 10 epochs; the results are written to log_dir/bleu.txt. -```text -... -epoch 170 BLEU1~4 on train dataset: -0.726647 -0.530675 -0.299362 -0.133602 - - epoch 170 BLEU1~4 on test dataset: -0.548151 -0.283765 -0.118528 -0.042177 -... -``` - diff --git a/examples/seqgan/config_coco.py b/examples/seqgan/config_coco.py deleted file mode 100644 index 981308cc..00000000 --- a/examples/seqgan/config_coco.py +++ /dev/null @@ -1,100 +0,0 @@ -generator_pretrain_epoch = 80 -discriminator_pretrain_epoch = 80 -adversial_epoch = 100 - -hidden_size = 32 -batch_size = 64 -max_num_steps = 20 - -enc_keep_prob_in = 1.0 -dec_keep_prob_out = 1.0 - -log_dir = './coco_log/' -log_file = log_dir + 'log.txt' -bleu_file = log_dir + 'bleu.txt' -ckpt = './checkpoint/ckpt' - -dec_cell_hparams = { - "type": "LSTMBlockCell", - "kwargs": { - "num_units": hidden_size, - "forget_bias": 0.
- }, - "dropout": {"output_keep_prob": dec_keep_prob_out}, - "num_layers": 1 -} - -emb_hparams = { - 'name': 'lookup_table', - "dim": hidden_size, - 'initializer': { - 'type': 'random_normal_initializer', - 'kwargs': { - 'mean': 0.0, - 'stddev': hidden_size**-0.5, - }, - } -} - -train_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "seed": 123, - "dataset": { - "files": 'coco_data/coco.train.txt', - "vocab_file": 'coco_data/vocab.txt', - "max_seq_length": max_num_steps - } -} - -val_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "seed": 123, - "dataset": { - "files": 'coco_data/coco.valid.txt', - "vocab_file": 'coco_data/vocab.txt', - "max_seq_length": max_num_steps - } -} - -test_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "dataset": { - "files": 'coco_data/coco.test.txt', - "vocab_file": 'coco_data/vocab.txt', - "max_seq_length": max_num_steps - } -} - -g_opt_hparams = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 0.01 - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": {"clip_norm": 5.} - } -} - -d_opt_hparams = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 0.0001 - } - } -} - -update_opt_hparams = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 0.0004 - } - } -} diff --git a/examples/seqgan/config_ptb_large.py b/examples/seqgan/config_ptb_large.py deleted file mode 100644 index 5e8616f4..00000000 --- a/examples/seqgan/config_ptb_large.py +++ /dev/null @@ -1,98 +0,0 @@ -generator_pretrain_epoch = 55 -discriminator_pretrain_epoch = 15 -adversial_epoch = 20 - -hidden_size = 1500 -batch_size = 64 -max_num_steps = 35 - -enc_keep_prob_in = 1.0 -dec_keep_prob_out = 0.35 - -log_dir = './ptb_log.large/' -log_file = log_dir + 'log.txt' -bleu_file = log_dir + 'bleu.txt' -ckpt = './checkpoint/ckpt' - -dec_cell_hparams = { - "type": "LSTMBlockCell", - "kwargs": { - "num_units": hidden_size, - "forget_bias": 0. 
- }, - "dropout": {"output_keep_prob": dec_keep_prob_out}, - "num_layers": 2 -} - -emb_hparams = { - 'name': 'lookup_table', - "dim": hidden_size, - 'initializer': { - 'type': 'random_normal_initializer', - 'kwargs': { - 'mean': 0.0, - 'stddev': hidden_size**-0.5, - }, - } -} - -train_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "seed": 123, - "dataset": { - "files": 'ptb_data/ptb.train.txt', - "vocab_file": 'ptb_data/vocab.txt', - "max_seq_length": max_num_steps - } -} - -val_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "seed": 123, - "dataset": { - "files": 'ptb_data/ptb.valid.txt', - "vocab_file": 'ptb_data/vocab.txt', - "max_seq_length": max_num_steps - } -} - -test_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "dataset": { - "files": 'ptb_data/ptb.test.txt', - "vocab_file": 'ptb_data/vocab.txt', - "max_seq_length": max_num_steps - } -} - -g_opt_hparams = { - "optimizer": { - "type": "GradientDescentOptimizer", - "kwargs": {"learning_rate": 1.0} - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": {"clip_norm": 10.} - } -} - -d_opt_hparams = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 0.0001 - } - } -} - -update_opt_hparams = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 0.0004 - } - } -} diff --git a/examples/seqgan/config_ptb_medium.py b/examples/seqgan/config_ptb_medium.py deleted file mode 100644 index 5aed29a7..00000000 --- a/examples/seqgan/config_ptb_medium.py +++ /dev/null @@ -1,98 +0,0 @@ -generator_pretrain_epoch = 39 -discriminator_pretrain_epoch = 15 -adversial_epoch = 20 - -hidden_size = 650 -batch_size = 64 -max_num_steps = 35 - -enc_keep_prob_in = 1.0 -dec_keep_prob_out = 0.5 - -log_dir = './ptb_log.medium/' -log_file = log_dir + 'log.txt' -bleu_file = log_dir + 'bleu.txt' -ckpt = './checkpoint/ckpt' - -dec_cell_hparams = { - "type": "LSTMBlockCell", - "kwargs": { - "num_units": hidden_size, - "forget_bias": 0. 
- }, - "dropout": {"output_keep_prob": dec_keep_prob_out}, - "num_layers": 2 -} - -emb_hparams = { - 'name': 'lookup_table', - "dim": hidden_size, - 'initializer': { - 'type': 'random_normal_initializer', - 'kwargs': { - 'mean': 0.0, - 'stddev': hidden_size**-0.5, - }, - } -} - -train_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "seed": 123, - "dataset": { - "files": 'ptb_data/ptb.train.txt', - "vocab_file": 'ptb_data/vocab.txt', - "max_seq_length": max_num_steps - } -} - -val_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "seed": 123, - "dataset": { - "files": 'ptb_data/ptb.valid.txt', - "vocab_file": 'ptb_data/vocab.txt', - "max_seq_length": max_num_steps - } -} - -test_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "dataset": { - "files": 'ptb_data/ptb.test.txt', - "vocab_file": 'ptb_data/vocab.txt', - "max_seq_length": max_num_steps - } -} - -g_opt_hparams = { - "optimizer": { - "type": "GradientDescentOptimizer", - "kwargs": {"learning_rate": 1.0} - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": {"clip_norm": 5.} - } -} - -d_opt_hparams = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 0.0001 - } - } -} - -update_opt_hparams = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 0.0004 - } - } -} diff --git a/examples/seqgan/config_ptb_small.py b/examples/seqgan/config_ptb_small.py deleted file mode 100644 index 30337c8d..00000000 --- a/examples/seqgan/config_ptb_small.py +++ /dev/null @@ -1,98 +0,0 @@ -generator_pretrain_epoch = 13 -discriminator_pretrain_epoch = 15 -adversial_epoch = 10 - -hidden_size = 200 -batch_size = 64 -max_num_steps = 20 - -enc_keep_prob_in = 1.0 -dec_keep_prob_out = 1.0 - -log_dir = './ptb_log.small/' -log_file = log_dir + 'log.txt' -bleu_file = log_dir + 'bleu.txt' -ckpt = './checkpoint/ckpt' - -dec_cell_hparams = { - "type": "LSTMBlockCell", - "kwargs": { - "num_units": hidden_size, - "forget_bias": 0. 
- }, - "dropout": {"output_keep_prob": dec_keep_prob_out}, - "num_layers": 2 -} - -emb_hparams = { - 'name': 'lookup_table', - "dim": hidden_size, - 'initializer': { - 'type': 'random_normal_initializer', - 'kwargs': { - 'mean': 0.0, - 'stddev': hidden_size**-0.5, - }, - } -} - -train_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "seed": 123, - "dataset": { - "files": 'ptb_data/ptb.train.txt', - "vocab_file": 'ptb_data/vocab.txt', - "max_seq_length": max_num_steps - } -} - -val_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "seed": 123, - "dataset": { - "files": 'ptb_data/ptb.valid.txt', - "vocab_file": 'ptb_data/vocab.txt', - "max_seq_length": max_num_steps - } -} - -test_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "dataset": { - "files": 'ptb_data/ptb.test.txt', - "vocab_file": 'ptb_data/vocab.txt', - "max_seq_length": max_num_steps - } -} - -g_opt_hparams = { - "optimizer": { - "type": "GradientDescentOptimizer", - "kwargs": {"learning_rate": 1.0} - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": {"clip_norm": 5.} - } -} - -d_opt_hparams = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 0.0001 - } - } -} - -update_opt_hparams = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 0.0004 - } - } -} diff --git a/examples/seqgan/data_utils.py b/examples/seqgan/data_utils.py deleted file mode 100644 index 7c78e954..00000000 --- a/examples/seqgan/data_utils.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""SeqGAN for language modeling -""" -import os -import argparse -import importlib -import tensorflow as tf -import texar.tf as tx - -parser = argparse.ArgumentParser(description='prepare data') -parser.add_argument('--dataset', type=str, default='ptb', - help='dataset to prepare') -parser.add_argument('--data_path', type=str, default='./', - help="Directory containing coco. 
If not exists, " - "the directory will be created, and the data " - "will be downloaded.") -parser.add_argument('--config', type=str, default='config_ptb_small', - help='The config to use.') -args = parser.parse_args() - -config = importlib.import_module(args.config) - - -def prepare_data(args, config, train_path): - """Downloads the PTB or COCO dataset - """ - if not os.path.exists(config.log_dir): - os.mkdir(config.log_dir) - - ptb_url = 'https://jxhe.github.io/download/ptb_data.tgz' - coco_url = 'https://VegB.github.io/downloads/coco_data.tgz' - - data_path = args.data_path - - if not tf.gfile.Exists(train_path): - url = ptb_url if args.dataset == 'ptb' else coco_url - tx.data.maybe_download(url, data_path, extract=True) - os.remove('%s_data.tgz' % args.dataset) - - -if __name__ == '__main__': - prepare_data(args, config, config.train_data_hparams['dataset']['files']) diff --git a/examples/seqgan/seqgan_train.py b/examples/seqgan/seqgan_train.py deleted file mode 100644 index 47ad7181..00000000 --- a/examples/seqgan/seqgan_train.py +++ /dev/null @@ -1,342 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""SeqGAN for language modeling -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=invalid-name, no-member, too-many-locals - -import importlib -import numpy as np -import tensorflow as tf -import texar.tf as tx - -flags = tf.flags -flags.DEFINE_string("dataset", "ptb", - "perform training on ptb or coco.") -flags.DEFINE_string("data_path", "./", - "Directory containing coco. 
If not exists, " - "the directory will be created, and the data " - "will be downloaded.") -flags.DEFINE_string("config", "config_ptb_small", "The config to use.") -FLAGS = flags.FLAGS - -config = importlib.import_module(FLAGS.config) - - -def _main(_): - log = open(config.log_file, 'w') - bleu_log = open(config.bleu_file, 'w') - - # Data - train_data = tx.data.MonoTextData(config.train_data_hparams) - val_data = tx.data.MonoTextData(config.val_data_hparams) - test_data = tx.data.MonoTextData(config.test_data_hparams) - iterator = tx.data.TrainTestDataIterator(train=train_data, - val=val_data, - test=test_data) - data_batch = iterator.get_next() - - batch_size = tf.shape(data_batch["text_ids"])[0] - num_steps = tf.shape(data_batch["text_ids"])[1] - vocab_size = train_data.vocab.size - - # Model architecture - g_embedder = tx.modules.WordEmbedder(vocab_size=vocab_size, - hparams=config.emb_hparams) - input_embed = g_embedder(data_batch["text_ids"][:, :-1]) - - if config.enc_keep_prob_in < 1: - input_embed = tf.nn.dropout( - input_embed, tx.utils.switch_dropout(config.enc_keep_prob_in)) - - decoder = tx.modules.BasicRNNDecoder( - vocab_size=vocab_size, - hparams={"rnn_cell": config.dec_cell_hparams, - "max_decoding_length_infer": config.max_num_steps + 2}) - initial_state = decoder.zero_state(batch_size=batch_size, - dtype=tf.float32) - - # ------------Pretrain Generator--------------- - outputs, _, _ = decoder( - initial_state=initial_state, - decoding_strategy="train_greedy", - inputs=input_embed, - sequence_length=data_batch["length"] - 1) - - mle_loss = tx.losses.sequence_sparse_softmax_cross_entropy( - labels=data_batch["text_ids"][:, 1:], - logits=outputs.logits, - sequence_length=data_batch["length"] - 1) - - global_step = tf.Variable(0, trainable=False) - g_variables = tx.utils.collect_trainable_variables([g_embedder, decoder]) - gen_train_op = tx.core.get_train_op(mle_loss, - variables=g_variables, - global_step=global_step, - increment_global_step=True, - hparams=config.g_opt_hparams) - - # -------------Generator Infer------------------- - start_tokens = tf.cast(tf.fill([batch_size], - train_data.vocab.bos_token_id), - dtype=tf.int32) - infer_outputs, _, sequence_length = decoder( - decoding_strategy="infer_sample", - start_tokens=start_tokens, - end_token=train_data.vocab.eos_token_id, - embedding=g_embedder, - initial_state=initial_state, - max_decoding_length=config.max_num_steps) - - infer_logits = infer_outputs.logits - infer_sample_ids = infer_outputs.sample_id - - # ------------Pretrain Discriminator--------------- - discriminator = tx.modules.UnidirectionalRNNClassifier( - hparams={"clas_strategy": "time_wise", "num_classes": 1}) - d_embedder = tx.modules.WordEmbedder(vocab_size=vocab_size, - hparams=config.emb_hparams) - - r_logits, _ = discriminator(d_embedder(data_batch["text_ids"][:, 1:]), - sequence_length=data_batch["length"] - 1) - f_logits, _ = discriminator(d_embedder(infer_sample_ids), sequence_length=sequence_length) - - r_loss = tx.losses.sequence_sigmoid_cross_entropy( - labels=tf.ones_like(data_batch["text_ids"][:, 1:], dtype=tf.float32), - logits=tf.squeeze(r_logits), - sequence_length=data_batch["length"] - 1) # r_preds -> 1. - f_loss = tx.losses.sequence_sigmoid_cross_entropy( - labels=tf.zeros_like(infer_sample_ids, dtype=tf.float32), - logits=tf.squeeze(f_logits), - sequence_length=sequence_length) # infer_logits -> 0. 
- dis_loss = r_loss + f_loss - dis_loss.set_shape(()) - - d_variables = tx.utils.collect_trainable_variables([discriminator, d_embedder]) - dis_train_op = tx.core.get_train_op(dis_loss, - variables=d_variables, - global_step=global_step, - increment_global_step=False, - hparams=config.d_opt_hparams) - - # ------------Adversarial--------------- - infer_logits = tf.clip_by_value( - tf.nn.softmax(infer_logits) * - tf.one_hot(infer_sample_ids, vocab_size), 1e-20, 1) - - expected_reward = tf.Variable(tf.zeros((config.max_num_steps,))) - reward = tf.reshape(f_logits, shape=(batch_size, -1)) - \ - expected_reward[:tf.shape(f_logits)[1]] - mean_reward = tf.reduce_mean(reward) - exp_reward_loss = -tf.reduce_mean(tf.abs(reward)) - exp_reward_loss.set_shape(()) - exp_op = tx.core.get_train_op(exp_reward_loss, - variables=[expected_reward], - global_step=global_step, - increment_global_step=False, - hparams=config.update_opt_hparams) - reward = tx.losses.discount_reward( - reward, sequence_length=tf.squeeze(sequence_length), tensor_rank=2) - update_loss = -tf.reduce_mean(tf.log(infer_logits) * - tf.expand_dims(reward, -1)) - update_loss.set_shape(()) - gen_op = tx.core.get_train_op(update_loss, - variables=g_variables, - global_step=global_step, - increment_global_step=True, - hparams=config.update_opt_hparams) - update_op = tf.group(gen_op, exp_op) - - def _g_train_epoch(sess, epoch, mode_string): - iterator.switch_to_train_data(sess) - while True: - try: - if mode_string == 'train': - fetches = { - 'mean_rwd': mean_reward, - 'exp_rwd_loss': exp_reward_loss, - 'update_loss': update_loss, - 'update_op': update_op, - 'exp_rwd': expected_reward, - 'step': global_step - } - elif mode_string == 'pretrain': - fetches = { - 'mle_loss': mle_loss, - 'num_steps': num_steps, - 'train_op': gen_train_op, - 'step': global_step - } - else: - raise ValueError( - "Expect mode_string to be one of " - "['pretrain', 'train'], got %s" % mode_string) - rtns = sess.run(fetches) - step = rtns['step'] - if step % 200 == 1: - if mode_string == 'pretrain': - ppl = np.exp(rtns['mle_loss'] / rtns["num_steps"]) - rst = "G {0:6s} epoch {1:3d}, step {2:3d}:" \ - " train_ppl: {3:6f}".format(mode_string, - epoch, step, ppl) - else: - rst = "G {0:6s} epoch {1:3d}, step {2:3d}: " \ - "mean_reward: {3:6f}, " \ - "expect_reward_loss:{4:6f}, " \ - "update_loss: {5:6f}".format( - mode_string, epoch, step, rtns['mean_rwd'], - rtns['exp_rwd_loss'], rtns['update_loss']) - log.write(rst + '\n') - log.flush() - print(rst) - if mode_string == 'train': # a batch per adversarial epoch - break - except tf.errors.OutOfRangeError: - break - return - - def _g_test_epoch(sess, epoch, mode_string): - def _id2word_map(id_arrays): - return [' '.join([train_data.vocab.id_to_token_map_py[i] - for i in sent]) for sent in id_arrays] - - if mode_string == 'valid': - iterator.switch_to_val_data(sess) - elif mode_string == 'test': - iterator.switch_to_test_data(sess) - else: - raise ValueError("Expect mode_string to be one of " - "['valid', 'test'], got %s" % mode_string) - - target_list, inference_list = [], [] - loss, steps = 0., 0 - while True: - try: - fetches = { - "mle_loss": mle_loss, - "num_steps": num_steps - } - if mode_string == 'test': - fetches['target_sample_id'] = data_batch["text_ids"] - fetches['infer_sample_id'] = infer_sample_ids - - feed_dict = {tx.global_mode(): tf.estimator.ModeKeys.EVAL} - - rtns = sess.run(fetches, feed_dict) - - loss += rtns['mle_loss'] - steps += rtns['num_steps'] - - if mode_string == 'test': - targets =
_id2word_map(rtns['target_sample_id'][:, 1:].tolist()) # remove <BOS> - for t in targets: - target_list.extend(t.split('<EOS>')[0].strip().split()) - - inferences = _id2word_map(rtns['infer_sample_id'].tolist()) - for inf in inferences: - inference_list.extend(inf.split('<EOS>')[0].strip().split()) - - except tf.errors.OutOfRangeError: - break - - ppl = np.exp(loss / steps) - rst = "G {0:6s} epoch {1:3d}, step {2:3s}:" \ - " {3:5s}_ppl: {4:6f}"\ - .format(mode_string, epoch, '-', mode_string, ppl) - log.write(rst + '\n') - log.flush() - print(rst) - - if mode_string == 'test': - bleu_test = tx.evals.sentence_bleu_moses( - references=[target_list], - hypothesis=inference_list, - lowercase=True, return_all=True) - if not isinstance(bleu_test, np.ndarray): # might return 0.0 if inference_list is empty - bleu_test = [bleu_test] * 5 - rst_test = "epoch %d BLEU1~4 on test dataset:\n" \ - "%f\n%f\n%f\n%f\n\n" % \ - (epoch, bleu_test[1], bleu_test[2], - bleu_test[3], bleu_test[4]) - print(rst_test) - bleu_log.write(rst_test) - bleu_log.flush() - - return - - def _d_run_epoch(sess, epoch, mode_string='pretrain'): - iterator.switch_to_train_data(sess) - step = 0 - while True: - try: - fetches = { - "mle_loss": dis_loss, - "r_loss": r_loss, - "f_loss": f_loss, - "train_op": dis_train_op - } - rtns = sess.run(fetches) - if step % 200 == 0: - rst = "D {0:6s} epoch {1:3d}, step {2:3d}: " \ - "dis_total_loss: {3:6f}, r_loss: {4:6f}, " \ - "f_loss: {5:6f}".format( - mode_string, epoch, step, rtns['mle_loss'], - rtns['r_loss'], rtns['f_loss']) - log.write(rst + '\n') - log.flush() - print(rst) - step += 1 - if step == 15 and mode_string == 'train': - break - except tf.errors.OutOfRangeError: - break - - tf_config = tf.ConfigProto() - tf_config.gpu_options.allow_growth = True - with tf.Session(config=tf_config) as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - - # Generator pre-training - for g_epoch in range(config.generator_pretrain_epoch): - _g_train_epoch(sess, g_epoch, 'pretrain') - if g_epoch % 10 == 0 or \ - g_epoch == config.generator_pretrain_epoch - 1: - _g_test_epoch(sess, g_epoch, 'valid') - _g_test_epoch(sess, g_epoch, 'test') - - # Discriminator pre-training - for d_epoch in range(config.discriminator_pretrain_epoch): - _d_run_epoch(sess, d_epoch) - - # Adversarial training - for update_epoch in range(config.adversial_epoch): - cur_epoch = update_epoch + config.generator_pretrain_epoch - _g_train_epoch(sess, cur_epoch, 'train') - _d_run_epoch(sess, cur_epoch, mode_string='train') - if update_epoch % 10 == 0 or \ - update_epoch == config.adversial_epoch - 1: - _g_test_epoch(sess, cur_epoch, 'valid') - _g_test_epoch(sess, cur_epoch, 'test') - - log.close() - bleu_log.close() - - -if __name__ == '__main__': - tf.app.run(main=_main) diff --git a/examples/sequence_tagging/.gitignore b/examples/sequence_tagging/.gitignore deleted file mode 100644 index fe8904f0..00000000 --- a/examples/sequence_tagging/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -/data/ -/tmp/ diff --git a/examples/sequence_tagging/README.md b/examples/sequence_tagging/README.md deleted file mode 100644 index cd827573..00000000 --- a/examples/sequence_tagging/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# Sequence tagging on CoNLL-2003 # - -This example builds a bi-directional LSTM-CNN model for the NER task and trains on CoNLL-2003 data. Model and training are described in ->[(Ma et al.)
-
-The top CRF layer is not used here.
-
-## Dataset ##
-
-The code uses the [CoNLL-2003 NER dataset](https://www.clips.uantwerpen.be/conll2003/ner/) (English). Please put the data files (e.g., `eng.train.bio.conll`) under the `./data` folder. Pretrained GloVe word embeddings can also be used (set `load_glove=True` in [config.py](./config.py)). The GloVe file should also be under `./data`.
-
-## Run ##
-
-To train a NER model,
-
- python ner.py
-
-The model will begin training, evaluate on the validation data periodically, and evaluate on the test data after training is done.
-
-## Results ##
-
-The results on the validation and test data are:
-
-| | prec | recall | F1 |
-|-------|----------|----------|----------|
-| valid | 91.18 | 92.41 | 91.79 |
-| test | 86.13 | 88.31 | 87.21 |
-
diff --git a/examples/sequence_tagging/config.py b/examples/sequence_tagging/config.py
deleted file mode 100644
index 4ff4b819..00000000
--- a/examples/sequence_tagging/config.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright 2018 The Texar Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""NER config.
-"""
-
-# pylint: disable=invalid-name, too-few-public-methods, missing-docstring
-
-num_epochs = 200
-char_dim = 30
-embed_dim = 100
-hidden_size = 256
-tag_space = 128
-keep_prob = 0.5
-batch_size = 16
-encoder = None
-load_glove = True
-
-emb = {
- "name": "embedding",
- "dim": embed_dim,
- "dropout_rate": 0.33,
- "dropout_strategy": 'item'
-}
-
-char_emb = {
- "name": "char_embedding",
- "dim": char_dim
-}
-
-conv = {
- "filters": 30,
- "kernel_size": [3],
- "conv_activation": "tanh",
- "num_dense_layers": 0,
- "dropout_rate": 0.
-}
-
-cell = {
- "type": "LSTMCell",
- "kwargs": {
- "num_units": hidden_size,
- "forget_bias": 1.
- },
- "dropout": {"output_keep_prob": keep_prob},
- "num_layers": 1
-}
-opt = {
- "optimizer": {
- "type": "MomentumOptimizer",
- "kwargs": {"learning_rate": 0.1,
- "momentum": 0.9,
- "use_nesterov": True}
- },
- "learning_rate_decay": {
- "type": "inverse_time_decay",
- "kwargs": {
- "decay_steps": 1,
- "decay_rate": 0.05,
- "staircase": True
- },
- "start_decay_step": 1
- }
-}
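The `opt` block above pairs a Nesterov-momentum optimizer with TensorFlow's `inverse_time_decay` schedule, and `ner.py` below feeds the epoch index in as the global step. A minimal sketch of the learning rates this yields, assuming the standard staircase formula and that the step is offset by `start_decay_step`:

```python
# Standalone re-computation of the configured schedule (values mirror the
# config above: lr=0.1, decay_steps=1, decay_rate=0.05, staircase=True).
def inverse_time_decay(lr0, step, decay_steps=1, decay_rate=0.05,
                       start_decay_step=1, staircase=True):
    if step < start_decay_step:
        return lr0
    t = (step - start_decay_step) / decay_steps
    if staircase:
        t = float(int(t))
    return lr0 / (1. + decay_rate * t)

for epoch in [0, 1, 2, 50, 199]:
    print(epoch, round(inverse_time_decay(0.1, epoch), 5))
```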
diff --git a/examples/sequence_tagging/conll_reader.py b/examples/sequence_tagging/conll_reader.py
deleted file mode 100644
index 58722073..00000000
--- a/examples/sequence_tagging/conll_reader.py
+++ /dev/null
@@ -1,266 +0,0 @@
-# Copyright 2018 The Texar Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Utilities for preprocessing and iterating over the CoNLL 2003 data.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import re
-from collections import defaultdict
-import numpy as np
-import tensorflow as tf
-
-
-# pylint: disable=invalid-name, too-many-locals
-
-MAX_CHAR_LENGTH = 45
-NUM_CHAR_PAD = 2
-
-UNK_WORD, UNK_CHAR, UNK_NER = 0, 0, 0
-PAD_WORD, PAD_CHAR, PAD_NER = 1, 1, 1
-
-# Regular expressions used to normalize digits.
-DIGIT_RE = re.compile(r"\d")
-
-
-def create_vocabs(train_path, dev_path, test_path, normalize_digits=True, min_occur=1, glove_dict=None):
- word_vocab = defaultdict(lambda: len(word_vocab))
- word_count = defaultdict(lambda: 0)
- char_vocab = defaultdict(lambda: len(char_vocab))
- ner_vocab = defaultdict(lambda: len(ner_vocab))
-
- UNK_WORD = word_vocab["<unk>"]
- PAD_WORD = word_vocab["<pad>"]
- UNK_CHAR = char_vocab["<unk>"]
- PAD_CHAR = char_vocab["<pad>"]
- UNK_NER = ner_vocab["<unk>"]
- PAD_NER = ner_vocab["<pad>"]
-
- print("Creating Vocabularies:")
-
- for file_path in [train_path, dev_path, test_path]:
- with open(file_path, 'r') as file:
- for line in file:
- line = line.strip()
- if len(line) == 0:
- continue
-
- tokens = line.split(' ')
- for char in tokens[1]:
- cid = char_vocab[char]
-
- word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
- ner = tokens[4]
-
- if glove_dict is not None and (word in glove_dict or word.lower() in glove_dict):
- word_count[word] += min_occur + 1
- elif file_path == train_path:
- word_count[word] += 1
-
- nid = ner_vocab[ner]
-
- print("Total Vocabulary Size: %d" % len(word_count))
- for word in word_count:
- if word_count[word] > min_occur:
- wid = word_vocab[word]
-
- print("Word Vocabulary Size: %d" % len(word_vocab))
- print("Character Alphabet Size: %d" % len(char_vocab))
- print("NER Alphabet Size: %d" % len(ner_vocab))
-
- word_vocab = defaultdict(lambda: UNK_WORD, word_vocab)
- char_vocab = defaultdict(lambda: UNK_CHAR, char_vocab)
- ner_vocab = defaultdict(lambda: UNK_NER, ner_vocab)
-
- i2w = {v: k for k, v in word_vocab.items()}
- i2n = {v: k for k, v in ner_vocab.items()}
- return (word_vocab, char_vocab, ner_vocab), (i2w, i2n)
-
-
-def read_data(source_path, word_vocab, char_vocab, ner_vocab, normalize_digits=True):
- data = []
- print('Reading data from %s' % source_path)
- counter = 0
- reader = CoNLLReader(source_path, word_vocab, char_vocab, ner_vocab)
- inst = reader.getNext(normalize_digits)
- while inst is not None:
- counter += 1
- sent = inst.sentence
- data.append([sent.word_ids, sent.char_id_seqs, inst.ner_ids])
- inst = reader.getNext(normalize_digits)
-
- reader.close()
- print("Total number of data: %d" % counter)
- return data
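# A note on the vocabulary idiom in create_vocabs() above: a defaultdict
# whose factory returns the dict's current size hands out a fresh
# contiguous id the first time a key is looked up, e.g.
#
#     from collections import defaultdict
#     vocab = defaultdict(lambda: len(vocab))
#     ids = [vocab[w] for w in "to be or not to be".split()]
#     # ids == [0, 1, 2, 3, 0, 1]
#
# Re-wrapping the finished dict as defaultdict(lambda: UNK_WORD, vocab)
# then freezes it: unseen tokens map to the UNK id instead of growing
# the vocabulary.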
-
-
-def iterate_batch(data, batch_size, shuffle=False):
- if shuffle:
- np.random.shuffle(data)
-
- for start_idx in range(0, len(data), batch_size):
- excerpt = slice(start_idx, start_idx + batch_size)
- batch = data[excerpt]
-
- batch_length = max([len(batch[i][0]) for i in range(len(batch))])
-
- wid_inputs = np.empty([len(batch), batch_length], dtype=np.int64)
- cid_inputs = np.empty([len(batch), batch_length, MAX_CHAR_LENGTH], dtype=np.int64)
- nid_inputs = np.empty([len(batch), batch_length], dtype=np.int64)
- masks = np.zeros([len(batch), batch_length], dtype=np.float32)
- lengths = np.empty(len(batch), dtype=np.int64)
-
- for i, inst in enumerate(batch):
- wids, cid_seqs, nids = inst
-
- inst_size = len(wids)
- lengths[i] = inst_size
- # word ids
- wid_inputs[i, :inst_size] = wids
- wid_inputs[i, inst_size:] = PAD_WORD
- for c, cids in enumerate(cid_seqs):
- cid_inputs[i, c, :len(cids)] = cids
- cid_inputs[i, c, len(cids):] = PAD_CHAR
- cid_inputs[i, inst_size:, :] = PAD_CHAR
- nid_inputs[i, :inst_size] = nids
- nid_inputs[i, inst_size:] = PAD_NER
- masks[i, :inst_size] = 1.0
-
- yield wid_inputs, cid_inputs, nid_inputs, masks, lengths
-
-
-def load_glove(filename, emb_dim, normalize_digits=True):
- """Loads embeddings in the GloVe text format, in which each line is
- '<word> <dim_1> <dim_2> ...'. Dimensions of the embedding vector
- are separated with whitespace characters.
-
- Args:
- filename (str): Path to the embedding file.
- emb_dim (int): Expected dimension of the embedding vectors.
- normalize_digits (bool): Whether to map digits to "0".
-
- Returns:
- A dictionary that maps each word to its embedding vector, a
- 1D numpy array of size `emb_dim`.
- """
- glove_dict = dict()
- with tf.gfile.Open(filename) as fin:
- for line in fin:
- vec = line.strip().split()
- if len(vec) == 0:
- continue
- word, vec = vec[0], vec[1:]
- word = tf.compat.as_text(word)
- word = DIGIT_RE.sub("0", word) if normalize_digits else word
- glove_dict[word] = np.array([float(v) for v in vec])
- if len(vec) != emb_dim:
- raise ValueError("Inconsistent word vector sizes: %d vs %d" %
- (len(vec), emb_dim))
- return glove_dict
-
-
-def construct_init_word_vecs(vocab, word_vecs, glove_dict):
- for word, index in vocab.items():
- if word in glove_dict:
- embedding = glove_dict[word]
- elif word.lower() in glove_dict:
- embedding = glove_dict[word.lower()]
- else:
- embedding = None
-
- if embedding is not None:
- word_vecs[index] = embedding
- return word_vecs
-
-
-class CoNLLReader(object):
- def __init__(self, file_path, word_vocab, char_vocab, ner_vocab):
- self.__source_file = open(file_path, 'r', encoding='utf-8')
- self.__word_vocab = word_vocab
- self.__char_vocab = char_vocab
- self.__ner_vocab = ner_vocab
-
- def close(self):
- self.__source_file.close()
-
- def getNext(self, normalize_digits=True):
- line = self.__source_file.readline()
- # skip multiple blank lines.
- while len(line) > 0 and len(line.strip()) == 0: - line = self.__source_file.readline() - if len(line) == 0: - return None - - lines = [] - while len(line.strip()) > 0: - line = line.strip() - lines.append(line.split(' ')) - line = self.__source_file.readline() - - length = len(lines) - if length == 0: - return None - - words = [] - word_ids = [] - char_seqs = [] - char_id_seqs = [] - ner_tags = [] - ner_ids = [] - - for tokens in lines: - chars = [] - char_ids = [] - for char in tokens[1]: - chars.append(char) - char_ids.append(self.__char_vocab[char]) - if len(chars) > MAX_CHAR_LENGTH: - chars = chars[:MAX_CHAR_LENGTH] - char_ids = char_ids[:MAX_CHAR_LENGTH] - char_seqs.append(chars) - char_id_seqs.append(char_ids) - - word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1] - ner = tokens[4] - - words.append(word) - word_ids.append(self.__word_vocab[word]) - - ner_tags.append(ner) - ner_ids.append(self.__ner_vocab[ner]) - - return NERInstance(Sentence(words, word_ids, char_seqs, char_id_seqs), ner_tags, ner_ids) - - -class NERInstance(object): - def __init__(self, sentence, ner_tags, ner_ids): - self.sentence = sentence - self.ner_tags = ner_tags - self.ner_ids = ner_ids - - def length(self): - return self.sentence.length() - - -class Sentence(object): - def __init__(self, words, word_ids, char_seqs, char_id_seqs): - self.words = words - self.word_ids = word_ids - self.char_seqs = char_seqs - self.char_id_seqs = char_id_seqs - - def length(self): - return len(self.words) diff --git a/examples/sequence_tagging/conll_writer.py b/examples/sequence_tagging/conll_writer.py deleted file mode 100644 index 1a92b3f7..00000000 --- a/examples/sequence_tagging/conll_writer.py +++ /dev/null @@ -1,24 +0,0 @@ -__author__ = 'max' - - -class CoNLLWriter(object): - def __init__(self, i2w, i2n): - self.__source_file = None - self.__i2w = i2w - self.__i2n = i2n - - def start(self, file_path): - self.__source_file = open(file_path, 'w', encoding='utf-8') - - def close(self): - self.__source_file.close() - - def write(self, word, predictions, targets, lengths): - batch_size, _ = word.shape - for i in range(batch_size): - for j in range(lengths[i]): - w = self.__i2w[word[i, j]] - tgt = self.__i2n[targets[i, j]] - pred = self.__i2n[predictions[i, j]] - self.__source_file.write('%d %s %s %s %s %s\n' % (j + 1, w, "_", "_", tgt, pred)) - self.__source_file.write('\n') diff --git a/examples/sequence_tagging/conlleval b/examples/sequence_tagging/conlleval deleted file mode 100644 index 12341bae..00000000 --- a/examples/sequence_tagging/conlleval +++ /dev/null @@ -1,315 +0,0 @@ -#!/usr/bin/perl -w -# conlleval: evaluate result of processing CoNLL-2000 shared task -# usage: conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file -# README: http://cnts.uia.ac.be/conll2000/chunking/output.html -# options: l: generate LaTeX output for tables like in -# http://cnts.uia.ac.be/conll2003/ner/example.tex -# r: accept raw result tags (without B- and I- prefix; -# assumes one word per chunk) -# d: alternative delimiter tag (default is single space) -# o: alternative outside tag (default is O) -# note: the file should contain lines with items separated -# by $delimiter characters (default space). The final -# two items should contain the correct tag and the -# guessed tag in that order. Sentences should be -# separated from each other by empty lines or lines -# with $boundary fields (default -X-). 
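For reference, the Perl script below consumes exactly the column layout that `CoNLLWriter` above emits: one token per line, the final two columns holding the gold tag and the predicted tag, with blank lines separating sentences. A toy sketch (hypothetical file name and tags) of producing such input:

```python
# Only the last two columns (gold, predicted) matter for scoring; the
# "_" columns are ignored extra features.
rows = [(1, "John", "B-PER", "B-PER"),
        (2, "lives", "O", "O"),
        (3, "in", "O", "O"),
        (4, "Paris", "B-LOC", "O")]  # one missed entity lowers recall
with open("toy_output", "w") as f:
    for j, w, gold, pred in rows:
        f.write("%d %s %s %s %s %s\n" % (j, w, "_", "_", gold, pred))
    f.write("\n")
# Score it with: perl conlleval < toy_output
```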
-# url: http://lcg-www.uia.ac.be/conll2000/chunking/
-# started: 1998-09-25
-# version: 2004-01-26
-# author: Erik Tjong Kim Sang
-
-use strict;
-
-my $false = 0;
-my $true = 42;
-
-my $boundary = "-X-"; # sentence boundary
-my $correct; # current corpus chunk tag (I,O,B)
-my $correctChunk = 0; # number of correctly identified chunks
-my $correctTags = 0; # number of correct chunk tags
-my $correctType; # type of current corpus chunk tag (NP,VP,etc.)
-my $delimiter = " "; # field delimiter
-my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979)
-my $firstItem; # first feature (for sentence boundary checks)
-my $foundCorrect = 0; # number of chunks in corpus
-my $foundGuessed = 0; # number of identified chunks
-my $guessed; # current guessed chunk tag
-my $guessedType; # type of current guessed chunk tag
-my $i; # miscellaneous counter
-my $inCorrect = $false; # currently processed chunk is correct until now
-my $lastCorrect = "O"; # previous chunk tag in corpus
-my $latex = 0; # generate LaTeX formatted output
-my $lastCorrectType = ""; # type of previously identified chunk tag
-my $lastGuessed = "O"; # previously identified chunk tag
-my $lastGuessedType = ""; # type of previous chunk tag in corpus
-my $lastType; # temporary storage for detecting duplicates
-my $line; # line
-my $nbrOfFeatures = -1; # number of features per line
-my $precision = 0.0; # precision score
-my $oTag = "O"; # outside tag, default O
-my $raw = 0; # raw input: add B to every token
-my $recall = 0.0; # recall score
-my $tokenCounter = 0; # token counter (ignores sentence breaks)
-
-my %correctChunk = (); # number of correctly identified chunks per type
-my %foundCorrect = (); # number of chunks in corpus per type
-my %foundGuessed = (); # number of identified chunks per type
-
-my @features; # features on line
-my @sortedTypes; # sorted list of chunk type names
-
-# sanity check
-while (@ARGV and $ARGV[0] =~ /^-/) {
- if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); }
- elsif ($ARGV[0] eq "-r") { $raw = 1; shift(@ARGV); }
- elsif ($ARGV[0] eq "-d") {
- shift(@ARGV);
- if (not defined $ARGV[0]) {
- die "conlleval: -d requires delimiter character";
- }
- $delimiter = shift(@ARGV);
- } elsif ($ARGV[0] eq "-o") {
- shift(@ARGV);
- if (not defined $ARGV[0]) {
- die "conlleval: -o requires delimiter character";
- }
- $oTag = shift(@ARGV);
- } else { die "conlleval: unknown argument $ARGV[0]\n"; }
-}
-if (@ARGV) { die "conlleval: unexpected command line argument\n"; }
-# process input
-while (<STDIN>) {
- chomp($line = $_);
- @features = split(/$delimiter/,$line);
- if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; }
- elsif ($nbrOfFeatures != $#features and @features != 0) {
- printf STDERR "unexpected number of features: %d (%d)\n",
- $#features+1,$nbrOfFeatures+1;
- exit(1);
- }
- if (@features == 0 or
- $features[0] eq $boundary) { @features = ($boundary,"O","O"); }
- if (@features < 2) {
- die "conlleval: unexpected number of features in line $line\n";
- }
- if ($raw) {
- if ($features[$#features] eq $oTag) { $features[$#features] = "O"; }
- if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; }
- if ($features[$#features] ne "O") {
- $features[$#features] = "B-$features[$#features]";
- }
- if ($features[$#features-1] ne "O") {
- $features[$#features-1] = "B-$features[$#features-1]";
- }
- }
- # 20040126 ET code which allows hyphens in the types
- if ($features[$#features] =~ /^([^-]*)-(.*)$/) {
- $guessed = $1;
- $guessedType = $2;
- } else {
- $guessed = $features[$#features];
- $guessedType = "";
- }
-
pop(@features); - if ($features[$#features] =~ /^([^-]*)-(.*)$/) { - $correct = $1; - $correctType = $2; - } else { - $correct = $features[$#features]; - $correctType = ""; - } - pop(@features); -# ($guessed,$guessedType) = split(/-/,pop(@features)); -# ($correct,$correctType) = split(/-/,pop(@features)); - $guessedType = $guessedType ? $guessedType : ""; - $correctType = $correctType ? $correctType : ""; - $firstItem = shift(@features); - - # 1999-06-26 sentence breaks should always be counted as out of chunk - if ( $firstItem eq $boundary ) { $guessed = "O"; } - - if ($inCorrect) { - if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and - &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and - $lastGuessedType eq $lastCorrectType) { - $inCorrect=$false; - $correctChunk++; - $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? - $correctChunk{$lastCorrectType}+1 : 1; - } elsif ( - &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) != - &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or - $guessedType ne $correctType ) { - $inCorrect=$false; - } - } - - if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and - &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and - $guessedType eq $correctType) { $inCorrect = $true; } - - if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) { - $foundCorrect++; - $foundCorrect{$correctType} = $foundCorrect{$correctType} ? - $foundCorrect{$correctType}+1 : 1; - } - if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) { - $foundGuessed++; - $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ? - $foundGuessed{$guessedType}+1 : 1; - } - if ( $firstItem ne $boundary ) { - if ( $correct eq $guessed and $guessedType eq $correctType ) { - $correctTags++; - } - $tokenCounter++; - } - - $lastGuessed = $guessed; - $lastCorrect = $correct; - $lastGuessedType = $guessedType; - $lastCorrectType = $correctType; -} -if ($inCorrect) { - $correctChunk++; - $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? - $correctChunk{$lastCorrectType}+1 : 1; -} - -if (not $latex) { - # compute overall precision, recall and FB1 (default values are 0.0) - $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); - $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); - $FB1 = 2*$precision*$recall/($precision+$recall) - if ($precision+$recall > 0); - - # print overall performance - printf "processed $tokenCounter tokens with $foundCorrect phrases; "; - printf "found: $foundGuessed phrases; correct: $correctChunk.\n"; - if ($tokenCounter>0) { - printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter; - printf "precision: %6.2f%%; ",$precision; - printf "recall: %6.2f%%; ",$recall; - printf "FB1: %6.2f\n",$FB1; - } -} - -# sort chunk type names -undef($lastType); -@sortedTypes = (); -foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) { - if (not($lastType) or $lastType ne $i) { - push(@sortedTypes,($i)); - } - $lastType = $i; -} -# print performance per chunk type -if (not $latex) { - for $i (@sortedTypes) { - $correctChunk{$i} = $correctChunk{$i} ? 
$correctChunk{$i} : 0; - if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; } - else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } - if (not($foundCorrect{$i})) { $recall = 0.0; } - else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } - if ($precision+$recall == 0.0) { $FB1 = 0.0; } - else { $FB1 = 2*$precision*$recall/($precision+$recall); } - printf "%17s: ",$i; - printf "precision: %6.2f%%; ",$precision; - printf "recall: %6.2f%%; ",$recall; - printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i}; - } -} else { - print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline"; - for $i (@sortedTypes) { - $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0; - if (not($foundGuessed{$i})) { $precision = 0.0; } - else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } - if (not($foundCorrect{$i})) { $recall = 0.0; } - else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } - if ($precision+$recall == 0.0) { $FB1 = 0.0; } - else { $FB1 = 2*$precision*$recall/($precision+$recall); } - printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\", - $i,$precision,$recall,$FB1; - } - print "\\hline\n"; - $precision = 0.0; - $recall = 0; - $FB1 = 0.0; - $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); - $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); - $FB1 = 2*$precision*$recall/($precision+$recall) - if ($precision+$recall > 0); - printf STDOUT "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n", - $precision,$recall,$FB1; -} - -exit 0; - -# endOfChunk: checks if a chunk ended between the previous and current word -# arguments: previous and current chunk tags, previous and current types -# note: this code is capable of handling other chunk representations -# than the default CoNLL-2000 ones, see EACL'99 paper of Tjong -# Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 - -sub endOfChunk { - my $prevTag = shift(@_); - my $tag = shift(@_); - my $prevType = shift(@_); - my $type = shift(@_); - my $chunkEnd = $false; - - if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; } - if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; } - if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; } - if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } - - if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; } - if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; } - if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; } - if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } - - if ($prevTag ne "O" and $prevTag ne "." 
and $prevType ne $type) { - $chunkEnd = $true; - } - - # corrected 1998-12-22: these chunks are assumed to have length 1 - if ( $prevTag eq "]" ) { $chunkEnd = $true; } - if ( $prevTag eq "[" ) { $chunkEnd = $true; } - - return($chunkEnd); -} - -# startOfChunk: checks if a chunk started between the previous and current word -# arguments: previous and current chunk tags, previous and current types -# note: this code is capable of handling other chunk representations -# than the default CoNLL-2000 ones, see EACL'99 paper of Tjong -# Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 - -sub startOfChunk { - my $prevTag = shift(@_); - my $tag = shift(@_); - my $prevType = shift(@_); - my $type = shift(@_); - my $chunkStart = $false; - - if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; } - if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; } - if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; } - if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } - - if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; } - if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; } - if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; } - if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } - - if ($tag ne "O" and $tag ne "." and $prevType ne $type) { - $chunkStart = $true; - } - - # corrected 1998-12-22: these chunks are assumed to have length 1 - if ( $tag eq "[" ) { $chunkStart = $true; } - if ( $tag eq "]" ) { $chunkStart = $true; } - - return($chunkStart); -} diff --git a/examples/sequence_tagging/ner.py b/examples/sequence_tagging/ner.py deleted file mode 100644 index cb717f9b..00000000 --- a/examples/sequence_tagging/ner.py +++ /dev/null @@ -1,219 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Sequence tagging. 
-""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import time -import importlib -import numpy as np -import tensorflow as tf -import texar.tf as tx - -from examples.sequence_tagging.conll_reader import create_vocabs, read_data, iterate_batch, load_glove, construct_init_word_vecs -from examples.sequence_tagging.conll_writer import CoNLLWriter -from examples.sequence_tagging import scores - -flags = tf.flags - -flags.DEFINE_string("data_path", "./data", - "Directory containing NER data (e.g., eng.train.bio.conll).") -flags.DEFINE_string("train", "eng.train.bio.conll", - "the file name of the training data.") -flags.DEFINE_string("dev", "eng.dev.bio.conll", - "the file name of the dev data.") -flags.DEFINE_string("test", "eng.test.bio.conll", - "the file name of the test data.") -flags.DEFINE_string("embedding", "glove.6B.100d.txt", - "the file name of the GloVe embedding.") -flags.DEFINE_string("config", "config", "The config to use.") - -FLAGS = flags.FLAGS - -config = importlib.import_module(FLAGS.config) - -train_path = os.path.join(FLAGS.data_path, FLAGS.train) -dev_path = os.path.join(FLAGS.data_path, FLAGS.dev) -test_path = os.path.join(FLAGS.data_path, FLAGS.test) -embedding_path = os.path.join(FLAGS.data_path, FLAGS.embedding) -EMBEDD_DIM = config.embed_dim -CHAR_DIM = config.char_dim - -# Prepares/loads data -if config.load_glove: - print('loading GloVe embedding...') - glove_dict = load_glove(embedding_path, EMBEDD_DIM) -else: - glove_dict = None - -(word_vocab, char_vocab, ner_vocab), (i2w, i2n) = create_vocabs(train_path, dev_path, test_path, glove_dict=glove_dict) - -data_train = read_data(train_path, word_vocab, char_vocab, ner_vocab) -data_dev = read_data(dev_path, word_vocab, char_vocab, ner_vocab) -data_test = read_data(test_path, word_vocab, char_vocab, ner_vocab) - -scale = np.sqrt(3.0 / EMBEDD_DIM) -word_vecs = np.random.uniform(-scale, scale, [len(word_vocab), EMBEDD_DIM]).astype(np.float32) -if config.load_glove: - word_vecs = construct_init_word_vecs(word_vocab, word_vecs, glove_dict) - -scale = np.sqrt(3.0 / CHAR_DIM) -char_vecs = np.random.uniform(-scale, scale, [len(char_vocab), CHAR_DIM]).astype(np.float32) - -# Builds TF graph -inputs = tf.placeholder(tf.int64, [None, None]) -chars = tf.placeholder(tf.int64, [None, None, None]) -targets = tf.placeholder(tf.int64, [None, None]) -masks = tf.placeholder(tf.float32, [None, None]) -seq_lengths = tf.placeholder(tf.int64, [None]) - -vocab_size = len(word_vecs) -embedder = tx.modules.WordEmbedder(vocab_size=vocab_size, init_value=word_vecs, hparams=config.emb) -emb_inputs = embedder(inputs) - -char_size = len(char_vecs) -char_embedder = tx.modules.WordEmbedder(vocab_size=char_size, init_value=char_vecs, hparams=config.char_emb) -emb_chars = char_embedder(chars) -char_shape = tf.shape(emb_chars) # [batch, length, char_length, char_dim] -emb_chars = tf.reshape(emb_chars, (-1, char_shape[2], CHAR_DIM)) -char_encoder = tx.modules.Conv1DEncoder(config.conv) -char_outputs = char_encoder(emb_chars) -char_outputs = tf.reshape(char_outputs, (char_shape[0], char_shape[1], config.conv['filters'])) - -emb_inputs = tf.concat([emb_inputs, char_outputs], axis=2) -emb_inputs = tf.nn.dropout(emb_inputs, keep_prob=0.67) - -encoder = tx.modules.BidirectionalRNNEncoder(hparams={"rnn_cell_fw": config.cell, "rnn_cell_bw": config.cell}) -outputs, _ = encoder(emb_inputs, sequence_length=seq_lengths) -outputs = tf.concat(outputs, axis=2) - -rnn_shape = 
tf.shape(outputs) -outputs = tf.reshape(outputs, (-1, 2 * config.hidden_size)) - -outputs = tf.layers.dense(outputs, config.tag_space, activation=tf.nn.elu) -outputs = tf.nn.dropout(outputs, keep_prob=config.keep_prob) - -logits = tf.layers.dense(outputs, len(ner_vocab)) - -logits = tf.reshape(logits, tf.concat([rnn_shape[0:2], [len(ner_vocab)]], axis=0)) - -mle_loss = tx.losses.sequence_sparse_softmax_cross_entropy( - labels=targets, - logits=logits, - sequence_length=seq_lengths, - average_across_batch=True, - average_across_timesteps=True, - sum_over_timesteps=False) - -predicts = tf.argmax(logits, axis=2) -corrects = tf.reduce_sum(tf.cast(tf.equal(targets, predicts), tf.float32) * masks) - -global_step = tf.placeholder(tf.int32) -train_op = tx.core.get_train_op( - mle_loss, global_step=global_step, increment_global_step=False, - hparams=config.opt) - -# Training/eval processes - - -def _train_epoch(sess, epoch): - start_time = time.time() - loss = 0. - corr = 0. - num_tokens = 0. - - fetches = { - "mle_loss": mle_loss, - "correct": corrects, - } - fetches["train_op"] = train_op - - mode = tf.estimator.ModeKeys.TRAIN - num_inst = 0 - for batch in iterate_batch(data_train, config.batch_size, shuffle=True): - word, char, ner, mask, length = batch - feed_dict = { - inputs: word, chars: char, targets: ner, masks: mask, seq_lengths: length, - global_step: epoch, tx.global_mode(): mode, - } - - rets = sess.run(fetches, feed_dict) - nums = np.sum(length) - num_inst += len(word) - loss += rets["mle_loss"] * nums - corr += rets["correct"] - num_tokens += nums - - print("train: %d (%d/%d) loss: %.4f, acc: %.2f%%" % (epoch, num_inst, len(data_train), loss / num_tokens, corr / num_tokens * 100)) - print("train: %d loss: %.4f, acc: %.2f%%, time: %.2fs" % (epoch, loss / num_tokens, corr / num_tokens * 100, time.time() - start_time)) - - -def _eval(sess, epoch, data_tag): - fetches = { - "predicts": predicts, - } - mode = tf.estimator.ModeKeys.EVAL - file_name = 'tmp/%s%d' % (data_tag, epoch) - writer = CoNLLWriter(i2w, i2n) - writer.start(file_name) - data = data_dev if data_tag == 'dev' else data_test - for batch in iterate_batch(data, config.batch_size, shuffle=False): - word, char, ner, mask, length = batch - feed_dict = { - inputs: word, chars: char, targets: ner, masks: mask, seq_lengths: length, - global_step: epoch, tx.global_mode(): mode, - } - rets = sess.run(fetches, feed_dict) - predictions = rets['predicts'] - writer.write(word, predictions, ner, length) - writer.close() - acc, precision, recall, f1 = scores.scores(file_name) - print('%s acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (data_tag, acc, precision, recall, f1)) - return acc, precision, recall, f1 - - -with tf.Session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - - dev_f1 = 0.0 - dev_acc = 0.0 - dev_precision = 0.0 - dev_recall = 0.0 - best_epoch = 0 - - test_f1 = 0.0 - test_acc = 0.0 - test_prec = 0.0 - test_recall = 0.0 - - tx.utils.maybe_create_dir('./tmp') - - for epoch in range(config.num_epochs): - _train_epoch(sess, epoch) - acc, precision, recall, f1 = _eval(sess, epoch, 'dev') - if dev_f1 < f1: - dev_f1 = f1 - dev_acc = acc - dev_precision = precision - dev_recall = recall - best_epoch = epoch - test_acc, test_prec, test_recall, test_f1 = _eval(sess, epoch, 'test') - print('best acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%, epoch: %d' % (dev_acc, dev_precision, dev_recall, dev_f1, best_epoch)) - 
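# (Note on the selection logic above: the test_* metrics are refreshed only
# in epochs where the dev F1 improves, so the line below always reports the
# test scores at the best-dev epoch rather than the current epoch.)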
print('test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%, epoch: %d' % (test_acc, test_prec, test_recall, test_f1, best_epoch))
- print('---------------------------------------------------')
diff --git a/examples/sequence_tagging/scores.py b/examples/sequence_tagging/scores.py
deleted file mode 100644
index 54427a78..00000000
--- a/examples/sequence_tagging/scores.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import subprocess
-
-
-def scores(path):
- bashCommand = 'perl conlleval'
- process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE,
- stdin=open(path))
- output, error = process.communicate()
- output = output.decode().split('\n')[1].split('%; ')
- output = [out.split(' ')[-1] for out in output]
- acc, prec, recall, fb1 = tuple(output)
- return float(acc), float(prec), float(recall), float(fb1)
diff --git a/examples/text_style_transfer/.gitignore b/examples/text_style_transfer/.gitignore
deleted file mode 100644
index ba6dc662..00000000
--- a/examples/text_style_transfer/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-/checkpoints/
-/samples/
-/data/
-/yelp.zip
diff --git a/examples/text_style_transfer/README.md b/examples/text_style_transfer/README.md
deleted file mode 100644
index 2f9dd0db..00000000
--- a/examples/text_style_transfer/README.md
+++ /dev/null
@@ -1,108 +0,0 @@
-# Text Style Transfer #
-
-This example implements a simplified variant of the `ctrl-gen` model from
-
-[Toward Controlled Generation of Text](https://arxiv.org/pdf/1703.00955.pdf)
-*Zhiting Hu, Zichao Yang, Xiaodan Liang, Ruslan Salakhutdinov, Eric Xing; ICML 2017*
-
-The model roughly has an architecture of `Encoder--Decoder--Classifier`. Compared to the paper, the following simplifications are made:
-
- * Replaces the base Variational Autoencoder (VAE) model with an attentional Autoencoder (AE) -- VAE is not necessary in the text style transfer setting since we do not need to interpolate the latent space as in the paper.
- * The attribute classifier (i.e., discriminator) is trained with real data only. Samples generated by the decoder are not used.
- * The independence constraint is omitted.
-
-## Usage ##
-
-### Dataset ###
-Download the Yelp sentiment dataset with the following command:
-```
-python prepare_data.py
-```
-
-### Train the model ###
-
-Train the model on the above data to do sentiment transfer.
-```
-python main.py --config config
-```
-
-[config.py](./config.py) contains the data and model configurations.
-
-* The model will first be pre-trained for a few epochs (specified in `config.py`). During pre-training, the `Encoder-Decoder` part is trained as an autoencoder, while the `Classifier` part is trained with the classification labels.
-* Full-training is then performed for another few epochs. During full-training, the `Classifier` part is fixed, and the `Encoder-Decoder` part is trained to fit the classifier, along with continuing to minimize the autoencoding loss.
-
-(**Note:** When using your own dataset, make sure to set `max_decoding_length_train` and `max_decoding_length_infer` in [config.py](https://github.com/asyml/texar/blob/master/examples/text_style_transfer/config.py#L85-L86).)
-
-The training log is printed as below:
-```
-gamma: 1.0, lambda_g: 0.0
-step: 1, loss_d: 0.6903 accu_d: 0.5625
-step: 1, loss_g_clas: 0.6991 loss_g: 9.1452 accu_g: 0.2812 loss_g_ae: 9.1452 accu_g_gdy: 0.2969
-step: 500, loss_d: 0.0989 accu_d: 0.9688
-step: 500, loss_g_clas: 0.2985 loss_g: 3.9696 accu_g: 0.8891 loss_g_ae: 3.9696 accu_g_gdy: 0.7734
-...
-step: 6500, loss_d: 0.0806 accu_d: 0.9703
-step: 6500, loss_g_clas: 5.7137 loss_g: 0.2887 accu_g: 0.0844 loss_g_ae: 0.2887 accu_g_gdy: 0.0625
-epoch: 1, loss_d: 0.0876 accu_d: 0.9719
-epoch: 1, loss_g_clas: 6.7360 loss_g: 0.2195 accu_g: 0.0627 loss_g_ae: 0.2195 accu_g_gdy: 0.0642
-val: accu_g: 0.0445 loss_g_ae: 0.1302 accu_d: 0.9774 bleu: 90.7896 loss_g: 0.1302 loss_d: 0.0666 loss_g_clas: 7.0310 accu_g_gdy: 0.0482
-...
-
-```
-where:
-- `loss_d` and `accu_d` are the classification loss/accuracy of the `Classifier` part.
-- `loss_g_clas` is the classification loss of the generated sentences.
-- `loss_g_ae` is the autoencoding loss.
-- `loss_g` is the joint loss `= loss_g_ae + lambda_g * loss_g_clas`.
-- `accu_g` is the classification accuracy of the generated sentences with soft representations (i.e., Gumbel-softmax).
-- `accu_g_gdy` is the classification accuracy of the generated sentences with greedy decoding.
-- `bleu` is the BLEU score between the generated and input sentences.
-
-## Results ##
-
-Text style transfer has two primary goals:
-1. The generated sentence should have the desired attribute (e.g., positive/negative sentiment)
-2. The generated sentence should keep the content of the original one
-
-We use automatic metrics to evaluate both:
-* For (1), we can use a pre-trained classifier to classify the generated sentences and evaluate the accuracy (the higher the better). We have not implemented a stand-alone classifier for evaluation in this code, though adding one would be straightforward. The `Classifier` part in the model gives a reasonably good estimate (i.e., `accu_g_gdy` in the above) of the accuracy.
-* For (2), we evaluate the BLEU score between the generated sentences and the original sentences, i.e., `bleu` in the above (the higher the better; see [Yang et al., 2018](https://arxiv.org/pdf/1805.11749.pdf) for more details).
-
-The implementation here gives the following performance after 10 epochs of pre-training and 2 epochs of full-training:
-
-| Accuracy (by the `Classifier` part) | BLEU (with the original sentence) |
-| -------------------------------------| ----------------------------------|
-| 0.92 | 54.0 |
-
-Also refer to the following papers that used this code and compared to other text style transfer approaches:
-
-* [Unsupervised Text Style Transfer using Language Models as Discriminators](https://papers.nips.cc/paper/7959-unsupervised-text-style-transfer-using-language-models-as-discriminators.pdf). Zichao Yang, Zhiting Hu, Chris Dyer, Eric Xing, Taylor Berg-Kirkpatrick. NeurIPS 2018
-* [Structured Content Preservation for Unsupervised Text Style Transfer](https://arxiv.org/pdf/1810.06526.pdf). Youzhi Tian, Zhiting Hu, Zhou Yu. 2018
-
-### Samples ###
-Here are some randomly-picked samples. In each pair, the first sentence is the original and the second is the generated one.
-```
-go to place for client visits with gorgeous views .
-go to place for client visits with lacking views .
-
-there was lots of people but they still managed to provide great service .
-there was lots of people but they still managed to provide careless service .
-
-this was the best dining experience i have ever had .
-this was the worst dining experience i have ever had .
-
-needless to say , we skipped desert .
-gentle to say , we edgy desert .
-
-the first time i was missing an entire sandwich and a side of fries .
-the first time i was beautifully an entire sandwich and a side of fries .
-
-her boutique has a fabulous selection of designer brands !
-her annoying has a sketchy selection of bland warned ! - -service is pretty good . -service is trashy rude . - -ok nothing new . -exceptional impressed new . -``` diff --git a/examples/text_style_transfer/config.py b/examples/text_style_transfer/config.py deleted file mode 100644 index af67a813..00000000 --- a/examples/text_style_transfer/config.py +++ /dev/null @@ -1,107 +0,0 @@ -"""Config -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=invalid-name - -import copy - -# Total number of training epochs (including pre-train and full-train) -max_nepochs = 12 -pretrain_nepochs = 10 # Number of pre-train epochs (training as autoencoder) -display = 500 # Display the training results every N training steps. -# Display the dev results every N training steps (set to a -# very large value to disable it). -display_eval = 1e10 - -sample_path = './samples' -checkpoint_path = './checkpoints' -restore = '' # Model snapshot to restore from - -lambda_g = 0.1 # Weight of the classification loss -gamma_decay = 0.5 # Gumbel-softmax temperature anneal rate - -train_data = { - 'batch_size': 64, - # 'seed': 123, - 'datasets': [ - { - 'files': './data/yelp/sentiment.train.text', - 'vocab_file': './data/yelp/vocab', - 'data_name': '' - }, - { - 'files': './data/yelp/sentiment.train.labels', - 'data_type': 'int', - 'data_name': 'labels' - } - ], - 'name': 'train' -} - -val_data = copy.deepcopy(train_data) -val_data['datasets'][0]['files'] = './data/yelp/sentiment.dev.text' -val_data['datasets'][1]['files'] = './data/yelp/sentiment.dev.labels' - -test_data = copy.deepcopy(train_data) -test_data['datasets'][0]['files'] = './data/yelp/sentiment.test.text' -test_data['datasets'][1]['files'] = './data/yelp/sentiment.test.labels' - -model = { - 'dim_c': 200, - 'dim_z': 500, - 'embedder': { - 'dim': 100, - }, - 'encoder': { - 'rnn_cell': { - 'type': 'GRUCell', - 'kwargs': { - 'num_units': 700 - }, - 'dropout': { - 'input_keep_prob': 0.5 - } - } - }, - 'decoder': { - 'rnn_cell': { - 'type': 'GRUCell', - 'kwargs': { - 'num_units': 700, - }, - 'dropout': { - 'input_keep_prob': 0.5, - 'output_keep_prob': 0.5 - }, - }, - 'attention': { - 'type': 'BahdanauAttention', - 'kwargs': { - 'num_units': 700, - }, - 'attention_layer_size': 700, - }, - 'max_decoding_length_train': 21, - 'max_decoding_length_infer': 20, - }, - 'classifier': { - 'kernel_size': [3, 4, 5], - 'filters': 128, - 'other_conv_kwargs': {'padding': 'same'}, - 'dropout_conv': [1], - 'dropout_rate': 0.5, - 'num_dense_layers': 0, - 'num_classes': 1 - }, - 'opt': { - 'optimizer': { - 'type': 'AdamOptimizer', - 'kwargs': { - 'learning_rate': 5e-4, - }, - }, - }, -} diff --git a/examples/text_style_transfer/ctrl_gen_model.py b/examples/text_style_transfer/ctrl_gen_model.py deleted file mode 100644 index 3c320458..00000000 --- a/examples/text_style_transfer/ctrl_gen_model.py +++ /dev/null @@ -1,186 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
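`ctrl_gen_model.py` below feeds the classifier with soft samples drawn through a `GumbelSoftmaxEmbeddingHelper` at temperature `gamma`. As a reminder of what that relaxation computes, here is a minimal NumPy sketch of standard Gumbel-softmax sampling (an illustration only, not Texar's implementation):

```python
import numpy as np

def gumbel_softmax(logits, tau, rng):
    # Add Gumbel(0, 1) noise, then take a temperature-scaled softmax.
    g = -np.log(-np.log(rng.uniform(size=logits.shape)))
    y = (logits + g) / tau
    y = np.exp(y - y.max())
    return y / y.sum()

rng = np.random.default_rng(0)
logits = np.array([2.0, 1.0, 0.1])
print(gumbel_softmax(logits, tau=1.0, rng=rng))   # soft, differentiable sample
print(gumbel_softmax(logits, tau=0.01, rng=rng))  # nearly one-hot as tau -> 0
```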
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Text style transfer -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=invalid-name, too-many-locals - -import tensorflow as tf - -import texar.tf as tx -from texar.tf.modules import WordEmbedder, UnidirectionalRNNEncoder, \ - MLPTransformConnector, AttentionRNNDecoder, \ - GumbelSoftmaxEmbeddingHelper, Conv1DClassifier -from texar.tf.core import get_train_op -from texar.tf.utils import collect_trainable_variables, get_batch_size - - -class CtrlGenModel(object): - """Control - """ - - def __init__(self, inputs, vocab, gamma, lambda_g, hparams=None): - self._hparams = tx.HParams(hparams, None) - self._build_model(inputs, vocab, gamma, lambda_g) - - def _build_model(self, inputs, vocab, gamma, lambda_g): - """Builds the model. - """ - embedder = WordEmbedder( - vocab_size=vocab.size, - hparams=self._hparams.embedder) - encoder = UnidirectionalRNNEncoder(hparams=self._hparams.encoder) - - # text_ids for encoder, with BOS token removed - enc_text_ids = inputs['text_ids'][:, 1:] - enc_outputs, final_state = encoder(embedder(enc_text_ids), - sequence_length=inputs['length'] - 1) - z = final_state[:, self._hparams.dim_c:] - - # Encodes label - label_connector = MLPTransformConnector(self._hparams.dim_c) - - # Gets the sentence representation: h = (c, z) - labels = tf.cast(tf.reshape(inputs['labels'], [-1, 1]), tf.float32) - c = label_connector(labels) - c_ = label_connector(1 - labels) - h = tf.concat([c, z], 1) - h_ = tf.concat([c_, z], 1) - - # Teacher-force decoding and the auto-encoding loss for G - decoder = AttentionRNNDecoder( - memory=enc_outputs, - memory_sequence_length=inputs['length'] - 1, - cell_input_fn=lambda inputs, attention: inputs, - vocab_size=vocab.size, - hparams=self._hparams.decoder) - - connector = MLPTransformConnector(decoder.state_size) - - g_outputs, _, _ = decoder( - initial_state=connector(h), inputs=inputs['text_ids'], - embedding=embedder, sequence_length=inputs['length'] - 1) - - loss_g_ae = tx.losses.sequence_sparse_softmax_cross_entropy( - labels=inputs['text_ids'][:, 1:], - logits=g_outputs.logits, - sequence_length=inputs['length'] - 1, - average_across_timesteps=True, - sum_over_timesteps=False) - - # Gumbel-softmax decoding, used in training - start_tokens = tf.ones_like(inputs['labels']) * vocab.bos_token_id - end_token = vocab.eos_token_id - gumbel_helper = GumbelSoftmaxEmbeddingHelper( - embedder.embedding, start_tokens, end_token, gamma) - - soft_outputs_, _, soft_length_, = decoder( - helper=gumbel_helper, initial_state=connector(h_)) - - # Greedy decoding, used in eval - outputs_, _, length_ = decoder( - decoding_strategy='infer_greedy', initial_state=connector(h_), - embedding=embedder, start_tokens=start_tokens, end_token=end_token) - - # Creates classifier - classifier = Conv1DClassifier(hparams=self._hparams.classifier) - clas_embedder = WordEmbedder(vocab_size=vocab.size, - hparams=self._hparams.embedder) - - # Classification loss for the classifier - clas_logits, clas_preds = classifier( - inputs=clas_embedder(ids=inputs['text_ids'][:, 1:]), - sequence_length=inputs['length'] - 1) - loss_d_clas = tf.nn.sigmoid_cross_entropy_with_logits( - labels=tf.cast(inputs['labels'], tf.float32), logits=clas_logits) - loss_d_clas = tf.reduce_mean(loss_d_clas) - accu_d = tx.evals.accuracy(labels=inputs['labels'], preds=clas_preds) - - # Classification loss for the 
generator, based on soft samples - soft_logits, soft_preds = classifier( - inputs=clas_embedder(soft_ids=soft_outputs_.sample_id), - sequence_length=soft_length_) - loss_g_clas = tf.nn.sigmoid_cross_entropy_with_logits( - labels=tf.cast(1 - inputs['labels'], tf.float32), - logits=soft_logits) - loss_g_clas = tf.reduce_mean(loss_g_clas) - - # Accuracy on soft samples, for training progress monitoring - accu_g = tx.evals.accuracy(labels=1 - inputs['labels'], - preds=soft_preds) - - # Accuracy on greedy-decoded samples, for training progress monitoring - _, gdy_preds = classifier( - inputs=clas_embedder(ids=outputs_.sample_id), - sequence_length=length_) - accu_g_gdy = tx.evals.accuracy( - labels=1 - inputs['labels'], preds=gdy_preds) - - # Aggregates losses - loss_g = loss_g_ae + lambda_g * loss_g_clas - loss_d = loss_d_clas - - # Creates optimizers - g_vars = collect_trainable_variables( - [embedder, encoder, label_connector, connector, decoder]) - d_vars = collect_trainable_variables([clas_embedder, classifier]) - - train_op_g = get_train_op( - loss_g, g_vars, hparams=self._hparams.opt) - train_op_g_ae = get_train_op( - loss_g_ae, g_vars, hparams=self._hparams.opt) - train_op_d = get_train_op( - loss_d, d_vars, hparams=self._hparams.opt) - - # Interface tensors - self.losses = { - "loss_g": loss_g, - "loss_g_ae": loss_g_ae, - "loss_g_clas": loss_g_clas, - "loss_d": loss_d_clas - } - self.metrics = { - "accu_d": accu_d, - "accu_g": accu_g, - "accu_g_gdy": accu_g_gdy, - } - self.train_ops = { - "train_op_g": train_op_g, - "train_op_g_ae": train_op_g_ae, - "train_op_d": train_op_d - } - self.samples = { - "original": inputs['text_ids'][:, 1:], - "transferred": outputs_.sample_id - } - - self.fetches_train_g = { - "loss_g": self.train_ops["train_op_g"], - "loss_g_ae": self.losses["loss_g_ae"], - "loss_g_clas": self.losses["loss_g_clas"], - "accu_g": self.metrics["accu_g"], - "accu_g_gdy": self.metrics["accu_g_gdy"], - } - self.fetches_train_d = { - "loss_d": self.train_ops["train_op_d"], - "accu_d": self.metrics["accu_d"] - } - fetches_eval = {"batch_size": get_batch_size(inputs['text_ids'])} - fetches_eval.update(self.losses) - fetches_eval.update(self.metrics) - fetches_eval.update(self.samples) - self.fetches_eval = fetches_eval diff --git a/examples/text_style_transfer/main.py b/examples/text_style_transfer/main.py deleted file mode 100644 index 0dd8fc37..00000000 --- a/examples/text_style_transfer/main.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
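`main.py` below holds the Gumbel temperature `gamma` at 1 during pre-training, then decays it geometrically (with a floor of 0.001) while switching the classification weight `lambda_g` on. A small sketch of the resulting schedule, using the values from `config.py` above:

```python
# Closed form of the annealing loop in main.py (a sketch).
def schedule(epoch, pretrain_nepochs=10, gamma_decay=0.5, lambda_g=0.1):
    if epoch <= pretrain_nepochs:
        return 1.0, 0.0
    gamma = max(0.001, gamma_decay ** (epoch - pretrain_nepochs))
    return gamma, lambda_g

for epoch in range(9, 13):
    print(epoch, schedule(epoch))  # gamma: 1.0, 1.0, 0.5, 0.25
```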
-"""Text style transfer - -This is a simplified implementation of: - -Toward Controlled Generation of Text, ICML2017 -Zhiting Hu, Zichao Yang, Xiaodan Liang, Ruslan Salakhutdinov, Eric Xing - -Download the data with the cmd: - -$ python prepare_data.py - -Train the model with the cmd: - -$ python main.py --config config -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=invalid-name, too-many-locals, too-many-arguments, no-member - -import os -import importlib -import numpy as np -import tensorflow as tf -import texar.tf as tx - -from ctrl_gen_model import CtrlGenModel - -flags = tf.flags - -flags.DEFINE_string('config', 'config', 'The config to use.') - -FLAGS = flags.FLAGS - -config = importlib.import_module(FLAGS.config) - - -def _main(_): - # Data - train_data = tx.data.MultiAlignedData(config.train_data) - val_data = tx.data.MultiAlignedData(config.val_data) - test_data = tx.data.MultiAlignedData(config.test_data) - vocab = train_data.vocab(0) - - # Each training batch is used twice: once for updating the generator and - # once for updating the discriminator. Feedable data iterator is used for - # such case. - iterator = tx.data.FeedableDataIterator( - {'train_g': train_data, 'train_d': train_data, - 'val': val_data, 'test': test_data}) - batch = iterator.get_next() - - # Model - gamma = tf.placeholder(dtype=tf.float32, shape=[], name='gamma') - lambda_g = tf.placeholder(dtype=tf.float32, shape=[], name='lambda_g') - model = CtrlGenModel(batch, vocab, gamma, lambda_g, config.model) - - def _train_epoch(sess, gamma_, lambda_g_, epoch, verbose=True): - avg_meters_d = tx.utils.AverageRecorder(size=10) - avg_meters_g = tx.utils.AverageRecorder(size=10) - - step = 0 - while True: - try: - step += 1 - feed_dict = { - iterator.handle: iterator.get_handle(sess, 'train_d'), - gamma: gamma_, - lambda_g: lambda_g_ - } - - vals_d = sess.run(model.fetches_train_d, feed_dict=feed_dict) - avg_meters_d.add(vals_d) - - feed_dict = { - iterator.handle: iterator.get_handle(sess, 'train_g'), - gamma: gamma_, - lambda_g: lambda_g_ - } - vals_g = sess.run(model.fetches_train_g, feed_dict=feed_dict) - avg_meters_g.add(vals_g) - - if verbose and (step == 1 or step % config.display == 0): - print('step: {}, {}'.format(step, avg_meters_d.to_str(4))) - print('step: {}, {}'.format(step, avg_meters_g.to_str(4))) - - if verbose and step % config.display_eval == 0: - iterator.restart_dataset(sess, 'val') - _eval_epoch(sess, gamma_, lambda_g_, epoch) - - except tf.errors.OutOfRangeError: - print('epoch: {}, {}'.format(epoch, avg_meters_d.to_str(4))) - print('epoch: {}, {}'.format(epoch, avg_meters_g.to_str(4))) - break - - def _eval_epoch(sess, gamma_, lambda_g_, epoch, val_or_test='val'): - avg_meters = tx.utils.AverageRecorder() - - while True: - try: - feed_dict = { - iterator.handle: iterator.get_handle(sess, val_or_test), - gamma: gamma_, - lambda_g: lambda_g_, - tx.context.global_mode(): tf.estimator.ModeKeys.EVAL - } - - vals = sess.run(model.fetches_eval, feed_dict=feed_dict) - - batch_size = vals.pop('batch_size') - - # Computes BLEU - samples = tx.utils.dict_pop(vals, list(model.samples.keys())) - hyps = tx.utils.map_ids_to_strs(samples['transferred'], vocab) - - refs = tx.utils.map_ids_to_strs(samples['original'], vocab) - refs = np.expand_dims(refs, axis=1) - - bleu = tx.evals.corpus_bleu_moses(refs, hyps) - vals['bleu'] = bleu - - avg_meters.add(vals, weight=batch_size) - - # Writes samples - 
- tx.utils.write_paired_text(
- refs.squeeze(), hyps,
- os.path.join(config.sample_path, 'val.%d' % epoch),
- append=True, mode='v')
-
- except tf.errors.OutOfRangeError:
- print('{}: {}'.format(
- val_or_test, avg_meters.to_str(precision=4)))
- break
-
- return avg_meters.avg()
-
- tf.gfile.MakeDirs(config.sample_path)
- tf.gfile.MakeDirs(config.checkpoint_path)
-
- # Runs the logic
- with tf.Session() as sess:
- sess.run(tf.global_variables_initializer())
- sess.run(tf.local_variables_initializer())
- sess.run(tf.tables_initializer())
-
- saver = tf.train.Saver(max_to_keep=None)
- if config.restore:
- print('Restore from: {}'.format(config.restore))
- saver.restore(sess, config.restore)
-
- iterator.initialize_dataset(sess)
-
- gamma_ = 1.
- lambda_g_ = 0.
- for epoch in range(1, config.max_nepochs + 1):
- if epoch > config.pretrain_nepochs:
- # Anneals the Gumbel-softmax temperature
- gamma_ = max(0.001, gamma_ * config.gamma_decay)
- lambda_g_ = config.lambda_g
- print('gamma: {}, lambda_g: {}'.format(gamma_, lambda_g_))
-
- # Train
- iterator.restart_dataset(sess, ['train_g', 'train_d'])
- _train_epoch(sess, gamma_, lambda_g_, epoch)
-
- # Val
- iterator.restart_dataset(sess, 'val')
- _eval_epoch(sess, gamma_, lambda_g_, epoch, 'val')
-
- saver.save(
- sess, os.path.join(config.checkpoint_path, 'ckpt'), epoch)
-
- # Test
- iterator.restart_dataset(sess, 'test')
- _eval_epoch(sess, gamma_, lambda_g_, epoch, 'test')
-
-
-if __name__ == '__main__':
- tf.app.run(main=_main)
diff --git a/examples/text_style_transfer/prepare_data.py b/examples/text_style_transfer/prepare_data.py
deleted file mode 100644
index 72d85b4d..00000000
--- a/examples/text_style_transfer/prepare_data.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright 2018 The Texar Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Downloads data.
-"""
-import texar.tf as tx
-
-# pylint: disable=invalid-name
-
-
-def prepare_data():
- """Downloads data.
- """
- tx.data.maybe_download(
- urls='https://drive.google.com/file/d/'
- '1HaUKEYDBEk6GlJGmXwqYteB-4rS9q8Lg/view?usp=sharing',
- path='./',
- filenames='yelp.zip',
- extract=True)
-
-
-def main():
- """Entrypoint.
- """
- prepare_data()
-
-
-if __name__ == '__main__':
- main()
diff --git a/examples/torchtext/.gitignore b/examples/torchtext/.gitignore
deleted file mode 100644
index 57dd7a8b..00000000
--- a/examples/torchtext/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-.data/
-.vector_cache/
diff --git a/examples/torchtext/README.md b/examples/torchtext/README.md
deleted file mode 100644
index ae9b0be5..00000000
--- a/examples/torchtext/README.md
+++ /dev/null
@@ -1,11 +0,0 @@
-# Data loading with torchtext #
-
-This example demonstrates the use of the [torchtext](https://github.com/pytorch/text) package as a data loader for Texar models.
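The `BatchFirstBPTTIterator` implemented in `batchfirst_bptt.py` below reshapes the whole numericalized corpus into a `[batch_size, -1]` matrix of contiguous streams and slices out consecutive `bptt_len` windows, with targets shifted one step ahead. A toy NumPy illustration of that layout (assuming the stream length is already a multiple of `batch_size`; the real iterator pads with the pad token first):

```python
import numpy as np

stream = np.arange(20)                   # stand-in for the numericalized text
batch_size, bptt_len = 2, 4
data = stream.reshape(batch_size, -1)    # [2, 10]; each row is a contiguous stream
num_batches = (data.shape[1] - 1) // bptt_len
for b in range(num_batches):
    i = b * bptt_len
    text = data[:, i:i + bptt_len]             # [batch_size, bptt_len]
    target = data[:, i + 1:i + 1 + bptt_len]   # same tokens, shifted by one
    print(text.tolist(), "->", target.tolist())
```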
- -## Usage ## - -The following command trains a small-sized language model on PTB: - -``` -python lm_torchtext.py --config config_small -``` diff --git a/examples/torchtext/batchfirst_bptt.py b/examples/torchtext/batchfirst_bptt.py deleted file mode 100644 index 386f7dfb..00000000 --- a/examples/torchtext/batchfirst_bptt.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math -from torchtext.data import BPTTIterator, Dataset, Batch - - -class BatchFirstBPTTIterator(BPTTIterator): - """Defines an iterator for language modeling tasks that use BPTT. - - Provides contiguous streams of examples together with targets that are - one timestep further forward, for language modeling training with - backpropagation through time (BPTT). Expects a Dataset with a single - example and a single field called 'text' and produces Batches with text and - target attributes. - - All batches will have sizes [batch_size, bptt_len] - - Attributes: - dataset: The Dataset object to load Examples from. - batch_size: Batch size. - bptt_len: Length of sequences for backpropagation through time. - sort_key: A key to use for sorting examples in order to batch together - examples with similar lengths and minimize padding. The sort_key - provided to the Iterator constructor overrides the sort_key - attribute of the Dataset, or defers to it if None. - train: Whether the iterator represents a train set. - repeat: Whether to repeat the iterator for multiple epochs. - shuffle: Whether to shuffle examples between epochs. - sort: Whether to sort examples according to self.sort_key. - Note that repeat, shuffle, and sort default to train, train, and - (not train). - device: Device to create batches on. Use -1 for CPU and None for the - currently active GPU device. - """ - - def __len__(self): - return math.floor( - (len(self.dataset[0].text) / self.batch_size - 1) / self.bptt_len) - - def __iter__(self): - text = self.dataset[0].text - TEXT = self.dataset.fields['text'] - TEXT.eos_token = None - pad_num = int(math.ceil(len(text) / self.batch_size) * - self.batch_size - len(text)) - text = text + ([TEXT.pad_token] * pad_num) - data = TEXT.numericalize([text], device=self.device) - data = data.view(self.batch_size, -1).contiguous() - dataset = Dataset(examples=self.dataset.examples, - fields=[('text', TEXT), ('target', TEXT)]) - while True: - for i in range(0, len(self) * self.bptt_len, self.bptt_len): - self.iterations += 1 - seq_len = self.bptt_len - yield Batch.fromvars( - dataset, self.batch_size, - text=data[:, i:i + seq_len], - target=data[:, i + 1:i + 1 + seq_len]) - if not self.repeat: - return diff --git a/examples/torchtext/config_small.py b/examples/torchtext/config_small.py deleted file mode 100644 index 7cfebc7c..00000000 --- a/examples/torchtext/config_small.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PTB LM small size config. -""" - -# pylint: disable=invalid-name, too-few-public-methods, missing-docstring - -init_scale = 0.1 -num_epochs = 13 -hidden_size = 200 -keep_prob = 1.0 -batch_size = 20 -num_steps = 20 - -cell = { - "type": "LSTMBlockCell", - "kwargs": { - "num_units": hidden_size, - "forget_bias": 0. - }, - "dropout": {"output_keep_prob": keep_prob}, - "num_layers": 2 -} -emb = { - "dim": hidden_size -} -opt = { - "optimizer": { - "type": "GradientDescentOptimizer", - "kwargs": {"learning_rate": 1.0} - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": {"clip_norm": 5.} - }, - "learning_rate_decay": { - "type": "exponential_decay", - "kwargs": { - "decay_steps": 1, - "decay_rate": 0.5, - "staircase": True - }, - "start_decay_step": 3 - } -} diff --git a/examples/torchtext/lm_torchtext.py b/examples/torchtext/lm_torchtext.py deleted file mode 100644 index b3f81f4c..00000000 --- a/examples/torchtext/lm_torchtext.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Language Modeling example using torchtext -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import time -import importlib -import numpy as np -import tensorflow as tf -import texar.tf as tx - -from torchtext import data -from torchtext import datasets - -from batchfirst_bptt import BatchFirstBPTTIterator - -# pylint: disable=invalid-name, too-many-locals, no-member - -flags = tf.flags - -flags.DEFINE_string("data_path", "./", - "Directory containing PTB raw data (e.g., ptb.train.txt). " - "E.g., ./simple-examples/data. 
If not exists, " - "the directory will be created and PTB raw data will " - "be downloaded.") -flags.DEFINE_string("config", "config_small", "The config to use.") - -FLAGS = flags.FLAGS - -config = importlib.import_module(FLAGS.config) - - -def _main(_): - # Data - batch_size = config.batch_size - num_steps = config.num_steps - - # setup vocabulary and data iterators with torchtext - TEXT = data.Field() - # make splits for data - train, valid, test = datasets.PennTreebank.splits(TEXT) - - # build the vocabulary - TEXT.build_vocab(train, vectors=None) - vocab_size = len(TEXT.vocab) - - # make iterator for splits - train_iter, valid_iter, test_iter = BatchFirstBPTTIterator.splits( - (train, valid, test), batch_size=batch_size, bptt_len=num_steps, - repeat=False) - - inputs = tf.placeholder(tf.int32, [batch_size, num_steps]) - targets = tf.placeholder(tf.int32, [batch_size, num_steps]) - - # Model architecture - initializer = tf.random_uniform_initializer( - -config.init_scale, config.init_scale) - with tf.variable_scope("model", initializer=initializer): - embedder = tx.modules.WordEmbedder( - vocab_size=vocab_size, hparams=config.emb) - emb_inputs = embedder(inputs) - if config.keep_prob < 1: - emb_inputs = tf.nn.dropout( - emb_inputs, tx.utils.switch_dropout(config.keep_prob)) - - decoder = tx.modules.BasicRNNDecoder( - vocab_size=vocab_size, hparams={"rnn_cell": config.cell}) - initial_state = decoder.zero_state(batch_size, tf.float32) - outputs, final_state, seq_lengths = decoder( - decoding_strategy="train_greedy", - impute_finished=True, - inputs=emb_inputs, - sequence_length=[num_steps] * batch_size, - initial_state=initial_state) - - # Losses & train ops - mle_loss = tx.losses.sequence_sparse_softmax_cross_entropy( - labels=targets, - logits=outputs.logits, - sequence_length=seq_lengths) - - # Use global_step to pass epoch, for lr decay - global_step = tf.placeholder(tf.int32) - train_op = tx.core.get_train_op( - mle_loss, global_step=global_step, increment_global_step=False, - hparams=config.opt) - - def _run_epoch(sess, data_iter, epoch, is_train=False, verbose=False): - start_time = time.time() - loss = 0. 
- iters = 0 - state = sess.run(initial_state) - - fetches = { - "mle_loss": mle_loss, - "final_state": final_state, - } - if is_train: - fetches["train_op"] = train_op - - mode = (tf.estimator.ModeKeys.TRAIN - if is_train - else tf.estimator.ModeKeys.EVAL) - epoch_size = (len(train) // batch_size - 1) // num_steps - for step, data_batch in enumerate(data_iter): - feed_dict = { - inputs: data_batch.text, - targets: data_batch.target, - global_step: epoch, - tx.global_mode(): mode, - } - for i, (c, h) in enumerate(initial_state): - feed_dict[c] = state[i].c - feed_dict[h] = state[i].h - - rets = sess.run(fetches, feed_dict) - loss += rets["mle_loss"] - state = rets["final_state"] - iters += num_steps - - ppl = np.exp(loss / iters) - if verbose and step % (epoch_size // 10) == 10: - print("%.3f perplexity: %.3f speed: %.0f wps" % - (step * 1.0 / epoch_size, ppl, - iters * batch_size / (time.time() - start_time))) - - ppl = np.exp(loss / iters) - return ppl - - with tf.Session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - - for epoch in range(config.num_epochs): - # Train - train_ppl = _run_epoch( - sess, train_iter, epoch, is_train=True, verbose=True) - print("Epoch: %d Train Perplexity: %.3f" % (epoch, train_ppl)) - # Valid - valid_ppl = _run_epoch(sess, valid_iter, epoch) - print("Epoch: %d Valid Perplexity: %.3f" % (epoch, valid_ppl)) - # Test - test_ppl = _run_epoch(sess, test_iter, 0) - print("Test Perplexity: %.3f" % (test_ppl)) - - -if __name__ == '__main__': - tf.app.run(main=_main) diff --git a/examples/torchtext/requirements.txt b/examples/torchtext/requirements.txt deleted file mode 100644 index 979fc59a..00000000 --- a/examples/torchtext/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -# also make sure install PyTorch 0.4.0 or newer. -torchtext >= 0.2.3 diff --git a/examples/transformer/.gitignore b/examples/transformer/.gitignore deleted file mode 100644 index 2162c213..00000000 --- a/examples/transformer/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -/data/ -/temp/ -/outputs/ diff --git a/examples/transformer/README.md b/examples/transformer/README.md deleted file mode 100644 index 0c180f91..00000000 --- a/examples/transformer/README.md +++ /dev/null @@ -1,167 +0,0 @@ -# Transformer for Machine Translation # - -This is an implementation of the Transformer model described in [Vaswani, Ashish, et al. "Attention is all you need."](http://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf). - -[Quick Start](https://github.com/asyml/texar/tree/master/examples/transformer#quick-start): Prerequisites & use on machine translation datasets - -[Run Your Customized Experiments](https://github.com/asyml/texar/tree/master/examples/transformer#run-your-customized-experiments): Hands-on tutorial of data preparation, configuration, and model training/test - -## Quick Start ## - -### Prerequisites ### - -Run the following cmd to install necessary packages for the example: -```bash -pip install -r requirements.txt -``` - -### Datasets ### - -Two example datasets are provided: -- IWSLT'15 **EN-VI** for English-Vietnamese translation -- WMT'14 **EN-DE** for English-German translation - -Download and pre-process the **IWSLT'15 EN-VI** data with the following cmds: -```bash -sh scripts/iwslt15_en_vi.sh -sh preprocess_data.sh spm en vi -``` -By default, the downloaded dataset is in `./data/en_vi`. 
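Once the two commands above have finished, one quick sanity check is to load the learned sentencepiece model and round-trip a sentence through it. This is an illustrative sketch only; the model path assumes the default output locations described here:

```python
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("temp/run_en_vi_spm/data/spm-codes.32000.model")
pieces = sp.EncodeAsPieces("machine translation is fun .")
print(pieces)                   # subword pieces produced by the spm model
print(sp.DecodePieces(pieces))  # decodes back to the original text
```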
-As with the [official implementation](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py), `spm` (`sentencepiece`) encoding is used to encode the raw text as data pre-processing. The encoded data is by default in `./temp/run_en_vi_spm`. - -For the **WMT'14 EN-DE** data, download and pre-process with: -```bash -sh scripts/wmt14_en_de.sh -sh preprocess_data.sh bpe en de -``` - -By default, the downloaded dataset is in `./data/en_de`. -Note that for this dataset, `bpe` encoding (Byte pair encoding) is used instead. The encoded data is by default in `./temp/run_en_de_bpe`. - -### Train and evaluate the model ### - -Train the model with the cmd: -```bash -python transformer_main.py --run_mode=train_and_evaluate --config_model=config_model --config_data=config_iwslt15 -``` -* Specify `--model_dir` to dump model checkpoints, training logs, and tensorboard summaries to a desired directory. By default it is set to `./outputs`. -* Specifying `--model_dir` will also restore the latest model checkpoint under the directory, if any checkpoint is there. -* Specify `--config_data=config_wmt14` to train on the WMT'14 data. - -### Test a trained model ### - -To only evaluate a model checkpoint without training, first load the checkpoint and generate samples: -```bash -python transformer_main.py --run_mode=test --config_data=config_iwslt15 --model_dir=./outputs -``` -The latest checkpoint in `./outputs` is used. Generated samples are in the file `./outputs/test.output.hyp`, and reference sentences are in the file `./outputs/test.output.ref`. - -Next, decode the samples with the respective decoder, and evaluate with `bleu_tool`: -```bash -../../bin/utils/spm_decode --infile ./outputs/test.output.hyp --outfile temp/test.output.spm --model temp/run_en_vi_spm/data/spm-codes.32000.model --input_format=piece - -python bleu_tool.py --reference=data/en_vi/test.vi --translation=temp/test.output.spm -``` - -For WMT'14, the corresponding cmds are: -```bash -# Loads model and generates samples -python transformer_main.py --run_mode=test --config_data=config_wmt14 --model_dir=./outputs - -# BPE decoding -cat outputs/test.output.hyp | sed -E 's/(@@ )|(@@ ?$)//g' > temp/test.output.bpe - -# Evaluates BLEU -python bleu_tool.py --reference=data/en_de/test.de --translation=temp/test.output.bpe -``` - -### Results - -* On IWSLT'15, the implementation achieves around `BLEU_cased=28.54` and `BLEU_uncased=29.30` (by [bleu_tool.py](./bleu_tool.py)), which are comparable to the base_single_gpu results by the [official implementation](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py) (`28.12` and `28.97`, respectively, as reported [here](https://github.com/tensorflow/tensor2tensor/pull/611)). - -* On WMT'14, the implementation achieves around `BLEU_cased=25.12` (setting: base_single_gpu, batch_size=3072). - - -### Example training log - -``` -12:02:02,686:INFO:step:500 loss: 7.3735 -12:04:20,035:INFO:step:1000 loss:6.1502 -12:06:37,550:INFO:step:1500 loss:5.4877 -``` -Using an Nvidia GTX 1080Ti, the model usually converges within 5 hours (~15 epochs) on IWSLT'15. - ---- - -## Run Your Customized Experiments - -Here is a hands-on tutorial on running Transformer with your own customized dataset. - -### 1. Prepare raw data - -Create a data directory and put the raw data in the directory. To be compatible with the data preprocessing in the next step, you may follow the convention below: - -* The data directory should be named `data/${src}_${tgt}/`. For example, for the data downloaded with `scripts/iwslt15_en_vi.sh`, the data directory is `data/en_vi`. -* The raw data should have 6 files, which contain the source and target sentences of the training/dev/test sets, respectively. In the `iwslt15_en_vi` example, `data/en_vi/train.en` contains the source sentences of the training set, where each line is a sentence. The other files are `train.vi`, `dev.en`, `dev.vi`, `test.en`, `test.vi`.
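If you want to sanity-check this layout programmatically before preprocessing, a small illustrative check (a hypothetical helper, not part of the example) could be:

```python
import os

src, tgt = "en", "vi"  # adjust to your language pair
data_dir = "data/{}_{}".format(src, tgt)
for split in ("train", "dev", "test"):
    for lang in (src, tgt):
        path = os.path.join(data_dir, "{}.{}".format(split, lang))
        assert os.path.isfile(path), "missing raw data file: " + path
print("all 6 raw data files found under " + data_dir)
```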
- -### 2. Preprocess the data - -To obtain the processed dataset, run -```bash -preprocess_data.sh ${encoder} ${src} ${tgt} ${vocab_size} ${max_seq_length} -``` -where - -* The `encoder` parameter can be `bpe` (byte pair encoding), `spm` (sentence piece encoding), or -`raw` (no subword encoding). -* `vocab_size` is optional. The default is 32000. - - At this point, this parameter is used only when `encoder` is set to `bpe` or `spm`. For `raw` encoding, you'd have to truncate the vocabulary by yourself. - - For `spm` encoding, the preprocessing may fail (due to the Python sentencepiece module) if `vocab_size` is too large. So you may want to try a smaller `vocab_size` if that happens. -* `max_seq_length` is optional. The default is 70. - -In the `iwslt15_en_vi` example, the cmd is `sh preprocess_data.sh spm en vi`. - -By default, the preprocessed data are dumped under `temp/run_${src}_${tgt}_${encoder}`. In the `iwslt15_en_vi` example, the directory is `temp/run_en_vi_spm`. - -If you choose the `raw` encoding method, notice that: - -- By default, the word embedding layer is built with the combination of the source vocabulary and the target vocabulary. For example, if the source vocabulary is of size 3K and the target vocabulary is of size 3K and there is no overlap between the two vocabularies, then the final vocabulary used in the model is of size 6K. -- By default, the final output layer of the transformer decoder (hidden_state -> logits) shares its parameters with the word embedding layer.
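To see what the preprocessing step writes, the dumped arrays and the vocabulary pickle can be inspected directly. The sketch below mirrors how `utils/data_utils.py` (further down) loads them; the paths assume the `iwslt15_en_vi` spm defaults:

```python
import pickle

import numpy as np

input_dir = "temp/run_en_vi_spm/data"
train = np.load(input_dir + "/processed.train.npy",
                encoding="latin1", allow_pickle=True).tolist()
with open(input_dir + "/processed.vocab.pickle", "rb") as f:
    id2w = pickle.load(f)  # dict mapping token id -> subword string

src_ids, tgt_ids = train[0]  # one (source, target) pair of token-id arrays
print("vocab size:", len(id2w))
print("first source sentence:", " ".join(id2w[int(i)] for i in src_ids))
```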
- -### 3. Specify data and model configuration - -Customize the Python configuration files to configure the model and data. - -Please refer to the example configuration files `config_model.py` for model configuration and `config_iwslt15.py` for data configuration. - -### 4. Train the model - -Train the model with the following cmd: -```bash -python transformer_main.py --run_mode=train_and_evaluate --config_model=custom_config_model --config_data=custom_config_data -``` -where the model and data configuration files are `custom_config_model.py` and `custom_config_data.py`, respectively. - -Outputs such as model checkpoints are by default under `outputs/`. - -### 5. Test the model - -Test with the following cmd: -```bash -python transformer_main.py --run_mode=test --config_data=custom_config_data --model_dir=./outputs -``` - -Generated samples on the test set are in `outputs/test.output.hyp`, and reference sentences are in `outputs/test.output.ref`. If you've used `bpe` or `spm` encoding in the data preprocessing step, the text in these files is in the respective encoding too. To decode, use the respective cmd: -```bash -# BPE decoding -cat outputs/test.output.hyp | sed -E 's/(@@ )|(@@ ?$)//g' > temp/test.output.hyp.final - -# SPM decoding (take `iwslt15_en_vi` for example) -../../bin/utils/spm_decode --infile ./outputs/test.output.hyp --outfile temp/test.output.hyp.final --model temp/run_en_vi_spm/data/spm-codes.32000.model --input_format=piece -``` - -Finally, to evaluate the BLEU score against the ground truth on the test set: -```bash -python bleu_tool.py --reference=your_reference_file --translation=temp/test.output.hyp.final -``` -E.g., in the `iwslt15_en_vi` example, with `--reference=data/en_vi/test.vi` diff --git a/examples/transformer/bleu_tool.py b/examples/transformer/bleu_tool.py deleted file mode 100755 index 052ac863..00000000 --- a/examples/transformer/bleu_tool.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright 2018 The Tensor2Tensor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Modifications copyright (C) 2018 Texar -# ============================================================================== -"""BLEU metric utilities used for MT eval. - -Usage: python bleu_tool.py --translation=my-wmt13.de --reference=wmt13_deen.de -""" -# This also: -# Put compounds in ATAT format (comparable to papers like GNMT, ConvS2S). -# See https://nlp.stanford.edu/projects/nmt/ : -# 'Also, for historical reasons, we split compound words, e.g., -# "rich-text format" --> rich ##AT##-##AT## text format."' -# BLEU score will be similar to the one obtained using: mteval-v14.pl -# Note: compound splitting is not implemented in this module - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -from argparse import ArgumentParser -from io import open -import collections -import math -import re -import sys -import unicodedata - -# Dependency imports - -import numpy as np -import six -# pylint: disable=redefined-builtin -from six.moves import xrange -from six.moves import zip - - -# pylint: enable=redefined-builtin - - -def _get_ngrams(segment, max_order): - """Extracts all n-grams up to a given maximum order from an input segment. - - Args: - segment: text segment from which n-grams will be extracted. - max_order: maximum length in tokens of the n-grams returned by this - method. - - Returns: - The Counter containing all n-grams up to max_order in segment - with a count of how many times each n-gram occurred. - """ - ngram_counts = collections.Counter() - for order in xrange(1, max_order + 1): - for i in xrange(0, len(segment) - order + 1): - ngram = tuple(segment[i:i + order]) - ngram_counts[ngram] += 1 - return ngram_counts - - -def compute_bleu(reference_corpus, - translation_corpus, - max_order=4, - use_bp=True): - """Computes BLEU score of translated segments against references. - - Args: - reference_corpus: list of references for each translation. Each - reference should be tokenized into a list of tokens. - translation_corpus: list of translations to score. Each translation - should be tokenized into a list of tokens. - max_order: Maximum n-gram order to use when computing BLEU score. - use_bp: boolean, whether to apply brevity penalty. - Returns: - BLEU score. - """ - - reference_length = 0 - translation_length = 0 - bp = 1.0 - geo_mean = 0 - - matches_by_order = [0] * max_order - possible_matches_by_order = [0] * max_order - precisions = [] - - for (references, translations) in zip(reference_corpus, translation_corpus): - reference_length += len(references) - translation_length += len(translations) - ref_ngram_counts = _get_ngrams(references, max_order) - translation_ngram_counts = _get_ngrams(translations, max_order) - - overlap = dict((ngram, - min(count, translation_ngram_counts[ngram])) - for ngram, count in ref_ngram_counts.items()) - - for ngram in overlap: - matches_by_order[len(ngram) - 1] += overlap[ngram] - for ngram in translation_ngram_counts: - possible_matches_by_order[len(ngram) - 1] += \ - translation_ngram_counts[ngram] - precisions = [0] * max_order - smooth = 1.0 - for i in xrange(0, max_order): - if possible_matches_by_order[i] > 0: - if matches_by_order[i] > 0: - precisions[i] = matches_by_order[i] / \ - possible_matches_by_order[i] - else: - smooth *= 2 - precisions[i] = 1.0 / (smooth * possible_matches_by_order[i]) - else: - precisions[i] = 0.0 - - if max(precisions) > 0: - p_log_sum = sum(math.log(p) for p in precisions if p) - geo_mean = math.exp(p_log_sum / max_order) - - if use_bp: - ratio = translation_length / reference_length - if ratio <= 0: - bp = 0 - elif ratio < 1.0: - bp = math.exp(1 - 1. / ratio) - else: - bp = 1.0 - bleu = geo_mean * bp - return np.float32(bleu)
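As a quick illustration of `compute_bleu` above: it returns the brevity penalty times the geometric mean of the modified n-gram precisions, BLEU = BP * exp((1/N) * sum_n log p_n). A toy smoke test with made-up token lists:

```python
refs = [["the", "cat", "sat", "on", "the", "mat"]]
hyps = [["the", "cat", "sat", "on", "mat"]]
# n-gram precisions here are 5/5, 3/4, 2/3, 1/2; brevity penalty exp(1 - 6/5)
print(compute_bleu(refs, hyps))  # ~0.58
```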
- - -class UnicodeRegex(object): - """Ad-hoc hack to recognize all punctuation and symbols.""" - # pylint:disable=too-few-public-methods - def __init__(self): - punctuation = self.property_chars("P") - self.nondigit_punct_re = re.compile(r"([^\d])([" + punctuation + r"])") - self.punct_nondigit_re = re.compile(r"([" + punctuation + r"])([^\d])") - self.symbol_re = re.compile("([" + self.property_chars("S") + "])") - - def property_chars(self, prefix): - # pylint:disable=no-self-use - return "".join(six.unichr(x) for x in range(sys.maxunicode) - if unicodedata.category( - six.unichr(x)).startswith(prefix)) - - -uregex = UnicodeRegex() - - -def bleu_tokenize(string): - r"""Tokenize a string following the official BLEU implementation. - - See https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L954-L983 - In our case, the input string is expected to be just one line - and no HTML entities de-escaping is needed. - So we just tokenize on punctuation and symbols, - except when a punctuation is preceded and followed by a digit - (e.g. a comma/dot as a thousand/decimal separator). - - Note that a number (e.g. a year) followed by a dot at the end of a sentence - is NOT tokenized, - i.e. the dot stays with the number because `s/(\p{P})(\P{N})/ $1 $2/g` - does not match this case (unless we add a space after each sentence). - However, this error is already in the original mteval-v14.pl - and we want to be consistent with it.
- - Args: - string: the input string - - Returns: - a list of tokens - """ - string = uregex.nondigit_punct_re.sub(r"\1 \2 ", string) - string = uregex.punct_nondigit_re.sub(r" \1 \2", string) - string = uregex.symbol_re.sub(r" \1 ", string) - return string.split() - - -def bleu_wrapper(ref_filename, hyp_filename, case_sensitive=False): - """Compute BLEU for two files (reference and hypothesis translation).""" - ref_lines = open(ref_filename, encoding='utf-8').read().splitlines() - hyp_lines = open(hyp_filename, encoding='utf-8').read().splitlines() - assert len(ref_lines) == len(hyp_lines) - if not case_sensitive: - ref_lines = [x.lower() for x in ref_lines] - hyp_lines = [x.lower() for x in hyp_lines] - ref_tokens = [bleu_tokenize(x) for x in ref_lines] - hyp_tokens = [bleu_tokenize(x) for x in hyp_lines] - return compute_bleu(ref_tokens, hyp_tokens) - - -if __name__ == "__main__": - parser = ArgumentParser(description='Compute BLEU score. \ - Usage: t2t-bleu --translation=my-wmt13.de --reference=wmt13_deen.de') - - parser.add_argument('--translation', type=str) - parser.add_argument('--reference', type=str) - args = parser.parse_args() - - bleu = 100 * bleu_wrapper(args.reference, - args.translation, - case_sensitive=False) - print("BLEU_uncased = %6.2f" % bleu) - bleu = 100 * bleu_wrapper(args.reference, - args.translation, - case_sensitive=True) - print("BLEU_cased = %6.2f" % bleu) diff --git a/examples/transformer/config_iwslt15.py b/examples/transformer/config_iwslt15.py deleted file mode 100644 index ff460516..00000000 --- a/examples/transformer/config_iwslt15.py +++ /dev/null @@ -1,12 +0,0 @@ -batch_size = 2048 -test_batch_size = 64 - -max_train_epoch = 20 -display_steps = 500 -eval_steps = 2000 - -max_decoding_length = 256 - -filename_prefix = "processed." 
-input_dir = 'temp/run_en_vi_spm/data' -vocab_file = input_dir + '/processed.vocab.pickle' diff --git a/examples/transformer/config_model.py b/examples/transformer/config_model.py deleted file mode 100644 index c0fe31a5..00000000 --- a/examples/transformer/config_model.py +++ /dev/null @@ -1,67 +0,0 @@ -"""Configurations of Transformer model -""" -import copy -import texar.tf as tx - -random_seed = 1234 -beam_width = 5 -length_penalty = 0.6 -hidden_dim = 512 - -emb = { - 'name': 'lookup_table', - 'dim': hidden_dim, - 'initializer': { - 'type': 'random_normal_initializer', - 'kwargs': { - 'mean': 0.0, - 'stddev': hidden_dim**-0.5, - }, - } -} - -position_embedder_hparams = { - 'dim': hidden_dim -} - -encoder = { - 'dim': hidden_dim, - 'num_blocks': 6, - 'multihead_attention': { - 'num_heads': 8, - 'output_dim': hidden_dim - # See documentation for more optional hyperparameters - }, - 'initializer': { - 'type': 'variance_scaling_initializer', - 'kwargs': { - 'scale': 1.0, - 'mode': 'fan_avg', - 'distribution': 'uniform', - }, - }, - 'poswise_feedforward': tx.modules.default_transformer_poswise_net_hparams( - output_dim=hidden_dim) -} - -decoder = copy.deepcopy(encoder) - -loss_label_confidence = 0.9 - -opt = { - 'optimizer': { - 'type': 'AdamOptimizer', - 'kwargs': { - 'beta1': 0.9, - 'beta2': 0.997, - 'epsilon': 1e-9 - } - } -} - -lr = { - 'learning_rate_schedule': 'constant.linear_warmup.rsqrt_decay.rsqrt_depth', - 'lr_constant': 2 * (hidden_dim ** -0.5), - 'static_lr': 1e-3, - 'warmup_steps': 16000, -} diff --git a/examples/transformer/config_wmt14.py b/examples/transformer/config_wmt14.py deleted file mode 100644 index 12fe439f..00000000 --- a/examples/transformer/config_wmt14.py +++ /dev/null @@ -1,12 +0,0 @@ -batch_size = 3072 -test_batch_size = 64 - -max_train_epoch = 10 -display_steps = 500 -eval_steps = 2000 - -max_decoding_length = 256 - -filename_prefix = "processed." -input_dir = 'temp/run_en_de_bpe/data' -vocab_file = input_dir + '/processed.vocab.pickle' diff --git a/examples/transformer/preprocess_data.sh b/examples/transformer/preprocess_data.sh deleted file mode 100644 index 112a7dfd..00000000 --- a/examples/transformer/preprocess_data.sh +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#!/usr/bin/env bash - -########################################################################### - -# This file provides a script to preprocess raw text corpora to generate -# vocabulary with sentence piece encoding or byte pairwise encoding. -# -# By default, the vocab size is 32000 and maximum sequence length is 70. 
-########################################################################### - - -TF=$(pwd) - -export PATH=$PATH:$TF/../../bin/utils/ -encoder=$1 -src_language=$2 -tgt_language=$3 -vocab_size=${4:-32000} -max_seq_length=${5:-70} - -# update these variables -data=${TF}"/data/${src_language}_${tgt_language}" -name="run_${src_language}_${tgt_language}_${encoder}" -out="temp/${name}" - -train_src=$data/train.${src_language} -train_tgt=$data/train.${tgt_language} -valid_src=$data/dev.${src_language} -valid_tgt=$data/dev.${tgt_language} -test_src=$data/test.${src_language} -test_tgt=$data/test.${tgt_language} - -#====== EXPERIMENT BEGIN ====== -echo "Output dir = $out" -[ -d $out ] || mkdir -p $out -[ -d $out/data ] || mkdir -p $out/data -[ -d $out/test ] || mkdir -p $out/test - -echo "Step 1a: Preprocess inputs" - -case ${encoder} in - 'spm') - echo "Learning Word Piece on source and target combined" - spm_train --input=${train_src},${train_tgt} --vocab_size ${vocab_size} --model_prefix=$out/data/spm-codes.${vocab_size} - spm_encode --model $out/data/spm-codes.${vocab_size}.model --output_format=piece --infile $train_src --outfile $out/data/train.${src_language}.spm - spm_encode --model $out/data/spm-codes.${vocab_size}.model --output_format=piece --infile $valid_src --outfile $out/data/valid.${src_language}.spm - spm_encode --model $out/data/spm-codes.${vocab_size}.model --output_format=piece --infile $test_src --outfile $out/data/test.${src_language}.spm - spm_encode --model $out/data/spm-codes.${vocab_size}.model --output_format=piece --infile $train_tgt --outfile $out/data/train.${tgt_language}.spm - spm_encode --model $out/data/spm-codes.${vocab_size}.model --output_format=piece --infile $valid_tgt --outfile $out/data/valid.${tgt_language}.spm - spm_encode --model $out/data/spm-codes.${vocab_size}.model --output_format=piece --infile ${test_tgt} --outfile $out/data/test.${tgt_language}.spm - cp ${test_tgt} ${out}/test/test.${tgt_language} ;; - 'bpe'): - echo "Learning Byte Pairwise on source and target combined" - cat ${train_src} ${train_tgt} | learn_bpe -s ${vocab_size} > ${out}/data/bpe-codes.${vocab_size} - apply_bpe -c ${out}/data/bpe-codes.${vocab_size} < ${train_src} > $out/data/train.${src_language}.bpe - apply_bpe -c ${out}/data/bpe-codes.${vocab_size} < ${valid_src} > ${out}/data/valid.${src_language}.bpe - apply_bpe -c ${out}/data/bpe-codes.${vocab_size} < ${test_src} > ${out}/data/test.${src_language}.bpe - apply_bpe -c ${out}/data/bpe-codes.${vocab_size} < ${train_tgt} > $out/data/train.${tgt_language}.bpe - apply_bpe -c ${out}/data/bpe-codes.${vocab_size} < ${valid_tgt} > ${out}/data/valid.${tgt_language}.bpe - apply_bpe -c ${out}/data/bpe-codes.${vocab_size} < ${test_tgt} > ${out}/data/test.${tgt_language}.bpe - cp ${test_tgt} ${out}/test/test.${tgt_language} ;; - 'raw'): - echo "No subword encoding is applied, just copy the corpus files into correct directory" - cp ${train_src} $out/data/train.${src_language}.raw - cp ${valid_src} $out/data/valid.${src_language}.raw - cp ${test_src} $out/data/test.${src_language}.raw - cp ${train_tgt} $out/data/train.${tgt_language}.raw - cp ${valid_tgt} $out/data/valid.${tgt_language}.raw - cp ${test_tgt} $out/data/test.${tgt_language}.raw -esac -# TODO(zhiting): Truncate vocab when encoder==raw - -python ${TF}/utils/preprocess.py -i ${out}/data \ - --src ${src_language}.${encoder} \ - --tgt ${tgt_language}.${encoder} \ - --save_data processed. 
\ - --max_seq_length=${max_seq_length} \ - --pre_encoding=${encoder} diff --git a/examples/transformer/requirements.txt b/examples/transformer/requirements.txt deleted file mode 100644 index 413606a0..00000000 --- a/examples/transformer/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -torchtext -torch -sentencepiece diff --git a/examples/transformer/run.sh b/examples/transformer/run.sh deleted file mode 100644 index cca8c7ba..00000000 --- a/examples/transformer/run.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -python3 transformer_main.py --run_mode=train_and_evaluate --config_model=config_model --config_data=config_iwslt15 diff --git a/examples/transformer/scripts/iwslt15_en_vi.sh b/examples/transformer/scripts/iwslt15_en_vi.sh deleted file mode 100644 index caa1288f..00000000 --- a/examples/transformer/scripts/iwslt15_en_vi.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/sh -# Copied from https://github.com/tensorflow/nmt/blob/master/nmt/scripts/download_iwslt15.sh -# -# Download small-scale IWSLT15 Vietnamese to English translation data for NMT -# model training. -# -# Usage: -# ./download_iwslt15.sh path-to-output-dir -# -# If output directory is not specified, "./iwslt15" will be used as the default -# output directory. - -OUT_DIR="${1:-data/en_vi}" -SITE_PREFIX="https://nlp.stanford.edu/projects/nmt/data" - -mkdir -v -p $OUT_DIR - -# Download the iwslt15 small dataset from the Stanford website. -echo "Download training dataset train.en and train.vi." -curl -o "$OUT_DIR/train.en" "$SITE_PREFIX/iwslt15.en-vi/train.en" -curl -o "$OUT_DIR/train.vi" "$SITE_PREFIX/iwslt15.en-vi/train.vi" - -echo "Download dev dataset tst2012.en and tst2012.vi." -curl -o "$OUT_DIR/dev.en" "$SITE_PREFIX/iwslt15.en-vi/tst2012.en" -curl -o "$OUT_DIR/dev.vi" "$SITE_PREFIX/iwslt15.en-vi/tst2012.vi" - -echo "Download test dataset tst2013.en and tst2013.vi." -curl -o "$OUT_DIR/test.en" "$SITE_PREFIX/iwslt15.en-vi/tst2013.en" -curl -o "$OUT_DIR/test.vi" "$SITE_PREFIX/iwslt15.en-vi/tst2013.vi" diff --git a/examples/transformer/scripts/wmt14_en_de.sh b/examples/transformer/scripts/wmt14_en_de.sh deleted file mode 100755 index 0aa7662c..00000000 --- a/examples/transformer/scripts/wmt14_en_de.sh +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env bash - -# This code was adapted from the Tensorflow NMT toolkit on 03/24/2018. -# URL: https://raw.githubusercontent.com/tensorflow/nmt/master/nmt/scripts/wmt16_en_de.sh - -# Copyright 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -e -OUTPUT_DIR="data/en_de/" -DOWNLOADED_DATA_DIR="data/en_de_temp/" -OUTPUT_DIR_CACHE="${DOWNLOADED_DATA_DIR}/cache" -echo "Writing to ${OUTPUT_DIR_CACHE}. To change this, set the OUTPUT_DIR_CACHE environment variable." -mkdir -p $DOWNLOADED_DATA_DIR -mkdir -p ${OUTPUT_DIR_CACHE} -mkdir -p ${OUTPUT_DIR} -if [ ! -f ${DOWNLOADED_DATA_DIR}/europarl-v7-de-en.tgz ]; then - echo "Downloading Europarl v7. This may take a while..."
- curl -o ${DOWNLOADED_DATA_DIR}/europarl-v7-de-en.tgz \ - http://www.statmt.org/europarl/v7/de-en.tgz -else - echo "${DOWNLOADED_DATA_DIR}/europarl-v7-de-en.tgz already exists." -fi - -if [ ! -f ${DOWNLOADED_DATA_DIR}/common-crawl.tgz ]; then - echo "Downloading Common Crawl corpus. This may take a while..." - curl -o ${DOWNLOADED_DATA_DIR}/common-crawl.tgz \ - http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz -else - echo "${DOWNLOADED_DATA_DIR}/common-crawl.tgz already exists." -fi - -if [ ! -f ${DOWNLOADED_DATA_DIR}/nc-v11.tgz ]; then - echo "Downloading News Commentary v11. This may take a while..." - curl -o ${DOWNLOADED_DATA_DIR}/nc-v11.tgz \ - http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz -else - echo "${DOWNLOADED_DATA_DIR}/nc-v11.tgz already exists" -fi - -if [ ! -f ${DOWNLOADED_DATA_DIR}/dev.tgz ]; then - echo "Downloading dev/test sets" - curl -o ${DOWNLOADED_DATA_DIR}/dev.tgz \ - http://data.statmt.org/wmt16/translation-task/dev.tgz -else - echo "${DOWNLOADED_DATA_DIR}/dev.tgz already exists" -fi - -if [ ! -f ${DOWNLOADED_DATA_DIR}/test.tgz ]; then - curl -o ${DOWNLOADED_DATA_DIR}/test.tgz \ - http://data.statmt.org/wmt16/translation-task/test.tgz -else - echo "${DOWNLOADED_DATA_DIR}/test.tgz already exists" -fi - -# Extract everything -echo "Extracting all files..." -if [ ! -d ${DOWNLOADED_DATA_DIR}/europarl-v7-de-en ]; then - mkdir -p "${DOWNLOADED_DATA_DIR}/europarl-v7-de-en" - tar -xvzf "${DOWNLOADED_DATA_DIR}/europarl-v7-de-en.tgz" -C "${DOWNLOADED_DATA_DIR}/europarl-v7-de-en" - mkdir -p "${DOWNLOADED_DATA_DIR}/common-crawl" - tar -xvzf "${DOWNLOADED_DATA_DIR}/common-crawl.tgz" -C "${DOWNLOADED_DATA_DIR}/common-crawl" - mkdir -p "${DOWNLOADED_DATA_DIR}/nc-v11" - tar -xvzf "${DOWNLOADED_DATA_DIR}/nc-v11.tgz" -C "${DOWNLOADED_DATA_DIR}/nc-v11" - mkdir -p "${DOWNLOADED_DATA_DIR}/dev" - tar -xvzf "${DOWNLOADED_DATA_DIR}/dev.tgz" -C "${DOWNLOADED_DATA_DIR}/dev" - mkdir -p "${DOWNLOADED_DATA_DIR}/test" - tar -xvzf "${DOWNLOADED_DATA_DIR}/test.tgz" -C "${DOWNLOADED_DATA_DIR}/test" -else - echo "the tar files have been unzipped" -fi - -# Concatenate Training data -wc -l ${DOWNLOADED_DATA_DIR}/europarl-v7-de-en/europarl-v7.de-en.en -wc -l ${DOWNLOADED_DATA_DIR}/common-crawl/commoncrawl.de-en.en -wc -l ${DOWNLOADED_DATA_DIR}/nc-v11/training-parallel-nc-v11/news-commentary-v11.de-en.en - -cat "${DOWNLOADED_DATA_DIR}/europarl-v7-de-en/europarl-v7.de-en.en" \ - "${DOWNLOADED_DATA_DIR}/common-crawl/commoncrawl.de-en.en" \ - "${DOWNLOADED_DATA_DIR}/nc-v11/training-parallel-nc-v11/news-commentary-v11.de-en.en" \ - > "${OUTPUT_DIR_CACHE}/train.en" &&\ -wc -l "${OUTPUT_DIR_CACHE}/train.en" - -cat "${DOWNLOADED_DATA_DIR}/europarl-v7-de-en/europarl-v7.de-en.de" \ - "${DOWNLOADED_DATA_DIR}/common-crawl/commoncrawl.de-en.de" \ - "${DOWNLOADED_DATA_DIR}/nc-v11/training-parallel-nc-v11/news-commentary-v11.de-en.de" \ - > "${OUTPUT_DIR_CACHE}/train.de" &&\ -wc -l "${OUTPUT_DIR_CACHE}/train.de" - -# Clone Moses -if [ ! 
-d "${OUTPUT_DIR_CACHE}/mosesdecoder" ]; then - echo "Cloning moses for data processing" - git clone https://github.com/moses-smt/mosesdecoder.git "${OUTPUT_DIR_CACHE}/mosesdecoder" -fi - -${OUTPUT_DIR_CACHE}/mosesdecoder/scripts/ems/support/input-from-sgm.perl \ - < ${DOWNLOADED_DATA_DIR}/dev/dev/newstest2014-deen-src.de.sgm \ - > ${DOWNLOADED_DATA_DIR}/dev/dev/newstest2014.de -${OUTPUT_DIR_CACHE}/mosesdecoder/scripts/ems/support/input-from-sgm.perl \ - < ${DOWNLOADED_DATA_DIR}/dev/dev/newstest2014-deen-ref.en.sgm \ - > ${DOWNLOADED_DATA_DIR}/dev/dev/newstest2014.en - -# Copy dev/test data to output dir -cp ${DOWNLOADED_DATA_DIR}/dev/dev/newstest20*.de ${OUTPUT_DIR_CACHE} -cp ${DOWNLOADED_DATA_DIR}/dev/dev/newstest20*.en ${OUTPUT_DIR_CACHE} - -# Tokenize data -for f in ${OUTPUT_DIR_CACHE}/*.de; do - echo "Tokenizing $f..." - ${OUTPUT_DIR_CACHE}/mosesdecoder/scripts/tokenizer/tokenizer.perl -q -l de -threads 8 < $f > ${f%.*}.tok.de -done - -for f in ${OUTPUT_DIR_CACHE}/*.en; do - echo "Tokenizing $f..." - ${OUTPUT_DIR_CACHE}/mosesdecoder/scripts/tokenizer/tokenizer.perl -q -l en -threads 8 < $f > ${f%.*}.tok.en -done - -# Clean train corpora -for f in ${OUTPUT_DIR_CACHE}/train.tok.en; do - fbase=${f%.*} - echo "Cleaning ${fbase}..." - ${OUTPUT_DIR_CACHE}/mosesdecoder/scripts/training/clean-corpus-n.perl $fbase de en "${fbase}.clean" 1 80 -done - -cp ${OUTPUT_DIR_CACHE}/train.tok.clean.en ${OUTPUT_DIR}/train.en -cp ${OUTPUT_DIR_CACHE}/train.tok.clean.de ${OUTPUT_DIR}/train.de -cp ${OUTPUT_DIR_CACHE}/newstest2013.tok.en ${OUTPUT_DIR}/dev.en -cp ${OUTPUT_DIR_CACHE}/newstest2013.tok.de ${OUTPUT_DIR}/dev.de -cp ${OUTPUT_DIR_CACHE}/newstest2014.tok.en ${OUTPUT_DIR}/test.en -cp ${OUTPUT_DIR_CACHE}/newstest2014.tok.de ${OUTPUT_DIR}/test.de diff --git a/examples/transformer/transformer_main.py b/examples/transformer/transformer_main.py deleted file mode 100644 index abb01597..00000000 --- a/examples/transformer/transformer_main.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright 2019 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Transformer model. 
-""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import pickle -import random -import os -import importlib -import tensorflow as tf -from torchtext import data -import texar.tf as tx -from texar.tf.modules import TransformerEncoder, TransformerDecoder -from texar.tf.utils import transformer_utils - -from utils import data_utils, utils -from utils.preprocess import bos_token_id, eos_token_id -from bleu_tool import bleu_wrapper - -# pylint: disable=invalid-name, too-many-locals - -flags = tf.flags - -flags.DEFINE_string("config_model", "config_model", "The model config.") -flags.DEFINE_string("config_data", "config_iwslt15", "The dataset config.") -flags.DEFINE_string("run_mode", "train_and_evaluate", - "Either train_and_evaluate or test.") -flags.DEFINE_string("model_dir", "./outputs", - "Directory to save the trained model and logs.") - -FLAGS = flags.FLAGS - -config_model = importlib.import_module(FLAGS.config_model) -config_data = importlib.import_module(FLAGS.config_data) - -utils.set_random_seed(config_model.random_seed) - - -def main(): - """Entrypoint. - """ - # Load data - train_data, dev_data, test_data = data_utils.load_data_numpy( - config_data.input_dir, config_data.filename_prefix) - with open(config_data.vocab_file, 'rb') as f: - id2w = pickle.load(f) - vocab_size = len(id2w) - - beam_width = config_model.beam_width - - # Create logging - tx.utils.maybe_create_dir(FLAGS.model_dir) - logging_file = os.path.join(FLAGS.model_dir, 'logging.txt') - logger = utils.get_logger(logging_file) - print('logging file is saved in: %s', logging_file) - - # Build model graph - encoder_input = tf.placeholder(tf.int64, shape=(None, None)) - decoder_input = tf.placeholder(tf.int64, shape=(None, None)) - batch_size = tf.shape(encoder_input)[0] - # (text sequence length excluding padding) - encoder_input_length = tf.reduce_sum( - 1 - tf.cast(tf.equal(encoder_input, 0), tf.int32), axis=1) - - labels = tf.placeholder(tf.int64, shape=(None, None)) - is_target = tf.cast(tf.not_equal(labels, 0), tf.float32) - - global_step = tf.Variable(0, dtype=tf.int64, trainable=False) - learning_rate = tf.placeholder(tf.float64, shape=(), name='lr') - - # Source word embedding - src_word_embedder = tx.modules.WordEmbedder( - vocab_size=vocab_size, hparams=config_model.emb) - src_word_embeds = src_word_embedder(encoder_input) - src_word_embeds = src_word_embeds * config_model.hidden_dim ** 0.5 - - # Position embedding (shared b/w source and target) - pos_embedder = tx.modules.SinusoidsPositionEmbedder( - position_size=config_data.max_decoding_length, - hparams=config_model.position_embedder_hparams) - src_seq_len = tf.ones([batch_size], tf.int32) * tf.shape(encoder_input)[1] - src_pos_embeds = pos_embedder(sequence_length=src_seq_len) - - src_input_embedding = src_word_embeds + src_pos_embeds - - encoder = TransformerEncoder(hparams=config_model.encoder) - encoder_output = encoder(inputs=src_input_embedding, - sequence_length=encoder_input_length) - - # The decoder ties the input word embedding with the output logit layer. - # As the decoder masks out 's embedding, which in effect means - # has all-zero embedding, so here we explicitly set 's embedding - # to all-zero. 
- tgt_embedding = tf.concat( - [tf.zeros(shape=[1, src_word_embedder.dim]), - src_word_embedder.embedding[1:, :]], - axis=0) - tgt_embedder = tx.modules.WordEmbedder(tgt_embedding) - tgt_word_embeds = tgt_embedder(decoder_input) - tgt_word_embeds = tgt_word_embeds * config_model.hidden_dim ** 0.5 - - tgt_seq_len = tf.ones([batch_size], tf.int32) * tf.shape(decoder_input)[1] - tgt_pos_embeds = pos_embedder(sequence_length=tgt_seq_len) - - tgt_input_embedding = tgt_word_embeds + tgt_pos_embeds - - _output_w = tf.transpose(tgt_embedder.embedding, (1, 0)) - - decoder = TransformerDecoder(vocab_size=vocab_size, - output_layer=_output_w, - hparams=config_model.decoder) - # For training - outputs = decoder( - memory=encoder_output, - memory_sequence_length=encoder_input_length, - inputs=tgt_input_embedding, - decoding_strategy='train_greedy', - mode=tf.estimator.ModeKeys.TRAIN - ) - - mle_loss = transformer_utils.smoothing_cross_entropy( - outputs.logits, labels, vocab_size, config_model.loss_label_confidence) - mle_loss = tf.reduce_sum(mle_loss * is_target) / tf.reduce_sum(is_target) - - train_op = tx.core.get_train_op( - mle_loss, - learning_rate=learning_rate, - global_step=global_step, - hparams=config_model.opt) - - tf.summary.scalar('lr', learning_rate) - tf.summary.scalar('mle_loss', mle_loss) - summary_merged = tf.summary.merge_all() - - # For inference (beam-search) - start_tokens = tf.fill([batch_size], bos_token_id) - - def _embedding_fn(x, y): - x_w_embed = tgt_embedder(x) - y_p_embed = pos_embedder(y) - return x_w_embed * config_model.hidden_dim ** 0.5 + y_p_embed - - predictions = decoder( - memory=encoder_output, - memory_sequence_length=encoder_input_length, - beam_width=beam_width, - length_penalty=config_model.length_penalty, - start_tokens=start_tokens, - end_token=eos_token_id, - embedding=_embedding_fn, - max_decoding_length=config_data.max_decoding_length, - mode=tf.estimator.ModeKeys.PREDICT) - # Uses the best sample by beam search - beam_search_ids = predictions['sample_id'][:, :, 0] - - saver = tf.train.Saver(max_to_keep=5) - best_results = {'score': 0, 'epoch': -1} - - def _eval_epoch(sess, epoch, mode): - if mode == 'eval': - eval_data = dev_data - elif mode == 'test': - eval_data = test_data - else: - raise ValueError('`mode` should be either "eval" or "test".') - - references, hypotheses = [], [] - bsize = config_data.test_batch_size - for i in range(0, len(eval_data), bsize): - sources, targets = zip(*eval_data[i:i + bsize]) - x_block = data_utils.source_pad_concat_convert(sources) - feed_dict = { - encoder_input: x_block, - tx.global_mode(): tf.estimator.ModeKeys.EVAL, - } - fetches = { - 'beam_search_ids': beam_search_ids, - } - fetches_ = sess.run(fetches, feed_dict=feed_dict) - - hypotheses.extend(h.tolist() for h in fetches_['beam_search_ids']) - references.extend(r.tolist() for r in targets) - hypotheses = utils.list_strip_eos(hypotheses, eos_token_id) - references = utils.list_strip_eos(references, eos_token_id) - - if mode == 'eval': - # Writes results to files to evaluate BLEU - # For 'eval' mode, the BLEU is based on token ids (rather than - # text tokens) and serves only as a surrogate metric to monitor - # the training process - fname = os.path.join(FLAGS.model_dir, 'tmp.eval') - hypotheses = tx.utils.str_join(hypotheses) - references = tx.utils.str_join(references) - hyp_fn, ref_fn = tx.utils.write_paired_text( - hypotheses, references, fname, mode='s') - eval_bleu = bleu_wrapper(ref_fn, hyp_fn, case_sensitive=True) - eval_bleu = 100. 
* eval_bleu - logger.info('epoch: %d, eval_bleu %.4f', epoch, eval_bleu) - print('epoch: %d, eval_bleu %.4f' % (epoch, eval_bleu)) - - if eval_bleu > best_results['score']: - logger.info('epoch: %d, best bleu: %.4f', epoch, eval_bleu) - best_results['score'] = eval_bleu - best_results['epoch'] = epoch - model_path = os.path.join(FLAGS.model_dir, 'best-model.ckpt') - logger.info('saving model to %s', model_path) - print('saving model to %s' % model_path) - saver.save(sess, model_path) - - elif mode == 'test': - # For 'test' mode, together with the cmds in README.md, BLEU - # is evaluated based on text tokens, which is the standard metric. - fname = os.path.join(FLAGS.model_dir, 'test.output') - hwords, rwords = [], [] - for hyp, ref in zip(hypotheses, references): - hwords.append([id2w[y] for y in hyp]) - rwords.append([id2w[y] for y in ref]) - hwords = tx.utils.str_join(hwords) - rwords = tx.utils.str_join(rwords) - hyp_fn, ref_fn = tx.utils.write_paired_text( - hwords, rwords, fname, mode='s', - src_fname_suffix='hyp', tgt_fname_suffix='ref') - logger.info('Test output written to file: %s', hyp_fn) - print('Test output written to file: %s' % hyp_fn) - - def _train_epoch(sess, epoch, step, smry_writer): - random.shuffle(train_data) - train_iter = data.iterator.pool( - train_data, - config_data.batch_size, - key=lambda x: (len(x[0]), len(x[1])), - batch_size_fn=utils.batch_size_fn, - random_shuffler=data.iterator.RandomShuffler()) - - for _, train_batch in enumerate(train_iter): - in_arrays = data_utils.seq2seq_pad_concat_convert(train_batch) - feed_dict = { - encoder_input: in_arrays[0], - decoder_input: in_arrays[1], - labels: in_arrays[2], - learning_rate: utils.get_lr(step, config_model.lr) - } - fetches = { - 'step': global_step, - 'train_op': train_op, - 'smry': summary_merged, - 'loss': mle_loss, - } - - fetches_ = sess.run(fetches, feed_dict=feed_dict) - - step, loss = fetches_['step'], fetches_['loss'] - if step and step % config_data.display_steps == 0: - logger.info('step: %d, loss: %.4f', step, loss) - print('step: %d, loss: %.4f' % (step, loss)) - smry_writer.add_summary(fetches_['smry'], global_step=step) - - if step and step % config_data.eval_steps == 0: - _eval_epoch(sess, epoch, mode='eval') - return step - - # Run the graph - with tf.Session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - - smry_writer = tf.summary.FileWriter(FLAGS.model_dir, graph=sess.graph) - - if FLAGS.run_mode == 'train_and_evaluate': - logger.info('Begin running with train_and_evaluate mode') - - if tf.train.latest_checkpoint(FLAGS.model_dir) is not None: - logger.info('Restore latest checkpoint in %s' % FLAGS.model_dir) - saver.restore(sess, tf.train.latest_checkpoint(FLAGS.model_dir)) - - step = 0 - for epoch in range(config_data.max_train_epoch): - step = _train_epoch(sess, epoch, step, smry_writer) - - elif FLAGS.run_mode == 'test': - logger.info('Begin running with test mode') - - logger.info('Restore latest checkpoint in %s' % FLAGS.model_dir) - saver.restore(sess, tf.train.latest_checkpoint(FLAGS.model_dir)) - - _eval_epoch(sess, 0, mode='test') - - else: - raise ValueError('Unknown mode: {}'.format(FLAGS.run_mode)) - - -if __name__ == '__main__': - main() diff --git a/examples/transformer/utils/__init__.py b/examples/transformer/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/transformer/utils/data_utils.py b/examples/transformer/utils/data_utils.py deleted file
mode 100644 index f69694c1..00000000 --- a/examples/transformer/utils/data_utils.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Data read/write utilities for Transformer. -""" -import os -import codecs -import six -import numpy as np - -# pylint: disable=no-member - - -def load_data_numpy(input_dir, prefix): - train_data = np.load( - os.path.join(input_dir, prefix + "train.npy"), - encoding="latin1", - allow_pickle=True, - ).tolist() - dev_data = np.load( - os.path.join(input_dir, prefix + "valid.npy"), - encoding="latin1", - allow_pickle=True, - ).tolist() - test_data = np.load( - os.path.join(input_dir, prefix + "test.npy"), - encoding="latin1", - allow_pickle=True, - ).tolist() - print("train data size:{}".format(len(train_data))) - return train_data, dev_data, test_data - - -def seq2seq_pad_concat_convert(xy_batch, eos_id=2, bos_id=1): - """ - Args: - xy_batch (list of tuple of two numpy.ndarray-s or cupy.ndarray-s): - xy_batch[i][0] is an array - of token ids of i-th input sentence in a minibatch. - xy_batch[i][1] is an array - of token ids of i-th target sentence in a minibatch. - The shape of each array is `(sentence length, )`. - eos_id: The index of end-of-sentence special token in the - dictionary. - - Returns: - Tuple of Converted array. - (input_sent_batch_array, target_sent_batch_input_array, - target_sent_batch_output_array). - The shape of each array is `(batchsize, max_sentence_length)`. - All sentences are padded with 0 to reach max_sentence_length. - """ - - x_seqs, y_seqs = zip(*xy_batch) - x_block = _concat_examples(x_seqs, padding=0) - y_block = _concat_examples(y_seqs, padding=0) - - # Add EOS - x_block = np.pad(x_block, ((0, 0), (0, 1)), "constant", constant_values=0) - for i_batch, seq in enumerate(x_seqs): - x_block[i_batch, len(seq)] = eos_id - - y_out_block = np.pad( - y_block, ((0, 0), (0, 1)), "constant", constant_values=0 - ) - for i_batch, seq in enumerate(y_seqs): - y_out_block[i_batch, len(seq)] = eos_id - - # Add BOS in target language - y_in_block = np.pad( - y_block, ((0, 0), (1, 0)), "constant", constant_values=bos_id - ) - return x_block, y_in_block, y_out_block - - -def source_pad_concat_convert(x_seqs, eos_id=2, bos_id=1): - """ - This function is used when testing the model without target input. 
- """ - x_block = _concat_examples(x_seqs, padding=0) - - # add EOS - x_block = np.pad(x_block, ((0, 0), (0, 1)), "constant", constant_values=0) - for i_batch, seq in enumerate(x_seqs): - x_block[i_batch, len(seq)] = eos_id - return x_block - - -def _concat_examples(arrays, padding=0): - if len(arrays) == 0: - raise ValueError("batch is empty") - - first_elem = arrays[0] - assert isinstance(first_elem, np.ndarray) - - shape = np.array(arrays[0].shape, dtype=int) - for array in arrays[1:]: - if np.any(shape != array.shape): - np.maximum(shape, array.shape, shape) - shape = tuple(np.insert(shape, 0, len(arrays))) - - result = np.full(shape, padding, dtype=arrays[0].dtype) - for i in six.moves.range(len(arrays)): - src = arrays[i] - slices = tuple(slice(dim) for dim in src.shape) - result[(i,) + slices] = src - return result - - -def write_words(words_list, filename): - with codecs.open(filename, "w+", "utf-8") as myfile: - for words in words_list: - myfile.write(" ".join(words) + "\n") diff --git a/examples/transformer/utils/preprocess.py b/examples/transformer/utils/preprocess.py deleted file mode 100644 index 2a4e5061..00000000 --- a/examples/transformer/utils/preprocess.py +++ /dev/null @@ -1,243 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -preprocessing text data. Generally it's to generate plain text vocab file, -truncate sequence by length, generate the preprocessed dataset. 
-""" -from __future__ import unicode_literals -import collections -import re -import json -import os -import numpy as np -import pickle -import argparse -from io import open -# pylint:disable=invalid-name - -split_pattern = re.compile(r'([.,!?"\':;)(])') -digit_pattern = re.compile(r'\d') - -# Refer to https://texar.readthedocs.io/en/latest/_modules/texar/data/vocabulary.html#SpecialTokens -# these tokens will by default have token ids 0, 1, 2, 3 respectively -pad_token_id, bos_token_id, eos_token_id, unk_token_id = 0, 1, 2, 3 - - -def split_sentence(s, tok=False): - """split sentence with some segmentation rules.""" - if tok: - s = s.lower() - s = s.replace('\u2019', "'") - s = digit_pattern.sub('0', s) - words = [] - for word in s.split(): - if tok: - words.extend(split_pattern.split(word)) - else: - words.append(word) - words = [w for w in words if w] - return words - - -def open_file(path): - """more robust open function""" - return open(path, encoding='utf-8') - - -def read_file(path, tok=False): - """a generator to generate each line of file.""" - with open_file(path) as f: - for line in f.readlines(): - words = split_sentence(line.strip(), tok) - yield words - - -def count_words(path, max_vocab_size=40000, tok=False): - """count all words in the corpus and output a counter""" - counts = collections.Counter() - for words in read_file(path, tok): - for word in words: - counts[word] += 1 - - vocab = [word for (word, _) in counts.most_common(max_vocab_size)] - return vocab - - -def make_array(word_id, words): - """generate id numpy array from plain text words.""" - ids = [word_id.get(word, unk_token_id) for word in words] - return np.array(ids, 'i') - - -def make_dataset(path, w2id, tok=False): - """generate dataset.""" - dataset, npy_dataset = [], [] - token_count, unknown_count = 0, 0 - for words in read_file(path, tok): - array = make_array(w2id, words) - npy_dataset.append(array) - dataset.append(words) - token_count += array.size - unknown_count += (array == unk_token_id).sum() - print('# of tokens:{}'.format(token_count)) - print('# of unknown {} {:.2}'.format(unknown_count, - 100. * unknown_count / token_count)) - return dataset, npy_dataset - - -def get_preprocess_args(): - """Data preprocessing options.""" - class Config(): - pass - config = Config() - parser = argparse.ArgumentParser(description='Preprocessing Options') - parser.add_argument('--source_vocab', type=int, default=40000, - help='Vocabulary size of source language') - parser.add_argument('--target_vocab', type=int, default=40000, - help='Vocabulary size of target language') - parser.add_argument('--tok', dest='tok', action='store_true', - help='tokenized and lowercased') - parser.set_defaults(tok=False) - parser.add_argument('--max_seq_length', type=int, default=70) - parser.add_argument('--pre_encoding', type=str, default='spm') - parser.add_argument('--src', type=str, default='en') - parser.add_argument('--tgt', type=str, default='vi') - parser.add_argument('--input_dir', '-i', type=str, - default='./data/en_vi/data/', help='Input directory') - parser.add_argument('--save_data', type=str, default='preprocess', - help='Output file for the prepared data') - parser.parse_args(namespace=config) - - # keep consistent with original implementation - # pylint:disable=attribute-defined-outside-init - config.input = config.input_dir - config.source_train = 'train.' + config.src - config.target_train = 'train.' + config.tgt - config.source_valid = 'valid.' + config.src - config.target_valid = 'valid.' 
+ config.tgt - config.source_test = 'test.' + config.src - config.target_test = 'test.' + config.tgt - return config - - -if __name__ == "__main__": - args = get_preprocess_args() - - print(json.dumps(args.__dict__, indent=4)) - - # pylint:disable=no-member - # Vocab Construction - source_path = os.path.join(args.input_dir, args.source_train) - target_path = os.path.join(args.input_dir, args.target_train) - - src_cntr = count_words(source_path, args.source_vocab, args.tok) - trg_cntr = count_words(target_path, args.target_vocab, args.tok) - all_words = sorted(list(set(src_cntr + trg_cntr))) - - vocab = ['', '', '', ''] + all_words - - w2id = {word: index for index, word in enumerate(vocab)} - - # Train Dataset - source_data, source_npy = make_dataset(source_path, w2id, args.tok) - target_data, target_npy = make_dataset(target_path, w2id, args.tok) - assert len(source_data) == len(target_data) - - train_data = [(s, t) for s, t in zip(source_data, target_data) - if s and len(s) < args.max_seq_length - and t and len(t) < args.max_seq_length] - train_npy = [(s, t) for s, t in zip(source_npy, target_npy) - if len(s) > 0 and len(s) < args.max_seq_length - and len(t) > 0 and len(t) < args.max_seq_length] - assert len(train_data) == len(train_npy) - - # Display corpus statistics - print("Vocab: {} with special tokens".format(len(vocab))) - print('Original training data size: %d' % len(source_data)) - print('Filtered training data size: %d' % len(train_data)) - - # Valid Dataset - source_path = os.path.join(args.input_dir, args.source_valid) - source_data, source_npy = make_dataset(source_path, w2id, args.tok) - target_path = os.path.join(args.input_dir, args.target_valid) - target_data, target_npy = make_dataset(target_path, w2id, args.tok) - assert len(source_data) == len(target_data) - - valid_data = [(s, t) for s, t in zip(source_data, target_data) - if s and t] - valid_npy = [(s, t) for s, t in zip(source_npy, target_npy) - if len(s) > 0 and len(t) > 0] - assert len(valid_data) == len(valid_npy) - print('Original dev data size: %d' % len(source_data)) - print('Filtered dev data size: %d' % len(valid_data)) - - # Test Dataset - source_path = os.path.join(args.input_dir, args.source_test) - source_data, source_npy = make_dataset(source_path, w2id, args.tok) - target_path = os.path.realpath( - os.path.join(args.input_dir, args.target_test)) - target_data, target_npy = make_dataset(target_path, w2id, args.tok) - assert len(source_data) == len(target_data) - test_data = [(s, t) for s, t in zip(source_data, target_data) - if s and t] - test_npy = [(s, t) for s, t in zip(source_npy, target_npy) - if len(s) > 0 and len(t) > 0] - print('Original test data size: %d' % len(source_data)) - print('Filtered test data size: %d' % len(test_data)) - id2w = {i: w for w, i in w2id.items()} - # Save the dataset to numpy files - train_src_output = os.path.join( - args.input_dir, args.save_data + 'train.' + args.src + '.txt') - train_tgt_output = os.path.join( - args.input_dir, args.save_data + 'train.' + args.tgt + '.txt') - dev_src_output = os.path.join(args.input_dir, - args.save_data + 'dev.' + args.src + '.txt') - dev_tgt_output = os.path.join(args.input_dir, - args.save_data + 'dev.' + args.tgt + '.txt') - test_src_output = os.path.join(args.input_dir, - args.save_data + 'test.' + args.src + '.txt') - test_tgt_output = os.path.join(args.input_dir, - args.save_data + 'test.' 
+ args.tgt + '.txt') - - np.save(os.path.join(args.input, args.save_data + 'train.npy'), - train_npy) - np.save(os.path.join(args.input, args.save_data + 'valid.npy'), - valid_npy) - np.save(os.path.join(args.input, args.save_data + 'test.npy'), - test_npy) - with open(os.path.join(args.input, args.save_data + 'vocab.pickle'), 'wb')\ - as f: - pickle.dump(id2w, f, protocol=pickle.HIGHEST_PROTOCOL) - - with open(train_src_output, 'w+', encoding='utf-8') as fsrc, \ - open(train_tgt_output, 'w+', encoding='utf-8') as ftgt: - for words in train_data: - fsrc.write('{}\n'.format(' '.join(words[0]))) - ftgt.write('{}\n'.format(' '.join(words[1]))) - with open(dev_src_output, 'w+', encoding='utf-8') as fsrc, \ - open(dev_tgt_output, 'w+', encoding='utf-8') as ftgt: - for words in valid_data: - fsrc.write('{}\n'.format(' '.join(words[0]))) - ftgt.write('{}\n'.format(' '.join(words[1]))) - with open(test_src_output, 'w+', encoding='utf-8') as fsrc, \ - open(test_tgt_output, 'w+', encoding='utf-8') as ftgt: - for words in test_data: - fsrc.write('{}\n'.format(' '.join(words[0]))) - ftgt.write('{}\n'.format(' '.join(words[1]))) - with open(os.path.join(args.input_dir, - args.save_data + args.pre_encoding + '.vocab.text'), - 'w+', encoding='utf-8') as f: - max_size = len(id2w) - for idx in range(4, max_size): - f.write('{}\n'.format(id2w[idx])) diff --git a/examples/transformer/utils/utils.py b/examples/transformer/utils/utils.py deleted file mode 100644 index f2fdbe94..00000000 --- a/examples/transformer/utils/utils.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Helper functions for model training. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import random -import math -import logging -import numpy as np -import tensorflow as tf - - -def set_random_seed(myseed): - tf.set_random_seed(myseed) - np.random.seed(myseed) - random.seed(myseed) - - -def batch_size_fn(new, count, size_so_far): - max_src_in_batch, max_tgt_in_batch = 0, 0 - max_src_in_batch = max(max_src_in_batch, len(new[0] + 1)) - max_tgt_in_batch = max(max_tgt_in_batch, len(new[1] + 1)) - src_elements = count * max_src_in_batch - tgt_elements = count * max_tgt_in_batch - return max(src_elements, tgt_elements) - - -def get_lr(fstep, opt_config): - if opt_config['learning_rate_schedule'] == 'static': - lr = opt_config['static_lr'] - else: - lr = opt_config['lr_constant'] \ - * min(1.0, (fstep / opt_config['warmup_steps'])) \ - * (1 / math.sqrt(max(fstep, opt_config['warmup_steps']))) - return lr - - -def get_logger(log_path): - """Returns a logger. - - Args: - log_path (str): Path to the log file. 
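One bug worth flagging in the deleted `utils.py` above: `batch_size_fn` computes `len(new[0] + 1)`. If `new[0]` is a numpy array this adds 1 elementwise and leaves the length unchanged; if it is a plain list it raises `TypeError`. The intended expression is almost certainly `len(new[0]) + 1` (sequence length plus one slot for EOS), as in this corrected sketch:

```python
# Corrected sketch of the deleted batch_size_fn: the token-level batch
# size is examples-so-far times the longest source/target length
# (+1 for EOS), whichever side is larger.
def batch_size_fn(new, count, size_so_far):
    max_src_in_batch = len(new[0]) + 1  # was: len(new[0] + 1)
    max_tgt_in_batch = len(new[1]) + 1  # was: len(new[1] + 1)
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch
    return max(src_elements, tgt_elements)
```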
- """ - logger = logging.getLogger(__name__) - logger.setLevel(logging.DEBUG) - fh = logging.FileHandler(log_path) - fh.setLevel(logging.DEBUG) - fh.setFormatter( - logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')) - logger.addHandler(fh) - return logger - - -def list_strip_eos(list_, eos_token): - """Strips EOS token from a list of lists of tokens. - """ - list_strip = [] - for elem in list_: - if eos_token in elem: - elem = elem[:elem.index(eos_token)] - list_strip.append(elem) - return list_strip diff --git a/examples/vae_text/.gitignore b/examples/vae_text/.gitignore deleted file mode 100644 index 2412abc8..00000000 --- a/examples/vae_text/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -/simple-examples/ -/data/ -/models/ -/simple-examples.tgz diff --git a/examples/vae_text/README.md b/examples/vae_text/README.md deleted file mode 100644 index 5339b0fe..00000000 --- a/examples/vae_text/README.md +++ /dev/null @@ -1,57 +0,0 @@ -# Variational Autoencoder (VAE) for Text Generation - -This example builds a VAE for text generation, with an LSTM as encoder and an LSTM or [Transformer](https://arxiv.org/pdf/1706.03762.pdf) as decoder. Training is performed on the official PTB data and Yahoo data, respectively. - -The VAE with LSTM decoder is first decribed in [(Bowman et al., 2015) Generating Sentences from a Continuous Space](https://arxiv.org/pdf/1511.06349.pdf) - -The Yahoo dataset is from [(Yang et al., 2017) Improved Variational Autoencoders for Text Modeling using Dilated Convolutions](https://arxiv.org/pdf/1702.08139.pdf), which is created by sampling 100k documents from the original Yahoo Answer data. The average document length is 78 and the vocab size is 200k. - -## Data -The datasets can be downloaded by running: -```shell -python prepare_data.py --data ptb -python prepare_data.py --data yahoo -``` - -## Training -Train with the following command: - -```shell -python vae_train.py --config config_trans_ptb -``` - -Here: - -* `--config` specifies the config file to use, including model hyperparameters and data paths. We provide 4 config files: - - [config_lstm_ptb.py](./config_lstm_ptb.py): LSTM decoder, on the PTB data - - [config_lstm_yahoo.py](./config_lstm_yahoo.py): LSTM decoder, on the Yahoo data - - [config_trans_ptb.py](./config_trans_ptb.py): Transformer decoder, on the PTB data - - [config_trans_yahoo.py](./config_trans_yahoo.py): Transformer decoder, on the Yahoo data - -## Generation -Generating sentences with pre-trained model can be performed with the following command: -```shell -python vae_train.py --config config_file --mode predict --model /path/to/model.ckpt --out /path/to/output -``` - -Here `--model` specifies the saved model checkpoint, which is saved in `./models/dataset_name/` at training time. For example, the model path is `./models/ptb/ptb_lstmDecoder.ckpt` when generating with a LSTM decoder trained on PTB dataset. Generated sentences will be written to standard output if `--out` is not specifcied. - -## Results - -### Language Modeling - -|Dataset |Metrics | VAE-LSTM |VAE-Transformer | -|---------------|-------------|----------------|------------------------| -|Yahoo | Test PPL
<br>Test NLL | 68.11<br>337.13 |59.95<br>326.93| -|PTB | Test PPL<br>Test NLL | 104.61<br>101.92 | 103.68<br>
101.72 | - -### Generated Examples -We show the generated examples with transformer as decoder trained on PTB training data. - -|Examples| -|:---------| -|i 'm always looking at a level of \$ N to \$ N billion \ | -|after four years ago president bush has federal regulators decided to file financing for the waiver\ | -|the savings & loan association said total asset revenue was about \$ N billion compared with \$ N billion \ | -|the trend would seem to be effective \ | -|chicago city 's computer bank of britain posted a N N jump in third-quarter net income \| diff --git a/examples/vae_text/config_lstm_ptb.py b/examples/vae_text/config_lstm_ptb.py deleted file mode 100644 index 568272d3..00000000 --- a/examples/vae_text/config_lstm_ptb.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""VAE config. -""" - -# pylint: disable=invalid-name, too-few-public-methods, missing-docstring - -dataset = "ptb" -num_epochs = 100 -hidden_size = 256 -dec_dropout_in = 0.5 -dec_dropout_out = 0.5 -enc_dropout_in = 0. -enc_dropout_out = 0. -word_keep_prob = 0.5 -batch_size = 32 -embed_dim = 256 - -latent_dims = 32 - -lr_decay_hparams = { - "init_lr": 0.001, - "threshold": 2, - "decay_factor": 0.5, - "max_decay": 5 -} - - -decoder_type = 'lstm' - -enc_cell_hparams = { - "type": "LSTMBlockCell", - "kwargs": { - "num_units": hidden_size, - "forget_bias": 0. - }, - "dropout": {"output_keep_prob": 1. - enc_dropout_out}, - "num_layers": 1 -} - -dec_cell_hparams = { - "type": "LSTMBlockCell", - "kwargs": { - "num_units": hidden_size, - "forget_bias": 0. - }, - "dropout": {"output_keep_prob": 1. 
- dec_dropout_out}, - "num_layers": 1 -} - -enc_emb_hparams = { - 'name': 'lookup_table', - "dim": embed_dim, - "dropout_rate": enc_dropout_in, - 'initializer': { - 'type': 'random_normal_initializer', - 'kwargs': { - 'mean': 0.0, - 'stddev': embed_dim**-0.5, - }, - } -} - -dec_emb_hparams = { - 'name': 'lookup_table', - "dim": embed_dim, - "dropout_rate": dec_dropout_in, - 'initializer': { - 'type': 'random_normal_initializer', - 'kwargs': { - 'mean': 0.0, - 'stddev': embed_dim**-0.5, - }, - } -} - -# KL annealing -kl_anneal_hparams = { - "warm_up": 10, - "start": 0.1 -} - -train_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "seed": 123, - "dataset": { - "files": './simple-examples/data/ptb.train.txt', - "vocab_file": './simple-examples/data/vocab.txt' - } -} - -val_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "seed": 123, - "dataset": { - "files": './simple-examples/data/ptb.valid.txt', - "vocab_file": './simple-examples/data/vocab.txt' - } -} - -test_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "dataset": { - "files": './simple-examples/data/ptb.test.txt', - "vocab_file": './simple-examples/data/vocab.txt' - } -} - -opt_hparams = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 0.001 - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": {"clip_norm": 5.} - } -} diff --git a/examples/vae_text/config_lstm_yahoo.py b/examples/vae_text/config_lstm_yahoo.py deleted file mode 100644 index e58843cd..00000000 --- a/examples/vae_text/config_lstm_yahoo.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""VAE config. -""" - -# pylint: disable=invalid-name, too-few-public-methods, missing-docstring - -dataset = "yahoo" -num_epochs = 100 -hidden_size = 550 -dec_dropout_in = 0.5 -dec_dropout_out = 0.5 -enc_dropout_in = 0. -enc_dropout_out = 0. -batch_size = 32 -embed_dim = 512 - -latent_dims = 32 - -lr_decay_hparams = { - "init_lr": 0.001, - "threshold": 2, - "decay_factor": 0.5, - "max_decay": 5 -} - - -relu_dropout = 0.2 -embedding_dropout = 0.2 -attention_dropout = 0.2 -residual_dropout = 0.2 -num_blocks = 3 - -decoder_type = 'lstm' - -enc_cell_hparams = { - "type": "LSTMBlockCell", - "kwargs": { - "num_units": hidden_size, - "forget_bias": 0. - }, - "dropout": {"output_keep_prob": 1. - enc_dropout_out}, - "num_layers": 1 -} - -dec_cell_hparams = { - "type": "LSTMBlockCell", - "kwargs": { - "num_units": hidden_size, - "forget_bias": 0. - }, - "dropout": {"output_keep_prob": 1. 
- dec_dropout_out}, - "num_layers": 1 -} - -enc_emb_hparams = { - 'name': 'lookup_table', - "dim": embed_dim, - "dropout_rate": enc_dropout_in, - 'initializer': { - 'type': 'random_normal_initializer', - 'kwargs': { - 'mean': 0.0, - 'stddev': embed_dim**-0.5, - }, - } -} - -dec_emb_hparams = { - 'name': 'lookup_table', - "dim": embed_dim, - "dropout_rate": dec_dropout_in, - 'initializer': { - 'type': 'random_normal_initializer', - 'kwargs': { - 'mean': 0.0, - 'stddev': embed_dim**-0.5, - }, - } -} - - -# KL annealing -# kl_weight = 1.0 / (1 + np.exp(-k*(step-x0))) -kl_anneal_hparams = { - "warm_up": 10, - "start": 0.1 -} - -train_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "seed": 123, - "dataset": { - "files": './data/yahoo/yahoo.train.txt', - "vocab_file": './data/yahoo/vocab.txt' - } -} - -val_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "seed": 123, - "dataset": { - "files": './data/yahoo/yahoo.valid.txt', - "vocab_file": './data/yahoo/vocab.txt' - } -} - -test_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "dataset": { - "files": './data/yahoo/yahoo.test.txt', - "vocab_file": './data/yahoo/vocab.txt' - } -} - -opt_hparams = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 0.001 - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": {"clip_norm": 5.} - } -} diff --git a/examples/vae_text/config_trans_ptb.py b/examples/vae_text/config_trans_ptb.py deleted file mode 100644 index eb7b7877..00000000 --- a/examples/vae_text/config_trans_ptb.py +++ /dev/null @@ -1,184 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Config file of VAE with Trasnformer decoder, on PTB data. -""" - -# pylint: disable=invalid-name, too-few-public-methods, missing-docstring - -dataset = 'ptb' -num_epochs = 100 -hidden_size = 256 -dec_dropout_in = 0. -enc_dropout_in = 0. -enc_dropout_out = 0. -batch_size = 32 -embed_dim = 256 - -latent_dims = 32 - -lr_decay_hparams = { - 'init_lr': 0.001, - 'threshold': 2, - 'decay_factor': 0.5, - 'max_decay': 5 -} - - -relu_dropout = 0.2 -embedding_dropout = 0.2 -attention_dropout = 0.2 -residual_dropout = 0.2 -num_blocks = 3 - -decoder_type = 'transformer' - -enc_cell_hparams = { - 'type': 'LSTMBlockCell', - 'kwargs': { - 'num_units': hidden_size, - 'forget_bias': 0. - }, - 'dropout': {'output_keep_prob': 1. 
- enc_dropout_out}, - 'num_layers': 1 -} - -enc_emb_hparams = { - 'name': 'lookup_table', - 'dim': embed_dim, - 'dropout_rate': enc_dropout_in, - 'initializer': { - 'type': 'random_normal_initializer', - 'kwargs': { - 'mean': 0.0, - 'stddev': embed_dim**-0.5, - }, - } -} - -dec_emb_hparams = { - 'name': 'lookup_table', - 'dim': embed_dim, - 'dropout_rate': dec_dropout_in, - 'initializer': { - 'type': 'random_normal_initializer', - 'kwargs': { - 'mean': 0.0, - 'stddev': embed_dim**-0.5, - }, - } -} - -max_pos = 200 # max sequence length in training data -dec_pos_emb_hparams = { - 'dim': hidden_size, -} - -# due to the residual connection, the embed_dim should be equal to hidden_size -trans_hparams = { - 'output_layer_bias': False, - 'embedding_dropout': embedding_dropout, - 'residual_dropout': residual_dropout, - 'num_blocks': num_blocks, - 'dim': hidden_size, - 'initializer': { - 'type': 'variance_scaling_initializer', - 'kwargs': { - 'scale': 1.0, - 'mode': 'fan_avg', - 'distribution': 'uniform', - }, - }, - 'multihead_attention': { - 'dropout_rate': attention_dropout, - 'num_heads': 8, - 'num_units': hidden_size, - 'output_dim': hidden_size - }, - 'poswise_feedforward': { - 'name': 'fnn', - 'layers': [ - { - 'type': 'Dense', - 'kwargs': { - 'name': 'conv1', - 'units': hidden_size * 4, - 'activation': 'relu', - 'use_bias': True, - }, - }, - { - 'type': 'Dropout', - 'kwargs': { - 'rate': relu_dropout, - } - }, - { - 'type': 'Dense', - 'kwargs': { - 'name': 'conv2', - 'units': hidden_size, - 'use_bias': True, - } - } - ], - } -} - -# KL annealing -kl_anneal_hparams = { - 'warm_up': 10, - 'start': 0.1 -} - -train_data_hparams = { - 'num_epochs': 1, - 'batch_size': batch_size, - 'seed': 123, - 'dataset': { - 'files': './simple-examples/data/ptb.train.txt', - 'vocab_file': './simple-examples/data/vocab.txt' - } -} - -val_data_hparams = { - 'num_epochs': 1, - 'batch_size': batch_size, - 'seed': 123, - 'dataset': { - 'files': './simple-examples/data/ptb.valid.txt', - 'vocab_file': './simple-examples/data/vocab.txt' - } -} - -test_data_hparams = { - 'num_epochs': 1, - 'batch_size': batch_size, - 'dataset': { - 'files': './simple-examples/data/ptb.test.txt', - 'vocab_file': './simple-examples/data/vocab.txt' - } -} - -opt_hparams = { - 'optimizer': { - 'type': 'AdamOptimizer', - 'kwargs': { - 'learning_rate': 0.001 - } - }, - 'gradient_clip': { - 'type': 'clip_by_global_norm', - 'kwargs': {'clip_norm': 5.} - } -} diff --git a/examples/vae_text/config_trans_yahoo.py b/examples/vae_text/config_trans_yahoo.py deleted file mode 100644 index f8048dfd..00000000 --- a/examples/vae_text/config_trans_yahoo.py +++ /dev/null @@ -1,186 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""VAE config. -""" - -# pylint: disable=invalid-name, too-few-public-methods, missing-docstring - -dataset = "yahoo" -num_epochs = 100 -hidden_size = 512 -dec_dropout_in = 0. -enc_dropout_in = 0. -enc_dropout_out = 0. 
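The `kl_anneal_hparams` in these configs parameterize a linear warm-up of the KL weight (the sigmoid formula quoted in a comment above is a reference, not what the training script computes). A sketch of the schedule as `vae_train.py` below applies it, with an assumed training-set size:

```python
# Linear KL warm-up implied by kl_anneal_hparams: the weight starts at
# `start` and grows by `anneal_r` per training step until it saturates
# at 1.0 after roughly `warm_up` epochs. num_train_examples here is an
# assumption (PTB-sized corpus), not a value from the source.
def kl_weight(step, start=0.1, warm_up=10,
              num_train_examples=42068, batch_size=32):
    anneal_r = 1.0 / (warm_up * (num_train_examples / batch_size))
    return min(1.0, start + step * anneal_r)
```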
-batch_size = 32 -embed_dim = 512 - -latent_dims = 32 - -lr_decay_hparams = { - "init_lr": 0.001, - "threshold": 2, - "decay_factor": 0.5, - "max_decay": 5 -} - - -relu_dropout = 0.2 -embedding_dropout = 0.2 -attention_dropout = 0.2 -residual_dropout = 0.2 -num_blocks = 3 - -decoder_type = 'transformer' - -enc_cell_hparams = { - "type": "LSTMBlockCell", - "kwargs": { - "num_units": hidden_size, - "forget_bias": 0. - }, - "dropout": {"output_keep_prob": 1. - enc_dropout_out}, - "num_layers": 1 -} - -enc_emb_hparams = { - 'name': 'lookup_table', - "dim": embed_dim, - "dropout_rate": enc_dropout_in, - 'initializer': { - 'type': 'random_normal_initializer', - 'kwargs': { - 'mean': 0.0, - 'stddev': embed_dim**-0.5, - }, - } -} - -dec_emb_hparams = { - 'name': 'lookup_table', - "dim": embed_dim, - "dropout_rate": dec_dropout_in, - 'initializer': { - 'type': 'random_normal_initializer', - 'kwargs': { - 'mean': 0.0, - 'stddev': embed_dim**-0.5, - }, - } -} - - -max_pos = 200 # max sequence length in training data -dec_pos_emb_hparams = { - 'dim': hidden_size, -} - -# due to the residual connection, the embed_dim should be equal to hidden_size -trans_hparams = { - 'output_layer_bias': False, - 'embedding_dropout': embedding_dropout, - 'residual_dropout': residual_dropout, - 'num_blocks': num_blocks, - 'dim': hidden_size, - 'initializer': { - 'type': 'variance_scaling_initializer', - 'kwargs': { - 'scale': 1.0, - 'mode': 'fan_avg', - 'distribution': 'uniform', - }, - }, - 'multihead_attention': { - 'dropout_rate': attention_dropout, - 'num_heads': 8, - 'num_units': hidden_size, - 'output_dim': hidden_size - }, - 'poswise_feedforward': { - 'name': 'fnn', - 'layers': [ - { - 'type': 'Dense', - 'kwargs': { - 'name': 'conv1', - 'units': hidden_size * 4, - 'activation': 'relu', - 'use_bias': True, - }, - }, - { - 'type': 'Dropout', - 'kwargs': { - 'rate': relu_dropout, - } - }, - { - 'type': 'Dense', - 'kwargs': { - 'name': 'conv2', - 'units': hidden_size, - 'use_bias': True, - } - } - ], - } -} - -# KL annealing -kl_anneal_hparams = { - "warm_up": 10, - "start": 0.1 -} - -train_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "seed": 123, - "dataset": { - "files": './data/yahoo/yahoo.train.txt', - "vocab_file": './data/yahoo/vocab.txt' - } -} - -val_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "seed": 123, - "dataset": { - "files": './data/yahoo/yahoo.valid.txt', - "vocab_file": './data/yahoo/vocab.txt' - } -} - -test_data_hparams = { - "num_epochs": 1, - "batch_size": batch_size, - "dataset": { - "files": './data/yahoo/yahoo.test.txt', - "vocab_file": './data/yahoo/vocab.txt' - } -} - -opt_hparams = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 0.001 - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": {"clip_norm": 5.} - } -} diff --git a/examples/vae_text/prepare_data.py b/examples/vae_text/prepare_data.py deleted file mode 100644 index b0523bb7..00000000 --- a/examples/vae_text/prepare_data.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Utilities for downloading and preprocessing the PTB and Yohoo data. -""" -import os -import argparse - -import tensorflow as tf -import texar.tf as tx - - -def prepare_data(data_name): - """Prepare datasets. - Args: - data_path: the path to save the data - data_name: the name of dataset, "ptb" and "yahoo" - are currently supported - """ - if data_name == "ptb": - data_path = "./simple-examples/data" - train_path = os.path.join(data_path, "ptb.train.txt") - if not tf.gfile.Exists(train_path): - url = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz' - tx.data.maybe_download(url, './', extract=True) - - train_path = os.path.join(data_path, "ptb.train.txt") - vocab_path = os.path.join(data_path, "vocab.txt") - word_to_id = tx.data.make_vocab( - train_path, return_type="dict") - - with open(vocab_path, 'w') as fvocab: - for word in word_to_id: - fvocab.write("%s\n" % word) - - elif data_name == "yahoo": - data_path = "./data/yahoo" - train_path = os.path.join(data_path, "yahoo.train.txt") - if not tf.gfile.Exists(train_path): - url = 'https://drive.google.com/file/d/'\ - '13IsiffVjcQ-wrrbBGMwiG3sYf-DFxtXH/view?usp=sharing' - tx.data.maybe_download(url, path='./', filenames='yahoo.zip', - extract=True) - else: - raise ValueError('Unknown data: {}'.format(data_name)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='prepare data') - parser.add_argument('--data', type=str, help='dataset to prepare') - args = parser.parse_args() - prepare_data(args.data) diff --git a/examples/vae_text/vae_train.py b/examples/vae_text/vae_train.py deleted file mode 100644 index f1ff1511..00000000 --- a/examples/vae_text/vae_train.py +++ /dev/null @@ -1,398 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Example for building the Variational Autoencoder. 
- -This is an impmentation of Variational Autoencoder for text generation - -To run: - -$ python vae_train.py - -Hyperparameters and data path may be specified in config_trans.py - -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -# pylint: disable=invalid-name, no-member, too-many-locals -# pylint: disable=too-many-branches, too-many-statements, redefined-variable-type - -import os -import sys -import time -import importlib -from io import open - -import numpy as np -import tensorflow as tf -import tensorflow_probability as tfp -import texar.tf as tx - - -tfd = tfp.distributions - -flags = tf.flags - -flags.DEFINE_string("config", "config", "The config to use.") -flags.DEFINE_string("mode", "train", "train or predict") -flags.DEFINE_string("model", None, "model path for generating sentences") -flags.DEFINE_string("out", None, "generation output path") - -FLAGS = flags.FLAGS - -config = importlib.import_module(FLAGS.config) - - -def kl_dvg(means, logvars): - """compute the KL divergence between Gaussian distribution - """ - kl_cost = -0.5 * (logvars - tf.square(means) - - tf.exp(logvars) + 1.0) - kl_cost = tf.reduce_mean(kl_cost, 0) - - return tf.reduce_sum(kl_cost) - - -def _main(_): - # Data - train_data = tx.data.MonoTextData(config.train_data_hparams) - val_data = tx.data.MonoTextData(config.val_data_hparams) - test_data = tx.data.MonoTextData(config.test_data_hparams) - iterator = tx.data.TrainTestDataIterator(train=train_data, - val=val_data, - test=test_data) - data_batch = iterator.get_next() - - opt_vars = { - 'learning_rate': config.lr_decay_hparams["init_lr"], - 'best_valid_nll': 1e100, - 'steps_not_improved': 0, - 'kl_weight': config.kl_anneal_hparams["start"] - } - - decay_cnt = 0 - max_decay = config.lr_decay_hparams["max_decay"] - decay_factor = config.lr_decay_hparams["decay_factor"] - decay_ts = config.lr_decay_hparams["threshold"] - - save_dir = "./models/%s" % config.dataset - - if not os.path.exists(save_dir): - os.makedirs(save_dir) - - suffix = "%s_%sDecoder.ckpt" % \ - (config.dataset, config.decoder_type) - - save_path = os.path.join(save_dir, suffix) - - # KL term annealing rate - anneal_r = 1.0 / (config.kl_anneal_hparams["warm_up"] * - (train_data.dataset_size() / config.batch_size)) - - # Model architecture - encoder_w_embedder = tx.modules.WordEmbedder( - vocab_size=train_data.vocab.size, hparams=config.enc_emb_hparams) - input_embed = encoder_w_embedder(data_batch["text_ids"]) - encoder = tx.modules.UnidirectionalRNNEncoder( - hparams={"rnn_cell": config.enc_cell_hparams}) - - decoder_w_embedder = tx.modules.WordEmbedder( - vocab_size=train_data.vocab.size, hparams=config.dec_emb_hparams) - output_w_embed = decoder_w_embedder(data_batch["text_ids"][:, :-1]) - - if config.decoder_type == "lstm": - output_embed = output_w_embed - - decoder = tx.modules.BasicRNNDecoder( - vocab_size=train_data.vocab.size, - hparams={"rnn_cell": config.dec_cell_hparams}) - decoder_initial_state_size = decoder.cell.state_size - elif config.decoder_type == 'transformer': - # position embedding - decoder_p_embedder = tx.modules.SinusoidsPositionEmbedder( - position_size=config.max_pos, hparams=config.dec_pos_emb_hparams) - batch_size = tf.shape(data_batch["text_ids"])[0] - max_seq_len = tf.shape(data_batch["text_ids"])[1] - 1 - batch_max_seq_len = tf.ones([batch_size], tf.int32) * max_seq_len - output_p_embed = decoder_p_embedder(sequence_length=batch_max_seq_len) - - 
output_w_embed = output_w_embed * config.hidden_size ** 0.5 - output_embed = output_w_embed + output_p_embed - - # decoder - decoder = tx.modules.TransformerDecoder( - # tie word embedding with output layer - output_layer=tf.transpose(decoder_w_embedder.embedding, (1, 0)), - hparams=config.trans_hparams) - decoder_initial_state_size = tf.TensorShape( - [1, config.dec_emb_hparams["dim"]]) - else: - raise NotImplementedError - - connector_mlp = tx.modules.MLPTransformConnector( - config.latent_dims * 2) - - connector_stoch = tx.modules.ReparameterizedStochasticConnector( - decoder_initial_state_size) - - # encoder -> connector -> decoder - - _, ecdr_states = encoder( - input_embed, - sequence_length=data_batch["length"]) - - mean_logvar = connector_mlp(ecdr_states) - mean, logvar = tf.split(mean_logvar, 2, 1) - kl_loss = kl_dvg(mean, logvar) - - dst = tfd.MultivariateNormalDiag( - loc=mean, - scale_diag=tf.exp(0.5 * logvar)) - - dcdr_states, latent_z = connector_stoch(dst) - - # decoder - if config.decoder_type == "lstm": - # concat latent variable to input at every time step - latent_z = tf.expand_dims(latent_z, axis=1) - latent_z = tf.tile(latent_z, [1, tf.shape(output_embed)[1], 1]) - output_embed = tf.concat([output_embed, latent_z], axis=2) - - outputs, _, _ = decoder( - initial_state=dcdr_states, - decoding_strategy="train_greedy", - inputs=output_embed, - sequence_length=data_batch["length"] - 1) - else: - outputs = decoder( - inputs=output_embed, - memory=dcdr_states, - memory_sequence_length=tf.ones(tf.shape(dcdr_states)[0])) - - logits = outputs.logits - - seq_lengths = data_batch["length"] - 1 - # Losses & train ops - rc_loss = tx.losses.sequence_sparse_softmax_cross_entropy( - labels=data_batch["text_ids"][:, 1:], - logits=logits, - sequence_length=data_batch["length"] - 1) - - # KL annealing - kl_weight = tf.placeholder(tf.float32, shape=()) - - nll = rc_loss + kl_weight * kl_loss - - learning_rate = tf.placeholder(dtype=tf.float32, shape=(), - name='learning_rate') - train_op = tx.core.get_train_op(nll, learning_rate=learning_rate, - hparams=config.opt_hparams) - - def _run_epoch(sess, epoch, mode_string, display=10): - if mode_string == 'train': - iterator.switch_to_train_data(sess) - elif mode_string == 'valid': - iterator.switch_to_val_data(sess) - elif mode_string == 'test': - iterator.switch_to_test_data(sess) - - step = 0 - start_time = time.time() - num_words = num_sents = 0 - nll_ = 0. - kl_loss_ = rc_loss_ = 0. 
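The `kl_dvg` helper above is the closed-form KL divergence between the diagonal-Gaussian approximate posterior and a standard-normal prior; a standalone numpy rendering of the same formula (function name is illustrative):

```python
# KL(N(mu, diag(exp(logvar))) || N(0, I)) per latent dimension is
#   -0.5 * (logvar - mu^2 - exp(logvar) + 1);
# like kl_dvg, average over the batch axis, then sum over dimensions.
import numpy as np

def kl_divergence(means, logvars):
    kl_per_dim = -0.5 * (logvars - np.square(means)
                         - np.exp(logvars) + 1.0)
    return kl_per_dim.mean(axis=0).sum()
```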
- - while True: - try: - fetches = {"nll": nll, - "kl_loss": kl_loss, - "rc_loss": rc_loss, - "lengths": seq_lengths} - - if mode_string == 'train': - fetches["train_op"] = train_op - opt_vars["kl_weight"] = min( - 1.0, opt_vars["kl_weight"] + anneal_r) - - kl_weight_ = opt_vars["kl_weight"] - else: - kl_weight_ = 1.0 - - mode = (tf.estimator.ModeKeys.TRAIN if mode_string == 'train' - else tf.estimator.ModeKeys.EVAL) - - feed = {tx.global_mode(): mode, - kl_weight: kl_weight_, - learning_rate: opt_vars["learning_rate"]} - - fetches_ = sess.run(fetches, feed_dict=feed) - - batch_size_ = len(fetches_["lengths"]) - num_sents += batch_size_ - - num_words += sum(fetches_["lengths"]) - nll_ += fetches_["nll"] * batch_size_ - kl_loss_ += fetches_["kl_loss"] * batch_size_ - rc_loss_ += fetches_["rc_loss"] * batch_size_ - - if step % display == 0 and mode_string == 'train': - print('%s: epoch %d, step %d, nll %.4f, klw: %.4f, ' - 'KL %.4f, rc %.4f, log_ppl %.4f, ppl %.4f, ' - 'time elapsed: %.1fs' % - (mode_string, epoch, step, nll_ / num_sents, - opt_vars["kl_weight"], kl_loss_ / num_sents, - rc_loss_ / num_sents, nll_ / num_words, - np.exp(nll_ / num_words), time.time() - start_time)) - - sys.stdout.flush() - - step += 1 - - except tf.errors.OutOfRangeError: - print('\n%s: epoch %d, nll %.4f, KL %.4f, rc %.4f, ' - 'log_ppl %.4f, ppl %.4f\n' % - (mode_string, epoch, nll_ / num_sents, - kl_loss_ / num_sents, rc_loss_ / num_sents, - nll_ / num_words, np.exp(nll_ / num_words))) - break - - return nll_ / num_sents, np.exp(nll_ / num_words) - - def _generate(sess, saver, fname=None): - if tf.train.checkpoint_exists(FLAGS.model): - saver.restore(sess, FLAGS.model) - else: - raise ValueError("cannot find checkpoint model") - - batch_size = train_data.batch_size - - dst = tfd.MultivariateNormalDiag( - loc=tf.zeros([batch_size, config.latent_dims]), - scale_diag=tf.ones([batch_size, config.latent_dims])) - - dcdr_states, latent_z = connector_stoch(dst) - - vocab = train_data.vocab - start_tokens = tf.ones(batch_size, tf.int32) * vocab.bos_token_id - end_token = vocab.eos_token_id - - if config.decoder_type == "lstm": - def _cat_embedder(ids): - """Concatenates latent variable to input word embeddings - """ - embedding = decoder_w_embedder(ids) - return tf.concat([embedding, latent_z], axis=1) - - outputs, _, _ = decoder( - initial_state=dcdr_states, - decoding_strategy="infer_sample", - embedding=_cat_embedder, - max_decoding_length=100, - start_tokens=start_tokens, - end_token=end_token) - else: - def _embedding_fn(ids, times): - w_embed = decoder_w_embedder(ids) - p_embed = decoder_p_embedder(times) - return w_embed * config.hidden_size ** 0.5 + p_embed - - outputs, _ = decoder( - memory=dcdr_states, - decoding_strategy="infer_sample", - memory_sequence_length=tf.ones(tf.shape(dcdr_states)[0]), - embedding=_embedding_fn, - max_decoding_length=100, - start_tokens=start_tokens, - end_token=end_token) - - sample_tokens = vocab.map_ids_to_tokens(outputs.sample_id) - sess.run(tf.tables_initializer()) - - feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} - sample_tokens_ = sess.run(sample_tokens, feed_dict=feed) - - if fname is None: - fh = sys.stdout - else: - fh = open(fname, 'w', encoding='utf-8') - - for sent in sample_tokens_: - sent = tx.utils.compat_as_text(list(sent)) - end_id = len(sent) - if vocab.eos_token in sent: - end_id = sent.index(vocab.eos_token) - fh.write(' '.join(sent[:end_id + 1]) + '\n') - - print('Output done') - fh.close() - - saver = tf.train.Saver() - with tf.Session() as 
sess: - # generate samples from prior - if FLAGS.mode == "predict": - _generate(sess, saver, FLAGS.out) - return - - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - - # Counts trainable parameters - total_parameters = 0 - for variable in tf.trainable_variables(): - shape = variable.get_shape() # shape is an array of tf.Dimension - variable_parameters = 1 - for dim in shape: - variable_parameters *= dim.value - total_parameters += variable_parameters - print("%d total parameters" % total_parameters) - - best_nll = best_ppl = 0. - - for epoch in range(config.num_epochs): - _, _ = _run_epoch(sess, epoch, 'train', display=200) - val_nll, _ = _run_epoch(sess, epoch, 'valid') - test_nll, test_ppl = _run_epoch(sess, epoch, 'test') - - if val_nll < opt_vars['best_valid_nll']: - opt_vars['best_valid_nll'] = val_nll - opt_vars['steps_not_improved'] = 0 - best_nll = test_nll - best_ppl = test_ppl - saver.save(sess, save_path) - else: - opt_vars['steps_not_improved'] += 1 - if opt_vars['steps_not_improved'] == decay_ts: - old_lr = opt_vars['learning_rate'] - opt_vars['learning_rate'] *= decay_factor - opt_vars['steps_not_improved'] = 0 - new_lr = opt_vars['learning_rate'] - - print('-----\nchange lr, old lr: %f, new lr: %f\n-----' % - (old_lr, new_lr)) - - saver.restore(sess, save_path) - - decay_cnt += 1 - if decay_cnt == max_decay: - break - - print('\nbest testing nll: %.4f, best testing ppl %.4f\n' % - (best_nll, best_ppl)) - - -if __name__ == '__main__': - tf.app.run(main=_main) diff --git a/requirements.txt b/requirements.txt index 9985daab..de613865 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ -tensorflow >= 1.10.0, < 2.0.0 -tensorflow-gpu >= 1.10.0, < 2.0.0 -tensorflow-probability >= 0.3.0, < 0.8.0 +tensorflow >= 2.0.0 +tensorflow-gpu >= 2.0.0 +tensorflow-probability >= 0.3.0 funcsigs >= 1.0.2 +numpy >= 1.15.4 regex >= 2018.01.10 sentencepiece >= 0.1.8 diff --git a/setup.py b/setup.py index 75a2f2cb..c5027b66 100644 --- a/setup.py +++ b/setup.py @@ -1,21 +1,23 @@ +import sys import setuptools - long_description = ''' -Texar is an open-source toolkit based on Tensorflow, +Texar is an open-source toolkit based on TensorFlow, aiming to support a broad set of machine learning especially text generation tasks, such as machine translation, dialog, summarization, content manipulation, language modeling, and so on. - Texar is designed for both researchers and practitioners for fast prototyping and experimentation. Checkout https://github.com/asyml/texar-pytorch for the PyTorch version which has the same functionalities and (mostly) the same interfaces. 
''' +if sys.version_info < (3, 6): + sys.exit('Python>=3.6 is required by Texar.') + setuptools.setup( name="texar", - version="0.2.4-unreleased", + version="0.4.0-unreleased", url="https://github.com/asyml/texar", description="Toolkit for Machine Learning and Text Generation", @@ -27,22 +29,20 @@ install_requires=[ 'regex>=2018.01.10', - 'numpy<1.17.0', - 'pathlib>=1.0', - 'pyyaml', + 'numpy', 'requests', 'funcsigs>=1.0.2', 'sentencepiece>=0.1.8', - 'packaging' + 'packaging>=19.0' ], extras_require={ 'tensorflow-cpu': [ - 'tensorflow>=1.10.0,<2.0', - 'tensorflow-probability>=0.3.0,<0.8.0' + 'tensorflow>=2.0.0', + 'tensorflow-probability>=0.3.0' ], 'tensorflow-gpu': [ - 'tensorflow-gpu>=1.10.0,<2.0', - 'tensorflow-probability>=0.3.0,<0.8.0' + 'tensorflow-gpu>=2.0.0', + 'tensorflow-probability>=0.3.0' ] }, package_data={ diff --git a/texar/__init__.py b/texar/__init__.py index 94e4eb8a..54624eba 100644 --- a/texar/__init__.py +++ b/texar/__init__.py @@ -11,86 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" -Modules of texar library. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=wildcard-import - -import sys - -if sys.version_info.major < 3: - # PY 2.x, import as is because Texar-PyTorch cannot be installed. - import texar.tf - -else: - # Lazily load Texar-TF modules upon usage. This is to ensure that Texar-TF - # and TensorFlow will not be imported if the user only requires - # Texar-PyTorch modules from `texar.torch`. - - import importlib - - __import_modules__ = [ - "modules", "core", "losses", "models", "data", "evals", - "agents", "run", "utils", - ] - __import_star_modules__ = ["module_base", "hyperparams", "context"] - - def _import_all(): - import warnings - - with warnings.catch_warnings(): - warnings.simplefilter("always", DeprecationWarning) - warnings.warn( - "Importing from `texar` is deprecated. Please import from " - "`texar.tf` instead, e.g. `import texar.tf as tx`", - DeprecationWarning, stacklevel=3) - - from texar.tf.version import VERSION - globals()["__version__"] = VERSION - - for module_name in __import_star_modules__: - # from ... import *. Requires manually handling `__all__`. - module = importlib.import_module("." + module_name, package="texar.tf") - try: - variables = module.__all__ - except AttributeError: - variables = [name for name in module.__dict__ - if not name.startswith("_")] - globals().update({ - name: module.__dict__[name] for name in variables}) - globals()[module_name] = module - - for module_name in __import_modules__: - # from ... import module - module = importlib.import_module("." + module_name, package="texar.tf") - globals()[module_name] = module - - class _DummyTexarBaseModule: - # Credit: https://stackoverflow.com/a/7668273/4909228 - def __getattr__(self, name): - if name in globals(): - # Shortcut to global names. - return globals()[name] - if name in ["torch", "tf"]: - # To use `texar.torch`, Texar-TF and TensorFlow should not be - # imported; To use `texar.tf`, Texar-PyTorch and PyTorch should - # not be imported. - module = importlib.import_module("." + name, package="texar") - globals()[name] = module - return module - - # The user tries to access Texar-TF modules, so we load all modules - # at this point, and restore the registered `texar` module. 
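The block being deleted below implements lazy importing by replacing the registered `texar` module in `sys.modules` with a proxy whose attribute access triggers the real import. A minimal self-contained sketch of that pattern (the package name is a placeholder, and the `__class__` swap is one common variant of the trick, not the exact mechanism used here):

```python
# sys.modules proxy pattern, as a sketch: attribute access on the
# package imports the submodule on demand. "mypkg" is hypothetical.
import importlib
import types


class _LazyPackage(types.ModuleType):
    def __getattr__(self, name):
        module = importlib.import_module("." + name, package=self.__name__)
        setattr(self, name, module)  # cache so __getattr__ runs only once
        return module

# Inside mypkg/__init__.py one could write:
#     import sys
#     sys.modules[__name__].__class__ = _LazyPackage
```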
- sys.modules[__name__] = __module__ - _import_all() - return globals()[name] - - # Save `texar` module as `__module__`, ans replace the system-wide - # registered module with our dummy module. - __module__ = sys.modules[__name__] - sys.modules[__name__] = _DummyTexarBaseModule() diff --git a/texar/tf/__init__.py b/texar/tf/__init__.py index 2cb543d8..81ab699f 100644 --- a/texar/tf/__init__.py +++ b/texar/tf/__init__.py @@ -12,38 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Modules of texar library. +Modules of Texar library. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=wildcard-import - -import pkg_resources -import tensorflow as tf - -VERSION_WARNING = "1.13.2" - - -if (pkg_resources.parse_version(tf.__version__) <= - pkg_resources.parse_version(VERSION_WARNING)): - tf.logging.set_verbosity(tf.logging.ERROR) -else: - tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) - from texar.tf.version import VERSION as __version__ -from texar.tf import agents from texar.tf import core from texar.tf import data -from texar.tf import evals -from texar.tf import losses -from texar.tf import models from texar.tf import modules -from texar.tf import run from texar.tf import utils -from texar.tf.module_base import * from texar.tf.hyperparams import * -from texar.tf.context import * +from texar.tf.module_base import * diff --git a/texar/tf/agents/__init__.py b/texar/tf/agents/__init__.py deleted file mode 100644 index e8c248dd..00000000 --- a/texar/tf/agents/__init__.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Various RL Agents -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=wildcard-import - -from texar.tf.agents.pg_agent import * -from texar.tf.agents.seq_pg_agent import * -from texar.tf.agents.dqn_agent import * -from texar.tf.agents.ac_agent import * -from texar.tf.agents.agent_utils import * -try: - from texar.tf.agents.agent_gym_utils import * -except ImportError: - pass diff --git a/texar/tf/agents/ac_agent.py b/texar/tf/agents/ac_agent.py deleted file mode 100644 index d0e7664b..00000000 --- a/texar/tf/agents/ac_agent.py +++ /dev/null @@ -1,218 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Actor-critic agent. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf -import numpy as np - -from texar.tf.agents.episodic_agent_base import EpisodicAgentBase -from texar.tf.utils import utils - -# pylint: disable=too-many-instance-attributes, protected-access -# pylint: disable=too-many-arguments - -__all__ = [ - "ActorCriticAgent" -] - - -class ActorCriticAgent(EpisodicAgentBase): - """Actor-critic agent for episodic setting. - - An actor-critic algorithm consists of several components: - - - **Actor** is the policy to optimize. As a temporary implementation,\ - here by default we use a :class:`~texar.tf.agents.PGAgent` instance \ - that wraps a `policy net` and provides proper interfaces to perform \ - the role of an actor. - - **Critic** that provides learning signals to the actor. Again, as \ - a temporary implemetation, here by default we use a \ - :class:`~texar.tf.agents.DQNAgent` instance that wraps a `Q net` and \ - provides proper interfaces to perform the role of a critic. - - Args: - env_config: An instance of :class:`~texar.tf.agents.EnvConfig` - specifying action space, observation space, and reward range, etc. - Use :func:`~texar.tf.agents.get_gym_env_config` to create an - EnvConfig from a gym environment. - sess (optional): A tf session. - Can be `None` here and set later with `agent.sess = session`. - actor (optional): An instance of :class:`~texar.tf.agents.PGAgent` that - performs as actor in the algorithm. - If not provided, an actor is created based on :attr:`hparams`. - actor_kwargs (dict, optional): Keyword arguments for actor - constructor. Note that the `hparams` argument for actor - constructor is specified in the "actor_hparams" field of - :attr:`hparams` and should not be included in `actor_kwargs`. - Ignored if :attr:`actor` is given. - critic (optional): An instance of :class:`~texar.tf.agents.DQNAgent` - that performs as critic in the algorithm. - If not provided, a critic is created based on :attr:`hparams`. - critic_kwargs (dict, optional): Keyword arguments for critic - constructor. Note that the `hparams` argument for critic - constructor is specified in the "critic_hparams" field of - :attr:`hparams` and should not be included in `critic_kwargs`. - Ignored if :attr:`critic` is given. - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparamerters will be set to default values. See - :meth:`default_hparams` for the hyperparameter sturcture and - default values. 
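A hypothetical usage sketch following this docstring; the public `reset`/`observe` wrappers around the `_reset`/`_observe` hooks shown below are assumed to come from `EpisodicAgentBase`, which is not part of this diff, so treat the calls as unverified:

```python
# Assumed usage of ActorCriticAgent; the environment, the loop, and the
# reset()/observe() wrapper names are assumptions, not verified API.
import gym
import tensorflow as tf
from texar.tf.agents import ActorCriticAgent, get_gym_env_config

env = gym.make("CartPole-v0")
agent = ActorCriticAgent(env_config=get_gym_env_config(env))

with tf.Session() as sess:
    agent.sess = sess
    sess.run(tf.global_variables_initializer())
    observ = env.reset()
    agent.reset()
    terminal = False
    while not terminal:
        action = agent.get_action(observ)
        observ, reward, terminal, _ = env.step(action)
        agent.observe(reward, terminal)
```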
- """ - - def __init__(self, - env_config, - sess=None, - actor=None, - actor_kwargs=None, - critic=None, - critic_kwargs=None, - hparams=None): - EpisodicAgentBase.__init__(self, env_config=env_config, hparams=hparams) - - self._sess = sess - self._num_actions = self._env_config.action_space.high - \ - self._env_config.action_space.low - - with tf.variable_scope(self.variable_scope): - if actor is None: - kwargs = utils.get_instance_kwargs( - actor_kwargs, self._hparams.actor_hparams) - kwargs.update(dict(env_config=env_config, sess=sess)) - actor = utils.get_instance( - class_or_name=self._hparams.actor_type, - kwargs=kwargs, - module_paths=['texar.tf.agents', 'texar.tf.custom']) - self._actor = actor - - if critic is None: - kwargs = utils.get_instance_kwargs( - critic_kwargs, self._hparams.critic_hparams) - kwargs.update(dict(env_config=env_config, sess=sess)) - critic = utils.get_instance( - class_or_name=self._hparams.critic_type, - kwargs=kwargs, - module_paths=['texar.tf.agents', 'texar.tf.custom']) - self._critic = critic - - if self._actor._discount_factor != self._critic._discount_factor: - raise ValueError('discount_factor of the actor and the critic ' - 'must be the same.') - self._discount_factor = self._actor._discount_factor - - self._observs = [] - self._actions = [] - self._rewards = [] - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values: - - .. role:: python(code) - :language: python - - .. code-block:: python - - { - 'actor_type': 'PGAgent', - 'actor_hparams': None, - 'critic_type': 'DQNAgent', - 'critic_hparams': None, - 'name': 'actor_critic_agent' - } - - Here: - - "actor_type": str or class or instance - Actor. Can be class, its - name or module path, or a class instance. If class name is given, - the class must be from module :mod:`texar.tf.agents` or - :mod:`texar.tf.custom`. Ignored if a `actor` is given to - the agent constructor. - - "actor_kwargs": dict, optional - Hyperparameters for the actor class. With the :attr:`actor_kwargs` - argument to the constructor, an actor is created with - :python:`actor_class(**actor_kwargs, hparams=actor_hparams)`. - - "critic_type": str or class or instance - Critic. Can be class, its - name or module path, or a class instance. If class name is given, - the class must be from module :mod:`texar.tf.agents` or - :mod:`texar.tf.custom`. Ignored if a `critic` is given to - the agent constructor. - - "critic_kwargs": dict, optional - Hyperparameters for the critic class. With the :attr:`critic_kwargs` - argument to the constructor, an critic is created with - :python:`critic_class(**critic_kwargs, hparams=critic_hparams)`. - - "name": str - Name of the agent. - """ - return { - 'actor_type': 'PGAgent', - 'actor_hparams': None, - 'critic_type': 'DQNAgent', - 'critic_hparams': None, - 'name': 'actor_critic_agent' - } - - def _reset(self): - self._actor._reset() - self._critic._reset() - - def _observe(self, reward, terminal, train_policy, feed_dict): - self._train_actor( - observ=self._observ, - action=self._action, - feed_dict=feed_dict) - self._critic._observe(reward, terminal, train_policy, feed_dict) - - def _train_actor(self, observ, action, feed_dict): - qvalues = self._critic._qvalues_from_target(observ=observ) - advantage = qvalues[0][action] - np.mean(qvalues) - # TODO (bowen): should be a funciton to customize? 
- - feed_dict_ = { - self._actor._observ_inputs: [observ], - self._actor._action_inputs: [action], - self._actor._advantage_inputs: [advantage] - } - feed_dict_.update(feed_dict) - - self._actor._train_policy(feed_dict=feed_dict_) - - def get_action(self, observ, feed_dict=None): - self._observ = observ - self._action = self._actor.get_action(observ, feed_dict=feed_dict) - - self._critic._update_observ_action(self._observ, self._action) - - return self._action - - @property - def sess(self): - """The tf session. - """ - return self._sess - - @sess.setter - def sess(self, session): - self._sess = session - self._actor._sess = session - self._critic._sess = session diff --git a/texar/tf/agents/agent_base.py b/texar/tf/agents/agent_base.py deleted file mode 100644 index ee72a02b..00000000 --- a/texar/tf/agents/agent_base.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Base class for reinforcement learning agents. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from texar.tf.hyperparams import HParams -from texar.tf.utils.variables import get_unique_named_variable_scope - -# pylint: disable=too-many-instance-attributes - -__all__ = [ - "AgentBase" -] - - -class AgentBase(object): - """ - Base class inherited by RL agents. - - Args: - TODO - """ - def __init__(self, hparams=None): - self._hparams = HParams(hparams, self.default_hparams()) - - name = self._hparams.name - self._variable_scope = get_unique_named_variable_scope(name) - self._unique_name = self._variable_scope.name.split("/")[-1] - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - TODO - """ - return { - 'name': 'agent' - } - - @property - def variable_scope(self): - """The variable scope of the agent. - """ - return self._variable_scope - - @property - def name(self): - """The name of the module (not uniquified). - """ - return self._unique_name - - @property - def hparams(self): - """A :class:`~texar.tf.hyperparams.HParams` instance. The hyperparameters - of the module. - """ - return self._hparams diff --git a/texar/tf/agents/agent_gym_utils.py b/texar/tf/agents/agent_gym_utils.py deleted file mode 100644 index f1305326..00000000 --- a/texar/tf/agents/agent_gym_utils.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" -Various agent utilities based on OpenAI Gym. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import gym - -__all__ = [ - "convert_gym_space", - "get_gym_env_config" -] - - -def convert_gym_space(spc): - """Converts a :gym:`gym.Space <#spaces>` instance to a - :class:`~texar.tf.agents.Space` instance. - - Args: - spc: An instance of `gym.Space` or - :class:`~texar.tf.agents.Space`. - """ - from texar.tf.agents.agent_utils import Space - if isinstance(spc, Space): - return spc - if isinstance(spc, gym.spaces.Discrete): - return Space(shape=(), low=0, high=spc.n, dtype=spc.dtype) - elif isinstance(spc, gym.spaces.Box): - return Space( - shape=spc.shape, low=spc.low, high=spc.high, dtype=spc.dtype) - - -def get_gym_env_config(env): - """Creates an instance of :class:`~texar.tf.agents.EnvConfig` - from a :gym:`gym env <#environments>`. - - Args: - env: An instance of OpenAI gym Environment. - - Returns: - An instance of :class:`~texar.tf.agents.EnvConfig`. - """ - from texar.tf.agents.agent_utils import EnvConfig - return EnvConfig( - action_space=env.action_space, - observ_space=env.observation_space, - reward_range=env.reward_range) diff --git a/texar/tf/agents/agent_utils.py b/texar/tf/agents/agent_utils.py deleted file mode 100644 index 811049cf..00000000 --- a/texar/tf/agents/agent_utils.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Various agent utilities. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=too-many-arguments, too-few-public-methods, no-member -# pylint: disable=invalid-name, wrong-import-position - -import numpy as np - -gym_utils = None -try: - from texar.tf.agents import agent_gym_utils as gym_utils -except ImportError: - pass - -__all__ = [ - "Space", - "EnvConfig" -] - - -class Space(object): - """Observation and action spaces. Describes valid actions and observations. - Similar to :gym:`gym.Space <#spaces>`. - - Args: - shape (optional): Shape of the space, a tuple. If not - given, infers from :attr:`low` and :attr:`high`. - low (optional): Lower bound (inclusive) of each dimension of the - space. Must have - shape as specified by :attr:`shape`, and of the same shape with - with :attr:`high` (if given). If `None`, set to `-inf` for each - dimension. - high (optional): Upper bound (inclusive) of each dimension of the - space. Must have - shape as specified by :attr:`shape`, and of the same shape with - with :attr:`low` (if given). If `None`, set to `inf` for each - dimension. - dtype (optional): Data type of elements in the space. If not given, - infers from :attr:`low` (if given) or set to `float`. - - Example: - - .. 
code-block:: python - - s = Space(low=0, high=10, dtype=np.int32) - #s.contains(2) == True - #s.contains(10) == True - #s.contains(11) == False - #s.shape == () - - s2 = Space(shape=(2,2), high=np.ones([2,2]), dtype=np.float) - #s2.low == [[-inf, -inf], [-inf, -inf]] - #s2.high == [[1., 1.], [1., 1.]] - """ - def __init__(self, shape=None, low=None, high=None, dtype=None): - if low is None: - low = -float('inf') - if high is None: - high = float('inf') - - if shape is None: - low = np.asarray(low) - high = np.asarray(high) - if low.shape != high.shape: - raise ValueError('`low` and `high` must have the same shape.') - shape = low.shape - else: - shape = tuple(shape) - - if np.isscalar(low): - low = low + np.zeros(shape, dtype=dtype) - if np.isscalar(high): - high = high + np.zeros(shape, dtype=dtype) - if shape != low.shape or shape != high.shape: - raise ValueError( - 'Shape inconsistent: shape={}, low.shape={}, high.shape={}' - .format(shape, low.shape, high.shape)) - if dtype is None: - dtype = low.dtype - dtype = np.dtype(dtype) - low = low.astype(dtype) - high = high.astype(dtype) - self._shape = shape - self._low = low - self._high = high - self._dtype = dtype - - def contains(self, x): - """Checks if x is contained in the space. Returns a `bool`. - """ - x = np.asarray(x) - dtype_match = True - if self._dtype.kind in np.typecodes['AllInteger']: - if x.dtype.kind not in np.typecodes['AllInteger']: - dtype_match = False - shape_match = x.shape == self._shape - low_match = (x >= self._low).all() - high_match = (x <= self._high).all() - return dtype_match and shape_match and low_match and high_match - - @property - def shape(self): - """Shape of the space. - """ - return self._shape - - @property - def low(self): - """Lower bound of the space. - """ - return self._low - - @property - def high(self): - """Upper bound of the space. - """ - return self._high - - @property - def dtype(self): - """Data type of the element. - """ - return self._dtype - - -class EnvConfig(object): - """Configurations of an environment. - - Args: - action_space: An instance of :class:`~texar.tf.agents.Space` or - :gym:`gym.Space <#spaces>`, the action space. - observ_space: An instance of :class:`~texar.tf.agents.Space` or - :gym:`gym.Space <#spaces>`, the observation space. - reward_range: A tuple corresponding to the min and max possible - rewards, e.g., `reward_range=(-1.0, 1.0)`. - """ - - def __init__(self, - action_space, - observ_space, - reward_range): - if gym_utils: - action_space = gym_utils.convert_gym_space(action_space) - observ_space = gym_utils.convert_gym_space(observ_space) - - self.action_space = action_space - self.action_dtype = action_space.dtype - self.action_shape = action_space.shape - - self.observ_space = observ_space - self.observ_dtype = observ_space.dtype - self.observ_shape = observ_space.shape - - self.reward_range = reward_range diff --git a/texar/tf/agents/agent_utils_test.py b/texar/tf/agents/agent_utils_test.py deleted file mode 100644 index 269b4abf..00000000 --- a/texar/tf/agents/agent_utils_test.py +++ /dev/null @@ -1,44 +0,0 @@ -# -""" -Unit tests for agent utilities. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -# pylint: disable=no-member, invalid-name, too-many-arguments - -import numpy as np - -import tensorflow as tf - -from texar.tf.agents.agent_utils import Space - - -class SpaceTest(tf.test.TestCase): - """Tests the Space class. 
-    """
-
-    def _test_space(self, s, shape, low, high, dtype):
-        self.assertEqual(s.shape, shape)
-        self.assertEqual(s.low, low)
-        self.assertEqual(s.high, high)
-        self.assertEqual(s.dtype, dtype)
-
-    def test_space(self):
-        """Tests discrete space.
-        """
-        s = Space(shape=(), low=0, high=10, dtype=np.int32)
-        self._test_space(s, (), 0, 10, np.dtype(np.int32))
-        self.assertTrue(s.contains(5))
-        self.assertFalse(s.contains(5.))
-        self.assertFalse(s.contains(15))
-
-        s = Space(low=0, high=10, dtype=np.int32)
-        self._test_space(s, (), 0, 10, np.dtype(np.int32))
-
-
-if __name__ == "__main__":
-    tf.test.main()
diff --git a/texar/tf/agents/dqn_agent.py b/texar/tf/agents/dqn_agent.py
deleted file mode 100644
index e5e608b4..00000000
--- a/texar/tf/agents/dqn_agent.py
+++ /dev/null
@@ -1,413 +0,0 @@
-# Copyright 2018 The Texar Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Deep Q learning Agent.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import random
-import numpy as np
-
-import tensorflow as tf
-
-from texar.tf import context
-from texar.tf.agents.episodic_agent_base import EpisodicAgentBase
-from texar.tf.utils import utils
-from texar.tf.core import optimization as opt
-
-# pylint: disable=too-many-instance-attributes, too-many-arguments
-# pylint: disable=invalid-name
-
-__all__ = [
-    "DQNAgent"
-]
-
-
-class DQNAgent(EpisodicAgentBase):
-    """Deep Q learning agent for episodic setting.
-
-    A Q learning algorithm consists of several components:
-
-        - A **Q-net** takes in a state and returns Q-value for action sampling.
-          See :class:`~texar.tf.modules.CategoricalQNet` for an example Q-net
-          class and required interface.
-        - A **replay memory** manages past experience for Q-net updates. See\
-          :class:`~texar.tf.core.DequeReplayMemory` for an example replay memory\
-          class and required interface.
-        - An **exploration** instance that specifies the exploration strategy\
-          used to train the Q-net. See\
-          :class:`~texar.tf.core.EpsilonLinearDecayExploration` for an example\
-          class and required interface.
-
-    Args:
-        env_config: An instance of :class:`~texar.tf.agents.EnvConfig`
-            specifying action space, observation space, and reward range, etc.
-            Use :func:`~texar.tf.agents.get_gym_env_config` to create an
-            EnvConfig from a gym environment.
-        sess (optional): A tf session.
-            Can be `None` here and set later with `agent.sess = session`.
-        qnet (optional): A Q network that predicts Q values given states.
-            If not given, a Q network is created based on :attr:`hparams`.
-        target (optional): A target network to compute target Q values.
-        qnet_kwargs (dict, optional): Keyword arguments for qnet
-            constructor. Note that the `hparams` argument for network
-            constructor is specified in the "qnet_hparams" field of
-            :attr:`hparams` and should not be included in `qnet_kwargs`.
-            Ignored if :attr:`qnet` is given.
-        qnet_caller_kwargs (dict, optional): Keyword arguments for
-            calling `qnet` to get Q values. 
The `qnet` is called with
-            :python:`outputs=qnet(inputs=observation, **qnet_caller_kwargs)`
-        replay_memory (optional): A replay memory instance.
-            If not given, a replay memory is created based on :attr:`hparams`.
-        replay_memory_kwargs (dict, optional): Keyword arguments for
-            replay_memory constructor.
-            Ignored if :attr:`replay_memory` is given.
-        exploration (optional): An exploration instance used in the algorithm.
-            If not given, an exploration instance is created based on
-            :attr:`hparams`.
-        exploration_kwargs (dict, optional): Keyword arguments for exploration
-            class constructor. Ignored if :attr:`exploration` is given.
-        hparams (dict or HParams, optional): Hyperparameters. Missing
-            hyperparameters will be set to default values. See
-            :meth:`default_hparams` for the hyperparameter structure and
-            default values.
-    """
-    def __init__(self,
-                 env_config,
-                 sess=None,
-                 qnet=None,
-                 target=None,
-                 qnet_kwargs=None,
-                 qnet_caller_kwargs=None,
-                 replay_memory=None,
-                 replay_memory_kwargs=None,
-                 exploration=None,
-                 exploration_kwargs=None,
-                 hparams=None):
-        EpisodicAgentBase.__init__(self, env_config, hparams)
-
-        self._sess = sess
-        self._cold_start_steps = self._hparams.cold_start_steps
-        self._sample_batch_size = self._hparams.sample_batch_size
-        self._update_period = self._hparams.update_period
-        self._discount_factor = self._hparams.discount_factor
-        self._target_update_strategy = self._hparams.target_update_strategy
-        self._num_actions = self._env_config.action_space.high - \
-                            self._env_config.action_space.low
-
-        with tf.variable_scope(self.variable_scope):
-            if qnet is None:
-                kwargs = utils.get_instance_kwargs(
-                    qnet_kwargs, self._hparams.qnet_hparams)
-                qnet = utils.check_or_get_instance(
-                    ins_or_class_or_name=self._hparams.qnet_type,
-                    kwargs=kwargs,
-                    module_paths=['texar.tf.modules', 'texar.tf.custom'])
-                target = utils.check_or_get_instance(
-                    ins_or_class_or_name=self._hparams.qnet_type,
-                    kwargs=kwargs,
-                    module_paths=['texar.tf.modules', 'texar.tf.custom'])
-            self._qnet = qnet
-            self._target = target
-            self._qnet_caller_kwargs = qnet_caller_kwargs or {}
-
-            if replay_memory is None:
-                kwargs = utils.get_instance_kwargs(
-                    replay_memory_kwargs, self._hparams.replay_memory_hparams)
-                replay_memory = utils.check_or_get_instance(
-                    ins_or_class_or_name=self._hparams.replay_memory_type,
-                    kwargs=kwargs,
-                    module_paths=['texar.tf.core', 'texar.tf.custom'])
-            self._replay_memory = replay_memory
-
-            if exploration is None:
-                kwargs = utils.get_instance_kwargs(
-                    exploration_kwargs, self._hparams.exploration_hparams)
-                exploration = utils.check_or_get_instance(
-                    ins_or_class_or_name=self._hparams.exploration_type,
-                    kwargs=kwargs,
-                    module_paths=['texar.tf.core', 'texar.tf.custom'])
-            self._exploration = exploration
-
-        self._build_graph()
-
-        self._observ = None
-        self._action = None
-        self._timestep = 0
-
-    @staticmethod
-    def default_hparams():
-        """Returns a dictionary of hyperparameters with default values:
-
-        .. role:: python(code)
-            :language: python
-
-        .. 
code-block:: python

-            {
-                'qnet_type': 'CategoricalQNet',
-                'qnet_hparams': None,
-                'replay_memory_type': 'DequeReplayMemory',
-                'replay_memory_hparams': None,
-                'exploration_type': 'EpsilonLinearDecayExploration',
-                'exploration_hparams': None,
-                'optimization': opt.default_optimization_hparams(),
-                'target_update_strategy': 'copy',
-                'cold_start_steps': 100,
-                'sample_batch_size': 32,
-                'update_period': 100,
-                'discount_factor': 0.95,
-                'name': 'dqn_agent'
-            }
-
-        Here:
-
-        "qnet_type": str or class or instance
-            Q-value net. Can be a class, its name or module path, or a class
-            instance. If a class name is given, the class must be from module
-            :mod:`texar.tf.modules` or :mod:`texar.tf.custom`. Ignored if a
-            `qnet` is given to the agent constructor.
-
-        "qnet_hparams": dict, optional
-            Hyperparameters for the Q net. With the :attr:`qnet_kwargs`
-            argument to the constructor, a network is created with
-            :python:`qnet_class(**qnet_kwargs, hparams=qnet_hparams)`.
-
-        "replay_memory_type": str or class or instance
-            Replay memory class. Can be a class, its name or module path,
-            or a class instance.
-            If a class name is given, the class must be from module
-            :mod:`texar.tf.core` or :mod:`texar.tf.custom`.
-            Ignored if a `replay_memory` is given to the agent constructor.
-
-        "replay_memory_hparams": dict, optional
-            Hyperparameters for the replay memory. With the
-            :attr:`replay_memory_kwargs` argument to the constructor,
-            an instance is created with
-            :python:`replay_memory_class(
-            **replay_memory_kwargs, hparams=replay_memory_hparams)`.
-
-        "exploration_type": str or class or instance
-            Exploration class. Can be a class, its name or module path, or a
-            class instance. If a class name is given, the class must be from
-            module :mod:`texar.tf.core` or :mod:`texar.tf.custom`. Ignored if
-            an `exploration` is given to the agent constructor.
-
-        "exploration_hparams": dict, optional
-            Hyperparameters for the exploration class.
-            With the :attr:`exploration_kwargs` argument to the constructor,
-            an instance is created with :python:`exploration_class(
-            **exploration_kwargs, hparams=exploration_hparams)`.
-
-        "optimization": dict
-            Hyperparameters of optimization for updating the Q-net.
-            See :func:`~texar.tf.core.default_optimization_hparams` for details.
-
-        "cold_start_steps": int
-            The number of initial steps during which the Q-net is not
-            trained, so that the replay memory can accumulate experience
-            first.
-
-        "sample_batch_size": int
-            The number of samples drawn from the replay memory for each
-            training step.
-
-        "target_update_strategy": string
-
-            - If **"copy"**, the target network is assigned the parameters \
-            of the Q-net every :attr:`"update_period"` steps.
-            - If **"tau"**, the target is soft-updated as \
-            ``(1 - 1/update_period) * target + 1/update_period * qnet``.
-
-        "update_period": int
-            Frequency of updating the target network, i.e., the target is
-            updated once every "update_period" steps.
-
-        "discount_factor": float
-            The discount factor of reward.
-
-        "name": str
-            Name of the agent.
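-
-        Example:
-
-            A minimal sketch of overriding a few of these defaults
-            (assuming `env_config` has been created elsewhere, e.g., with
-            :func:`~texar.tf.agents.get_gym_env_config`):
-
-            .. code-block:: python
-
-                agent = DQNAgent(
-                    env_config,
-                    hparams={'sample_batch_size': 64,
-                             'update_period': 200,
-                             'target_update_strategy': 'tau'})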
-        """
-        return {
-            'qnet_type': 'CategoricalQNet',
-            'qnet_hparams': None,
-            'replay_memory_type': 'DequeReplayMemory',
-            'replay_memory_hparams': None,
-            'exploration_type': 'EpsilonLinearDecayExploration',
-            'exploration_hparams': None,
-            'optimization': opt.default_optimization_hparams(),
-            'target_update_strategy': 'copy',
-            'cold_start_steps': 100,
-            'sample_batch_size': 32,
-            'update_period': 100,
-            'discount_factor': 0.95,
-            'name': 'dqn_agent'
-        }
-
-    def _build_graph(self):
-        with tf.variable_scope(self.variable_scope):
-            self._observ_inputs = tf.placeholder(
-                dtype=self._env_config.observ_dtype,
-                shape=[None, ] + list(self._env_config.observ_shape),
-                name='observ_inputs')
-            self._action_inputs = tf.placeholder(
-                dtype=self._env_config.action_dtype,
-                shape=[None, self._num_actions],
-                name='action_inputs')
-            self._y_inputs = tf.placeholder(
-                dtype=tf.float32,
-                shape=[None, ],
-                name='y_inputs')
-
-            self._qnet_outputs = self._get_qnet_outputs(self._observ_inputs)
-            self._target_outputs = self._get_target_outputs(
-                self._observ_inputs)
-            self._td_error = self._get_td_error(
-                qnet_qvalues=self._qnet_outputs['qvalues'],
-                actions=self._action_inputs,
-                y=self._y_inputs)
-            self._train_op = self._get_train_op()
-
-            if self._target_update_strategy == 'copy':
-                self._update_op = self._get_copy_update_op()
-            elif self._target_update_strategy == 'tau':
-                self._update_op = self._get_tau_update_op()
-
-    def _get_qnet_outputs(self, state_inputs):
-        return self._qnet(inputs=state_inputs, **self._qnet_caller_kwargs)
-
-    def _get_target_outputs(self, state_inputs):
-        return self._target(inputs=state_inputs, **self._qnet_caller_kwargs)
-
-    def _get_td_error(self, qnet_qvalues, actions, y):
-        return y - tf.reduce_sum(
-            qnet_qvalues * tf.cast(actions, tf.float32), axis=1)
-
-    def _get_train_op(self):
-        train_op = opt.get_train_op(
-            loss=tf.reduce_sum(self._td_error ** 2),
-            variables=self._qnet.trainable_variables,
-            hparams=self._hparams.optimization.todict())
-        return train_op
-
-    def _get_copy_update_op(self):
-        op = []
-        for i in range(len(self._qnet.trainable_variables)):
-            op.append(tf.assign(ref=self._target.trainable_variables[i],
-                                value=self._qnet.trainable_variables[i]))
-        return op
-
-    def _get_tau_update_op(self):
-        tau = 1. / self._update_period
-        op = []
-        for i in range(len(self._qnet.trainable_variables)):
-            value_ = (1. - tau) * self._target.trainable_variables[i] + \
-                     tau * self._qnet.trainable_variables[i]
-            op.append(tf.assign(
-                ref=self._target.trainable_variables[i], value=value_))
-        return op
-
-    def _observe(self, reward, terminal, train_policy, feed_dict):
-        if self._timestep > self._cold_start_steps and train_policy:
-            self._train_qnet(feed_dict)
-
-        action_one_hot = [0.] * self._num_actions
-        action_one_hot[self._action] = 1.
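-        # Record the transition; `next_observ` is left as None here and is
-        # filled in by the next call to `_update_observ_action`.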
- - self._replay_memory.add(dict( - observ=self._observ, - action=action_one_hot, - reward=reward, - terminal=terminal, - next_observ=None)) - self._timestep += 1 - - def _train_qnet(self, feed_dict): - minibatch = self._replay_memory.get(self._sample_batch_size) - observ_batch = np.array([data['observ'] for data in minibatch]) - action_batch = np.array([data['action'] for data in minibatch]) - reward_batch = np.array([data['reward'] for data in minibatch]) - terminal_batch = np.array([data['terminal'] for data in minibatch]) - next_observ_batch = \ - np.array([data['next_observ'] for data in minibatch]) - - target_qvalue = self._sess.run( - self._target_outputs['qvalues'], feed_dict={ - self._observ_inputs: next_observ_batch, - context.global_mode(): tf.estimator.ModeKeys.PREDICT}) - - y_batch = reward_batch - for i in range(self._sample_batch_size): - if not terminal_batch[i]: - y_batch[i] += self._discount_factor * np.max(target_qvalue[i]) - - feed_dict_ = { - self._observ_inputs: observ_batch, - self._y_inputs: y_batch, - self._action_inputs: action_batch - } - feed_dict_.update(feed_dict or {}) - - self._sess.run(self._train_op, feed_dict=feed_dict_) - - self._update_target(feed_dict) - - def _update_target(self, feed_dict): - if self._target_update_strategy == 'tau' or ( - self._target_update_strategy == 'copy' and - self._timestep % self._update_period == 0): - self._sess.run(self._update_op, feed_dict=feed_dict) - - def _qvalues_from_qnet(self, observ): - return self._sess.run( - self._qnet_outputs['qvalues'], - feed_dict={self._observ_inputs: np.array([observ]), - context.global_mode(): tf.estimator.ModeKeys.PREDICT}) - - def _qvalues_from_target(self, observ): - return self._sess.run( - self._target_outputs['qvalues'], - feed_dict={self._observ_inputs: np.array([observ]), - context.global_mode(): tf.estimator.ModeKeys.PREDICT}) - - def _update_observ_action(self, observ, action): - self._observ = observ - self._action = action - if self._replay_memory.size() > 0: - self._replay_memory.last()['next_observ'] = self._observ - - def _get_action(self, observ, feed_dict=None): - qvalue = self._qvalues_from_qnet(observ) - - if random.random() < self._exploration.get_epsilon(self._timestep): - action = random.randrange(self._num_actions) - else: - action = np.argmax(qvalue) - - self._update_observ_action(observ, action) - - return action - - def _reset(self): - self._observ = None - self._action = None - - @property - def sess(self): - """The tf session. - """ - return self._sess - - @sess.setter - def sess(self, session): - self._sess = session diff --git a/texar/tf/agents/episodic_agent_base.py b/texar/tf/agents/episodic_agent_base.py deleted file mode 100644 index 8cad3ef5..00000000 --- a/texar/tf/agents/episodic_agent_base.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Base class for episodic reinforcement learning agents. 
-""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from texar.tf.agents.agent_base import AgentBase - -# pylint: disable=too-many-instance-attributes - - -class EpisodicAgentBase(AgentBase): - """Base class inherited by episodic RL agents. - - An agent is a wrapper of the **training process** that trains a model - with RL algorithms. Agent itself does not create new trainable variables. - - An episodic RL agent typically provides 3 interfaces, namely, :meth:`reset`, - :meth:`get_action` and :meth:`observe`, and is used as the following - example. - - Example: - - .. code-block:: python - - env = SomeEnvironment(...) - agent = PGAgent(...) - - while True: - # Starts one episode - agent.reset() - observ = env.reset() - while True: - action = agent.get_action(observ) - next_observ, reward, terminal = env.step(action) - agent.observe(reward, terminal) - observ = next_observ - if terminal: - break - - Args: - env_config: An instance of :class:`~texar.tf.agents.EnvConfig` - specifying action space, observation space, and reward range, etc. - Use :func:`~texar.tf.agents.get_gym_env_config` to create an - EnvConfig from a gym environment. - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparamerter will be set to default values. See - :meth:`default_hparams` for the hyperparameter sturcture and - default values. - """ - def __init__(self, env_config, hparams=None): - AgentBase.__init__(self, hparams) - - self._env_config = env_config - - self._reset_tmplt_fn = tf.make_template( - "{}_reset".format(self.name), self._reset) - self._observe_tmplt_fn = tf.make_template( - "{}_observe".format(self.name), self._observe) - self._get_action_tmplt_fn = tf.make_template( - "{}_get_action".format(self.name), self._get_action) - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - .. code-block:: python - - { - "name": "agent" - } - """ - return { - 'name': 'agent' - } - - def reset(self): - """Resets the states to begin new episode. - """ - self._reset_tmplt_fn() - - def _reset(self): - raise NotImplementedError - - def observe(self, reward, terminal, train_policy=True, feed_dict=None): - """Observes experience from environment. - - Args: - reward: Reward of the action. The configuration (e.g., shape) of - the reward is defined in :attr:`env_config`. - terminal (bool): Whether the episode is terminated. - train_policy (bool): Wether to update the policy for this step. - feed_dict (dict, optional): Any stuffs fed to running the training - operator. - """ - return self._observe_tmplt_fn(reward, terminal, train_policy, feed_dict) - - def _observe(self, reward, terminal, train_policy, feed_dict): - raise NotImplementedError - - def get_action(self, observ, feed_dict=None): - """Gets action according to observation. - - Args: - observ: Observation from the environment. - - Returns: - action from the policy. - """ - return self._get_action_tmplt_fn(observ, feed_dict) - - def _get_action(self, observ, feed_dict): - raise NotImplementedError - - @property - def env_config(self): - """Environment configuration. - """ - return self._env_config diff --git a/texar/tf/agents/pg_agent.py b/texar/tf/agents/pg_agent.py deleted file mode 100644 index 40536675..00000000 --- a/texar/tf/agents/pg_agent.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. 
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Policy Gradient agent.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# pylint: disable=too-many-instance-attributes, too-many-arguments
-
-import tensorflow as tf
-
-from texar.tf.agents.episodic_agent_base import EpisodicAgentBase
-from texar.tf.utils import utils
-from texar.tf.core import optimization as opt
-from texar.tf.losses import pg_losses as losses
-from texar.tf.losses.rewards import discount_reward
-
-
-class PGAgent(EpisodicAgentBase):
-    """Policy gradient agent for episodic setting. This agent supports
-    **un-batched** training, i.e., at each step it generates one action,
-    takes one observation, and updates the policy.
-
-    The policy must take in an observation of shape `[1] + observation_shape`,
-    where the first dimension 1 stands for batch dimension, and output a `dict`
-    containing:
-
-    - Key **"action"** whose value is a Tensor of shape \
-    `[1] + action_shape` containing a single action.
-    - One of keys "log_prob" or "dist":
-
-        - **"log_prob"**: A Tensor of shape `[1]`, the log probability of the \
-        "action".
-        - **"dist"**: A \
-        :tf_main:`tf.distributions.Distribution`\
-        with the `log_prob` interface and \
-        `log_prob = dist.log_prob(outputs["action"])`.
-
-    .. role:: python(code)
-        :language: python
-
-    Args:
-        env_config: An instance of :class:`~texar.tf.agents.EnvConfig`
-            specifying action space, observation space, and reward range, etc.
-            Use :func:`~texar.tf.agents.get_gym_env_config` to create an
-            EnvConfig from a gym environment.
-        sess (optional): A tf session.
-            Can be `None` here and set later with `agent.sess = session`.
-        policy (optional): A policy net that takes in observation and outputs
-            actions and probabilities.
-            If not given, a policy network is created based on :attr:`hparams`.
-        policy_kwargs (dict, optional): Keyword arguments for policy
-            constructor. Note that the `hparams` argument for network
-            constructor is specified in the "policy_hparams" field of
-            :attr:`hparams` and should not be included in `policy_kwargs`.
-            Ignored if :attr:`policy` is given.
-        policy_caller_kwargs (dict, optional): Keyword arguments for
-            calling the policy to get actions. The policy is called with
-            :python:`outputs=policy(inputs=observation, **policy_caller_kwargs)`
-        learning_rate (optional): Learning rate for policy optimization. If
-            not given, the learning rate is determined from :attr:`hparams`.
-            See :func:`~texar.tf.core.get_train_op` for more details.
-        hparams (dict or HParams, optional): Hyperparameters. Missing
-            hyperparameters will be set to default values. See
-            :meth:`default_hparams` for the hyperparameter structure and
-            default values.
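-
-    Example:
-
-        A minimal sketch of one training episode (assuming `env` is a gym
-        environment and `sess` a tf session created elsewhere; this mirrors
-        the generic loop in :class:`EpisodicAgentBase`):
-
-        .. code-block:: python
-
-            agent = PGAgent(get_gym_env_config(env), sess=sess)
-
-            agent.reset()
-            observ = env.reset()
-            while True:
-                action = agent.get_action(observ)
-                observ, reward, terminal, _ = env.step(action)
-                # The policy is updated automatically at episode end
-                agent.observe(reward, terminal)
-                if terminal:
-                    break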
- """ - def __init__(self, - env_config, - sess=None, - policy=None, - policy_kwargs=None, - policy_caller_kwargs=None, - learning_rate=None, - hparams=None): - EpisodicAgentBase.__init__(self, env_config, hparams) - - self._sess = sess - self._lr = learning_rate - self._discount_factor = self._hparams.discount_factor - - with tf.variable_scope(self.variable_scope): - if policy is None: - kwargs = utils.get_instance_kwargs( - policy_kwargs, self._hparams.policy_hparams) - policy = utils.check_or_get_instance( - self._hparams.policy_type, - kwargs, - module_paths=['texar.tf.modules', 'texar.tf.custom']) - self._policy = policy - self._policy_caller_kwargs = policy_caller_kwargs or {} - - self._observs = [] - self._actions = [] - self._rewards = [] - - self._train_outputs = None - - self._build_graph() - - def _build_graph(self): - with tf.variable_scope(self.variable_scope): - self._observ_inputs = tf.placeholder( - dtype=self._env_config.observ_dtype, - shape=[None, ] + list(self._env_config.observ_shape), - name='observ_inputs') - self._action_inputs = tf.placeholder( - dtype=self._env_config.action_dtype, - shape=[None, ] + list(self._env_config.action_shape), - name='action_inputs') - self._advantage_inputs = tf.placeholder( - dtype=tf.float32, - shape=[None, ], - name='advantages_inputs') - - self._outputs = self._get_policy_outputs() - - self._pg_loss = self._get_pg_loss() - - self._train_op = self._get_train_op() - - def _get_policy_outputs(self): - outputs = self._policy( - inputs=self._observ_inputs, **self._policy_caller_kwargs) - return outputs - - def _get_pg_loss(self): - if 'log_prob' in self._outputs: - log_probs = self._outputs['log_prob'] - elif 'dist' in self._outputs: - log_probs = self._outputs['dist'].log_prob(self._action_inputs) - else: - raise ValueError('Outputs of the policy must have one of ' - '"log_prob" or "dist".') - pg_loss = losses.pg_loss_with_log_probs( - log_probs=log_probs, - advantages=self._advantage_inputs, - average_across_timesteps=True, - sum_over_timesteps=False) - return pg_loss - - def _get_train_op(self): - train_op = opt.get_train_op( - loss=self._pg_loss, - variables=self._policy.trainable_variables, - learning_rate=self._lr, - hparams=self._hparams.optimization.todict()) - return train_op - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values: - - .. role:: python(code) - :language: python - - .. code-block:: python - - { - 'policy_type': 'CategoricalPolicyNet', - 'policy_hparams': None, - 'discount_factor': 0.95, - 'normalize_reward': False, - 'optimization': default_optimization_hparams(), - 'name': 'pg_agent', - } - - Here: - - "policy_type": str or class or instance - Policy net. Can be class, its name or module path, or a class - instance. If class name is given, the class must be from module - :mod:`texar.tf.modules` or :mod:`texar.tf.custom`. Ignored if a - `policy` is given to the agent constructor. - - "policy_hparams": dict, optional - Hyperparameters for the policy net. With the :attr:`policy_kwargs` - argument to the constructor, a network is created with - :python:`policy_class(**policy_kwargs, hparams=policy_hparams)`. - - "discount_factor": float - The discount factor of reward. - - "normalize_reward": bool - Whether to normalize the discounted reward, by - `(discounted_reward - mean) / std`. - - "optimization": dict - Hyperparameters of optimization for updating the policy net. - See :func:`~texar.tf.core.default_optimization_hparams` for details. 
- - "name": str - Name of the agent. - """ - return { - 'policy_type': 'CategoricalPolicyNet', - 'policy_hparams': None, - 'discount_factor': 0.95, - 'normalize_reward': False, - 'optimization': opt.default_optimization_hparams(), - 'name': 'pg_agent', - } - - def _reset(self): - self._observs = [] - self._actions = [] - self._rewards = [] - - def _get_action(self, observ, feed_dict): - fetches = { - "action": self._outputs['action'] - } - - feed_dict_ = {self._observ_inputs: [observ, ]} - feed_dict_.update(feed_dict or {}) - - vals = self._sess.run(fetches, feed_dict=feed_dict_) - action = vals['action'] - action = action[0] # Removes the batch dimension - - self._observs.append(observ) - self._actions.append(action) - - return action - - def _observe(self, reward, terminal, train_policy, feed_dict): - self._rewards.append(reward) - - if terminal and train_policy: - self._train_policy(feed_dict=feed_dict) - - def _train_policy(self, feed_dict=None): - """Updates the policy. - - Args: - TODO - """ - qvalues = discount_reward( - [self._rewards], discount=self._hparams.discount_factor, - normalize=self._hparams.normalize_reward) - qvalues = qvalues[0, :] - - fetches = dict(loss=self._train_op) - feed_dict_ = { - self._observ_inputs: self._observs, - self._action_inputs: self._actions, - self._advantage_inputs: qvalues} - feed_dict_.update(feed_dict or {}) - - self._train_outputs = self._sess.run(fetches, feed_dict=feed_dict_) - - @property - def sess(self): - """The tf session. - """ - return self._sess - - @sess.setter - def sess(self, session): - self._sess = session - - @property - def policy(self): - """The policy model. - """ - return self._policy diff --git a/texar/tf/agents/seq_agent_base.py b/texar/tf/agents/seq_agent_base.py deleted file mode 100644 index 260c3528..00000000 --- a/texar/tf/agents/seq_agent_base.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Base class for reinforcement learning agents for sequence prediction. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from texar.tf.agents.agent_base import AgentBase - -# pylint: disable=too-many-instance-attributes - - -class SeqAgentBase(AgentBase): - """ - Base class inherited by sequence prediction RL agents. - - Args: - TODO - """ - def __init__(self, hparams=None): - AgentBase.__init__(self, hparams) - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - TODO - """ - return { - 'name': 'agent' - } diff --git a/texar/tf/agents/seq_pg_agent.py b/texar/tf/agents/seq_pg_agent.py deleted file mode 100644 index e7a6c7e3..00000000 --- a/texar/tf/agents/seq_pg_agent.py +++ /dev/null @@ -1,439 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Policy Gradient agent for sequence prediction.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# pylint: disable=too-many-instance-attributes, too-many-arguments, no-member
-
-import tensorflow as tf
-
-from texar.tf.agents.seq_agent_base import SeqAgentBase
-from texar.tf.core import optimization as opt
-from texar.tf.losses.pg_losses import pg_loss_with_logits
-from texar.tf.losses.rewards import discount_reward
-from texar.tf.losses.entropy import sequence_entropy_with_logits
-
-__all__ = [
-    "SeqPGAgent"
-]
-
-
-class SeqPGAgent(SeqAgentBase):
-    """Policy Gradient agent for sequence prediction.
-
-    This is a wrapper of the **training process** that trains a model
-    with policy gradient. Agent itself does not create new trainable variables.
-
-    Args:
-        samples: An `int` Tensor of shape `[batch_size, max_time]` containing
-            sampled sequences from the model.
-        logits: A float Tensor of shape `[batch_size, max_time, vocab_size]`
-            containing the logits of samples from the model.
-        sequence_length: A Tensor of shape `[batch_size]`.
-            Time steps beyond the respective sequence lengths are masked out.
-        trainable_variables (optional): Trainable variables of the model to
-            update during training. If `None`, all trainable variables in the
-            graph are used.
-        learning_rate (optional): Learning rate for policy optimization. If
-            not given, the learning rate is determined from :attr:`hparams`.
-            See :func:`~texar.tf.core.get_train_op` for more details.
-        sess (optional): A tf session.
-            Can be `None` here and set later with `agent.sess = session`.
-        hparams (dict or HParams, optional): Hyperparameters. Missing
-            hyperparameters will be set to default values. See
-            :meth:`default_hparams` for the hyperparameter structure and
-            default values.
-
-    Example:
-
-        .. code-block:: python
-
-            ## Train a decoder with policy gradient
-            decoder = BasicRNNDecoder(...)
-            outputs, _, sequence_length = decoder(
-                decoding_strategy='infer_sample', ...)
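-            # `outputs.sample_id` and `outputs.logits` are the sample and
-            # logit tensors from which the agent builds its policy-gradient
-            # loss.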
-
-            sess = tf.Session()
-            agent = SeqPGAgent(
-                samples=outputs.sample_id,
-                logits=outputs.logits,
-                sequence_length=sequence_length,
-                sess=sess)
-            while training:
-                # Generate samples
-                vals = agent.get_samples()
-                # Evaluate reward
-                sample_text = tx.utils.map_ids_to_strs(vals['samples'], vocab)
-                reward_bleu = []
-                for y, y_ in zip(ground_truth, sample_text):
-                    reward_bleu.append(tx.evals.sentence_bleu(y, y_))
-                # Update
-                agent.observe(reward=reward_bleu)
-    """
-    def __init__(self,
-                 samples,
-                 logits,
-                 sequence_length,
-                 trainable_variables=None,
-                 learning_rate=None,
-                 sess=None,
-                 hparams=None):
-        SeqAgentBase.__init__(self, hparams)
-
-        self._lr = learning_rate
-
-        # Tensors
-        self._samples = samples
-        self._logits = logits
-        self._sequence_length = sequence_length
-        self._trainable_variables = trainable_variables
-
-        # Python values
-        self._samples_py = None
-        self._sequence_length_py = None
-        self._rewards = None
-
-        self._sess = sess
-
-        # For session partial run
-        self._partial_run_handle = None
-        self._qvalue_inputs_fed = False
-
-        self._build_graph()
-
-    def _build_graph(self):
-        with tf.variable_scope(self.variable_scope):
-            self._qvalue_inputs = tf.placeholder(
-                dtype=tf.float32,
-                shape=[None, None],
-                name='qvalue_inputs')
-            self._pg_loss = self._get_pg_loss()
-            self._train_op = self._get_train_op()
-
-    def _get_pg_loss(self):
-        loss_hparams = self._hparams.loss
-        pg_loss = pg_loss_with_logits(
-            actions=self._samples,
-            logits=self._logits,
-            sequence_length=self._sequence_length,
-            advantages=self._qvalue_inputs,
-            batched=True,
-            average_across_batch=loss_hparams.average_across_batch,
-            average_across_timesteps=loss_hparams.average_across_timesteps,
-            sum_over_batch=loss_hparams.sum_over_batch,
-            sum_over_timesteps=loss_hparams.sum_over_timesteps,
-            time_major=loss_hparams.time_major)
-
-        if self._hparams.entropy_weight > 0:
-            entropy = self._get_entropy()
-            pg_loss -= self._hparams.entropy_weight * entropy
-
-        return pg_loss
-
-    def _get_entropy(self):
-        loss_hparams = self._hparams.loss
-        return sequence_entropy_with_logits(
-            self._logits,
-            sequence_length=self._sequence_length,
-            average_across_batch=loss_hparams.average_across_batch,
-            average_across_timesteps=loss_hparams.average_across_timesteps,
-            sum_over_batch=loss_hparams.sum_over_batch,
-            sum_over_timesteps=loss_hparams.sum_over_timesteps,
-            time_major=loss_hparams.time_major)
-
-    def _get_train_op(self):
-        train_op = opt.get_train_op(
-            loss=self._pg_loss,
-            variables=self._trainable_variables,
-            learning_rate=self._lr,
-            hparams=self._hparams.optimization.todict())
-        return train_op
-
-    @staticmethod
-    def default_hparams():
-        """Returns a dictionary of hyperparameters with default values:
-
-        .. role:: python(code)
-            :language: python
-
-        .. code-block:: python
-
-            {
-                'discount_factor': 0.95,
-                'normalize_reward': False,
-                'entropy_weight': 0.,
-                'loss': {
-                    'average_across_batch': True,
-                    'average_across_timesteps': False,
-                    'sum_over_batch': False,
-                    'sum_over_timesteps': True,
-                    'time_major': False
-                },
-                'optimization': default_optimization_hparams(),
-                'name': 'pg_agent',
-            }
-
-        Here:
-
-        "discount_factor": float
-            The discount factor of reward.
-
-        "normalize_reward": bool
-            Whether to normalize the discounted reward, by
-            `(discounted_reward - mean) / std`. Here `mean` and `std` are
-            over all time steps and all samples in the batch.
-
-        "entropy_weight": float
-            The weight of entropy loss of the sample distribution, to encourage
-            maximizing the Shannon entropy. Set to 0 to disable the loss.
-
-        "loss": dict
-            Extra keyword arguments for
-            :func:`~texar.tf.losses.pg_loss_with_logits`, including the
-            reduce arguments (e.g., `average_across_batch`) and `time_major`.
-
-        "optimization": dict
-            Hyperparameters of optimization for updating the policy net.
-            See :func:`~texar.tf.core.default_optimization_hparams` for details.
-
-        "name": str
-            Name of the agent.
-        """
-        return {
-            'discount_factor': 0.95,
-            'normalize_reward': False,
-            'entropy_weight': 0.,
-            'loss': {
-                'average_across_batch': True,
-                'average_across_timesteps': False,
-                'sum_over_batch': False,
-                'sum_over_timesteps': True,
-                'time_major': False
-            },
-            'optimization': opt.default_optimization_hparams(),
-            'name': 'pg_agent',
-        }
-
-    def _get_partial_run_feeds(self, feeds=None):
-        if feeds is None:
-            feeds = []
-        feeds += [self._qvalue_inputs]
-        return feeds
-
-    def _setup_partial_run(self, fetches=None, feeds=None):
-        fetches_ = [self._samples, self._sequence_length, self._pg_loss,
-                    self._train_op]
-        if fetches is not None:
-            for fet in fetches:
-                if fet not in fetches_:
-                    fetches_.append(fet)
-
-        feeds = self._get_partial_run_feeds(feeds)
-
-        self._partial_run_handle = self._sess.partial_run_setup(
-            fetches_, feeds=feeds)
-
-        self._qvalue_inputs_fed = False
-
-    def _check_extra_fetches(self, extra_fetches):
-        fetch_values = None
-        if extra_fetches is not None:
-            fetch_values = list(extra_fetches.values())
-        if fetch_values is not None:
-            if self._samples in fetch_values:
-                raise ValueError(
-                    "`samples` must not be included in `extra_fetches`. "
-                    "It is added automatically.")
-            if self._sequence_length in fetch_values:
-                raise ValueError(
-                    "`sequence_length` must not be included in `extra_fetches`."
-                    " It is added automatically.")
-            if "samples" in extra_fetches:
-                raise ValueError(
-                    "Key 'samples' is preserved and must not be used "
-                    "in `extra_fetches`.")
-            if "sequence_length" in extra_fetches:
-                raise ValueError(
-                    "Key 'sequence_length' is preserved and must not be used "
-                    "in `extra_fetches`.")
-
-    def get_samples(self, extra_fetches=None, feed_dict=None):
-        """Returns sequence samples and extra results.
-
-        Args:
-            extra_fetches (dict, optional): Extra tensors to fetch values,
-                besides `samples` and `sequence_length`. Same as the
-                `fetches` argument of
-                :tf_main:`tf.Session.run` and
-                :tf_main:`partial_run`.
-            feed_dict (dict, optional): A `dict` that maps tensors to
-                values. Note that all placeholder values used in
-                :meth:`get_samples` and subsequent :meth:`observe` calls
-                should be fed here.
-
-        Returns:
-            A `dict` with keys **"samples"** and **"sequence_length"**
-            containing the fetched values of :attr:`samples` and
-            :attr:`sequence_length`, as well as other fetched values
-            as specified in :attr:`extra_fetches`.
-
-        Example:
-
-            .. 
code-block:: python

-                extra_fetches = {'truth_ids': data_batch['text_ids']}
-                vals = agent.get_samples(extra_fetches=extra_fetches)
-                sample_text = tx.utils.map_ids_to_strs(vals['samples'], vocab)
-                truth_text = tx.utils.map_ids_to_strs(vals['truth_ids'], vocab)
-                reward = reward_fn_in_python(truth_text, sample_text)
-        """
-        if self._sess is None:
-            raise ValueError("`sess` must be specified before sampling.")
-
-        self._check_extra_fetches(extra_fetches)
-
-        # Sets up partial_run
-        fetch_values = None
-        if extra_fetches is not None:
-            fetch_values = list(extra_fetches.values())
-        feeds = None
-        if feed_dict is not None:
-            feeds = list(feed_dict.keys())
-        self._setup_partial_run(fetches=fetch_values, feeds=feeds)
-
-        # Runs the sampling
-        fetches = {
-            "samples": self._samples,
-            "sequence_length": self._sequence_length
-        }
-        if extra_fetches is not None:
-            fetches.update(extra_fetches)
-
-        feed_dict_ = feed_dict
-
-        vals = self._sess.partial_run(
-            self._partial_run_handle, fetches, feed_dict=feed_dict_)
-
-        self._samples_py = vals['samples']
-        self._sequence_length_py = vals['sequence_length']
-
-        return vals
-
-    def observe(self, reward, train_policy=True, compute_loss=True):
-        """Observes the reward, and updates the policy or computes loss
-        accordingly.
-
-        Args:
-            reward: A Python array/list of shape `[batch_size]` containing
-                the reward for the samples generated in the last call of
-                :meth:`get_samples`.
-            train_policy (bool): Whether to update the policy model according
-                to the reward.
-            compute_loss (bool): If `train_policy` is False, whether to
-                compute the policy gradient loss (without updating the
-                policy).
-
-        Returns:
-            If `train_policy` or `compute_loss` is True, returns the loss
-            (a python float scalar). Otherwise returns `None`.
-        """
-        self._rewards = reward
-
-        if train_policy:
-            return self._train_policy()
-        elif compute_loss:
-            return self._evaluate_pg_loss()
-        else:
-            return None
-
-    def _get_qvalues(self):
-        qvalues = discount_reward(
-            self._rewards,
-            self._sequence_length_py,
-            discount=self._hparams.discount_factor,
-            normalize=self._hparams.normalize_reward)
-        return qvalues
-
-    def _evaluate_pg_loss(self):
-        fetches = {
-            "loss": self._pg_loss
-        }
-
-        feed_dict_ = None
-        if not self._qvalue_inputs_fed:
-            qvalues = self._get_qvalues()
-            feed_dict_ = {self._qvalue_inputs: qvalues}
-
-        vals = self._sess.partial_run(
-            self._partial_run_handle, fetches, feed_dict=feed_dict_)
-
-        self._qvalue_inputs_fed = True
-
-        return vals['loss']
-
-    def _train_policy(self):
-        """Updates the policy.
-        """
-        fetches = {
-            "loss": self._train_op,
-        }
-
-        feed_dict_ = None
-        if not self._qvalue_inputs_fed:
-            qvalues = self._get_qvalues()
-            feed_dict_ = {self._qvalue_inputs: qvalues}
-
-        vals = self._sess.partial_run(
-            self._partial_run_handle, fetches, feed_dict=feed_dict_)
-
-        self._qvalue_inputs_fed = True
-
-        return vals['loss']
-
-    @property
-    def sess(self):
-        """The tf session.
-        """
-        return self._sess
-
-    @sess.setter
-    def sess(self, sess):
-        self._sess = sess
-
-    @property
-    def pg_loss(self):
-        """The scalar tensor of policy gradient loss.
-        """
-        return self._pg_loss
-
-    @property
-    def sequence_length(self):
-        """The tensor of sample sequence length, of shape `[batch_size]`.
-        """
-        return self._sequence_length
-
-    @property
-    def samples(self):
-        """The tensor of sequence samples.
-        """
-        return self._samples
-
-    @property
-    def logits(self):
-        """The tensor of sequence logits, of shape
-        `[batch_size, max_time, vocab_size]`.
- """ - return self._logits diff --git a/texar/tf/agents/seq_pg_agent_test.py b/texar/tf/agents/seq_pg_agent_test.py deleted file mode 100644 index c34173f3..00000000 --- a/texar/tf/agents/seq_pg_agent_test.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Unit tests for sequence prediction policy gradient agents. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import tensorflow as tf - -from texar.tf.modules.decoders.rnn_decoders import BasicRNNDecoder -from texar.tf.agents import SeqPGAgent -from texar.tf import context - - -class SeqPGAgentTest(tf.test.TestCase): - """Tests :class:`texar.tf.agents.SeqPGAgent` - """ - - def setUp(self): - tf.test.TestCase.setUp(self) - self._vocab_size = 4 - self._max_time = 8 - self._batch_size = 16 - self._emb_dim = 20 - self._inputs = tf.random_uniform( - [self._batch_size, self._max_time, self._emb_dim], - maxval=1., dtype=tf.float32) - self._embedding = tf.random_uniform( - [self._vocab_size, self._emb_dim], maxval=1., dtype=tf.float32) - - def test_seq_pg_agent(self): - """Tests logits. - """ - decoder = BasicRNNDecoder(vocab_size=self._vocab_size) - outputs, _, sequence_length = decoder( - decoding_strategy="infer_greedy", - max_decoding_length=10, - embedding=self._embedding, - start_tokens=[1] * self._batch_size, - end_token=2) - - agent = SeqPGAgent( - outputs.sample_id, outputs.logits, sequence_length, - decoder.trainable_variables) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - - agent.sess = sess - - feed_dict = {context.global_mode(): tf.estimator.ModeKeys.TRAIN} - for _ in range(2): - vals = agent.get_samples(feed_dict=feed_dict) - self.assertEqual(vals['samples'].shape[0], self._batch_size) - - loss_1 = agent.observe([1.] * self._batch_size) - loss_2 = agent.observe( - [1.] * self._batch_size, train_policy=False) - self.assertEqual(loss_1.shape, ()) - self.assertEqual(loss_2.shape, ()) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/context.py b/texar/tf/context.py deleted file mode 100644 index 0856d82b..00000000 --- a/texar/tf/context.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" -Global context manager that handles train/infer mode, etc -""" - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import tensorflow as tf - -__all__ = [ - "global_mode", - "global_mode_train", - "global_mode_eval", - "global_mode_predict", - "valid_modes" -] - -_GLOBAL_MODE_KEY = "GLOBAL_MODE" - - -def global_mode(): - """Returns the Tensor of global mode. - - This is a placeholder with default value of - :tf_main:`tf.estimator.ModeKeys.TRAIN `. - - Example: - - .. code-block:: python - - mode = session.run(global_mode()) - # mode == tf.estimator.ModeKeys.TRAIN - - mode = session.run( - global_mode(), - feed_dict={tf.global_mode(): tf.estimator.ModeKeys.PREDICT}) - # mode == tf.estimator.ModeKeys.PREDICT - """ - mode = tf.get_collection_ref(_GLOBAL_MODE_KEY) - if len(mode) < 1: - # mode_tensor = tf.placeholder(tf.string, name="global_mode") - mode_tensor = tf.placeholder_with_default( - input=tf.estimator.ModeKeys.TRAIN, - shape=(), - name="global_mode") - # mode_tensor = tf.constant( - # value=tf.estimator.ModeKeys.TRAIN, - # dtype=tf.string, - # name="global_mode") - mode.append(mode_tensor) - return mode[0] - - -def global_mode_train(): - """Returns a bool Tensor indicating whether the global mode is TRAIN. - - Example: - - .. code-block:: python - - is_train = session.run(global_mode_train()) - # is_train == True - - is_train = session.run( - global_mode_train() - feed_dict={tf.global_mode(): tf.estimator.ModeKeys.PREDICT}) - # is_train == False - """ - mode = global_mode() - return tf.equal(mode, tf.estimator.ModeKeys.TRAIN) - - -def global_mode_eval(): - """Returns a bool Tensor indicating whether the global mode is EVAL. - """ - mode = global_mode() - return tf.equal(mode, tf.estimator.ModeKeys.EVAL) - - -def global_mode_predict(): - """Returns a bool Tensor indicating whether the global mode is PREDICT. - """ - mode = global_mode() - return tf.equal(mode, tf.estimator.ModeKeys.PREDICT) - - -def valid_modes(): - """Returns a set of possible values of mode. - """ - return {tf.estimator.ModeKeys.TRAIN, - tf.estimator.ModeKeys.EVAL, - tf.estimator.ModeKeys.PREDICT} diff --git a/texar/tf/context_test.py b/texar/tf/context_test.py deleted file mode 100644 index 445d4089..00000000 --- a/texar/tf/context_test.py +++ /dev/null @@ -1,70 +0,0 @@ -# -*- coding: utf-8 -*- -# -""" -Unit tests for various context functionalities. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import tensorflow as tf - -from texar.tf import context - -# pylint: disable=protected-access - - -class ContextTest(tf.test.TestCase): - """Tests context. - """ - - def test_global_mode(self): - """Tests the mode context manager. 
- """ - global_mode = context.global_mode() - self.assertIsInstance(global_mode, tf.Tensor) - - mode_train = context.global_mode_train() - mode_eval = context.global_mode_eval() - mode_predict = context.global_mode_predict() - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - - global_mode_ = sess.run(global_mode) - self.assertEqual(tf.compat.as_str(global_mode_), - tf.estimator.ModeKeys.TRAIN) - - global_mode_, mode_train_, mode_eval_, mode_predict_ = sess.run( - [global_mode, mode_train, mode_eval, mode_predict], - feed_dict={context.global_mode(): tf.estimator.ModeKeys.TRAIN}) - self.assertEqual(global_mode_, tf.estimator.ModeKeys.TRAIN) - self.assertTrue(mode_train_) - self.assertFalse(mode_eval_) - self.assertFalse(mode_predict_) - - global_mode_, mode_train_, mode_eval_, mode_predict_ = sess.run( - [global_mode, mode_train, mode_eval, mode_predict], - feed_dict={context.global_mode(): tf.estimator.ModeKeys.EVAL}) - self.assertEqual(global_mode_, tf.estimator.ModeKeys.EVAL) - self.assertFalse(mode_train_) - self.assertTrue(mode_eval_) - self.assertFalse(mode_predict_) - - global_mode_, mode_train_, mode_eval_, mode_predict_ = sess.run( - [global_mode, mode_train, mode_eval, mode_predict], - feed_dict={context.global_mode(): - tf.estimator.ModeKeys.PREDICT}) - self.assertEqual(global_mode_, tf.estimator.ModeKeys.PREDICT) - self.assertFalse(mode_train_) - self.assertFalse(mode_eval_) - self.assertTrue(mode_predict_) - - global_mode_values = tf.get_collection_ref(context._GLOBAL_MODE_KEY) - self.assertEqual(len(global_mode_values), 1) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/core/__init__.py b/texar/tf/core/__init__.py index 7030bebf..ad9275dd 100644 --- a/texar/tf/core/__init__.py +++ b/texar/tf/core/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,16 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Modules of texar core. +Modules of Texar core. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=wildcard-import - from texar.tf.core.layers import * -from texar.tf.core.replay_memories import * -from texar.tf.core.explorations import * -from texar.tf.core.optimization import * diff --git a/texar/tf/core/explorations.py b/texar/tf/core/explorations.py deleted file mode 100644 index 9b00a250..00000000 --- a/texar/tf/core/explorations.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Classes and utilities for exploration in RL. 
-""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from texar.tf.hyperparams import HParams - -# pylint: disable=invalid-name - -__all__ = [ - "ExplorationBase", - "EpsilonLinearDecayExploration" -] - - -class ExplorationBase(object): - """Base class inherited by all exploration classes. - - Args: - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparameters are set to default values. See - :meth:`default_hparams` for the defaults. - """ - def __init__(self, hparams=None): - self._hparams = HParams(hparams, self.default_hparams()) - - @staticmethod - def default_hparams(): - """Returns a `dict` of hyperparameters and their default values. - - .. code-block:: python - - { - 'name': 'exploration_base' - } - """ - return { - 'name': 'exploration_base' - } - - def get_epsilon(self, timestep): - """Returns the epsilon value. - - Args: - timestep (int): The time step. - - Returns: - float: the epsilon value. - """ - raise NotImplementedError - - @property - def hparams(self): - """The hyperparameter. - """ - return self._hparams - - -class EpsilonLinearDecayExploration(ExplorationBase): - """Decays epsilon linearly. - - Args: - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparameters are set to default values. See - :meth:`default_hparams` for the defaults. - """ - def __init__(self, hparams=None): - ExplorationBase.__init__(self, hparams=hparams) - - @staticmethod - def default_hparams(): - """Returns a `dict` of hyperparameters and their default values. - - .. code-block:: python - - { - 'initial_epsilon': 0.1, - 'final_epsilon': 0.0, - 'decay_timesteps': 20000, - 'start_timestep': 0, - 'name': 'epsilon_linear_decay_exploration', - } - - This specifies the decay process that starts at - "start_timestep" with the value "initial_epsilon", and decays for - steps "decay_timesteps" to reach the final epsilon value - "final_epsilon". - """ - return { - 'name': 'epsilon_linear_decay_exploration', - 'initial_epsilon': 0.1, - 'final_epsilon': 0.0, - 'decay_timesteps': 20000, - 'start_timestep': 0 - } - - def get_epsilon(self, timestep): - nsteps = self._hparams.decay_timesteps - st = self._hparams.start_timestep - et = st + nsteps - - if timestep <= st: - return self._hparams.initial_epsilon - if timestep > et: - return self._hparams.final_epsilon - r = (timestep - st) * 1.0 / nsteps - epsilon = (1 - r) * self._hparams.initial_epsilon + \ - r * self._hparams.final_epsilon - - return epsilon diff --git a/texar/tf/core/layers.py b/texar/tf/core/layers.py index 7049d80f..26a8188d 100644 --- a/texar/tf/core/layers.py +++ b/texar/tf/core/layers.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,292 +15,25 @@ Various neural network layers """ -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import copy - import tensorflow as tf -import tensorflow.contrib.rnn as rnn from texar.tf.hyperparams import HParams from texar.tf.utils import utils from texar.tf.utils.dtypes import is_str -from texar.tf.utils.variables import add_variable -from texar.tf.utils.mode import is_train_mode, switch_dropout -# pylint: disable=redefined-variable-type, invalid-name -# pylint: disable=too-many-branches, too-many-arguments, too-many-lines -# pylint: disable=protected-access __all__ = [ - "default_rnn_cell_hparams", - "get_rnn_cell", - "get_rnn_cell_trainable_variables", "default_regularizer_hparams", "get_regularizer", "get_initializer", "get_activation_fn", "get_constraint_fn", "get_layer", - "_ReducePooling1D", - "MaxReducePooling1D", - "AverageReducePooling1D", - "get_pooling_layer_hparams", - "MergeLayer", - "SequentialLayer", - "default_conv1d_kwargs", - "default_conv2d_kwargs", - "default_conv3d_kwargs", - "default_conv2d_transpose_kwargs", - "default_conv3d_transpose_kwargs", - "default_dense_kwargs", - "default_dropout_kwargs", - "default_flatten_kwargs", - "default_max_pooling1d_kwargs", - "default_max_pooling2d_kwargs", - "default_max_pooling3d_kwargs", - "default_separable_conv2d_kwargs", - "default_batch_normalization_kwargs", - "default_average_pooling1d_kwargs", - "default_average_pooling2d_kwargs", - "default_average_pooling3d_kwargs", - "layer_normalize", ] -def default_rnn_cell_hparams(): - """Returns a `dict` of RNN cell hyperparameters and their default values. - - .. role:: python(code) - :language: python - - .. code-block:: python - - { - "type": "LSTMCell", - "kwargs": { - "num_units": 256 - }, - "num_layers": 1, - "dropout": { - "input_keep_prob": 1.0, - "output_keep_prob": 1.0, - "state_keep_prob": 1.0, - "variational_recurrent": False, - "input_size": [] - }, - "residual": False, - "highway": False, - } - - Here: - - "type": str or cell class or cell instance - The RNN cell type. This can be - - - The string name or full module path of a cell class. If class \ - name is provided, the class must be in module \ - :tf_main:`tf.nn.rnn_cell `, \ - :tf_main:`tf.contrib.rnn `, or :mod:`texar.tf.custom`. - - A cell class. - - An instance of a cell class. This is not valid if \ - "num_layers" > 1. - - For example - - .. code-block:: python - - "type": "LSTMCell" # class name - "type": "tensorflow.contrib.rnn.Conv1DLSTMCell" # module path - "type": "my_module.MyCell" # module path - "type": tf.nn.rnn_cell.GRUCell # class - "type": BasicRNNCell(num_units=100) # cell instance - "type": MyCell(...) # cell instance - - "kwargs": dict - Keyword arguments for the constructor of the cell class. - A cell is created by :python:`cell_class(**kwargs)`, where - `cell_class` is specified in "type" above. - - Ignored if "type" is a cell instance. - - "num_layers": int - Number of cell layers. Each layer is a cell created as above, with - the same hyperparameters specified in "kwargs". - - "dropout": dict - Dropout applied to the cell in **each** layer. See - :tf_main:`DropoutWrapper ` for details of - the hyperparameters. If all "*_keep_prob" = 1, no dropout is applied. - - Specifically, if "variational_recurrent" = `True`, - the same dropout mask is applied across all time steps per run call. - If `True`, "input_size" is required, which is a list of input - size of each cell layer. 
The input size of a cell layer is the last - dimension size of its input tensor. For example, the - input size of the first layer is usually the dimension of - word embeddings, while the input size of subsequent layers - are usually the `num_units` of the preceding-layer cell. E.g., - - .. code-block:: python - - # Assume embedding_dim = 100 - "type": "LSTMCell", - "kwargs": { "num_units": 123 }, - "num_layers": 3, - "dropout": { - "output_keep_prob": 0.5, - "variational_recurrent": True, - "input_size": [100, 123, 123] - } - - "residual": bool - If `True`, apply residual connection on the inputs and - outputs of cell in **each** layer except the first layer. Ignored - if "num_layers" = 1. - - "highway": bool - If True, apply highway connection on the inputs and - outputs of cell in each layer except the first layer. Ignored if - "num_layers" = 1. - """ - return { - "type": "LSTMCell", - "kwargs": { - "num_units": 256, - }, - "num_layers": 1, - "dropout": { - "input_keep_prob": 1.0, - "output_keep_prob": 1.0, - "state_keep_prob": 1.0, - "variational_recurrent": False, - "input_size": [], - "@no_typecheck": [ - "input_keep_prob", "output_keep_prob", "state_keep_prob" - ] - }, - "residual": False, - "highway": False, - "@no_typecheck": ["type"] - } - - -def get_rnn_cell(hparams=None, mode=None): - """Creates an RNN cell. - - See :func:`~texar.tf.core.default_rnn_cell_hparams` for all - hyperparameters and default values. - - Args: - hparams (dict or HParams, optional): Cell hyperparameters. Missing - hyperparameters are set to default values. - mode (optional): A Tensor taking value in - :tf_main:`tf.estimator.ModeKeys `, including - `TRAIN`, `EVAL`, and `PREDICT`. If `None`, dropout will be - controlled by :func:`texar.tf.global_mode`. - - Returns: - A cell instance. - - Raises: - ValueError: If hparams["num_layers"]>1 and hparams["type"] is a class - instance. - ValueError: The cell is not an - :tf_main:`RNNCell ` instance. - """ - if hparams is None or isinstance(hparams, dict): - hparams = HParams(hparams, default_rnn_cell_hparams()) - - d_hp = hparams["dropout"] - if d_hp["variational_recurrent"] and \ - len(d_hp["input_size"]) != hparams["num_layers"]: - raise ValueError( - "If variational_recurrent=True, input_size must be a list of " - "num_layers(%d) integers. Got len(input_size)=%d." 
% - (hparams["num_layers"], len(d_hp["input_size"]))) - - cells = [] - cell_kwargs = hparams["kwargs"].todict() - num_layers = hparams["num_layers"] - for layer_i in range(num_layers): - # Create the basic cell - cell_type = hparams["type"] - if not is_str(cell_type) and not isinstance(cell_type, type): - if num_layers > 1: - raise ValueError( - "If 'num_layers'>1, then 'type' must be a cell class or " - "its name/module path, rather than a cell instance.") - cell_modules = ['tensorflow.nn.rnn_cell', 'tensorflow.contrib.rnn', - 'texar.tf.custom'] - cell = utils.check_or_get_instance( - cell_type, cell_kwargs, cell_modules, rnn.RNNCell) - - # Optionally add dropout - if d_hp["input_keep_prob"] < 1.0 or \ - d_hp["output_keep_prob"] < 1.0 or \ - d_hp["state_keep_prob"] < 1.0: - vr_kwargs = {} - if d_hp["variational_recurrent"]: - vr_kwargs = { - "variational_recurrent": True, - "input_size": d_hp["input_size"][layer_i], - "dtype": tf.float32 - } - input_keep_prob = switch_dropout(d_hp["input_keep_prob"], - mode) - output_keep_prob = switch_dropout(d_hp["output_keep_prob"], - mode) - state_keep_prob = switch_dropout(d_hp["state_keep_prob"], - mode) - cell = rnn.DropoutWrapper( - cell=cell, - input_keep_prob=input_keep_prob, - output_keep_prob=output_keep_prob, - state_keep_prob=state_keep_prob, - **vr_kwargs) - - # Optionally add residual and highway connections - if layer_i > 0: - if hparams["residual"]: - cell = rnn.ResidualWrapper(cell) - if hparams["highway"]: - cell = rnn.HighwayWrapper(cell) - - cells.append(cell) - - if hparams["num_layers"] > 1: - cell = rnn.MultiRNNCell(cells) - else: - cell = cells[0] - - return cell - - -def get_rnn_cell_trainable_variables(cell): - """Returns the list of trainable variables of an RNN cell. - - Args: - cell: an instance of :tf_main:`RNNCell `. - - Returns: - list: trainable variables of the cell. - """ - cell_ = cell - while True: - try: - return cell_.trainable_variables - except AttributeError: - # Cell wrappers (e.g., `DropoutWrapper`) cannot directly access to - # `trainable_variables` as they don't initialize superclass - # (tf==v1.3). So try to access through the cell in the wrapper. - cell_ = cell._cell # pylint: disable=protected-access - - def default_regularizer_hparams(): - """Returns the hyperparameters and their default values of a variable + r"""Returns the hyperparameters and their default values of a variable regularizer: .. code-block:: python @@ -314,7 +47,7 @@ def default_regularizer_hparams(): } The default value corresponds to :tf_main:`L1L2 ` - and, with `(l1=0, l2=0)`, disables regularization. + and, with ``(l1=0, l2=0)``, disables regularization. """ return { "type": "L1L2", @@ -326,7 +59,7 @@ def default_regularizer_hparams(): def get_regularizer(hparams=None): - """Returns a variable regularizer instance. + r"""Returns a variable regularizer instance. See :func:`~texar.tf.core.default_regularizer_hparams` for all hyperparameters and default values. @@ -370,10 +103,7 @@ def get_regularizer(hparams=None): def get_initializer(hparams=None): - """Returns an initializer instance. - - .. role:: python(code) - :language: python + r"""Returns an initializer instance. Args: hparams (dict or HParams, optional): Hyperparameters with the structure @@ -383,25 +113,25 @@ def get_initializer(hparams=None): { "type": "initializer_class_or_function", "kwargs": { - #... + # ... } } - The "type" field can be a initializer class, its name or module - path, or class instance. 
If class name is provided, the class must - be from one the following modules: + The `"type"` field can be an initializer class, its name or module + path, or class instance. If class name is provided, it must + be from one of the following modules: :tf_main:`tf.initializers `, :tf_main:`tf.keras.initializers `, :tf_main:`tf < >`, and :mod:`texar.tf.custom`. The class is created by :python:`initializer_class(**kwargs)`. If a class instance - is given, "kwargs" is ignored and can be omitted. + is given, `"kwargs"` is ignored and can be omitted. - Besides, the "type" field can also be an initialization function + Besides, the `"type"` field can also be an initialization function called with :python:`initialization_fn(**kwargs)`. In this case - "type" can be the function, or its name or module path. If + `"type"` can be the function, or its name or module path. If function name is provided, the function must be from one of the - above modules or module `tf.contrib.layers`. If no - keyword argument is required, "kwargs" can be omitted. + above modules or module `tfa.layers`. If no + keyword argument is required, `"kwargs"` can be omitted. Returns: An initializer instance. `None` if :attr:`hparams` is `None`.
@@ -418,7 +148,7 @@ def get_initializer(hparams=None): initializer = utils.check_or_get_instance(hparams["type"], kwargs, modules) except (TypeError, ValueError): - modules = ['tensorflow.contrib.layers'] + modules + modules = ['tensorflow_addons.layers'] + modules initializer_fn = utils.get_function(hparams["type"], modules) initializer = initializer_fn(**kwargs)
@@ -426,7 +156,7 @@ def get_initializer(hparams=None): def get_activation_fn(fn_name="identity", kwargs=None): - """Returns an activation function `fn` with the signature + r"""Returns an activation function `fn` with the signature `output = fn(input)`. If the function specified by :attr:`fn_name` has more than one arguments
@@ -445,7 +175,7 @@ - User-defined activation functions in module :mod:`texar.tf.custom`. - External activation functions. Must provide the full module path, - e.g., "my_module.my_activation_fn". + e.g., ``"my_module.my_activation_fn"``. kwargs (optional): A `dict` or instance of :class:`~texar.tf.HParams` containing the keyword arguments of the activation function.
@@ -474,13 +204,9 @@ def _partial_fn(features): def get_constraint_fn(fn_name="NonNeg"): - """Returns a constraint function. + r"""Returns a constraint function. - .. role:: python(code) - :language: python - - The function must follow the signature: - :python:`w_ = constraint_fn(w)`. + The function must follow the signature: :python:`w_ = constraint_fn(w)`. Args: fn_name (str or callable): The name or full path to a
@@ -488,14 +214,14 @@ get_constraint_fn(fn_name="NonNeg"): The function can be: - - Built-in constraint functions defined in modules \ - :tf_main:`tf.keras.constraints ` \ - (e.g., :tf_main:`NonNeg `) \ - or :tf_main:`tf < >` or :tf_main:`tf.nn ` \ - (e.g., activation functions). + - Built-in constraint functions defined in modules + :tf_main:`tf.keras.constraints ` + (e.g., :tf_main:`NonNeg `) + or :tf_main:`tf < >` or :tf_main:`tf.nn ` + (e.g., activation functions). - User-defined function in :mod:`texar.tf.custom`. - - Externally defined function. Must provide the full path, \ - e.g., `"my_module.my_constraint_fn"`. + - Externally defined function. Must provide the full path, + e.g., ``"my_module.my_constraint_fn"``. If a callable is provided, then it is returned directly.
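For reference, a quick sketch of how the two lookup helpers documented above are used, per their docstrings (the `alpha` keyword belongs to `tf.nn.leaky_relu`; the import path assumes this diff is applied):

```python
import tensorflow as tf
from texar.tf.core import layers

# Name-based lookup with extra keyword arguments:
fn = layers.get_activation_fn("leaky_relu", kwargs={"alpha": 0.1})
y = fn(tf.constant([-1.0, 2.0]))  # same as tf.nn.leaky_relu(x, alpha=0.1)

# Constraint lookup; the returned callable follows w_ = constraint_fn(w):
constraint_fn = layers.get_constraint_fn("NonNeg")
```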
@@ -512,13 +238,12 @@ def get_constraint_fn(fn_name="NonNeg"): def get_layer(hparams): - """Makes a layer instance. + r"""Makes a layer instance. - The layer must be an instance of :tf_main:`tf.layers.Layer `. + The layer must be an instance of :tf_main:`tf.keras.layers.Layer`. Args: - hparams (dict or HParams): Hyperparameters of the layer, with - structure: + hparams (dict or HParams): Hyperparameters of the layer, with structure: .. code-block:: python
@@ -532,13 +257,13 @@ def get_layer(hparams): Here: - "type": str or layer class or layer instance + `"type"`: str or layer class or layer instance The layer type. This can be - - The string name or full module path of a layer class. If \ - the class name is provided, the class must be in module \ - :tf_main:`tf.layers `, :mod:`texar.tf.core`, \ - or :mod:`texar.tf.custom`. + - The string name or full module path of a layer class. If + the class name is provided, the class must be in module + :tf_main:`tf.keras.layers`, :mod:`texar.tf.core`, + or :mod:`texar.tf.custom`. - A layer class. - An instance of a layer class.
@@ -546,34 +271,34 @@ .. code-block:: python - "type": "Conv1D" # class name + "type": "Conv1D" # class name "type": "texar.tf.core.MaxReducePooling1D" # module path - "type": "my_module.MyLayer" # module path - "type": tf.layers.Conv2D # class - "type": Conv1D(filters=10, kernel_size=2) # cell instance - "type": MyLayer(...) # cell instance + "type": "my_module.MyLayer" # module path + "type": tf.keras.layers.Conv2D # class + "type": Conv1D(filters=10, kernel_size=2) # layer instance + "type": MyLayer(...) # layer instance - "kwargs": dict + `"kwargs"`: dict A dictionary of keyword arguments for constructor of the layer class. Ignored if :attr:`"type"` is a layer instance. - Arguments named "activation" can be a callable, or a `str` of the name or module path to the activation function. - - Arguments named "*_regularizer" and "*_initializer" + - Arguments named "\*_regularizer" and "\*_initializer" can be a class instance, or a `dict` of hyperparameters of respective regularizers and initializers. See - - Arguments named "*_constraint" can be a callable, or a + - Arguments named "\*_constraint" can be a callable, or a `str` of the name or full path to the constraint function. Returns: - A layer instance. If hparams["type"] is a layer instance, returns it + A layer instance. If ``hparams["type"]`` is a layer instance, returns it directly. Raises: ValueError: If :attr:`hparams` is `None`. ValueError: If the resulting layer is not an instance of - :tf_main:`tf.layers.Layer `. + :tf_main:`tf.keras.layers.Layer`.
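A minimal usage sketch of `get_layer` with the hyperparameter format above (the `kwargs` shown are ordinary `tf.keras.layers.Conv1D` constructor arguments; assumes this diff is applied):

```python
import tensorflow as tf
from texar.tf.core import layers

conv = layers.get_layer({
    "type": "Conv1D",  # resolved in tf.keras.layers per the module list above
    "kwargs": {"filters": 100, "kernel_size": 3, "activation": "relu"},
})
assert isinstance(conv, tf.keras.layers.Conv1D)
```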
""" if hparams is None: raise ValueError("`hparams` must not be `None`.") @@ -582,7 +307,7 @@ def get_layer(hparams): if not is_str(layer_type) and not isinstance(layer_type, type): layer = layer_type else: - layer_modules = ["tensorflow.layers", "texar.tf.core", + layer_modules = ["tensorflow.keras.layers", "texar.tf.core", "texar.tf.custom"] layer_class = utils.check_or_get_class(layer_type, layer_modules) if isinstance(hparams, dict): @@ -605,406 +330,22 @@ def get_layer(hparams): kwargs[k] = v layer = utils.get_instance(layer_type, kwargs, layer_modules) - if not isinstance(layer, tf.layers.Layer): - raise ValueError("layer must be an instance of `tf.layers.Layer`.") + if not isinstance(layer, tf.keras.layers.Layer): + raise ValueError( + "layer must be an instance of `tf.keras.layers.Layer`.") return layer -def _compute_concat_output_shape(input_shape, axis): - """Infers the output shape of concat given the input shape. - - The code is adapted from the ConcatLayer of lasagne - (https://github.com/Lasagne/Lasagne/blob/master/lasagne/layers/merge.py) - - Args: - input_shape (list): A list of shapes, each of which is in turn a - list or TensorShape. - axis (int): Axis of the concat operation. - - Returns: - list: Output shape of concat. - """ - # The size of each axis of the output shape equals the first - # input size of respective axis that is not `None` - input_shape = [tf.TensorShape(s).as_list() for s in input_shape] - output_shape = [next((s for s in sizes if s is not None), None) - for sizes in zip(*input_shape)] - axis_sizes = [s[axis] for s in input_shape] - concat_axis_size = None if any(s is None for s in axis_sizes) \ - else sum(axis_sizes) - output_shape[axis] = concat_axis_size - return output_shape - - -class _ReducePooling1D(tf.layers.Layer): - """Pooling layer for arbitrary reduce functions for 1D inputs. - - The same as `tf.python.layers.pooling._Pooling1D` except that the pooling - dimension is entirely reduced (i.e., `pool_size=length`). - - This class is for code reuse, rather than an exposed API. - """ - def __init__(self, reduce_function, data_format='channels_last', - name=None, **kwargs): - super(_ReducePooling1D, self).__init__(name=name, **kwargs) - self._reduce_function = reduce_function - if data_format not in {'channels_last', 'channels_first'}: - raise ValueError("`data_format must be either 'channels_last' or` " - "'channels_first'. Got: {}".format(data_format)) - self._data_format = data_format - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - if self._data_format == 'channels_last': - return tf.TensorShape([input_shape[0], input_shape[2]]) - else: - return tf.TensorShape([input_shape[0], input_shape[1]]) - - def call(self, inputs): - if self._data_format == 'channels_last': - return self._reduce_function(inputs, axis=1) - else: - return self._reduce_function(inputs, axis=2) - - -class MaxReducePooling1D(_ReducePooling1D): - """A subclass of :tf_main:`tf.layers.Layer `. - Max Pooling layer for 1D inputs. The same as - :tf_main:`MaxPooling1D ` except that the pooling - dimension is entirely reduced (i.e., `pool_size=input_length`). - """ - def __init__(self, data_format='channels_last', name=None, **kwargs): - super(MaxReducePooling1D, self).__init__( - tf.reduce_max, data_format=data_format, name=name, **kwargs) - - -class AverageReducePooling1D(_ReducePooling1D): - """A subclass of :tf_main:`tf.layers.Layer `. - Average Pooling layer for 1D inputs. 
The same as - :tf_main:`AveragePooling1D ` except that the - pooling dimension is entirely reduced (i.e., `pool_size=input_length`). - """ - def __init__(self, data_format='channels_last', name=None, **kwargs): - super(AverageReducePooling1D, self).__init__( - tf.reduce_mean, data_format=data_format, name=name, **kwargs) - - -_POOLING_TO_REDUCE = { - "MaxPooling1D": "MaxReducePooling1D", - "AveragePooling1D": "AverageReducePooling1D", - tf.layers.MaxPooling1D: MaxReducePooling1D, - tf.layers.AveragePooling1D: AverageReducePooling1D -} - - -def get_pooling_layer_hparams(hparams): - """Creates pooling layer hparams `dict` usable for :func:`get_layer`. - - If the :attr:`hparams` sets `'pool_size'` to `None`, the layer will be - changed to the respective reduce-pooling layer. For example, - :class:`tf.layers.MaxPooling1D ` is replaced with - :class:`~texar.tf.core.MaxReducePooling1D`. - """ - if isinstance(hparams, HParams): - hparams = hparams.todict() - - new_hparams = copy.copy(hparams) - kwargs = new_hparams.get('kwargs', None) - - if kwargs and kwargs.get('pool_size', None) is None: - pool_type = hparams['type'] - new_hparams['type'] = _POOLING_TO_REDUCE.get(pool_type, pool_type) - kwargs.pop('pool_size', None) - kwargs.pop('strides', None) - kwargs.pop('padding', None) - - return new_hparams - - -class MergeLayer(tf.layers.Layer): - """A subclass of :tf_main:`tf.layers.Layer `. - A layer that consists of multiple layers in parallel. Input is fed to - each of the parallel layers, and the outputs are merged with a - specified mode. - - Args: - layers (list, optional): A list of :tf_main:`tf.layers.Layer - ` instances, or a list of hyperparameter dicts - each of which specifies type and kwargs of each layer (see - the `hparams` argument of :func:`get_layer`). - - If `None`, this layer degenerates to a merging operator that merges - inputs directly. - mode (str): Mode of the merge op. This can be: - - - :attr:`'concat'`: Concatenates layer outputs along one axis. \ - Tensors must have the same shape except for the dimension \ - specified in `axis`, which can have different sizes. - - :attr:`'elemwise_sum'`: Outputs element-wise sum. - - :attr:`'elemwise_mul'`: Outputs element-wise product. - - :attr:`'sum'`: Computes the sum of layer outputs along the \ - dimension given by `axis`. E.g., given `axis=1`, \ - two tensors of shape `[a, b]` and `[a, c]` respectively \ - will result in a merged tensor of shape `[a]`. - - :attr:`'mean'`: Computes the mean of layer outputs along the \ - dimension given in `axis`. - - :attr:`'prod'`: Computes the product of layer outputs along the \ - dimension given in `axis`. - - :attr:`'max'`: Computes the maximum of layer outputs along the \ - dimension given in `axis`. - - :attr:`'min'`: Computes the minimum of layer outputs along the \ - dimension given in `axis`. - - :attr:`'and'`: Computes the `logical and` of layer outputs along \ - the dimension given in `axis`. - - :attr:`'or'`: Computes the `logical or` of layer outputs along \ - the dimension given in `axis`. - - :attr:`'logsumexp'`: Computes \ - log(sum(exp(elements across the dimension of layer outputs))) - axis (int): The axis to use in merging. Ignored in modes - :attr:`'elemwise_sum'` and :attr:`'elemwise_mul'`. - trainable (bool): Whether the layer should be trained. - name (str, optional): Name of the layer. 
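`MergeLayer` is removed here without a direct replacement; its common `mode='concat'` use maps onto stock Keras pieces. A migration sketch (shapes follow the old unit test; not part of this diff):

```python
import tensorflow as tf

inputs = tf.keras.Input(shape=(16, 1024))
# Three parallel branches whose outputs are concatenated along axis 1 --
# the same wiring MergeLayer(mode='concat', axis=1) provided.
branches = [tf.keras.layers.Conv1D(200, k)(inputs) for k in (3, 4, 5)]
merged = tf.keras.layers.Concatenate(axis=1)(branches)
model = tf.keras.Model(inputs, merged)  # output shape: (None, 39, 200)
```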
- """ - - def __init__(self, - layers=None, - mode='concat', - axis=1, - trainable=True, - name=None, - **kwargs): - super(MergeLayer, self).__init__( - trainable=trainable, name=name, **kwargs) - self._mode = mode - self._axis = axis - - self._layers = None - if layers is not None: - if len(layers) == 0: - raise ValueError( - "'layers' must be either None or a non-empty list.") - self._layers = [] - for layer in layers: - if isinstance(layer, tf.layers.Layer): - self._layers.append(layer) - else: - self._layers.append(get_layer(hparams=layer)) - - # Keep tracks of whether trainable variables have been created - self._vars_built = False - - def compute_output_shape(self, input_shape): - if self._layers is None: - _shapes = input_shape - if not isinstance(_shapes, (list, tuple)): - _shapes = [_shapes] - else: - _shapes = [] - for layer in self._layers: - layer_output_shape = layer.compute_output_shape(input_shape) - _shapes.append(layer_output_shape) - _shapes = [tf.TensorShape(s) for s in _shapes] - - if self._mode == 'concat': - output_shape = _compute_concat_output_shape(_shapes, self._axis) - elif self._mode in ['sum', 'mean', 'prod', 'max', 'min', - 'and', 'or', 'logsumexp']: - output_shape = _compute_concat_output_shape(_shapes, self._axis) - output_shape.pop(self._axis) - elif self._mode in ['elemwise_sum', 'elemwise_mul']: - # Simply infer the output shape as the input shape of highest rank - _ranks = [s.ndims for s in _shapes] - max_rank = max(_ranks) - max_ranked_shapes = [] - for i, s in enumerate(_shapes): - if _ranks[i] == max_rank: - max_ranked_shapes.append(s.as_list()) - # Grab the first size of each axis that is not `None` - output_shape = [next((s for s in sizes if s is not None), None) - for sizes in zip(*max_ranked_shapes)] - else: - raise ValueError("Unknown merge mode: '%s'" % self._mode) - - return tf.TensorShape(output_shape) - - def _collect_weights(self): - """Collects (non-)trainable weights of each of the parallel layers. 
- """ - if self._layers is None: - pass - for layer in self._layers: - if self.trainable: - add_variable( - layer._trainable_weights, self._trainable_weights) - else: - add_variable( - layer._trainable_weights, self._non_trainable_weights) - add_variable( - layer._non_trainable_weights, self._non_trainable_weights) - - @property - def trainable_weights(self): - return self._trainable_weights - - @property - def non_trainable_weights(self): - return self._non_trainable_weights - - def call(self, inputs): - if self._layers is None: - layer_outputs = inputs - if not isinstance(layer_outputs, (list, tuple)): - layer_outputs = [layer_outputs] - else: - layer_outputs = [] - for layer in self._layers: - layer_output = layer(inputs) - layer_outputs.append(layer_output) - - if self._mode == 'concat': - outputs = tf.concat(values=layer_outputs, axis=self._axis) - elif self._mode == 'elemwise_sum': - outputs = layer_outputs[0] - for i in range(1, len(layer_outputs)): - outputs = tf.add(outputs, layer_outputs[i]) - elif self._mode == 'elemwise_mul': - outputs = layer_outputs[0] - for i in range(1, len(layer_outputs)): - outputs = tf.multiply(outputs, layer_outputs[i]) - elif self._mode == 'sum': - _concat = tf.concat(values=layer_outputs, axis=self._axis) - outputs = tf.reduce_sum(_concat, axis=self._axis) - elif self._mode == 'mean': - _concat = tf.concat(values=layer_outputs, axis=self._axis) - outputs = tf.reduce_mean(_concat, axis=self._axis) - elif self._mode == 'prod': - _concat = tf.concat(values=layer_outputs, axis=self._axis) - outputs = tf.reduce_prod(_concat, axis=self._axis) - elif self._mode == 'max': - _concat = tf.concat(values=layer_outputs, axis=self._axis) - outputs = tf.reduce_max(_concat, axis=self._axis) - elif self._mode == 'min': - _concat = tf.concat(values=layer_outputs, axis=self._axis) - outputs = tf.reduce_min(_concat, axis=self._axis) - elif self._mode == 'and': - _concat = tf.concat(values=layer_outputs, axis=self._axis) - outputs = tf.reduce_all(_concat, axis=self._axis) - elif self._mode == 'or': - _concat = tf.concat(values=layer_outputs, axis=self._axis) - outputs = tf.reduce_any(_concat, axis=self._axis) - elif self._mode == 'logsumexp': - _concat = tf.concat(values=layer_outputs, axis=self._axis) - outputs = tf.reduce_logsumexp(_concat, axis=self._axis) - else: - raise ValueError("Unknown merge mode: '%s'" % self._mode) - - if not self.built or not self._vars_built: - self._collect_weights() - self._vars_built = True - - return outputs - - @property - def layers(self): - """The list of parallel layers. - """ - return self._layers - - -class SequentialLayer(tf.layers.Layer): - """A subclass of :tf_main:`tf.layers.Layer `. - A layer that consists of multiple layers connected sequentially. - - Args: - layers (list): A list of :tf_main:`tf.layers.Layer - ` instances, or a list of hyperparameter dicts - each of which specifying type and kwargs of each layer (see - the `hparams` argument of :func:`get_layer`). The layers are - connected sequentially. 
- """ - def __init__(self, - layers, - trainable=True, - name=None, - **kwargs): - super(SequentialLayer, self).__init__( - trainable=trainable, name=name, **kwargs) - - if len(layers) == 0: - raise ValueError("'layers' must be a non-empty list.") - self._layers = [] - for layer in layers: - if isinstance(layer, tf.layers.Layer): - self._layers.append(layer) - else: - self._layers.append(get_layer(hparams=layer)) - - # Keep tracks of whether trainable variables have been created - self._vars_built = False - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape) - for layer in self._layers: - output_shape = layer.compute_output_shape(input_shape) - input_shape = output_shape - return output_shape - - def _collect_weights(self): - """Collects (non-)trainable weights of each of the layers. - """ - for layer in self._layers: - if self.trainable: - add_variable( - layer._trainable_weights, self._trainable_weights) - else: - add_variable( - layer._trainable_weights, self._non_trainable_weights) - add_variable( - layer._non_trainable_weights, self._non_trainable_weights) - - @property - def trainable_weights(self): - return self._trainable_weights - - @property - def non_trainable_weights(self): - return self._non_trainable_weights - - def call(self, inputs, mode=None): # pylint: disable=arguments-differ - training = is_train_mode(mode) - - outputs = inputs - for layer in self._layers: - if isinstance(layer, tf.layers.Dropout) or \ - isinstance(layer, tf.layers.BatchNormalization): - outputs = layer(outputs, training=training) - else: - outputs = layer(inputs) - inputs = outputs - - if not self.built or not self._vars_built: - self._collect_weights() - self._vars_built = True - - return outputs - - @property - def layers(self): - """The list of layers connected sequentially. - """ - return self._layers - - def _common_default_conv_dense_kwargs(): - """Returns the default keyword argument values that are common to + r"""Returns the default keyword argument values that are common to convolution layers. """ return { "activation": None, "use_bias": True, "kernel_initializer": { - "type": "glorot_uniform_initializer", + "type": "glorot_uniform", "kwargs": {} }, "bias_initializer": { @@ -1015,19 +356,16 @@ def _common_default_conv_dense_kwargs(): "bias_regularizer": default_regularizer_hparams(), "activity_regularizer": default_regularizer_hparams(), "kernel_constraint": None, - "bias_constraint": None, - "trainable": True, - "name": None + "bias_constraint": None } def default_conv1d_kwargs(): - """Returns the default keyword argument values of the constructor + r"""Returns the default keyword argument values of the constructor of 1D-convolution layer class - :tf_main:`tf.layers.Conv1D `. + :tf_main:`tf.keras.layers.Conv1D `. .. 
code-block:: python - { "filters": 100, "kernel_size": 3, @@ -1038,7 +376,7 @@ def default_conv1d_kwargs(): "activation": "identity", "use_bias": True, "kernel_initializer": { - "type": "glorot_uniform_initializer", + "type": "glorot_uniform", "kwargs": {} }, "bias_initializer": { @@ -1062,8 +400,6 @@ def default_conv1d_kwargs(): }, "kernel_constraint": None, "bias_constraint": None, - "trainable": True, - "name": None } """ kwargs = _common_default_conv_dense_kwargs() @@ -1078,32 +414,32 @@ def default_conv1d_kwargs(): def default_conv2d_kwargs(): - """TODO + r"""TODO """ return {} def default_conv3d_kwargs(): - """TODO + r"""TODO """ return {} def default_conv2d_transpose_kwargs(): - """TODO + r"""TODO """ return {} def default_conv3d_transpose_kwargs(): - """TODO + r"""TODO """ return {} def default_dense_kwargs(): - """Returns the default keyword argument values of the constructor - of the dense layer class :tf_main:`tf.layers.Dense `. + r"""Returns the default keyword argument values of the constructor + of the dense layer class :tf_main:`tf.keras.layers.Dense `. .. code-block:: python @@ -1112,7 +448,7 @@ def default_dense_kwargs(): "activation": "identity", "use_bias": True, "kernel_initializer": { - "type": "glorot_uniform_initializer", + "type": "glorot_uniform", "kwargs": {} }, "bias_initializer": { @@ -1136,8 +472,6 @@ def default_dense_kwargs(): }, "kernel_constraint": None, "bias_constraint": None, - "trainable": True, - "name": None } """ kwargs = _common_default_conv_dense_kwargs() @@ -1148,124 +482,80 @@ def default_dense_kwargs(): def default_dropout_kwargs(): - """TODO + r"""TODO """ return {} - # raise NotImplementedError def default_flatten_kwargs(): - """TODO + r"""TODO """ return {} def default_max_pooling1d_kwargs(): - """TODO + r"""TODO """ return {} - # raise NotImplementedError def default_max_pooling2d_kwargs(): - """TODO + r"""TODO """ return {} - # raise NotImplementedError def default_max_pooling3d_kwargs(): - """TODO + r"""TODO """ return {} - # raise NotImplementedError def default_separable_conv2d_kwargs(): - """TODO + r"""TODO """ return {} - # raise NotImplementedError def default_batch_normalization_kwargs(): - """TODO + r"""TODO """ return {} - # raise NotImplementedError def default_average_pooling1d_kwargs(): - """TODO + r"""TODO """ return {} - # raise NotImplementedError def default_average_pooling2d_kwargs(): - """TODO + r"""TODO """ return {} - # raise NotImplementedError def default_average_pooling3d_kwargs(): - """TODO + r"""TODO """ return {} - # raise NotImplementedError _layer_class_to_default_kwargs_map = { - tf.layers.Conv1D: default_conv1d_kwargs(), - tf.layers.Conv2D: default_conv2d_kwargs(), - tf.layers.Conv3D: default_conv3d_kwargs(), - tf.layers.Conv2DTranspose: default_conv2d_transpose_kwargs(), - tf.layers.Conv3DTranspose: default_conv3d_transpose_kwargs(), - tf.layers.Dense: default_dense_kwargs(), - tf.layers.Dropout: default_dropout_kwargs(), - tf.layers.Flatten: default_flatten_kwargs(), - tf.layers.MaxPooling1D: default_max_pooling1d_kwargs(), - tf.layers.MaxPooling2D: default_max_pooling2d_kwargs(), - tf.layers.MaxPooling3D: default_max_pooling3d_kwargs(), - tf.layers.SeparableConv2D: default_separable_conv2d_kwargs(), - tf.layers.BatchNormalization: default_batch_normalization_kwargs(), - tf.layers.AveragePooling1D: default_average_pooling1d_kwargs(), - tf.layers.AveragePooling2D: default_average_pooling2d_kwargs(), - tf.layers.AveragePooling3D: default_average_pooling3d_kwargs(), + tf.keras.layers.Conv1D: 
default_conv1d_kwargs(), + tf.keras.layers.Conv2D: default_conv2d_kwargs(), + tf.keras.layers.Conv3D: default_conv3d_kwargs(), + tf.keras.layers.Conv2DTranspose: default_conv2d_transpose_kwargs(), + tf.keras.layers.Conv3DTranspose: default_conv3d_transpose_kwargs(), + tf.keras.layers.Dense: default_dense_kwargs(), + tf.keras.layers.Dropout: default_dropout_kwargs(), + tf.keras.layers.Flatten: default_flatten_kwargs(), + tf.keras.layers.MaxPooling1D: default_max_pooling1d_kwargs(), + tf.keras.layers.MaxPooling2D: default_max_pooling2d_kwargs(), + tf.keras.layers.MaxPooling3D: default_max_pooling3d_kwargs(), + tf.keras.layers.SeparableConv2D: default_separable_conv2d_kwargs(), + tf.keras.layers.BatchNormalization: default_batch_normalization_kwargs(), + tf.keras.layers.AveragePooling1D: default_average_pooling1d_kwargs(), + tf.keras.layers.AveragePooling2D: default_average_pooling2d_kwargs(), + tf.keras.layers.AveragePooling3D: default_average_pooling3d_kwargs(), } - - -def layer_normalize(inputs, - scope=None, - **kwargs): - """Applies layer normalization. Normalizes over the last dimension. - - Args: - inputs: A tensor with 2 or more dimensions, where the first - dimension must be `batch_size`. - scope (optional): variable scope. - - Returns: - A tensor with the same shape and data dtype as `inputs`. - """ - return tf.contrib.layers.layer_norm( - inputs=inputs, begin_norm_axis=-1, begin_params_axis=-1, scope=scope, - **kwargs - ) - - -def gelu(input_tensor): - """Gaussian Error Linear Unit. - - This is a smoother version of the RELU. - Original paper: https://arxiv.org/abs/1606.08415 - - Args: - input_tensor: float Tensor to perform activation. - - Returns: - `input_tensor` with the GELU activation applied. - """ - cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0))) - return input_tensor * cdf diff --git a/texar/tf/core/layers_test.py b/texar/tf/core/layers_test.py index 6c2ab63e..b8bd329f 100644 --- a/texar/tf/core/layers_test.py +++ b/texar/tf/core/layers_test.py @@ -1,130 +1,13 @@ -# """ Unit tests for various layers. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - import numpy as np import tensorflow as tf -import tensorflow.contrib.rnn as rnn -import texar.tf as tx -from texar.tf import context -from texar.tf.hyperparams import HParams from texar.tf.core import layers -# pylint: disable=no-member, protected-access, invalid-name -# pylint: disable=redefined-variable-type - - -class GetRNNCellTest(tf.test.TestCase): - """Tests RNN cell creator. - """ - - def test_get_rnn_cell(self): - """Tests :func:`texar.tf.core.layers.get_rnn_cell`. 
- """ - emb_dim = 4 - num_units = 64 - - # Given instance - hparams = { - "type": rnn.LSTMCell(num_units) - } - cell = layers.get_rnn_cell(hparams) - self.assertTrue(isinstance(cell, rnn.LSTMCell)) - - # Given class - hparams = { - "type": rnn.LSTMCell, - "kwargs": {"num_units": 10} - } - cell = layers.get_rnn_cell(hparams) - self.assertTrue(isinstance(cell, rnn.LSTMCell)) - - # Given string, and complex hyperparameters - keep_prob_x = tf.placeholder( - name='keep_prob', shape=[], dtype=tf.float32) - hparams = { - "type": "tensorflow.contrib.rnn.GRUCell", - "kwargs": { - "num_units": num_units - }, - "num_layers": 2, - "dropout": { - "input_keep_prob": 0.8, - "state_keep_prob": keep_prob_x, - "variational_recurrent": True, - "input_size": [emb_dim, num_units] - }, - "residual": True, - "highway": True - } - - hparams_ = HParams(hparams, layers.default_rnn_cell_hparams()) - cell = layers.get_rnn_cell(hparams_) - - batch_size = 16 - inputs = tf.zeros([batch_size, emb_dim], dtype=tf.float32) - output, state = cell(inputs, - cell.zero_state(batch_size, dtype=tf.float32)) - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - - feed_dict = { - keep_prob_x: 1.0, - context.global_mode(): tf.estimator.ModeKeys.TRAIN - } - output_, state_ = sess.run([output, state], feed_dict=feed_dict) - - self.assertEqual(output_.shape[0], batch_size) - if isinstance(state_, (list, tuple)): - self.assertEqual(state_[0].shape[0], batch_size) - self.assertEqual(state_[0].shape[1], - hparams_.kwargs.num_units) - else: - self.assertEqual(state_.shape[0], batch_size) - self.assertEqual(state_.shape[1], - hparams_.kwargs.num_units) - - def test_switch_dropout(self): - """Tests dropout mode. - """ - emb_dim = 4 - num_units = 64 - hparams = { - "kwargs": { - "num_units": num_units - }, - "num_layers": 2, - "dropout": { - "input_keep_prob": 0.8, - }, - } - mode = tf.placeholder(tf.string) - hparams_ = HParams(hparams, layers.default_rnn_cell_hparams()) - cell = layers.get_rnn_cell(hparams_, mode) - - batch_size = 16 - inputs = tf.zeros([batch_size, emb_dim], dtype=tf.float32) - output, state = cell(inputs, - cell.zero_state(batch_size, dtype=tf.float32)) - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - output_train, _ = sess.run( - [output, state], - feed_dict={mode: tf.estimator.ModeKeys.TRAIN}) - self.assertEqual(output_train.shape[0], batch_size) - output_test, _ = sess.run( - [output, state], - feed_dict={mode: tf.estimator.ModeKeys.EVAL}) - self.assertEqual(output_test.shape[0], batch_size) - class GetActivationFnTest(tf.test.TestCase): """Tests :func:`texar.tf.core.layers.get_activation_fn`. 
@@ -138,23 +21,17 @@ def test_get_activation_fn(self): fn = layers.get_activation_fn('relu') self.assertEqual(fn, tf.nn.relu) - inputs = tf.random_uniform([64, 100], -5, 20, dtype=tf.int32) + inputs = tf.random.uniform([64, 100], -5, 20, dtype=tf.int32) fn = layers.get_activation_fn('leaky_relu') fn_output = fn(inputs) ref_output = tf.nn.leaky_relu(inputs) - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - fn_output_, ref_output_ = sess.run([fn_output, ref_output]) - np.testing.assert_array_equal(fn_output_, ref_output_) + np.testing.assert_array_equal(fn_output, ref_output) fn = layers.get_activation_fn('leaky_relu', kwargs={'alpha': 0.1}) fn_output = fn(inputs) ref_output = tf.nn.leaky_relu(inputs, alpha=0.1) - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - fn_output_, ref_output_ = sess.run([fn_output, ref_output]) - np.testing.assert_array_equal(fn_output_, ref_output_) + np.testing.assert_array_equal(fn_output, ref_output) class GetLayerTest(tf.test.TestCase): @@ -167,168 +44,19 @@ def test_get_layer(self): "type": "Conv1D" } layer = layers.get_layer(hparams) - self.assertTrue(isinstance(layer, tf.layers.Conv1D)) + self.assertTrue(isinstance(layer, tf.keras.layers.Conv1D)) hparams = { - "type": "MergeLayer", - "kwargs": { - "layers": [ - {"type": "Conv1D"}, - {"type": "Conv1D"} - ] - } + "type": tf.keras.layers.Conv1D } layer = layers.get_layer(hparams) - self.assertTrue(isinstance(layer, tx.core.MergeLayer)) + self.assertTrue(isinstance(layer, tf.keras.layers.Conv1D)) hparams = { - "type": tf.layers.Conv1D + "type": tf.keras.layers.Conv1D(filters=10, kernel_size=2) } layer = layers.get_layer(hparams) - self.assertTrue(isinstance(layer, tf.layers.Conv1D)) - - hparams = { - "type": tf.layers.Conv1D(filters=10, kernel_size=2) - } - layer = layers.get_layer(hparams) - self.assertTrue(isinstance(layer, tf.layers.Conv1D)) - - -class ReducePoolingLayerTest(tf.test.TestCase): - """Tests reduce pooling layer. - """ - def setUp(self): - tf.test.TestCase.setUp(self) - - self._batch_size = 64 - self._seq_length = 16 - self._emb_dim = 100 - - def test_max_reduce_pooling_layer(self): - """Tests :class:`texar.tf.core.MaxReducePooling1D`. - """ - pool_layer = layers.MaxReducePooling1D() - - inputs = tf.random_uniform( - [self._batch_size, self._seq_length, self._emb_dim]) - output_shape = pool_layer.compute_output_shape(inputs.get_shape()) - output = pool_layer(inputs) - output_reduce = tf.reduce_max(inputs, axis=1) - self.assertEqual(output.get_shape(), output_shape) - self.assertEqual(output.get_shape(), [self._batch_size, self._emb_dim]) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - output_, output_reduce_ = sess.run([output, output_reduce]) - np.testing.assert_array_equal(output_, output_reduce_) - - def test_average_reduce_pooling_layer(self): - """Tests :class:`texar.tf.core.AverageReducePooling1D`. 
- """ - pool_layer = layers.AverageReducePooling1D() - - inputs = tf.random_uniform( - [self._batch_size, self._seq_length, self._emb_dim]) - output_shape = pool_layer.compute_output_shape(inputs.get_shape()) - output = pool_layer(inputs) - output_reduce = tf.reduce_mean(inputs, axis=1) - self.assertEqual(output.get_shape(), output_shape) - self.assertEqual(output.get_shape(), [self._batch_size, self._emb_dim]) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - output_, output_reduce_ = sess.run([output, output_reduce]) - np.testing.assert_array_equal(output_, output_reduce_) - - -class MergeLayerTest(tf.test.TestCase): - """Tests MergeLayer. - """ - - def test_output_shape(self): - """Tests MergeLayer.compute_output_shape function. - """ - input_shapes = [[None, 1, 2], [64, 2, 2], [None, 3, 2]] - - concat_layer = layers.MergeLayer(mode='concat', axis=1) - concat_output_shape = concat_layer.compute_output_shape(input_shapes) - self.assertEqual(concat_output_shape, [64, 6, 2]) - - sum_layer = layers.MergeLayer(mode='sum', axis=1) - sum_output_shape = sum_layer.compute_output_shape(input_shapes) - self.assertEqual(sum_output_shape, [64, 2]) - - input_shapes = [[None, 5, 2], [64, None, 2], [2]] - esum_layer = layers.MergeLayer(mode='elemwise_sum') - esum_output_shape = esum_layer.compute_output_shape(input_shapes) - self.assertEqual(esum_output_shape, [64, 5, 2]) - - def test_layer_logics(self): - """Test the logic of MergeLayer. - """ - layers_ = [] - layers_.append(tf.layers.Conv1D(filters=200, kernel_size=3)) - layers_.append(tf.layers.Conv1D(filters=200, kernel_size=4)) - layers_.append(tf.layers.Conv1D(filters=200, kernel_size=5)) - layers_.append(tf.layers.Dense(200)) - layers_.append(tf.layers.Dense(200)) - m_layer = layers.MergeLayer(layers_) - - inputs = tf.zeros([64, 16, 1024], dtype=tf.float32) - outputs = m_layer(inputs) - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_ = sess.run(outputs) - self.assertEqual(outputs_.shape[0], 64) - self.assertEqual(outputs_.shape[2], 200) - self.assertEqual( - outputs_.shape, - m_layer.compute_output_shape(inputs.shape.as_list())) - - def test_trainable_variables(self): - """Test the trainable_variables of the layer. - """ - layers_ = [] - layers_.append(tf.layers.Conv1D(filters=200, kernel_size=3)) - layers_.append(tf.layers.Conv1D(filters=200, kernel_size=4)) - layers_.append(tf.layers.Conv1D(filters=200, kernel_size=5)) - layers_.append(tf.layers.Dense(200)) - layers_.append(tf.layers.Dense(200)) - m_layer = layers.MergeLayer(layers_) - - inputs = tf.zeros([64, 16, 1024], dtype=tf.float32) - _ = m_layer(inputs) - - num_vars = sum([len(layer.trainable_variables) for layer in layers_]) - self.assertEqual(num_vars, len(m_layer.trainable_variables)) - - -class SequentialLayerTest(tf.test.TestCase): - """Tests sequential layer. - """ - - def test_seq_layer(self): - """Test sequential layer. 
- """ - layers_ = [] - layers_.append(tf.layers.Dense(100)) - layers_.append(tf.layers.Dense(200)) - seq_layer = layers.SequentialLayer(layers_) - - output_shape = seq_layer.compute_output_shape([None, 10]) - self.assertEqual(output_shape[1].value, 200) - - inputs = tf.zeros([10, 20], dtype=tf.float32) - outputs = seq_layer(inputs) - - num_vars = sum([len(layer.trainable_variables) for layer in layers_]) - self.assertEqual(num_vars, len(seq_layer.trainable_variables)) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_ = sess.run(outputs) - self.assertEqual(outputs_.shape[0], 10) - self.assertEqual(outputs_.shape[1], 200) + self.assertTrue(isinstance(layer, tf.keras.layers.Conv1D)) if __name__ == "__main__": diff --git a/texar/tf/core/optimization.py b/texar/tf/core/optimization.py deleted file mode 100644 index 70036be8..00000000 --- a/texar/tf/core/optimization.py +++ /dev/null @@ -1,597 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Various optimization related utilities. -""" - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import re -import tensorflow as tf - -from texar.tf.hyperparams import HParams -from texar.tf.utils import utils - -# pylint: disable=too-many-arguments, no-member - -__all__ = [ - "default_optimization_hparams", - "get_optimizer_fn", - "get_learning_rate_decay_fn", - "get_gradient_clip_fn", - "get_optimizer", - "get_train_op", - "AdamWeightDecayOptimizer", -] - - -def default_optimization_hparams(): - """Returns a `dict` of default hyperparameters of training op - and their default values - - .. role:: python(code) - :language: python - - .. code-block:: python - - { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 0.001 - } - }, - "learning_rate_decay": { - "type": "", - "kwargs": {}, - "min_learning_rate": 0., - "start_decay_step": 0, - "end_decay_step": inf - }, - "gradient_clip": { - "type": "", - "kwargs": {} - }, - "gradient_noise_scale": None, - "name": None - } - - Here: - - "optimizer": dict - Hyperparameters of a :tf_main:`tf.train.Optimizer `. - - - **"type"** specifies the optimizer class. This can be - - - The string name or full module path of an optimizer class. \ - If the class name is provided, the class must be in module \ - :tf_main:`tf.train `, \ - :tf_main:`tf.contrib.opt ` or :mod:`texar.tf.custom` \ - , :mod:`texar.tf.core.optimization` - - An optimizer class. - - An instance of an optimizer class. - - For example - - .. code-block:: python - - "type": "AdamOptimizer" # class name - "type": "my_module.MyOptimizer" # module path - "type": tf.contrib.opt.AdamWOptimizer # class - "type": my_module.MyOptimizer # class - "type": GradientDescentOptimizer(learning_rate=0.1) # instance - "type": MyOptimizer(...) 
# instance - - - **"kwargs"** is a `dict` specifying keyword arguments for creating \ - the optimizer class instance, with :python:`opt_class(**kwargs)`. \ - Ignored if "type" is a class instance. - - "learning_rate_decay": dict - Hyperparameters of learning rate decay function. The learning rate - starts decay from :attr:`"start_decay_step"` and keeps unchanged after - :attr:`"end_decay_step"` or reaching :attr:`"min_learning_rate"`. - - The decay function is specified in "type" and "kwargs". - - - "type" can be a decay function or its name or module path. If \ - function name is provided, it must be from module \ - :tf_main:`tf.train ` or :mod:`texar.tf.custom`, \ - :mod:`texar.tf.core.optimization`. - - - "kwargs" is a `dict` of keyword arguments for the function \ - excluding arguments named "global_step" and "learning_rate". - - The function is called with - :python:`lr = decay_fn(learning_rate=lr, global_step=offset_step, - **kwargs)`, where `offset_step` is the global step offset as above. - The only exception is :tf_main:`tf.train.piecewise_constant - ` which is called with - :python:`lr = piecewise_constant(x=offset_step, **kwargs)`. - - "gradient_clip": dict - Hyperparameters of gradient clipping. The gradient clipping function - takes a list of `(gradients, variables)` tuples and returns a list - of `(clipped_gradients, variables)` tuples. Typical examples include - :tf_main:`tf.clip_by_global_norm `, - :tf_main:`tf.clip_by_value `, - :tf_main:`tf.clip_by_norm `, - :tf_main:`tf.clip_by_average_norm `, etc. - - "type" specifies the gradient clip function, and can be a function, - or its name or mudule path. If function name is provided, the - function must be from module :tf_main:`tf < >` - or :mod:`texar.tf.custom`, :mod:`texar.tf.core.optimization`. - - - "kwargs" specifies keyword arguments to the function, except arguments - named "t" or "t_list". - - The function is called with - :python:`clipped_grads(, _) = clip_fn(t_list=grads, **kwargs)` - (e.g., for :tf_main:`tf.clip_by_global_norm `) or - :python:`clipped_grads = [clip_fn(t=grad, **kwargs) for grad in grads]` - (e.g., for :tf_main:`tf.clip_by_value `). - - "gradient_noise_scale": float, optional - Adds 0-mean normal noise scaled by this value to gradient. - """ - return { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 0.001 - } - }, - "learning_rate_decay": { - "type": "", - "kwargs": {}, - "min_learning_rate": 0., - "start_decay_step": 0, - "end_decay_step": utils.MAX_SEQ_LENGTH, - }, - "gradient_clip": { - "type": "", - "kwargs": {} - }, - "gradient_noise_scale": None, - # TODO(zhiting): allow module-level control of gradient_multipliers - "name": None - } - - -def get_optimizer_fn(hparams=None): - """Returns a function `optimizer_fn` of making optimizer instance, along - with the optimizer class. - - .. role:: python(code) - :language: python - - The function has the signiture - :python:`optimizer_fn(learning_rate=None) -> optimizer class instance` - - See the :attr:`"optimizer"` field of - :meth:`~texar.tf.core.default_optimization_hparams` for all - hyperparameters and default values. - - The optimizer class must be a subclass of - :tf_main:`tf.train.Optimizer `. - - Args: - hparams (dict or HParams, optional): hyperparameters. Missing - hyperparameters are set to default values automatically. 
- - Returns: - - If hparams["type"] is a string or optimizer class, returns\ - `(optimizer_fn, optimizer class)`, - - - If hparams["type"] is an optimizer instance, returns \ - `(the optimizer instance, optimizer class)` - """ - if hparams is None or isinstance(hparams, dict): - hparams = HParams( - hparams, default_optimization_hparams()["optimizer"]) - - opt = hparams["type"] - if isinstance(opt, tf.train.Optimizer): - return opt, type(opt) - opt_modules = ['tensorflow.train', - 'tensorflow.contrib.opt', - 'texar.tf.core.optimization', - 'texar.tf.custom'] - try: - opt_class = utils.check_or_get_class(opt, opt_modules, - tf.train.Optimizer) - except TypeError: - raise ValueError( - "Unrecognized optimizer. Must be string name of the " - "optimizer class, or the class which is a subclass of " - "tf.train.Optimizer, or an instance of the subclass of " - "Optimizer.") - - def _get_opt(learning_rate=None): - opt_kwargs = hparams["kwargs"].todict() - fn_args = set(utils.get_args(opt_class.__init__)) - if 'learning_rate' in fn_args and learning_rate is not None: - opt_kwargs["learning_rate"] = learning_rate - return opt_class(**opt_kwargs) - - return _get_opt, opt_class - - -def get_learning_rate_decay_fn(hparams=None): - """Creates learning rate decay function based on the hyperparameters. - - See the :attr:`learning_rate_decay` field in - :meth:`~texar.tf.core.default_optimization_hparams` for all - hyperparameters and default values. - - Args: - hparams (dict or HParams, optional): hyperparameters. Missing - hyperparameters are set to default values automatically. - - Returns: - function or None: If hparams["type"] is specified, returns a - function that takes `(learning_rate, step, **kwargs)` and - returns a decayed learning rate. If - hparams["type"] is empty, returns `None`. - """ - if hparams is None or isinstance(hparams, dict): - hparams = HParams( - hparams, default_optimization_hparams()["learning_rate_decay"]) - - fn_type = hparams["type"] - if fn_type is None or fn_type == "": - return None - - fn_modules = ["tensorflow.train", "texar.tf.custom"] - decay_fn = utils.get_function(fn_type, fn_modules) - fn_kwargs = hparams["kwargs"] - if fn_kwargs is HParams: - fn_kwargs = fn_kwargs.todict() - - start_step = tf.cast(hparams["start_decay_step"], tf.int32) - end_step = tf.cast(hparams["end_decay_step"], tf.int32) - - def lr_decay_fn(learning_rate, global_step): - """Learning rate decay function. - - Args: - learning_rate (float or Tensor): The original learning rate. - global_step (int or scalar int Tensor): optimization step counter. - - Returns: - scalar float Tensor: decayed learning rate. - """ - offset_global_step = tf.maximum( - tf.minimum(tf.cast(global_step, tf.int32), end_step) - start_step, - 0) - if decay_fn == tf.train.piecewise_constant: - decayed_lr = decay_fn(x=offset_global_step, **fn_kwargs) - else: - fn_kwargs_ = { - "learning_rate": learning_rate, - "global_step": offset_global_step} - fn_kwargs_.update(fn_kwargs) - decayed_lr = utils.call_function_with_redundant_kwargs( - decay_fn, fn_kwargs_) - - decayed_lr = tf.maximum(decayed_lr, hparams["min_learning_rate"]) - - return decayed_lr - - return lr_decay_fn - - -def get_gradient_clip_fn(hparams=None): - """Creates a gradient clipping function based on the hyperparameters. - - See the :attr:`gradient_clip` field in - :meth:`~texar.tf.core.default_optimization_hparams` for all - hyperparameters and default values. 
- - The gradient clipping function takes a list of `(gradients, variables)` - tuples and returns a list of `(clipped_gradients, variables)` tuples. - Typical examples include - :tf_main:`tf.clip_by_global_norm `, - :tf_main:`tf.clip_by_value `, - :tf_main:`tf.clip_by_norm `, - :tf_main:`tf.clip_by_average_norm `, etc. - - Args: - hparams (dict or HParams, optional): hyperparameters. Missing - hyperparameters are set to default values automatically. - - Returns: - function or `None`: If hparams["type"] is specified, returns - the respective function. If hparams["type"] is empty, - returns `None`. - """ - if hparams is None or isinstance(hparams, dict): - hparams = HParams( - hparams, default_optimization_hparams()["gradient_clip"]) - fn_type = hparams["type"] - if fn_type is None or fn_type == "": - return None - - fn_modules = ["tensorflow", "texar.tf.custom"] - clip_fn = utils.get_function(fn_type, fn_modules) - clip_fn_args = utils.get_args(clip_fn) - fn_kwargs = hparams["kwargs"] - if isinstance(fn_kwargs, HParams): - fn_kwargs = fn_kwargs.todict() - - def grad_clip_fn(grads_and_vars): - """Gradient clipping function. - - Args: - grads_and_vars (list): A list of `(gradients, variables)` tuples. - - Returns: - list: A list of `(clipped_gradients, variables)` tuples. - """ - grads, vars_ = zip(*grads_and_vars) - if clip_fn == tf.clip_by_global_norm: - clipped_grads, _ = clip_fn(t_list=grads, **fn_kwargs) - elif 't_list' in clip_fn_args: - clipped_grads = clip_fn(t_list=grads, **fn_kwargs) - elif 't' in clip_fn_args: # e.g., tf.clip_by_value - clipped_grads = [clip_fn(t=grad, **fn_kwargs) for grad in grads] - - return list(zip(clipped_grads, vars_)) - - return grad_clip_fn - - -def _get_static_lr(learning_rate=None, optimizer_class=None, hparams=None): - """Return the base static learning_rate. - A helper function for creating the optimization function. - """ - hparams = HParams(hparams, default_optimization_hparams()) - opt_hparams = hparams['optimizer'] - if learning_rate is None: - learning_rate = opt_hparams["kwargs"].get("learning_rate", None) - if learning_rate is None: - # Try to get learning_rate from the default value of the - # optimizer's argument - opt_argspec = utils.get_default_arg_values(optimizer_class.__init__) - learning_rate = opt_argspec.get("learning_rate", None) - return learning_rate - - -def get_optimizer(learning_rate=None, global_step=None, hparams=None): - - """Creates a optimizer instance. - - Args: - learning_rate (float or Tensor, optional): If `None`, learning rate - specified in :attr:`hparams`, or the default learning rate - of the optimizer (if exists) is used. - global_step (optional): A scalar int Tensor. Step counter to update on - each step unless :attr:`increment_global_step` is `False`. - Learning rate decay uses :attr:`global_step`. - If `None`, it will be fetched from the default graph (see - :tf_main:`tf.train.get_global_step ` for - more details). If it has not been created, no step will be - incremented with each weight update. - hparams (dict or HParams, optional): hyperparameters. Missing - hyperparameters are set to default values automatically. See - :func:`~texar.tf.core.default_optimization_hparams` for - all hyperparameters and default values. - - Returns: - optimizer: the tf.train.Optimizer instance specified in hparams. 
- """ - hparams = HParams(hparams, default_optimization_hparams()) - - opt_hparams = hparams["optimizer"] - optimizer_fn, optimizer_class = get_optimizer_fn(opt_hparams) - - static_lr = _get_static_lr(learning_rate, optimizer_class, hparams) - - lr_decay_fn = get_learning_rate_decay_fn(hparams["learning_rate_decay"]) - if lr_decay_fn is not None: - learning_rate = lr_decay_fn(learning_rate=static_lr, - global_step=global_step) - else: - learning_rate = static_lr - - tf.summary.scalar("learning_rate", learning_rate) - - optimizer = optimizer_fn(learning_rate=learning_rate) - - return optimizer - - -def get_train_op(loss, variables=None, - optimizer=None, learning_rate=None, - global_step=None, increment_global_step=True, hparams=None): - """Creates a training op. - - This is a wrapper of :tf_main:`tf.contrib.layers.optimize_loss - `. - - Args: - loss: A scalar Tensor representing the loss to minimize. - variables (optional): A list of Variables to optimize. If - `None`, all trainable variables are used. - optimizer (optional): An tf.train.Optimizer instance. If `None`, - use the setting in `hparams` to create the optimizer. - learning_rate (float or Tensor, optional): If `None`, learning rate - specified in :attr:`hparams`, or the default learning rate - of the optimizer will be used (if exists). - global_step (optional): A scalar int Tensor. Step counter to update on - each step unless :attr:`increment_global_step` is `False`. - Learning rate decay uses :attr:`global_step`. - If `None`, it will be fetched from the default graph (see - :tf_main:`tf.train.get_global_step ` for - more details). If it has not been created, no step will be - incremented with each weight update. - increment_global_step (bool): Whether to increment - :attr:`global_step`. This is useful if the :attr:`global_step` is - used in multiple training ops per training step (e.g. to optimize - different parts of the model) to avoid incrementing - :attr:`global_step` more times than necessary. - hparams (dict or HParams, optional): hyperparameters. Missing - hyperparameters are set to default values automatically. See - :func:`~texar.tf.core.default_optimization_hparams` for - all hyperparameters and default values. - - Returns: - train_op: the operator used for variables optimization. 
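    For instance, a scalar loss can be wired to a training op with gradient
    clipping and learning rate decay specified entirely through
    hyperparameters. A sketch, with values borrowed from the unit tests
    removed later in this diff; unspecified fields (e.g., the default Adam
    optimizer) fall back to the defaults:

    .. code-block:: python

        var = tf.Variable(0.)
        loss = tf.nn.l2_loss(var)
        hparams = {
            "gradient_clip": {"type": "clip_by_global_norm",
                              "kwargs": {"clip_norm": 0.1}},
            "learning_rate_decay": {"type": "natural_exp_decay",
                                    "kwargs": {"decay_steps": 1,
                                               "decay_rate": 0.5}}
        }
        train_op = get_train_op(loss, hparams=hparams)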
- """ - hparams = HParams(hparams, default_optimization_hparams()) - grad_clip_fn = get_gradient_clip_fn(hparams["gradient_clip"]) - - if not isinstance(optimizer, tf.train.Optimizer): - opt_hparams = hparams["optimizer"] - optimizer_fn, optimizer_class = get_optimizer_fn(opt_hparams) - learning_rate = _get_static_lr(learning_rate, optimizer_class, hparams) - lr_decay_fn = get_learning_rate_decay_fn( - hparams["learning_rate_decay"]) - train_op = tf.contrib.layers.optimize_loss( - loss=loss, - global_step=global_step, - learning_rate=learning_rate, - optimizer=optimizer_fn, - gradient_noise_scale=hparams["gradient_noise_scale"], - clip_gradients=grad_clip_fn, - learning_rate_decay_fn=lr_decay_fn, - variables=variables, - name=hparams["name"], - increment_global_step=increment_global_step) - - else: - train_op = tf.contrib.layers.optimize_loss( - loss=loss, - global_step=global_step, - learning_rate=None, - optimizer=optimizer, - gradient_noise_scale=hparams["gradient_noise_scale"], - clip_gradients=grad_clip_fn, - variables=variables, - name=hparams["name"], - increment_global_step=increment_global_step) - - return train_op - - -class AdamWeightDecayOptimizer(tf.train.Optimizer): - """ - A basic Adam optimizer that includes "correct" L2 weight decay. - Copied from the google BERT repo. - Except that in `apply_gradient` function, we add the support to increment - the passed global step parameter, to make it more compatible to - tf.train.Optimizer implementation. - """ - - def __init__(self, - learning_rate, - weight_decay_rate=0.0, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-6, - exclude_from_weight_decay=None, - name="AdamWeightDecayOptimizer"): - """Constructs a AdamWeightDecayOptimizer.""" - super(AdamWeightDecayOptimizer, self).__init__(False, name) - - self.learning_rate = learning_rate - self.weight_decay_rate = weight_decay_rate - self.beta_1 = beta_1 - self.beta_2 = beta_2 - self.epsilon = epsilon - self.exclude_from_weight_decay = exclude_from_weight_decay - - # pylint: disable=too-many-locals - def apply_gradients(self, grads_and_vars, global_step=None, name=None): - """See base class.""" - with tf.name_scope(name, self._name) as name: - assignments = [] - for (grad, param) in grads_and_vars: - if grad is None or param is None: - continue - - param_name = self._get_variable_name(param.name) - - m = tf.get_variable( - name=param_name + "/adam_m", - shape=param.shape.as_list(), - dtype=tf.float32, - trainable=False, - initializer=tf.zeros_initializer()) - v = tf.get_variable( - name=param_name + "/adam_v", - shape=param.shape.as_list(), - dtype=tf.float32, - trainable=False, - initializer=tf.zeros_initializer()) - - # Standard Adam update. - next_m = (tf.multiply(self.beta_1, m) - + tf.multiply(1.0 - self.beta_1, - grad)) - next_v = (tf.multiply(self.beta_2, v) - + tf.multiply(1.0 - self.beta_2, tf.square(grad))) - - update = next_m / (tf.sqrt(next_v) + self.epsilon) - - # Just adding the square of the weights to the loss function is - # *not* the correct way of using L2 regularization/weight decay - # with Adam, since that will interact with the m and v - # parameters in strange ways. - # Instead we want ot decay the weights in a manner that doesn't - # interact with the m/v parameters. - # This is equivalent to adding the square - # of the weights to the loss with plain (non-momentum) SGD. 
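Written out as an update rule, the decoupled decay implemented just below is:

.. code-block:: python

    update     = m_t / (sqrt(v_t) + epsilon) + weight_decay_rate * param
    next_param = param - learning_rate * update

so the decay term is added after the moment estimates and never flows through `m` or `v`.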
- if self._do_use_weight_decay(param_name): - update += self.weight_decay_rate * param - - update_with_lr = self.learning_rate * update - - next_param = param - update_with_lr - - assignments.extend( - [param.assign(next_param), - m.assign(next_m), - v.assign(next_v)]) - - update_ops = assignments - if global_step is None: - apply_updates = self._finish(update_ops, name) - else: - with tf.control_dependencies([self._finish(update_ops, - "update")]): - with tf.colocate_with(global_step): - apply_updates = tf.assign_add(global_step, 1, name=name) - - return apply_updates - - def _do_use_weight_decay(self, param_name): - """Whether to use L2 weight decay for `param_name`.""" - if not self.weight_decay_rate: - return False - if self.exclude_from_weight_decay: - for r in self.exclude_from_weight_decay: - if re.search(r, param_name) is not None: - return False - return True - - def _get_variable_name(self, param_name): - """Get the variable name from the tensor name.""" - m = re.match("^(.*):\\d+$", param_name) - if m is not None: - param_name = m.group(1) - return param_name diff --git a/texar/tf/core/optimization_test.py b/texar/tf/core/optimization_test.py deleted file mode 100644 index 003b7e6d..00000000 --- a/texar/tf/core/optimization_test.py +++ /dev/null @@ -1,158 +0,0 @@ -# -""" -Unit tests for various optimization related utilities. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import numpy as np - -import tensorflow as tf - -import texar.tf.core.optimization as opt -from texar.tf.utils import utils - - -class OptimizationTest(tf.test.TestCase): - """Tests optimization. - """ - - def test_get_optimizer(self): - """Tests get_optimizer. - """ - default_optimizer_fn, optimizer_class = opt.get_optimizer_fn( - opt.default_optimization_hparams()["optimizer"]) - default_optimizer = default_optimizer_fn(1.0) - self.assertTrue(optimizer_class, tf.train.Optimizer) - self.assertIsInstance(default_optimizer, tf.train.AdamOptimizer) - - hparams = { - "type": "MomentumOptimizer", - "kwargs": { - "learning_rate": 0.001, - "momentum": 0.9, - "use_nesterov": True - } - } - momentum_optimizer_fn, _ = opt.get_optimizer_fn(hparams) - momentum_optimizer = momentum_optimizer_fn() - self.assertIsInstance(momentum_optimizer, tf.train.MomentumOptimizer) - - hparams = { - "type": tf.train.MomentumOptimizer, - "kwargs": { - "momentum": 0.9, - "use_nesterov": True - } - } - momentum_optimizer_fn, _ = opt.get_optimizer_fn(hparams) - momentum_optimizer = momentum_optimizer_fn(0.001) - self.assertIsInstance(momentum_optimizer, tf.train.MomentumOptimizer) - - hparams = { - "type": tf.train.MomentumOptimizer(0.001, 0.9) - } - momentum_optimizer, _ = opt.get_optimizer_fn(hparams) - self.assertIsInstance(momentum_optimizer, tf.train.MomentumOptimizer) - - def test_get_learning_rate_decay_fn(self): - """Tests get_learning_rate_decay_fn. 
- """ - default_lr_decay_fn = opt.get_learning_rate_decay_fn( - opt.default_optimization_hparams()["learning_rate_decay"]) - self.assertIsNone(default_lr_decay_fn) - - boundaries = [2, 4] - values = [0.1, 0.01, 0.001] - hparams = { - "type": "piecewise_constant", - "kwargs": { - "boundaries": boundaries, - "values": values - }, - "min_learning_rate": 0.05, - "start_decay_step": 1, - "end_decay_step": utils.MAX_SEQ_LENGTH, - } - pc_lr_decay_fn = opt.get_learning_rate_decay_fn(hparams) - - global_step = 1 - pc_lr = pc_lr_decay_fn(learning_rate=1., global_step=global_step) - pc_lr_true = tf.train.piecewise_constant( - global_step - hparams["start_decay_step"], boundaries, values) - - hparams["type"] = "natural_exp_decay" - hparams["kwargs"] = { - "decay_steps": 1, - "decay_rate": 0.5 - } - ned_lr_decay_fn = opt.get_learning_rate_decay_fn(hparams) - ned_lr = ned_lr_decay_fn(learning_rate=1., global_step=global_step) - ned_lr_true = tf.train.natural_exp_decay( - 1., global_step - hparams["start_decay_step"], - hparams["kwargs"]["decay_steps"], hparams["kwargs"]["decay_rate"]) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - pc_lr_, pc_lr_true_, ned_lr_, ned_lr_true_ = sess.run( - [pc_lr, pc_lr_true, ned_lr, ned_lr_true]) - self.assertEqual(pc_lr_, pc_lr_true_) - self.assertEqual(ned_lr_, ned_lr_true_) - - def test_get_gradient_clip_fn(self): # pylint: disable=too-many-locals - """Tests get_gradient_clip_fn. - """ - default_grad_clip_fn = opt.get_gradient_clip_fn( - opt.default_optimization_hparams()["gradient_clip"]) - self.assertIsNone(default_grad_clip_fn) - - grads = [tf.random_uniform([10, 10], -1., 1.) for _ in range(5)] - grads_and_vars = list(zip(grads, range(5))) - - hparams = { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 0.1 - } - } - gn_grad_clip_fn = opt.get_gradient_clip_fn(hparams) - gn_grads_and_vars = gn_grad_clip_fn(grads_and_vars) - gn_grads, _ = zip(*gn_grads_and_vars) - gn_grads_true, _ = tf.clip_by_global_norm( - grads, hparams["kwargs"]["clip_norm"]) - - hparams = { - "type": "clip_by_value", - "kwargs": { - "clip_value_min": -0.01, - "clip_value_max": 0.01 - } - } - v_grad_clip_fn = opt.get_gradient_clip_fn(hparams) - v_grads_and_vars = v_grad_clip_fn(grads_and_vars) - v_grads, _ = zip(*v_grads_and_vars) - v_grads_true = tf.clip_by_value(grads, - hparams["kwargs"]["clip_value_min"], - hparams["kwargs"]["clip_value_max"]) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - gn_grads_, gn_grads_true_, v_grads_, v_grads_true_ = sess.run( - [gn_grads, gn_grads_true, v_grads, v_grads_true]) - np.testing.assert_array_equal(gn_grads_, gn_grads_true_) - np.testing.assert_array_equal(v_grads_, v_grads_true_) - - def test_get_train_op(self): - """Tests get_train_op. - """ - var = tf.Variable(0.) - loss = tf.nn.l2_loss(var) - train_op = opt.get_train_op(loss) - self.assertTrue(tf.contrib.framework.is_tensor(train_op)) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/core/replay_memories.py b/texar/tf/core/replay_memories.py deleted file mode 100644 index 84240b21..00000000 --- a/texar/tf/core/replay_memories.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Classes and utilities for replay memory in RL. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from collections import deque -import random - -from texar.tf.hyperparams import HParams - -__all__ = [ - "ReplayMemoryBase", - "DequeReplayMemory" -] - - -class ReplayMemoryBase(object): - """Base class of replay memory inheritted by all replay memory classes. - - Args: - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparameters are set to default values. See - :meth:`default_hparams` for the defaults. - """ - def __init__(self, hparams=None): - self._hparams = HParams(hparams, self.default_hparams()) - - @staticmethod - def default_hparams(): - """Returns a `dict` of hyperparameters and their default values. - - .. code-block:: python - - { - 'name': 'replay_memory' - } - """ - return { - 'name': 'replay_memory' - } - - def add(self, element): - """Inserts a memory entry - """ - raise NotImplementedError - - def get(self, size): - """Pops a memory entry. - """ - raise NotImplementedError - - def last(self): - """Returns the latest element in the memeory. - """ - raise NotImplementedError - - def size(self): - """Returns the current size of the memory. - """ - raise NotImplementedError - - -class DequeReplayMemory(ReplayMemoryBase): - """A deque based replay memory that accepts new memory entry and deletes - oldest memory entry if exceeding the capacity. Memory entries are - accessed in random order. - - Args: - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparameters are set to default values. See - :meth:`default_hparams` for the defaults. - """ - def __init__(self, hparams=None): - ReplayMemoryBase.__init__(self, hparams) - self.deque = deque() - self.capacity = self._hparams.capacity - - @staticmethod - def default_hparams(): - """Returns a `dict` of hyperparameters and their default values. - - .. code-block:: python - - { - 'capacity': 80000, - 'name': 'deque_replay_memory', - } - - Here: - - "capacity": int - Maximum size of memory kept. Deletes oldest memories if exceeds - the capacity. - """ - return { - 'name': 'deque_replay_memory', - 'capacity': 80000 - } - - def add(self, element): - """Appends element to the memory and deletes old memory if exceeds - the capacity. - """ - self.deque.append(element) - if len(self.deque) > self.capacity: - self.deque.popleft() - - # TODO(zhiting): is it okay to have stand alone random generator ? - def get(self, size): - """Randomly samples :attr:`size` entries from the memory. Returns - a list. - """ - return random.sample(self.deque, size) - - def last(self): - """Returns the latest element in the memeory. - """ - return self.deque[-1] - - def size(self): - """Returns the current size of the memory. 
- """ - return len(self.deque) diff --git a/texar/tf/custom/__init__.py b/texar/tf/custom/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/texar/tf/data/__init__.py b/texar/tf/data/__init__.py index 1bec2923..cba656c8 100644 --- a/texar/tf/data/__init__.py +++ b/texar/tf/data/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,18 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Modules of texar library data. +Modules of Texar library data. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=wildcard-import - from texar.tf.data.data import * from texar.tf.data.tokenizers import * -from texar.tf.data.data_utils import * from texar.tf.data.data_decoders import * -from texar.tf.data.vocabulary import * +from texar.tf.data.data_utils import * from texar.tf.data.embedding import * +from texar.tf.data.vocabulary import * diff --git a/texar/tf/data/data/__init__.py b/texar/tf/data/data/__init__.py index 70447444..1f03bd2a 100644 --- a/texar/tf/data/data/__init__.py +++ b/texar/tf/data/data/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,21 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Modules of texar library data inputs. +Modules of Texar library data inputs. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=wildcard-import - from texar.tf.data.data.data_base import * -from texar.tf.data.data.scalar_data import * -from texar.tf.data.data.text_data_base import * -from texar.tf.data.data.mono_text_data import * -from texar.tf.data.data.paired_text_data import * -from texar.tf.data.data.multi_aligned_data import * from texar.tf.data.data.data_iterators import * from texar.tf.data.data.dataset_utils import * +from texar.tf.data.data.mono_text_data import * +from texar.tf.data.data.text_data_base import * from texar.tf.data.data.tfrecord_data import * diff --git a/texar/tf/data/data/data_base.py b/texar/tf/data/data/data_base.py index 1044f433..a9cb4956 100644 --- a/texar/tf/data/data/data_base.py +++ b/texar/tf/data/data/data_base.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,29 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Base data class that is enherited by all data classes. +Base data class that is inherited by all data classes. A data defines data reading, parsing, batching, and other preprocessing operations. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import tensorflow as tf - from texar.tf.hyperparams import HParams -from texar.tf.data.data import dataset_utils as dsutils from texar.tf.data.data_utils import count_file_lines +from texar.tf.data.data.dataset_utils import random_shard_dataset + __all__ = [ "DataBase" ] -class DataBase(object): - """Base class inheritted by all data classes. +class DataBase: + r"""Base class inherited by all data classes. """ def __init__(self, hparams): @@ -42,7 +36,7 @@ def __init__(self, hparams): @staticmethod def default_hparams(): - """Returns a dictionary of default hyperparameters. + r"""Returns a dictionary of default hyperparameters. .. code-block:: python @@ -62,7 +56,7 @@ def default_hparams(): Here: - "num_epochs": int + `"num_epochs"`: int Number of times the dataset should be repeated. An :tf_main:`OutOfRangeError ` signal will be raised after the whole repeated dataset has been iterated @@ -72,11 +66,11 @@ def default_hparams(): will get the signal after each epoch of training. Set to -1 to repeat the dataset indefinitely. - "batch_size": int + `"batch_size"`: int Batch size, i.e., the number of consecutive elements of the dataset to combine in a single batch. - "allow_smaller_final_batch": bool + `"allow_smaller_final_batch"`: bool Whether to allow the final batch to be smaller if there are insufficient elements left. If `False`, the final batch is discarded if it is smaller than batch size. Note that, @@ -84,10 +78,10 @@ def default_hparams(): will have a a **static** batch_size dimension equal to "batch_size". - "shuffle": bool + `"shuffle"`: bool Whether to randomly shuffle the elements of the dataset. - "shuffle_buffer_size": int + `"shuffle_buffer_size"`: int The buffer size for data shuffling. The larger, the better the resulting data is mixed. @@ -95,7 +89,7 @@ def default_hparams(): whole dataset (i.e., make the shuffling the maximally effective). - "shard_and_shuffle": bool + `"shard_and_shuffle"`: bool Whether to first shard the dataset and then shuffle each block respectively. Useful when the whole data is too large to be loaded efficiently into the memory. @@ -103,20 +97,20 @@ def default_hparams(): If `True`, :attr:`shuffle_buffer_size` must be specified to determine the size of each shard. - "num_parallel_calls": int + `"num_parallel_calls"`: int Number of elements from the datasets to process in parallel. - "prefetch_buffer_size": int + `"prefetch_buffer_size"`: int The maximum number of elements that will be buffered when prefetching. - max_dataset_size : int + `"max_dataset_size"`: int Maximum number of instances to include in the dataset. If set to `-1` or greater than the size of dataset, all instances will be included. This constraint is imposed after data shuffling and filtering. - seed : int, optional + `"seed"`: int, optional The random seed for shuffle. Note that if a seed is set, the shuffle order will be exact @@ -128,7 +122,7 @@ def default_hparams(): different **within** every `num_epochs`, but are the same **across** the `num_epochs`. - name : str + `"name"`: str Name of the data. 
""" return { @@ -157,10 +151,9 @@ def _make_batch(dataset, hparams, padded_batch=False, padding_values=None): else: dataset = dataset.batch(batch_size) else: - dataset = dataset.apply( - tf.contrib.data.padded_batch_and_drop_remainder( - batch_size, dataset.output_shapes, - padding_values=padding_values)) + dataset = dataset.padded_batch(batch_size, dataset.output_shapes, + padding_values=padding_values, + drop_remainder=True) return dataset @staticmethod @@ -179,7 +172,7 @@ def _shuffle_dataset(dataset, hparams, dataset_files): "shuffle_and_shard to `False`." % (dataset_size, shuffle_buffer_size)) # TODO(zhiting): Use a different seed? - dataset = dataset.apply(dsutils.random_shard_dataset( + dataset = dataset.apply(random_shard_dataset( dataset_size, shuffle_buffer_size, hparams["seed"])) dataset = dataset.shuffle(shuffle_buffer_size + 16, # add a margin seed=hparams["seed"]) @@ -193,25 +186,25 @@ def _shuffle_dataset(dataset, hparams, dataset_files): @property def num_epochs(self): - """Number of epochs. + r"""Number of epochs. """ return self._hparams.num_epochs @property def batch_size(self): - """The batch size. + r"""The batch size. """ return self._hparams.batch_size @property def hparams(self): - """A :class:`~texar.tf.HParams` instance of the + r"""A :class:`~texar.tf.HParams` instance of the data hyperparameters. """ return self._hparams @property def name(self): - """Name of the module. + r"""Name of the module. """ return self._hparams.name diff --git a/texar/tf/data/data/data_iterators.py b/texar/tf/data/data/data_iterators.py index aac7df1e..ed2d80af 100644 --- a/texar/tf/data/data/data_iterators.py +++ b/texar/tf/data/data/data_iterators.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,41 +15,27 @@ Various data iterator classes. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - import tensorflow as tf from texar.tf.data.data.data_base import DataBase -from texar.tf.utils.variables import get_unique_named_variable_scope - -__all__ = [ - "DataIteratorBase", - "DataIterator", - "TrainTestDataIterator", - "FeedableDataIterator", - "TrainTestFeedableDataIterator" -] -class DataIteratorBase(object): - """Base class for all data iterator classes to inherit. A data iterator +class DataIteratorBase: + r"""Base class for all data iterator classes to inherit. A data iterator is a wrapper of :tf_main:`tf.data.Iterator `, and can switch between and iterate through **multiple** datasets. Args: datasets: Datasets to iterates through. This can be: - - A single instance of :tf_main:`tf.data.Dataset ` \ - or instance of subclass of :class:`~texar.tf.data.DataBase`. - - A `dict` that maps dataset name to \ - instance of :tf_main:`tf.data.Dataset ` or \ - subclass of :class:`~texar.tf.data.DataBase`. - - A `list` of instances of subclasses of \ - :class:`texar.tf.data.DataBase`. The name of instances \ - (:attr:`texar.tf.data.DataBase.name`) must be unique. + - A single instance of :tf_main:`tf.data.Dataset ` + or instance of subclass of :class:`~texar.tf.data.DataBase`. + - A `dict` that maps dataset name to + instance of :tf_main:`tf.data.Dataset ` or + subclass of :class:`~texar.tf.data.DataBase`. 
+ - A `list` of instances of subclasses of + :class:`texar.tf.data.DataBase`. The name of instances + (:attr:`texar.tf.data.DataBase.name`) must be unique. """ def __init__(self, datasets): @@ -66,7 +52,7 @@ def __init__(self, datasets): raise ValueError("Names of datasets must be unique.") _datasets = {} - for k, v in datasets.items(): # pylint: disable=invalid-name + for k, v in datasets.items(): _datasets[k] = v if isinstance(v, tf.data.Dataset) else v.dataset self._datasets = _datasets @@ -75,33 +61,33 @@ def __init__(self, datasets): @property def num_datasets(self): - """Number of datasets. + r"""Number of datasets. """ return len(self._datasets) @property def dataset_names(self): - """A list of dataset names. + r"""A list of dataset names. """ return list(self._datasets.keys()) class DataIterator(DataIteratorBase): - """Data iterator that switches and iterates through multiple datasets. + r"""Data iterator that switches and iterates through multiple datasets. - This is a wrapper of TF reinitializble :tf_main:`iterator `. + This is a wrapper of TF reinitializable :tf_main:`iterator `. Args: datasets: Datasets to iterates through. This can be: - - A single instance of :tf_main:`tf.data.Dataset ` \ - or instance of subclass of :class:`~texar.tf.data.DataBase`. - - A `dict` that maps dataset name to \ - instance of :tf_main:`tf.data.Dataset ` or \ - subclass of :class:`~texar.tf.data.DataBase`. - - A `list` of instances of subclasses of \ - :class:`texar.tf.data.DataBase`. The name of instances \ - (:attr:`texar.tf.data.DataBase.name`) must be unique. + - A single instance of :tf_main:`tf.data.Dataset ` + or instance of subclass of :class:`~texar.tf.data.DataBase`. + - A `dict` that maps dataset name to + instance of :tf_main:`tf.data.Dataset ` or + subclass of :class:`~texar.tf.data.DataBase`. + - A `list` of instances of subclasses of + :class:`texar.tf.data.DataBase`. The name of instances + (:attr:`texar.tf.data.DataBase.name`) must be unique. Example: @@ -112,6 +98,7 @@ class DataIterator(DataIteratorBase): iterator = DataIterator({'train': train_data, 'test': test_data}) batch = iterator.get_next() + TODO: Should be updated. sess = tf.Session() for _ in range(200): # Run 200 epochs of train/test @@ -134,22 +121,20 @@ class DataIterator(DataIteratorBase): def __init__(self, datasets): DataIteratorBase.__init__(self, datasets) - self._variable_scope = get_unique_named_variable_scope('data_iterator') - with tf.variable_scope(self._variable_scope): - first_dataset = self._datasets[sorted(self.dataset_names)[0]] - self._iterator = tf.data.Iterator.from_structure( - first_dataset.output_types, first_dataset.output_shapes) - self._iterator_init_ops = { - name: self._iterator.make_initializer(d) - for name, d in self._datasets.items() - } - - def switch_to_dataset(self, sess, dataset_name=None): - """Re-initializes the iterator of a given dataset and starts iterating + first_dataset = self._datasets[sorted(self.dataset_names)[0]] + self._iterator = tf.compat.v1.data.Iterator.from_structure( + tf.compat.v1.data.get_output_types(first_dataset), + tf.compat.v1.data.get_output_shapes(first_dataset)) + self._iterator_init_ops = { + name: self._iterator.make_initializer(d) + for name, d in self._datasets.items() + } + + def switch_to_dataset(self, dataset_name=None): + r"""Re-initializes the iterator of a given dataset and starts iterating over the dataset (from the beginning). Args: - sess: The current tf session. dataset_name (optional): Name of the dataset. 
If not provided, there must be only one Dataset.
        """
@@ -160,16 +145,16 @@ def switch_to_dataset(self, sess, dataset_name=None):
             dataset_name = next(iter(self._datasets))
         if dataset_name not in self._datasets:
             raise ValueError("Dataset not found: ", dataset_name)
-        sess.run(self._iterator_init_ops[dataset_name])
+        self._iterator.make_initializer(self._datasets[dataset_name])

     def get_next(self):
-        """Returns the next element of the activated dataset.
+        r"""Returns the next element of the activated dataset.
         """
         return self._iterator.get_next()


 class TrainTestDataIterator(DataIterator):
-    """Data iterator that alternatives between train, val, and test datasets.
+    r"""Data iterator that alternates between train, val, and test datasets.

     :attr:`train`, :attr:`val`, and :attr:`test` can be instance of
     either :tf_main:`tf.data.Dataset ` or subclass of
     :class:`~texar.tf.data.DataBase`. At least one of them must be provided.
@@ -191,6 +176,7 @@ class TrainTestDataIterator(DataIterator):
         iterator = TrainTestDataIterator(train=train_data, val=val_data)
         batch = iterator.get_next()

+        TODO: Should be updated.
         sess = tf.Session()

         for _ in range(200): # Run 200 epochs of train/val
@@ -227,345 +213,23 @@ def __init__(self, train=None, val=None, test=None):

         DataIterator.__init__(self, dataset_dict)

-    def switch_to_train_data(self, sess):
-        """Starts to iterate through training data (from the beginning).
-
-        Args:
-            sess: The current tf session.
-        """
-        if self._train_name not in self._datasets:
-            raise ValueError("Training data not provided.")
-        self.switch_to_dataset(sess, self._train_name)
-
-    def switch_to_val_data(self, sess):
-        """Starts to iterate through val data (from the beginning).
-
-        Args:
-            sess: The current tf session.
-        """
-        if self._val_name not in self._datasets:
-            raise ValueError("Val data not provided.")
-        self.switch_to_dataset(sess, self._val_name)
-
-    def switch_to_test_data(self, sess):
-        """Starts to iterate through test data (from the beginning).
-
-        Args:
-            sess: The current tf session.
-        """
-        if self._test_name not in self._datasets:
-            raise ValueError("Test data not provided.")
-        self.switch_to_dataset(sess, self._test_name)
-
-
-class FeedableDataIterator(DataIteratorBase):
-    """Data iterator that iterates through **multiple** datasets and switches
-    between datasets.
-
-    The iterator can switch to a dataset and resume from where we
-    left off last time we visited the dataset. This is a wrapper of TF
-    feedable :tf_main:`iterator `.
-
-    Args:
-        datasets: Datasets to iterates through. This can be:
-
-        - A single instance of :tf_main:`tf.data.Dataset ` \
-        or instance of subclass of :class:`~texar.tf.data.DataBase`.
-        - A `dict` that maps dataset name to \
-        instance of :tf_main:`tf.data.Dataset ` or \
-        subclass of :class:`~texar.tf.data.DataBase`.
-        - A `list` of instances of subclasses of \
-        :class:`texar.tf.data.DataBase`. The name of instances \
-        (:attr:`texar.tf.data.DataBase.name`) must be unique.
-
-    Example:
-
-        ..
code-block:: python - - train_data = MonoTextData(hparams={'num_epochs': 200, ...}) - test_data = MonoTextData(hparams_test) - iterator = FeedableDataIterator({'train': train_data, - 'test': test_data}) - batch = iterator.get_next() - - sess = tf.Session() - - def _eval_epoch(): # Iterate through test data for one epoch - # Initialize and start from beginning of test data - iterator.initialize_dataset(sess, 'test') - while True: - try: - fetch_dict = { # Read from test data - iterator.handle: Iterator.get_handle(sess, 'test') - } - test_batch_ = sess.run(batch, feed_dict=feed_dict) - except tf.errors.OutOfRangeError: - print("End of val epoch.") - - # Initialize and start from beginning of training data - iterator.initialize_dataset(sess, 'train') - step = 0 - while True: - try: - fetch_dict = { # Read from training data - iterator.handle: Iterator.get_handle(sess, 'train') - } - train_batch_ = sess.run(batch, fetch_dict=fetch_dict) - - step +=1 - if step % 200 == 0: # Evaluate periodically - _eval_epoch() - except tf.errors.OutOfRangeError: - print("End of training.") - """ - - def __init__(self, datasets): - DataIteratorBase.__init__(self, datasets) - - self._variable_scope = get_unique_named_variable_scope( - 'feedable_data_iterator') - with tf.variable_scope(self._variable_scope): - self._handle = tf.placeholder(tf.string, shape=[], name='handle') - first_dataset = self._datasets[sorted(self.dataset_names)[0]] - self._iterator = tf.data.Iterator.from_string_handle( - self._handle, first_dataset.output_types, - first_dataset.output_shapes) - - self._dataset_iterators = { - name: dataset.make_initializable_iterator() - for name, dataset in self._datasets.items() - } - - def get_handle(self, sess, dataset_name=None): - """Returns a dataset handle used to feed the - :attr:`handle` placeholder to fetch data from the dataset. - - Args: - sess: The current tf session. - dataset_name (optional): Name of the dataset. If not provided, - there must be only one Dataset. - - Returns: - A string handle to be fed to the :attr:`handle` placeholder. - - Example: - - .. code-block:: python - - next_element = iterator.get_next() - train_handle = iterator.get_handle(sess, 'train') - # Gets the next training element - ne_ = sess.run(next_element, - feed_dict={iterator.handle: train_handle}) - """ - if dataset_name is None: - if self.num_datasets > 1: - raise ValueError("`dataset_name` is required if there are " - "more than one datasets.") - dataset_name = next(iter(self._datasets)) - if dataset_name not in self._datasets: - raise ValueError("Dataset not found: ", dataset_name) - return sess.run(self._dataset_iterators[dataset_name].string_handle()) - - def restart_dataset(self, sess, dataset_name=None): - """Restarts datasets so that next iteration will fetch data from - the beginning of the datasets. - - Args: - sess: The current tf session. - dataset_name (optional): A dataset name or a list of dataset names - that specifies which dataset(s) to restart. If `None`, all - datasets are restart. - """ - self.initialize_dataset(sess, dataset_name) - - def initialize_dataset(self, sess, dataset_name=None): - """Initializes datasets. A dataset must be initialized before being - used. - - Args: - sess: The current tf session. - dataset_name (optional): A dataset name or a list of dataset names - that specifies which dataset(s) to initialize. If `None`, all - datasets are initialized. 
- """ - if dataset_name is None: - dataset_name = self.dataset_names - if not isinstance(dataset_name, (tuple, list)): - dataset_name = [dataset_name] - - for name in dataset_name: - sess.run(self._dataset_iterators[name].initializer) - - def get_next(self): - """Returns the next element of the activated dataset. - """ - return self._iterator.get_next() - - @property - def handle(self): - """The handle placeholder that can be fed with a dataset handle to - fetch data from the dataset. - """ - return self._handle - - -class TrainTestFeedableDataIterator(FeedableDataIterator): - """Feedable data iterator that alternatives between train, val, and test - datasets. - - This is a wrapper of :class:`~texar.tf.data.FeedableDataIterator`. - The iterator can switch to a dataset and resume from where it was - left off when it was visited last time. - - :attr:`train`, :attr:`val`, and :attr:`test` can be instance of - either :tf_main:`tf.data.Dataset ` or subclass of - :class:`~texar.tf.data.DataBase`. At least one of them must be provided. - - Args: - train (optional): Training data. - val (optional): Validation data. - test (optional): Test data. - - Example: - - .. code-block:: python - - train_data = MonoTextData(hparams={'num_epochs': 200, ...}) - test_data = MonoTextData(hparams_test) - iterator = TrainTestFeedableDataIterator(train=train_data, - test=test_data) - batch = iterator.get_next() - - sess = tf.Session() - - def _eval_epoch(): # Iterate through test data for one epoch - # Initialize and start from beginning of test data - iterator.initialize_test_dataset(sess) - while True: - try: - fetch_dict = { # Read from test data - iterator.handle: Iterator.get_test_handle(sess) - } - test_batch_ = sess.run(batch, feed_dict=feed_dict) - except tf.errors.OutOfRangeError: - print("End of test epoch.") - - # Initialize and start from beginning of training data - iterator.initialize_train_dataset(sess) - step = 0 - while True: - try: - fetch_dict = { # Read from training data - iterator.handle: Iterator.get_train_handle(sess) - } - train_batch_ = sess.run(batch, fetch_dict=fetch_dict) - - step +=1 - if step % 200 == 0: # Evaluate periodically - _eval_epoch() - except tf.errors.OutOfRangeError: - print("End of training.") - """ - - def __init__(self, train=None, val=None, test=None): - dataset_dict = {} - self._train_name = 'train' - self._val_name = 'val' - self._test_name = 'test' - if train is not None: - dataset_dict[self._train_name] = train - if val is not None: - dataset_dict[self._val_name] = val - if test is not None: - dataset_dict[self._test_name] = test - if len(dataset_dict) == 0: - raise ValueError("At least one of `train`, `val`, and `test` " - "must be provided.") - - FeedableDataIterator.__init__(self, dataset_dict) - - def get_train_handle(self, sess): - """Returns the handle of the training dataset. The handle can be used - to feed the :attr:`handle` placeholder to fetch training data. - - Args: - sess: The current tf session. - - Returns: - A string handle to be fed to the :attr:`handle` placeholder. - - Example: - - .. code-block:: python - - next_element = iterator.get_next() - train_handle = iterator.get_train_handle(sess) - # Gets the next training element - ne_ = sess.run(next_element, - feed_dict={iterator.handle: train_handle}) + def switch_to_train_data(self): + r"""Starts to iterate through training data (from the beginning). 
""" if self._train_name not in self._datasets: raise ValueError("Training data not provided.") - return self.get_handle(sess, self._train_name) + self.switch_to_dataset(self._train_name) - def get_val_handle(self, sess): - """Returns the handle of the validation dataset. The handle can be used - to feed the :attr:`handle` placeholder to fetch validation data. - - Args: - sess: The current tf session. - - Returns: - A string handle to be fed to the :attr:`handle` placeholder. + def switch_to_val_data(self): + r"""Starts to iterate through val data (from the beginning). """ if self._val_name not in self._datasets: raise ValueError("Val data not provided.") - return self.get_handle(sess, self._val_name) + self.switch_to_dataset(self._val_name) - def get_test_handle(self, sess): - """Returns the handle of the test dataset. The handle can be used - to feed the :attr:`handle` placeholder to fetch test data. - - Args: - sess: The current tf session. - - Returns: - A string handle to be fed to the :attr:`handle` placeholder. - """ - if self._test_name not in self._datasets: - raise ValueError("Test data not provided.") - return self.get_handle(sess, self._test_name) - - def restart_train_dataset(self, sess): - """Restarts the training dataset so that next iteration will fetch - data from the beginning of the training dataset. - - Args: - sess: The current tf session. - """ - if self._train_name not in self._datasets: - raise ValueError("Training data not provided.") - self.restart_dataset(sess, self._train_name) - - def restart_val_dataset(self, sess): - """Restarts the validation dataset so that next iteration will fetch - data from the beginning of the validation dataset. - - Args: - sess: The current tf session. - """ - if self._val_name not in self._datasets: - raise ValueError("Val data not provided.") - self.restart_dataset(sess, self._val_name) - - def restart_test_dataset(self, sess): - """Restarts the test dataset so that next iteration will fetch - data from the beginning of the test dataset. - - Args: - sess: The current tf session. + def switch_to_test_data(self): + r"""Starts to iterate through test data (from the beginning). """ if self._test_name not in self._datasets: raise ValueError("Test data not provided.") - self.restart_dataset(sess, self._test_name) + self.switch_to_dataset(self._test_name) diff --git a/texar/tf/data/data/data_iterators_test.py b/texar/tf/data/data/data_iterators_test.py index 4388b381..4bcd0e80 100644 --- a/texar/tf/data/data/data_iterators_test.py +++ b/texar/tf/data/data/data_iterators_test.py @@ -1,16 +1,7 @@ -# -*- coding: utf-8 -*- -# """ Unit tests for data iterator related operations. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -# pylint: disable=no-member, invalid-name - import tempfile import numpy as np @@ -80,27 +71,22 @@ def test_iterator_single_dataset(self): data = tx.data.MonoTextData(self._test_hparams) iterator = tx.data.DataIterator(data) - data_batch = iterator.get_next() - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - for _ in range(2): - iterator.switch_to_dataset(sess) - i = 1001 - while True: - try: - data_batch_ = sess.run(data_batch) - self.assertEqual( - tf.compat.as_text(data_batch_['text'][0][0]), - str(i)) - i += 1 - except tf.errors.OutOfRangeError: - print('Done -- epoch limit reached') - self.assertEqual(i, 2001) - break + for _ in range(2): + iterator.switch_to_dataset() + i = 1001 + while True: + try: + data_batch = iterator.get_next() + + self.assertEqual( + tf.compat.as_text(data_batch['text'][0][0].numpy()), + str(i)) + i += 1 + except tf.errors.OutOfRangeError: + print('Done -- epoch limit reached') + self.assertEqual(i, 2001) + break def test_iterator_multi_datasets(self): """Tests iterating over multiple datasets. @@ -109,43 +95,37 @@ def test_iterator_multi_datasets(self): test_data = tx.data.MonoTextData(self._test_hparams) iterator = tx.data.DataIterator([train_data, test_data]) - data_batch = iterator.get_next() - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - - for _ in range(2): - # Iterates over train data - iterator.switch_to_dataset(sess, train_data.name) - i = 0 - while True: - try: - data_batch_ = sess.run(data_batch) - self.assertEqual( - tf.compat.as_text(data_batch_['text'][0][0]), - str(i + 1)) - i = (i + 1) % 1000 - except tf.errors.OutOfRangeError: - print('Train data limit reached') - self.assertEqual(i, 0) - break - - # Iterates over test data - iterator.switch_to_dataset(sess, test_data.name) - i = 1001 - while True: - try: - data_batch_ = sess.run(data_batch) - self.assertEqual( - tf.compat.as_text(data_batch_['text'][0][0]), - str(i)) - i += 1 - except tf.errors.OutOfRangeError: - print('Test data limit reached') - self.assertEqual(i, 2001) - break + for _ in range(2): + # Iterates over train data + iterator.switch_to_dataset(train_data.name) + i = 0 + while True: + try: + data_batch_ = iterator.get_next() + self.assertEqual( + tf.compat.as_text(data_batch_['text'][0][0].numpy()), + str(i + 1)) + i = (i + 1) % 1000 + except tf.errors.OutOfRangeError: + print('Train data limit reached') + self.assertEqual(i, 0) + break + + # Iterates over test data + iterator.switch_to_dataset(test_data.name) + i = 1001 + while True: + try: + data_batch_ = iterator.get_next() + self.assertEqual( + tf.compat.as_text(data_batch_['text'][0][0].numpy()), + str(i)) + i += 1 + except tf.errors.OutOfRangeError: + print('Test data limit reached') + self.assertEqual(i, 2001) + break def test_train_test_data_iterator(self): """Tests :class:`texar.tf.data.TrainTestDataIterator` @@ -155,143 +135,35 @@ def test_train_test_data_iterator(self): iterator = tx.data.TrainTestDataIterator(train=train_data, test=test_data) - data_batch = iterator.get_next() - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - 
sess.run(tf.tables_initializer()) - - for _ in range(2): - iterator.switch_to_train_data(sess) - i = 0 - while True: - try: - data_batch_ = sess.run(data_batch) - self.assertEqual( - tf.compat.as_text(data_batch_['text'][0][0]), - str(i + 1)) - i = (i + 1) % 1000 - except tf.errors.OutOfRangeError: - print('Train data limit reached') - self.assertEqual(i, 0) - break - - iterator.switch_to_test_data(sess) - i = 1001 - while True: - try: - data_batch_ = sess.run(data_batch) - self.assertEqual( - tf.compat.as_text(data_batch_['text'][0][0]), - str(i)) - i += 1 - except tf.errors.OutOfRangeError: - print('Test data limit reached') - self.assertEqual(i, 2001) - break - - def test_feedable_iterator_multi_datasets(self): - """Tests iterating over multiple datasets with the - :class:`FeedableDataIterator`. - """ - train_data = tx.data.MonoTextData(self._train_hparams) - test_data = tx.data.MonoTextData(self._test_hparams) - - iterator = tx.data.FeedableDataIterator([train_data, test_data]) - data_batch = iterator.get_next() - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - - iterator.initialize_dataset(sess) - - for _ in range(2): - # Iterates over train data - iterator.restart_dataset(sess, train_data.name) - data_handle = iterator.get_handle(sess, train_data.name) - i = 0 - while True: - try: - feed_dict = {iterator.handle: data_handle} - data_batch_ = sess.run(data_batch, feed_dict=feed_dict) - self.assertEqual( - tf.compat.as_text(data_batch_['text'][0][0]), - str(i + 1)) - i = (i + 1) % 1000 - except tf.errors.OutOfRangeError: - print('Train data limit reached') - self.assertEqual(i, 0) - break - - # Iterates over test data - iterator.restart_dataset(sess, test_data.name) - data_handle = iterator.get_handle(sess, test_data.name) - i = 1001 - while True: - try: - feed_dict = {iterator.handle: data_handle} - data_batch_ = sess.run(data_batch, feed_dict=feed_dict) - self.assertEqual( - tf.compat.as_text(data_batch_['text'][0][0]), - str(i)) - i += 1 - except tf.errors.OutOfRangeError: - print('Test data limit reached') - self.assertEqual(i, 2001) - break - - def test_train_test_feedable_data_iterator(self): - """Tests :class:`texar.tf.data.TrainTestFeedableDataIterator` - """ - train_data = tx.data.MonoTextData(self._train_hparams) - test_data = tx.data.MonoTextData(self._test_hparams) - - iterator = tx.data.TrainTestFeedableDataIterator(train=train_data, - test=test_data) - data_batch = iterator.get_next() - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - - for _ in range(2): - iterator.restart_train_dataset(sess) - i = 0 - while True: - try: - feed_dict = { - iterator.handle: iterator.get_train_handle(sess) - } - data_batch_ = sess.run(data_batch, feed_dict=feed_dict) - self.assertEqual( - tf.compat.as_text(data_batch_['text'][0][0]), - str(i + 1)) - i = (i + 1) % 1000 - except tf.errors.OutOfRangeError: - print('Train data limit reached') - self.assertEqual(i, 0) - break - iterator.restart_test_dataset(sess) - i = 1001 - while True: - try: - feed_dict = { - iterator.handle: iterator.get_test_handle(sess) - } - data_batch_ = sess.run(data_batch, feed_dict=feed_dict) - self.assertEqual( - tf.compat.as_text(data_batch_['text'][0][0]), - str(i)) - i += 1 - except tf.errors.OutOfRangeError: - print('Test data limit reached') - self.assertEqual(i, 2001) - break + for _ 
in range(2): + iterator.switch_to_train_data() + i = 0 + while True: + try: + data_batch_ = iterator.get_next() + self.assertEqual( + tf.compat.as_text(data_batch_['text'][0][0].numpy()), + str(i + 1)) + i = (i + 1) % 1000 + except tf.errors.OutOfRangeError: + print('Train data limit reached') + self.assertEqual(i, 0) + break + + iterator.switch_to_test_data() + i = 1001 + while True: + try: + data_batch_ = iterator.get_next() + self.assertEqual( + tf.compat.as_text(data_batch_['text'][0][0].numpy()), + str(i)) + i += 1 + except tf.errors.OutOfRangeError: + print('Test data limit reached') + self.assertEqual(i, 2001) + break if __name__ == "__main__": diff --git a/texar/tf/data/data/dataset_utils.py b/texar/tf/data/data/dataset_utils.py index ef4fb971..facc3d32 100644 --- a/texar/tf/data/data/dataset_utils.py +++ b/texar/tf/data/data/dataset_utils.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,21 +14,14 @@ """ Various utilities specific to dataset processing. """ - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - import six import tensorflow as tf import numpy as np -from texar.tf.utils import utils +from texar.tf.utils.utils import ceildiv -# pylint: disable=invalid-name, too-many-arguments __all__ = [ "_DataSpec", @@ -41,8 +34,8 @@ ] -class _DataSpec(object): - """Dataset specification. Used to pass necessary info to +class _DataSpec: + r"""Dataset specification. Used to pass necessary info to user-defined tranformation functions. Args: @@ -63,12 +56,12 @@ def __init__(self, dataset=None, dataset_size=None, decoder=None, self.__dict__.update(kwargs) def add_spec(self, **kwargs): - """Adds new field(s). + r"""Adds new field(s). """ self.__dict__.update(kwargs) def get_ith_data_spec(self, i): - """Returns an instance of :class:`_DataSpec` that contains the + r"""Returns an instance of :class:`_DataSpec` that contains the `i`-th specifications. """ kwargs = {} @@ -77,7 +70,7 @@ def get_ith_data_spec(self, i): return _DataSpec(**kwargs) def set_ith_data_spec(self, i, data_spec, total_count): - """Sets the `i`-th specification to respective values in + r"""Sets the `i`-th specification to respective values in :attr:`data_spec`. """ for k, v in six.iteritems(data_spec.__dict__): @@ -96,7 +89,7 @@ def set_ith_data_spec(self, i, data_spec, total_count): def _make_length_filter_fn(length_name, max_length): - """Returns a predicate function which takes in data sample + r"""Returns a predicate function which takes in data sample and returns a bool indicating whether to filter by length. """ def _filter_fn(data): @@ -105,7 +98,7 @@ def _filter_fn(data): def _make_smaller_batch_filter_fn(batch_size): - """Returns a predicate function which takes in a batched data + r"""Returns a predicate function which takes in a batched data and returns a bool indicating whether the batch is of :attr:`batch_size`. """ def _filter_fn(data): @@ -120,7 +113,7 @@ def _filter_fn(data): def _make_combined_filter_fn(filter_fns, mode="and"): - """Returns a new predicate function that combines multiple + r"""Returns a new predicate function that combines multiple predicate functions with certain mode. Returns `None` if all elements in :attr:`filter_fns` are `None`. 
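For illustration, these helpers compose into a single dataset predicate as follows (a sketch; the field names `source_length` and `target_length` are hypothetical):

.. code-block:: python

    src_filter = _make_length_filter_fn("source_length", max_length=50)
    tgt_filter = _make_length_filter_fn("target_length", max_length=50)
    keep_fn = _make_combined_filter_fn([src_filter, tgt_filter], mode="and")
    dataset = dataset.filter(keep_fn)   # keeps samples passing both filters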
@@ -156,7 +149,7 @@ def _connect_name(lhs_name, rhs_name):

 def maybe_tuple(data):
-    """Returns `tuple(data)` if :attr:`data` contains more than 1 elements.
+    r"""Returns `tuple(data)` if :attr:`data` contains more than one element.

     Used to wrap `map_func` inputs.
     """
@@ -166,7 +159,7 @@ def maybe_tuple(data):

 def make_partial(fn, *args, **kwargs):
-    """Returns a new function with single argument by freezing other arguments
+    r"""Returns a new function with a single argument by freezing other arguments
     of :attr:`fn`.
     """
     def _new_fn(data):
@@ -175,7 +168,7 @@ def _new_fn(data):

 def name_prefix_fn(name_prefix):
-    """Returns a function that append a prefix to field names.
+    r"""Returns a function that appends a prefix to field names.
     """
     def _prefix_fn(data):
         transformed_data = {}
@@ -188,7 +181,7 @@ def _prefix_fn(data):

 def make_chained_transformation(tran_fns, *args, **kwargs):
-    """Returns a dataset transformation function that applies a list of
+    r"""Returns a dataset transformation function that applies a list of
     transformations sequentially.

     Args:
@@ -209,8 +202,8 @@ def _chained_fn(data):
     return _chained_fn


-def make_combined_transformation(tran_fns, name_prefix=None, *args, **kwargs):
-    """Returns a dataset transformation function that applies
+def make_combined_transformation(tran_fns, *args, name_prefix=None, **kwargs):
+    r"""Returns a dataset transformation function that applies
     transformations to each component of the data.

     The data to be transformed must be a tuple of the same length
@@ -259,12 +252,12 @@ def _combined_fn(data):

 def random_shard_dataset(dataset_size, shard_size, seed=None):
-    """Returns a dataset transformation function that randomly shards a
+    r"""Returns a dataset transformation function that randomly shards a
     dataset.
     """
-    num_shards = utils.ceildiv(dataset_size, shard_size)
+    num_shards = ceildiv(dataset_size, shard_size)
     boundaries = np.linspace(0, dataset_size, num=num_shards, endpoint=False,
-                             dtype=np.int64)  # pylint: disable=no-member
+                             dtype=np.int64)

     def _shard_fn(dataset):
         sharded_dataset = (
diff --git a/texar/tf/data/data/dataset_utils_test.py b/texar/tf/data/data/dataset_utils_test.py
index 6b296026..faea5eae 100644
--- a/texar/tf/data/data/dataset_utils_test.py
+++ b/texar/tf/data/data/dataset_utils_test.py
@@ -1,22 +1,13 @@
-# -*- coding: utf-8 -*-
-#
 """
 Unit tests for data utils.
 """

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
 import numpy as np

 import tensorflow as tf

-from texar.tf.data.data import dataset_utils as dsutils
-
+import texar.tf.data.data.dataset_utils as dsutils

-# pylint: disable=invalid-name

 class TransformationTest(tf.test.TestCase):
     """Tests various transformation utilities.
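A chained transformation is applied with a plain `Dataset.map`, as the updated unit test below also exercises (a sketch; `_tran_a`, `_tran_b`, `_tran_c` stand for any single-argument transformations):

.. code-block:: python

    chained = dsutils.make_chained_transformation([_tran_a, _tran_b, _tran_c])
    dataset = dataset.map(chained)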
@@ -41,18 +32,19 @@ def _tran_c(data): [_tran_a, _tran_b, _tran_c]) dataset = dataset.map(chained_tran) - iterator = dataset.make_one_shot_iterator() - elem = iterator.get_next() - with self.test_session() as sess: - data_ = [] - while True: - try: - data_.append(sess.run(elem)) - except tf.errors.OutOfRangeError: - break - self.assertEqual(len(data_), len(original_data)) - data_ = [elem_ - 11100 for elem_ in data_] - self.assertEqual(data_, original_data.tolist()) + iterator = tf.compat.v1.data.make_one_shot_iterator(dataset) + + data_ = [] + while True: + try: + elem = iterator.get_next() + data_.append(elem) + except tf.errors.OutOfRangeError: + break + + self.assertEqual(len(data_), len(original_data)) + data_ = [elem_ - 11100 for elem_ in data_] + self.assertEqual(data_, original_data.tolist()) if __name__ == "__main__": diff --git a/texar/tf/data/data/mono_text_data.py b/texar/tf/data/data/mono_text_data.py index 73f87d7d..233aff5a 100644 --- a/texar/tf/data/data/mono_text_data.py +++ b/texar/tf/data/data/mono_text_data.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,11 +16,6 @@ preprocessing operations. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - import tensorflow as tf from texar.tf.utils import utils @@ -32,7 +27,6 @@ from texar.tf.data.vocabulary import Vocab, SpecialTokens from texar.tf.data.embedding import Embedding -# pylint: disable=invalid-name, arguments-differ, protected-access, no-member __all__ = [ "_default_mono_text_dataset_hparams", @@ -40,15 +34,15 @@ ] -class _LengthFilterMode(object): - """Options of length filter mode. +class _LengthFilterMode: + r"""Options of length filter mode. """ TRUNC = "truncate" DISCARD = "discard" def _default_mono_text_dataset_hparams(): - """Returns hyperparameters of a mono text dataset with default values. + r"""Returns hyperparameters of a mono text dataset with default values. See :meth:`texar.tf.MonoTextData.default_hparams` for details. """ @@ -73,7 +67,7 @@ def _default_mono_text_dataset_hparams(): class MonoTextData(TextDataBase): - """Text data processor that reads single set of text files. This can be + r"""Text data processor that reads single set of text files. This can be used for, e.g., language models, auto-encoders, etc. Args: @@ -134,7 +128,7 @@ class MonoTextData(TextDataBase): data = MonoTextData(hparams) iterator = DataIterator(data) batch = iterator.get_next() - + TODO: This should be modified in the future. iterator.switch_to_dataset(sess) # initializes the dataset batch_ = sess.run(batch) # batch_ == { @@ -146,12 +140,12 @@ class MonoTextData(TextDataBase): def __init__(self, hparams): TextDataBase.__init__(self, hparams) - with tf.name_scope(self.name, self.default_hparams()["name"]): + with tf.name_scope(self.name): self._make_data() @staticmethod def default_hparams(): - """Returns a dicitionary of default hyperparameters: + r"""Returns a dictionary of default hyperparameters: .. code-block:: python @@ -196,31 +190,31 @@ def default_hparams(): 1. For the hyperparameters in the :attr:`"dataset"` field: - "files": str or list + `"files"`: str or list A (list of) text file path(s). Each line contains a single text sequence. 
- "compression_type": str, optional + `"compression_type"`: str, optional One of "" (no compression), "ZLIB", or "GZIP". - "vocab_file": str + `"vocab_file"`: str Path to vocabulary file. Each line of the file should contain one vocabulary token. Used to create an instance of :class:`~texar.tf.data.Vocab`. - "embedding_init": dict + `"embedding_init"`: dict The hyperparameters for pre-trained embedding loading and initialization. The structure and default values are defined in :meth:`texar.tf.data.Embedding.default_hparams`. - "delimiter": str + `"delimiter"`: str The delimiter to split each line of the text files into tokens. - "max_seq_length": int, optional + `"max_seq_length"`: int, optional Maximum length of output sequences. Data samples exceeding the length will be truncated or discarded according to :attr:`"length_filter_mode"`. The length does not include @@ -228,35 +222,35 @@ def default_hparams(): :attr:`"bos_token"` or :attr:`"eos_token"`. If `None` (default), no filtering is performed. - "length_filter_mode": str + `"length_filter_mode"`: str Either "truncate" or "discard". If "truncate" (default), tokens exceeding the :attr:`"max_seq_length"` will be truncated. If "discard", data samples longer than the :attr:`"max_seq_length"` will be discarded. - "pad_to_max_seq_length": bool + `"pad_to_max_seq_length"`: bool If `True`, pad all data instances to length :attr:`"max_seq_length"`. Raises error if :attr:`"max_seq_length"` is not provided. - "bos_token": str + `"bos_token"`: str The Begin-Of-Sequence token prepended to each sequence. Set to an empty string to avoid prepending. - "eos_token": str + `"eos_token"`: str The End-Of-Sequence token appended to each sequence. Set to an empty string to avoid appending. - "other_transformations": list + `"other_transformations"`: list A list of transformation functions or function names/paths to further transform each single data instance. (More documentations to be added.) - "variable_utterance": bool + `"variable_utterance"`: bool If `True`, each line of the text file is considered to contain multiple sequences (utterances) separated by :attr:`"utterance_delimiter"`. @@ -265,16 +259,16 @@ def default_hparams(): dialog history utterances. See the example in `examples/hierarchical_dialog` for a use case. - "utterance_delimiter": str + `"utterance_delimiter"`: str The delimiter to split over utterance level. Should not be the same with :attr:`"delimiter"`. Used only when :attr:`"variable_utterance"``==True`. - "max_utterance_cnt": int + `"max_utterance_cnt"`: int Maximally allowed number of utterances in a data instance. Extra utterances are truncated out. - "data_name": str + `"data_name"`: str Name of the dataset. 2. For the **general** hyperparameters, see @@ -286,20 +280,20 @@ def default_hparams(): `). For bucketing hyperparameters: - "bucket_boundaries": list + `"bucket_boundaries"`: list An int list containing the upper length boundaries of the buckets. Set to an empty list (default) to disable bucketing. - "bucket_batch_sizes": list + `"bucket_batch_sizes"`: list An int list containing batch size per bucket. Length should be `len(bucket_boundaries) + 1`. If `None`, every bucket whill have the same batch size specified in :attr:`batch_size`. - "bucket_length_fn": str or callable + `"bucket_length_fn"`: str or callable Function maps dataset element to `tf.int32` scalar, determines the length of the element. 
@@ -320,7 +314,7 @@ def default_hparams(): @staticmethod def make_vocab(hparams): - """Reads vocab file and returns an instance of + r"""Reads vocab file and returns an instance of :class:`texar.tf.data.Vocab`. """ bos_token = utils.default_str( @@ -333,7 +327,7 @@ def make_vocab(hparams): @staticmethod def make_embedding(emb_hparams, token_to_id_map): - """Optionally loads embedding from file (if provided), and returns + r"""Optionally loads embedding from file (if provided), and returns an instance of :class:`texar.tf.data.Embedding`. """ embedding = None @@ -350,7 +344,7 @@ def _make_mono_text_dataset(dataset_hparams): @staticmethod def _make_other_transformations(other_trans_hparams, data_spec): - """Creates a list of tranformation functions based on the + r"""Creates a list of transformation functions based on the hyperparameters. Args: @@ -417,6 +411,7 @@ def _make_length_filter(dataset_hparams, length_name, decoder): filter_fn = None if filter_mode == _LengthFilterMode.DISCARD and max_length is not None: max_length += decoder.added_length + # pylint: disable=protected-access filter_fn = dsutils._make_length_filter_fn(length_name, max_length) return filter_fn @@ -449,7 +444,6 @@ def _make_bucket_length_fn(self): if not length_fn: length_fn = lambda x: x[self.length_name] elif not is_callable(length_fn): - # pylint: disable=redefined-variable-type length_fn = utils.get_function(length_fn, ["texar.tf.custom"]) return length_fn @@ -462,7 +456,7 @@ def _make_padded_text_and_id_shapes(dataset, dataset_hparams, decoder, "when 'pad_to_max_seq_length' is True.") max_length += decoder.added_length - padded_shapes = dataset.output_shapes + padded_shapes = tf.compat.v1.data.get_output_shapes(dataset) def _get_new_shape(name): dim = len(padded_shapes[name]) @@ -495,7 +489,7 @@ def _make_padded_shapes(self, dataset, decoder): dataset, self._hparams.dataset, decoder, self.text_name, self.text_id_name) - padded_shapes = dataset.output_shapes + padded_shapes = tf.compat.v1.data.get_output_shapes(dataset) padded_shapes.update(text_and_id_shapes) return padded_shapes @@ -515,6 +509,7 @@ def _make_data(self): self._dataset_size = dataset_size # Processing + # pylint: disable=protected-access data_spec = dsutils._DataSpec(dataset=dataset, dataset_size=self._dataset_size, vocab=self._vocab, @@ -537,22 +532,22 @@ def _make_data(self): self._dataset = dataset def list_items(self): - """Returns the list of item names that the data can produce. + r"""Returns the list of item names that the data can produce. Returns: A list of strings. """ - return list(self._dataset.output_types.keys()) + return list(tf.compat.v1.data.get_output_types(self._dataset).keys()) @property def dataset(self): - """The dataset, an instance of + r"""The dataset, an instance of :tf_main:`TF dataset `. """ return self._dataset def dataset_size(self): - """Returns the number of data instances in the data files. + r"""Returns the number of data instances in the data files. Note that this is the total data count in the raw files, before any filtering and truncation. @@ -565,13 +560,13 @@ def dataset_size(self): @property def vocab(self): - """The vocabulary, an instance of :class:`~texar.tf.data.Vocab`. + r"""The vocabulary, an instance of :class:`~texar.tf.data.Vocab`. """ return self._vocab @property def embedding_init_value(self): - """The `Tensor` containing the embedding value loaded from file. + r"""The `Tensor` containing the embedding value loaded from file. `None` if embedding is not specified. 
""" if self._embedding is None: @@ -580,7 +575,7 @@ def embedding_init_value(self): @property def text_name(self): - """The name of text tensor, "text" by default. + r"""The name of text tensor, "text" by default. """ name = dsutils._connect_name( self._data_spec.name_prefix, @@ -589,7 +584,7 @@ def text_name(self): @property def length_name(self): - """The name of length tensor, "length" by default. + r"""The name of length tensor, "length" by default. """ name = dsutils._connect_name( self._data_spec.name_prefix, @@ -598,7 +593,7 @@ def length_name(self): @property def text_id_name(self): - """The name of text index tensor, "text_ids" by default. + r"""The name of text index tensor, "text_ids" by default. """ name = dsutils._connect_name( self._data_spec.name_prefix, @@ -607,7 +602,7 @@ def text_id_name(self): @property def utterance_cnt_name(self): - """The name of utterance count tensor, "utterance_cnt" by default. + r"""The name of utterance count tensor, "utterance_cnt" by default. """ if not self._hparams.dataset.variable_utterance: raise ValueError("`utterance_cnt_name` is not defined.") diff --git a/texar/tf/data/data/mono_text_data_test.py b/texar/tf/data/data/mono_text_data_test.py index 276b0dcf..e2b28b75 100644 --- a/texar/tf/data/data/mono_text_data_test.py +++ b/texar/tf/data/data/mono_text_data_test.py @@ -1,14 +1,7 @@ -# -*- coding: utf-8 -*- -# """ Unit tests for data related operations. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - import tempfile import copy import numpy as np @@ -17,9 +10,6 @@ import texar.tf as tx -# pylint: disable=too-many-locals, protected-access, too-many-branches -# pylint: disable=invalid-name - class MonoTextDataTest(tf.test.TestCase): """Tests text data class. 
@@ -60,61 +50,55 @@ def _run_and_test(self, self.assertEqual(text_data.vocab.size, self._vocab_size + len(text_data.vocab.special_tokens)) - iterator = text_data.dataset.make_initializable_iterator() - text_data_batch = iterator.get_next() - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - sess.run(iterator.initializer) - - while True: - try: - data_batch_ = sess.run(text_data_batch) - - self.assertEqual(set(data_batch_.keys()), - set(text_data.list_items())) - - if test_batch_size: - self.assertEqual(len(data_batch_['text']), - hparams['batch_size']) - - if length_inc: - for i in range(len(data_batch_['text'])): - text_ = data_batch_['text'][i].tolist() - self.assertEqual( - text_.index(b'') + 1, - data_batch_['length'][i] - length_inc) - - max_seq_length = text_data.hparams.dataset.max_seq_length - mode = text_data.hparams.dataset.length_filter_mode - if max_seq_length == 6: - max_l = max_seq_length - max_l += text_data._decoder.added_length + iterator = tf.compat.v1.data.make_one_shot_iterator( + text_data.dataset) + + while True: + try: + data_batch_ = iterator.get_next() + + self.assertEqual(set(data_batch_.keys()), + set(text_data.list_items())) + + if test_batch_size: + self.assertEqual(len(data_batch_['text']), + hparams['batch_size']) + + if length_inc: + for i in range(len(data_batch_['text'])): + text_ = data_batch_['text'][i].numpy().tolist() + self.assertEqual( + text_.index(b'') + 1, + data_batch_['length'][i] - length_inc) + + max_seq_length = text_data.hparams.dataset.max_seq_length + mode = text_data.hparams.dataset.length_filter_mode + if max_seq_length == 6: + max_l = max_seq_length + max_l += text_data._decoder.added_length + for length in data_batch_['length']: + self.assertLessEqual(length, max_l) + if mode == "discard": + for length in data_batch_['length']: + self.assertEqual(length, 5) + elif mode == "truncate": + num_length_6 = 0 for length in data_batch_['length']: - self.assertLessEqual(length, max_l) - if mode == "discard": - for length in data_batch_['length']: - self.assertEqual(length, 5) - elif mode == "truncate": - num_length_6 = 0 - for length in data_batch_['length']: - num_length_6 += int(length == 6) - self.assertGreater(num_length_6, 0) - else: - raise ValueError("Unknown mode: %s" % mode) - - if text_data.hparams.dataset.pad_to_max_seq_length: - max_l = max_seq_length + text_data._decoder.added_length - for x in data_batch_['text']: - self.assertEqual(len(x), max_l) - for x in data_batch_['text_ids']: - self.assertEqual(len(x), max_l) - - except tf.errors.OutOfRangeError: - print('Done -- epoch limit reached') - break + num_length_6 += int(length == 6) + self.assertGreater(num_length_6, 0) + else: + raise ValueError("Unknown mode: %s" % mode) + + if text_data.hparams.dataset.pad_to_max_seq_length: + max_l = max_seq_length + text_data._decoder.added_length + for x in data_batch_['text']: + self.assertEqual(len(x), max_l) + for x in data_batch_['text_ids']: + self.assertEqual(len(x), max_l) + + except tf.errors.OutOfRangeError: + print('Done -- epoch limit reached') + break def test_default_setting(self): """Tests the logics of MonoTextData. @@ -133,13 +117,11 @@ def test_bucketing(self): """Tests bucketing. 
""" hparams = copy.copy(self._hparams) - hparams.update({ - "bucket_boundaries": [7], - "bucket_batch_sizes": [6, 4]}) + hparams.update({"bucket_boundaries": [7], + "bucket_batch_sizes": [6, 4]}) text_data = tx.data.MonoTextData(hparams) - iterator = text_data.dataset.make_initializable_iterator() - text_data_batch = iterator.get_next() + iterator = tf.compat.v1.data.make_one_shot_iterator(text_data.dataset) hparams.update({ "bucket_boundaries": [7], @@ -147,39 +129,31 @@ def test_bucketing(self): "allow_smaller_final_batch": False}) text_data_1 = tx.data.MonoTextData(hparams) - iterator_1 = text_data_1.dataset.make_initializable_iterator() - text_data_batch_1 = iterator_1.get_next() - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - sess.run(iterator.initializer) - sess.run(iterator_1.initializer) - - while True: - try: - # Run the logics - data_batch_, data_batch_1_ = sess.run( - [text_data_batch, text_data_batch_1]) - - length_ = data_batch_['length'][0] - if length_ < 7: - last_batch_size = hparams['num_epochs'] % 6 - self.assertTrue( - len(data_batch_['text']) == 6 or - len(data_batch_['text']) == last_batch_size) - else: - last_batch_size = hparams['num_epochs'] % 4 - self.assertTrue( - len(data_batch_['text']) == 4 or - len(data_batch_['text']) == last_batch_size) - - self.assertEqual(len(data_batch_1_['text']), 7) - - except tf.errors.OutOfRangeError: - print('Done -- epoch limit reached') - break + iterator_1 = tf.compat.v1.data.make_one_shot_iterator( + text_data_1.dataset) + + while True: + try: + data_batch_ = iterator.get_next() + data_batch_1_ = iterator_1.get_next() + + length_ = data_batch_['length'][0] + if length_ < 7: + last_batch_size = hparams['num_epochs'] % 6 + self.assertTrue( + len(data_batch_['text']) == 6 or + len(data_batch_['text']) == last_batch_size) + else: + last_batch_size = hparams['num_epochs'] % 4 + self.assertTrue( + len(data_batch_['text']) == 4 or + len(data_batch_['text']) == last_batch_size) + + self.assertEqual(len(data_batch_1_['text']), 7) + + except tf.errors.OutOfRangeError: + print('Done -- epoch limit reached') + break def test_shuffle(self): """Tests different shuffle strategies. 
@@ -200,7 +174,7 @@ def test_prefetch(self): def test_other_transformations(self): """Tests use of other transformations """ - def _transform(x, data_specs): # pylint: disable=invalid-name + def _transform(x, data_specs): x[data_specs.decoder.length_tensor_name] += 1 return x @@ -298,43 +272,37 @@ def _run_and_test(self, hparams): self.assertEqual(text_data.vocab.size, self._vocab_size + len(text_data.vocab.special_tokens)) - iterator = text_data.dataset.make_initializable_iterator() - text_data_batch = iterator.get_next() - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - sess.run(iterator.initializer) - - while True: - try: - # Run the logics - data_batch_ = sess.run(text_data_batch) - - self.assertEqual(set(data_batch_.keys()), - set(text_data.list_items())) - - # Test utterance count - utt_ind = np.sum(data_batch_["text_ids"], 2) != 0 - utt_cnt = np.sum(utt_ind, 1) - self.assertListEqual( - data_batch_[text_data.utterance_cnt_name].tolist(), - utt_cnt.tolist()) - - if text_data.hparams.dataset.pad_to_max_seq_length: - max_l = text_data.hparams.dataset.max_seq_length - max_l += text_data._decoder.added_length - for x in data_batch_['text']: - for xx in x: - self.assertEqual(len(xx), max_l) - for x in data_batch_['text_ids']: - for xx in x: - self.assertEqual(len(xx), max_l) - - except tf.errors.OutOfRangeError: - print('Done -- epoch limit reached') - break + iterator = tf.compat.v1.data.make_one_shot_iterator( + text_data.dataset) + + while True: + try: + # Run the logics + data_batch_ = iterator.get_next() + + self.assertEqual(set(data_batch_.keys()), + set(text_data.list_items())) + + # Test utterance count + utt_ind = np.sum(data_batch_["text_ids"], 2) != 0 + utt_cnt = np.sum(utt_ind, 1) + self.assertListEqual( + data_batch_[text_data.utterance_cnt_name].numpy().tolist(), + utt_cnt.tolist()) + + if text_data.hparams.dataset.pad_to_max_seq_length: + max_l = text_data.hparams.dataset.max_seq_length + max_l += text_data._decoder.added_length + for x in data_batch_['text']: + for xx in x: + self.assertEqual(len(xx), max_l) + for x in data_batch_['text_ids']: + for xx in x: + self.assertEqual(len(xx), max_l) + + except tf.errors.OutOfRangeError: + print('Done -- epoch limit reached') + break def test_default_setting(self): """Tests the logics of the text data. diff --git a/texar/tf/data/data/multi_aligned_data.py b/texar/tf/data/data/multi_aligned_data.py deleted file mode 100644 index 10282f3d..00000000 --- a/texar/tf/data/data/multi_aligned_data.py +++ /dev/null @@ -1,698 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Data consisting of multiple aligned parts. 
-""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import copy - -import tensorflow as tf - -from texar.tf.hyperparams import HParams -from texar.tf.utils import utils -from texar.tf.utils.dtypes import is_str, is_callable -from texar.tf.data.data.text_data_base import TextDataBase -from texar.tf.data.data.scalar_data import ScalarData -from texar.tf.data.data.tfrecord_data import TFRecordData -from texar.tf.data.data.mono_text_data import _default_mono_text_dataset_hparams -from texar.tf.data.data.scalar_data import _default_scalar_dataset_hparams -from texar.tf.data.data.tfrecord_data import _default_tfrecord_dataset_hparams -from texar.tf.data.data.mono_text_data import MonoTextData -from texar.tf.data.data_utils import count_file_lines -from texar.tf.data.data import dataset_utils as dsutils -from texar.tf.data.vocabulary import Vocab, SpecialTokens -from texar.tf.data.embedding import Embedding - -# pylint: disable=invalid-name, arguments-differ -# pylint: disable=protected-access, too-many-instance-attributes - -__all__ = [ - "_default_dataset_hparams", - "MultiAlignedData" -] - - -class _DataTypes(object): # pylint: disable=no-init, too-few-public-methods - """Enumeration of data types. - """ - TEXT = "text" - INT = "int" - FLOAT = "float" - TF_RECORD = "tf_record" - - -def _is_text_data(data_type): - return data_type == _DataTypes.TEXT - - -def _is_scalar_data(data_type): - return data_type == _DataTypes.INT or data_type == _DataTypes.FLOAT - - -def _is_tfrecord_data(data_type): - return data_type == _DataTypes.TF_RECORD - - -def _default_dataset_hparams(data_type=None): - """Returns hyperparameters of a dataset with default values. - - See :meth:`texar.tf.data.MultiAlignedData.default_hparams` for details. - """ - if not data_type or _is_text_data(data_type): - hparams = _default_mono_text_dataset_hparams() - hparams.update({ - "data_type": _DataTypes.TEXT, - "vocab_share_with": None, - "embedding_init_share_with": None, - "processing_share_with": None, - }) - elif _is_scalar_data(data_type): - hparams = _default_scalar_dataset_hparams() - elif _is_tfrecord_data(data_type): - hparams = _default_tfrecord_dataset_hparams() - hparams.update({ - "data_type": _DataTypes.TF_RECORD, - }) - return hparams - - -class MultiAlignedData(TextDataBase): - """Data consisting of multiple aligned parts. - - Args: - hparams (dict): Hyperparameters. See :meth:`default_hparams` for the - defaults. - - The processor can read any number of parallel fields as specified in - the "datasets" list of :attr:`hparams`, and result in a TF Dataset whose - element is a python `dict` containing data fields from each of the - specified datasets. Fields from a text dataset or TFRecord dataset have - names prefixed by its "data_name". Fields from a scalar dataset are - specified by its "data_name". - - Example: - - .. 
code-block:: python - - hparams={ - 'datasets': [ - {'files': 'a.txt', 'vocab_file': 'v.a', 'data_name': 'x'}, - {'files': 'b.txt', 'vocab_file': 'v.b', 'data_name': 'y'}, - {'files': 'c.txt', 'data_type': 'int', 'data_name': 'z'} - ] - 'batch_size': 1 - } - data = MultiAlignedData(hparams) - iterator = DataIterator(data) - batch = iterator.get_next() - - iterator.switch_to_dataset(sess) # initializes the dataset - batch_ = sess.run(batch) - # batch_ == { - # 'x_text': [['', 'x', 'sequence', '']], - # 'x_text_ids': [['1', '5', '10', '2']], - # 'x_length': [4] - # 'y_text': [['', 'y', 'sequence', '1', '']], - # 'y_text_ids': [['1', '6', '10', '20', '2']], - # 'y_length': [5], - # 'z': [1000], - # } - ... - - hparams={ - 'datasets': [ - {'files': 'd.txt', 'vocab_file': 'v.d', 'data_name': 'm'}, - { - 'files': 'd.tfrecord', - 'data_type': 'tf_record', - "feature_original_types": { - 'image': ['tf.string', 'FixedLenFeature'] - }, - 'image_options': { - 'image_feature_name': 'image', - 'resize_height': 512, - 'resize_width': 512, - }, - 'data_name': 't', - } - ] - 'batch_size': 1 - } - data = MultiAlignedData(hparams) - iterator = DataIterator(data) - batch = iterator.get_next() - - iterator.switch_to_dataset(sess) # initializes the dataset - batch_ = sess.run(batch) - # batch_ == { - # 'x_text': [['', 'NewYork', 'City', 'Map', '']], - # 'x_text_ids': [['1', '100', '80', '65', '2']], - # 'x_length': [5], - # - # # "t_image" is a list of a "numpy.ndarray" image - # # in this example. Its width equals to 512 and - # # its height equals to 512. - # 't_image': [...] - # } - - """ - def __init__(self, hparams): - TextDataBase.__init__(self, hparams) - # Defaultizes hparams of each dataset - datasets_hparams = self._hparams.datasets - defaultized_datasets_hparams = [] - for ds_hpms in datasets_hparams: - data_type = ds_hpms.get("data_type", None) - defaultized_ds_hpms = HParams(ds_hpms, - _default_dataset_hparams(data_type)) - defaultized_datasets_hparams.append(defaultized_ds_hpms) - self._hparams.datasets = defaultized_datasets_hparams - - with tf.name_scope(self.name, self.default_hparams()["name"]): - self._make_data() - - @staticmethod - def default_hparams(): - """Returns a dicitionary of default hyperparameters. - - .. code-block:: python - - { - # (1) Hyperparams specific to text dataset - "datasets": [] - # (2) General hyperparams - "num_epochs": 1, - "batch_size": 64, - "allow_smaller_final_batch": True, - "shuffle": True, - "shuffle_buffer_size": None, - "shard_and_shuffle": False, - "num_parallel_calls": 1, - "prefetch_buffer_size": 0, - "max_dataset_size": -1, - "seed": None, - "name": "multi_aligned_data", - } - - Here: - - 1. "datasets" is a list of `dict` each of which specifies a - dataset which can be text, scalar or TFRecord. The - :attr:`"data_name"` field of each dataset is used as the name - prefix of the data fields from the respective dataset. The - :attr:`"data_name"` field of each dataset should not be the same. - - - For scalar dataset, the allowed hyperparameters and default \ - values are the same as the "dataset" field of \ - :meth:`texar.tf.data.ScalarData.default_hparams`. Note that \ - :attr:`"data_type"` must be explicily specified \ - (either "int" or "float"). \ - - - For TFRecord dataset, the allowed hyperparameters and default \ - values are the same as the "dataset" field of \ - :meth:`texar.tf.data.TFRecordData.default_hparams`. Note that \ - :attr:`"data_type"` must be explicily specified \ - (tf_record"). 
\ - - - For text dataset, the allowed hyperparameters and default values\ - are the same as the "dataset" filed of \ - :meth:`texar.tf.data.MonoTextData.default_hparams`, with several \ - extra hyperparameters: - - "data_type": str - The type of the dataset, one of {"text", "int", "float", - "tf_record"}. If set to "int" or "float", the dataset is - considered to be a scalar dataset. If set to "tf_record", - the dataset is considered to be a TFRecord dataset. - If not specified or set to "text", the dataset is - considered to be a text dataset. - - "vocab_share_with": int, optional - Share the vocabulary of a preceding text dataset with the - specified index in the list (starting from 0). The - specified dataset must be a text dataset, and must have - an index smaller than the current dataset. - - If specified, the vocab file of current dataset is ignored. - Default is `None` which disables the vocab sharing. - - "embedding_init_share_with": int, optional - Share the embedding initial value of a preceding text - dataset with the specified index in the list (starting - from 0). - The specified dataset must be a text dataset, and must have - an index smaller than the current dataset. - - If specified, the :attr:`"embedding_init"` field of - the current dataset is ignored. Default is `None` which - disables the initial value sharing. - - "processing_share_with": int, optional - Share the processing configurations of a preceding text - dataset with the specified index in the list (starting - from 0). - The specified dataset must be a text dataset, and must have - an index smaller than the current dataset. - - If specified, relevant field of the current dataset are - ignored, including "delimiter", "bos_token", "eos_token", - and "other_transformations". Default is `None` which - disables the processing sharing. - - 2. For the **general** hyperparameters, see - :meth:`texar.tf.data.DataBase.default_hparams` for details. - """ - hparams = TextDataBase.default_hparams() - hparams["name"] = "multi_aligned_data" - hparams["datasets"] = [] - return hparams - - @staticmethod - def _raise_sharing_error(err_data, shr_data, hparam_name): - raise ValueError( - "Must only share specifications with a preceding dataset. " - "Dataset %d has '%s=%d'" % (err_data, hparam_name, shr_data)) - - @staticmethod - def make_vocab(hparams): - """Makes a list of vocabs based on the hparams. - - Args: - hparams (list): A list of dataset hyperparameters. - - Returns: - A list of :class:`texar.tf.data.Vocab` instances. Some instances - may be the same objects if they are set to be shared and have - the same other configs. - """ - if not isinstance(hparams, (list, tuple)): - hparams = [hparams] - - vocabs = [] - for i, hparams_i in enumerate(hparams): - if not _is_text_data(hparams_i["data_type"]): - vocabs.append(None) - continue - - proc_shr = hparams_i["processing_share_with"] - if proc_shr is not None: - bos_token = hparams[proc_shr]["bos_token"] - eos_token = hparams[proc_shr]["eos_token"] - else: - bos_token = hparams_i["bos_token"] - eos_token = hparams_i["eos_token"] - bos_token = utils.default_str( - bos_token, SpecialTokens.BOS) - eos_token = utils.default_str( - eos_token, SpecialTokens.EOS) - - vocab_shr = hparams_i["vocab_share_with"] - if vocab_shr is not None: - if vocab_shr >= i: - MultiAlignedData._raise_sharing_error( - i, vocab_shr, "vocab_share_with") - if not vocabs[vocab_shr]: - raise ValueError("Cannot share vocab with dataset %d which " - "does not have a vocab." 
% vocab_shr) - if bos_token == vocabs[vocab_shr].bos_token and \ - eos_token == vocabs[vocab_shr].eos_token: - vocab = vocabs[vocab_shr] - else: - vocab = Vocab(hparams[vocab_shr]["vocab_file"], - bos_token=bos_token, - eos_token=eos_token) - else: - vocab = Vocab(hparams_i["vocab_file"], - bos_token=bos_token, - eos_token=eos_token) - vocabs.append(vocab) - - return vocabs - - @staticmethod - def make_embedding(hparams, vocabs): - """Optionally loads embeddings from files (if provided), and - returns respective :class:`texar.tf.data.Embedding` instances. - """ - if not isinstance(hparams, (list, tuple)): - hparams = [hparams] - - embs = [] - for i, hparams_i in enumerate(hparams): - if not _is_text_data(hparams_i["data_type"]): - embs.append(None) - continue - - emb_shr = hparams_i["embedding_init_share_with"] - if emb_shr is not None: - if emb_shr >= i: - MultiAlignedData._raise_sharing_error( - i, emb_shr, "embedding_init_share_with") - if not embs[emb_shr]: - raise ValueError("Cannot share embedding with dataset %d " - "which does not have an embedding." % - emb_shr) - if emb_shr != hparams_i["vocab_share_with"]: - raise ValueError("'embedding_init_share_with' != " - "vocab_share_with. embedding_init can " - "be shared only when vocab is shared.") - emb = embs[emb_shr] - else: - emb = None - emb_file = hparams_i["embedding_init"]["file"] - if emb_file and emb_file != "": - emb = Embedding(vocabs[i].token_to_id_map_py, - hparams_i["embedding_init"]) - embs.append(emb) - - return embs - - def _make_dataset(self): - datasets = [] - for _, hparams_i in enumerate(self._hparams.datasets): - dtype = hparams_i.data_type - if _is_text_data(dtype) or _is_scalar_data(dtype): - dataset = tf.data.TextLineDataset( - hparams_i.files, - compression_type=hparams_i.compression_type) - datasets.append(dataset) - elif _is_tfrecord_data(dtype): - dataset = tf.data.TFRecordDataset(filenames=hparams_i.files) - num_shards = hparams_i.num_shards - shard_id = hparams_i.shard_id - if num_shards is not None and shard_id is not None: - dataset = dataset.shard(num_shards, shard_id) - datasets.append(dataset) - else: - raise ValueError("Unknown data type: %s" % hparams_i.data_type) - return tf.data.Dataset.zip(tuple(datasets)) - - # @staticmethod - # def _get_name_prefix(dataset_hparams): - # def _dtype_conflict(dtype_1, dtype_2): - # conflict = ((dtype_1 == dtype_2) or - # (dtype_1 in {_DataTypes.INT, _DataTypes.FLOAT} and - # dtype_2 in {_DataTypes.INT, _DataTypes.FLOAT})) - # return conflict - - # name_prefix = [hpms["data_name"] for hpms in dataset_hparams] - # name_prefix_dict = {} - # for i, np in enumerate(name_prefix): - # ids = name_prefix_dict.get(np, []) - # for j in ids: - # if _dtype_conflict(dataset_hparams[j]["data_type"], - # dataset_hparams[i]["data_type"]): - # raise ValueError( - # "'data_name' of the datasets with compatible " - # "data_types cannot be the same: %d-th dataset and " - # "%d-th dataset have the same name '%s'" % - # (i, j, name_prefix[i])) - # ids.append(i) - # name_prefix_dict[np] = ids - # return name_prefix - - @staticmethod - def _get_name_prefix(dataset_hparams): - name_prefix = [hpms["data_name"] for hpms in dataset_hparams] - for i in range(1, len(name_prefix)): - if name_prefix[i] in name_prefix[:i - 1]: - raise ValueError("Data name duplicated: %s" % name_prefix[i]) - return name_prefix - - @staticmethod - def _make_processor(dataset_hparams, data_spec, name_prefix): - processors = [] - for i, hparams_i in enumerate(dataset_hparams): - data_spec_i = 
data_spec.get_ith_data_spec(i) - - data_type = hparams_i["data_type"] - if _is_text_data(data_type): - tgt_proc_hparams = hparams_i - proc_shr = hparams_i["processing_share_with"] - if proc_shr is not None: - tgt_proc_hparams = copy.copy(dataset_hparams[proc_shr]) - try: - tgt_proc_hparams["variable_utterance"] = \ - hparams_i["variable_utterance"] - except TypeError: - tgt_proc_hparams.variable_utterance = \ - hparams_i["variable_utterance"] - - processor, data_spec_i = MonoTextData._make_processor( - tgt_proc_hparams, data_spec_i) - elif _is_scalar_data(data_type): - processor, data_spec_i = ScalarData._make_processor( - hparams_i, data_spec_i, name_prefix='') - elif _is_tfrecord_data(data_type): - processor, data_spec_i = TFRecordData._make_processor( - hparams_i, data_spec_i, name_prefix='') - else: - raise ValueError("Unsupported data type: %s" % data_type) - - processors.append(processor) - data_spec.set_ith_data_spec(i, data_spec_i, len(dataset_hparams)) - - tran_fn = dsutils.make_combined_transformation( - processors, name_prefix=name_prefix) - - data_spec.add_spec(name_prefix=name_prefix) - - return tran_fn, data_spec - - @staticmethod - def _make_length_filter(dataset_hparams, length_name, decoder): - filter_fns = [] - for i, hpms in enumerate(dataset_hparams): - if not _is_text_data(hpms["data_type"]): - filter_fn = None - else: - filter_fn = MonoTextData._make_length_filter( - hpms, length_name[i], decoder[i]) - filter_fns.append(filter_fn) - combined_filter_fn = dsutils._make_combined_filter_fn(filter_fns) - return combined_filter_fn - - def _process_dataset(self, dataset, hparams, data_spec): - name_prefix = self._get_name_prefix(hparams["datasets"]) - # pylint: disable=attribute-defined-outside-init - self._name_to_id = {v: k for k, v in enumerate(name_prefix)} - - tran_fn, data_spec = self._make_processor( - hparams["datasets"], data_spec, name_prefix) - - num_parallel_calls = hparams["num_parallel_calls"] - dataset = dataset.map( - lambda *args: tran_fn(dsutils.maybe_tuple(args)), - num_parallel_calls=num_parallel_calls) - - # Filters by length - def _get_length_name(i): - if not _is_text_data(hparams["datasets"][i]["data_type"]): - return None - name = dsutils._connect_name( - data_spec.name_prefix[i], - data_spec.decoder[i].length_tensor_name) - return name - filter_fn = self._make_length_filter( - hparams["datasets"], - [_get_length_name(i) for i in range(len(hparams["datasets"]))], - data_spec.decoder) - if filter_fn: - dataset = dataset.filter(filter_fn) - - # Truncates data count - dataset = dataset.take(hparams["max_dataset_size"]) - - return dataset, data_spec - - def _make_bucket_length_fn(self): - length_fn = self._hparams.bucket_length_fn - if not length_fn: - # Uses the length of the first text data - i = -1 - for i, hparams_i in enumerate(self._hparams.datasets): - if _is_text_data(hparams_i["data_type"]): - break - if i < 0: - raise ValueError("Undefined `length_fn`.") - length_fn = lambda x: x[self.length_name(i)] - elif not is_callable(length_fn): - # pylint: disable=redefined-variable-type - length_fn = utils.get_function(length_fn, ["texar.tf.custom"]) - return length_fn - - def _make_padded_shapes(self, dataset, decoders): - padded_shapes = dataset.output_shapes - for i, hparams_i in enumerate(self._hparams.datasets): - if not _is_text_data(hparams_i["data_type"]): - continue - if not hparams_i["pad_to_max_seq_length"]: - continue - text_and_id_shapes = MonoTextData._make_padded_text_and_id_shapes( - dataset, hparams_i, decoders[i], - self.text_name(i), 
self.text_id_name(i)) - - padded_shapes.update(text_and_id_shapes) - - return padded_shapes - - def _make_data(self): - self._vocab = self.make_vocab(self._hparams.datasets) - self._embedding = self.make_embedding(self._hparams.datasets, - self._vocab) - - # Create dataset - dataset = self._make_dataset() - dataset, dataset_size = self._shuffle_dataset( - dataset, self._hparams, self._hparams.datasets[0].files) - self._dataset_size = dataset_size - - # Processing - data_spec = dsutils._DataSpec(dataset=dataset, - dataset_size=self._dataset_size, - vocab=self._vocab, - embedding=self._embedding) - dataset, data_spec = self._process_dataset( - dataset, self._hparams, data_spec) - self._data_spec = data_spec - self._decoder = data_spec.decoder - - # Batching - length_fn = self._make_bucket_length_fn() - padded_shapes = self._make_padded_shapes(dataset, self._decoder) - dataset = self._make_batch( - dataset, self._hparams, length_fn, padded_shapes) - - # Prefetching - if self._hparams.prefetch_buffer_size > 0: - dataset = dataset.prefetch(self._hparams.prefetch_buffer_size) - - self._dataset = dataset - - def list_items(self): - """Returns the list of item names that the data can produce. - - Returns: - A list of strings. - """ - return list(self._dataset.output_types.keys()) - - @property - def dataset(self): - """The dataset. - """ - return self._dataset - - def dataset_size(self): - """Returns the number of data instances in the dataset. - - Note that this is the total data count in the raw files, before any - filtering and truncation. - """ - if not self._dataset_size: - # pylint: disable=attribute-defined-outside-init - self._dataset_size = count_file_lines( - self._hparams.datasets[0].files) - return self._dataset_size - - def _maybe_name_to_id(self, name_or_id): - if is_str(name_or_id): - if name_or_id not in self._name_to_id: - raise ValueError("Unknown data name: {}".format(name_or_id)) - return self._name_to_id[name_or_id] - return name_or_id - - def vocab(self, name_or_id): - """Returns the :class:`~texar.tf.data.Vocab` of text dataset by its name - or id. `None` if the dataset is not of text type. - - Args: - name_or_id (str or int): Data name or the index of text dataset. - """ - i = self._maybe_name_to_id(name_or_id) - return self._vocab[i] - - def embedding_init_value(self, name_or_id): - """Returns the `Tensor` of embedding init value of the - dataset by its name or id. `None` if the dataset is not of text type. - """ - i = self._maybe_name_to_id(name_or_id) - return self._embedding[i] - - def text_name(self, name_or_id): - """The name of text tensor of text dataset by its name or id. If the - dataaet is not of text type, returns `None`. - """ - i = self._maybe_name_to_id(name_or_id) - if not _is_text_data(self._hparams.datasets[i]["data_type"]): - return None - name = dsutils._connect_name( - self._data_spec.name_prefix[i], - self._data_spec.decoder[i].text_tensor_name) - return name - - def length_name(self, name_or_id): - """The name of length tensor of text dataset by its name or id. If the - dataset is not of text type, returns `None`. - """ - i = self._maybe_name_to_id(name_or_id) - if not _is_text_data(self._hparams.datasets[i]["data_type"]): - return None - name = dsutils._connect_name( - self._data_spec.name_prefix[i], - self._data_spec.decoder[i].length_tensor_name) - return name - - def text_id_name(self, name_or_id): - """The name of length tensor of text dataset by its name or id. If the - dataset is not of text type, returns `None`. 
- """ - i = self._maybe_name_to_id(name_or_id) - if not _is_text_data(self._hparams.datasets[i]["data_type"]): - return None - name = dsutils._connect_name( - self._data_spec.name_prefix[i], - self._data_spec.decoder[i].text_id_tensor_name) - return name - - def utterance_cnt_name(self, name_or_id): - """The name of utterance count tensor of text dataset by its name or id. - If the dataset is not variable utterance text data, returns `None`. - """ - i = self._maybe_name_to_id(name_or_id) - if not _is_text_data(self._hparams.datasets[i]["data_type"]) or \ - not self._hparams.datasets[i]["variable_utterance"]: - return None - name = dsutils._connect_name( - self._data_spec.name_prefix[i], - self._data_spec.decoder[i].utterance_cnt_tensor_name) - return name - - @property - def data_name(self, name_or_id): - """The name of the data tensor of scalar dataset by its name or id.. - If the dataset is not a scalar data, returns `None`. - """ - i = self._maybe_name_to_id(name_or_id) - if not _is_scalar_data(self._hparams.datasets[i]["data_type"]): - return None - name = dsutils._connect_name( - self._data_spec.name_prefix[i], - self._data_spec.decoder[i].data_tensor_name) - return name diff --git a/texar/tf/data/data/multi_aligned_data_test.py b/texar/tf/data/data/multi_aligned_data_test.py deleted file mode 100644 index 3762a249..00000000 --- a/texar/tf/data/data/multi_aligned_data_test.py +++ /dev/null @@ -1,236 +0,0 @@ -# -*- coding: utf-8 -*- -# -""" -Unit tests for data related operations. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import sys -import tempfile -import copy -import numpy as np - -import tensorflow as tf - -import texar.tf as tx - -# pylint: disable=too-many-locals, too-many-branches, protected-access - - -class MultiAlignedDataTest(tf.test.TestCase): - """Tests multi aligned text data class. - """ - - def setUp(self): - tf.test.TestCase.setUp(self) - - # Create test data - vocab_list = ['This', 'is', 'a', 'word', '词'] - vocab_file = tempfile.NamedTemporaryFile() - vocab_file.write('\n'.join(vocab_list).encode("utf-8")) - vocab_file.flush() - self._vocab_file = vocab_file - self._vocab_size = len(vocab_list) - - text_0 = ['This is a sentence from source .', '词 词 。 source'] - text_0_file = tempfile.NamedTemporaryFile() - text_0_file.write('\n'.join(text_0).encode("utf-8")) - text_0_file.flush() - self._text_0_file = text_0_file - - text_1 = ['This is a sentence from target .', '词 词 。 target'] - text_1_file = tempfile.NamedTemporaryFile() - text_1_file.write('\n'.join(text_1).encode("utf-8")) - text_1_file.flush() - self._text_1_file = text_1_file - - text_2 = [ - 'This is a sentence from dialog . ||| dialog ', - '词 词 。 ||| 词 dialog'] - text_2_file = tempfile.NamedTemporaryFile() - text_2_file.write('\n'.join(text_2).encode("utf-8")) - text_2_file.flush() - self._text_2_file = text_2_file - - int_3 = [0, 1] - int_3_file = tempfile.NamedTemporaryFile() - int_3_file.write(('\n'.join([str(_) for _ in int_3])).encode("utf-8")) - int_3_file.flush() - self._int_3_file = int_3_file - - def _bytes_feature(value): - """Returns a bytes_list from a string / byte. - """ - value = tf.compat.as_bytes( - value, - encoding='utf-8' - ) - return tf.train.Feature( - bytes_list=tf.train.BytesList(value=[value])) - - def _int64_feature(value): - """Returns an int64_list from a bool / enum / int / uint. 
- """ - return tf.train.Feature( - int64_list=tf.train.Int64List(value=[value])) - - feature = { - "number1": _int64_feature(128), - "number2": _int64_feature(512), - "text": _bytes_feature("This is a sentence for TFRecord 词 词 。") - } - data_example = tf.train.Example( - features=tf.train.Features(feature=feature)) - tfrecord_file = tempfile.NamedTemporaryFile(suffix=".tfrecord") - with tf.python_io.TFRecordWriter(tfrecord_file.name) as writer: - writer.write(data_example.SerializeToString()) - tfrecord_file.flush() - self._tfrecord_file = tfrecord_file - - # Construct database - self._hparams = { - "num_epochs": 123, - "batch_size": 23, - "datasets": [ - { # dataset 0 - "files": [self._text_0_file.name], - "vocab_file": self._vocab_file.name, - "bos_token": "", - "data_name": "0" - }, - { # dataset 1 - "files": [self._text_1_file.name], - "vocab_share_with": 0, - "eos_token": "", - "data_name": "1" - }, - { # dataset 2 - "files": [self._text_2_file.name], - "vocab_file": self._vocab_file.name, - "processing_share_with": 0, - "variable_utterance": True, - "data_name": "2" - }, - { # dataset 3 - "files": self._int_3_file.name, - "data_type": "int", - "data_name": "label" - }, - { # dataset 4 - "files": self._tfrecord_file.name, - "feature_original_types": { - 'number1': ['tf.int64', 'FixedLenFeature'], - 'number2': ['tf.int64', 'FixedLenFeature'], - 'text': ['tf.string', 'FixedLenFeature'], - }, - "feature_convert_types": { - 'number2': 'tf.float32', - }, - "num_shards": 2, - "shard_id": 1, - "data_type": "tf_record", - "data_name": "4" - } - ] - } - - def _run_and_test(self, hparams, discard_did=None): - # Construct database - text_data = tx.data.MultiAlignedData(hparams) - self.assertEqual( - text_data.vocab(0).size, - self._vocab_size + len(text_data.vocab(0).special_tokens)) - - iterator = text_data.dataset.make_initializable_iterator() - text_data_batch = iterator.get_next() - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - sess.run(iterator.initializer) - - while True: - try: - # Run the logics - data_batch_ = sess.run(text_data_batch) - - self.assertEqual(set(data_batch_.keys()), - set(text_data.list_items())) - self.assertEqual(text_data.utterance_cnt_name('2'), - '2_utterance_cnt') - text_0 = data_batch_['0_text'] - text_1 = data_batch_['1_text'] - text_2 = data_batch_['2_text'] - int_3 = data_batch_['label'] - number_1 = data_batch_['4_number1'] - number_2 = data_batch_['4_number2'] - text_3 = data_batch_['4_text'] - - # pylint: disable=invalid-name - for t0, t1, t2, i3, n1, n2, t4 in zip( - text_0, text_1, text_2, int_3, - number_1, number_2, text_3): - - np.testing.assert_array_equal( - t0[:2], t1[1:3]) - np.testing.assert_array_equal( - t0[:3], t2[0][:3]) - if t0[0].startswith(b'This'): - self.assertEqual(i3, 0) - else: - self.assertEqual(i3, 1) - self.assertEqual(n1, 128) - self.assertEqual(n2, 512) - self.assertTrue(isinstance(n1, np.int64)) - self.assertTrue(isinstance(n2, np.float32)) - self.assertTrue(isinstance(t4, bytes)) - - if discard_did is not None: - hpms = text_data._hparams.datasets[discard_did] - max_l = hpms.max_seq_length - max_l += text_data._decoder[discard_did].added_length - for i in range(2): - for length in data_batch_[text_data.length_name(i)]: - self.assertLessEqual(length, max_l) - for lengths in data_batch_[text_data.length_name(2)]: - for length in lengths: - self.assertLessEqual(length, max_l) - for i, hpms in 
enumerate(text_data._hparams.datasets): - if hpms.data_type != "text": - continue - max_l = hpms.max_seq_length - mode = hpms.length_filter_mode - if max_l is not None and mode == "truncate": - max_l += text_data._decoder[i].added_length - for length in data_batch_[text_data.length_name(i)]: - self.assertLessEqual(length, max_l) - - except tf.errors.OutOfRangeError: - print('Done -- epoch limit reached') - break - - def test_default_setting(self): - """Tests the logics of the text data. - """ - self._run_and_test(self._hparams) - - def test_length_filter(self): - """Tests filtering by length. - """ - hparams = copy.copy(self._hparams) - hparams["datasets"][0].update( - {"max_seq_length": 4, - "length_filter_mode": "discard"}) - hparams["datasets"][1].update( - {"max_seq_length": 2, - "length_filter_mode": "truncate"}) - self._run_and_test(hparams, discard_did=0) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/data/data/paired_text_data.py b/texar/tf/data/data/paired_text_data.py deleted file mode 100644 index d93b097e..00000000 --- a/texar/tf/data/data/paired_text_data.py +++ /dev/null @@ -1,650 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Paired text data that consists of source text and target text. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import copy - -import tensorflow as tf - -from texar.tf.utils import utils -from texar.tf.utils.dtypes import is_callable -from texar.tf.data.data.mono_text_data import _default_mono_text_dataset_hparams -from texar.tf.data.data.text_data_base import TextDataBase -from texar.tf.data.data.mono_text_data import MonoTextData -from texar.tf.data.data_utils import count_file_lines -from texar.tf.data.data import dataset_utils as dsutils -from texar.tf.data.vocabulary import Vocab, SpecialTokens -from texar.tf.data.embedding import Embedding - -# pylint: disable=invalid-name, arguments-differ, not-context-manager -# pylint: disable=protected-access, too-many-arguments - -__all__ = [ - "_default_paired_text_dataset_hparams", - "PairedTextData" -] - - -def _default_paired_text_dataset_hparams(): - """Returns hyperparameters of a paired text dataset with default values. - - See :meth:`texar.tf.data.PairedTextData.default_hparams` for details. - """ - source_hparams = _default_mono_text_dataset_hparams() - source_hparams["bos_token"] = None - source_hparams["data_name"] = "source" - target_hparams = _default_mono_text_dataset_hparams() - target_hparams.update( - { - "vocab_share": False, - "embedding_init_share": False, - "processing_share": False, - "data_name": "target" - } - ) - return { - "source_dataset": source_hparams, - "target_dataset": target_hparams - } - - -# pylint: disable=too-many-instance-attributes, too-many-public-methods -class PairedTextData(TextDataBase): - """Text data processor that reads parallel source and target text. 
- This can be used in, e.g., seq2seq models. - - Args: - hparams (dict): Hyperparameters. See :meth:`default_hparams` for the - defaults. - - By default, the processor reads raw data files, performs tokenization, - batching and other pre-processing steps, and results in a TF Dataset - whose element is a python `dict` including six fields: - - - "source_text": - A string Tensor of shape `[batch_size, max_time]` containing - the **raw** text toknes of source sequences. `max_time` is the - length of the longest sequence in the batch. - Short sequences in the batch are padded with **empty string**. - By default only EOS token is appended to each sequence. - Out-of-vocabulary tokens are **NOT** replaced with UNK. - - "source_text_ids": - An `int64` Tensor of shape `[batch_size, max_time]` - containing the token indexes of source sequences. - - "source_length": - An `int` Tensor of shape `[batch_size]` containing the - length of each source sequence in the batch (including BOS and/or - EOS if added). - - "target_text": - A string Tensor as "source_text" but for target sequences. By - default both BOS and EOS are added. - - "target_text_ids": - An `int64` Tensor as "source_text_ids" but for target sequences. - - "target_length": - An `int` Tensor of shape `[batch_size]` as "source_length" but for - target sequences. - - If :attr:`'variable_utterance'` is set to `True` in :attr:`'source_dataset'` - and/or :attr:`'target_dataset'` of :attr:`hparams`, the corresponding - fields "source_*" and/or "target_*" are respectively changed to contain - variable utterance text data, as in :class:`~texar.tf.data.MonoTextData`. - - The above field names can be accessed through :attr:`source_text_name`, - :attr:`source_text_id_name`, :attr:`source_length_name`, - :attr:`source_utterance_cnt_name`, and those prefixed with `target_`, - respectively. - - Example: - - .. code-block:: python - - hparams={ - 'source_dataset': {'files': 's', 'vocab_file': 'vs'}, - 'target_dataset': {'files': ['t1', 't2'], 'vocab_file': 'vt'}, - 'batch_size': 1 - } - data = PairedTextData(hparams) - iterator = DataIterator(data) - batch = iterator.get_next() - - iterator.switch_to_dataset(sess) # initializes the dataset - batch_ = sess.run(batch) - # batch_ == { - # 'source_text': [['source', 'sequence', '']], - # 'source_text_ids': [[5, 10, 2]], - # 'source_length': [3] - # 'target_text': [['', 'target', 'sequence', '1', '']], - # 'target_text_ids': [[1, 6, 10, 20, 2]], - # 'target_length': [5] - # } - - """ - def __init__(self, hparams): - TextDataBase.__init__(self, hparams) - with tf.name_scope(self.name, self.default_hparams()["name"]): - self._make_data() - - @staticmethod - def default_hparams(): - """Returns a dicitionary of default hyperparameters. - - .. code-block:: python - - { - # (1) Hyperparams specific to text dataset - "source_dataset": { - "files": [], - "compression_type": None, - "vocab_file": "", - "embedding_init": {}, - "delimiter": " ", - "max_seq_length": None, - "length_filter_mode": "truncate", - "pad_to_max_seq_length": False, - "bos_token": None, - "eos_token": "", - "other_transformations": [], - "variable_utterance": False, - "utterance_delimiter": "|||", - "max_utterance_cnt": 5, - "data_name": "source", - }, - "target_dataset": { - # ... 
- # Same fields are allowed as in "source_dataset" with the - # same default values, except the - # following new fields/values: - "bos_token": "" - "vocab_share": False, - "embedding_init_share": False, - "processing_share": False, - "data_name": "target" - } - # (2) General hyperparams - "num_epochs": 1, - "batch_size": 64, - "allow_smaller_final_batch": True, - "shuffle": True, - "shuffle_buffer_size": None, - "shard_and_shuffle": False, - "num_parallel_calls": 1, - "prefetch_buffer_size": 0, - "max_dataset_size": -1, - "seed": None, - "name": "paired_text_data", - # (3) Bucketing - "bucket_boundaries": [], - "bucket_batch_sizes": None, - "bucket_length_fn": None, - } - - Here: - - 1. Hyperparameters in the :attr:`"source_dataset"` and - attr:`"target_dataset"` fields have the same definition as those - in :meth:`texar.tf.data.MonoTextData.default_hparams`, for source and - target text, respectively. - - For the new hyperparameters in "target_dataset": - - "vocab_share": bool - Whether to share the vocabulary of source. - If `True`, the vocab file of target is ignored. - - "embedding_init_share": bool - Whether to share the embedding initial value of source. If - `True`, :attr:`"embedding_init"` of target is ignored. - - :attr:`"vocab_share"` must be true to share the embedding - initial value. - - "processing_share": bool - Whether to share the processing configurations of source, - including - "delimiter", "bos_token", "eos_token", and - "other_transformations". - - 2. For the **general** hyperparameters, see - :meth:`texar.tf.data.DataBase.default_hparams` for details. - - 3. For **bucketing** hyperparameters, see - :meth:`texar.tf.data.MonoTextData.default_hparams` for details, except - that the default bucket_length_fn is the maximum sequence length - of source and target sequences. - - """ - hparams = TextDataBase.default_hparams() - hparams["name"] = "paired_text_data" - hparams.update(_default_paired_text_dataset_hparams()) - return hparams - - @staticmethod - def make_vocab(src_hparams, tgt_hparams): - """Reads vocab files and returns source vocab and target vocab. - - Args: - src_hparams (dict or HParams): Hyperparameters of source dataset. - tgt_hparams (dict or HParams): Hyperparameters of target dataset. - - Returns: - A pair of :class:`texar.tf.data.Vocab` instances. The two instances - may be the same objects if source and target vocabs are shared - and have the same other configs. 
- """ - src_vocab = MonoTextData.make_vocab(src_hparams) - - if tgt_hparams["processing_share"]: - tgt_bos_token = src_hparams["bos_token"] - tgt_eos_token = src_hparams["eos_token"] - else: - tgt_bos_token = tgt_hparams["bos_token"] - tgt_eos_token = tgt_hparams["eos_token"] - tgt_bos_token = utils.default_str(tgt_bos_token, - SpecialTokens.BOS) - tgt_eos_token = utils.default_str(tgt_eos_token, - SpecialTokens.EOS) - if tgt_hparams["vocab_share"]: - if tgt_bos_token == src_vocab.bos_token and \ - tgt_eos_token == src_vocab.eos_token: - tgt_vocab = src_vocab - else: - tgt_vocab = Vocab(src_hparams["vocab_file"], - bos_token=tgt_bos_token, - eos_token=tgt_eos_token) - else: - tgt_vocab = Vocab(tgt_hparams["vocab_file"], - bos_token=tgt_bos_token, - eos_token=tgt_eos_token) - - return src_vocab, tgt_vocab - - @staticmethod - def make_embedding(src_emb_hparams, src_token_to_id_map, - tgt_emb_hparams=None, tgt_token_to_id_map=None, - emb_init_share=False): - """Optionally loads source and target embeddings from files - (if provided), and returns respective :class:`texar.tf.data.Embedding` - instances. - """ - src_embedding = MonoTextData.make_embedding(src_emb_hparams, - src_token_to_id_map) - - if emb_init_share: - tgt_embedding = src_embedding - else: - tgt_emb_file = tgt_emb_hparams["file"] - tgt_embedding = None - if tgt_emb_file is not None and tgt_emb_file != "": - tgt_embedding = Embedding(tgt_token_to_id_map, tgt_emb_hparams) - - return src_embedding, tgt_embedding - - def _make_dataset(self): - src_dataset = tf.data.TextLineDataset( - self._hparams.source_dataset.files, - compression_type=self._hparams.source_dataset.compression_type) - tgt_dataset = tf.data.TextLineDataset( - self._hparams.target_dataset.files, - compression_type=self._hparams.target_dataset.compression_type) - return tf.data.Dataset.zip((src_dataset, tgt_dataset)) - - @staticmethod - def _get_name_prefix(src_hparams, tgt_hparams): - name_prefix = [ - src_hparams["data_name"], tgt_hparams["data_name"]] - if name_prefix[0] == name_prefix[1]: - raise ValueError("'data_name' of source and target " - "datasets cannot be the same.") - return name_prefix - - @staticmethod - def _make_processor(src_hparams, tgt_hparams, data_spec, name_prefix): - # Create source data decoder - data_spec_i = data_spec.get_ith_data_spec(0) - src_decoder, src_trans, data_spec_i = MonoTextData._make_processor( - src_hparams, data_spec_i, chained=False) - data_spec.set_ith_data_spec(0, data_spec_i, 2) - - # Create target data decoder - tgt_proc_hparams = tgt_hparams - if tgt_hparams["processing_share"]: - tgt_proc_hparams = copy.copy(src_hparams) - try: - tgt_proc_hparams["variable_utterance"] = \ - tgt_hparams["variable_utterance"] - except TypeError: - tgt_proc_hparams.variable_utterance = \ - tgt_hparams["variable_utterance"] - data_spec_i = data_spec.get_ith_data_spec(1) - tgt_decoder, tgt_trans, data_spec_i = MonoTextData._make_processor( - tgt_proc_hparams, data_spec_i, chained=False) - data_spec.set_ith_data_spec(1, data_spec_i, 2) - - tran_fn = dsutils.make_combined_transformation( - [[src_decoder] + src_trans, [tgt_decoder] + tgt_trans], - name_prefix=name_prefix) - - data_spec.add_spec(name_prefix=name_prefix) - - return tran_fn, data_spec - - @staticmethod - def _make_length_filter(src_hparams, tgt_hparams, - src_length_name, tgt_length_name, - src_decoder, tgt_decoder): - src_filter_fn = MonoTextData._make_length_filter( - src_hparams, src_length_name, src_decoder) - tgt_filter_fn = MonoTextData._make_length_filter( - tgt_hparams, 
tgt_length_name, tgt_decoder) - combined_filter_fn = dsutils._make_combined_filter_fn( - [src_filter_fn, tgt_filter_fn]) - return combined_filter_fn - - def _process_dataset(self, dataset, hparams, data_spec): - name_prefix = PairedTextData._get_name_prefix( - hparams["source_dataset"], hparams["target_dataset"]) - tran_fn, data_spec = self._make_processor( - hparams["source_dataset"], hparams["target_dataset"], - data_spec, name_prefix=name_prefix) - - num_parallel_calls = hparams["num_parallel_calls"] - dataset = dataset.map( - lambda *args: tran_fn(dsutils.maybe_tuple(args)), - num_parallel_calls=num_parallel_calls) - - # Filters by length - src_length_name = dsutils._connect_name( - data_spec.name_prefix[0], - data_spec.decoder[0].length_tensor_name) - tgt_length_name = dsutils._connect_name( - data_spec.name_prefix[1], - data_spec.decoder[1].length_tensor_name) - filter_fn = self._make_length_filter( - hparams["source_dataset"], hparams["target_dataset"], - src_length_name, tgt_length_name, - data_spec.decoder[0], data_spec.decoder[1]) - if filter_fn: - dataset = dataset.filter(filter_fn) - - # Truncates data count - dataset = dataset.take(hparams["max_dataset_size"]) - - return dataset, data_spec - - def _make_bucket_length_fn(self): - length_fn = self._hparams.bucket_length_fn - if not length_fn: - length_fn = lambda x: tf.maximum( - x[self.source_length_name], x[self.target_length_name]) - elif not is_callable(length_fn): - # pylint: disable=redefined-variable-type - length_fn = utils.get_function(length_fn, ["texar.tf.custom"]) - return length_fn - - def _make_padded_shapes(self, dataset, src_decoder, tgt_decoder): - src_text_and_id_shapes = {} - if self._hparams.source_dataset.pad_to_max_seq_length: - src_text_and_id_shapes = \ - MonoTextData._make_padded_text_and_id_shapes( - dataset, self._hparams.source_dataset, src_decoder, - self.source_text_name, self.source_text_id_name) - - tgt_text_and_id_shapes = {} - if self._hparams.target_dataset.pad_to_max_seq_length: - tgt_text_and_id_shapes = \ - MonoTextData._make_padded_text_and_id_shapes( - dataset, self._hparams.target_dataset, tgt_decoder, - self.target_text_name, self.target_text_id_name) - - padded_shapes = dataset.output_shapes - padded_shapes.update(src_text_and_id_shapes) - padded_shapes.update(tgt_text_and_id_shapes) - - return padded_shapes - - def _make_data(self): - self._src_vocab, self._tgt_vocab = self.make_vocab( - self._hparams.source_dataset, self._hparams.target_dataset) - - tgt_hparams = self._hparams.target_dataset - if not tgt_hparams.vocab_share and tgt_hparams.embedding_init_share: - raise ValueError("embedding_init can be shared only when vocab " - "is shared. Got `vocab_share=False, " - "emb_init_share=True`.") - self._src_embedding, self._tgt_embedding = self.make_embedding( - self._hparams.source_dataset.embedding_init, - self._src_vocab.token_to_id_map_py, - self._hparams.target_dataset.embedding_init, - self._tgt_vocab.token_to_id_map_py, - self._hparams.target_dataset.embedding_init_share) - - # Create dataset - dataset = self._make_dataset() - dataset, dataset_size = self._shuffle_dataset( - dataset, self._hparams, self._hparams.source_dataset.files) - self._dataset_size = dataset_size - - # Processing. 
- data_spec = dsutils._DataSpec( - dataset=dataset, dataset_size=self._dataset_size, - vocab=[self._src_vocab, self._tgt_vocab], - embedding=[self._src_embedding, self._tgt_embedding]) - dataset, data_spec = self._process_dataset( - dataset, self._hparams, data_spec) - self._data_spec = data_spec - self._decoder = data_spec.decoder - self._src_decoder = data_spec.decoder[0] - self._tgt_decoder = data_spec.decoder[1] - - # Batching - length_fn = self._make_bucket_length_fn() - padded_shapes = self._make_padded_shapes( - dataset, self._src_decoder, self._tgt_decoder) - dataset = self._make_batch( - dataset, self._hparams, length_fn, padded_shapes) - - # Prefetching - if self._hparams.prefetch_buffer_size > 0: - dataset = dataset.prefetch(self._hparams.prefetch_buffer_size) - - self._dataset = dataset - - def list_items(self): - """Returns the list of item names that the data can produce. - - Returns: - A list of strings. - """ - return list(self._dataset.output_types.keys()) - - @property - def dataset(self): - """The dataset. - """ - return self._dataset - - def dataset_size(self): - """Returns the number of data instances in the dataset. - - Note that this is the total data count in the raw files, before any - filtering and truncation. - """ - if not self._dataset_size: - # pylint: disable=attribute-defined-outside-init - self._dataset_size = count_file_lines( - self._hparams.source_dataset.files) - return self._dataset_size - - @property - def vocab(self): - """A pair instances of :class:`~texar.tf.data.Vocab` that are source - and target vocabs, respectively. - """ - return self._src_vocab, self._tgt_vocab - - @property - def source_vocab(self): - """The source vocab, an instance of :class:`~texar.tf.data.Vocab`. - """ - return self._src_vocab - - @property - def target_vocab(self): - """The target vocab, an instance of :class:`~texar.tf.data.Vocab`. - """ - return self._tgt_vocab - - @property - def source_embedding_init_value(self): - """The `Tensor` containing the embedding value of source data - loaded from file. `None` if embedding is not specified. - """ - if self._src_embedding is None: - return None - return self._src_embedding.word_vecs - - @property - def target_embedding_init_value(self): - """The `Tensor` containing the embedding value of target data - loaded from file. `None` if embedding is not specified. - """ - if self._tgt_embedding is None: - return None - return self._tgt_embedding.word_vecs - - def embedding_init_value(self): - """A pair of `Tensor` containing the embedding values of source and - target data loaded from file. - """ - src_emb = self.source_embedding_init_value - tgt_emb = self.target_embedding_init_value - return src_emb, tgt_emb - - @property - def source_text_name(self): - """The name of the source text tensor, "source_text" by default. - """ - name = dsutils._connect_name( - self._data_spec.name_prefix[0], - self._src_decoder.text_tensor_name) - return name - - @property - def source_length_name(self): - """The name of the source length tensor, "source_length" by default. - """ - name = dsutils._connect_name( - self._data_spec.name_prefix[0], - self._src_decoder.length_tensor_name) - return name - - @property - def source_text_id_name(self): - """The name of the source text index tensor, "source_text_ids" by - default. 
- """ - name = dsutils._connect_name( - self._data_spec.name_prefix[0], - self._src_decoder.text_id_tensor_name) - return name - - @property - def source_utterance_cnt_name(self): - """The name of the source text utterance count tensor, - "source_utterance_cnt" by default. - """ - if not self._hparams.source_dataset.variable_utterance: - raise ValueError( - "`utterance_cnt_name` of source data is undefined.") - name = dsutils._connect_name( - self._data_spec.name_prefix[0], - self._src_decoder.utterance_cnt_tensor_name) - return name - - @property - def target_text_name(self): - """The name of the target text tensor, "target_text" bt default. - """ - name = dsutils._connect_name( - self._data_spec.name_prefix[1], - self._tgt_decoder.text_tensor_name) - return name - - @property - def target_length_name(self): - """The name of the target length tensor, "target_length" by default. - """ - name = dsutils._connect_name( - self._data_spec.name_prefix[1], - self._tgt_decoder.length_tensor_name) - return name - - @property - def target_text_id_name(self): - """The name of the target text index tensor, "target_text_ids" by - default. - """ - name = dsutils._connect_name( - self._data_spec.name_prefix[1], - self._tgt_decoder.text_id_tensor_name) - return name - - @property - def target_utterance_cnt_name(self): - """The name of the target text utterance count tensor, - "target_utterance_cnt" by default. - """ - if not self._hparams.target_dataset.variable_utterance: - raise ValueError( - "`utterance_cnt_name` of target data is undefined.") - name = dsutils._connect_name( - self._data_spec.name_prefix[1], - self._tgt_decoder.utterance_cnt_tensor_name) - return name - - @property - def text_name(self): - """The name of text tensor, "text" by default. - """ - return self._src_decoder.text_tensor_name - - @property - def length_name(self): - """The name of length tensor, "length" by default. - """ - return self._src_decoder.length_tensor_name - - @property - def text_id_name(self): - """The name of text index tensor, "text_ids" by default. - """ - return self._src_decoder.text_id_tensor_name - - @property - def utterance_cnt_name(self): - """The name of the text utterance count tensor, "utterance_cnt" by - default. - """ - if self._hparams.source_dataset.variable_utterance: - return self._src_decoder.utterance_cnt_tensor_name - if self._hparams.target_dataset.variable_utterance: - return self._tgt_decoder.utterance_cnt_tensor_name - raise ValueError("`utterance_cnt_name` is not defined.") diff --git a/texar/tf/data/data/paired_text_data_test.py b/texar/tf/data/data/paired_text_data_test.py deleted file mode 100644 index 16412f9b..00000000 --- a/texar/tf/data/data/paired_text_data_test.py +++ /dev/null @@ -1,213 +0,0 @@ -# -*- coding: utf-8 -*- -# -""" -Unit tests for data related operations. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import tempfile -import copy -import numpy as np - -import tensorflow as tf - -import texar.tf as tx -from texar.tf.data import SpecialTokens - -# pylint: disable=too-many-locals, too-many-branches, protected-access -# pylint: disable=invalid-name - - -class PairedTextDataTest(tf.test.TestCase): - """Tests paired text data class. 
- """ - - def setUp(self): - tf.test.TestCase.setUp(self) - - # Create test data - vocab_list = ['This', 'is', 'a', 'word', '词'] - vocab_file = tempfile.NamedTemporaryFile() - vocab_file.write('\n'.join(vocab_list).encode("utf-8")) - vocab_file.flush() - self._vocab_file = vocab_file - self._vocab_size = len(vocab_list) - - src_text = ['This is a sentence from source .', '词 词 。 source'] - src_text_file = tempfile.NamedTemporaryFile() - src_text_file.write('\n'.join(src_text).encode("utf-8")) - src_text_file.flush() - self._src_text_file = src_text_file - - tgt_text = ['This is a sentence from target .', '词 词 。 target'] - tgt_text_file = tempfile.NamedTemporaryFile() - tgt_text_file.write('\n'.join(tgt_text).encode("utf-8")) - tgt_text_file.flush() - self._tgt_text_file = tgt_text_file - - self._hparams = { - "num_epochs": 50, - "batch_size": 3, - "source_dataset": { - "files": [self._src_text_file.name], - "vocab_file": self._vocab_file.name, - }, - "target_dataset": { - "files": self._tgt_text_file.name, - "vocab_share": True, - "eos_token": "" - } - } - - def _run_and_test(self, hparams, proc_shr=False, length_inc=None, - discard_src=False): - # Construct database - text_data = tx.data.PairedTextData(hparams) - self.assertEqual( - text_data.source_vocab.size, - self._vocab_size + len(text_data.source_vocab.special_tokens)) - - iterator = text_data.dataset.make_initializable_iterator() - text_data_batch = iterator.get_next() - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - sess.run(iterator.initializer) - - if proc_shr: - tgt_eos = b'' - else: - tgt_eos = b'' - - while True: - try: - # Run the logics - data_batch_ = sess.run(text_data_batch) - self.assertEqual(set(data_batch_.keys()), - set(text_data.list_items())) - # Test matching - src_text = data_batch_['source_text'] - tgt_text = data_batch_['target_text'] - if proc_shr: - for src, tgt in zip(src_text, tgt_text): - np.testing.assert_array_equal(src[:3], tgt[:3]) - else: - for src, tgt in zip(src_text, tgt_text): - np.testing.assert_array_equal(src[:3], tgt[1:4]) - self.assertTrue( - tgt_eos in data_batch_['target_text'][0]) - - if length_inc: - for i in range(len(data_batch_['source_text'])): - text_ = data_batch_['source_text'][i].tolist() - self.assertEqual( - text_.index(b'') + 1, - data_batch_['source_length'][i] - length_inc[0]) - for i in range(len(data_batch_['target_text'])): - text_ = data_batch_['target_text'][i].tolist() - self.assertEqual( - text_.index(tgt_eos) + 1, - data_batch_['target_length'][i] - length_inc[1]) - - if discard_src: - src_hparams = text_data.hparams.source_dataset - max_l = src_hparams.max_seq_length - max_l += text_data._decoder[0].added_length - for l in data_batch_[text_data.source_length_name]: - self.assertLessEqual(l, max_l) - - except tf.errors.OutOfRangeError: - print('Done -- epoch limit reached') - break - - def test_default_setting(self): - """Tests the logics of the text data. - """ - self._run_and_test(self._hparams) - - def test_shuffle(self): - """Tests toggling shuffle. - """ - hparams = copy.copy(self._hparams) - hparams["shuffle"] = False - self._run_and_test(hparams) - - def test_processing_share(self): - """Tests sharing processing. 
- """ - hparams = copy.copy(self._hparams) - hparams["target_dataset"]["processing_share"] = True - self._run_and_test(hparams, proc_shr=True) - - def test_other_transformations(self): - """Tests use of other transformations - """ - def _transform(x, data_specs): # pylint: disable=invalid-name - x[data_specs.decoder.length_tensor_name] += 1 - return x - - hparams = copy.copy(self._hparams) - hparams["source_dataset"].update( - {"other_transformations": [_transform, _transform]}) - hparams["target_dataset"].update( - {"other_transformations": [_transform]}) - self._run_and_test(hparams, length_inc=(2, 1)) - - def test_length_filter(self): - """Tests filtering by length. - """ - hparams = copy.copy(self._hparams) - hparams["source_dataset"].update( - {"max_seq_length": 4, - "length_filter_mode": "discard"}) - self._run_and_test(hparams, discard_src=True) - - # def test_sequence_length(self): - # hparams = { - # "batch_size": 64, - # "num_epochs": 1, - # "shuffle": False, - # "allow_smaller_final_batch": False, - # "source_dataset": { - # "files": "../../../data/yelp/sentiment.dev.sort.0", - # "vocab_file": "../../../data/yelp/vocab", - # "bos_token": SpecialTokens.BOS, - # "eos_token": SpecialTokens.EOS, - # }, - # "target_dataset": { - # "files": "../../../data/yelp/sentiment.dev.sort.1", - # "vocab_share": True, - # }, - # } - # data = tx.data.PairedTextData(hparams) - - # iterator = tx.data.TrainTestDataIterator(val=data) - # text_data_batch = iterator.get_next() - - # with self.test_session() as sess: - # sess.run(tf.global_variables_initializer()) - # sess.run(tf.local_variables_initializer()) - # sess.run(tf.tables_initializer()) - # iterator.switch_to_val_data(sess) - - # while True: - # try: - # data_batch_ = sess.run(text_data_batch) - # src = data_batch_["source_text_ids"] - # src_len = data_batch_["source_length"] - # self.assertEqual(src.shape[1], np.max(src_len)) - # tgt = data_batch_["target_text_ids"] - # tgt_len = data_batch_["target_length"] - # self.assertEqual(tgt.shape[1], np.max(tgt_len)) - # except tf.errors.OutOfRangeError: - # break - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/data/data/scalar_data.py b/texar/tf/data/data/scalar_data.py deleted file mode 100644 index 64d2dde4..00000000 --- a/texar/tf/data/data/scalar_data.py +++ /dev/null @@ -1,264 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Various data classes that define data reading, parsing, batching, and other -preprocessing operations. 
-""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import tensorflow as tf - -from texar.tf.data.data_utils import count_file_lines -from texar.tf.data.data import dataset_utils as dsutils -from texar.tf.data.data.data_base import DataBase -from texar.tf.data.data.mono_text_data import MonoTextData -from texar.tf.data.data_decoders import ScalarDataDecoder - -# pylint: disable=invalid-name, arguments-differ, not-context-manager - -__all__ = [ - "_default_scalar_dataset_hparams", - "ScalarData" -] - - -def _default_scalar_dataset_hparams(): - """Returns hyperparameters of a scalar dataset with default values. - - See :meth:`texar.tf.data.ScalarData.default_hparams` for details. - """ - return { - "files": [], - "compression_type": None, - "data_type": "int", - "data_name": None, - "other_transformations": [], - "@no_typecheck": ["files"] - } - - -class ScalarData(DataBase): - """Scalar data where each line of the files is a scalar (int or float), - e.g., a data label. - - Args: - hparams (dict): Hyperparameters. See :meth:`default_hparams` for the - defaults. - - The processor reads and processes raw data and results in a TF dataset - whose element is a python `dict` including one field. The field name is - specified in :attr:`hparams["dataset"]["data_name"]`. If not specified, - the default name is `"data"`. The field name can be accessed through - :attr:`data_name`. - - This field is a Tensor of shape `[batch_size]` containing a batch of - scalars, of either int or float type as specified in :attr:`hparams`. - - Example: - - .. code-block:: python - - hparams={ - 'dataset': { 'files': 'data.txt', 'data_name': 'label' }, - 'batch_size': 2 - } - data = ScalarData(hparams) - iterator = DataIterator(data) - batch = iterator.get_next() - - iterator.switch_to_dataset(sess) # initializes the dataset - batch_ = sess.run(batch) - # batch_ == { - # 'label': [2, 9] - # } - """ - - def __init__(self, hparams): - DataBase.__init__(self, hparams) - with tf.name_scope(self.name, self.default_hparams()["name"]): - self._make_data() - - @staticmethod - def default_hparams(): - """Returns a dicitionary of default hyperparameters. - - .. code-block:: python - - { - # (1) Hyperparams specific to scalar dataset - "dataset": { - "files": [], - "compression_type": None, - "data_type": "int", - "other_transformations": [], - "data_name": None, - } - # (2) General hyperparams - "num_epochs": 1, - "batch_size": 64, - "allow_smaller_final_batch": True, - "shuffle": True, - "shuffle_buffer_size": None, - "shard_and_shuffle": False, - "num_parallel_calls": 1, - "prefetch_buffer_size": 0, - "max_dataset_size": -1, - "seed": None, - "name": "scalar_data", - } - - Here: - - 1. For the hyperparameters in the :attr:`"dataset"` field: - - "files": str or list - A (list of) file path(s). - - Each line contains a single scalar number. - - "compression_type": str, optional - One of "" (no compression), "ZLIB", or "GZIP". - - "data_type": str - The scalar type. Currently supports "int" and "float". - - "other_transformations": list - A list of transformation functions or function names/paths to - further transform each single data instance. - - (More documentations to be added.) - - "data_name": str - Name of the dataset. - - 2. For the **general** hyperparameters, see - :meth:`texar.tf.data.DataBase.default_hparams` for details. 
- - """ - hparams = DataBase.default_hparams() - hparams["name"] = "scalar_data" - hparams.update({ - "dataset": _default_scalar_dataset_hparams() - }) - return hparams - - @staticmethod - def _get_dtype(dtype_hparam): - if dtype_hparam == "int": - dtype = tf.int32 - elif dtype_hparam == "float": - dtype = tf.float32 - else: - raise ValueError("Unknown data type: " + dtype_hparam) - return dtype - - @staticmethod - def _make_processor(dataset_hparams, data_spec, chained=True, - name_prefix=None): - # Create data decoder - decoder = ScalarDataDecoder( - ScalarData._get_dtype(dataset_hparams["data_type"]), - data_name=name_prefix) - # Create other transformations - data_spec.add_spec(decoder=decoder) - # pylint: disable=protected-access - other_trans = MonoTextData._make_other_transformations( - dataset_hparams["other_transformations"], data_spec) - - data_spec.add_spec(name_prefix=name_prefix) - - if chained: - chained_tran = dsutils.make_chained_transformation( - [decoder] + other_trans) - return chained_tran, data_spec - else: - return decoder, other_trans, data_spec - - def _process_dataset(self, dataset, hparams, data_spec): - chained_tran, data_spec = self._make_processor( - hparams["dataset"], data_spec, - name_prefix=hparams["dataset"]["data_name"]) - num_parallel_calls = hparams["num_parallel_calls"] - dataset = dataset.map( - lambda *args: chained_tran(dsutils.maybe_tuple(args)), - num_parallel_calls=num_parallel_calls) - - # Truncates data count - dataset = dataset.take(hparams["max_dataset_size"]) - - return dataset, data_spec - - def _make_data(self): - dataset_hparams = self._hparams.dataset - - # Create and shuffle dataset - dataset = MonoTextData._make_mono_text_dataset(dataset_hparams) - dataset, dataset_size = self._shuffle_dataset( - dataset, self._hparams, self._hparams.dataset.files) - self._dataset_size = dataset_size - - # Processing - # pylint: disable=protected-access - data_spec = dsutils._DataSpec(dataset=dataset, - dataset_size=self._dataset_size) - dataset, data_spec = self._process_dataset(dataset, self._hparams, - data_spec) - self._data_spec = data_spec - self._decoder = data_spec.decoder # pylint: disable=no-member - - # Batching - dataset = self._make_batch(dataset, self._hparams) - - # Prefetching - if self._hparams.prefetch_buffer_size > 0: - dataset = dataset.prefetch(self._hparams.prefetch_buffer_size) - - self._dataset = dataset - - def list_items(self): - """Returns the list of item names that the data can produce. - - Returns: - A list of strings. - """ - return list(self._dataset.output_types.keys()) - - @property - def dataset(self): - """The dataset. - """ - return self._dataset - - def dataset_size(self): - """Returns the number of data instances in the dataset. - - Note that this is the total data count in the raw files, before any - filtering and truncation. - """ - if not self._dataset_size: - # pylint: disable=attribute-defined-outside-init - self._dataset_size = count_file_lines( - self._hparams.dataset.files) - return self._dataset_size - - @property - def data_name(self): - """The name of the data tensor, "data" by default if not specified in - :attr:`hparams`. - """ - return self._decoder.data_tensor_name diff --git a/texar/tf/data/data/scalar_data_test.py b/texar/tf/data/data/scalar_data_test.py deleted file mode 100644 index 846cb938..00000000 --- a/texar/tf/data/data/scalar_data_test.py +++ /dev/null @@ -1,139 +0,0 @@ -# -*- coding: utf-8 -*- -# -""" -Unit tests for data related operations. 
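As `_get_dtype` above makes explicit, `"data_type"` admits exactly two values; anything else raises `ValueError`. The mapping can be exercised directly:

.. code-block:: python

    import tensorflow as tf
    import texar.tf as tx

    assert tx.data.ScalarData._get_dtype("int") is tf.int32
    assert tx.data.ScalarData._get_dtype("float") is tf.float32
    # Any other string raises, e.g.
    # tx.data.ScalarData._get_dtype("double") -> ValueError("Unknown data type: double")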
-""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import copy -import tempfile -import numpy as np - -import tensorflow as tf - -import texar.tf as tx - - -class ScalarDataTest(tf.test.TestCase): - """Tests scalar data class. - """ - - def setUp(self): - tf.test.TestCase.setUp(self) - - # Create test data - # pylint: disable=no-member - int_data = np.linspace(0, 100, num=101, dtype=np.int32).tolist() - int_data = [str(i) for i in int_data] - int_file = tempfile.NamedTemporaryFile() - int_file.write('\n'.join(int_data).encode("utf-8")) - int_file.flush() - self._int_file = int_file - - self._int_hparams = { - "num_epochs": 1, - "batch_size": 1, - "shuffle": False, - "dataset": { - "files": self._int_file.name, - "data_type": "int", - "data_name": "label" - } - } - - self._float_hparams = { - "num_epochs": 1, - "batch_size": 1, - "shuffle": False, - "dataset": { - "files": self._int_file.name, - "data_type": "float", - "data_name": "feat" - } - } - - def _run_and_test(self, hparams): - # Construct database - scalar_data = tx.data.ScalarData(hparams) - - self.assertEqual(scalar_data.list_items()[0], - hparams["dataset"]["data_name"]) - - iterator = scalar_data.dataset.make_initializable_iterator() - data_batch = iterator.get_next() - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - sess.run(iterator.initializer) - - i = 0 - while True: - try: - # Run the logics - data_batch_ = sess.run(data_batch) - self.assertEqual(set(data_batch_.keys()), - set(scalar_data.list_items())) - value = data_batch_[scalar_data.data_name][0] - self.assertEqual(i, value) - i += 1 - # pylint: disable=no-member - if hparams["dataset"]["data_type"] == "int": - self.assertTrue(isinstance(value, np.int32)) - else: - self.assertTrue(isinstance(value, np.float32)) - except tf.errors.OutOfRangeError: - print('Done -- epoch limit reached') - break - - def test_default_setting(self): - """Tests the logics of ScalarData. - """ - self._run_and_test(self._int_hparams) - self._run_and_test(self._float_hparams) - - def test_shuffle(self): - """Tests results of toggling shuffle. 
- """ - hparams = copy.copy(self._int_hparams) - hparams["batch_size"] = 10 - scalar_data = tx.data.ScalarData(hparams) - iterator = scalar_data.dataset.make_initializable_iterator() - data_batch = iterator.get_next() - - hparams_sfl = copy.copy(hparams) - hparams_sfl["shuffle"] = True - scalar_data_sfl = tx.data.ScalarData(hparams_sfl) - iterator_sfl = scalar_data_sfl.dataset.make_initializable_iterator() - data_batch_sfl = iterator_sfl.get_next() - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - sess.run(iterator.initializer) - sess.run(iterator_sfl.initializer) - - vals = [] - vals_sfl = [] - while True: - try: - # Run the logics - data_batch_, data_batch_sfl_ = sess.run([data_batch, - data_batch_sfl]) - vals += data_batch_[scalar_data.data_name].tolist() - vals_sfl += data_batch_sfl_[scalar_data.data_name].tolist() - except tf.errors.OutOfRangeError: - print('Done -- epoch limit reached') - break - self.assertEqual(len(vals), len(vals_sfl)) - self.assertSetEqual(set(vals), set(vals_sfl)) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/data/data/text_data_base.py b/texar/tf/data/data/text_data_base.py index 7d3d46c8..187bc7d9 100644 --- a/texar/tf/data/data/text_data_base.py +++ b/texar/tf/data/data/text_data_base.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,28 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Base text data class that is enherited by all text data classes. +Base text data class that is inherited by all text data classes. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - import tensorflow as tf from texar.tf.data.data.data_base import DataBase -from texar.tf.data.data import dataset_utils as dsutils +from texar.tf.data.data.dataset_utils import maybe_tuple, \ + _make_smaller_batch_filter_fn -# pylint: disable=protected-access, arguments-differ __all__ = [ "TextDataBase" ] -class TextDataBase(DataBase): # pylint: disable=too-few-public-methods - """Base class inheritted by all text data classes. +class TextDataBase(DataBase): + r"""Base class inherited by all text data classes. """ def __init__(self, hparams): @@ -41,7 +36,7 @@ def __init__(self, hparams): @staticmethod def default_hparams(): - """Returns a dictionary of default hyperparameters. + r"""Returns a dictionary of default hyperparameters. See the specific subclasses for the details. 
""" @@ -60,32 +55,32 @@ def _make_batch(dataset, hparams, element_length_func, batch_size = hparams["batch_size"] bucket_boundaries = hparams["bucket_boundaries"] if padded_shapes is None: - padded_shapes = dataset.output_shapes + padded_shapes = tf.compat.v1.data.get_output_shapes(dataset) if len(bucket_boundaries) == 0: if hparams["allow_smaller_final_batch"]: dataset = dataset.padded_batch( batch_size, padded_shapes, padding_values=padding_values) else: - dataset = dataset.apply( - tf.contrib.data.padded_batch_and_drop_remainder( - batch_size, padded_shapes, - padding_values=padding_values)) + dataset = dataset.padded_batch( + batch_size, padded_shapes, + padding_values=padding_values, drop_remainder=True) else: bucket_batch_size = hparams["bucket_batch_sizes"] if bucket_batch_size is None: bucket_batch_size = [batch_size] * (len(bucket_boundaries) + 1) - dataset = dataset.apply(tf.contrib.data.bucket_by_sequence_length( - element_length_func, bucket_boundaries, bucket_batch_size, - padded_shapes=padded_shapes, padding_values=padding_values)) + dataset = dataset.apply( + tf.data.experimental.bucket_by_sequence_length( + element_length_func, bucket_boundaries, bucket_batch_size, + padded_shapes=padded_shapes, padding_values=padding_values)) if not hparams["allow_smaller_final_batch"]: if len(set(bucket_batch_size)) > 1: raise ValueError( "Batch size of every bucket must be the same if " "smaller final batch is not allowed.") batch_size = bucket_batch_size[0] - filter_fn = dsutils._make_smaller_batch_filter_fn(batch_size) + filter_fn = _make_smaller_batch_filter_fn(batch_size) dataset = dataset.filter( - lambda *args: filter_fn(dsutils.maybe_tuple(args))) + lambda *args: filter_fn(maybe_tuple(args))) return dataset diff --git a/texar/tf/data/data/tfrecord_data.py b/texar/tf/data/data/tfrecord_data.py index 95688647..7ad3c765 100644 --- a/texar/tf/data/data/tfrecord_data.py +++ b/texar/tf/data/data/tfrecord_data.py @@ -15,11 +15,6 @@ Data class that supports reading TFRecord data and data type converting. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - import tensorflow as tf from texar.tf.data.data import dataset_utils as dsutils @@ -27,7 +22,6 @@ from texar.tf.data.data.mono_text_data import MonoTextData from texar.tf.data.data_decoders import TFRecordDataDecoder -# pylint: disable=invalid-name, arguments-differ, not-context-manager __all__ = [ "_default_tfrecord_dataset_hparams", @@ -36,7 +30,7 @@ def _default_tfrecord_dataset_hparams(): - """Returns hyperparameters of a TFRecord dataset with default values. + r"""Returns hyperparameters of a TFRecord dataset with default values. See :meth:`texar.tf.data.TFRecordData.default_hparams` for details. """ @@ -60,7 +54,7 @@ def _default_tfrecord_dataset_hparams(): class TFRecordData(DataBase): - """TFRecord data which loads and processes TFRecord files. + r"""TFRecord data which loads and processes TFRecord files. This module can be used to process image data, features, etc. @@ -98,6 +92,7 @@ class TFRecordData(DataBase): batch = iterator.get_next() iterator.switch_to_dataset(sess) # initializes the dataset + TODO: This should be modified in the future. batch_ = sess.run(batch) # batch_ == { # 'data': { @@ -134,6 +129,7 @@ class TFRecordData(DataBase): batch = iterator.get_next() iterator.switch_to_dataset(sess) # initializes the dataset + TODO: This should be modified in the future. 
batch_ = sess.run(batch) # batch_ == { # 'data': { @@ -150,12 +146,12 @@ class TFRecordData(DataBase): def __init__(self, hparams): DataBase.__init__(self, hparams) - with tf.name_scope(self.name, self.default_hparams()["name"]): + with tf.name_scope(self.name): self._make_data() @staticmethod def default_hparams(): - """Returns a dicitionary of default hyperparameters. + r"""Returns a dictionary of default hyperparameters. .. code-block:: python @@ -189,10 +185,10 @@ def default_hparams(): 1. For the hyperparameters in the :attr:`"dataset"` field: - "files": str or list + `"files"`: str or list A (list of) TFRecord file path(s). - "feature_original_types": dict + `"feature_original_types"`: dict The feature names (str) with their data types and length types, key and value in pair `feature_name: [dtype, feature_len_type, len]`, @@ -219,7 +215,7 @@ def default_hparams(): "name_lists": ["tf.string", "VarLenFeature"], } - "feature_convert_types": dict, optional + `"feature_convert_types"`: dict, optional Specifies dtype converting after reading the data files. This `dict` maps feature names to desired data dtypes. For example, you can first read a feature into dtype `tf.float64` by @@ -243,7 +239,7 @@ def default_hparams(): "label_ids": "tf.int32", } - "image_options": dict, optional + `"image_options"`: dict, optional Specifies the image feature name and performs image resizing, includes three fields: @@ -258,11 +254,11 @@ def default_hparams(): If either `resize_height` or `resize_width` is not set, image data will be restored with original shape. - "num_shards": int, optional + `"num_shards"`: int, optional The number of data shards in distributed mode. Usually set to the number of processes in distributed computing. Used in combination with :attr:`"shard_id"`. - "shard_id": int, optional + `"shard_id"`: int, optional Sets the unique id to identify a shard. The module will processes only the corresponding shard of the whole data. Used in combination with :attr:`"num_shards"`. @@ -293,10 +289,10 @@ def default_hparams(): Also refer to `examples/bert` for a use case. - "other_transformations": list + `"other_transformations"`: list A list of transformation functions or function names/paths to further transform each single data instance. - "data_name": str + `"data_name"`: str Name of the dataset. 2. For the **general** hyperparameters, see @@ -386,7 +382,8 @@ def list_items(self): Returns: A list of strings. """ - return sorted(list(self._dataset.output_types.keys())) + return sorted(list(tf.compat.v1.data.get_output_types( + self._dataset).keys())) @property def feature_names(self): diff --git a/texar/tf/data/data/tfrecord_data_test.py b/texar/tf/data/data/tfrecord_data_test.py index 457f4240..0a285053 100644 --- a/texar/tf/data/data/tfrecord_data_test.py +++ b/texar/tf/data/data/tfrecord_data_test.py @@ -1,16 +1,8 @@ -# -*- coding: utf-8 -*- -# """ Unit tests for data related operations. 
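Tying the `"dataset"` fields above together, a minimal configuration sketch (file and feature names are hypothetical):

.. code-block:: python

    import texar.tf as tx

    hparams = {
        "batch_size": 1,
        "dataset": {
            "files": "data.tfrecord",
            "feature_original_types": {
                "image_raw": ["tf.string", "FixedLenFeature"],
                "label": ["tf.int64", "FixedLenFeature"],
            },
            "feature_convert_types": {
                "label": "tf.int32",      # cast after parsing
            },
            "image_options": {
                "image_feature_name": "image_raw",
                "resize_height": 64,
                "resize_width": 64,
            },
            "data_name": "data",
        },
    }
    data = tx.data.TFRecordData(hparams)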
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - import os -import sys import copy import shutil import tempfile @@ -113,7 +105,7 @@ def _image_example(image_string, image_shape, label): self._test_dir, 'test.tfrecord') # Prepare Validation data - with tf.python_io.TFRecordWriter(_tfrecord_filepath) as writer: + with tf.io.TFRecordWriter(_tfrecord_filepath) as writer: for image_path, label in _toy_image_labels_valid.items(): with open(image_path, 'rb') as fid: @@ -156,76 +148,73 @@ def tearDown(self): def _run_and_test(self, hparams): # Construct database tfrecord_data = tx.data.TFRecordData(hparams) - iterator = tfrecord_data.dataset.make_initializable_iterator() - data_batch = iterator.get_next() - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) - sess.run(tf.tables_initializer()) - sess.run(iterator.initializer) - i = 0 - - def _prod(lst): - res = 1 - for i in lst: - res *= i - return res - while True: - try: - # Run the logics - data_batch_ = sess.run(data_batch) - self.assertEqual( - set(data_batch_.keys()), - set(tfrecord_data.list_items())) - - # Check data consistency - for key in self._unconvert_features: - value = data_batch_[key][0] - self.assertEqual(value, self._dataset_valid[key][i]) - self.assertEqual( - list(data_batch_['shape'].values), - list(self._dataset_valid['shape'][i])) - - # Check data type conversion - for key, item in self._feature_convert_types.items(): - value = data_batch_[key][0] - if item == 'tf.string' or item is tf.string: - self.assertTrue(isinstance(value, bytes)) - else: - dtype_matched = ( - tx.utils.dtypes.get_tf_dtype(str(value.dtype)) + iterator = tf.compat.v1.data.make_one_shot_iterator( + tfrecord_data.dataset) + + i = 0 + + def _prod(lst): + res = 1 + for i in lst: + res *= i + return res + + while True: + try: + # Run the logics + data_batch_ = iterator.get_next() + self.assertEqual( + set(data_batch_.keys()), + set(tfrecord_data.list_items())) + + # Check data consistency + for key in self._unconvert_features: + value = data_batch_[key][0] + self.assertEqual(value, self._dataset_valid[key][i]) + self.assertEqual( + list(data_batch_['shape'].values), + list(self._dataset_valid['shape'][i])) + + # Check data type conversion + for key, item in self._feature_convert_types.items(): + value = data_batch_[key][0] + if item == 'tf.string' or item is tf.string: + self.assertTrue(isinstance(value.numpy(), bytes)) + else: + dtype_matched = ( + tx.utils.dtypes.get_tf_dtype(str( + value.dtype.name)) is tx.utils.dtypes.get_tf_dtype(item)) - self.assertTrue(dtype_matched) - - # Check image decoding and resize - if hparams["dataset"].get("image_options"): - image_options = hparams["dataset"].get("image_options") - if isinstance(image_options, dict): - image_options = [image_options] - for image_option_feature in image_options: - image_key = image_option_feature.get( - "image_feature_name") - if image_key is None: - continue - image_gen = data_batch_[image_key][0] - image_valid_shape = self._dataset_valid["shape"][i] - resize_height = image_option_feature.get( - "resize_height") - resize_width = image_option_feature.get( - "resize_width") - if resize_height and resize_width: - self.assertEqual( - image_gen.shape[0] * image_gen.shape[1], - resize_height * resize_width) - else: - self.assertEqual( - _prod(image_gen.shape), - _prod(image_valid_shape)) - i += 1 - except 
tf.errors.OutOfRangeError: - print('Done -- epoch limit reached') - break + self.assertTrue(dtype_matched) + + # Check image decoding and resize + if hparams["dataset"].get("image_options"): + image_options = hparams["dataset"].get("image_options") + if isinstance(image_options, dict): + image_options = [image_options] + for image_option_feature in image_options: + image_key = image_option_feature.get( + "image_feature_name") + if image_key is None: + continue + image_gen = data_batch_[image_key][0] + image_valid_shape = self._dataset_valid["shape"][i] + resize_height = image_option_feature.get( + "resize_height") + resize_width = image_option_feature.get( + "resize_width") + if resize_height and resize_width: + self.assertEqual( + image_gen.shape[0] * image_gen.shape[1], + resize_height * resize_width) + else: + self.assertEqual( + _prod(image_gen.shape), + _prod(image_valid_shape)) + i += 1 + except tf.errors.OutOfRangeError: + print('Done -- epoch limit reached') + break def test_default_setting(self): """Tests the logics of TFRecordData. diff --git a/texar/tf/data/data_decoders.py b/texar/tf/data/data_decoders.py index 4277c737..c3e266d9 100644 --- a/texar/tf/data/data_decoders.py +++ b/texar/tf/data/data_decoders.py @@ -1,5 +1,4 @@ -# -*- coding: utf-8 -*- -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,24 +16,15 @@ reading raw text data. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - import numpy as np import tensorflow as tf -from tensorflow.contrib.slim.python.slim.data import data_decoder from texar.tf.data.vocabulary import SpecialTokens from texar.tf.utils import dtypes from texar.tf.hyperparams import HParams -# pylint: disable=too-many-instance-attributes, too-many-arguments, -# pylint: disable=no-member, invalid-name - __all__ = [ "ScalarDataDecoder", "TextDataDecoder", @@ -47,8 +37,8 @@ def _append_token(token): return token is not None and token != "" -class ScalarDataDecoder(data_decoder.DataDecoder): - """A data decoder that decodes a scalar, e.g., int label or float number. +class ScalarDataDecoder: + r"""A data decoder that decodes a scalar, e.g., int label or float number. The only operation is to cast the data into a specified data type. @@ -69,7 +59,7 @@ def __call__(self, data): return dict(zip(self.list_items(), outputs)) def decode(self, data, items): - """Decodes the data to return the tensors specified by the list of + r"""Decodes the data to return the tensors specified by the list of items. Args: @@ -82,7 +72,8 @@ def decode(self, data, items): """ data = tf.reshape(data, shape=[]) if data.dtype is tf.string: - decoded_data = tf.string_to_number(data, out_type=self._dtype) + decoded_data = tf.compat.v1.string_to_number(data, + out_type=self._dtype) else: decoded_data = tf.cast(data, self._dtype) outputs = { @@ -91,7 +82,7 @@ def decode(self, data, items): return [outputs[item] for item in items] def list_items(self): - """Returns the list of item names that the decoder can produce. + r"""Returns the list of item names that the decoder can produce. Returns: A list of strings can be passed to :meth:`decode()`. @@ -100,13 +91,13 @@ def list_items(self): @property def data_tensor_name(self): - """The name of the data tensor. 
+ r"""The name of the data tensor. """ return self._data_name -class TextDataDecoder(data_decoder.DataDecoder): - """A text data decoder that decodes raw text data. +class TextDataDecoder: + r"""A text data decoder that decodes raw text data. Operations include splitting on word or character level, truncation, inserting special tokens, mapping text units to indexes, etc. @@ -164,7 +155,7 @@ def __call__(self, data): return dict(zip(self.list_items(), outputs)) def decode(self, data, items): - """Decodes the data to return the tensors specified by the list of + r"""Decodes the data to return the tensors specified by the list of items. Args: @@ -179,7 +170,8 @@ def decode(self, data, items): """ # Split if self._split_level == "word": - tokens = tf.string_split([data], delimiter=self._delimiter).values + tokens = tf.compat.v1.string_split([data], + sep=self._delimiter).values elif self._split_level == "char": raise NotImplementedError else: @@ -210,7 +202,7 @@ def decode(self, data, items): return [outputs[item] for item in items] def list_items(self): - """Returns the list of item names that the decoder can produce. + r"""Returns the list of item names that the decoder can produce. Returns: A list of strings can be passed to :meth:`decode()`. @@ -221,7 +213,7 @@ def list_items(self): @property def text_tensor_name(self): - """The name of text tensor. + r"""The name of text tensor. """ return self._text_tensor_name @@ -231,7 +223,7 @@ def text_tensor_name(self, name): @property def length_tensor_name(self): - """The name of length tensor. + r"""The name of length tensor. """ return self._length_tensor_name @@ -241,7 +233,7 @@ def length_tensor_name(self, name): @property def text_id_tensor_name(self): - """The name of text index tensor. + r"""The name of text index tensor. """ return self._text_id_tensor_name @@ -251,13 +243,13 @@ def text_id_tensor_name(self, name): @property def added_length(self): - """The added text length due to appended bos and eos tokens. + r"""The added text length due to appended bos and eos tokens. """ return self._added_length -class VarUttTextDataDecoder(data_decoder.DataDecoder): - """A text data decoder that decodes raw text data. Each data is considered +class VarUttTextDataDecoder: + r"""A text data decoder that decodes raw text data. Each data is considered to be multiple sentences concatenated by a delimiter. Operations include splitting on word or character level, truncation, @@ -328,8 +320,8 @@ def __call__(self, data): outputs = self.decode(data, self.list_items()) return dict(zip(self.list_items(), outputs)) - def decode(self, data, items): # pylint: disable=too-many-locals - """Decodes the data to return the tensors specified by the list of + def decode(self, data, items): + r"""Decodes the data to return the tensors specified by the list of items. Args: @@ -343,8 +335,8 @@ def decode(self, data, items): # pylint: disable=too-many-locals returns `None` for the token index item. 
""" - sentences = tf.string_split([data], - delimiter=self._sentence_delimiter).values + sentences = tf.compat.v1.string_split( + [data], sep=self._sentence_delimiter).values # Truncate utterances if self._max_utterance_cnt: @@ -354,7 +346,7 @@ def decode(self, data, items): # pylint: disable=too-many-locals # Get (max) sentence length def _get_sent_length(s): raw_length = tf.size( - tf.string_split([s], delimiter=self._delimiter).values) + tf.compat.v1.string_split([s], sep=self._delimiter).values) if self._max_seq_length: return tf.minimum(raw_length, self._max_seq_length) else: @@ -387,10 +379,10 @@ def _trunc_and_pad(s, pad_token, max_length): # Split each sentence to tokens, and pad them to a same length. # This is necessary to treat all sentences as a single tensor. split_sentences = tf.map_fn( - lambda s: tf.py_func( + lambda s: tf.compat.v1.py_func( _trunc_and_pad, [ - tf.string_split([s], delimiter=self._delimiter).values, + tf.compat.v1.string_split([s], sep=self._delimiter).values, SpecialTokens.PAD, sent_length ], @@ -415,7 +407,7 @@ def _trunc_and_pad(s, pad_token, max_length): return [outputs[item] for item in items] def list_items(self): - """Returns the list of item names that the decoder can produce. + r"""Returns the list of item names that the decoder can produce. Returns: A list of strings can be passed to :meth:`decode()`. @@ -429,7 +421,7 @@ def list_items(self): @property def text_tensor_name(self): - """The name of text tensor. + r"""The name of text tensor. """ return self._text_tensor_name @@ -439,13 +431,13 @@ def text_tensor_name(self, name): @property def utterance_cnt_tensor_name(self): - """The name of the utterance count tensor. + r"""The name of the utterance count tensor. """ return self._utterance_cnt_tensor_name @property def length_tensor_name(self): - """The name of length tensor. + r"""The name of length tensor. """ return self._length_tensor_name @@ -455,7 +447,7 @@ def length_tensor_name(self, name): @property def text_id_tensor_name(self): - """The name of text index tensor. + r"""The name of text index tensor. """ return self._text_id_tensor_name @@ -465,13 +457,13 @@ def text_id_tensor_name(self, name): @property def added_length(self): - """The added text length due to appended bos and eos tokens. + r"""The added text length due to appended bos and eos tokens. """ return self._added_length -class TFRecordDataDecoder(data_decoder.DataDecoder): - """A data decoder that decodes a TFRecord file, e.g., the +class TFRecordDataDecoder: + r"""A data decoder that decodes a TFRecord file, e.g., the TFRecord file. The only operation is to parse the TFRecord data into a @@ -574,7 +566,7 @@ def _find_resize_method(resize_method): # Resize the image if resize_height and resize_width: resize_method = _find_resize_method(resize_method) - image_resized = tf.image.resize_images( + image_resized = tf.image.resize( image_decoded, (resize_height, resize_width), method=resize_method) @@ -583,7 +575,7 @@ def _find_resize_method(resize_method): return def decode(self, data, items): - """Decodes the data to return the tensors specified by the list of + r"""Decodes the data to return the tensors specified by the list of items. Args: @@ -594,7 +586,6 @@ def decode(self, data, items): Returns: A list of tensors, each of which corresponds to each item. 
""" - # pylint: disable=too-many-branches feature_description = dict() for key, value in self._feature_original_types.items(): shape = [] @@ -605,14 +596,14 @@ def decode(self, data, items): shape = value if len(value) < 2 or value[1] == 'FixedLenFeature': feature_description.update( - {key: tf.FixedLenFeature( + {key: tf.io.FixedLenFeature( shape, dtypes.get_tf_dtype(value[0]))}) elif value[1] == 'VarLenFeature': feature_description.update( - {key: tf.VarLenFeature( + {key: tf.io.VarLenFeature( dtypes.get_tf_dtype(value[0]))}) - decoded_data = tf.parse_single_example(data, feature_description) + decoded_data = tf.io.parse_single_example(data, feature_description) # Handle TFRecord containing images if isinstance(self._image_options, dict): @@ -637,7 +628,7 @@ def decode(self, data, items): elif to_type is tf.string: decoded_data[key] = tf.as_string(decoded_data[key]) elif from_type is tf.string: - decoded_data[key] = tf.string_to_number( + decoded_data[key] = tf.compat.v1.string_to_number( decoded_data[key], to_type) else: decoded_data[key] = tf.cast( @@ -646,7 +637,7 @@ def decode(self, data, items): return [outputs[item] for item in items] def list_items(self): - """Returns the list of item names that the decoder can produce. + r"""Returns the list of item names that the decoder can produce. Returns: A list of strings can be passed to :meth:`decode()`. diff --git a/texar/tf/data/data_utils.py b/texar/tf/data/data_utils.py index b05abae8..09a89179 100644 --- a/texar/tf/data/data_utils.py +++ b/texar/tf/data/data_utils.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,41 +15,30 @@ Various utilities specific to data processing. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - +import logging import os import sys import tarfile +import urllib.request import zipfile -import collections -import numpy as np -from six.moves import urllib -import requests -import tensorflow as tf +import numpy as np from texar.tf.utils import utils_io +from texar.tf.utils.types import MaybeList -# pylint: disable=invalid-name, too-many-branches __all__ = [ "maybe_download", - "read_words", - "make_vocab", - "count_file_lines" + "count_file_lines", ] -Py3 = sys.version_info[0] == 3 - def maybe_download(urls, path, filenames=None, extract=False): - """Downloads a set of files. + r"""Downloads a set of files. Args: - urls: A (list of) urls to download files. + urls: A (list of) URLs to download files. path (str): The destination path to save the files. filenames: A (list of) strings of the file names. If given, must have the same length with :attr:`urls`. 
            If `None`,
@@ -62,7 +51,10 @@
     utils_io.maybe_create_dir(path)
 
     if not isinstance(urls, (list, tuple)):
+        is_list = False
         urls = [urls]
+    else:
+        is_list = True
     if filenames is not None:
         if not isinstance(filenames, (list, tuple)):
             filenames = [filenames]
@@ -86,54 +78,61 @@
         filepath = os.path.join(path, filename)
         result.append(filepath)
 
-        if not tf.gfile.Exists(filepath):
+        # if not tf.gfile.Exists(filepath):
+        if not os.path.exists(filepath):
             if 'drive.google.com' in url:
                 filepath = _download_from_google_drive(url, filename, path)
             else:
                 filepath = _download(url, filename, path)
 
             if extract:
-                tf.logging.info('Extract %s', filepath)
+                logging.info('Extract %s', filepath)
                 if tarfile.is_tarfile(filepath):
                     tarfile.open(filepath, 'r').extractall(path)
                 elif zipfile.is_zipfile(filepath):
                     with zipfile.ZipFile(filepath) as zfile:
                         zfile.extractall(path)
                 else:
-                    tf.logging.info("Unknown compression type. Only .tar.gz, "
-                                    ".tar.bz2, .tar, and .zip are supported")
-
+                    logging.info("Unknown compression type. Only .tar.gz, "
+                                 ".tar.bz2, .tar, and .zip are supported")
+    if not is_list:
+        return result[0]
     return result
 
 
-def _download(url, filename, path):
-    def _progress(count, block_size, total_size):
+def _download(url: str, filename: str, path: str) -> str:
+    def _progress_hook(count, block_size, total_size):
         percent = float(count * block_size) / float(total_size) * 100.
-        # pylint: disable=cell-var-from-loop
-        sys.stdout.write('\r>> Downloading %s %.1f%%' %
-                         (filename, percent))
+        sys.stdout.write(f'\r>> Downloading {filename} {percent:.1f}%')
         sys.stdout.flush()
 
     filepath = os.path.join(path, filename)
-    filepath, _ = urllib.request.urlretrieve(url, filepath, _progress)
+    filepath, _ = urllib.request.urlretrieve(url, filepath, _progress_hook)
     print()
     statinfo = os.stat(filepath)
-    print('Successfully downloaded {} {} bytes.'.format(
-        filename, statinfo.st_size))
+    print(f'Successfully downloaded {filename} {statinfo.st_size} bytes.')
     return filepath
 
 
-def _extract_google_drive_file_id(url):
+def _extract_google_drive_file_id(url: str) -> str:
     # id is between `/d/` and '/'
     url_suffix = url[url.find('/d/') + 3:]
     file_id = url_suffix[:url_suffix.find('/')]
     return file_id
 
 
-def _download_from_google_drive(url, filename, path):
-    """Adapted from `https://github.com/saurabhshri/gdrive-downloader`
+def _download_from_google_drive(url: str, filename: str, path: str) -> str:
+    r"""Adapted from `https://github.com/saurabhshri/gdrive-downloader`
     """
+
+    try:
+        import requests
+    except ImportError:
+        print("The requests library must be installed to download files from "
+              "Google Drive. Please see: https://github.com/psf/requests")
+        raise
+
     def _get_confirm_token(response):
         for key, value in response.cookies.items():
             if key.startswith('download_warning'):
@@ -153,106 +152,20 @@
     filepath = os.path.join(path, filename)
     CHUNK_SIZE = 32768
-    with tf.gfile.GFile(filepath, "wb") as f:
+    with open(filepath, "wb") as f:
         for chunk in response.iter_content(CHUNK_SIZE):
             if chunk:
                 f.write(chunk)
 
-    print('Successfully downloaded {}.'.format(filename))
+    print(f'Successfully downloaded {filename}.')
 
     return filepath
 
 
-def read_words(filename, newline_token=None):
-    """Reads word from a file.
-
-    Args:
-        filename (str): Path to the file.
-        newline_token (str, optional): The token to replace the original newline
-            token "\\\\n".
For example, - `newline_token=tx.data.SpecialTokens.EOS`. - If `None`, no replacement is performed. - - Returns: - A list of words. - """ - with tf.gfile.GFile(filename, "r") as f: - if Py3: - if newline_token is None: - return f.read().split() - else: - return f.read().replace("\n", newline_token).split() - else: - if newline_token is None: - return f.read().decode("utf-8").split() - else: - return (f.read().decode("utf-8") - .replace("\n", newline_token).split()) - - -def make_vocab(filenames, max_vocab_size=-1, newline_token=None, - return_type="list", return_count=False): - """Builds vocab of the files. - - Args: - filenames (str): A (list of) files. - max_vocab_size (int): Maximum size of the vocabulary. Low frequency - words that exceeding the limit will be discarded. - Set to `-1` (default) if no truncation is wanted. - newline_token (str, optional): The token to replace the original newline - token "\\\\n". For example, - `newline_token=tx.data.SpecialTokens.EOS`. - If `None`, no replacement is performed. - return_type (str): Either "list" or "dict". If "list" (default), this - function returns a list of words sorted by frequency. If "dict", - this function returns a dict mapping words to their index sorted - by frequency. - return_count (bool): Whether to return word counts. If `True` and - :attr:`return_type` is "dict", then a count dict is returned, which - is a mapping from words to their frequency. - - Returns: - - If :attr:`return_count` is False, returns a list or dict containing \ - the vocabulary words. - - - If :attr:`return_count` if True, returns a pair of list or dict \ - `(a, b)`, where `a` is a list or dict containing the vocabulary \ - words, `b` is a list of dict containing the word counts. +def count_file_lines(filenames: MaybeList[str]) -> int: + r"""Counts the number of lines in the file(s). """ - if not isinstance(filenames, (list, tuple)): - filenames = [filenames] - - words = [] - for fn in filenames: - words += read_words(fn, newline_token=newline_token) - counter = collections.Counter(words) - count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0])) - - words, counts = list(zip(*count_pairs)) - if max_vocab_size >= 0: - words = words[:max_vocab_size] - counts = counts[:max_vocab_size] - - if return_type == "list": - if not return_count: - return words - else: - return words, counts - elif return_type == "dict": - word_to_id = dict(zip(words, range(len(words)))) - if not return_count: - return word_to_id - else: - word_to_count = dict(zip(words, counts)) - return word_to_id, word_to_count - else: - raise ValueError("Unknown return_type: {}".format(return_type)) - - -def count_file_lines(filenames): - """Counts the number of lines in the file(s). - """ def _count_lines(fn): with open(fn, "rb") as f: i = -1 @@ -262,5 +175,5 @@ def _count_lines(fn): if not isinstance(filenames, (list, tuple)): filenames = [filenames] - num_lines = np.sum([_count_lines(fn) for fn in filenames]) + num_lines = np.sum([_count_lines(fn) for fn in filenames]).item() return num_lines diff --git a/texar/tf/data/data_utils_test.py b/texar/tf/data/data_utils_test.py index f981e614..53155c31 100644 --- a/texar/tf/data/data_utils_test.py +++ b/texar/tf/data/data_utils_test.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -# """ Unit tests for data utils. """ diff --git a/texar/tf/data/embedding.py b/texar/tf/data/embedding.py index 14a3757c..347755b6 100644 --- a/texar/tf/data/embedding.py +++ b/texar/tf/data/embedding.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. 
All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,13 +15,7 @@
 Helper functions and classes for embedding processing.
 """
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
 import tensorflow as tf
-from tensorflow import gfile
 import numpy as np
 from texar.tf.utils import utils
@@ -35,7 +29,7 @@
 def load_word2vec(filename, vocab, word_vecs):
-    """Loads embeddings in the word2vec binary format which has a header line
+    r"""Loads embeddings in the word2vec binary format which has a header line
     containing the number of vectors and their dimensionality (two
     integers), followed with number-of-vectors lines each of which is
     formatted as '<word-string> <embedding-vector>'.
@@ -50,7 +44,7 @@ def load_word2vec(filename, vocab, word_vecs):
     Returns:
         The updated :attr:`word_vecs`.
     """
-    with gfile.GFile(filename, "rb") as fin:
+    with tf.io.gfile.GFile(filename, "rb") as fin:
         header = fin.readline()
         vocab_size, vector_size = [int(s) for s in header.split()]
         if vector_size != word_vecs.shape[1]:
@@ -76,7 +70,7 @@ def load_word2vec(filename, vocab, word_vecs):
 def load_glove(filename, vocab, word_vecs):
-    """Loads embeddings in the glove text format in which each line is
+    r"""Loads embeddings in the glove text format in which each line is
     '<word-string> <embedding-vector>'. Dimensions of the embedding vector
     are separated with whitespace characters.
@@ -90,7 +84,7 @@ def load_glove(filename, vocab, word_vecs):
     Returns:
         The updated :attr:`word_vecs`.
     """
-    with gfile.GFile(filename) as fin:
+    with tf.io.gfile.GFile(filename) as fin:
         for line in fin:
             vec = line.strip().split()
             if len(vec) == 0:
@@ -106,17 +100,15 @@ def load_glove(filename, vocab, word_vecs):
     return word_vecs
-class Embedding(object):
-    """Embedding class that loads token embedding vectors from file. Token
+class Embedding:
+    r"""Embedding class that loads token embedding vectors from file. Token
     embeddings not in the embedding file are initialized as specified in
     :attr:`hparams`.
     Args:
         vocab (dict): A dictionary that maps token strings to integer index.
-        read_fn: Callable that takes `(filename, vocab, word_vecs)` and
-            returns the updated `word_vecs`. E.g.,
-            :func:`~texar.tf.data.embedding.load_word2vec` and
-            :func:`~texar.tf.data.embedding.load_glove`.
+        hparams (dict): Hyperparameters. See :meth:`default_hparams` for the
+            defaults.
     """
     def __init__(self, vocab, hparams=None):
         self._hparams = HParams(hparams, self.default_hparams())
@@ -148,7 +140,7 @@ def __init__(self, vocab, hparams=None):
     @staticmethod
     def default_hparams():
-        """Returns a dictionary of hyperparameters with default values:
+        r"""Returns a dictionary of hyperparameters with default values:
         .. role:: python(code)
            :language: python
@@ -170,14 +162,14 @@ def default_hparams():
         Here:
-        "file": str
+        `"file"`: str
             Path to the embedding file. If not provided, all embeddings are
             initialized with the initialization function.
-        "dim": int
+        `"dim"`: int
             Dimension size of each embedding vector
-        "read_fn": str or callable
+        `"read_fn"`: str or callable
             Function to read the embedding file. This can be the function,
             or its string name or full module path. E.g.,
@@ -194,7 +186,7 @@ def default_hparams():
             The function must have the same signature as with
             :func:`load_word2vec`.
- "init_fn": dict + `"init_fn"`: dict Hyperparameters of the initialization function used to initialize embedding of tokens missing in the embedding file. @@ -228,12 +220,12 @@ def default_hparams(): @property def word_vecs(self): - """2D numpy array of shape `[vocab_size, embedding_dim]`. + r"""2D numpy array of shape `[vocab_size, embedding_dim]`. """ return self._word_vecs @property def vector_size(self): - """The embedding dimention size. + r"""The embedding dimention size. """ return self._hparams.dim diff --git a/texar/tf/data/embedding_test.py b/texar/tf/data/embedding_test.py index fb400f43..1722d57d 100644 --- a/texar/tf/data/embedding_test.py +++ b/texar/tf/data/embedding_test.py @@ -1,26 +1,7 @@ -# -*- coding: utf-8 -*- -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. """ Unit tests for embedding related operations. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - import sys import tempfile import numpy as np @@ -29,7 +10,7 @@ from texar.tf.data import embedding -Py3 = sys.version_info[0] == 3 # pylint: disable=invalid-name +Py3 = sys.version_info[0] == 3 class EmbeddingTest(tf.test.TestCase): diff --git a/texar/tf/data/tokenizers/__init__.py b/texar/tf/data/tokenizers/__init__.py index 23818b4d..acc7d0e1 100644 --- a/texar/tf/data/tokenizers/__init__.py +++ b/texar/tf/data/tokenizers/__init__.py @@ -16,6 +16,4 @@ """ from texar.tf.data.tokenizers.bert_tokenizer import * -from texar.tf.data.tokenizers.gpt2_tokenizer import * from texar.tf.data.tokenizers.tokenizer_base import * -from texar.tf.data.tokenizers.xlnet_tokenizer import * diff --git a/texar/tf/data/tokenizers/bert_tokenizer.py b/texar/tf/data/tokenizers/bert_tokenizer.py index 4e46dd17..eb5b79b3 100644 --- a/texar/tf/data/tokenizers/bert_tokenizer.py +++ b/texar/tf/data/tokenizers/bert_tokenizer.py @@ -62,8 +62,44 @@ class BERTTokenizer(PretrainedBERTMixin, TokenizerBase): 'bert-base-multilingual-uncased': 512, 'bert-base-multilingual-cased': 512, 'bert-base-chinese': 512, + + # BioBERT + 'biobert-v1.0-pmc': 512, + 'biobert-v1.0-pubmed-pmc': 512, + 'biobert-v1.0-pubmed': 512, + 'biobert-v1.1-pubmed': 512, + + # SciBERT + 'scibert-scivocab-uncased': 512, + 'scibert-scivocab-cased': 512, + 'scibert-basevocab-uncased': 512, + 'scibert-basevocab-cased': 512, } _VOCAB_FILE_NAMES = {'vocab_file': 'vocab.txt'} + _VOCAB_FILE_MAP = { + 'vocab_file': { + # Standard BERT + 'bert-base-uncased': 'vocab.txt', + 'bert-large-uncased': 'vocab.txt', + 'bert-base-cased': 'vocab.txt', + 'bert-large-cased': 'vocab.txt', + 'bert-base-multilingual-uncased': 'vocab.txt', + 'bert-base-multilingual-cased': 'vocab.txt', + 'bert-base-chinese': 'vocab.txt', + + # BioBERT + 'biobert-v1.0-pmc': 'vocab.txt', + 'biobert-v1.0-pubmed-pmc': 'vocab.txt', + 'biobert-v1.0-pubmed': 'vocab.txt', + 'biobert-v1.1-pubmed': 'vocab.txt', + + # SciBERT + 'scibert-scivocab-uncased': 'vocab.txt', + 
'scibert-scivocab-cased': 'vocab.txt', + 'scibert-basevocab-uncased': 'vocab.txt', + 'scibert-basevocab-cased': 'vocab.txt', + } + } def __init__(self, pretrained_model_name: Optional[str] = None, @@ -81,8 +117,10 @@ def __init__(self, } if self.pretrained_model_dir is not None: + assert self.pretrained_model_name is not None vocab_file = os.path.join(self.pretrained_model_dir, - self._VOCAB_FILE_NAMES['vocab_file']) + self._VOCAB_FILE_MAP['vocab_file'] + [self.pretrained_model_name]) assert self.pretrained_model_name is not None if self._MAX_INPUT_SIZE.get(self.pretrained_model_name): self.max_len = self._MAX_INPUT_SIZE[self.pretrained_model_name] diff --git a/texar/tf/data/tokenizers/gpt2_tokenizer.py b/texar/tf/data/tokenizers/gpt2_tokenizer.py deleted file mode 100644 index 84bbd18f..00000000 --- a/texar/tf/data/tokenizers/gpt2_tokenizer.py +++ /dev/null @@ -1,366 +0,0 @@ -# Copyright 2019 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Pre-trained GPT-2 tokenizer. - -Code structure adapted from: - `https://github.com/huggingface/pytorch-transformers/blob/master/pytorch_transformers/tokenization_gpt2.py` -""" - -from typing import Any, Dict, List, Optional, Tuple - -import os -import json -import regex as re - -from texar.tf.modules.pretrained.gpt2 import PretrainedGPT2Mixin -from texar.tf.data.tokenizers.tokenizer_base import TokenizerBase -from texar.tf.data.tokenizers.gpt2_tokenizer_utils import \ - bytes_to_unicode, get_pairs - -__all__ = [ - 'GPT2Tokenizer', -] - - -class GPT2Tokenizer(TokenizerBase, PretrainedGPT2Mixin): - r"""Pre-trained GPT2 Tokenizer. - - Args: - pretrained_model_name (optional): a `str`, the name of - pre-trained model (e.g., `117M`). Please refer to - :class:`~texar.torch.modules.PretrainedGPT2Mixin` for - all supported models. - If None, the model name in :attr:`hparams` is used. - cache_dir (optional): the path to a folder in which the - pre-trained models will be cached. If `None` (default), - a default directory (``texar_data`` folder under user's home - directory) will be used. - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparameter will be set to default values. See - :meth:`default_hparams` for the hyperparameter structure - and default values. 
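For context, the GPT2 tokenizer removed by this deletion was driven entirely by the constructor arguments documented above. A minimal usage sketch, assuming the `117M` checkpoint name that served as its default and the `map_text_to_id`/`map_id_to_text` round-trip methods defined later in this file; the import path stops existing once the module is gone:

    from texar.tf.data.tokenizers.gpt2_tokenizer import GPT2Tokenizer

    # Downloads and caches the checkpoint vocabulary on first use.
    tokenizer = GPT2Tokenizer(pretrained_model_name='117M')
    ids = tokenizer.map_text_to_id('Texar is a toolkit')  # list of BPE token ids
    text = tokenizer.map_id_to_text(ids)                  # byte-level BPE round-trips the text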
- """ - - _IS_PRETRAINED = True - _MAX_INPUT_SIZE = { - 'gpt2-small': 1024, - 'gpt2-medium': 1024, - 'gpt2-large': 1024, - 'gpt2-xl': 1024, - } - _DEPRECATED_MAX_INPUT_SIZE = { - '117M': 1024, - '345M': 1024, - } - _MAX_INPUT_SIZE.update(_DEPRECATED_MAX_INPUT_SIZE) - - _VOCAB_FILE_NAMES = { - 'vocab_file': 'encoder.json', - 'merges_file': 'vocab.bpe', - } - - def __init__(self, - pretrained_model_name: Optional[str] = None, - cache_dir: Optional[str] = None, - hparams=None): - self.load_pretrained_config(pretrained_model_name, cache_dir, hparams) - - super().__init__(hparams=None) - - self.config = { - 'errors': self.hparams['errors'] - } - - if self.pretrained_model_dir is not None: - vocab_file = os.path.join(self.pretrained_model_dir, - self._VOCAB_FILE_NAMES['vocab_file']) - merges_file = os.path.join(self.pretrained_model_dir, - self._VOCAB_FILE_NAMES['merges_file']) - assert pretrained_model_name is not None - if self._MAX_INPUT_SIZE.get(pretrained_model_name): - self.max_len = self._MAX_INPUT_SIZE[pretrained_model_name] - else: - vocab_file = self.hparams['vocab_file'] - merges_file = self.hparams['merges_file'] - if self.hparams.get('max_len'): - self.max_len = self.hparams['max_len'] - - if not os.path.isfile(vocab_file): - raise ValueError("Can't find a vocabulary file at path " - "'{}".format(vocab_file)) - - if not os.path.isfile(merges_file): - raise ValueError("Can't find a merges file at path " - "'{}".format(merges_file)) - - with open(vocab_file) as fp: - self.encoder = json.load(fp) - self.decoder = {v: k for k, v in self.encoder.items()} - self.errors = self.hparams["errors"] # how to handle errors in decoding - self.byte_encoder = bytes_to_unicode() - self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} - with open(merges_file, encoding='utf-8') as fp: - bpe_data = fp.read().split('\n')[1:-1] - bpe_merges = [tuple(merge.split()) for merge in bpe_data] - self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) - self.cache: Dict[str, str] = {} - - # Should haved added re.IGNORECASE so BPE merges can happen for - # capitalized versions of contractions - self.pat = re.compile( - r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ? - [^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") - - def _map_text_to_token(self, text: str) -> List[str]: # type: ignore - r"""Tokenize a string. """ - bpe_tokens: List[str] = [] - for token in re.findall(self.pat, text): - token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) - bpe_tokens.extend( - bpe_token for bpe_token in self._bpe(token).split(' ')) - return bpe_tokens - - def save_vocab(self, save_dir: str) -> Tuple[str, str]: - r"""Save the tokenizer vocabulary and merge files to a directory.""" - if not os.path.isdir(save_dir): - raise ValueError("Vocabulary path ({}) should be a " - "directory".format(save_dir)) - - vocab_file = os.path.join(save_dir, - self._VOCAB_FILE_NAMES['vocab_file']) - merge_file = os.path.join(save_dir, - self._VOCAB_FILE_NAMES['merges_file']) - - with open(vocab_file, 'w', encoding='utf-8') as f: - f.write(json.dumps(self.encoder, ensure_ascii=False)) - - index = 0 - with open(merge_file, "w", encoding="utf-8") as writer: - writer.write(u'#version: 0.2\n') - for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), - key=lambda kv: kv[1]): - if index != token_index: - print("Saving vocabulary to {}: BPE merge indices are " - "not consecutive. 
Please check that the tokenizer " - "is not corrupted!".format(merge_file)) - index = token_index - writer.write(' '.join(bpe_tokens) + u'\n') - index += 1 - - return (vocab_file, merge_file) - - def _bpe(self, token: str) -> str: - if token in self.cache: - return self.cache[token] - word = tuple(token) - pairs = get_pairs(word) - - if not pairs: - return token - - while True: - bigram = min(pairs, key=lambda pair: self.bpe_ranks.get( - pair, float('inf'))) - if bigram not in self.bpe_ranks: - break - first, second = bigram - new_word: List[str] = [] - i = 0 - while i < len(word): - try: - j = word.index(first, i) - new_word.extend(word[i:j]) - i = j - except ValueError: - new_word.extend(word[i:]) - break - - if word[i] == first and i < len(word) - 1 \ - and word[i + 1] == second: - new_word.append(first + second) - i += 2 - else: - new_word.append(word[i]) - i += 1 - new_word = tuple(new_word) - word = new_word - if len(word) == 1: - break - else: - pairs = get_pairs(word) - word = ' '.join(word) - self.cache[token] = word - return word - - @property - def vocab_size(self) -> int: - return len(self.encoder) - - def _map_token_to_id(self, token: str) -> int: - r"""Maps a token to an id using the vocabulary.""" - return self.encoder.get(token, self.encoder.get(self.unk_token)) - - def _map_id_to_token(self, index: int) -> str: - r"""Maps an id to a token using the vocabulary.""" - token = self.decoder.get(index) - assert isinstance(token, str) - return token - - def map_token_to_text(self, tokens: List[str]) -> str: - r"""Maps a sequence of tokens (string) in a single string.""" - text = ''.join(tokens) - text = bytearray([self.byte_decoder[c] for c in text]).decode( - 'utf-8', errors=self.errors) - return text - - def encode_text( # type: ignore - self, - text: str, - max_seq_length: Optional[int] = None, - append_eos_token: bool = True) -> Tuple[List[int], int]: - r"""Adds special tokens to a sequence and computes the corresponding - sequence length for GPT2 specific tasks. The sequence will be truncated - if its length is larger than ``max_seq_length``. - - A GPT2 sequence has the following format: - `[bos_token]` X `[eos_token]` `[pad_token]` - - Args: - text: Input text. - max_seq_length: Maximum sequence length. - append_eos_token: Whether to append ``eos_token`` after the - sequence. - - Returns: - A tuple of `(input_ids, seq_len)`, where - - - ``input_ids``: A list of input token ids with added - special tokens. - - ``seq_len``: The sequence length. - """ - if max_seq_length is None: - max_seq_length = self.max_len - - token_ids = self.map_text_to_id(text) - assert isinstance(token_ids, list) - - bos_token_id = self._map_token_to_id(self.bos_token) - eos_token_id = self._map_token_to_id(self.eos_token) - pad_token_id = self._map_token_to_id(self.pad_token) - - if append_eos_token: - input_ids = token_ids[:max_seq_length - 2] - input_ids = [bos_token_id] + input_ids + [eos_token_id] - else: - input_ids = token_ids[:max_seq_length - 1] - input_ids = [bos_token_id] + input_ids - - seq_len = len(input_ids) - - # Pad up to the maximum sequence length. - input_ids = input_ids + [pad_token_id] * (max_seq_length - seq_len) - - assert len(input_ids) == max_seq_length - - return input_ids, seq_len - - @staticmethod - def default_hparams() -> Dict[str, Any]: - r"""Returns a dictionary of hyperparameters with default values. - - * The tokenizer is determined by the constructor argument - :attr:`pretrained_model_name` if it's specified. In this case, - `hparams` are ignored. 
- * Otherwise, the tokenizer is determined by - `hparams['pretrained_model_name']` if it's specified. All other - configurations in `hparams` are ignored. - * If the above two are `None`, the tokenizer is defined by the - configurations in `hparams`. - - .. code-block:: python - - { - "pretrained_model_name": "117M", - "vocab_file": None, - "merges_file": None, - "max_len": 1024, - "bos_token": "<|endoftext|>", - "eos_token": "<|endoftext|>", - "unk_token": "<|endoftext|>", - "pad_token": "<|endoftext|>", - "errors": "replace", - } - - Here: - - `"pretrained_model_name"`: str or None - The name of the pre-trained GPT2 model. - - `"vocab_file"`: str or None - The path to a vocabulary json file mapping tokens to ids. - - `"merges_file"`: str or None - The path to a merges file. - - `"max_len"`: int - The maximum sequence length that this model might ever be used with. - - `"bos_token"`: str - Beginning of sentence token - - `"eos_token"`: str - End of sentence token - - `"unk_token"`: str - Unknown token - - `"pad_token"`: str - Padding token - - `"errors"`: str - Response when mapping tokens to text fails. The possible values are - `ignore`, `replace`, and `strict`. - - `"name"`: str - Name of the tokenizer. - """ - return { - 'pretrained_model_name': '117M', - 'vocab_file': None, - 'merges_file': None, - 'max_len': 1024, - 'bos_token': '<|endoftext|>', - 'eos_token': '<|endoftext|>', - 'unk_token': '<|endoftext|>', - 'pad_token': '<|endoftext|>', - 'errors': 'replace', - 'name': 'gpt2_tokenizer', - '@no_typecheck': ['pretrained_model_name'], - } - - @classmethod - def _transform_config(cls, pretrained_model_name: str, - cache_dir: str): - r"""Returns the configuration of the pre-trained GPT2 tokenizer.""" - return { - 'vocab_file': None, - 'merges_file': None, - 'max_len': 1024, - 'bos_token': '<|endoftext|>', - 'eos_token': '<|endoftext|>', - 'unk_token': '<|endoftext|>', - 'pad_token': '<|endoftext|>', - 'errors': 'replace', - } diff --git a/texar/tf/data/tokenizers/gpt2_tokenizer_test.py b/texar/tf/data/tokenizers/gpt2_tokenizer_test.py deleted file mode 100644 index b6f63d53..00000000 --- a/texar/tf/data/tokenizers/gpt2_tokenizer_test.py +++ /dev/null @@ -1,195 +0,0 @@ -""" -Unit tests for pre-trained GPT2 tokenizer. 
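The `encode_text` method deleted above produced the `[bos_token] X [eos_token] [pad_token]...` layout described in its docstring. A self-contained sketch of that truncation-and-padding arithmetic, with made-up token ids standing in for the real vocabulary:

    def gpt2_layout(token_ids, max_seq_length, bos_id=0, eos_id=1, pad_id=2):
        # Truncate to leave room for BOS/EOS, then right-pad to the fixed length.
        ids = [bos_id] + token_ids[:max_seq_length - 2] + [eos_id]
        seq_len = len(ids)
        return ids + [pad_id] * (max_seq_length - seq_len), seq_len

    assert gpt2_layout([7, 8, 9], 6) == ([0, 7, 8, 9, 1, 2], 5)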
-""" - -import json -import os -import pickle -import tempfile - -import tensorflow as tf - -from texar.tf.data.tokenizers.gpt2_tokenizer import \ - GPT2Tokenizer -from texar.tf.utils.test import pretrained_test - - -class GPT2TokenizerTest(tf.test.TestCase): - - def setUp(self): - vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", - "lo", "low", "er", - "low", "lowest", "newer", "wider", ""] - vocab_tokens = dict(zip(vocab, range(len(vocab)))) - merges = ["#version: 0.2", "l o", "lo w", "e r", ""] - self.special_tokens_map = {"unk_token": ""} - - self.tmp_dir = tempfile.TemporaryDirectory() - self.vocab_file = os.path.join(self.tmp_dir.name, 'encoder.json') - self.merges_file = os.path.join(self.tmp_dir.name, 'vocab.bpe') - - with open(self.vocab_file, "w") as fp: - fp.write(json.dumps(vocab_tokens)) - with open(self.merges_file, "w") as fp: - fp.write("\n".join(merges)) - - def tearDown(self): - self.tmp_dir.cleanup() - - @pretrained_test - def test_model_loading(self): - for pretrained_model_name in \ - GPT2Tokenizer.available_checkpoints(): - tokenizer = GPT2Tokenizer( - pretrained_model_name=pretrained_model_name) - _ = tokenizer.map_text_to_token( - u"Munich and Berlin are nice cities") - - def test_tokenize(self): - tokenizer = GPT2Tokenizer.load(self.tmp_dir.name, - self.special_tokens_map) - - text = "lower" - bpe_tokens = ["low", "er"] - tokens = tokenizer.map_text_to_token(text) - self.assertListEqual(tokens, bpe_tokens) - - input_tokens = tokens + [tokenizer.unk_token] - input_bpe_tokens = [13, 12, 17] - self.assertListEqual( - tokenizer.map_token_to_id(input_tokens), - input_bpe_tokens) - - def test_pickle(self): - tokenizer = GPT2Tokenizer.load(self.tmp_dir.name, - self.special_tokens_map) - self.assertIsNotNone(tokenizer) - - text = u"Munich and Berlin are nice cities" - subwords = tokenizer.map_text_to_token(text) - - with tempfile.TemporaryDirectory() as tmpdirname: - filename = os.path.join(tmpdirname, u"tokenizer.bin") - with open(filename, "wb") as f: - pickle.dump(tokenizer, f) - with open(filename, "rb") as f: - tokenizer_new = pickle.load(f) - - subwords_loaded = tokenizer_new.map_text_to_token(text) - - self.assertListEqual(subwords, subwords_loaded) - - def test_save_load(self): - tokenizer = GPT2Tokenizer.load(self.tmp_dir.name, - self.special_tokens_map) - - before_tokens = tokenizer.map_text_to_id( - u"He is very happy, UNwant\u00E9d,running") - - with tempfile.TemporaryDirectory() as tmpdirname: - tokenizer.save(tmpdirname) - tokenizer = tokenizer.load(tmpdirname) - - after_tokens = tokenizer.map_text_to_id( - u"He is very happy, UNwant\u00E9d,running") - self.assertListEqual(before_tokens, after_tokens) - - def test_pretrained_model_list(self): - model_list_1 = list(GPT2Tokenizer._MODEL2URL.keys()) - model_list_2 = list(GPT2Tokenizer._MAX_INPUT_SIZE.keys()) - - self.assertListEqual(model_list_1, model_list_2) - - def test_encode_decode(self): - tokenizer = GPT2Tokenizer.load(self.tmp_dir.name, - self.special_tokens_map) - - input_text = u"lower newer" - output_text = u"lowernewer" - - tokens = tokenizer.map_text_to_token(input_text) - ids = tokenizer.map_token_to_id(tokens) - ids_2 = tokenizer.map_text_to_id(input_text) - self.assertListEqual(ids, ids_2) - - tokens_2 = tokenizer.map_id_to_token(ids) - text_2 = tokenizer.map_id_to_text(ids) - - self.assertEqual(text_2, output_text) - - self.assertNotEqual(len(tokens_2), 0) - self.assertIsInstance(text_2, str) - - def test_add_tokens(self): - tokenizer = GPT2Tokenizer.load(self.tmp_dir.name, - 
self.special_tokens_map) - - vocab_size = tokenizer.vocab_size - all_size = len(tokenizer) - - self.assertNotEqual(vocab_size, 0) - self.assertEqual(vocab_size, all_size) - - new_toks = ["aaaaabbbbbb", "cccccccccdddddddd"] - added_toks = tokenizer.add_tokens(new_toks) - vocab_size_2 = tokenizer.vocab_size - all_size_2 = len(tokenizer) - - self.assertNotEqual(vocab_size_2, 0) - self.assertEqual(vocab_size, vocab_size_2) - self.assertEqual(added_toks, len(new_toks)) - self.assertEqual(all_size_2, all_size + len(new_toks)) - - tokens = tokenizer.map_text_to_id("aaaaabbbbbb low cccccccccdddddddd l") - self.assertGreaterEqual(len(tokens), 4) - self.assertGreater(tokens[0], tokenizer.vocab_size - 1) - self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) - - new_toks_2 = {'eos_token': ">>>>|||<||<<|<<", - 'pad_token': "<<<<<|||>|>>>>|>"} - added_toks_2 = tokenizer.add_special_tokens(new_toks_2) - vocab_size_3 = tokenizer.vocab_size - all_size_3 = len(tokenizer) - - self.assertNotEqual(vocab_size_3, 0) - self.assertEqual(vocab_size, vocab_size_3) - self.assertEqual(added_toks_2, len(new_toks_2)) - self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) - - tokens = tokenizer.map_text_to_id( - ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd " - "<<<<<|||>|>>>>|> l") - - self.assertGreaterEqual(len(tokens), 6) - self.assertGreater(tokens[0], tokenizer.vocab_size - 1) - self.assertGreater(tokens[0], tokens[1]) - self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) - self.assertGreater(tokens[-2], tokens[-3]) - self.assertEqual(tokens[0], - tokenizer.map_token_to_id(tokenizer.eos_token)) - self.assertEqual(tokens[-2], - tokenizer.map_token_to_id(tokenizer.pad_token)) - - def test_encode_text(self): - tokenizer = GPT2Tokenizer.load(self.tmp_dir.name, - self.special_tokens_map) - - text_1 = u"lower newer" - - text_1_ids = tokenizer.map_text_to_id(text_1) - - input_ids, seq_len = \ - tokenizer.encode_text(text=text_1, max_seq_length=10) - - bos_token_id = tokenizer.map_token_to_id(tokenizer.bos_token) - eos_token_id = tokenizer.map_token_to_id(tokenizer.eos_token) - pad_token_id = tokenizer.map_token_to_id(tokenizer.pad_token) - - self.assertListEqual(input_ids, - [bos_token_id] + text_1_ids + [eos_token_id] + - [pad_token_id]) - self.assertEqual(seq_len, 9) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/data/tokenizers/gpt2_tokenizer_utils.py b/texar/tf/data/tokenizers/gpt2_tokenizer_utils.py deleted file mode 100644 index 218c96fe..00000000 --- a/texar/tf/data/tokenizers/gpt2_tokenizer_utils.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright 2019 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Utils of pre-trained GPT2 tokenizer. 
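Of the helpers this utils module provided, `get_pairs` (deleted below) extracts the adjacent-symbol pairs consumed by the BPE merge loop. An equivalent standalone restatement, matching the docstring example that appears further down:

    def get_pairs(word):
        # Adjacent-symbol pairs of a word given as a tuple of symbols.
        return set(zip(word, word[1:]))

    assert get_pairs(tuple('texar')) == {('t', 'e'), ('e', 'x'), ('x', 'a'), ('a', 'r')}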
- -Code structure adapted from: - `https://github.com/huggingface/pytorch-transformers/blob/master/pytorch_transformers/tokenization_gpt2.py` -""" - -from functools import lru_cache - -__all__ = [ - "bytes_to_unicode", - "get_pairs", -] - - -@lru_cache() -def bytes_to_unicode(): - r"""Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. This means you need a - large number of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing - around 5K for decent coverage. This is a significant percentage of your - normal, say, 32K bpe vocab. To avoid that, we want lookup tables between - utf-8 bytes and unicode strings. - - Note that this function avoids the mapping to whitespace and control - characters, which is designed specifically for GPT-2 BPE. - """ - bs = list(range(ord("!"), ord("~") + 1)) + list( - range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8 + n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - - -def get_pairs(word): - r"""Return set of symbol pairs in a word. Word is represented as tuple of - symbols (symbols being variable-length strings). - - Example: - word = "texar" - get_pairs(word) - # {('t', 'e'), ('e', 'x'), ('x', 'a'), ('a', 'r')} - """ - pairs = set() - prev_char = word[0] - for char in word[1:]: - pairs.add((prev_char, char)) - prev_char = char - return pairs diff --git a/texar/tf/data/tokenizers/tokenizer_base.py b/texar/tf/data/tokenizers/tokenizer_base.py index c469e884..073d37d5 100644 --- a/texar/tf/data/tokenizers/tokenizer_base.py +++ b/texar/tf/data/tokenizers/tokenizer_base.py @@ -23,7 +23,7 @@ import os import json -from texar.tf.module_base import ModuleBase +from texar.tf.hyperparams import HParams __all__ = [ "TokenizerBase", @@ -34,7 +34,7 @@ CONFIG_FILE = 'config.json' -class TokenizerBase(ModuleBase): +class TokenizerBase: r"""Base class inherited by all tokenizer classes. This class handles downloading and loading pre-trained tokenizer and adding tokens to the vocabulary. @@ -53,12 +53,23 @@ class TokenizerBase(ModuleBase): _IS_PRETRAINED: bool _MAX_INPUT_SIZE: Dict[str, Optional[int]] _VOCAB_FILE_NAMES: Dict[str, str] + _VOCAB_FILE_MAP: Dict[str, Dict[str, str]] _SPECIAL_TOKENS_ATTRIBUTES = ["bos_token", "eos_token", "unk_token", "sep_token", "pad_token", "cls_token", "mask_token", "additional_special_tokens"] def __init__(self, hparams): - super().__init__(hparams=hparams) + super().__init__() + if not hasattr(self, '_hparams'): + self._hparams = HParams(hparams, self.default_hparams()) + else: + # Probably already parsed by subclasses. We rely on subclass + # implementations to get this right. + # As a sanity check, we require `hparams` to be `None` in this case. + if hparams is not None: + raise ValueError( + "`self._hparams` already exists. Argument `hparams` " + "must be set to `None` in this case.") self.config = None @@ -84,6 +95,29 @@ def __init__(self, hparams): assert isinstance(value, str) setattr(self, key, value) + @staticmethod + def default_hparams(): + r"""Returns a `dict` of hyperparameters of the module with default + values. Used to replace the missing values of input `hparams` + during module construction. + + .. 
code-block:: python + + { + "name": "module" + } + """ + return { + "name": "module" + } + + @property + def hparams(self): + r"""An :class:`~texar.tf.HParams` instance. The hyperparameters + of the module. + """ + return self._hparams + @classmethod def load(cls, pretrained_model_path: str, configs: Optional[Dict] = None): r"""Instantiate a tokenizer from the vocabulary files or the saved diff --git a/texar/tf/data/tokenizers/xlnet_tokenizer.py b/texar/tf/data/tokenizers/xlnet_tokenizer.py deleted file mode 100644 index 83d42dde..00000000 --- a/texar/tf/data/tokenizers/xlnet_tokenizer.py +++ /dev/null @@ -1,393 +0,0 @@ -# Copyright 2019 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Pre-trained XLNet Tokenizer. - -Code structure adapted from: - `https://github.com/huggingface/pytorch-transformers/blob/master/pytorch_transformers/tokenization_xlnet.py` -""" - -from typing import Any, Dict, List, Optional, Tuple - -import os -import unicodedata -from shutil import copyfile -import sentencepiece as spm - -from texar.tf.modules.pretrained.xlnet import PretrainedXLNetMixin -from texar.tf.data.tokenizers.tokenizer_base import TokenizerBase -from texar.tf.utils.utils import truncate_seq_pair - -__all__ = [ - "XLNetTokenizer", -] - -SPIECE_UNDERLINE = u'▁' - -SEG_ID_A = 0 -SEG_ID_B = 1 -SEG_ID_CLS = 2 -SEG_ID_SEP = 3 -SEG_ID_PAD = 4 - - -class XLNetTokenizer(PretrainedXLNetMixin, TokenizerBase): - r"""Pre-trained XLNet Tokenizer. - - Args: - pretrained_model_name (optional): a `str`, the name of - pre-trained model (e.g., `xlnet-base-uncased`). Please refer to - :class:`~texar.torch.modules.PretrainedXLNetMixin` for - all supported models. - If None, the model name in :attr:`hparams` is used. - cache_dir (optional): the path to a folder in which the - pre-trained models will be cached. If `None` (default), - a default directory (``texar_data`` folder under user's home - directory) will be used. - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparameter will be set to default values. See - :meth:`default_hparams` for the hyperparameter structure - and default values. 
- """ - - _IS_PRETRAINED = True - _MAX_INPUT_SIZE = { - 'xlnet-base-cased': None, - 'xlnet-large-cased': None, - } - _VOCAB_FILE_NAMES = {'vocab_file': 'spiece.model'} - - def __init__(self, - pretrained_model_name: Optional[str] = None, - cache_dir: Optional[str] = None, - hparams=None): - self.load_pretrained_config(pretrained_model_name, cache_dir, hparams) - - super().__init__(hparams=None) - - self.__dict__: Dict - - self.config = { - 'do_lower_case': self.hparams['do_lower_case'], - 'remove_space': self.hparams['remove_space'], - 'keep_accents': self.hparams['keep_accents'], - } - - if self.pretrained_model_dir is not None: - vocab_file = os.path.join(self.pretrained_model_dir, - self._VOCAB_FILE_NAMES['vocab_file']) - assert pretrained_model_name is not None - if self._MAX_INPUT_SIZE.get(pretrained_model_name): - self.max_len = self._MAX_INPUT_SIZE[pretrained_model_name] - else: - vocab_file = self.hparams['vocab_file'] - if self.hparams.get('max_len'): - self.max_len = self.hparams['max_len'] - - if not os.path.isfile(vocab_file): - raise ValueError("Can't find a vocabulary file at path " - "'{}".format(vocab_file)) - - self.do_lower_case = self.hparams["do_lower_case"] - self.remove_space = self.hparams["remove_space"] - self.keep_accents = self.hparams["keep_accents"] - self.vocab_file = vocab_file - - self.sp_model = spm.SentencePieceProcessor() - self.sp_model.Load(self.vocab_file) - - # spm.SentencePieceProcessor() is a SwigPyObject object which cannot be - # pickled. We need to define __getstate__ here. - def __getstate__(self): - state = self.__dict__.copy() - state["sp_model"] = None - state["vocab_file"] = None - return state, self.vocab_file - - # spm.SentencePieceProcessor() is a SwigPyObject object which cannot be - # pickled. We need to define __setstate__ here. - def __setstate__(self, d): - self.__dict__, self.vocab_file = d - self.sp_model = spm.SentencePieceProcessor() - self.sp_model.Load(self.vocab_file) - - def _preprocess_text(self, inputs: str) -> str: - r"""Pre-process the text, including removing space, - stripping accents, and lower-casing the text. - """ - if self.remove_space: - outputs = ' '.join(inputs.strip().split()) - else: - outputs = inputs - outputs = outputs.replace("``", '"').replace("''", '"') - - if not self.keep_accents: - outputs = unicodedata.normalize('NFKD', outputs) - outputs = ''.join([c for c in outputs if not - unicodedata.combining(c)]) - if self.do_lower_case: - outputs = outputs.lower() - - return outputs - - def _map_text_to_token(self, text: str, # type: ignore - sample: bool = False) -> List[str]: - text = self._preprocess_text(text) - if not sample: - pieces = self.sp_model.EncodeAsPieces(text) - else: - pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) - - new_pieces: List[str] = [] - for piece in pieces: - if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit(): - cur_pieces = self.sp_model.EncodeAsPieces( - piece[:-1].replace(SPIECE_UNDERLINE, '')) - if piece[0] != SPIECE_UNDERLINE and \ - cur_pieces[0][0] == SPIECE_UNDERLINE: - if len(cur_pieces[0]) == 1: - cur_pieces = cur_pieces[1:] - else: - cur_pieces[0] = cur_pieces[0][1:] - cur_pieces.append(piece[-1]) - new_pieces.extend(cur_pieces) - else: - new_pieces.append(piece) - - return new_pieces - - def save_vocab(self, save_dir: str) -> Tuple[str]: - r"""Save the sentencepiece vocabulary (copy original file) to - a directory. 
- """ - if not os.path.isdir(save_dir): - raise ValueError("Vocabulary path ({}) should be a " - "directory".format(save_dir)) - out_vocab_file = os.path.join(save_dir, - self._VOCAB_FILE_NAMES['vocab_file']) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): - copyfile(self.vocab_file, out_vocab_file) - - return (out_vocab_file,) - - @property - def vocab_size(self) -> int: - return len(self.sp_model) - - def _map_token_to_id(self, token: str) -> int: - return self.sp_model.PieceToId(token) - - def _map_id_to_token(self, index: int) -> str: - token = self.sp_model.IdToPiece(index) - return token - - def map_token_to_text(self, tokens: List[str]) -> str: - r"""Maps a sequence of tokens (string) in a single string.""" - out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip() - return out_string - - def encode_text(self, - text_a: str, - text_b: Optional[str] = None, - max_seq_length: Optional[int] = None) -> \ - Tuple[List[int], List[int], List[int]]: - r"""Adds special tokens to a sequence or sequence pair and computes the - corresponding segment ids and input mask for XLNet specific tasks. - The sequence will be truncated if its length is larger than - ``max_seq_length``. - - A XLNet sequence has the following format: - X `[sep_token]` `[cls_token]` - - A XLNet sequence pair has the following format: - `[cls_token]` A `[sep_token]` B `[sep_token]` - - Args: - text_a: The first input text. - text_b: The second input text. - max_seq_length: Maximum sequence length. - - Returns: - A tuple of `(input_ids, segment_ids, input_mask)`, where - - - ``input_ids``: A list of input token ids with added - special token ids. - - ``segment_ids``: A list of segment ids. - - ``input_mask``: A list of mask ids. The mask has 1 for real - tokens and 0 for padding tokens. Only real tokens are - attended to. - """ - if max_seq_length is None: - max_seq_length = self.max_len - - cls_token_id = self._map_token_to_id(self.cls_token) - sep_token_id = self._map_token_to_id(self.sep_token) - - token_ids_a = self.map_text_to_id(text_a) - assert isinstance(token_ids_a, list) - - token_ids_b = None - if text_b: - token_ids_b = self.map_text_to_id(text_b) - - if token_ids_b: - assert isinstance(token_ids_b, list) - # Modifies `token_ids_a` and `token_ids_b` in place so that the - # total length is less than the specified length. - # Account for [CLS], [SEP], [SEP] with "- 3" - truncate_seq_pair(token_ids_a, token_ids_b, max_seq_length - 3) - - input_ids = (token_ids_a + [sep_token_id] + token_ids_b + - [sep_token_id] + [cls_token_id]) - segment_ids = [SEG_ID_A] * (len(token_ids_a) + 1) + \ - [SEG_ID_B] * (len(token_ids_b) + 1) + [SEG_ID_CLS] - else: - # Account for [CLS] and [SEP] with "- 2" - token_ids = token_ids_a[:max_seq_length - 2] - - input_ids = token_ids + [sep_token_id] + [cls_token_id] - segment_ids = [SEG_ID_A] * (len(input_ids) - 1) + [SEG_ID_CLS] - - input_mask = [0] * len(input_ids) - - # Zero-pad up to the maximum sequence length. - input_ids = [0] * (max_seq_length - len(input_ids)) + input_ids - input_mask = [1] * (max_seq_length - len(input_mask)) + input_mask - segment_ids = ([SEG_ID_PAD] * (max_seq_length - len(segment_ids)) + - segment_ids) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - return input_ids, segment_ids, input_mask - - @staticmethod - def default_hparams() -> Dict[str, Any]: - r"""Returns a dictionary of hyperparameters with default values. 
-
-    * The tokenizer is determined by the constructor argument
-      :attr:`pretrained_model_name` if it's specified. In this case,
-      `hparams` are ignored.
-    * Otherwise, the tokenizer is determined by
-      `hparams['pretrained_model_name']` if it's specified. All other
-      configurations in `hparams` are ignored.
-    * If the above two are `None`, the tokenizer is defined by the
-      configurations in `hparams`.
-
-    .. code-block:: python
-
-        {
-            "pretrained_model_name": "xlnet-base-cased",
-            "vocab_file": None,
-            "max_len": None,
-            "bos_token": "<s>",
-            "eos_token": "</s>",
-            "unk_token": "<unk>",
-            "sep_token": "<sep>",
-            "pad_token": "<pad>",
-            "cls_token": "<cls>",
-            "mask_token": "<mask>",
-            "additional_special_tokens": ["<eop>", "<eod>"],
-            "do_lower_case": False,
-            "remove_space": True,
-            "keep_accents": False,
-        }
-
-    Here:
-
-    `"pretrained_model_name"`: str or None
-        The name of the pre-trained XLNet model.
-
-    `"vocab_file"`: str or None
-        The path to a sentencepiece vocabulary file.
-
-    `"max_len"`: int or None
-        The maximum sequence length that this model might ever be used with.
-
-    `"bos_token"`: str
-        Beginning of sentence token.
-
-    `"eos_token"`: str
-        End of sentence token.
-
-    `"unk_token"`: str
-        Unknown token.
-
-    `"sep_token"`: str
-        Separation token.
-
-    `"pad_token"`: str
-        Padding token.
-
-    `"cls_token"`: str
-        Classification token.
-
-    `"mask_token"`: str
-        Masking token.
-
-    `"additional_special_tokens"`: list
-        A list of additional special tokens.
-
-    `"do_lower_case"`: bool
-        Whether to lower-case the text.
-
-    `"remove_space"`: bool
-        Whether to remove the space in the text.
-
-    `"keep_accents"`: bool
-        Whether to keep the accents in the text.
-
-    `"name"`: str
-        Name of the tokenizer.
-    """
-    return {
-        'pretrained_model_name': 'xlnet-base-cased',
-        'vocab_file': None,
-        'max_len': None,
-        'bos_token': '<s>',
-        'eos_token': '</s>',
-        'unk_token': '<unk>',
-        'sep_token': '<sep>',
-        'pad_token': '<pad>',
-        'cls_token': '<cls>',
-        'mask_token': '<mask>',
-        'additional_special_tokens': ['<eop>', '<eod>'],
-        'do_lower_case': False,
-        'remove_space': True,
-        'keep_accents': False,
-        'name': 'xlnet_tokenizer',
-        '@no_typecheck': ['pretrained_model_name'],
-    }
-
-    @classmethod
-    def _transform_config(cls, pretrained_model_name: str,
-                          cache_dir: str):
-        r"""Returns the configuration of the pre-trained XLNet tokenizer."""
-        return {
-            'vocab_file': None,
-            'max_len': None,
-            'bos_token': '<s>',
-            'eos_token': '</s>',
-            'unk_token': '<unk>',
-            'sep_token': '<sep>',
-            'pad_token': '<pad>',
-            'cls_token': '<cls>',
-            'mask_token': '<mask>',
-            'additional_special_tokens': ['<eop>', '<eod>'],
-            'do_lower_case': False,
-            'remove_space': True,
-            'keep_accents': False,
-        }
diff --git a/texar/tf/data/tokenizers/xlnet_tokenizer_test.py b/texar/tf/data/tokenizers/xlnet_tokenizer_test.py
deleted file mode 100644
index eb6bee08..00000000
--- a/texar/tf/data/tokenizers/xlnet_tokenizer_test.py
+++ /dev/null
@@ -1,255 +0,0 @@
-"""
-Unit tests for pre-trained XLNet tokenizer.
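For the record, the `encode_text` logic deleted above differs from the GPT2 variant in two ways: the special tokens go at the end (`X [sep_token] [cls_token]`) and padding is applied on the left. A self-contained sketch of that layout with made-up token ids; the segment-id constants mirror the `SEG_ID_*` values deleted above:

    SEG_ID_A, SEG_ID_CLS, SEG_ID_PAD = 0, 2, 4

    def xlnet_layout(token_ids, max_seq_length, sep_id=4, cls_id=3, pad_id=0):
        ids = token_ids[:max_seq_length - 2] + [sep_id] + [cls_id]
        segs = [SEG_ID_A] * (len(ids) - 1) + [SEG_ID_CLS]
        mask = [0] * len(ids)                 # 0 marks real tokens in this scheme
        pad = max_seq_length - len(ids)
        return [pad_id] * pad + ids, [SEG_ID_PAD] * pad + segs, [1] * pad + mask

    assert xlnet_layout([7, 8], 5) == ([0, 7, 8, 4, 3], [4, 0, 0, 0, 2], [1, 0, 0, 0, 0])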
-""" - -import os -import pickle -import tempfile - -import tensorflow as tf - -from texar.tf.data.data_utils import maybe_download -from texar.tf.data.tokenizers.xlnet_tokenizer import \ - XLNetTokenizer, SPIECE_UNDERLINE -from texar.tf.utils.test import pretrained_test - - -class XLNetTokenizerTest(tf.test.TestCase): - - def setUp(self): - self.tmp_dir = tempfile.TemporaryDirectory() - self.SAMPLE_VOCAB = maybe_download( - 'https://github.com/gpengzhi/pytorch-transformers/blob/master/' - 'pytorch_transformers/tests/fixtures/test_sentencepiece.model' - '?raw=true', self.tmp_dir.name) - - self.tokenizer = XLNetTokenizer.load( - self.SAMPLE_VOCAB[0], configs={'keep_accents': True}) - self.tokenizer.save(self.tmp_dir.name) - - def tearDown(self): - self.tmp_dir.cleanup() - - @pretrained_test - def test_model_loading(self): - for pretrained_model_name in \ - XLNetTokenizer.available_checkpoints(): - tokenizer = XLNetTokenizer( - pretrained_model_name=pretrained_model_name) - _ = tokenizer.map_text_to_token(u"This is a test") - - def test_tokenize(self): - tokens = self.tokenizer.map_text_to_token(u'This is a test') - self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est']) - - self.assertListEqual( - self.tokenizer.map_token_to_id(tokens), - [285, 46, 10, 170, 382]) - - tokens = self.tokenizer.map_text_to_token( - u"I was born in 92000, and this is falsé.") - self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', - SPIECE_UNDERLINE + u'was', - SPIECE_UNDERLINE + u'b', - u'or', u'n', SPIECE_UNDERLINE + u'in', - SPIECE_UNDERLINE + u'', - u'9', u'2', u'0', u'0', u'0', u',', - SPIECE_UNDERLINE + u'and', - SPIECE_UNDERLINE + u'this', - SPIECE_UNDERLINE + u'is', - SPIECE_UNDERLINE + u'f', u'al', u's', - u'é', u'.']) - ids = self.tokenizer.map_token_to_id(tokens) - self.assertListEqual( - ids, [8, 21, 84, 55, 24, 19, 7, 0, - 602, 347, 347, 347, 3, 12, 66, - 46, 72, 80, 6, 0, 4]) - - back_tokens = self.tokenizer.map_id_to_token(ids) - self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', - SPIECE_UNDERLINE + u'was', - SPIECE_UNDERLINE + u'b', - u'or', u'n', - SPIECE_UNDERLINE + u'in', - SPIECE_UNDERLINE + u'', u'', - u'2', u'0', u'0', u'0', u',', - SPIECE_UNDERLINE + u'and', - SPIECE_UNDERLINE + u'this', - SPIECE_UNDERLINE + u'is', - SPIECE_UNDERLINE + u'f', u'al', u's', - u'', u'.']) - - def test_pickle(self): - tokenizer = XLNetTokenizer.load(self.tmp_dir.name) - self.assertIsNotNone(tokenizer) - - text = u"Munich and Berlin are nice cities" - subwords = tokenizer.map_text_to_token(text) - - with tempfile.TemporaryDirectory() as tmpdirname: - filename = os.path.join(tmpdirname, u"tokenizer.bin") - with open(filename, "wb") as f: - pickle.dump(tokenizer, f) - with open(filename, "rb") as f: - tokenizer_new = pickle.load(f) - - subwords_loaded = tokenizer_new.map_text_to_token(text) - - self.assertListEqual(subwords, subwords_loaded) - - def test_save_load(self): - tokenizer = XLNetTokenizer.load(self.tmp_dir.name) - - before_tokens = tokenizer.map_text_to_id( - u"He is very happy, UNwant\u00E9d,running") - - with tempfile.TemporaryDirectory() as tmpdirname: - tokenizer.save(tmpdirname) - tokenizer = tokenizer.load(tmpdirname) - - after_tokens = tokenizer.map_text_to_id( - u"He is very happy, UNwant\u00E9d,running") - self.assertListEqual(before_tokens, after_tokens) - - def test_pretrained_model_list(self): - model_list_1 = list(XLNetTokenizer._MODEL2URL.keys()) - model_list_2 = list(XLNetTokenizer._MAX_INPUT_SIZE.keys()) - - self.assertListEqual(model_list_1, model_list_2) - - 
def test_encode_decode(self): - tokenizer = XLNetTokenizer.load(self.tmp_dir.name) - - input_text = u"This is a test" - output_text = u"This is a test" - - tokens = tokenizer.map_text_to_token(input_text) - ids = tokenizer.map_token_to_id(tokens) - ids_2 = tokenizer.map_text_to_id(input_text) - self.assertListEqual(ids, ids_2) - - tokens_2 = tokenizer.map_id_to_token(ids) - text_2 = tokenizer.map_id_to_text(ids) - - self.assertEqual(text_2, output_text) - - self.assertNotEqual(len(tokens_2), 0) - self.assertIsInstance(text_2, str) - - def test_add_tokens(self): - tokenizer = XLNetTokenizer.load(self.tmp_dir.name) - - vocab_size = tokenizer.vocab_size - all_size = len(tokenizer) - - self.assertNotEqual(vocab_size, 0) - self.assertEqual(vocab_size, all_size) - - new_toks = ["aaaaabbbbbb", "cccccccccdddddddd"] - added_toks = tokenizer.add_tokens(new_toks) - vocab_size_2 = tokenizer.vocab_size - all_size_2 = len(tokenizer) - - self.assertNotEqual(vocab_size_2, 0) - self.assertEqual(vocab_size, vocab_size_2) - self.assertEqual(added_toks, len(new_toks)) - self.assertEqual(all_size_2, all_size + len(new_toks)) - - tokens = tokenizer.map_text_to_id("aaaaabbbbbb low cccccccccdddddddd l") - self.assertGreaterEqual(len(tokens), 4) - self.assertGreater(tokens[0], tokenizer.vocab_size - 1) - self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) - - new_toks_2 = {'eos_token': ">>>>|||<||<<|<<", - 'pad_token': "<<<<<|||>|>>>>|>"} - added_toks_2 = tokenizer.add_special_tokens(new_toks_2) - vocab_size_3 = tokenizer.vocab_size - all_size_3 = len(tokenizer) - - self.assertNotEqual(vocab_size_3, 0) - self.assertEqual(vocab_size, vocab_size_3) - self.assertEqual(added_toks_2, len(new_toks_2)) - self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) - - tokens = tokenizer.map_text_to_id( - ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd " - "<<<<<|||>|>>>>|> l") - - self.assertGreaterEqual(len(tokens), 6) - self.assertGreater(tokens[0], tokenizer.vocab_size - 1) - self.assertGreater(tokens[0], tokens[1]) - self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) - self.assertGreater(tokens[-2], tokens[-3]) - self.assertEqual(tokens[0], - tokenizer.map_token_to_id(tokenizer.eos_token)) - self.assertEqual(tokens[-2], - tokenizer.map_token_to_id(tokenizer.pad_token)) - - def test_tokenizer_lower(self): - tokenizer = XLNetTokenizer.load( - self.SAMPLE_VOCAB[0], configs={'do_lower_case': True, - 'keep_accents': False}) - tokens = tokenizer.map_text_to_token( - u"I was born in 92000, and this is falsé.") - self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'', u'i', - SPIECE_UNDERLINE + u'was', - SPIECE_UNDERLINE + u'b', - u'or', u'n', SPIECE_UNDERLINE + u'in', - SPIECE_UNDERLINE + u'', - u'9', u'2', u'0', u'0', u'0', u',', - SPIECE_UNDERLINE + u'and', - SPIECE_UNDERLINE + u'this', - SPIECE_UNDERLINE + u'is', - SPIECE_UNDERLINE + u'f', u'al', u'se', - u'.']) - self.assertListEqual(tokenizer.map_text_to_token(u"H\u00E9llo"), - [u"▁he", u"ll", u"o"]) - - def test_tokenizer_no_lower(self): - tokenizer = XLNetTokenizer.load( - self.SAMPLE_VOCAB[0], configs={'do_lower_case': False, - 'keep_accents': False}) - tokens = tokenizer.map_text_to_token( - u"I was born in 92000, and this is falsé.") - self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', - SPIECE_UNDERLINE + u'was', - SPIECE_UNDERLINE + u'b', u'or', - u'n', SPIECE_UNDERLINE + u'in', - SPIECE_UNDERLINE + u'', - u'9', u'2', u'0', u'0', u'0', u',', - SPIECE_UNDERLINE + u'and', - SPIECE_UNDERLINE + u'this', - SPIECE_UNDERLINE + u'is', - SPIECE_UNDERLINE + 
u'f', u'al', u'se', - u'.']) - - def test_encode_text(self): - text_1 = u"He is very happy" - text_2 = u"unwanted, running" - - text_1_ids = self.tokenizer.map_text_to_id(text_1) - text_2_ids = self.tokenizer.map_text_to_id(text_2) - - cls_token_id = self.tokenizer.map_token_to_id(self.tokenizer.cls_token) - sep_token_id = self.tokenizer.map_token_to_id(self.tokenizer.sep_token) - - input_ids, segment_ids, input_mask = \ - self.tokenizer.encode_text(text_1, None, 4) - - self.assertListEqual(input_ids, - text_1_ids[:2] + [sep_token_id] + [cls_token_id]) - self.assertListEqual(segment_ids, [0, 0, 0, 2]) - self.assertListEqual(input_mask, [0, 0, 0, 0]) - - input_ids, segment_ids, input_mask = \ - self.tokenizer.encode_text(text_1, text_2, 7) - - self.assertListEqual(input_ids, text_1_ids[:2] + - [sep_token_id] + text_2_ids[:2] + [sep_token_id] + - [cls_token_id]) - self.assertListEqual(segment_ids, [0, 0, 0, 1, 1, 1, 2]) - self.assertListEqual(input_mask, [0, 0, 0, 0, 0, 0, 0]) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/data/vocabulary.py b/texar/tf/data/vocabulary.py index bf5828c2..f430427e 100644 --- a/texar/tf/data/vocabulary.py +++ b/texar/tf/data/vocabulary.py @@ -1,5 +1,4 @@ -# -*- coding: utf-8 -*- -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,22 +15,14 @@ Helper functions and classes for vocabulary processing. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - import warnings from collections import defaultdict import tensorflow as tf -from tensorflow import gfile import numpy as np from texar.tf.utils.utils import dict_lookup -# pylint: disable=too-few-public-methods, invalid-name -# pylint: disable=too-many-instance-attributes, too-many-arguments __all__ = [ "SpecialTokens", @@ -39,8 +30,8 @@ ] -class SpecialTokens(object): - """Special tokens, including :attr:`PAD`, :attr:`BOS`, :attr:`EOS`, +class SpecialTokens: + r"""Special tokens, including :attr:`PAD`, :attr:`BOS`, :attr:`EOS`, :attr:`UNK`. These tokens will by default have token ids 0, 1, 2, 3, respectively. """ @@ -51,7 +42,7 @@ class SpecialTokens(object): def _make_defaultdict(keys, values, default_value): - """Creates a python defaultdict. + r"""Creates a python `defaultdict`. Args: keys (list): Keys of the dictionary. @@ -69,8 +60,8 @@ def _make_defaultdict(keys, values, default_value): return dict_ -class Vocab(object): - """Vocabulary class that loads vocabulary from file, and maintains mapping +class Vocab: + r"""Vocabulary class that loads vocabulary from file, and maintains mapping tables between token strings and indexes. Each line of the vocab file should contains one vocabulary token, e.g.,:: @@ -104,12 +95,12 @@ def __init__(self, self._eos_token = eos_token self._unk_token = unk_token - self._id_to_token_map, self._token_to_id_map, \ - self._id_to_token_map_py, self._token_to_id_map_py = \ + (self._id_to_token_map, self._token_to_id_map, + self._id_to_token_map_py, self._token_to_id_map_py) = \ self.load(self._filename) def load(self, filename): - """Loads the vocabulary from the file. + r"""Loads the vocabulary from the file. Args: filename (str): Path to the vocabulary file. 
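The `Vocab` class being updated here maintains both TF lookup tables and plain-Python `defaultdict` maps; the `_py` variants need no TF session. A small usage sketch, assuming a one-token-per-line vocab file and the default special tokens, which take ids 0 through 3:

    import tempfile
    from texar.tf.data import Vocab

    with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as f:
        f.write('hello\nworld\n')

    vocab = Vocab(f.name)
    print(vocab.size)                                     # 6: 4 special tokens + 2 file tokens
    print(vocab.map_tokens_to_ids_py(['hello', 'oov']))   # OOV strings fall back to the UNK id
    print(vocab.map_ids_to_tokens_py([vocab.eos_token_id]))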
@@ -123,7 +114,7 @@ def load(self, filename): and :attr:`id_to_token_map_py` and :attr:`token_to_id_map_py` are python `defaultdict` instances. """ - with gfile.GFile(filename) as vocab_file: + with tf.io.gfile.GFile(filename) as vocab_file: # Converts to 'unicode' (Python 2) or 'str' (Python 3) vocab = list(tf.compat.as_text(line.strip()) for line in vocab_file) @@ -153,13 +144,13 @@ def load(self, filename): vocab_idx = np.arange(vocab_size) # Creates TF maps - id_to_token_map = tf.contrib.lookup.HashTable( - tf.contrib.lookup.KeyValueTensorInitializer( + id_to_token_map = tf.lookup.StaticHashTable( + tf.lookup.KeyValueTensorInitializer( vocab_idx, vocab, key_dtype=tf.int64, value_dtype=tf.string), self._unk_token) - token_to_id_map = tf.contrib.lookup.HashTable( - tf.contrib.lookup.KeyValueTensorInitializer( + token_to_id_map = tf.lookup.StaticHashTable( + tf.lookup.KeyValueTensorInitializer( vocab, vocab_idx, key_dtype=tf.string, value_dtype=tf.int64), unk_token_idx) @@ -173,7 +164,7 @@ def load(self, filename): id_to_token_map_py, token_to_id_map_py def map_ids_to_tokens(self, ids): - """Maps ids into text tokens. + r"""Maps ids into text tokens. The returned tokens are a Tensor. @@ -186,7 +177,7 @@ def map_ids_to_tokens(self, ids): return self.id_to_token_map.lookup(tf.cast(ids, tf.int64)) def map_tokens_to_ids(self, tokens): - """Maps text tokens into ids. + r"""Maps text tokens into ids. The returned ids are a Tensor. @@ -199,7 +190,7 @@ def map_tokens_to_ids(self, tokens): return self.token_to_id_map.lookup(tokens) def map_ids_to_tokens_py(self, ids): - """Maps ids into text tokens. + r"""Maps ids into text tokens. The input :attr:`ids` and returned tokens are both python arrays or list. @@ -213,7 +204,7 @@ def map_ids_to_tokens_py(self, ids): return dict_lookup(self.id_to_token_map_py, ids, self.unk_token) def map_tokens_to_ids_py(self, tokens): - """Maps text tokens into ids. + r"""Maps text tokens into ids. The input :attr:`tokens` and returned ids are both python arrays or list. @@ -228,92 +219,92 @@ def map_tokens_to_ids_py(self, tokens): @property def id_to_token_map(self): - """The :tf_main:`HashTable ` instance that + r"""The :tf_main:`HashTable ` instance that maps from token index to the string form. """ return self._id_to_token_map @property def token_to_id_map(self): - """The :tf_main:`HashTable ` instance + r"""The :tf_main:`HashTable ` instance that maps from token string to the index. """ return self._token_to_id_map @property def id_to_token_map_py(self): - """The python `defaultdict` instance that maps from token index to the + r"""The python `defaultdict` instance that maps from token index to the string form. """ return self._id_to_token_map_py @property def token_to_id_map_py(self): - """The python `defaultdict` instance that maps from token string to the + r"""The python `defaultdict` instance that maps from token string to the index. """ return self._token_to_id_map_py @property def size(self): - """The vocabulary size. + r"""The vocabulary size. """ return len(self.token_to_id_map_py) @property def bos_token(self): - """A string of the special token indicating the beginning of sequence. + r"""A string of the special token indicating the beginning of sequence. """ return self._bos_token @property def bos_token_id(self): - """The `int` index of the special token indicating the beginning + r"""The `int` index of the special token indicating the beginning of sequence. 
""" return self.token_to_id_map_py[self._bos_token] @property def eos_token(self): - """A string of the special token indicating the end of sequence. + r"""A string of the special token indicating the end of sequence. """ return self._eos_token @property def eos_token_id(self): - """The `int` index of the special token indicating the end + r"""The `int` index of the special token indicating the end of sequence. """ return self.token_to_id_map_py[self._eos_token] @property def unk_token(self): - """A string of the special token indicating unknown token. + r"""A string of the special token indicating unknown token. """ return self._unk_token @property def unk_token_id(self): - """The `int` index of the special token indicating unknown token. + r"""The `int` index of the special token indicating unknown token. """ return self.token_to_id_map_py[self._unk_token] @property def pad_token(self): - """A string of the special token indicating padding token. The + r"""A string of the special token indicating padding token. The default padding token is an empty string. """ return self._pad_token @property def pad_token_id(self): - """The `int` index of the special token indicating padding token. + r"""The `int` index of the special token indicating padding token. """ return self.token_to_id_map_py[self._pad_token] @property def special_tokens(self): - """The list of special tokens + r"""The list of special tokens [:attr:`pad_token`, :attr:`bos_token`, :attr:`eos_token`, :attr:`unk_token`]. """ diff --git a/texar/tf/data/vocabulary_test.py b/texar/tf/data/vocabulary_test.py index fb8dcbf9..83891efc 100644 --- a/texar/tf/data/vocabulary_test.py +++ b/texar/tf/data/vocabulary_test.py @@ -1,21 +1,12 @@ -# -*- coding: utf-8 -*- -# """ Unit tests for vocabulary related operations. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - import tempfile import tensorflow as tf from texar.tf.data import vocabulary -# pylint: disable=protected-access - class VocabularyTest(tf.test.TestCase): """Tests vocabulary related operations. diff --git a/texar/tf/evals/__init__.py b/texar/tf/evals/__init__.py deleted file mode 100644 index 87b324b3..00000000 --- a/texar/tf/evals/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Modules of texar library evals. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=wildcard-import - -from texar.tf.evals.bleu_moses import * -from texar.tf.evals.bleu import * -from texar.tf.evals.metrics import * diff --git a/texar/tf/evals/bleu.py b/texar/tf/evals/bleu.py deleted file mode 100644 index 9e9218af..00000000 --- a/texar/tf/evals/bleu.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright 2017 Google Inc. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Modifications copyright (C) 2018 Texar -# ============================================================================== -""" -Python implementation of BLEU and smoothed BLEU adapted from: - `https://github.com/tensorflow/nmt/blob/master/nmt/scripts/bleu.py` - -This module provides a Python implementation of BLEU and smoothed BLEU. -Smooth BLEU is computed following the method outlined in the paper: - - (Lin et al. 2004) ORANGE: a method for evaluating automatic evaluation - metrics for maching translation. - Chin-Yew Lin, Franz Josef Och. COLING 2004. -""" - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division -from __future__ import unicode_literals - -import collections -import math - -from texar.tf.utils.dtypes import compat_as_text, is_str - -# pylint: disable=invalid-name, too-many-branches, too-many-locals -# pylint: disable=too-many-arguments - -__all__ = [ - "sentence_bleu", - "corpus_bleu" -] - - -def _get_ngrams(segment, max_order): - """Extracts all n-grams up to a given maximum order from an input segment. - - Args: - segment: text segment from which n-grams will be extracted. - max_order: maximum length in tokens of the n-grams returned by this - methods. - - Returns: - The Counter containing all n-grams upto max_order in segment - with a count of how many times each n-gram occurred. - """ - ngram_counts = collections.Counter() - for order in range(1, max_order + 1): - for i in range(0, len(segment) - order + 1): - ngram = tuple(segment[i:i + order]) - ngram_counts[ngram] += 1 - return ngram_counts - - -def _maybe_str_to_list(list_or_str): - if is_str(list_or_str): - return list_or_str.split() - return list_or_str - - -def _lowercase(str_list): - return [str_.lower() for str_ in str_list] - - -def sentence_bleu(references, hypothesis, max_order=4, lowercase=False, - smooth=False, return_all=False): - """Calculates BLEU score of a hypothesis sentence. - - Args: - references: A list of reference for the hypothesis. - Each reference can be either a list of string tokens, or a string - containing tokenized tokens separated with whitespaces. - List can also be numpy array. - hypotheses: A hypothesis sentence. - Each hypothesis can be either a list of string tokens, or a - string containing tokenized tokens separated with whitespaces. - List can also be numpy array. - lowercase (bool): If `True`, lowercase reference and hypothesis tokens. - max_order (int): Maximum n-gram order to use when computing BLEU score. - smooth (bool): Whether or not to apply (Lin et al. 2004) smoothing. - return_all (bool): If `True`, returns BLEU and all n-gram precisions. - - Returns: - If :attr:`return_all` is `False` (default), returns a float32 - BLEU score. - - If :attr:`return_all` is `True`, returns a list of float32 scores: - `[BLEU] + n-gram precisions`, which is of length :attr:`max_order` + 1. 
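For a concrete picture of what `_get_ngrams` above returns, here is the same Counter logic run on a toy segment (standalone sketch, not part of the patch):

```python
import collections

def get_ngrams(segment, max_order):
    # Count every n-gram of order 1..max_order, as _get_ngrams does above.
    counts = collections.Counter()
    for order in range(1, max_order + 1):
        for i in range(len(segment) - order + 1):
            counts[tuple(segment[i:i + order])] += 1
    return counts

print(get_ngrams("the cat sat".split(), max_order=2))
# Counter({('the',): 1, ('cat',): 1, ('sat',): 1,
#          ('the', 'cat'): 1, ('cat', 'sat'): 1})
```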
- """ - return corpus_bleu( - [references], [hypothesis], max_order=max_order, lowercase=lowercase, - smooth=smooth, return_all=return_all) - - -def corpus_bleu(list_of_references, hypotheses, max_order=4, lowercase=False, - smooth=False, return_all=True): - """Computes corpus-level BLEU score. - - Args: - list_of_references: A list of lists of references for each hypothesis. - Each reference can be either a list of string tokens, or a string - containing tokenized tokens separated with whitespaces. - List can also be numpy array. - hypotheses: A list of hypothesis sentences. - Each hypothesis can be either a list of string tokens, or a - string containing tokenized tokens separated with whitespaces. - List can also be numpy array. - lowercase (bool): If `True`, lowercase reference and hypothesis tokens. - max_order (int): Maximum n-gram order to use when computing BLEU score. - smooth (bool): Whether or not to apply (Lin et al. 2004) smoothing. - return_all (bool): If `True`, returns BLEU and all n-gram precisions. - - Returns: - If :attr:`return_all` is `False` (default), returns a float32 - BLEU score. - - If :attr:`return_all` is `True`, returns a list of float32 scores: - `[BLEU] + n-gram precisions`, which is of length :attr:`max_order` + 1. - """ - list_of_references = compat_as_text(list_of_references) - hypotheses = compat_as_text(hypotheses) - - matches_by_order = [0] * max_order - possible_matches_by_order = [0] * max_order - reference_length = 0 - hyperthsis_length = 0 - for (references, hyperthsis) in zip(list_of_references, hypotheses): - reference_length += min(len(r) for r in references) - hyperthsis_length += len(hyperthsis) - - merged_ref_ngram_counts = collections.Counter() - for reference in references: - reference = _maybe_str_to_list(reference) - if lowercase: - reference = _lowercase(reference) - merged_ref_ngram_counts |= _get_ngrams(reference, max_order) - - hyperthsis = _maybe_str_to_list(hyperthsis) - if lowercase: - hyperthsis = _lowercase(hyperthsis) - hyperthsis_ngram_counts = _get_ngrams(hyperthsis, max_order) - - overlap = hyperthsis_ngram_counts & merged_ref_ngram_counts - for ngram in overlap: - matches_by_order[len(ngram) - 1] += overlap[ngram] - for order in range(1, max_order + 1): - possible_matches = len(hyperthsis) - order + 1 - if possible_matches > 0: - possible_matches_by_order[order - 1] += possible_matches - - precisions = [0] * max_order - for i in range(0, max_order): - if smooth: - precisions[i] = ((matches_by_order[i] + 1.) / - (possible_matches_by_order[i] + 1.)) - else: - if possible_matches_by_order[i] > 0: - precisions[i] = (float(matches_by_order[i]) / - possible_matches_by_order[i]) - else: - precisions[i] = 0.0 - - if min(precisions) > 0: - p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions) - geo_mean = math.exp(p_log_sum) - else: - geo_mean = 0 - - ratio = float(hyperthsis_length) / reference_length - - if ratio > 1.0: - bp = 1. - else: - try: - bp = math.exp(1 - 1. / ratio) - except ZeroDivisionError: - bp = math.exp(1 - 1. / (ratio + 1e-8)) - - bleu = geo_mean * bp - - if return_all: - return [bleu * 100] + [p * 100 for p in precisions] - else: - return bleu * 100 diff --git a/texar/tf/evals/bleu_moses.py b/texar/tf/evals/bleu_moses.py deleted file mode 100644 index 9d8f280e..00000000 --- a/texar/tf/evals/bleu_moses.py +++ /dev/null @@ -1,169 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2018 The Texar Authors. All Rights Reserved. 
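Written out, the arithmetic that the deleted `corpus_bleu` performs is the standard BLEU formula (names follow the code above; with `smooth=True` each precision instead becomes `(matches + 1) / (possible_matches + 1)`):

```latex
% p_n : clipped n-gram precision,  N = max_order,
% c   : total hypothesis length,   r : effective reference length.
\mathrm{BLEU} = \mathrm{BP}\cdot\exp\Bigl(\frac{1}{N}\sum_{n=1}^{N}\log p_n\Bigr),
\qquad
\mathrm{BP} =
\begin{cases}
  1           & c > r \\
  e^{\,1-r/c} & c \le r
\end{cases}
```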
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -The BLEU metric. -""" - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division -from __future__ import unicode_literals - -import os -from io import open # pylint: disable=redefined-builtin -import shutil -import re -import subprocess -import tempfile -import numpy as np - -import tensorflow as tf - -from texar.tf.utils.dtypes import compat_as_text - -# pylint: disable=too-many-locals, no-member, redefined-variable-type - -__all__ = [ - "sentence_bleu_moses", - "corpus_bleu_moses" -] - - -def _maybe_list_to_str(list_or_str): - if isinstance(list_or_str, (tuple, list, np.ndarray)): - return ' '.join(list_or_str) - return list_or_str - - -def _parse_multi_bleu_ret(bleu_str, return_all=False): - bleu_score = re.search(r"BLEU = (.+?),", bleu_str).group(1) - bleu_score = np.float32(bleu_score) - - if return_all: - bleus = re.search(r", (.+?)/(.+?)/(.+?)/(.+?) ", bleu_str) - bleus = [bleus.group(group_idx) for group_idx in range(1, 5)] - bleus = [np.float32(b) for b in bleus] - bleu_score = [bleu_score] + bleus - - return bleu_score - - -def sentence_bleu_moses(references, hypothesis, lowercase=False, - return_all=False): - """Calculates BLEU score of a hypothesis sentence using the - **MOSES multi-bleu.perl** script. - - Args: - references: A list of reference for the hypothesis. - Each reference can be either a string, or a list of string tokens. - List can also be numpy array. - hypotheses: A hypothesis sentence. - The hypothesis can be either a string, or a list of string tokens. - List can also be numpy array. - lowercase (bool): If `True`, pass the "-lc" flag to the multi-bleu - script. - return_all (bool): If `True`, returns BLEU and all n-gram precisions. - - Returns: - If :attr:`return_all` is `False` (default), returns a float32 - BLEU score. - - If :attr:`return_all` is `True`, returns a list of 5 float32 scores: - `[BLEU, 1-gram precision, ..., 4-gram precision]`. - """ - return corpus_bleu_moses( - [references], [hypothesis], lowercase=lowercase, return_all=return_all) - - -def corpus_bleu_moses(list_of_references, hypotheses, lowercase=False, - return_all=False): - """Calculates corpus-level BLEU score using the - **MOSES multi-bleu.perl** script. - - Args: - list_of_references: A list of lists of references for each hypothesis. - Each reference can be either a string, or a list of string tokens. - List can also be numpy array. - hypotheses: A list of hypothesis sentences. - Each hyperthsis can be either a string, or a list of string tokens. - List can also be numpy array. - lowercase (bool): If `True`, pass the "-lc" flag to the multi-bleu - script. - return_all (bool): If `True`, returns BLEU and all n-gram precisions. - - Returns: - If :attr:`return_all` is `False` (default), returns a float32 - BLEU score. - - If :attr:`return_all` is `True`, returns a list of 5 float32 scores: - `[BLEU, 1-gram precision, ..., 4-gram precision]`. 
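The regular expressions in `_parse_multi_bleu_ret` above pick the overall score and the four n-gram precisions out of the script's one-line report. A standalone sketch on an illustrative output string (the numbers are made up):

```python
import re
import numpy as np

# Shape of a typical multi-bleu.perl report (values invented):
bleu_str = ("BLEU = 26.30, 58.1/32.3/19.2/11.7 "
            "(BP=1.000, ratio=1.012, hyp_len=1234, ref_len=1220)")

# Same regexes as _parse_multi_bleu_ret above.
bleu_score = np.float32(re.search(r"BLEU = (.+?),", bleu_str).group(1))
ngrams = re.search(r", (.+?)/(.+?)/(.+?)/(.+?) ", bleu_str)
precisions = [np.float32(ngrams.group(i)) for i in range(1, 5)]

print(bleu_score)   # 26.3
print(precisions)   # [58.1, 32.3, 19.2, 11.7]
```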
- """ - list_of_references = compat_as_text(list_of_references) - hypotheses = compat_as_text(hypotheses) - - if np.size(hypotheses) == 0: - return np.float32(0.) # pylint: disable=no-member - - # Get multi-bleu.perl - cur_dir = os.path.dirname(os.path.realpath(__file__)) - multi_bleu_path = os.path.abspath( - os.path.join(cur_dir, "..", "..", "..", "bin", "utils", - "multi-bleu.perl")) - - # Create a temporary folder containing hyperthesis and reference files - result_path = tempfile.mkdtemp() - # Create hyperthesis file - hfile_path = os.path.join(result_path, 'hyp') - hyps = [_maybe_list_to_str(h) for h in hypotheses] - with open(hfile_path, 'w', encoding='utf-8') as hfile: - text = "\n".join(hyps) - hfile.write(text) - hfile.write("\n") - # Create reference files - max_nrefs = max([len(refs) for refs in list_of_references]) - rfile_path = os.path.join(result_path, 'ref') - for rid in range(max_nrefs): - with open(rfile_path + '%d' % rid, 'w', encoding='utf-8') as rfile: - for refs in list_of_references: - if rid < len(refs): - ref = _maybe_list_to_str(refs[rid]) - rfile.write(ref + "\n") - else: - rfile.write("\n") - - # Calculate BLEU - multi_bleu_cmd = [multi_bleu_path] - if lowercase: - multi_bleu_cmd += ["-lc"] - multi_bleu_cmd += [rfile_path] - with open(hfile_path, "r") as hyp_input: - try: - multi_bleu_ret = subprocess.check_output( - multi_bleu_cmd, stdin=hyp_input, stderr=subprocess.STDOUT) - multi_bleu_ret = multi_bleu_ret.decode("utf-8") - bleu_score = _parse_multi_bleu_ret(multi_bleu_ret, return_all) - except subprocess.CalledProcessError as error: - if error.output is not None: - tf.logging.warning( - "multi-bleu.perl returned non-zero exit code") - tf.logging.warning(error.output) - if return_all: - bleu_score = [np.float32(0.0)] * 5 - else: - bleu_score = np.float32(0.0) - - shutil.rmtree(result_path) - - return np.float32(bleu_score) diff --git a/texar/tf/evals/bleu_test.py b/texar/tf/evals/bleu_test.py deleted file mode 100644 index 9df84ec2..00000000 --- a/texar/tf/evals/bleu_test.py +++ /dev/null @@ -1,122 +0,0 @@ -# -*- coding: utf-8 -*- -# -""" -Unit tests for bleu. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import numpy as np - -import tensorflow as tf - -from texar.tf.evals.bleu_moses import sentence_bleu_moses, corpus_bleu_moses -from texar.tf.evals.bleu import sentence_bleu, corpus_bleu - -# pylint: disable=too-many-locals, too-many-arguments - - -class BLEUTest(tf.test.TestCase): - """Tests the bleu functions. - """ - - def _test_sentence_bleu(self, references, hypothesis, lowercase, - true_bleu): - bleu = sentence_bleu_moses(references=references, - hypothesis=hypothesis, - lowercase=lowercase) - self.assertAlmostEqual(bleu, true_bleu, places=2) - - bleu = sentence_bleu(references=references, - hypothesis=hypothesis, - lowercase=lowercase) - self.assertAlmostEqual(bleu, true_bleu, places=0) - - def test_sentence_strings(self): - """Tests hypothesis as strings. - """ - hypothesis = \ - "this is a test sentence to evaluate the good bleu score . 词" - references = ["this is a test sentence to evaluate the bleu score ."] - self._test_sentence_bleu( - references, hypothesis, lowercase=False, true_bleu=67.03) - - def test_sentence_list(self): - """Tests hypothesis as a list of tokens. - """ - hypothesis = \ - "this is a test sentence to evaluate the good bleu score . 
词" - hypothesis = hypothesis.split() - references = ["this is a test sentence to evaluate the bleu score ."] - references = [references[0].split()] - self._test_sentence_bleu( - references, hypothesis, lowercase=False, true_bleu=67.03) - - def test_sentence_multi_references(self): - """Tests multiple references. - """ - hypothesis = \ - "this is a test sentence to evaluate the good bleu score . 词" - references = ["this is a test sentence to evaluate the bleu score .", - "this is a test sentence to evaluate the good score ."] - self._test_sentence_bleu( - references, hypothesis, lowercase=False, true_bleu=76.12) - - def test_sentence_numpy(self): - """Tests with numpy format. - """ - hypothesis = \ - "this is a test sentence to evaluate the good bleu score . 词" - hypothesis = np.array(hypothesis.split()) - references = ["this is a test sentence to evaluate the bleu score .", - "this is a test sentence to evaluate the good score ."] - references = np.array([np.array(r.split()) for r in references]) - self._test_sentence_bleu( - references, hypothesis, lowercase=False, true_bleu=76.12) - - def _test_corpus_bleu(self, list_of_references, hypotheses, lowercase, - return_all, true_bleu): - bleu = corpus_bleu_moses(list_of_references=list_of_references, - hypotheses=hypotheses, - lowercase=lowercase, - return_all=return_all) - if not return_all: - self.assertAlmostEqual(bleu, true_bleu, places=2) - else: - for ret, true in zip(bleu, true_bleu): - self.assertAlmostEqual(ret, true, places=2) - - bleu = corpus_bleu(list_of_references=list_of_references, - hypotheses=hypotheses, - lowercase=lowercase, - return_all=return_all) - if not return_all: - self.assertAlmostEqual(bleu, true_bleu, places=0) - else: - for ret, true in zip(bleu, true_bleu): - self.assertAlmostEqual(ret, true, places=0) - - def test_corpus_strings(self): - """Tests corpus level BLEU. - """ - hypotheses = [ - "this is a test sentence to evaluate the good bleu score . 词", - "i believe that that the script is 词 perfectly correct ." - ] - list_of_references = [ - ["this is a test sentence to evaluate the bleu score .", - "this is a test sentence to evaluate the good score ."], - ["i believe that the script is perfectly correct .".split()] - ] - self._test_corpus_bleu(list_of_references, hypotheses, - False, False, 63.02) - - self._test_corpus_bleu(list_of_references, hypotheses, - False, True, [63.02, 87.5, 77.3, 60.0, 38.9]) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/evals/metrics.py b/texar/tf/evals/metrics.py deleted file mode 100644 index 2852ea0b..00000000 --- a/texar/tf/evals/metrics.py +++ /dev/null @@ -1,52 +0,0 @@ -""" -Various metrics. -""" - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division -from __future__ import unicode_literals - -import tensorflow as tf - -__all__ = [ - "accuracy", - "binary_clas_accuracy" -] - - -def accuracy(labels, preds): - """Calculates the accuracy of predictions. - - Args: - labels: The ground truth values. A Tensor of the same shape of - :attr:`preds`. - preds: A Tensor of any shape containing the predicted values. - - Returns: - A float scalar Tensor containing the accuracy. - """ - labels = tf.cast(labels, preds.dtype) - return tf.reduce_mean(tf.cast(tf.equal(preds, labels), tf.float32)) - - -def binary_clas_accuracy(pos_preds=None, neg_preds=None): - """Calculates the accuracy of binary predictions. 
-
-    Args:
-        pos_preds (optional): A Tensor of any shape containing the
-            predicted values on positive data (i.e., ground truth labels are
-            `1`).
-        neg_preds (optional): A Tensor of any shape containing the
-            predicted values on negative data (i.e., ground truth labels are
-            `0`).
-
-    Returns:
-        A float scalar Tensor containing the accuracy.
-    """
-    pos_accu = accuracy(tf.ones_like(pos_preds), pos_preds)
-    neg_accu = accuracy(tf.zeros_like(neg_preds), neg_preds)
-    psize = tf.cast(tf.size(pos_preds), tf.float32)
-    nsize = tf.cast(tf.size(neg_preds), tf.float32)
-    accu = (pos_accu * psize + neg_accu * nsize) / (psize + nsize)
-    return accu
diff --git a/texar/tf/hyperparams.py b/texar/tf/hyperparams.py
index 127ebede..4249c9e4 100644
--- a/texar/tf/hyperparams.py
+++ b/texar/tf/hyperparams.py
@@ -1,4 +1,4 @@
-# Copyright 2018 The Texar Authors. All Rights Reserved.
+# Copyright 2019 The Texar Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,16 +15,12 @@
 Hyperparameter manager
 """
 
-from __future__ import absolute_import
-from __future__ import print_function
-from __future__ import division
-
 import copy
 import json
 
 __all__ = [
-    "HParams"
+    "HParams",
 ]
 
 
@@ -32,17 +28,16 @@ def _type_name(value):
     return type(value).__name__
 
 
-class HParams(object):
-    """A class that maintains hyperparameters for configing Texar modules.
+class HParams:
+    r"""A class that maintains hyperparameters for configuring Texar modules.
     The class has several useful features:
 
-    - **Auto-completion of missing values.** Users can specify only a subset of \
-    hyperparameters they care about. Other hyperparameters will automatically \
-    take the default values. The auto-completion performs **recursively** so \
-    that hyperparameters taking `dict` values will also be auto-completed \
-    **All Texar modules** provide a \
-    :meth:`default_hparams` containing allowed hyperparameters and their \
-    default values. For example
+    - **Auto-completion of missing values.** Users can specify only a subset of
+      hyperparameters they care about. Other hyperparameters will automatically
+      take the default values. The auto-completion performs **recursively** so
+      that hyperparameters taking `dict` values will also be auto-completed.
+      **All Texar modules** provide a :meth:`default_hparams` containing
+      allowed hyperparameters and their default values. For example:
 
     .. code-block:: python
 
@@ -62,30 +57,33 @@ class HParams(object):
             ...
         }
 
-    - **Automatic typecheck.** For most hyperparameters, provided value must \
-    have the same or compatible dtype with the default value. HParams does \
-    necessary typecheck, and raises Error if improper dtype is provided. \
-    Also, hyperparameters not listed in `default_hparams` are not allowed, \
-    except for "kwargs" as detailed below.
+    - **Automatic typecheck.** For most hyperparameters, the provided value
+      must have the same or a compatible dtype with the default value.
+      :class:`HParams` does the necessary type-check, and raises an error if an
+      improper dtype is provided. Also, hyperparameters not listed in
+      `default_hparams` are not allowed, except for `"kwargs"` as detailed
+      below.
 
-    - **Flexible dtype for specified hyperparameters.** Some hyperparameters\
+    - **Flexible dtype for specified hyperparameters.** Some hyperparameters
      may allow different dtypes of values.
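To make the auto-completion, type-check, and `"kwargs"` special-casing described in this docstring concrete, a small self-contained sketch (the hyperparameter names and defaults here are invented):

```python
from texar.tf.hyperparams import HParams

default_hparams = {
    "type": "LSTMCell",
    "kwargs": {"num_units": 256, "forget_bias": 1.0},
}

# Specify only what we care about; the rest is filled in recursively.
h = HParams({"kwargs": {"num_units": 123}}, default_hparams)
print(h.type)                # 'LSTMCell'  (auto-completed)
print(h.kwargs.num_units)    # 123         (overridden)
print(h.kwargs.forget_bias)  # 1.0         (auto-completed)

# Names absent from the defaults are rejected unless allow_new_hparam=True
# (or they live inside "kwargs"):
# HParams({"typo_name": 1}, default_hparams)   # raises ValueError

# If "type" is overridden, the default "kwargs" no longer applies and an
# unspecified "kwargs" becomes an empty dict, per the comment block below.
h2 = HParams({"type": "GRUCell"}, default_hparams)
print(h2.kwargs.todict())    # {}
```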
-    - Hyperparameters named "type" are not typechecked.\
-      For example, in :func:`~texar.tf.core.get_rnn_cell`, hyperparameter \
-      `"type"` can take value of an RNNCell class, its string name of module \
-      path, or an RNNCell class instance. (String name or module path is \
-      allowd so that users can specify the value in YAML config files.)
-
-    - For other hyperparameters, list them\
-      in the "@no_typecheck" field in `default_hparams` to skip typecheck. \
-      For example, in :func:`~texar.tf.core.get_rnn_cell`, hyperparameter \
-      "*_keep_prob" can be set to either a `float` or a `tf.placeholder`.
-
-    - **Special flexibility of keyword argument hyparameters.** \
-      Hyperparameters named "kwargs" are used as keyword arguments for a class\
-      constructor or a function call. Such hyperparameters take a `dict`, and \
-      users can add arbitrary valid keyword arguments to the dict. For example:
+
+        - Hyperparameters named `"type"` are not type-checked.
+          For example, in :func:`~texar.tf.core.get_rnn_cell`, hyperparameter
+          `"type"` can take value of an RNNCell class, its string name of module
+          path, or an RNNCell class instance. (String name or module path is
+          allowed so that users can specify the value in YAML configuration
+          files.)
+
+        - For other hyperparameters, list them in the `"@no_typecheck"` field
+          in :meth:`default_hparams` to skip type-check. For example, in
+          :class:`~texar.tf.modules.Conv1DNetwork`, hyperparameter
+          `"kernel_size"` can be set to either a `list` of `int`\ s or simply
+          an `int`.
+
+    - **Special flexibility of keyword argument hyperparameters.**
+      Hyperparameters named ``"kwargs"`` are used as keyword arguments for a
+      class constructor or a function call. Such hyperparameters take a `dict`,
+      and users can add arbitrary valid keyword arguments to the dict.
+      For example:
 
     .. code-block:: python
 
@@ -98,14 +96,15 @@ class HParams(object):
             my_hparams = {
                 "kwargs" {
                     "num_units": 123,
-                    "forget_bias": 0.0         # Other valid keyword arguments
-                    "activation": "tf.nn.relu" # for LSTMCell constructor
+                    # Other valid keyword arguments for LSTMCell constructor
+                    "forget_bias": 0.0
+                    "activation": "tf.nn.relu"
                 }
             }
             _ = HParams(my_hparams, default_rnn_cell_hparams)
 
-    - **Rich interfaces.** An HParams instance provides rich interfaces for\
-    accessing, updating, or adding hyperparameters.
+    - **Rich interfaces.** An :class:`HParams` instance provides rich interfaces
+      for accessing, updating, or adding hyperparameters.
 
     .. code-block:: python
 
@@ -130,8 +129,9 @@ class HParams(object):
 
     Args:
-        hparams: A `dict` or an `HParams` instance containing hyperparameters.
-            If `None`, all hyperparameters are set to default values.
+        hparams: A `dict` or an :class:`HParams` instance containing
+            hyperparameters. If `None`, all hyperparameters are set to default
+            values.
         default_hparams (dict): Hyperparameters with default values. If
             `None`, Hyperparameters are fully defined by :attr:`hparams`.
         allow_new_hparam (bool): If `False` (default), :attr:`hparams` cannot
@@ -139,15 +139,15 @@ class HParams(object):
             :attr:`"kwargs"` as above.
     """
-    # - The default hyperparameters in :attr:`"kwargs"` are used (for typecheck\
-    # and complementing missing hyperparameters) only when :attr:`"type"` \
-    # takes default value (i.e., missing in :attr:`hparams` or set to \
-    # the same value with the default). In this case :attr:`kwargs` allows to \
-    # contain new keys not included in :attr:`default_hparams["kwargs"]`.
+    # - The default hyperparameters in :attr:`"kwargs"` are used (for type-check
+    #   and complementing missing hyperparameters) only when :attr:`"type"`
+    #   takes default value (i.e., missing in :attr:`hparams` or set to
+    #   the same value with the default). In this case :attr:`kwargs` allows to
+    #   contain new keys not included in :attr:`default_hparams["kwargs"]`.
     #
-    # - If :attr:`"type"` is set to an other \
-    # value and :attr:`"kwargs"` is missing in :attr:`hparams`, \
-    # :attr:`"kwargs"` is set to an empty dictionary.
+    # - If :attr:`"type"` is set to another value and :attr:`"kwargs"` is
+    #   missing in :attr:`hparams`, :attr:`"kwargs"` is set to an empty
+    #   dictionary.
 
     def __init__(self, hparams, default_hparams, allow_new_hparam=False):
         if isinstance(hparams, HParams):
@@ -157,19 +157,17 @@ def __init__(self, hparams, default_hparams, allow_new_hparam=False):
                 hparams, default_hparams, allow_new_hparam)
         else:
             parsed_hparams = self._parse(hparams, hparams)
-        super(HParams, self).__setattr__('_hparams', parsed_hparams)
+        super().__setattr__('_hparams', parsed_hparams)
 
     @staticmethod
-    def _parse(hparams,
-               default_hparams,
-               allow_new_hparam=False):
-        """Parses hyperparameters.
+    def _parse(hparams, default_hparams, allow_new_hparam=False):
+        r"""Parses hyperparameters.
 
         Args:
             hparams (dict): Hyperparameters. If `None`, all hyperparameters
                 are set to default values.
             default_hparams (dict): Hyperparameters with default values.
-                If `None`,Hyperparameters are fully defined by :attr:`hparams`.
+                If `None`, hyperparameters are fully defined by :attr:`hparams`.
             allow_new_hparam (bool): If `False` (default), :attr:`hparams` cannot
                 contain hyperparameters that are not included in
                 :attr:`default_hparams`, except the case of :attr:`"kwargs"`.
@@ -213,8 +211,6 @@ def _parse(hparams,
             else:
                 parsed_hparams[name] = HParams(value, value)
 
-        from texar.tf.utils.dtypes import is_callable
-
         # Parse hparams
         for name, value in hparams.items():
             if name not in default_hparams:
@@ -228,8 +224,8 @@ def _parse(hparams,
                     "entries undefined in default hyperparameters." % name)
 
             if value is None:
-                parsed_hparams[name] = \
-                    HParams._parse_value(parsed_hparams[name])
+                parsed_hparams[name] = HParams._parse_value(
+                    parsed_hparams[name])
 
             default_value = default_hparams[name]
             if default_value is None:
@@ -238,8 +234,8 @@ def _parse(hparams,
             # Parse recursively for params of type dictionary.
             if isinstance(value, dict):
-                if name not in no_typecheck_names \
-                        and not isinstance(default_value, dict):
+                if name not in no_typecheck_names and \
+                        not isinstance(default_value, dict):
                     raise ValueError(
                         "Hyperparameter '%s' must have type %s, got %s" %
                         (name, _type_name(default_value), _type_name(value)))
@@ -271,7 +267,7 @@ def _parse(hparams,
                 parsed_hparams[name] = value
             elif isinstance(value, type(default_value)):
                 parsed_hparams[name] = value
-            elif is_callable(value) and is_callable(default_value):
+            elif callable(value) and callable(default_value):
                 parsed_hparams[name] = value
             else:
                 try:
@@ -291,22 +287,22 @@ def _parse_value(value, name=None):
         return value
 
     def __getattr__(self, name):
-        """Retrieves the value of the hyperparameter.
+        r"""Retrieves the value of the hyperparameter.
""" if name == '_hparams': - return super(HParams, self).__getattribute__('_hparams') + return super().__getattribute__('_hparams') if name not in self._hparams: # Raise AttributeError to allow copy.deepcopy, etc raise AttributeError("Unknown hyperparameter: %s" % name) return self._hparams[name] def __getitem__(self, name): - """Retrieves the value of the hyperparameter. + r"""Retrieves the value of the hyperparameter. """ return self.__getattr__(name) def __setattr__(self, name, value): - """Sets the value of the hyperparameter. + r"""Sets the value of the hyperparameter. """ if name not in self._hparams: raise ValueError( @@ -316,12 +312,12 @@ def __setattr__(self, name, value): self._hparams[name] = self._parse_value(value, name) def items(self): - """Returns the list of hyperparam `(name, value)` pairs + r"""Returns the list of hyperparameter `(name, value)` pairs. """ - return iter(self) + return self._hparams.items() def keys(self): - """Returns the list of hyperparam names + r"""Returns the list of hyperparameter names. """ return self._hparams.keys() @@ -336,13 +332,13 @@ def __contains__(self, name): return name in self._hparams def __str__(self): - """Return a string of the hparams. + r"""Return a string of the hyperparameters. """ hparams_dict = self.todict() return json.dumps(hparams_dict, sort_keys=True, indent=2) def get(self, name, default=None): - """Returns the hyperparameter value for the given name. If name is not + r"""Returns the hyperparameter value for the given name. If name is not available then returns :attr:`default`. Args: @@ -355,14 +351,14 @@ def get(self, name, default=None): return default def add_hparam(self, name, value): - """Adds a new hyperparameter. + r"""Adds a new hyperparameter. """ if (name in self._hparams) or hasattr(self, name): raise ValueError("Hyperparameter name already exists: %s" % name) self._hparams[name] = self._parse_value(value, name) def todict(self): - """Returns a copy of hyperparameters as a dictionary. + r"""Returns a copy of hyperparameters as a dictionary. """ dict_ = copy.deepcopy(self._hparams) for name, value in self._hparams.items(): diff --git a/texar/tf/hyperparams_test.py b/texar/tf/hyperparams_test.py index e233fc88..061c4a93 100644 --- a/texar/tf/hyperparams_test.py +++ b/texar/tf/hyperparams_test.py @@ -2,27 +2,21 @@ Unit tests of :class:`HParams`. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - import copy import pickle - import tempfile + import tensorflow as tf from texar.tf.hyperparams import HParams -# pylint: disable=no-member - class HParamsTest(tf.test.TestCase): """Tests hyperparameter related operations. """ def test_hparams(self): - """Tests the HParams class. + r"""Tests the HParams class. """ default_hparams = { "str": "str", @@ -97,7 +91,7 @@ def test_hparams(self): self.assertEqual(hparams_loaded.todict(), hparams_.todict()) def test_typecheck(self): - """Tests type-check functionality. + r"""Tests type-check functionality. """ def _foo(): pass @@ -117,7 +111,7 @@ def _bar(): self.assertEqual(hparams_.fn, default_hparams["fn"]) def test_type_kwargs(self): - """The the special cases involving "type" and "kwargs" + r"""The the special cases involving "type" and "kwargs" hyperparameters. """ default_hparams = { diff --git a/texar/tf/losses/__init__.py b/texar/tf/losses/__init__.py deleted file mode 100644 index bd08232b..00000000 --- a/texar/tf/losses/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright 2018 The Texar Authors. 
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Modules of texar losses. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=wildcard-import - -from texar.tf.losses.losses_utils import * -from texar.tf.losses.mle_losses import * -from texar.tf.losses.pg_losses import * -from texar.tf.losses.adv_losses import * -from texar.tf.losses.rewards import * -from texar.tf.losses.entropy import * diff --git a/texar/tf/losses/adv_losses.py b/texar/tf/losses/adv_losses.py deleted file mode 100644 index ca860312..00000000 --- a/texar/tf/losses/adv_losses.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Adversarial losses. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - - -def binary_adversarial_losses(real_data, - fake_data, - discriminator_fn, - mode="max_real"): - """Computes adversarial losses of real/fake binary discrimination game. - - .. role:: python(code) - :language: python - - Args: - real_data (Tensor or array): Real data of shape - `[num_real_examples, ...]`. - fake_data (Tensor or array): Fake data of shape - `[num_fake_examples, ...]`. `num_real_examples` does not - necessarily equal `num_fake_examples`. - discriminator_fn: A callable takes data (e.g., :attr:`real_data` and - :attr:`fake_data`) and returns the logits of being real. The - signature of `discriminator_fn` must be: - :python:`logits, ... = discriminator_fn(data)`. - The return value of `discriminator_fn` can be the logits, or - a tuple where the logits are the first element. - - mode (str): Mode of the generator loss. Either "max_real" or "min_fake". - - - **"max_real"** (default): minimizing the generator loss is to\ - maximize the probability of fake data being classified as real. - - - **"min_fake"**: minimizing the generator loss is to minimize the\ - probability of fake data being classified as fake. - - Returns: - A tuple `(generator_loss, discriminator_loss)` each of which is - a scalar Tensor, loss to be minimized. 
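A brief usage sketch of the helper being removed here, mirroring the constant-logit dummy discriminator used by its unit test further below (this is the API as it existed before this patch):

```python
import tensorflow as tf
from texar.tf.losses.adv_losses import binary_adversarial_losses  # removed by this patch

real_data = tf.zeros([16, 64], dtype=tf.float32)
fake_data = tf.ones([16, 64], dtype=tf.float32)
const_logits = tf.zeros([16], dtype=tf.float32)  # discriminator is maximally unsure

# "max_real" (default): the generator maximizes P(fake classified as real).
gen_loss, disc_loss = binary_adversarial_losses(
    real_data, fake_data, lambda x: const_logits)

# "min_fake": the generator minimizes P(fake classified as fake); with a
# constant discriminator the two generator losses differ only in sign.
gen_loss_2, _ = binary_adversarial_losses(
    real_data, fake_data, lambda x: const_logits, mode="min_fake")
```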
- """ - real_logits = discriminator_fn(real_data) - if isinstance(real_logits, (list, tuple)): - real_logits = real_logits[0] - real_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( - logits=real_logits, labels=tf.ones_like(real_logits))) - - fake_logits = discriminator_fn(fake_data) - if isinstance(fake_logits, (list, tuple)): - fake_logits = fake_logits[0] - fake_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( - logits=fake_logits, labels=tf.zeros_like(fake_logits))) - - d_loss = real_loss + fake_loss - - if mode == "min_fake": - g_loss = - fake_loss - elif mode == "max_real": - g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( - logits=fake_logits, labels=tf.ones_like(fake_logits))) - else: - raise ValueError("Unknown mode: %s. Only 'min_fake' and 'max_real' " - "are allowed.") - - return g_loss, d_loss diff --git a/texar/tf/losses/adv_losses_test.py b/texar/tf/losses/adv_losses_test.py deleted file mode 100644 index 7e76c3e1..00000000 --- a/texar/tf/losses/adv_losses_test.py +++ /dev/null @@ -1,40 +0,0 @@ -# -""" -Tests adversarial loss related functions. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import tensorflow as tf - -from texar.tf.losses.adv_losses import binary_adversarial_losses - - -class AdvLossesTest(tf.test.TestCase): - """Tests adversarial losses. - """ - def test_binary_adversarial_losses(self): - """Tests :meth:`~texar.tf.losses.adv_losses.binary_adversarial_losse`. - """ - batch_size = 16 - data_dim = 64 - real_data = tf.zeros([batch_size, data_dim], dtype=tf.float32) - fake_data = tf.ones([batch_size, data_dim], dtype=tf.float32) - const_logits = tf.zeros([batch_size], dtype=tf.float32) - # Use a dumb discriminator that always outputs logits=0. - gen_loss, disc_loss = binary_adversarial_losses( - real_data, fake_data, lambda x: const_logits) - gen_loss_2, disc_loss_2 = binary_adversarial_losses( - real_data, fake_data, lambda x: const_logits, mode="min_fake") - - with self.test_session() as sess: - gen_loss_, disc_loss_ = sess.run([gen_loss, disc_loss]) - gen_loss_2_, disc_loss_2_ = sess.run([gen_loss_2, disc_loss_2]) - self.assertAlmostEqual(gen_loss_, -gen_loss_2_) - self.assertAlmostEqual(disc_loss_, disc_loss_2_) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/losses/entropy.py b/texar/tf/losses/entropy.py deleted file mode 100644 index 9a2bf1ca..00000000 --- a/texar/tf/losses/entropy.py +++ /dev/null @@ -1,205 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Various entropies. 
-""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from texar.tf.losses.losses_utils import mask_and_reduce, reduce_dimensions -from texar.tf.utils.shapes import get_rank - -# pylint: disable=too-many-arguments - -__all__ = [ - "entropy_with_logits", - "sequence_entropy_with_logits" -] - - -def _get_entropy(logits): - probs = tf.nn.softmax(logits) + 1e-8 - entropy = - probs * tf.log(probs) - entropy = tf.reduce_sum(entropy, -1) - return entropy - - -def entropy_with_logits(logits, - rank=None, - average_across_batch=True, - average_across_remaining=False, - sum_over_batch=False, - sum_over_remaining=True): - """Shannon entropy given logits. - - Args: - logits: Unscaled log probabilities of shape - `[batch_size, d_2, ..., d_{rank-1}, distribution_dim]` - and of dtype `float32` or `float64`. - - The rank of the tensor is optionally specified by the argument - :attr:`rank`. - - The tensor is considered as having `[batch_size, .., d_{rank-1}]` - elements, each of which has a distribution of length `d_rank` - (i.e., `distribution_dim`). So the last dimension is always - summed out to compute the entropy. - rank (int, optional): The rank of :attr:`logits`. - If `None` (default), `rank` is inferred automatically from - `logits`. If the inference fails, `rank` is - set to 2, i.e., assuming :attr:`logits` is of shape - `[batch_size, distribution_dim]` - average_across_batch (bool): If set, average the entropy across the - batch dimension. Must not set `average_across_batch`' - and `sum_over_batch` at the same time. - average_across_remaining (bool): If set, average the entropy across the - remaining dimensions. Must not set `average_across_remaining`' - and `sum_over_remaining` at the same time. - Used only when :attr:`logits` has rank >= 3. - sum_over_batch (bool): If set, sum the entropy across the - batch dimension. Must not set `average_across_batch` - and `sum_over_batch` at the same time. - sum_over_remaining (bool): If set, sum the entropy across the - remaining dimension. Must not set `average_across_remaining` - and `sum_over_remaining` at the same time. - Used only when :attr:`logits` has rank >= 3. - - Returns: - A Tensor containing the shannon entropy. The dimensionality of the - Tensor depends on the configuration of reduction arguments. For - example, if both batch and remaining dimensions are reduced (by - either sum or average), the returned Tensor is a scalar Tensor. 
- """ - entropy = _get_entropy(logits) - - if rank is None: - rank = get_rank(logits) - if rank is None: - rank = 2 - rank -= 1 # reduced last dimension - - # Reduces - if average_across_batch and sum_over_batch: - raise ValueError("Only one of `average_across_batch` and " - "`sum_over_batch` can be set.") - if average_across_remaining and sum_over_remaining: - raise ValueError("Only one of `average_across_remaining` and " - "`sum_over_remaining` can be set.") - sum_axes, average_axes = [], [] - if sum_over_batch: - sum_axes.append(0) - if average_across_batch: - average_axes.append(0) - if sum_over_remaining and rank >= 2: - sum_axes += list(range(1, rank)) - if average_across_remaining and rank >= 2: - average_axes += list(range(1, rank)) - - entropy = reduce_dimensions( - entropy, average_axes=average_axes, sum_axes=sum_axes) - - return entropy - - -def sequence_entropy_with_logits(logits, - rank=None, - sequence_length=None, - average_across_batch=True, - average_across_timesteps=False, - average_across_remaining=False, - sum_over_batch=False, - sum_over_timesteps=True, - sum_over_remaining=True, - time_major=False): - """Shannon entropy given logits. - - Args: - logits: Unscaled log probabilities of shape - `[batch_size, max_time, d_3, ..., d_{rank-1}, distribution_dim]` - and of dtype `float32` or `float64`. - - The rank of the tensor is optionally specified by the argument - :attr:`rank`. - - The tensor is considered as having `[batch_size, .., d_{rank-1}]` - elements, each of which has a distribution of length `d_rank` - (i.e., `distribution_dim`). So the last dimension is always - summed out to compute the entropy. - - The batch and time dimensions are exchanged if :attr:`time_major` - is `True`. - rank (int, optional): The rank of :attr:`logits`. - If `None` (default), `rank` is inferred automatically from - `logits`. If the inference fails, `rank` is - set to 3, i.e., assuming `logits` is of shape - `[batch_size, max_time, distribution_dim]` - sequence_length (optional): A Tensor of shape `[batch_size]`. - Time steps beyond the respective sequence lengths are - counted into the entropy. - average_across_timesteps (bool): If set, average the entropy across - the time dimension. Must not set `average_across_timesteps` - and `sum_over_timesteps` at the same time. - average_across_batch (bool): If set, average the entropy across the - batch dimension. Must not set `average_across_batch`' - and `sum_over_batch` at the same time. - average_across_remaining (bool): If set, average the entropy across the - remaining dimensions. Must not set `average_across_remaining`' - and `sum_over_remaining` at the same time. - Used only when :attr:`logits` has rank >= 4. - sum_over_timesteps (bool): If set, sum the entropy across the - time dimension. Must not set `average_across_timesteps` - and `sum_over_timesteps` at the same time. - sum_over_batch (bool): If set, sum the entropy across the - batch dimension. Must not set `average_across_batch` - and `sum_over_batch` at the same time. - sum_over_remaining (bool): If set, sum the entropy across the - remaining dimension. Must not set `average_across_remaining` - and `sum_over_remaining` at the same time. - Used only when :attr:`logits` has rank >= 4. - time_major (bool): The shape format of the inputs. If `True`, - :attr:`logits` must have shape `[max_time, batch_size, ...]`. - If `False` (default), it must have shape - `[batch_size, max_time, ...]`. - - Returns: - A Tensor containing the shannon entropy. 
The dimensionality of the - Tensor depends on the configuration of reduction arguments. For - example, if batch, time, and remaining dimensions are all reduced (by - either sum or average), the returned Tensor is a scalar Tensor. - """ - entropy = _get_entropy(logits) - - if rank is None: - rank = get_rank(logits) - if rank is None: - rank = 3 - rank -= 1 # reduced last dimension - - entropy = mask_and_reduce( - entropy, - sequence_length, - rank=rank, - average_across_batch=average_across_batch, - average_across_timesteps=average_across_timesteps, - average_across_remaining=average_across_remaining, - sum_over_batch=sum_over_batch, - sum_over_timesteps=sum_over_timesteps, - sum_over_remaining=sum_over_remaining, - time_major=time_major) - - return entropy diff --git a/texar/tf/losses/entropy_test.py b/texar/tf/losses/entropy_test.py deleted file mode 100644 index 152b5190..00000000 --- a/texar/tf/losses/entropy_test.py +++ /dev/null @@ -1,104 +0,0 @@ -# -*- coding: utf-8 -*- -# -""" -Unit tests for entropy. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -# pylint: disable=invalid-name - -import tensorflow as tf -import texar.tf as tx - - -class EntropyTest(tf.test.TestCase): - """Tests entropy. - """ - - def setUp(self): - tf.test.TestCase.setUp(self) - self._batch_size = 64 - self._max_time = 128 - self._d = 16 - self._distribution_dim = 32 - self._logits = tf.random_uniform([self._batch_size, self._d, - self._distribution_dim]) - self._sequence_logits = tf.random_uniform([self._batch_size, - self._max_time, - self._d, - self._distribution_dim]) - self._sequence_length = tf.random_uniform( - [self._batch_size], maxval=self._max_time, dtype=tf.int32) - - def _test_entropy(self, entropy_fn, logits, sequence_length=None): - with self.test_session() as sess: - if sequence_length is None: - entropy = entropy_fn(logits) - rank = sess.run(tf.rank(entropy)) - self.assertEqual(rank, 0) - - entropy = entropy_fn(logits, average_across_batch=False) - rank = sess.run(tf.rank(entropy)) - self.assertEqual(rank, 1) - self.assertEqual(entropy.shape, - tf.TensorShape([self._batch_size])) - else: - entropy = entropy_fn(logits, sequence_length=sequence_length) - rank = sess.run(tf.rank(entropy)) - self.assertEqual(rank, 0) - - entropy = entropy_fn(logits, sequence_length=sequence_length, - sum_over_timesteps=False) - rank = sess.run(tf.rank(entropy)) - self.assertEqual(rank, 1) - self.assertEqual(entropy.shape, - tf.TensorShape([self._max_time])) - - entropy = entropy_fn(logits, sequence_length=sequence_length, - sum_over_timesteps=False, - average_across_timesteps=True, - average_across_batch=False) - rank = sess.run(tf.rank(entropy)) - self.assertEqual(rank, 1) - self.assertEqual(entropy.shape, - tf.TensorShape([self._batch_size])) - - entropy = entropy_fn(logits, sequence_length=sequence_length, - sum_over_timesteps=False, - average_across_batch=False) - rank = sess.run(tf.rank(entropy)) - self.assertEqual(rank, 2) - self.assertEqual(entropy.shape, - tf.TensorShape([self._batch_size, - self._max_time])) - - sequence_length_time = tf.random_uniform( - [self._max_time], maxval=self._batch_size, dtype=tf.int32) - entropy = entropy_fn(logits, - sequence_length=sequence_length_time, - sum_over_timesteps=False, - average_across_batch=False, - time_major=True) - self.assertEqual(entropy.shape, tf.TensorShape( - [self._batch_size, self._max_time])) - - def test_entropy_with_logits(self): - """Tests 
`entropy_with_logits` - """ - self._test_entropy( - tx.losses.entropy_with_logits, self._logits) - - def test_sequence_entropy_with_logits(self): - """Tests `sequence_entropy_with_logits` - """ - self._test_entropy( - tx.losses.sequence_entropy_with_logits, self._sequence_logits, - sequence_length=self._sequence_length) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/losses/losses_utils.py b/texar/tf/losses/losses_utils.py deleted file mode 100644 index 7f805dd6..00000000 --- a/texar/tf/losses/losses_utils.py +++ /dev/null @@ -1,204 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Various utilities for losses. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -import tensorflow as tf -from tensorflow.python.ops import rnn # pylint: disable=E0611 - -from texar.tf.utils.shapes import mask_sequences - -# pylint: disable=invalid-name, not-context-manager, protected-access, -# pylint: disable=too-many-arguments - -__all__ = [ - "mask_and_reduce", - "reduce_batch_time", - "reduce_dimensions" -] - - -def mask_and_reduce(sequence, - sequence_length, - rank=2, - average_across_batch=True, - average_across_timesteps=False, - average_across_remaining=False, - sum_over_batch=False, - sum_over_timesteps=True, - sum_over_remaining=True, - dtype=None, - time_major=False): - """Masks out sequence entries that are beyond the respective sequence - lengths, and reduces (average or sum) away dimensions. - - This is a combination of :func:`~texar.tf.utils.shapes.mask_sequences` - and :func:`~texar.tf.losses.losses_utils.reduce_batch_time`. - - Args: - sequence: A Tensor of sequence values. - If `time_major=False` (default), this must be a Tensor of shape - `[batch_size, max_time, d_2, ..., d_rank]`, where the rank of - the Tensor is specified with :attr:`rank`. - The batch and time dimensions are exchanged if `time_major` is True. - sequence_length: A Tensor of shape `[batch_size]`. Time steps beyond - the respective sequence lengths will be made zero. If `None`, - not masking is performed. - rank (int): The rank of :attr:`sequence`. Must be >= 2. Default is 2, - i.e., `sequence` is a 2D Tensor consisting of batch and time - dimensions. - average_across_timesteps (bool): If set, average the sequence across - the time dimension. Must not set `average_across_timesteps` - and `sum_over_timesteps` at the same time. - average_across_batch (bool): If set, average the sequence across the - batch dimension. Must not set `average_across_batch`' - and `sum_over_batch` at the same time. - average_across_remaining (bool): If set, average the sequence across the - remaining dimensions. Must not set `average_across_remaining`' - and `sum_over_remaining` at the same time. - sum_over_timesteps (bool): If set, sum the loss across the - time dimension. Must not set `average_across_timesteps` - and `sum_over_timesteps` at the same time. 
- sum_over_batch (bool): If set, sum the loss across the - batch dimension. Must not set `average_across_batch` - and `sum_over_batch` at the same time. - sum_over_remaining (bool): If set, sum the loss across the - remaining dimension. Must not set `average_across_remaining` - and `sum_over_remaining` at the same time. - time_major (bool): The shape format of the inputs. If `True`, - :attr:`sequence` must have shape `[max_time, batch_size, ...]`. - If `False` (default), `sequence` must have - shape `[batch_size, max_time, ...]`. - dtype (dtype): Type of :attr:`sequence`. If `None`, infer from - :attr:`sequence` automatically. - - Returns - A Tensor containing the masked and reduced sequence. - """ - if rank < 2: - raise ValueError('`rank` must be >= 2.') - - if time_major: - sequence = rnn._transpose_batch_time(sequence) - - if sequence_length is not None: - sequence = mask_sequences(sequence, sequence_length, dtype=dtype, - time_major=False, tensor_rank=rank) - - if rank > 2: - if average_across_remaining and sum_over_remaining: - raise ValueError("Only one of `average_across_remaining` and " - "`sum_over_remaining` can be set.") - if average_across_remaining: - sequence = tf.reduce_mean(sequence, axis=np.arange(2, rank)) - elif sum_over_remaining: - sequence = tf.reduce_sum(sequence, axis=np.arange(2, rank)) - - sequence = reduce_batch_time(sequence, - sequence_length, - average_across_batch, - average_across_timesteps, - sum_over_batch, - sum_over_timesteps) - - reduce_time = average_across_timesteps or sum_over_timesteps - reduce_batch = average_across_batch or sum_over_batch - if not reduce_time and not reduce_batch and time_major: - sequence = rnn._transpose_batch_time(sequence) - - return sequence - - -def reduce_batch_time(sequence, - sequence_length, - average_across_batch=True, - average_across_timesteps=False, - sum_over_batch=False, - sum_over_timesteps=True): - """Average or sum over the respective dimensions of :attr:`sequence`, which - is of shape `[batch_size, max_time, ...]`. - - Assumes :attr:`sequence` has been properly masked according to - :attr:`sequence_length`. - """ - if average_across_timesteps and sum_over_timesteps: - raise ValueError("Only one of `average_across_timesteps` and " - "`sum_over_timesteps` can be set.") - if average_across_batch and sum_over_batch: - raise ValueError("Only one of `average_across_batch` and " - "`sum_over_batch` can be set.") - - if sum_over_timesteps: - sequence = tf.reduce_sum(sequence, axis=[1]) - elif average_across_timesteps: - if sequence_length is None: - sequence = tf.reduce_mean(sequence, axis=[1]) - else: - sequence = tf.reduce_sum(sequence, axis=[1]) / \ - tf.cast(sequence_length, sequence.dtype) - - if sum_over_batch: - sequence = tf.reduce_sum(sequence, axis=[0]) - elif average_across_batch: - sequence = tf.reduce_mean(sequence, axis=[0]) - - return sequence - - -def reduce_dimensions(tensor, average_axes=None, sum_axes=None, keepdims=None): - """Average or sum over dimensions of :attr:`tensor`. - - :attr:`average_axes` and :attr:`sum_axes` must be mutually exclusive. That - is, elements in `average_axes` must not be contained in - `sum_axes`, and vice versa. - - Args: - tensor: A tensor to reduce. - average_axes (optional): A (list of) `int` that indicates the - dimensions to reduce by taking average. - sum_axes (optional): A (list of) `int` that indicates the - dimensions to reduce by taking sum. - keepdims (optional): If `True`, retains reduced dimensions with - length 1. 
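Together, `mask_and_reduce`, `reduce_batch_time`, and `reduce_dimensions` defined here implement "zero out padding, then sum or average the requested axes". A toy sketch of the default behaviour (sum over time, average across batch) built from the same primitives:

```python
import tensorflow as tf

# Two sequences, max_time 3, true lengths [2, 3].
seq = tf.ones([2, 3])
lengths = tf.constant([2, 3])

# Zero out steps beyond each sequence length ...
mask = tf.sequence_mask(lengths, maxlen=3, dtype=seq.dtype)
masked = seq * mask

# ... then the default reduction: sum over time, average across batch.
loss = tf.reduce_mean(tf.reduce_sum(masked, axis=1), axis=0)
print(float(loss))  # (2 + 3) / 2 = 2.5
```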
- """ - reduced_axes = set() - if average_axes is not None: - if not isinstance(average_axes, (list, tuple)): - average_axes = [average_axes] - if len(average_axes) > 0: - tensor = tf.reduce_mean(tensor, axis=average_axes, keepdims=True) - reduced_axes.update(average_axes) - - if sum_axes is not None: - if not isinstance(sum_axes, (list, tuple)): - sum_axes = [sum_axes] - if len(sum_axes) > 0: - tensor = tf.reduce_sum(tensor, axis=sum_axes, keepdims=True) - reduced_axes.update(sum_axes) - - if average_axes is not None: - if len(reduced_axes) != len(average_axes) + len(sum_axes): - raise ValueError('`average_axes` and `sum_axes` must not ' - 'have overlapped elements.') - if not keepdims: - tensor = tf.squeeze(tensor, axis=list(reduced_axes)) - - return tensor diff --git a/texar/tf/losses/mle_losses.py b/texar/tf/losses/mle_losses.py deleted file mode 100644 index 40c73703..00000000 --- a/texar/tf/losses/mle_losses.py +++ /dev/null @@ -1,467 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Various losses -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from texar.tf.losses.losses_utils import mask_and_reduce, reduce_dimensions -from texar.tf.utils import shapes - -# pylint: disable=invalid-name, not-context-manager, protected-access, -# pylint: disable=too-many-arguments - -__all__ = [ - "sequence_softmax_cross_entropy", - "sequence_sparse_softmax_cross_entropy", - "sequence_sigmoid_cross_entropy", - "binary_sigmoid_cross_entropy", - "binary_sigmoid_cross_entropy_with_clas" -] - - -def sequence_softmax_cross_entropy(labels, - logits, - sequence_length, - average_across_batch=True, - average_across_timesteps=False, - sum_over_batch=False, - sum_over_timesteps=True, - time_major=False, - stop_gradient_to_label=False, - name=None): - """Computes softmax cross entropy for each time step of sequence - predictions. - - Args: - labels: Target class distributions. - - - If :attr:`time_major` is `False` (default), this must be a\ - Tensor of shape `[batch_size, max_time, num_classes]`. - - - If `time_major` is `True`, this must be a Tensor of shape\ - `[max_time, batch_size, num_classes]`. - - Each row of `labels` should be a valid probability - distribution, otherwise, the computation of the gradient will be - incorrect. - logits: Unscaled log probabilities. This must have the shape of - `[max_time, batch_size, num_classes]` or - `[batch_size, max_time, num_classes]` according to - the value of `time_major`. - sequence_length: A Tensor of shape `[batch_size]`. Time steps beyond - the respective sequence lengths will have zero losses. - average_across_timesteps (bool): If set, average the loss across - the time dimension. Must not set `average_across_timesteps` - and `sum_over_timesteps` at the same time. - average_across_batch (bool): If set, average the loss across the - batch dimension. 
Must not set `average_across_batch`
-            and `sum_over_batch` at the same time.
-        sum_over_timesteps (bool): If set, sum the loss across the
-            time dimension. Must not set `average_across_timesteps`
-            and `sum_over_timesteps` at the same time.
-        sum_over_batch (bool): If set, sum the loss across the
-            batch dimension. Must not set `average_across_batch`
-            and `sum_over_batch` at the same time.
-        time_major (bool): The shape format of the inputs. If `True`,
-            :attr:`labels` and :attr:`logits` must have shape
-            `[max_time, batch_size, ...]`. If `False`
-            (default), they must have shape `[batch_size, max_time, ...]`.
-        stop_gradient_to_label (bool): If set, gradient propagation to
-            :attr:`labels` will be disabled.
-        name (str, optional): A name for the operation.
-
-    Returns:
-        A Tensor containing the loss, of rank 0, 1, or 2 depending on the
-        arguments :attr:`{average_across}/{sum_over}_{timesteps}/{batch}`.
-        For example:
-
-        - If :attr:`sum_over_timesteps` and :attr:`average_across_batch` \
-        are `True` (default), the return Tensor is of rank 0.
-
-        - If :attr:`average_across_batch` is `True` and other arguments are \
-        `False`, the return Tensor is of shape `[max_time]`.
-    """
-    with tf.name_scope(name, "sequence_softmax_cross_entropy"):
-        if stop_gradient_to_label:
-            labels = tf.stop_gradient(labels)
-
-        losses = tf.nn.softmax_cross_entropy_with_logits_v2(
-            labels=labels, logits=logits)
-
-        losses = mask_and_reduce(
-            losses,
-            sequence_length,
-            rank=2,
-            average_across_batch=average_across_batch,
-            average_across_timesteps=average_across_timesteps,
-            sum_over_batch=sum_over_batch,
-            sum_over_timesteps=sum_over_timesteps,
-            time_major=time_major)
-
-        return losses
-
-
-def sequence_sparse_softmax_cross_entropy(labels,
-                                          logits,
-                                          sequence_length,
-                                          average_across_batch=True,
-                                          average_across_timesteps=False,
-                                          sum_over_batch=False,
-                                          sum_over_timesteps=True,
-                                          time_major=False,
-                                          name=None):
-    """Computes sparse softmax cross entropy for each time step of sequence
-    predictions.
-
-    Args:
-        labels: Target class indexes. I.e., classes are mutually exclusive
-            (each entry is in exactly one class).
-
-            - If :attr:`time_major` is `False` (default), this must be\
-            a Tensor of shape `[batch_size, max_time]`.
-
-            - If `time_major` is `True`, this must be a Tensor of shape\
-            `[max_time, batch_size]`.
-        logits: Unscaled log probabilities. This must have the shape of
-            `[max_time, batch_size, num_classes]` or
-            `[batch_size, max_time, num_classes]` according to
-            the value of `time_major`.
-        sequence_length: A Tensor of shape `[batch_size]`. Time steps beyond
-            the respective sequence lengths will have zero losses.
-        average_across_timesteps (bool): If set, average the loss across
-            the time dimension. Must not set `average_across_timesteps`
-            and `sum_over_timesteps` at the same time.
-        average_across_batch (bool): If set, average the loss across the
-            batch dimension. Must not set `average_across_batch`
-            and `sum_over_batch` at the same time.
-        sum_over_timesteps (bool): If set, sum the loss across the
-            time dimension. Must not set `average_across_timesteps`
-            and `sum_over_timesteps` at the same time.
-        sum_over_batch (bool): If set, sum the loss across the
-            batch dimension. Must not set `average_across_batch`
-            and `sum_over_batch` at the same time.
-        time_major (bool): The shape format of the inputs. If `True`,
-            :attr:`labels` and :attr:`logits` must have shape
-            `[max_time, batch_size, ...]`. If `False`
-            (default), they must have shape `[batch_size, max_time, ...]`.
-        name (str, optional): A name for the operation.
-
-    Returns:
-        A Tensor containing the loss, of rank 0, 1, or 2 depending on the
-        arguments :attr:`{average_across}/{sum_over}_{timesteps}/{batch}`.
-        For example:
-
-        - If :attr:`sum_over_timesteps` and :attr:`average_across_batch` \
-        are `True` (default), the return Tensor is of rank 0.
-
-        - If :attr:`average_across_batch` is `True` and other arguments are \
-        `False`, the return Tensor is of shape `[max_time]`.
-
-    Example:
-
-        .. code-block:: python
-
-            embedder = WordEmbedder(vocab_size=data.vocab.size)
-            decoder = BasicRNNDecoder(vocab_size=data.vocab.size)
-            outputs, _, _ = decoder(
-                decoding_strategy='train_greedy',
-                inputs=embedder(data_batch['text_ids']),
-                sequence_length=data_batch['length']-1)
-
-            loss = sequence_sparse_softmax_cross_entropy(
-                labels=data_batch['text_ids'][:, 1:],
-                logits=outputs.logits,
-                sequence_length=data_batch['length']-1)
-
-    """
-    with tf.name_scope(name, "sequence_sparse_softmax_cross_entropy"):
-        losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
-            labels=labels, logits=logits)
-
-        losses = mask_and_reduce(
-            losses,
-            sequence_length,
-            rank=2,
-            average_across_batch=average_across_batch,
-            average_across_timesteps=average_across_timesteps,
-            sum_over_batch=sum_over_batch,
-            sum_over_timesteps=sum_over_timesteps,
-            time_major=time_major)
-
-        return losses
-
-
-def sequence_sigmoid_cross_entropy(labels,
-                                   logits,
-                                   sequence_length,
-                                   average_across_batch=True,
-                                   average_across_timesteps=False,
-                                   average_across_classes=True,
-                                   sum_over_batch=False,
-                                   sum_over_timesteps=True,
-                                   sum_over_classes=False,
-                                   time_major=False,
-                                   stop_gradient_to_label=False,
-                                   name=None):
-    """Computes sigmoid cross entropy for each time step of sequence
-    predictions.
-
-    Args:
-        labels: Target class distributions.
-
-            - If :attr:`time_major` is `False` (default), this must be a\
-            Tensor of shape `[batch_size, max_time(, num_classes)]`.
-
-            - If `time_major` is `True`, this must be a Tensor of shape\
-            `[max_time, batch_size(, num_classes)]`.
-
-            Each row of `labels` should be a valid probability
-            distribution; otherwise the computation of the gradient will be
-            incorrect.
-        logits: Unscaled log probabilities having the same shape as
-            :attr:`labels`.
-        sequence_length: A Tensor of shape `[batch_size]`. Time steps beyond
-            the respective sequence lengths will have zero losses.
-        average_across_timesteps (bool): If set, average the loss across
-            the time dimension. Must not set `average_across_timesteps`
-            and `sum_over_timesteps` at the same time.
-        average_across_batch (bool): If set, average the loss across the
-            batch dimension. Must not set `average_across_batch`
-            and `sum_over_batch` at the same time.
-        average_across_classes (bool): If set, average the loss across the
-            class dimension (if it exists). Must not set
-            `average_across_classes` and `sum_over_classes` at
-            the same time. Ignored if :attr:`logits` is a 2D Tensor.
-        sum_over_timesteps (bool): If set, sum the loss across the
-            time dimension. Must not set `average_across_timesteps`
-            and `sum_over_timesteps` at the same time.
-        sum_over_batch (bool): If set, sum the loss across the
-            batch dimension. Must not set `average_across_batch`
-            and `sum_over_batch` at the same time.
-        sum_over_classes (bool): If set, sum the loss across the
-            class dimension. Must not set `average_across_classes`
-            and `sum_over_classes` at the same time. Ignored if
-            :attr:`logits` is a 2D Tensor.
-        time_major (bool): The shape format of the inputs.
If `True`,
-            :attr:`labels` and :attr:`logits` must have shape
-            `[max_time, batch_size, ...]`. If `False`
-            (default), they must have shape `[batch_size, max_time, ...]`.
-        stop_gradient_to_label (bool): If set, gradient propagation to
-            :attr:`labels` will be disabled.
-        name (str, optional): A name for the operation.
-
-    Returns:
-        A Tensor containing the loss, of rank 0, 1, or 2 depending on the
-        arguments
-        :attr:`{average_across}/{sum_over}_{timesteps}/{batch}/{classes}`.
-        For example, if the class dimension does not exist:
-
-        - If :attr:`sum_over_timesteps` and :attr:`average_across_batch` \
-        are `True` (default), the return Tensor is of rank 0.
-
-        - If :attr:`average_across_batch` is `True` and other arguments are \
-        `False`, the return Tensor is of shape `[max_time]`.
-    """
-
-    with tf.name_scope(name, "sequence_sigmoid_cross_entropy"):
-        if stop_gradient_to_label:
-            labels = tf.stop_gradient(labels)
-
-        losses = tf.nn.sigmoid_cross_entropy_with_logits(
-            labels=labels, logits=logits)
-
-        rank = shapes.get_rank(logits) or shapes.get_rank(labels)
-        if rank is None:
-            raise ValueError(
-                'Cannot determine the rank of `logits` or `labels`.')
-
-        losses = mask_and_reduce(
-            losses,
-            sequence_length,
-            rank=rank,
-            average_across_batch=average_across_batch,
-            average_across_timesteps=average_across_timesteps,
-            average_across_remaining=average_across_classes,
-            sum_over_batch=sum_over_batch,
-            sum_over_timesteps=sum_over_timesteps,
-            sum_over_remaining=sum_over_classes,
-            time_major=time_major)
-
-        return losses
-
-
-def binary_sigmoid_cross_entropy(pos_logits=None,
-                                 neg_logits=None,
-                                 average_across_batch=True,
-                                 average_across_classes=True,
-                                 sum_over_batch=False,
-                                 sum_over_classes=False,
-                                 return_pos_neg_losses=False,
-                                 name=None):
-    """Computes sigmoid cross entropy of binary predictions.
-
-    Args:
-        pos_logits: The logits of predicting positive on positive data. A
-            tensor of shape `[batch_size(, num_classes)]`.
-        neg_logits: The logits of predicting positive on negative data. A
-            tensor of shape `[batch_size(, num_classes)]`.
-        average_across_batch (bool): If set, average the loss across the
-            batch dimension. Must not set `average_across_batch`
-            and `sum_over_batch` at the same time.
-        average_across_classes (bool): If set, average the loss across the
-            class dimension (if it exists). Must not set
-            `average_across_classes` and `sum_over_classes` at
-            the same time. Ignored if :attr:`logits` is a 1D Tensor.
-        sum_over_batch (bool): If set, sum the loss across the
-            batch dimension. Must not set `average_across_batch`
-            and `sum_over_batch` at the same time.
-        sum_over_classes (bool): If set, sum the loss across the
-            class dimension. Must not set `average_across_classes`
-            and `sum_over_classes` at the same time. Ignored if
-            :attr:`logits` is a 1D Tensor.
-        return_pos_neg_losses (bool): If set, additionally returns the losses
-            on :attr:`pos_logits` and :attr:`neg_logits`, respectively.
-        name (str, optional): A name for the operation.
-
-    Returns:
-        By default, a Tensor containing the loss, of rank 0, 1, or 2 depending
-        on the arguments :attr:`{average_across}/{sum_over}_{batch}/{classes}`.
-        For example:
-
-        - If :attr:`sum_over_batch` and :attr:`average_across_classes` \
-        are `True` (default), the return Tensor is of rank 0.
-
-        - If all arguments are `False`, the return Tensor is of shape \
-        `[batch_size(, num_classes)]`.
-
-        If :attr:`return_pos_neg_losses` is `True`, returns a tuple
-        `(loss, pos_loss, neg_loss)`, where `loss` is the loss above;
-        `pos_loss` is the loss on `pos_logits` only; and
-        `neg_loss` is the loss on `neg_logits` only. They satisfy
-        `loss = pos_loss + neg_loss`.
-    """
-    with tf.name_scope(name, "binary_sigmoid_cross_entropy"):
-        average_axes, sum_axes = [], []
-        average_axes += [0] if average_across_batch else []
-        average_axes += [1] if average_across_classes else []
-        sum_axes += [0] if sum_over_batch else []
-        sum_axes += [1] if sum_over_classes else []
-
-        pos_loss = 0
-        if pos_logits is not None:
-            pos_loss = tf.nn.sigmoid_cross_entropy_with_logits(
-                logits=pos_logits, labels=tf.ones_like(pos_logits))
-
-            pos_loss = reduce_dimensions(pos_loss, average_axes, sum_axes)
-
-        neg_loss = 0
-        if neg_logits is not None:
-            neg_loss = tf.nn.sigmoid_cross_entropy_with_logits(
-                logits=neg_logits, labels=tf.zeros_like(neg_logits))
-
-            neg_loss = reduce_dimensions(neg_loss, average_axes, sum_axes)
-
-        loss = pos_loss + neg_loss
-
-        if return_pos_neg_losses:
-            return loss, pos_loss, neg_loss
-        else:
-            return loss
-
-
-def binary_sigmoid_cross_entropy_with_clas(clas_fn,
-                                           pos_inputs=None,
-                                           neg_inputs=None,
-                                           average_across_batch=True,
-                                           average_across_classes=True,
-                                           sum_over_batch=False,
-                                           sum_over_classes=False,
-                                           return_pos_neg_losses=False,
-                                           name=None):
-    """Computes sigmoid cross entropy of binary classifier.
-
-    .. role:: python(code)
-        :language: python
-
-    Args:
-        clas_fn: A callable that takes data (e.g., :attr:`pos_inputs` and
-            :attr:`neg_inputs`) and returns the logits of being positive. The
-            signature of `clas_fn` must be:
-            :python:`logits (, ...) = clas_fn(inputs)`.
-            The return value of `clas_fn` can be the logits, or
-            a tuple where the logits are the first element.
-        pos_inputs: The positive data fed into `clas_fn`.
-        neg_inputs: The negative data fed into `clas_fn`.
-        average_across_batch (bool): If set, average the loss across the
-            batch dimension. Must not set `average_across_batch`
-            and `sum_over_batch` at the same time.
-        average_across_classes (bool): If set, average the loss across the
-            class dimension (if it exists). Must not set
-            `average_across_classes` and `sum_over_classes` at
-            the same time. Ignored if :attr:`logits` is a 1D Tensor.
-        sum_over_batch (bool): If set, sum the loss across the
-            batch dimension. Must not set `average_across_batch`
-            and `sum_over_batch` at the same time.
-        sum_over_classes (bool): If set, sum the loss across the
-            class dimension. Must not set `average_across_classes`
-            and `sum_over_classes` at the same time. Ignored if
-            :attr:`logits` is a 1D Tensor.
-        return_pos_neg_losses (bool): If set, additionally returns the losses
-            on :attr:`pos_logits` and :attr:`neg_logits`, respectively.
-        name (str, optional): A name for the operation.
-
-    Returns:
-        By default, a Tensor containing the loss, of rank 0, 1, or 2 depending
-        on the arguments :attr:`{average_across}/{sum_over}_{batch}/{classes}`.
-        For example:
-
-        - If :attr:`sum_over_batch` and :attr:`average_across_classes` \
-        are `True` (default), the return Tensor is of rank 0.
-
-        - If all arguments are `False`, the return Tensor is of shape \
-        `[batch_size(, num_classes)]`.
-
-        If :attr:`return_pos_neg_losses` is `True`, returns a tuple
-        `(loss, pos_loss, neg_loss)`, where `loss` is the loss above;
-        `pos_loss` is the loss on `pos_logits` only; and
-        `neg_loss` is the loss on `neg_logits` only. They satisfy
-        `loss = pos_loss + neg_loss`.
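-
-    Example (an illustrative sketch; `discriminator`, `real_data` and
-    `fake_data` are hypothetical names):
-
-        .. code-block:: python
-
-            # `discriminator` maps inputs to logits of being positive
-            loss = binary_sigmoid_cross_entropy_with_clas(
-                clas_fn=discriminator,
-                pos_inputs=real_data,
-                neg_inputs=fake_data)
-            # With the default reduce arguments, the loss is averaged
-            # across the batch (and class) dimension, yielding a scalar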
- """ - pos_logits = None - if pos_inputs is not None: - pos_logits = clas_fn(pos_inputs) - if isinstance(pos_logits, (list, tuple)): - pos_logits = pos_logits[0] - - neg_logits = None - if neg_inputs is not None: - neg_logits = clas_fn(neg_inputs) - if isinstance(neg_logits, (list, tuple)): - neg_logits = neg_logits[0] - - return binary_sigmoid_cross_entropy( - pos_logits=pos_logits, - neg_logits=neg_logits, - average_across_batch=average_across_batch, - average_across_classes=average_across_classes, - sum_over_batch=sum_over_batch, - sum_over_classes=sum_over_classes, - return_pos_neg_losses=return_pos_neg_losses, - name=name) diff --git a/texar/tf/losses/mle_losses_test.py b/texar/tf/losses/mle_losses_test.py deleted file mode 100644 index 32521c23..00000000 --- a/texar/tf/losses/mle_losses_test.py +++ /dev/null @@ -1,116 +0,0 @@ -# -*- coding: utf-8 -*- -# -""" -Unit tests for mle losses. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -# pylint: disable=invalid-name - -import numpy as np - -import tensorflow as tf - -import texar.tf as tx - - -class MLELossesTest(tf.test.TestCase): - """Tests mle losses. - """ - - def setUp(self): - tf.test.TestCase.setUp(self) - self._batch_size = 64 - self._max_time = 16 - self._num_classes = 100 - self._labels = tf.ones([self._batch_size, self._max_time], - dtype=tf.int32) - one_hot_labels = tf.one_hot( - self._labels, self._num_classes, dtype=tf.float32) - self._one_hot_labels = tf.reshape( - one_hot_labels, [self._batch_size, self._max_time, -1]) - self._logits = tf.random_uniform( - [self._batch_size, self._max_time, self._num_classes]) - self._sequence_length = tf.random_uniform( - [self._batch_size], maxval=self._max_time, dtype=tf.int32) - - def _test_sequence_loss(self, loss_fn, labels, logits, sequence_length): - with self.test_session() as sess: - loss = loss_fn(labels, logits, sequence_length) - rank = sess.run(tf.rank(loss)) - self.assertEqual(rank, 0) - - loss = loss_fn( - labels, logits, sequence_length, sum_over_timesteps=False) - rank = sess.run(tf.rank(loss)) - self.assertEqual(rank, 1) - self.assertEqual(loss.shape, tf.TensorShape([self._max_time])) - - loss = loss_fn( - labels, logits, sequence_length, sum_over_timesteps=False, - average_across_timesteps=True, average_across_batch=False) - rank = sess.run(tf.rank(loss)) - self.assertEqual(rank, 1) - self.assertEqual(loss.shape, tf.TensorShape([self._batch_size])) - - loss = loss_fn( - labels, logits, sequence_length, sum_over_timesteps=False, - average_across_batch=False) - rank = sess.run(tf.rank(loss)) - self.assertEqual(rank, 2) - self.assertEqual(loss.shape, - tf.TensorShape([self._batch_size, self._max_time])) - - sequence_length_time = tf.random_uniform( - [self._max_time], maxval=self._batch_size, dtype=tf.int32) - loss = loss_fn( - labels, logits, sequence_length_time, sum_over_timesteps=False, - average_across_batch=False, time_major=True) - self.assertEqual(loss.shape, - tf.TensorShape([self._batch_size, self._max_time])) - - def test_sequence_softmax_cross_entropy(self): - """Tests `sequence_softmax_cross_entropy` - """ - self._test_sequence_loss( - tx.losses.sequence_softmax_cross_entropy, - self._one_hot_labels, self._logits, self._sequence_length) - - def test_sequence_sparse_softmax_cross_entropy(self): - """Tests `sequence_sparse_softmax_cross_entropy` - """ - self._test_sequence_loss( - tx.losses.sequence_sparse_softmax_cross_entropy, - self._labels, 
self._logits, self._sequence_length)
-
-    def test_sequence_sigmoid_cross_entropy(self):
-        """Tests `sequence_sigmoid_cross_entropy`.
-        """
-        self._test_sequence_loss(
-            tx.losses.sequence_sigmoid_cross_entropy,
-            self._one_hot_labels, self._logits, self._sequence_length)
-
-        self._test_sequence_loss(
-            tx.losses.sequence_sigmoid_cross_entropy,
-            self._one_hot_labels[:, :, 0],
-            self._logits[:, :, 0],
-            self._sequence_length)
-
-        labels = tf.placeholder(dtype=tf.int32, shape=None)
-        loss = tx.losses.sequence_sigmoid_cross_entropy(
-            logits=self._logits[:, :, 0],
-            labels=tf.cast(labels, tf.float32),
-            sequence_length=self._sequence_length)
-        with self.test_session() as sess:
-            rank = sess.run(
-                tf.rank(loss),
-                feed_dict={labels: np.ones([self._batch_size, self._max_time])})
-            self.assertEqual(rank, 0)
-
-
-if __name__ == "__main__":
-    tf.test.main()
diff --git a/texar/tf/losses/pg_losses.py b/texar/tf/losses/pg_losses.py
deleted file mode 100644
index 998cd876..00000000
--- a/texar/tf/losses/pg_losses.py
+++ /dev/null
@@ -1,256 +0,0 @@
-# Copyright 2018 The Texar Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Various loss functions for policy gradients.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from texar.tf.losses.losses_utils import mask_and_reduce
-from texar.tf.utils.shapes import get_rank
-
-# pylint: disable=too-many-arguments, protected-access
-
-__all__ = [
-    "pg_loss_with_logits",
-    "pg_loss_with_log_probs"
-]
-
-
-def pg_loss_with_logits(actions,
-                        logits,
-                        advantages,
-                        rank=None,
-                        batched=False,
-                        sequence_length=None,
-                        average_across_batch=True,
-                        average_across_timesteps=False,
-                        average_across_remaining=False,
-                        sum_over_batch=False,
-                        sum_over_timesteps=True,
-                        sum_over_remaining=True,
-                        time_major=False):
-    """Policy gradient loss with logits. Used for discrete actions.
-
-    `pg_loss = reduce( advantages * -log_prob( actions ) )`,
-    where `advantages` and `actions` do not back-propagate gradients.
-
-    All arguments except :attr:`logits` and :attr:`actions` are the same as
-    :func:`pg_loss_with_log_probs`.
-
-    Args:
-        actions: Tensor of shape
-            `[(batch_size,) max_time, d_3, ..., d_rank]` and of dtype
-            `int32` or `int64`.
-            The rank of the Tensor is specified with :attr:`rank`.
-
-            The batch dimension exists only if :attr:`batched` is `True`.
-
-            The batch and time dimensions
-            are exchanged, i.e., `[max_time, batch_size, ...]` if
-            :attr:`time_major` is `True`.
-        logits: Unscaled log probabilities of shape
-            `[(batch_size,) max_time, d_3, ..., d_{rank+1}]`
-            and dtype `float32` or `float64`.
-            The batch and time dimensions are exchanged if `time_major`
-            is `True`.
-        advantages: Tensor of shape
-            `[(batch_size,) max_time, d_3, ..., d_rank]` and
-            dtype `float32` or `float64`.
-            The batch and time dimensions are exchanged if `time_major`
-            is `True`.
-        rank (int, optional): The rank of :attr:`actions`.
-            If `None` (default), rank is automatically inferred from
-            `actions` or `advantages`. If the inference fails,
-            `rank` is set to 1 if :attr:`batched` is `False`,
-            and set to 2 if :attr:`batched` is `True`.
-        batched (bool): `True` if the inputs are batched.
-        sequence_length (optional): A Tensor of shape `[batch_size]`.
-            Time steps beyond the respective sequence lengths will have zero
-            losses. Used if :attr:`batched` is `True`.
-        average_across_timesteps (bool): If set, average the loss across
-            the time dimension. Must not set `average_across_timesteps`
-            and `sum_over_timesteps` at the same time.
-        average_across_batch (bool): If set, average the loss across the
-            batch dimension. Must not set `average_across_batch`
-            and `sum_over_batch` at the same time.
-            Ignored if `batched` is `False`.
-        average_across_remaining (bool): If set, average the loss across the
-            remaining dimensions. Must not set `average_across_remaining`
-            and `sum_over_remaining` at the same time. Ignored if there are
-            no dimensions other than the batch and time dimensions.
-        sum_over_timesteps (bool): If set, sum the loss across the
-            time dimension. Must not set `average_across_timesteps`
-            and `sum_over_timesteps` at the same time.
-        sum_over_batch (bool): If set, sum the loss across the
-            batch dimension. Must not set `average_across_batch`
-            and `sum_over_batch` at the same time.
-            Ignored if `batched` is `False`.
-        sum_over_remaining (bool): If set, sum the loss across the
-            remaining dimensions. Must not set `average_across_remaining`
-            and `sum_over_remaining` at the same time. Ignored if there are
-            no dimensions other than the batch and time dimensions.
-        time_major (bool): The shape format of the inputs. If `True`,
-            :attr:`logits`, :attr:`actions` and :attr:`advantages` must
-            have shape `[max_time, batch_size, ...]`. If `False` (default),
-            they must have shape `[batch_size, max_time, ...]`.
-            Ignored if `batched` is `False`.
-
-    Returns:
-        A Tensor containing the loss to minimize, whose rank depends on the
-        reduce arguments. For example, the batch dimension is reduced if
-        either :attr:`average_across_batch` or :attr:`sum_over_batch` is
-        `True`, which decreases the rank of the output tensor by 1.
-    """
-    actions = tf.stop_gradient(actions)
-    neg_log_probs = tf.nn.sparse_softmax_cross_entropy_with_logits(
-        logits=logits, labels=actions)
-    return pg_loss_with_log_probs(
-        log_probs=-neg_log_probs,
-        advantages=advantages,
-        rank=rank,
-        batched=batched,
-        sequence_length=sequence_length,
-        average_across_batch=average_across_batch,
-        average_across_timesteps=average_across_timesteps,
-        average_across_remaining=average_across_remaining,
-        sum_over_batch=sum_over_batch,
-        sum_over_timesteps=sum_over_timesteps,
-        sum_over_remaining=sum_over_remaining,
-        time_major=time_major)
-
-
-def pg_loss_with_log_probs(log_probs,
-                           advantages,
-                           rank=None,
-                           batched=False,
-                           sequence_length=None,
-                           average_across_batch=True,
-                           average_across_timesteps=False,
-                           average_across_remaining=False,
-                           sum_over_batch=False,
-                           sum_over_timesteps=True,
-                           sum_over_remaining=True,
-                           time_major=False):
-    """Policy gradient loss with log probs of actions.
-
-    `pg_loss = reduce( advantages * -log_probs )`,
-    where `advantages` does not back-propagate gradients.
-
-    All arguments except :attr:`log_probs` are the same as
-    :func:`pg_loss_with_logits`.
-
-    Args:
-        log_probs: Log probabilities of shape
-            `[(batch_size,) max_time, ..., d_rank]` and dtype `float32`
-            or `float64`. The rank of the Tensor is specified
-            with :attr:`rank`.
-
-            The batch dimension exists only if :attr:`batched` is `True`.
-
-            The batch and time dimensions are exchanged, i.e.,
-            `[max_time, batch_size, ...]` if :attr:`time_major` is `True`.
-        advantages: Tensor of shape
-            `[(batch_size,) max_time, d_3, ..., d_rank]` and
-            dtype `float32` or `float64`.
-            The batch dimension exists only if `batched` is `True`.
-            The batch and time dimensions
-            are exchanged if `time_major` is `True`.
-        rank (int, optional): The rank of :attr:`log_probs`.
-            If `None` (default), rank is automatically inferred from
-            `log_probs` or `advantages`. If the inference fails,
-            `rank` is set to 1 if :attr:`batched` is `False`,
-            and set to 2 if :attr:`batched` is `True`.
-        batched (bool): `True` if the inputs are batched.
-        sequence_length (optional): A Tensor of shape `[batch_size]`.
-            Time steps beyond the respective sequence lengths will have zero
-            losses. Used if :attr:`batched` is `True`.
-        average_across_timesteps (bool): If set, average the loss across
-            the time dimension. Must not set `average_across_timesteps`
-            and `sum_over_timesteps` at the same time.
-        average_across_batch (bool): If set, average the loss across the
-            batch dimension. Must not set `average_across_batch`
-            and `sum_over_batch` at the same time.
-            Ignored if `batched` is `False`.
-        average_across_remaining (bool): If set, average the loss across the
-            remaining dimensions. Must not set `average_across_remaining`
-            and `sum_over_remaining` at the same time. Ignored if there are
-            no dimensions other than the batch and time dimensions.
-        sum_over_timesteps (bool): If set, sum the loss across the
-            time dimension. Must not set `average_across_timesteps`
-            and `sum_over_timesteps` at the same time.
-        sum_over_batch (bool): If set, sum the loss across the
-            batch dimension. Must not set `average_across_batch`
-            and `sum_over_batch` at the same time.
-            Ignored if `batched` is `False`.
-        sum_over_remaining (bool): If set, sum the loss across the
-            remaining dimensions. Must not set `average_across_remaining`
-            and `sum_over_remaining` at the same time. Ignored if there are
-            no dimensions other than the batch and time dimensions.
-        time_major (bool): The shape format of the inputs. If `True`,
-            :attr:`log_probs` and :attr:`advantages` must have shape
-            `[max_time, batch_size, ...]`. If `False` (default),
-            they must have shape `[batch_size, max_time, ...]`.
-            Ignored if :attr:`batched` is `False`.
-
-    Returns:
-        A Tensor containing the loss to minimize, whose rank depends on the
-        reduce arguments. For example, the batch dimension is reduced if
-        either :attr:`average_across_batch` or :attr:`sum_over_batch` is
-        `True`, which decreases the rank of the output tensor by 1.
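-
-    Example (an illustrative sketch; `log_probs`, `advantages` and `lengths`
-    are assumed precomputed, with shape `[batch_size, max_time]` for the
-    first two):
-
-        .. code-block:: python
-
-            loss = pg_loss_with_log_probs(
-                log_probs=log_probs,
-                advantages=advantages,
-                batched=True,
-                sequence_length=lengths)
-            # With the default reduce arguments, the loss is summed over
-            # time steps and averaged across the batch, yielding a scalar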
- """ - advantages = tf.stop_gradient(advantages) - - losses = -log_probs * advantages - - if rank is None: - rank = get_rank(log_probs) or get_rank(advantages) - if rank is None: - rank = 2 if batched else 1 - - if batched: - losses = mask_and_reduce( - losses, - sequence_length, - rank=rank, - average_across_batch=average_across_batch, - average_across_timesteps=average_across_timesteps, - average_across_remaining=average_across_remaining, - sum_over_batch=sum_over_batch, - sum_over_timesteps=sum_over_timesteps, - sum_over_remaining=sum_over_remaining, - time_major=time_major) - elif rank > 1: - if average_across_remaining and sum_over_remaining: - raise ValueError("Only one of `average_across_remaining` and " - "`sum_over_remaining` can be set.") - if average_across_remaining: - losses = tf.reduce_mean(losses, axis=list(range(1, rank))) - elif sum_over_remaining: - losses = tf.reduce_sum(losses, axis=list(range(1, rank))) - - if not batched: - if average_across_timesteps and sum_over_timesteps: - raise ValueError("Only one of `average_across_timesteps` and " - "`sum_over_timesteps` can be set.") - if average_across_timesteps: - losses = tf.reduce_mean(losses, axis=0) - elif sum_over_timesteps: - losses = tf.reduce_sum(losses, axis=0) - - return losses diff --git a/texar/tf/losses/pg_losses_test.py b/texar/tf/losses/pg_losses_test.py deleted file mode 100644 index a390d872..00000000 --- a/texar/tf/losses/pg_losses_test.py +++ /dev/null @@ -1,129 +0,0 @@ -# -*- coding: utf-8 -*- -# -""" -Unit tests for pg losses. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -# pylint: disable=invalid-name - -import tensorflow as tf -import texar.tf as tx - - -class PGLossesTest(tf.test.TestCase): - """Tests pg losses. 
- """ - - def setUp(self): - tf.test.TestCase.setUp(self) - self._batch_size = 64 - self._max_time = 16 - self._d1 = 32 - self._d2 = 32 - self._d3 = 32 - self._num_classes = 10 - self._actions_batch = tf.ones([self._batch_size, self._max_time, - self._d1, self._d2, self._d3], - dtype=tf.int32) - self._logits_batch = tf.random_uniform([self._batch_size, - self._max_time, - self._d1, self._d2, self._d3, - self._num_classes]) - self._advantages_batch = tf.random_uniform([self._batch_size, - self._max_time, - self._d1, self._d2, - self._d3]) - self._actions_no_batch = tf.ones([self._max_time, - self._d1, self._d2, self._d3], - dtype=tf.int32) - self._logits_no_batch = tf.random_uniform([self._max_time, - self._d1, self._d2, self._d3, - self._num_classes]) - self._advantages_no_batch = tf.random_uniform([self._max_time, - self._d1, self._d2, - self._d3]) - self._sequence_length = tf.random_uniform( - [self._batch_size], maxval=self._max_time, dtype=tf.int32) - - def _test_sequence_loss(self, loss_fn, actions, logits, advantages, - batched, sequence_length): - with self.test_session() as sess: - loss = loss_fn(actions, logits, advantages, batched=batched, - sequence_length=sequence_length) - rank = sess.run(tf.rank(loss)) - self.assertEqual(rank, 0) - - loss = loss_fn(actions, logits, advantages, batched=batched, - sequence_length=sequence_length, - sum_over_timesteps=False) - rank = sess.run(tf.rank(loss)) - self.assertEqual(rank, 1) - self.assertEqual(loss.shape, tf.TensorShape([self._max_time])) - - loss = loss_fn(actions, logits, advantages, batched=batched, - sequence_length=sequence_length, - sum_over_timesteps=False, - average_across_timesteps=True, - average_across_batch=False) - rank = sess.run(tf.rank(loss)) - if batched: - self.assertEqual(rank, 1) - self.assertEqual(loss.shape, tf.TensorShape([self._batch_size])) - else: - self.assertEqual(rank, 0) - - loss = loss_fn(actions, logits, advantages, batched=batched, - sequence_length=sequence_length, - sum_over_timesteps=False, - average_across_batch=False) - rank = sess.run(tf.rank(loss)) - if batched: - self.assertEqual(rank, 2) - self.assertEqual(loss.shape, - tf.TensorShape([self._batch_size, - self._max_time])) - else: - self.assertEqual(rank, 1) - self.assertEqual(loss.shape, - tf.TensorShape([self._max_time])) - - sequence_length_time = tf.random_uniform( - [self._max_time], maxval=self._max_time, dtype=tf.int32) - loss = loss_fn(actions, logits, advantages, batched=batched, - sequence_length=sequence_length_time, - sum_over_timesteps=False, - average_across_batch=False, - time_major=True) - if batched: - self.assertEqual(loss.shape, - tf.TensorShape([self._batch_size, - self._max_time])) - else: - self.assertEqual(loss.shape, - tf.TensorShape([self._max_time])) - - def test_pg_losses_with_logits(self): - """Tests `pg_losses_with_logits`. - """ - self._test_sequence_loss(tx.losses.pg_loss_with_logits, - self._actions_batch, - self._logits_batch, - self._advantages_batch, - True, - self._sequence_length) - - self._test_sequence_loss(tx.losses.pg_loss_with_logits, - self._actions_no_batch, - self._logits_no_batch, - self._advantages_no_batch, - False, - self._sequence_length) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/losses/rewards.py b/texar/tf/losses/rewards.py deleted file mode 100644 index 88c0a2a6..00000000 --- a/texar/tf/losses/rewards.py +++ /dev/null @@ -1,215 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Various reward related functions. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -import tensorflow as tf - -from texar.tf.utils.shapes import mask_sequences - -# pylint: disable=invalid-name, too-many-arguments, no-member - -__all__ = [ - "discount_reward", - "_discount_reward_py_1d", - "_discount_reward_tensor_1d", - "_discount_reward_py_2d", - "_discount_reward_tensor_2d" -] - - -def discount_reward(reward, - sequence_length=None, - discount=1., - normalize=False, - dtype=None, - tensor_rank=1): - """Computes discounted reward. - - :attr:`reward` and :attr:`sequence_length` can be either Tensors or python - arrays. If both are python array (or `None`), the return will be a python - array as well. Otherwise tf Tensors are returned. - - Args: - reward: A Tensor or python array. Can be 1D with shape `[batch_size]`, - or 2D with shape `[batch_size, max_time]`. - sequence_length (optional): A Tensor or python array of shape - `[batch_size]`. Time steps beyond the respective sequence lengths - will be masked. Required if :attr:`reward` is 1D. - discount (float): A scalar. The discount factor. - normalize (bool): Whether to normalize the discounted reward, by - `(discounted_reward - mean) / std`. Here `mean` and `std` are - over all time steps and all samples in the batch. - dtype (dtype): Type of :attr:`reward`. If `None`, infer from - `reward` automatically. - tensor_rank (int): The number of dimensions of :attr:`reward`. - Default is 1, i.e., :attr:`reward` is a 1D Tensor consisting - of a batch dimension. Ignored if :attr:`reward` - and :attr:`sequence_length` are python arrays (or `None`). - - Returns: - A 2D Tensor or python array of the discounted reward. - - If :attr:`reward` and :attr:`sequence_length` are python - arrays (or `None`), the returned value is a python array as well. - - - Example: - - .. code-block:: python - - r = [2., 1.] - seq_length = [3, 2] - discounted_r = discount_reward(r, seq_length, discount=0.1) - # discounted_r == [[2. * 0.1^2, 2. * 0.1, 2.], - # [1. * 0.1, 1., 0.]] - - r = [[3., 4., 5.], [6., 7., 0.]] - seq_length = [3, 2] - discounted_r = discount_reward(r, seq_length, discount=0.1) - # discounted_r == [[3. + 4.*0.1 + 5.*0.1^2, 4. + 5.*0.1, 5.], - # [6. 
+ 7.*0.1, 7., 0.]] - """ - is_tensor = tf.contrib.framework.is_tensor - if is_tensor(reward) or is_tensor(sequence_length): - if tensor_rank == 1: - disc_reward = _discount_reward_tensor_1d( - reward, sequence_length, discount, dtype) - elif tensor_rank == 2: - disc_reward = _discount_reward_tensor_2d( - reward, sequence_length, discount, dtype) - else: - raise ValueError("`tensor_rank` can only be 1 or 2.") - - if normalize: - mu, var = tf.nn.moments(disc_reward, axes=[0, 1], keep_dims=True) - disc_reward = (disc_reward - mu) / (tf.sqrt(var) + 1e-8) - else: - reward = np.array(reward) - tensor_rank = reward.ndim - if tensor_rank == 1: - disc_reward = _discount_reward_py_1d( - reward, sequence_length, discount, dtype) - elif tensor_rank == 2: - disc_reward = _discount_reward_py_2d( - reward, sequence_length, discount, dtype) - else: - raise ValueError("`reward` can only be 1D or 2D.") - - if normalize: - mu = np.mean(disc_reward) - std = np.std(disc_reward) - disc_reward = (disc_reward - mu) / (std + 1e-8) - - return disc_reward - - -def _discount_reward_py_1d(reward, sequence_length, discount=1., dtype=None): - if sequence_length is None: - raise ValueError('sequence_length must not be `None` for 1D reward.') - - reward = np.array(reward) - sequence_length = np.array(sequence_length) - - batch_size = reward.shape[0] - max_seq_length = np.max(sequence_length) - dtype = dtype or reward.dtype - - if discount == 1.: - dmat = np.ones([batch_size, max_seq_length], dtype=dtype) - else: - steps = np.tile(np.arange(max_seq_length), [batch_size, 1]) - mask = np.asarray(steps < (sequence_length - 1)[:, None], dtype=dtype) - # Make each row = [discount, ..., discount, 1, ..., 1] - dmat = mask * discount + (1 - mask) - dmat = np.cumprod(dmat[:, ::-1], axis=1)[:, ::-1] - - disc_reward = dmat * reward[:, None] - disc_reward = mask_sequences(disc_reward, sequence_length, dtype=dtype) - # mask = np.asarray(steps < sequence_length[:, None], dtype=dtype) - # disc_reward = mask * disc_reward - - return disc_reward - - -def _discount_reward_tensor_1d(reward, sequence_length, - discount=1., dtype=None): - if sequence_length is None: - raise ValueError('sequence_length must not be `None` for 1D reward.') - - batch_size = tf.shape(reward)[0] - max_seq_length = tf.reduce_max(sequence_length) - dtype = dtype or reward.dtype - - if discount == 1.: - dmat = tf.ones( - tf.concat([[batch_size], [max_seq_length]], 0), dtype=dtype) - else: - mask = tf.sequence_mask(sequence_length, dtype=dtype) - mask = tf.concat([mask[:, 1:], tf.zeros_like(mask[:, -1:])], axis=1) - # Make each row = [discount, ..., discount, 1, ..., 1] - dmat = mask * discount + (1 - mask) - dmat = tf.cumprod(dmat, axis=1, reverse=True) - - disc_reward = dmat * tf.expand_dims(reward, -1) - disc_reward = mask_sequences( - disc_reward, sequence_length, dtype=dtype, tensor_rank=2) - - return disc_reward - - -def _discount_reward_py_2d(reward, sequence_length=None, - discount=1., dtype=None): - if sequence_length is not None: - reward = mask_sequences(reward, sequence_length, dtype=dtype) - - dtype = dtype or reward.dtype - - if discount == 1.: - disc_reward = np.cumsum( - reward[:, ::-1], axis=1, dtype=dtype)[:, ::-1] - else: - disc_reward = np.copy(reward) - for i in range(reward.shape[1] - 2, -1, -1): - disc_reward[:, i] += disc_reward[:, i + 1] * discount - - return disc_reward - - -def _discount_reward_tensor_2d(reward, sequence_length=None, - discount=1., dtype=None): - if sequence_length is not None: - reward = mask_sequences( - reward, 
sequence_length, dtype=dtype, tensor_rank=2) - - if discount == 1.: - disc_reward = tf.cumsum(reward, axis=1, reverse=True) - else: - # [max_time, batch_size] - rev_reward_T = tf.transpose(tf.reverse(reward, [1]), [1, 0]) - rev_reward_T_cum = tf.scan( - fn=lambda acc, cur: cur + discount * acc, - elems=rev_reward_T, - initializer=tf.zeros_like(reward[:, 1]), - back_prop=False) - disc_reward = tf.reverse( - tf.transpose(rev_reward_T_cum, [1, 0]), [1]) - - return disc_reward diff --git a/texar/tf/losses/rewards_test.py b/texar/tf/losses/rewards_test.py deleted file mode 100644 index c703ad81..00000000 --- a/texar/tf/losses/rewards_test.py +++ /dev/null @@ -1,203 +0,0 @@ -""" -Unit tests for RL rewards. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -# pylint: disable=invalid-name, no-member - -import numpy as np - -import tensorflow as tf - -from texar.tf.losses.rewards import \ - _discount_reward_tensor_2d, _discount_reward_tensor_1d, \ - _discount_reward_py_1d, _discount_reward_py_2d, \ - discount_reward - - -class RewardTest(tf.test.TestCase): - """Tests reward related functions. - """ - - def test_discount_reward(self): - """Tests :func:`texar.tf.losses.rewards.discount_reward` - """ - # 1D - reward = np.ones([2], dtype=np.float64) - sequence_length = [3, 5] - - discounted_reward = discount_reward( - reward, sequence_length, discount=1.) - discounted_reward_n = discount_reward( - reward, sequence_length, discount=.1, normalize=True) - - discounted_reward_ = discount_reward( - tf.constant(reward, dtype=tf.float64), - sequence_length, discount=1.) - discounted_reward_n_ = discount_reward( - tf.constant(reward, dtype=tf.float64), - sequence_length, discount=.1, normalize=True) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - - r, r_n = sess.run([discounted_reward_, discounted_reward_n_]) - - np.testing.assert_array_almost_equal( - discounted_reward, r, decimal=6) - np.testing.assert_array_almost_equal( - discounted_reward_n, r_n, decimal=6) - - # 2D - reward = np.ones([2, 10], dtype=np.float64) - sequence_length = [5, 10] - - discounted_reward = discount_reward( - reward, sequence_length, discount=1.) - discounted_reward_n = discount_reward( - reward, sequence_length, discount=.1, normalize=True) - - discounted_reward_ = discount_reward( - tf.constant(reward, dtype=tf.float64), sequence_length, - discount=1., tensor_rank=2) - discounted_reward_n_ = discount_reward( - tf.constant(reward, dtype=tf.float64), sequence_length, - discount=.1, tensor_rank=2, normalize=True) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - - r, r_n = sess.run([discounted_reward_, discounted_reward_n_]) - - np.testing.assert_array_almost_equal( - discounted_reward, r, decimal=6) - np.testing.assert_array_almost_equal( - discounted_reward_n, r_n, decimal=6) - - def test_discount_reward_py_1d(self): - """Tests :func:`texar.tf.losses.rewards._discount_reward_py_1d` - """ - reward = np.ones([2], dtype=np.float64) - sequence_length = [3, 5] - - discounted_reward_1 = _discount_reward_py_1d( - reward, sequence_length, discount=1.) 
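-        # With discount=1., each valid step keeps the full reward and steps
-        # beyond the sequence length are masked to zero, e.g., row 0 becomes
-        # [1., 1., 1., 0., 0.] for sequence_length 3.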
- - discounted_reward_2 = _discount_reward_py_1d( - reward, sequence_length, discount=.1) - - r = discounted_reward_1 - for i in range(5): - if i < 3: - self.assertEqual(r[0, i], 1) - else: - self.assertEqual(r[0, i], 0) - self.assertEqual(r[1, i], 1) - - r = discounted_reward_2 - for i in range(5): - if i < 3: - self.assertAlmostEqual(r[0, i], 0.1**(2 - i)) - else: - self.assertAlmostEqual(r[0, i], 0) - self.assertAlmostEqual(r[1, i], 0.1**(4 - i)) - - def test_discount_reward_tensor_1d(self): - """Tests :func:`texar.tf.losses.rewards._discount_reward_tensor_1d` - """ - reward = tf.ones([2], dtype=tf.float64) - sequence_length = [3, 5] - - discounted_reward_1 = _discount_reward_tensor_1d( - reward, sequence_length, discount=1.) - - discounted_reward_2 = _discount_reward_tensor_1d( - reward, sequence_length, discount=.1) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - - r = sess.run(discounted_reward_1) - for i in range(5): - if i < 3: - self.assertEqual(r[0, i], 1) - else: - self.assertEqual(r[0, i], 0) - self.assertEqual(r[1, i], 1) - - r = sess.run(discounted_reward_2) - for i in range(5): - if i < 3: - self.assertAlmostEqual(r[0, i], 0.1**(2 - i)) - else: - self.assertAlmostEqual(r[0, i], 0) - self.assertAlmostEqual(r[1, i], 0.1**(4 - i)) - - def test_discount_reward_py_2d(self): - """Tests :func:`texar.tf.losses.rewards._discount_reward_py_2d` - """ - reward = np.ones([2, 10], dtype=np.float64) - sequence_length = [5, 10] - - discounted_reward_1 = _discount_reward_py_2d( - reward, sequence_length, discount=1.) - - discounted_reward_2 = _discount_reward_py_2d( - reward, sequence_length, discount=.1) - - r = discounted_reward_1 - for i in range(10): - if i < 5: - self.assertEqual(r[0, i], 5 - i) - else: - self.assertEqual(r[0, i], 0) - self.assertEqual(r[1, i], 10 - i) - - r = discounted_reward_2 - for i in range(10): - if i < 5: - self.assertEqual(r[0, i], int(11111. / 10**i) / 10**(4 - i)) - else: - self.assertEqual(r[0, i], 0) - self.assertEqual(r[1, i], int(1111111111. / 10**i) / 10**(9 - i)) - - def test_discount_reward_tensor_2d(self): - """Tests :func:`texar.tf.losses.rewards._discount_reward_tensor_2d` - """ - reward = tf.ones([2, 10], dtype=tf.float64) - sequence_length = [5, 10] - - discounted_reward_1 = _discount_reward_tensor_2d( - reward, sequence_length, discount=1.) - - discounted_reward_2 = _discount_reward_tensor_2d( - reward, sequence_length, discount=.1) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - - r = sess.run(discounted_reward_1) - for i in range(10): - if i < 5: - self.assertEqual(r[0, i], 5 - i) - else: - self.assertEqual(r[0, i], 0) - self.assertEqual(r[1, i], 10 - i) - - r = sess.run(discounted_reward_2) - for i in range(10): - if i < 5: - self.assertEqual(r[0, i], - int(11111. / 10**i) / 10**(4 - i)) - else: - self.assertEqual(r[0, i], 0) - self.assertEqual(r[1, i], - int(1111111111. / 10**i) / 10**(9 - i)) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/losses/rl_losses.py b/texar/tf/losses/rl_losses.py deleted file mode 100644 index 25844be8..00000000 --- a/texar/tf/losses/rl_losses.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Various RL losses
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from texar.tf.utils.shapes import mask_sequences
-
-
-def reinforce_loss(sample_fn,
-                   global_reward_fn,
-                   local_reward_fn=None,
-                   num_samples=1):
-    """Computes REINFORCE loss with global and local rewards.
-
-    Args:
-        sample_fn: A callable that takes :attr:`num_samples` and returns
-            `(samples, probabilities, sequence_lengths)`, where:
-
-            `samples` is a Tensor of shape `[num_samples, max_sequence_length]`
-            containing the generated samples;
-
-            `probabilities` is a Tensor of shape
-            `[num_samples, max_sequence_length]` containing the probabilities of
-            generating each position of the samples. Probabilities beyond the
-            respective sequence lengths are ignored.
-
-            `sequence_lengths` is a Tensor of shape `[num_samples]` containing
-            the length of each sample.
-        global_reward_fn: A callable that takes `(samples, sequence_lengths)`
-            and returns a Tensor of shape `[num_samples]` containing the reward
-            of each of the samples.
-        local_reward_fn (optional): A callable that takes
-            `(samples, sequence_lengths)` and returns a Tensor of shape
-            `[num_samples, max_sequence_length]` containing the local reward
-            at each time step of samples.
-        num_samples (int scalar Tensor): the number of sequences to sample.
-
-    Returns:
-        A scalar Tensor of the REINFORCE loss.
-    """
-
-    # shape = [batch, length]
-    sequences, probs, seq_lens = sample_fn(num_samples)
-    batch = tf.shape(sequences)[0]
-    rewards_local = tf.zeros_like(probs)
-    if local_reward_fn is not None:
-        rewards_local = local_reward_fn(sequences, seq_lens)
-
-    # shape = [batch, ]
-    rewards_global = global_reward_fn(sequences, seq_lens)
-    # broadcast rewards_global to match the shape of rewards_local
-    rewards = rewards_local + tf.reshape(rewards_global, [batch, 1])
-
-    eps = 1e-12
-    log_probs = mask_sequences(tf.log(probs + eps), seq_lens)
-    loss = -tf.reduce_mean(
-        tf.reduce_sum(log_probs * rewards, axis=1) /
-        tf.cast(seq_lens, log_probs.dtype))
-    return loss
-
-
-def reinforce_loss_with_MCtree(sample_fn,  # pylint: disable=invalid-name
                               global_reward_fn,
                               local_reward_fn=None,
                               num_samples=1):
-    """Computes REINFORCE loss with Monte Carlo tree search.
-
-    Args:
-        sample_fn: A callable that takes :attr:`num_samples`, `given_actions`
-            and returns `(samples, probabilities, sequence_lengths)`, where:
-
-            `samples` is a Tensor of shape `[num_samples, max_sequence_length]`
-            containing the generated samples;
-
-            `probabilities` is a Tensor of shape
-            `[num_samples, max_sequence_length]` containing the probabilities of
-            generating each position of the samples. Probabilities beyond the
-            respective sequence lengths are ignored.
-
-            `sequence_lengths` is a Tensor of shape `[num_samples]` containing
-            the length of each sample.
-        global_reward_fn: A callable that takes `(samples, sequence_lengths)`
-            and returns a Tensor of shape `[num_samples]` containing the reward
-            of each of the samples.
- local_reward_fn (optional): A callable that takes - `(samples, sequence_lengths)` and returns a Tensor of shape - `[num_samples, max_sequence_length]` containing the local reward - at each time step of samples. - num_samples (int scalar Tensor): the number of sequences to sample. - - Returns: - A scalar Tensor of the REINFORCE loss. - """ - raise NotImplementedError diff --git a/texar/tf/models/__init__.py b/texar/tf/models/__init__.py deleted file mode 100644 index f35cf092..00000000 --- a/texar/tf/models/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Modules of texar library models. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=wildcard-import - -from texar.tf.models.model_base import * -from texar.tf.models.seq2seq import * diff --git a/texar/tf/models/model_base.py b/texar/tf/models/model_base.py deleted file mode 100644 index 96e2399d..00000000 --- a/texar/tf/models/model_base.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Base class for models. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from texar.tf.hyperparams import HParams - -# pylint: disable=too-many-arguments - -__all__ = [ - "ModelBase" -] - - -class ModelBase(object): - """Base class inherited by all model classes. - - A model class implements interfaces that are compatible with - :tf_main:`TF Estimator `. In particular, - :meth:`_build` implements the - :tf_main:`model_fn ` interface; and - :meth:`get_input_fn` is for the :attr:`input_fn` interface. - - .. document private functions - .. automethod:: _build - """ - - def __init__(self, hparams=None): - self._hparams = HParams(hparams, self.default_hparams(), - allow_new_hparam=True) - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - """ - hparams = { - "name": "model" - } - return hparams - - def __call__(self, features, labels, params, mode, config=None): - """Used for the :tf_main:`model_fn ` - argument when constructing - :tf_main:`tf.estimator.Estimator `. 
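-
-        Example (an illustrative sketch; `MyModel` is a hypothetical
-        subclass of :class:`ModelBase`):
-
-        .. code-block:: python
-
-            model = MyModel(hparams)
-            estimator = tf.estimator.Estimator(model_fn=model)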
- """ - return self._build(features, labels, params, mode, config=config) - - def _build(self, features, labels, params, mode, config=None): - """Used for the :tf_main:`model_fn ` - argument when constructing - :tf_main:`tf.estimator.Estimator `. - """ - raise NotImplementedError - - def get_input_fn(self, *args, **kwargs): - """Returns the :attr:`input_fn` function that constructs the input - data, used in :tf_main:`tf.estimator.Estimator `. - """ - raise NotImplementedError - - @property - def hparams(self): - """A :class:`~texar.tf.HParams` instance. The hyperparameters - of the module. - """ - return self._hparams diff --git a/texar/tf/models/seq2seq/__init__.py b/texar/tf/models/seq2seq/__init__.py deleted file mode 100644 index 23a59d55..00000000 --- a/texar/tf/models/seq2seq/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Modules of texar library seq2seq models. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=wildcard-import - -from texar.tf.models.seq2seq.seq2seq_base import * -from texar.tf.models.seq2seq.basic_seq2seq import * diff --git a/texar/tf/models/seq2seq/basic_seq2seq.py b/texar/tf/models/seq2seq/basic_seq2seq.py deleted file mode 100644 index 04795f92..00000000 --- a/texar/tf/models/seq2seq/basic_seq2seq.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -The basic seq2seq model without attention. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from texar.tf.models.seq2seq.seq2seq_base import Seq2seqBase -from texar.tf.modules.decoders.beam_search_decode import beam_search_decode -from texar.tf.utils import utils -from texar.tf.utils.shapes import get_batch_size - -# pylint: disable=protected-access, too-many-arguments, unused-argument - -__all__ = [ - "BasicSeq2seq" -] - - -class BasicSeq2seq(Seq2seqBase): - """The basic seq2seq model (without attention). - - Example: - - .. code-block:: python - - model = BasicSeq2seq(data_hparams, model_hparams) - exor = tx.run.Executor( - model=model, - data_hparams=data_hparams, - config=run_config) - exor.train_and_evaluate( - max_train_steps=10000, - eval_steps=100) - - .. document private functions - .. 
automethod:: _build - """ - - def __init__(self, data_hparams, hparams=None): - Seq2seqBase.__init__(self, data_hparams, hparams=hparams) - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - Same as :meth:`~texar.tf.models.Seq2seqBase.default_hparams` of - :class:`~texar.tf.models.Seq2seqBase`. - """ - hparams = Seq2seqBase.default_hparams() - hparams.update({ - "name": "basic_seq2seq" - }) - return hparams - - def _build_decoder(self): - kwargs = { - "vocab_size": self._tgt_vocab.size, - "hparams": self._hparams.decoder_hparams.todict() - } - self._decoder = utils.check_or_get_instance( - self._hparams.decoder, kwargs, - ["texar.tf.modules", "texar.tf.custom"]) - - def _get_predictions(self, decoder_results, features, labels, loss=None): - preds = {} - - preds.update(features) - - if labels is not None: - preds.update(labels) - - preds.update(utils.flatten_dict({'decode': decoder_results})) - preds['decode.outputs.sample'] = self._tgt_vocab.map_ids_to_tokens( - preds['decode.outputs.sample_id']) - - if loss is not None: - preds['loss'] = loss - - return preds - - def embed_source(self, features, labels, mode): - """Embeds the inputs. - """ - return self._src_embedder(ids=features["source_text_ids"], mode=mode) - - def embed_target(self, features, labels, mode): - """Embeds the target inputs. Used in training. - """ - return self._tgt_embedder(ids=labels["target_text_ids"], mode=mode) - - def encode(self, features, labels, mode): - """Encodes the inputs. - """ - embedded_source = self.embed_source(features, labels, mode) - - outputs, final_state = self._encoder( - embedded_source, - sequence_length=features["source_length"], - mode=mode) - - return {'outputs': outputs, 'final_state': final_state} - - def _connect(self, encoder_results, features, labels, mode): - """Transforms encoder final state into decoder initial state. - """ - enc_state = encoder_results["final_state"] - possible_kwargs = { - "inputs": enc_state, - "batch_size": get_batch_size(enc_state) - } - outputs = utils.call_function_with_redundant_kwargs( - self._connector._build, possible_kwargs) - return outputs - - def _decode_train(self, initial_state, encoder_results, features, - labels, mode): - return self._decoder( - initial_state=initial_state, - decoding_strategy=self._hparams.decoding_strategy_train, - inputs=self.embed_target(features, labels, mode), - sequence_length=labels['target_length'] - 1, - mode=mode) - - def _decode_infer(self, initial_state, encoder_results, features, - labels, mode): - start_token = self._tgt_vocab.bos_token_id - start_tokens = tf.ones_like(features['source_length']) * start_token - - max_l = self._decoder.hparams.max_decoding_length_infer - - if self._hparams.beam_search_width > 1: - return beam_search_decode( - decoder_or_cell=self._decoder, - embedding=self._tgt_embedder.embedding, - start_tokens=start_tokens, - end_token=self._tgt_vocab.eos_token_id, - beam_width=self._hparams.beam_search_width, - initial_state=initial_state, - max_decoding_length=max_l) - else: - return self._decoder( - initial_state=initial_state, - decoding_strategy=self._hparams.decoding_strategy_infer, - embedding=self._tgt_embedder.embedding, - start_tokens=start_tokens, - end_token=self._tgt_vocab.eos_token_id, - mode=mode) - - def decode(self, encoder_results, features, labels, mode): - """Decodes. 
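-
-        Transforms the encoder results into a decoder initial state via the
-        connector, then dispatches to :meth:`_decode_infer` in PREDICT mode
-        and to :meth:`_decode_train` otherwise.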
- """ - initial_state = self._connect(encoder_results, features, labels, mode) - - if mode == tf.estimator.ModeKeys.PREDICT: - outputs, final_state, sequence_length = self._decode_infer( - initial_state, encoder_results, features, labels, mode) - else: - outputs, final_state, sequence_length = self._decode_train( - initial_state, encoder_results, features, labels, mode) - - return {'outputs': outputs, - 'final_state': final_state, - 'sequence_length': sequence_length} diff --git a/texar/tf/models/seq2seq/seq2seq_base.py b/texar/tf/models/seq2seq/seq2seq_base.py deleted file mode 100644 index 94307a70..00000000 --- a/texar/tf/models/seq2seq/seq2seq_base.py +++ /dev/null @@ -1,330 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Base class for seq2seq models. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from texar.tf.models.model_base import ModelBase -from texar.tf.losses.mle_losses import sequence_sparse_softmax_cross_entropy -from texar.tf.data.data.paired_text_data import PairedTextData -from texar.tf.core.optimization import get_train_op -from texar.tf.hyperparams import HParams -from texar.tf.utils import utils -from texar.tf.utils.variables import collect_trainable_variables - -# pylint: disable=too-many-instance-attributes, unused-argument, -# pylint: disable=too-many-arguments, no-self-use - -__all__ = [ - "Seq2seqBase" -] - - -class Seq2seqBase(ModelBase): - """Base class inherited by all seq2seq model classes. - - .. document private functions - .. automethod:: _build - """ - - def __init__(self, data_hparams, hparams=None): - ModelBase.__init__(self, hparams) - - self._data_hparams = HParams(data_hparams, - PairedTextData.default_hparams()) - - self._src_vocab = None - self._tgt_vocab = None - self._src_embedder = None - self._tgt_embedder = None - self._connector = None - self._encoder = None - self._decoder = None - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - .. code-block:: python - - { - "source_embedder": "WordEmbedder", - "source_embedder_hparams": {}, - "target_embedder": "WordEmbedder", - "target_embedder_hparams": {}, - "embedder_share": True, - "embedder_hparams_share": True, - "encoder": "UnidirectionalRNNEncoder", - "encoder_hparams": {}, - "decoder": "BasicRNNDecoder", - "decoder_hparams": {}, - "decoding_strategy_train": "train_greedy", - "decoding_strategy_infer": "infer_greedy", - "beam_search_width": 0, - "connector": "MLPTransformConnector", - "connector_hparams": {}, - "optimization": {}, - "name": "seq2seq", - } - - Here: - - "source_embedder": str or class or instance - Word embedder for source text. Can be a class, its name or module - path, or a class instance. - - "source_embedder_hparams": dict - Hyperparameters for constructing the source embedder. 
E.g., - See :meth:`~texar.tf.modules.WordEmbedder.default_hparams` for - hyperparameters of :class:`~texar.tf.modules.WordEmbedder`. Ignored - if "source_embedder" is an instance. - - "target_embedder", "target_embedder_hparams": - Same as "source_embedder" and "source_embedder_hparams" but for - target text embedder. - - "embedder_share": bool - Whether to share the source and target embedder. If `True`, - source embedder will be used to embed target text. - - "embedder_hparams_share": bool - Whether to share the embedder configurations. If `True`, - target embedder will be created with "source_embedder_hparams". - But the two embedders have different set of trainable variables. - - "encoder", "encoder_hparams": - Same as "source_embedder" and "source_embedder_hparams" but for - encoder. - - "decoder", "decoder_hparams": - Same as "source_embedder" and "source_embedder_hparams" but for - decoder. - - "decoding_strategy_train": str - The decoding strategy in training mode. See - :meth:`~texar.tf.modules.RNNDecoderBase._build` for details. - - "decoding_strategy_infer": str - The decoding strategy in eval/inference mode. - - "beam_search_width": int - Beam width. If > 1, beam search is used in eval/inference mode. - - "connector", "connector_hparams": - The connector class and hyperparameters. A connector transforms - an encoder final state to a decoder initial state. - - "optimization": dict - Hyperparameters of optimizating the model. See - :func:`~texar.tf.core.default_optimization_hparams` for details. - - "name": str - Name of the model. - """ - hparams = ModelBase.default_hparams() - hparams.update({ - "name": "seq2seq", - "source_embedder": "WordEmbedder", - "source_embedder_hparams": {}, - "target_embedder": "WordEmbedder", - "target_embedder_hparams": {}, - "embedder_share": True, - "embedder_hparams_share": True, - "encoder": "UnidirectionalRNNEncoder", - "encoder_hparams": {}, - "decoder": "BasicRNNDecoder", - "decoder_hparams": {}, - "decoding_strategy_train": "train_greedy", - "decoding_strategy_infer": "infer_greedy", - "beam_search_width": 0, - "connector": "MLPTransformConnector", - "connector_hparams": {}, - "optimization": {} - }) - return hparams - - def _build_vocab(self): - self._src_vocab, self._tgt_vocab = PairedTextData.make_vocab( - self._data_hparams.source_dataset, - self._data_hparams.target_dataset) - - def _build_embedders(self): - kwargs = { - "vocab_size": self._src_vocab.size, - "hparams": self._hparams.source_embedder_hparams.todict() - } - self._src_embedder = utils.check_or_get_instance( - self._hparams.source_embedder, kwargs, - ["texar.tf.modules", "texar.tf.custom"]) - - if self._hparams.embedder_share: - self._tgt_embedder = self._src_embedder - else: - kwargs = { - "vocab_size": self._tgt_vocab.size, - } - if self._hparams.embedder_hparams_share: - kwargs["hparams"] = \ - self._hparams.source_embedder_hparams.todict() - else: - kwargs["hparams"] = \ - self._hparams.target_embedder_hparams.todict() - self._tgt_embedder = utils.check_or_get_instance( - self._hparams.target_embedder, kwargs, - ["texar.tf.modules", "texar.tf.custom"]) - - def _build_encoder(self): - kwargs = { - "hparams": self._hparams.encoder_hparams.todict() - } - self._encoder = utils.check_or_get_instance( - self._hparams.encoder, kwargs, - ["texar.tf.modules", "texar.tf.custom"]) - - def _build_decoder(self): - raise NotImplementedError - - def _build_connector(self): - kwargs = { - "output_size": self._decoder.state_size, - "hparams": self._hparams.connector_hparams.todict() - } - 
self._connector = utils.check_or_get_instance( - self._hparams.connector, kwargs, - ["texar.tf.modules", "texar.tf.custom"]) - - def get_loss(self, decoder_results, features, labels): - """Computes the training loss. - """ - return sequence_sparse_softmax_cross_entropy( - labels=labels['target_text_ids'][:, 1:], - logits=decoder_results['outputs'].logits, - sequence_length=decoder_results['sequence_length']) - - def _get_predictions(self, decoder_results, features, labels, loss=None): - raise NotImplementedError - - def _get_train_op(self, loss): - varlist = collect_trainable_variables( - [self._src_embedder, self._tgt_embedder, self._encoder, - self._connector, self._decoder]) - return get_train_op( - loss, variables=varlist, hparams=self._hparams.optimization) - - def _get_eval_metric_ops(self, decoder_results, features, labels): - return None - - def embed_source(self, features, labels, mode): - """Embeds the inputs. - """ - raise NotImplementedError - - def embed_target(self, features, labels, mode): - """Embeds the target inputs. Used in training. - """ - raise NotImplementedError - - def encode(self, features, labels, mode): - """Encodes the inputs. - """ - raise NotImplementedError - - def _connect(self, encoder_results, features, labels, mode): - """Transforms encoder final state into decoder initial state. - """ - raise NotImplementedError - - def decode(self, encoder_results, features, labels, mode): - """Decodes. - """ - raise NotImplementedError - - def _build(self, features, labels, params, mode, config=None): - self._build_vocab() - self._build_embedders() - self._build_encoder() - self._build_decoder() - self._build_connector() - - encoder_results = self.encode(features, labels, mode) - decoder_results = self.decode(encoder_results, features, labels, mode) - - loss, train_op, preds, eval_metric_ops = None, None, None, None - if mode == tf.estimator.ModeKeys.PREDICT: - preds = self._get_predictions(decoder_results, features, labels) - else: - loss = self.get_loss(decoder_results, features, labels) - - if mode == tf.estimator.ModeKeys.TRAIN: - train_op = self._get_train_op(loss) - if mode == tf.estimator.ModeKeys.EVAL: - eval_metric_ops = self._get_eval_metric_ops( - decoder_results, features, labels) - - preds = self._get_predictions(decoder_results, features, labels, - loss) - - return tf.estimator.EstimatorSpec( - mode=mode, - predictions=preds, - loss=loss, - train_op=train_op, - eval_metric_ops=eval_metric_ops) - - def get_input_fn(self, mode, hparams=None): - """Creates an input function `input_fn` that provides input data - for the model in an :tf_main:`Estimator `. - See, e.g., :tf_main:`tf.estimator.train_and_evaluate - `. - - Args: - mode: One of members in - :tf_main:`tf.estimator.ModeKeys `. - hparams: A `dict` or an :class:`~texar.tf.HParams` instance - containing the hyperparameters of - :class:`~texar.tf.data.PairedTextData`. See - :meth:`~texar.tf.data.PairedTextData.default_hparams` for the - the structure and default values of the hyperparameters. - - Returns: - An input function that returns a tuple `(features, labels)` - when called. `features` contains data fields that are related - to source text, and `labels` contains data fields related - to target text. See :class:`~texar.tf.data.PairedTextData` for - all data fields. 
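For readers tracking this removal: the `get_input_fn` / `_build` pair existed to plug a Texar model into the TF1 Estimator API. Below is a minimal sketch of that wiring, assuming hypothetical `data_hparams` and `run_config` values (placeholders, not values from this repository):

.. code-block:: python

    import tensorflow as tf
    import texar.tf as tx

    model = tx.models.BasicSeq2seq(data_hparams)
    # A ModelBase instance is callable with the (features, labels, params,
    # mode, config) signature that tf.estimator expects of a model_fn.
    estimator = tf.estimator.Estimator(model_fn=model, config=run_config)

    tf.estimator.train_and_evaluate(
        estimator,
        tf.estimator.TrainSpec(
            input_fn=model.get_input_fn(
                tf.estimator.ModeKeys.TRAIN, data_hparams),
            max_steps=10000),
        tf.estimator.EvalSpec(
            input_fn=model.get_input_fn(
                tf.estimator.ModeKeys.EVAL, data_hparams)))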
- """ - def _input_fn(): - data = PairedTextData(hparams) - - iterator = data.dataset.make_initializable_iterator() - tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, - iterator.initializer) - - batch = iterator.get_next() - - features, labels = {}, {} - for key, value in batch.items(): - if key.startswith('source_'): - features[key] = value - else: - labels[key] = value - return features, labels - - return _input_fn diff --git a/texar/tf/module_base.py b/texar/tf/module_base.py index 5c3b1724..b2682f34 100644 --- a/texar/tf/module_base.py +++ b/texar/tf/module_base.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,62 +14,33 @@ """ Base class for modules. """ - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import re +from abc import ABC, abstractmethod import tensorflow as tf -from texar.tf.utils.exceptions import TexarError from texar.tf.hyperparams import HParams __all__ = [ - "ModuleBase" + "ModuleBase", ] -class ModuleBase(object): - """Base class inherited by modules that create Variables and are +class ModuleBase(tf.keras.layers.Layer, ABC): + r"""Base class inherited by modules that create Variables and are configurable through hyperparameters. - A Texar module inheriting :class:`~texar.tf.ModuleBase` has following key - features: - - - **Convenient variable re-use**: A module instance creates \ - its own sets of variables, and automatically re-uses its variables on \ - subsequent calls. Hence TF variable/name scope is \ - transparent to users. For example: - - .. code-block:: python - - encoder = UnidirectionalRNNEncoder(hparams) # create instance - output_1 = encoder(inputs_1) # variables are created - output_2 = encoder(inputs_2) # variables are re-used - - print(encoder.trainable_variables) # access trainable variables - # [ ... ] - - - **Configurable through hyperparameters**: Each module defines \ - allowed hyperparameters and default values. Hyperparameters not \ - specified by users will take default values. - - - **Callable**: As the above example, a module instance is "called" \ - with input tensors and returns output tensors. Every call of a module \ - will add ops to the Graph to perform the module's logic. + A Texar module inheriting :class:`~texar.tf.ModuleBase` is + **configurable through hyperparameters**. That is, each module defines + allowed hyperparameters and default values. Hyperparameters not + specified by users will take default values. Args: hparams (dict, optional): Hyperparameters of the module. See :meth:`default_hparams` for the structure and default values. - - - .. document private functions - .. automethod:: _build """ def __init__(self, hparams=None): + super().__init__() if not hasattr(self, '_hparams'): self._hparams = HParams(hparams, self.default_hparams()) else: @@ -80,15 +51,17 @@ def __init__(self, hparams=None): raise ValueError( "`self._hparams` already exists. Argument `hparams` " "must be set to `None` in this case.") - self._template = tf.make_template(self._hparams.name, self._build, - create_scope_now_=True) - self._unique_name = self.variable_scope.name.split("/")[-1] - self._trainable_variables = [] - self._built = False + + @abstractmethod + def call(self, inputs, *args, **kwargs): + r"""Defines the computation performed at every call. 
+ Should be overridden by all subclasses. + """ + raise NotImplementedError @staticmethod def default_hparams(): - """Returns a `dict` of hyperparameters of the module with default + r"""Returns a `dict` of hyperparameters of the module with default values. Used to replace the missing values of input `hparams` during module construction. @@ -102,87 +75,9 @@ def default_hparams(): "name": "module" } - def _build(self, *args, **kwargs): - """Subclass must implement this method to build the logic. - - Args: - *args: Arguments. - **kwargs: Keyword arguments. - - Returns: - Output Tensor(s). - """ - raise NotImplementedError - - def __call__(self, *args, **kwargs): - """Executes the module logic defined in _build method - - Args: - *args: Arguments of _build method. - **kwargs: Keyword arguments of _build method. - - Returns: - The output of _build method. - """ - return self._template(*args, **kwargs) - - def _add_internal_trainable_variables(self): # pylint: disable=invalid-name - """Collects trainable variables constructured internally in this module. - - This is typically called at the end of `_build()` where all necessary - trainable variables have been constructed. - """ - scope_name = self.variable_scope.name - # Escape to handle possible "." characters in the name. - # Append a slash to the end to avoid searching scopes that have this - # scope name as a prefix. - scope_name = re.escape(scope_name) + "/" - internal_trainable_variables = tf.get_collection( - tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope_name) - self._add_trainable_variable(internal_trainable_variables) - - def _add_trainable_variable(self, variable): - """Adds a trainable variable to the trainable variable list of the - module. - - Args: - variable: a (list of) trainable variable(s) constructed either - internally in the module or constructured outside but used - inside the module. - """ - if isinstance(variable, (list, tuple)): - for var in variable: - self._add_trainable_variable(var) - else: - if variable not in self._trainable_variables: - self._trainable_variables.append(variable) - - @property - def variable_scope(self): - """The variable scope of the module. - """ - return self._template.variable_scope - - @property - def name(self): - """The uniquified name of the module. - """ - return self._unique_name - - @property - def trainable_variables(self): - """The list of trainable variables of the module. - """ - if not self._built: - raise TexarError( - "Attempting to access trainable_variables before module %s " - "was fully built. The module is built once it is called, " - "e.g., with `%s(...)`" % (self.name, self.name)) - return self._trainable_variables - @property def hparams(self): - """An :class:`~texar.tf.HParams` instance. The hyperparameters + r"""An :class:`~texar.tf.HParams` instance. The hyperparameters of the module. """ return self._hparams diff --git a/texar/tf/modules/__init__.py b/texar/tf/modules/__init__.py index 711485fd..53f03fd6 100644 --- a/texar/tf/modules/__init__.py +++ b/texar/tf/modules/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,20 +15,6 @@ Modules of Texar library module. 
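To illustrate the new contract just introduced (modules now subclass `tf.keras.layers.Layer` and implement `call` instead of `_build`), here is a hypothetical minimal module; `MyDense` and its `units` hyperparameter are invented for this sketch:

.. code-block:: python

    import tensorflow as tf
    from texar.tf.module_base import ModuleBase

    class MyDense(ModuleBase):
        """A toy module: a single dense layer configured through hparams."""

        def __init__(self, hparams=None):
            super().__init__(hparams=hparams)
            self._layer = tf.keras.layers.Dense(self._hparams.units)

        @staticmethod
        def default_hparams():
            return {"name": "my_dense", "units": 16}

        def call(self, inputs):
            # Keras tracks variables automatically; the manual
            # trainable-variable bookkeeping of the TF1 base class is gone.
            return self._layer(inputs)

    module = MyDense()
    outputs = module(tf.ones([2, 8]))             # `__call__` dispatches to `call`
    assert len(module.trainable_variables) == 2  # kernel + bias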
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=wildcard-import - -from texar.tf.modules.classifiers import * -from texar.tf.modules.connectors import * -from texar.tf.modules.decoders import * from texar.tf.modules.embedders import * from texar.tf.modules.encoders import * -from texar.tf.modules.memory import * from texar.tf.modules.networks import * -from texar.tf.modules.policies import * -from texar.tf.modules.pretrained import * -from texar.tf.modules.qnets import * -from texar.tf.modules.regressors import * diff --git a/texar/tf/modules/classifiers/__init__.py b/texar/tf/modules/classifiers/__init__.py deleted file mode 100644 index 0202e1ce..00000000 --- a/texar/tf/modules/classifiers/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Modules of texar library classifiers. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=wildcard-import - -from texar.tf.modules.classifiers.conv_classifiers import * -from texar.tf.modules.classifiers.rnn_classifiers import * -from texar.tf.modules.classifiers.bert_classifier import * -from texar.tf.modules.classifiers.gpt2_classifier import * -from texar.tf.modules.classifiers.xlnet_classifier import * diff --git a/texar/tf/modules/classifiers/bert_classifier.py b/texar/tf/modules/classifiers/bert_classifier.py deleted file mode 100644 index b67e5fab..00000000 --- a/texar/tf/modules/classifiers/bert_classifier.py +++ /dev/null @@ -1,292 +0,0 @@ -# Copyright 2019 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -BERT classifiers. 
-""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from texar.tf.core.layers import get_layer -from texar.tf.modules.classifiers.classifier_base import ClassifierBase -from texar.tf.modules.encoders.bert_encoder import BERTEncoder -from texar.tf.hyperparams import HParams -from texar.tf.modules.pretrained.bert import PretrainedBERTMixin -from texar.tf.utils.utils import dict_fetch - -# pylint: disable=too-many-arguments, invalid-name, no-member, -# pylint: disable=too-many-branches, too-many-locals, too-many-statements - -__all__ = [ - "BERTClassifier" -] - - -class BERTClassifier(ClassifierBase, PretrainedBERTMixin): - r"""Classifier based on BERT modules. Please see - :class:`~texar.tf.modules.PretrainedBERTMixin` for a brief description - of BERT. - - This is a combination of the - :class:`~texar.tf.modules.BertEncoder` with a classification - layer. Both step-wise classification and sequence-level classification - are supported, specified in :attr:`hparams`. - - Arguments are the same as in - :class:`~texar.tf.modules.BERTEncoder`. - - Args: - pretrained_model_name (optional): a `str`, the name - of pre-trained model (e.g., ``bert-base-uncased``). Please refer to - :class:`~texar.tf.modules.PretrainedBERTMixin` for - all supported models. - If `None`, the model name in :attr:`hparams` is used. - cache_dir (optional): the path to a folder in which the - pre-trained models will be cached. If `None` (default), - a default directory (``texar_data`` folder under user's home - directory) will be used. - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparameters will be set to default values. See - :meth:`default_hparams` for the hyperparameter structure - and default values. - - .. document private functions - .. automethod:: _build - """ - _ENCODER_CLASS = BERTEncoder - - def __init__(self, - pretrained_model_name=None, - cache_dir=None, - hparams=None): - - super(BERTClassifier, self).__init__(hparams=hparams) - - with tf.variable_scope(self.variable_scope): - # Creates the underlying encoder - encoder_hparams = dict_fetch( - hparams, BERTEncoder.default_hparams()) - if encoder_hparams is not None: - encoder_hparams['name'] = None - self._encoder = BERTEncoder( - pretrained_model_name=pretrained_model_name, - cache_dir=cache_dir, - hparams=encoder_hparams) - - # Creates an dropout layer - drop_kwargs = {"rate": self._hparams.dropout} - layer_hparams = {"type": "Dropout", "kwargs": drop_kwargs} - self._dropout_layer = get_layer(hparams=layer_hparams) - - # Creates an additional classification layer if needed - self._num_classes = self._hparams.num_classes - if self._num_classes <= 0: - self._logit_layer = None - else: - logit_kwargs = self._hparams.logit_layer_kwargs - if logit_kwargs is None: - logit_kwargs = {} - elif not isinstance(logit_kwargs, HParams): - raise ValueError( - "hparams['logit_layer_kwargs'] must be a dict.") - else: - logit_kwargs = logit_kwargs.todict() - logit_kwargs.update({"units": self._num_classes}) - if 'name' not in logit_kwargs: - logit_kwargs['name'] = "logit_layer" - - layer_hparams = {"type": "Dense", "kwargs": logit_kwargs} - self._logit_layer = get_layer(hparams=layer_hparams) - - @staticmethod - def default_hparams(): - r"""Returns a dictionary of hyperparameters with default values. - - .. code-block:: python - - { - # (1) Same hyperparameters as in BertEncoder - ... 
- # (2) Additional hyperparameters - "num_classes": 2, - "logit_layer_kwargs": None, - "clas_strategy": "cls_time", - "max_seq_length": None, - "dropout": 0.1, - "name": "bert_classifier" - } - - Here: - - 1. Same hyperparameters as in - :class:`~texar.tf.modules.BertEncoder`. - See the :meth:`~texar.tf.modules.BertEncoder.default_hparams`. - An instance of BertEncoder is created for feature extraction. - - 2. Additional hyperparameters: - - `"num_classes"`: int - Number of classes: - - - If **> 0**, an additional :tf_main:`Dense ` - layer is appended to the encoder to compute the logits over - classes. - - If **<= 0**, no dense layer is appended. The number of - classes is assumed to be the final dense layer size of the - encoder. - - `"logit_layer_kwargs"`: dict - Keyword arguments for the logit Dense layer constructor, - except for argument "units" which is set to `num_classes`. - Ignored if no extra logit layer is appended. - - `"clas_strategy"`: str - The classification strategy, one of: - - - **cls_time**: Sequence-level classification based on the - output of the first time step (which is the `CLS` token). - Each sequence has a class. - - **all_time**: Sequence-level classification based on - the output of all time steps. Each sequence has a class. - - **time_wise**: Step-wise classification, i.e., make - classification for each time step based on its output. - - `"max_seq_length"`: int, optional - Maximum possible length of input sequences. Required if - `clas_strategy` is `all_time`. - - `"dropout"`: float - The dropout rate of the BERT encoder output. - - `"name"`: str - Name of the classifier. - """ - - hparams = BERTEncoder.default_hparams() - hparams.update({ - "num_classes": 2, - "logit_layer_kwargs": None, - "clas_strategy": "cls_time", - "max_seq_length": None, - "dropout": 0.1, - "name": "bert_classifier" - }) - return hparams - - def _build(self, - inputs, - sequence_length=None, - segment_ids=None, - mode=None, - **kwargs): - r"""Feeds the inputs through the network and makes classification. - - The arguments are the same as in - :class:`~texar.tf.modules.BertEncoder`. - - Args: - inputs: A 2D Tensor of shape `[batch_size, max_time]`, - containing the token ids of tokens in input sequences. - sequence_length (optional): A 1D Tensor of shape `[batch_size]`. - Input tokens beyond respective sequence lengths are masked - out automatically. - segment_ids (optional): A 2D Tensor of shape - `[batch_size, max_time]`, containing the segment ids - of tokens in input sequences. If `None` (default), a tensor - with all elements set to zero is used. - mode (optional): A tensor taking value in - :tf_main:`tf.estimator.ModeKeys `, - including `TRAIN`, `EVAL`, and `PREDICT`. Used to toggle - dropout. - If `None` (default), :func:`texar.tf.global_mode` is used. - **kwargs: Keyword arguments. - - Returns: - A tuple `(logits, pred)`, containing the logits over classes and - the predictions, respectively. - - - If "clas_strategy"=="cls_time" or "all_time" - - - If "num_classes"==1, `logits` and `pred` are of both \ - shape `[batch_size]` - - If "num_classes">1, `logits` is of shape \ - `[batch_size, num_classes]` and `pred` is of shape \ - `[batch_size]`. - - - If "clas_strategy"=="time_wise", - - - If "num_classes"==1, `logits` and `pred` are of both \ - shape `[batch_size, max_time]` - - If "num_classes">1, `logits` is of shape \ - `[batch_size, max_time, num_classes]` and `pred` is of shape \ - `[batch_size, max_time]`. 
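A concrete instance of the shape rules above, mirroring the unit tests that are also removed in this diff:

.. code-block:: python

    clas = BERTClassifier(hparams={
        "pretrained_model_name": None,   # random init; shape checking only
        "num_classes": 3,
        "clas_strategy": "time_wise"})
    logits, pred = clas(tf.ones([4, 8], tf.int32))
    # time_wise:  logits [4, 8, 3], pred [4, 8]
    # cls_time:   logits [4, 3],    pred [4]
    # all_time:   logits [4, 3],    pred [4]  (requires max_seq_length)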
- """ - - enc_outputs, pooled_output = self._encoder(inputs, sequence_length, - segment_ids, mode) - - # Compute logits - stra = self._hparams.clas_strategy - if stra == 'time_wise': - logits = enc_outputs - elif stra == 'cls_time': - logits = pooled_output - elif stra == 'all_time': - # Pad `enc_outputs` to have max_seq_length before flatten - length_diff = self._hparams.max_seq_length - tf.shape(inputs)[1] - length_diff = tf.reshape(length_diff, [1, 1]) - # Set `paddings = [[0, 0], [0, length_dif], [0, 0]]` - paddings = tf.pad(length_diff, paddings=[[1, 1], [1, 0]]) - logit_input = tf.pad(enc_outputs, paddings=paddings) - logit_input_dim = self._hparams.hidden_size * \ - self._hparams.max_seq_length - logits = tf.reshape(logit_input, [-1, logit_input_dim]) - else: - raise ValueError('Unknown classification strategy: {}'.format(stra)) - - if self._logit_layer is not None: - logits = self._dropout_layer(logits, training=mode) - logits = self._logit_layer(logits) - - # Compute predications - num_classes = self._hparams.num_classes - is_binary = num_classes == 1 - is_binary = is_binary or (num_classes <= 0 and logits.shape[-1] == 1) - - if stra == 'time_wise': - if is_binary: - pred = tf.squeeze(tf.greater(logits, 0), -1) - logits = tf.squeeze(logits, -1) - else: - pred = tf.argmax(logits, axis=-1) - else: - if is_binary: - pred = tf.greater(logits, 0) - logits = tf.reshape(logits, [-1]) - else: - pred = tf.argmax(logits, axis=-1) - pred = tf.reshape(pred, [-1]) - pred = tf.cast(pred, tf.int64) - - if not self._built: - self._add_internal_trainable_variables() - if self._logit_layer: - self._add_trainable_variable( - self._logit_layer.trainable_variables) - self._built = True - - return logits, pred diff --git a/texar/tf/modules/classifiers/bert_classifier_test.py b/texar/tf/modules/classifiers/bert_classifier_test.py deleted file mode 100644 index 38db5c3f..00000000 --- a/texar/tf/modules/classifiers/bert_classifier_test.py +++ /dev/null @@ -1,206 +0,0 @@ -""" -Unit tests for BERT classifiers. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import numpy as np -import tensorflow as tf - -from texar.tf.modules.classifiers.bert_classifier import BERTClassifier -from texar.tf.utils.test import pretrained_test - -# pylint: disable=too-many-locals, no-member - - -class BERTClassifierTest(tf.test.TestCase): - """Tests :class:`~texar.tf.modules.BERTClassifier` class. - """ - - @pretrained_test - def test_model_loading(self): - r"""Tests model loading functionality.""" - - inputs = tf.placeholder(dtype=tf.int32, shape=[None, None]) - - for pretrained_model_name in BERTClassifier.available_checkpoints(): - classifier = BERTClassifier( - pretrained_model_name=pretrained_model_name) - _, _ = classifier(inputs) - - def test_trainable_variables(self): - """Tests the functionality of automatically collecting trainable - variables. 
- """ - inputs = tf.placeholder(dtype=tf.int32, shape=[None, None]) - - # case 1 - hparams = { - "pretrained_model_name": None, - } - clas = BERTClassifier(hparams=hparams) - _, _ = clas(inputs) - self.assertEqual(len(clas.trainable_variables), 199 + 2) - - # case 2 - hparams = { - "pretrained_model_name": None, - "clas_strategy": "all_time", - "max_seq_length": 8, - } - clas = BERTClassifier(hparams=hparams) - _, _ = clas(inputs) - self.assertEqual(len(clas.trainable_variables), 199 + 2) - - # case 2 - hparams = { - "pretrained_model_name": None, - "clas_strategy": "time_wise", - } - clas = BERTClassifier(hparams=hparams) - _, _ = clas(inputs) - self.assertEqual(len(clas.trainable_variables), 199 + 2) - - def test_encode(self): - """Tests encoding. - """ - max_time = 8 - batch_size = 16 - inputs = tf.random_uniform([batch_size, max_time], - maxval=30521, dtype=tf.int32) - # case 1 - hparams = { - "pretrained_model_name": None, - } - clas = BERTClassifier(hparams=hparams) - logits, pred = clas(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run([logits, pred]) - self.assertEqual(logits_.shape, (batch_size, - clas.hparams.num_classes)) - self.assertEqual(pred_.shape, (batch_size, )) - - # case 2 - hparams = { - "pretrained_model_name": None, - "num_classes": 10, - "clas_strategy": "time_wise" - } - clas = BERTClassifier(hparams=hparams) - logits, pred = clas(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run([logits, pred]) - self.assertEqual(logits_.shape, - (batch_size, max_time, clas.hparams.num_classes)) - self.assertEqual(pred_.shape, (batch_size, max_time)) - - # case 3 - hparams = { - "pretrained_model_name": None, - "num_classes": 0, - "clas_strategy": "time_wise" - } - clas = BERTClassifier(hparams=hparams) - logits, pred = clas(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run([logits, pred]) - self.assertEqual(logits_.shape, - (batch_size, max_time, clas.hparams.encoder.dim)) - self.assertEqual(pred_.shape, (batch_size, max_time)) - - # case 4 - hparams = { - "pretrained_model_name": None, - "num_classes": 10, - "clas_strategy": "all_time", - "max_seq_length": max_time - } - inputs = tf.placeholder(tf.int32, shape=[batch_size, 6]) - clas = BERTClassifier(hparams=hparams) - logits, pred = clas(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run( - [logits, pred], - feed_dict={inputs: np.random.randint(30521, - size=(batch_size, 6))}) - self.assertEqual(logits_.shape, (batch_size, - clas.hparams.num_classes)) - self.assertEqual(pred_.shape, (batch_size, )) - - def test_binary(self): - """Tests binary classification. 
- """ - max_time = 8 - batch_size = 16 - inputs = tf.random_uniform([batch_size, max_time], - maxval=30521, dtype=tf.int32) - - # case 2 - hparams = { - "pretrained_model_name": None, - "num_classes": 1, - "clas_strategy": "time_wise" - } - clas = BERTClassifier(hparams=hparams) - logits, pred = clas(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run([logits, pred]) - self.assertEqual(logits_.shape, (batch_size, max_time)) - self.assertEqual(pred_.shape, (batch_size, max_time)) - - # case 3 - hparams = { - "pretrained_model_name": None, - "num_classes": 1, - "clas_strategy": "cls_time", - "max_seq_length": max_time - } - inputs = tf.placeholder(tf.int32, shape=[batch_size, 6]) - clas = BERTClassifier(hparams=hparams) - logits, pred = clas(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run( - [logits, pred], - feed_dict={inputs: np.random.randint(30521, - size=(batch_size, 6))}) - self.assertEqual(logits_.shape, (batch_size, )) - self.assertEqual(pred_.shape, (batch_size, )) - - # case 4 - hparams = { - "pretrained_model_name": None, - "num_classes": 1, - "clas_strategy": "all_time", - "max_seq_length": max_time - } - inputs = tf.placeholder(tf.int32, shape=[batch_size, 6]) - clas = BERTClassifier(hparams=hparams) - logits, pred = clas(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run( - [logits, pred], - feed_dict={inputs: np.random.randint(30521, - size=(batch_size, 6))}) - self.assertEqual(logits_.shape, (batch_size, )) - self.assertEqual(pred_.shape, (batch_size, )) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/modules/classifiers/classifier_base.py b/texar/tf/modules/classifiers/classifier_base.py deleted file mode 100644 index be67d257..00000000 --- a/texar/tf/modules/classifiers/classifier_base.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Base class for encoders. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from texar.tf.module_base import ModuleBase - -__all__ = [ - "ClassifierBase" -] - - -class ClassifierBase(ModuleBase): - """Base class inherited by all classifier classes. - """ - - def __init__(self, hparams=None): - ModuleBase.__init__(self, hparams) - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - """ - return { - "name": "classifier" - } - - def _build(self, inputs, *args, **kwargs): - """Classifies the inputs. - - Args: - inputs: Inputs to the classifier. - *args: Other arguments. - **kwargs: Keyword arguments. - - Returns: - Classification results. 
- """ - raise NotImplementedError diff --git a/texar/tf/modules/classifiers/conv_classifiers.py b/texar/tf/modules/classifiers/conv_classifiers.py deleted file mode 100644 index c805f0ae..00000000 --- a/texar/tf/modules/classifiers/conv_classifiers.py +++ /dev/null @@ -1,271 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Various classifier classes. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=not-context-manager, too-many-arguments, too-many-locals - -import tensorflow as tf - -from texar.tf.utils.exceptions import TexarError -from texar.tf.modules.classifiers.classifier_base import ClassifierBase -from texar.tf.modules.encoders.conv_encoders import Conv1DEncoder -from texar.tf.utils import utils -from texar.tf.hyperparams import HParams - -__all__ = [ - "Conv1DClassifier" -] - - -class Conv1DClassifier(ClassifierBase): - """Simple Conv-1D classifier. - This is a combination of the - :class:`~texar.tf.modules.Conv1DEncoder` with a classification layer. - - Args: - hparams (dict, optional): Hyperparameters. Missing - hyperparamerter will be set to default values. See - :meth:`default_hparams` for the hyperparameter sturcture and - default values. - - Example: - - .. code-block:: python - - clas = Conv1DClassifier(hparams={'num_classes': 10}) - - inputs = tf.random_uniform([64, 20, 256]) - logits, pred = clas(inputs) - # logits == Tensor of shape [64, 10] - # pred == Tensor of shape [64] - - .. document private functions - .. automethod:: _build - """ - - def __init__(self, hparams=None): - ClassifierBase.__init__(self, hparams) - - with tf.variable_scope(self.variable_scope): - encoder_hparams = utils.dict_fetch( - hparams, Conv1DEncoder.default_hparams()) - self._encoder = Conv1DEncoder(hparams=encoder_hparams) - - # Add an additional dense layer if needed - self._num_classes = self._hparams.num_classes - if self._num_classes > 0: - if self._hparams.num_dense_layers <= 0: - self._encoder.append_layer({"type": "Flatten"}) - - logit_kwargs = self._hparams.logit_layer_kwargs - if logit_kwargs is None: - logit_kwargs = {} - elif not isinstance(logit_kwargs, HParams): - raise ValueError( - "hparams['logit_layer_kwargs'] must be a dict.") - else: - logit_kwargs = logit_kwargs.todict() - logit_kwargs.update({"units": self._num_classes}) - if 'name' not in logit_kwargs: - logit_kwargs['name'] = "logit_layer" - - self._encoder.append_layer( - {"type": "Dense", "kwargs": logit_kwargs}) - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - .. code-block:: python - - { - # (1) Same hyperparameters as in Conv1DEncoder - ... - - # (2) Additional hyperparameters - "num_classes": 2, - "logit_layer_kwargs": { - "use_bias": False - }, - "name": "conv1d_classifier" - } - - Here: - - 1. Same hyperparameters as in :class:`~texar.tf.modules.Conv1DEncoder`. 
- See the :meth:`~texar.tf.modules.Conv1DEncoder.default_hparams`. - An instance of Conv1DEncoder is created for feature extraction. - - 2. Additional hyperparameters: - - "num_classes": int - Number of classes: - - - If **`> 0`**, an additional :tf_main:`Dense ` \ - layer is appended to the encoder to compute the logits over \ - classes. - - If **`<= 0`**, no dense layer is appended. The number of \ - classes is assumed to be the final dense layer size of the \ - encoder. - - "logit_layer_kwargs": dict - Keyword arguments for the logit Dense layer constructor, - except for argument "units" which is set to "num_classes". - Ignored if no extra logit layer is appended. - - "name": str - Name of the classifier. - """ - hparams = Conv1DEncoder.default_hparams() - hparams.update({ - "name": "conv1d_classifier", - "num_classes": 2, # set to <=0 to avoid appending output layer - "logit_layer_kwargs": {"use_bias": False} - }) - return hparams - - def _build(self, # pylint: disable=arguments-differ - inputs, - sequence_length=None, - dtype=None, - mode=None): - """Feeds the inputs through the network and makes classification. - - The arguments are the same as in :class:`~texar.tf.modules.Conv1DEncoder`. - - The predictions of binary classification ("num_classes"=1) and - multi-way classification ("num_classes">1) are different, as explained - below. - - Args: - inputs: The inputs to the network, which is a 3D tensor. See - :class:`~texar.tf.modules.Conv1DEncoder` for more details. - sequence_length (optional): An int tensor of shape `[batch_size]` - containing the length of each element in :attr:`inputs`. - If given, time steps beyond the length will first be masked out - before feeding to the layers. - dtype (optional): Type of the inputs. If not provided, infers - from inputs automatically. - mode (optional): A tensor taking value in - :tf_main:`tf.estimator.ModeKeys `, including - `TRAIN`, `EVAL`, and `PREDICT`. If `None`, - :func:`texar.tf.global_mode` is used. - - Returns: - A tuple `(logits, pred)`, where - - - **`logits`** is a Tensor of shape `[batch_size, num_classes]`\ - for `num_classes` >1, and `[batch_size]` for `num_classes` =1 \ - (i.e., binary classification). - - **`pred`** is the prediction, a Tensor of shape `[batch_size]` \ - and type `tf.int64`. For binary classification, the standard \ - sigmoid function is used for prediction, and the class labels are \ - `{0, 1}`. - """ - logits = self._encoder(inputs, sequence_length, dtype, mode) - - num_classes = self._hparams.num_classes - is_binary = num_classes == 1 - is_binary = is_binary or (num_classes <= 0 and logits.shape[1] == 1) - - if is_binary: - pred = tf.greater(logits, 0) - logits = tf.reshape(logits, [-1]) - else: - pred = tf.argmax(logits, 1) - pred = tf.cast(tf.reshape(pred, [-1]), tf.int64) - - self._built = True - - return logits, pred - - @property - def trainable_variables(self): - """The list of trainable variables of the module. - """ - if not self._built: - raise TexarError( - "Attempting to access trainable_variables before module %s " - "was fully built. The module is built once it is called, " - "e.g., with `%s(...)`" % (self.name, self.name)) - return self._encoder.trainable_variables - - @property - def num_classes(self): - """The number of classes. - """ - return self._num_classes - - @property - def nn(self): # pylint: disable=invalid-name - """The classifier neural network. - """ - return self._encoder - - def has_layer(self, layer_name): - """Returns `True` if the network with the name exists. 
Returns `False` - otherwise. - - Args: - layer_name (str): Name of the layer. - """ - return self._encoder.has_layer(layer_name) - - def layer_by_name(self, layer_name): - """Returns the layer with the name. Returns 'None' if the layer name - does not exist. - - Args: - layer_name (str): Name of the layer. - """ - return self._encoder.layer_by_name(layer_name) - - @property - def layers_by_name(self): - """A dictionary mapping layer names to the layers. - """ - return self._encoder.layers_by_name - - @property - def layers(self): - """A list of the layers. - """ - return self._encoder.layers - - @property - def layer_names(self): - """A list of uniquified layer names. - """ - return self._encoder.layer_names - - def layer_outputs_by_name(self, layer_name): - """Returns the output tensors of the layer with the specified name. - Returns `None` if the layer name does not exist. - - Args: - layer_name (str): Name of the layer. - """ - return self._encoder.layer_outputs_by_name(layer_name) - - @property - def layer_outputs(self): - """A list containing output tensors of each layer. - """ - return self._encoder.layer_outputs diff --git a/texar/tf/modules/classifiers/conv_classifiers_test.py b/texar/tf/modules/classifiers/conv_classifiers_test.py deleted file mode 100644 index c2ed92b0..00000000 --- a/texar/tf/modules/classifiers/conv_classifiers_test.py +++ /dev/null @@ -1,52 +0,0 @@ -# -""" -Unit tests for conv encoders. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import tensorflow as tf - -import texar.tf as tx -from texar.tf.modules.classifiers.conv_classifiers import Conv1DClassifier - - -class Conv1DClassifierTest(tf.test.TestCase): - """Tests :class:`~texar.tf.modules.Conv1DClassifier` class. - """ - - def test_classifier(self): - """Tests classification. - """ - # case 1: default hparams - classifier = Conv1DClassifier() - self.assertEqual(len(classifier.layers), 5) - self.assertTrue(isinstance(classifier.layers[-1], - tf.layers.Dense)) - inputs = tf.ones([64, 16, 300], tf.float32) - logits, pred = classifier(inputs) - self.assertEqual(logits.shape, [64, 2]) - self.assertEqual(pred.shape, [64]) - - inputs = tf.placeholder(tf.float32, [64, None, 300]) - logits, pred = classifier(inputs) - self.assertEqual(logits.shape, [64, 2]) - self.assertEqual(pred.shape, [64]) - - # case 1 - hparams = { - "num_classes": 10, - "logit_layer_kwargs": {"use_bias": False} - } - classifier = Conv1DClassifier(hparams=hparams) - inputs = tf.ones([64, 16, 300], tf.float32) - logits, pred = classifier(inputs) - self.assertEqual(logits.shape, [64, 10]) - self.assertEqual(pred.shape, [64]) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/modules/classifiers/gpt2_classifier.py b/texar/tf/modules/classifiers/gpt2_classifier.py deleted file mode 100644 index ad955954..00000000 --- a/texar/tf/modules/classifiers/gpt2_classifier.py +++ /dev/null @@ -1,283 +0,0 @@ -# Copyright 2019 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -""" -GPT2 classifiers. -""" - -import tensorflow as tf - -from texar.tf.core.layers import get_layer -from texar.tf.modules.classifiers.classifier_base import ClassifierBase -from texar.tf.modules.encoders.gpt2_encoder import GPT2Encoder -from texar.tf.hyperparams import HParams -from texar.tf.modules.pretrained.gpt2 import PretrainedGPT2Mixin -from texar.tf.utils.utils import dict_fetch - - -__all__ = [ - "GPT2Classifier", -] - - -class GPT2Classifier(ClassifierBase, PretrainedGPT2Mixin): - r"""Classifier based on GPT2 modules. Please see - :class:`~texar.tf.modules.PretrainedGPT2Mixin` for a brief description - of GPT2. - - This is a combination of the - :class:`~texar.tf.modules.GPT2Encoder` with a classification - layer. Both step-wise classification and sequence-level classification - are supported, specified in :attr:`hparams`. - - Arguments are the same as in - :class:`~texar.tf.modules.GPT2Encoder`. - - Args: - pretrained_model_name (optional): a `str`, the name - of pre-trained model (e.g., ``gpt2-small``). Please refer to - :class:`~texar.tf.modules.PretrainedGPT2Mixin` for - all supported models. - If `None`, the model name in :attr:`hparams` is used. - cache_dir (optional): the path to a folder in which the - pre-trained models will be cached. If `None` (default), - a default directory (``texar_data`` folder under user's home - directory) will be used. - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparameter will be set to default values. See - :meth:`default_hparams` for the hyperparameter structure - and default values. - - .. document private functions - .. automethod:: _build - """ - - def __init__(self, - pretrained_model_name=None, - cache_dir=None, - hparams=None): - - super().__init__(hparams=hparams) - - with tf.variable_scope(self.variable_scope): - encoder_hparams = dict_fetch( - hparams, GPT2Encoder.default_hparams()) - if encoder_hparams is not None: - encoder_hparams['name'] = None - - self._encoder = GPT2Encoder( - pretrained_model_name=pretrained_model_name, - cache_dir=cache_dir, - hparams=encoder_hparams) - - # Creates an dropout layer - drop_kwargs = {"rate": self._hparams.dropout} - layer_hparams = {"type": "Dropout", "kwargs": drop_kwargs} - self._dropout_layer = get_layer(hparams=layer_hparams) - - # Creates an additional classification layer if needed - self._num_classes = self._hparams.num_classes - if self._num_classes <= 0: - self._logit_layer = None - else: - logit_kwargs = self._hparams.logit_layer_kwargs - if logit_kwargs is None: - logit_kwargs = {} - elif not isinstance(logit_kwargs, HParams): - raise ValueError( - "hparams['logit_layer_kwargs'] must be a dict.") - else: - logit_kwargs = logit_kwargs.todict() - logit_kwargs.update({"units": self._num_classes}) - if 'name' not in logit_kwargs: - logit_kwargs['name'] = "logit_layer" - - layer_hparams = {"type": "Dense", "kwargs": logit_kwargs} - self._logit_layer = get_layer(hparams=layer_hparams) - - @staticmethod - def default_hparams(): - r"""Returns a dictionary of hyperparameters with default values. - - .. code-block:: python - - { - # (1) Same hyperparameters as in GPT2Encoder - ... - # (2) Additional hyperparameters - "num_classes": 2, - "logit_layer_kwargs": None, - "clas_strategy": `cls_time`, - "max_seq_length": None, - "dropout": 0.1, - "name": `gpt2_classifier` - } - - Here: - - 1. Same hyperparameters as in - :class:`~texar.tf.modules.GPT2Encoder`. 
- See the :meth:`~texar.tf.modules.GPT2Encoder.default_hparams`. - An instance of GPT2Encoder is created for feature extraction. - - 2. Additional hyperparameters: - - `"num_classes"`: int - Number of classes: - - - If **> 0**, an additional :tf_main:`Dense ` - layer is appended to the encoder to compute the logits over - classes. - - If **<= 0**, no dense layer is appended. The number of - classes is assumed to be the final dense layer size of the - encoder. - - `"logit_layer_kwargs"`: dict - Keyword arguments for the logit Dense layer constructor, - except for argument "units" which is set to `num_classes`. - Ignored if no extra logit layer is appended. - - `"clas_strategy"`: str - The classification strategy, one of: - - - **cls_time**: Sequence-level classification based on the - output of the first time step (which is the `CLS` token). - Each sequence has a class. - - **all_time**: Sequence-level classification based on - the output of all time steps. Each sequence has a class. - - **time_wise**: Step-wise classification, i.e., make - classification for each time step based on its output. - - `"max_seq_length"`: int, optional - Maximum possible length of input sequences. Required if - `clas_strategy` is `all_time`. - - `"dropout"`: float - The dropout rate of the BERT encoder output. - - `"name"`: str - Name of the classifier. - """ - hparams = GPT2Encoder.default_hparams() - hparams.update({ - "num_classes": 2, - "logit_layer_kwargs": None, - "clas_strategy": "cls_time", - "max_seq_length": None, - "dropout": 0.1, - "name": "gpt2_classifier" - }) - return hparams - - def _build(self, - inputs, - sequence_length=None, - mode=None, - **kwargs): - r"""Feeds the inputs through the network and makes classification. - - The arguments are the same as in - :class:`~texar.tf.modules.GPT2Encoder`. - - Args: - inputs: A 2D Tensor of shape `[batch_size, max_time]`, - containing the token ids of tokens in input sequences. - sequence_length (optional): A 1D Tensor of shape `[batch_size]`. - Input tokens beyond respective sequence lengths are masked - out automatically. - mode (optional): A tensor taking value in - :tf_main:`tf.estimator.ModeKeys `, - including `TRAIN`, `EVAL`, and `PREDICT`. Used to toggle - dropout. - If `None` (default), :func:`texar.tf.global_mode` is used. - **kwargs: Keyword arguments. - - Returns: - A tuple `(logits, pred)`, containing the logits over classes and - the predictions, respectively. - - - If "clas_strategy"=="cls_time" or "all_time" - - - If "num_classes"==1, `logits` and `pred` are of both \ - shape `[batch_size]` - - If "num_classes">1, `logits` is of shape \ - `[batch_size, num_classes]` and `pred` is of shape \ - `[batch_size]`. - - - If "clas_strategy"=="time_wise", - - - If "num_classes"==1, `logits` and `pred` are of both \ - shape `[batch_size, max_time]` - - If "num_classes">1, `logits` is of shape \ - `[batch_size, max_time, num_classes]` and `pred` is of shape \ - `[batch_size, max_time]`. 
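One detail in the `cls_time` branch of the body below: it gathers the output at the last valid step by enumerating `sequence_length` in Python, which only works when the lengths are iterable at graph-construction time. A vectorized sketch for a tensor-valued `sequence_length` (not code from this repository) would be:

.. code-block:: python

    import tensorflow as tf

    def gather_last_step(enc_outputs, sequence_length):
        """Selects enc_outputs[i, sequence_length[i] - 1, :] per batch row."""
        sequence_length = tf.cast(sequence_length, tf.int32)
        batch_size = tf.shape(enc_outputs)[0]
        indices = tf.stack(
            [tf.range(batch_size), sequence_length - 1], axis=1)  # [batch, 2]
        return tf.gather_nd(enc_outputs, indices)                 # [batch, dim]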
- """ - enc_outputs = self._encoder(inputs, sequence_length, mode) - - # Compute logits - strategy = self._hparams.clas_strategy - if strategy == 'time_wise': - logits = enc_outputs - elif strategy == "cls_time": - if sequence_length is None: - logits = enc_outputs[:, -1, :] - else: - logits = tf.stack([enc_outputs[batch_idx, time_idx - 1, :] - for batch_idx, time_idx in - enumerate(sequence_length)], axis=0) - elif strategy == "all_time": - # Pad `enc_outputs` to have max_seq_length before flatten - length_diff = self._hparams.max_seq_length - tf.shape(inputs)[1] - length_diff = tf.reshape(length_diff, [1, 1]) - # Set `paddings = [[0, 0], [0, length_dif], [0, 0]]` - paddings = tf.pad(length_diff, paddings=[[1, 1], [1, 0]]) - logit_input = tf.pad(enc_outputs, paddings=paddings) - logit_input_dim = (self._hparams.encoder.dim * - self._hparams.max_seq_length) - logits = tf.reshape(logit_input, [-1, logit_input_dim]) - else: - raise ValueError('Unknown classification strategy: {}'.format( - strategy)) - - if self._logit_layer is not None: - logits = self._dropout_layer(logits, training=mode) - logits = self._logit_layer(logits) - - # Compute predications - num_classes = self._hparams.num_classes - is_binary = num_classes == 1 - is_binary = is_binary or (num_classes <= 0 and logits.shape[-1] == 1) - - if strategy == 'time_wise': - if is_binary: - pred = tf.squeeze(tf.greater(logits, 0), -1) - logits = tf.squeeze(logits, -1) - else: - pred = tf.argmax(logits, axis=-1) - else: - if is_binary: - pred = tf.greater(logits, 0) - logits = tf.reshape(logits, [-1]) - else: - pred = tf.argmax(logits, axis=-1) - pred = tf.reshape(pred, [-1]) - pred = tf.cast(pred, tf.int64) - - if not self._built: - self._add_internal_trainable_variables() - if self._logit_layer: - self._add_trainable_variable( - self._logit_layer.trainable_variables) - self._built = True - - return logits, pred diff --git a/texar/tf/modules/classifiers/gpt2_classifier_test.py b/texar/tf/modules/classifiers/gpt2_classifier_test.py deleted file mode 100644 index 0c983917..00000000 --- a/texar/tf/modules/classifiers/gpt2_classifier_test.py +++ /dev/null @@ -1,184 +0,0 @@ -""" -Unit tests for BERT classifiers. -""" - -import numpy as np -import tensorflow as tf - -from texar.tf.modules.classifiers.gpt2_classifier import GPT2Classifier -from texar.tf.utils.test import pretrained_test - - -class GPT2ClassifierTest(tf.test.TestCase): - """Tests :class:`~texar.tf.modules.GPT2Classifier` class. - """ - - @pretrained_test - def test_model_loading(self): - r"""Tests model loading functionality.""" - - inputs = tf.placeholder(dtype=tf.int32, shape=[None, None]) - - for pretrained_model_name in GPT2Classifier.available_checkpoints(): - classifier = GPT2Classifier( - pretrained_model_name=pretrained_model_name) - _, _ = classifier(inputs) - - def test_trainable_variables(self): - """Tests the functionality of automatically collecting trainable - variables. 
- """ - inputs = tf.placeholder(dtype=tf.int32, shape=[None, None]) - - # case 1 - hparams = { - "pretrained_model_name": None, - } - clas = GPT2Classifier(hparams=hparams) - _, _ = clas(inputs) - self.assertEqual(len(clas.trainable_variables), 198) - - # case 2 - hparams = { - "pretrained_model_name": None, - "clas_strategy": "all_time", - "max_seq_length": 8, - } - clas = GPT2Classifier(hparams=hparams) - _, _ = clas(inputs) - self.assertEqual(len(clas.trainable_variables), 198) - - # case 3 - hparams = { - "pretrained_model_name": None, - "clas_strategy": "time_wise", - } - clas = GPT2Classifier(hparams=hparams) - _, _ = clas(inputs) - self.assertEqual(len(clas.trainable_variables), 198) - - def test_classification(self): - r"""Tests classificaiton. - """ - max_time = 8 - batch_size = 16 - inputs = tf.random_uniform([batch_size, max_time], - maxval=30521, dtype=tf.int32) - - # case 1 - hparams = { - "pretrained_model_name": None, - } - classifier = GPT2Classifier(hparams=hparams) - logits, preds = classifier(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run([logits, preds]) - self.assertEqual(logits_.shape, (batch_size, 2)) - self.assertEqual(pred_.shape, (batch_size,)) - - # case 2 - hparams = { - "pretrained_model_name": None, - "num_classes": 10, - "clas_strategy": "time_wise", - } - classifier = GPT2Classifier(hparams=hparams) - logits, preds = classifier(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run([logits, preds]) - self.assertEqual(logits_.shape, (batch_size, max_time, 10)) - self.assertEqual(pred_.shape, (batch_size, max_time)) - - # case 3 - hparams = { - "pretrained_model_name": None, - "num_classes": 0, - "clas_strategy": "time_wise", - } - classifier = GPT2Classifier(hparams=hparams) - logits, preds = classifier(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run([logits, preds]) - self.assertEqual(logits_.shape, (batch_size, max_time, 768)) - self.assertEqual(pred_.shape, (batch_size, max_time)) - - # case 4 - hparams = { - "pretrained_model_name": None, - "num_classes": 10, - "clas_strategy": "all_time", - "max_seq_length": max_time, - } - classifier = GPT2Classifier(hparams=hparams) - logits, preds = classifier(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run([logits, preds]) - self.assertEqual(logits_.shape, (batch_size, 10)) - self.assertEqual(pred_.shape, (batch_size,)) - - def test_binary(self): - r"""Tests binary classification. 
- """ - max_time = 8 - batch_size = 16 - inputs = tf.random_uniform([batch_size, max_time], - maxval=30521, dtype=tf.int32) - - # case 1 - hparams = { - "pretrained_model_name": None, - "num_classes": 1, - "clas_strategy": "time_wise", - } - classifier = GPT2Classifier(hparams=hparams) - logits, preds = classifier(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run([logits, preds]) - self.assertEqual(logits_.shape, (batch_size, max_time)) - self.assertEqual(pred_.shape, (batch_size, max_time)) - - # case 2 - hparams = { - "pretrained_model_name": None, - "num_classes": 1, - "clas_strategy": "cls_time", - "max_seq_length": max_time, - } - classifier = GPT2Classifier(hparams=hparams) - logits, preds = classifier(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run([logits, preds]) - self.assertEqual(logits_.shape, (batch_size,)) - self.assertEqual(pred_.shape, (batch_size,)) - - # case 3 - hparams = { - "pretrained_model_name": None, - "num_classes": 1, - "clas_strategy": "all_time", - "max_seq_length": max_time, - } - classifier = GPT2Classifier(hparams=hparams) - logits, preds = classifier(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run([logits, preds]) - self.assertEqual(logits_.shape, (batch_size,)) - self.assertEqual(pred_.shape, (batch_size,)) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/modules/classifiers/rnn_classifiers.py b/texar/tf/modules/classifiers/rnn_classifiers.py deleted file mode 100644 index 15fe0012..00000000 --- a/texar/tf/modules/classifiers/rnn_classifiers.py +++ /dev/null @@ -1,347 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Various RNN classifiers. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -import tensorflow as tf -from tensorflow.contrib.framework import nest - -from texar.tf.modules.classifiers.classifier_base import ClassifierBase -from texar.tf.modules.encoders.rnn_encoders import \ - UnidirectionalRNNEncoder, _forward_single_output_layer -from texar.tf.core import layers -from texar.tf.utils import utils, shapes -from texar.tf.hyperparams import HParams - -# pylint: disable=too-many-arguments, invalid-name, no-member, -# pylint: disable=too-many-branches, too-many-locals, too-many-statements - -__all__ = [ - "UnidirectionalRNNClassifier" -] - -# def RNNClassifierBase(ClassifierBase): -# """Base class inherited by all RNN classifiers. -# """ -# -# def __init__(self, hparams=None): -# ClassifierBase.__init__(self, hparams) - - -class UnidirectionalRNNClassifier(ClassifierBase): - """One directional RNN classifier. - This is a combination of the - :class:`~texar.tf.modules.UnidirectionalRNNEncoder` with a classification - layer. 
Both step-wise classification and sequence-level classification - are supported, specified in :attr:`hparams`. - - Arguments are the same as in - :class:`~texar.tf.modules.UnidirectionalRNNEncoder`. - - Args: - cell: (RNNCell, optional) If not specified, - a cell is created as specified in :attr:`hparams["rnn_cell"]`. - cell_dropout_mode (optional): A Tensor taking value of - :tf_main:`tf.estimator.ModeKeys `, which - toggles dropout in the RNN cell (e.g., activates dropout in - TRAIN mode). If `None`, :func:`~texar.tf.global_mode` is used. - Ignored if :attr:`cell` is given. - output_layer (optional): An instance of - :tf_main:`tf.layers.Layer `. Applies to the RNN cell - output of each step. If `None` (default), the output layer is - created as specified in :attr:`hparams["output_layer"]`. - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparamerter will be set to default values. See - :meth:`default_hparams` for the hyperparameter sturcture and - default values. - - .. document private functions - .. automethod:: _build - """ - - def __init__(self, - cell=None, - cell_dropout_mode=None, - output_layer=None, - hparams=None): - ClassifierBase.__init__(self, hparams) - - with tf.variable_scope(self.variable_scope): - # Creates the underlying encoder - encoder_hparams = utils.dict_fetch( - hparams, UnidirectionalRNNEncoder.default_hparams()) - if encoder_hparams is not None: - encoder_hparams['name'] = None - self._encoder = UnidirectionalRNNEncoder( - cell=cell, - cell_dropout_mode=cell_dropout_mode, - output_layer=output_layer, - hparams=encoder_hparams) - - # Creates an additional classification layer if needed - self._num_classes = self._hparams.num_classes - if self._num_classes <= 0: - self._logit_layer = None - else: - logit_kwargs = self._hparams.logit_layer_kwargs - if logit_kwargs is None: - logit_kwargs = {} - elif not isinstance(logit_kwargs, HParams): - raise ValueError( - "hparams['logit_layer_kwargs'] must be a dict.") - else: - logit_kwargs = logit_kwargs.todict() - logit_kwargs.update({"units": self._num_classes}) - if 'name' not in logit_kwargs: - logit_kwargs['name'] = "logit_layer" - - layer_hparams = {"type": "Dense", "kwargs": logit_kwargs} - self._logit_layer = layers.get_layer(hparams=layer_hparams) - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - .. code-block:: python - - { - # (1) Same hyperparameters as in UnidirectionalRNNEncoder - ... - - # (2) Additional hyperparameters - "num_classes": 2, - "logit_layer_kwargs": None, - "clas_strategy": "final_time", - "max_seq_length": None, - "name": "unidirectional_rnn_classifier" - } - - Here: - - 1. Same hyperparameters as in - :class:`~texar.tf.modules.UnidirectionalRNNEncoder`. - See the :meth:`~texar.tf.modules.UnidirectionalRNNEncoder.default_hparams`. - An instance of UnidirectionalRNNEncoder is created for feature - extraction. - - 2. Additional hyperparameters: - - "num_classes": int - Number of classes: - - - If **`> 0`**, an additional :tf_main:`Dense ` \ - layer is appended to the encoder to compute the logits over \ - classes. - - If **`<= 0`**, no dense layer is appended. The number of \ - classes is assumed to be the final dense layer size of the \ - encoder. - - "logit_layer_kwargs": dict - Keyword arguments for the logit Dense layer constructor, - except for argument "units" which is set to "num_classes". - Ignored if no extra logit layer is appended. 
- - "clas_strategy": str - The classification strategy, one of: - - - **"final_time"**: Sequence-leve classification based on \ - the output of the final time step. One sequence has one class. - - **"all_time"**: Sequence-level classification based on \ - the output of all time steps. One sequence has one class. - - **"time_wise"**: Step-wise classfication, i.e., make \ - classification for each time step based on its output. - - "max_seq_length": int, optional - Maximum possible length of input sequences. Required if - "clas_strategy" is "all_time". - - "name": str - Name of the classifier. - """ - hparams = UnidirectionalRNNEncoder.default_hparams() - hparams.update({ - "num_classes": 2, - "logit_layer_kwargs": None, - "clas_strategy": "final_time", - "max_seq_length": None, - "name": "unidirectional_rnn_classifier" - }) - return hparams - - def _build(self, - inputs, - sequence_length=None, - initial_state=None, - time_major=False, - mode=None, - **kwargs): - """Feeds the inputs through the network and makes classification. - - The arguments are the same as in - :class:`~texar.tf.modules.UnidirectionalRNNEncoder`. - - Args: - inputs: A 3D Tensor of shape `[batch_size, max_time, dim]`. - The first two dimensions - `batch_size` and `max_time` may be exchanged if - `time_major=True` is specified. - sequence_length (optional): A 1D int tensor of shape `[batch_size]`. - Sequence lengths - of the batch inputs. Used to copy-through state and zero-out - outputs when past a batch element's sequence length. - initial_state (optional): Initial state of the RNN. - time_major (bool): The shape format of the :attr:`inputs` and - :attr:`outputs` Tensors. If `True`, these tensors are of shape - `[max_time, batch_size, depth]`. If `False` (default), - these tensors are of shape `[batch_size, max_time, depth]`. - mode (optional): A tensor taking value in - :tf_main:`tf.estimator.ModeKeys `, including - `TRAIN`, `EVAL`, and `PREDICT`. Controls output layer dropout - if the output layer is specified with :attr:`hparams`. - If `None` (default), :func:`texar.tf.global_mode()` - is used. - return_cell_output (bool): Whether to return the output of the RNN - cell. This is the results prior to the output layer. - **kwargs: Optional keyword arguments of - :tf_main:`tf.nn.dynamic_rnn `, - such as `swap_memory`, `dtype`, `parallel_iterations`, etc. - - Returns: - A tuple `(logits, pred)`, containing the logits over classes and - the predictions, respectively. - - - If "clas_strategy"=="final_time" or "all_time" - - - If "num_classes"==1, `logits` and `pred` are of both \ - shape `[batch_size]` - - If "num_classes">1, `logits` is of shape \ - `[batch_size, num_classes]` and `pred` is of shape \ - `[batch_size]`. - - - If "clas_strategy"=="time_wise", - - - If "num_classes"==1, `logits` and `pred` are of both \ - shape `[batch_size, max_time]` - - If "num_classes">1, `logits` is of shape \ - `[batch_size, max_time, num_classes]` and `pred` is of shape \ - `[batch_size, max_time]`. - - If `time_major` is `True`, the batch and time dimensions are\ - exchanged. 
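-
-        Example:
-            A minimal usage sketch; the input shape and hyperparameter
-            values below are illustrative assumptions, not requirements:
-
-            .. code-block:: python
-
-                # Default "final_time" strategy: one class per sequence
-                clas = UnidirectionalRNNClassifier(
-                    hparams={"num_classes": 5})
-                inputs = tf.random_uniform([64, 10, 100])
-                logits, pred = clas(inputs)
-                # logits: Tensor of shape `[64, 5]`
-                # pred: Tensor of shape `[64]`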
- """ - enc_outputs, _, enc_output_size = self._encoder( - inputs=inputs, - sequence_length=sequence_length, - initial_state=initial_state, - time_major=time_major, - mode=mode, - return_output_size=True, - **kwargs) - - # Flatten enc_outputs - enc_outputs_flat = nest.flatten(enc_outputs) - enc_output_size_flat = nest.flatten(enc_output_size) - enc_output_dims_flat = [np.prod(xs) for xs in enc_output_size_flat] - enc_outputs_flat = [shapes.flatten(x, 2, xs) for x, xs - in zip(enc_outputs_flat, enc_output_dims_flat)] - if len(enc_outputs_flat) == 1: - enc_outputs_flat = enc_outputs_flat[0] - else: - enc_outputs_flat = tf.concat(enc_outputs_flat, axis=2) - - # Compute logits - stra = self._hparams.clas_strategy - if stra == 'time_wise': - logits = enc_outputs_flat - elif stra == 'final_time': - if time_major: - logits = enc_outputs_flat[-1, :, :] - else: - logits = enc_outputs_flat[:, -1, :] - elif stra == 'all_time': - if self._logit_layer is None: - raise ValueError( - 'logit layer must not be `None` if ' - 'clas_strategy="all_time". Specify the logit layer by ' - 'either passing the layer in the constructor or ' - 'specifying the hparams.') - if self._hparams.max_seq_length is None: - raise ValueError( - 'hparams.max_seq_length must not be `None` if ' - 'clas_strategy="all_time"') - else: - raise ValueError('Unknown classification strategy: {}'.format(stra)) - - if self._logit_layer is not None: - logit_input_dim = np.sum(enc_output_dims_flat) - if stra == 'time_wise': - logits, _ = _forward_single_output_layer( - logits, logit_input_dim, self._logit_layer) - elif stra == 'final_time': - logits = self._logit_layer(logits) - elif stra == 'all_time': - # Pad `enc_outputs_flat` to have max_seq_length before flatten - length_diff = self._hparams.max_seq_length - tf.shape(inputs)[1] - length_diff = tf.reshape(length_diff, [1, 1]) - # Set `paddings = [[0, 0], [0, length_dif], [0, 0]]` - paddings = tf.pad(length_diff, paddings=[[1, 1], [1, 0]]) - logit_input = tf.pad(enc_outputs_flat, paddings=paddings) - - logit_input_dim *= self._hparams.max_seq_length - logit_input = tf.reshape(logit_input, [-1, logit_input_dim]) - - logits = self._logit_layer(logit_input) - - # Compute predications - num_classes = self._hparams.num_classes - is_binary = num_classes == 1 - is_binary = is_binary or (num_classes <= 0 and logits.shape[-1] == 1) - - if stra == 'time_wise': - if is_binary: - pred = tf.squeeze(tf.greater(logits, 0), -1) - logits = tf.squeeze(logits, -1) - else: - pred = tf.argmax(logits, axis=-1) - else: - if is_binary: - pred = tf.greater(logits, 0) - logits = tf.reshape(logits, [-1]) - else: - pred = tf.argmax(logits, axis=-1) - pred = tf.reshape(pred, [-1]) - pred = tf.cast(pred, tf.int64) - - if not self._built: - self._add_internal_trainable_variables() - # Add trainable variables of `self._logit_layer` - # which may be constructed externally. - if self._logit_layer: - self._add_trainable_variable( - self._logit_layer.trainable_variables) - self._built = True - - return logits, pred - - @property - def num_classes(self): - """The number of classes, specified in :attr:`hparams`. - """ - return self._hparams.num_classes diff --git a/texar/tf/modules/classifiers/rnn_classifiers_test.py b/texar/tf/modules/classifiers/rnn_classifiers_test.py deleted file mode 100644 index 09ee94d2..00000000 --- a/texar/tf/modules/classifiers/rnn_classifiers_test.py +++ /dev/null @@ -1,181 +0,0 @@ -# -""" -Unit tests for RNN classifiers. 
-""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import numpy as np - -import tensorflow as tf - -from texar.tf.modules.classifiers.rnn_classifiers import \ - UnidirectionalRNNClassifier - -# pylint: disable=too-many-locals, no-member - - -class UnidirectionalRNNClassifierTest(tf.test.TestCase): - """Tests :class:`~texar.tf.modules.UnidirectionalRNNClassifierTest` class. - """ - - def test_trainable_variables(self): - """Tests the functionality of automatically collecting trainable - variables. - """ - inputs = tf.placeholder(dtype=tf.float32, shape=[None, None, 100]) - - # case 1 - clas = UnidirectionalRNNClassifier() - _, _ = clas(inputs) - self.assertEqual(len(clas.trainable_variables), 2 + 2) - - # case 2 - hparams = { - "output_layer": {"num_layers": 2}, - "logit_layer_kwargs": {"use_bias": False} - } - clas = UnidirectionalRNNClassifier(hparams=hparams) - _, _ = clas(inputs) - self.assertEqual(len(clas.trainable_variables), 2 + 2 + 2 + 1) - _, _ = clas(inputs) - self.assertEqual(len(clas.trainable_variables), 2 + 2 + 2 + 1) - - def test_encode(self): - """Tests encoding. - """ - max_time = 8 - batch_size = 16 - emb_dim = 100 - inputs = tf.random_uniform([batch_size, max_time, emb_dim], - maxval=1., dtype=tf.float32) - - # case 1 - clas = UnidirectionalRNNClassifier() - logits, pred = clas(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run([logits, pred]) - self.assertEqual(logits_.shape, (batch_size, clas.num_classes)) - self.assertEqual(pred_.shape, (batch_size, )) - - # case 2 - hparams = { - "num_classes": 10, - "clas_strategy": "time_wise" - } - clas = UnidirectionalRNNClassifier(hparams=hparams) - logits, pred = clas(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run([logits, pred]) - self.assertEqual(logits_.shape, - (batch_size, max_time, clas.num_classes)) - self.assertEqual(pred_.shape, (batch_size, max_time)) - - # case 3 - hparams = { - "output_layer": { - "num_layers": 1, - "layer_size": 10 - }, - "num_classes": 0, - "clas_strategy": "time_wise" - } - clas = UnidirectionalRNNClassifier(hparams=hparams) - logits, pred = clas(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run([logits, pred]) - self.assertEqual(logits_.shape, - (batch_size, max_time, 10)) - self.assertEqual(pred_.shape, (batch_size, max_time)) - - # case 4 - hparams = { - "num_classes": 10, - "clas_strategy": "all_time", - "max_seq_length": max_time - } - inputs = tf.placeholder(tf.float32, shape=[batch_size, 6, emb_dim]) - clas = UnidirectionalRNNClassifier(hparams=hparams) - logits, pred = clas(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run( - [logits, pred], - feed_dict={inputs: np.random.randn(batch_size, 6, emb_dim)}) - self.assertEqual(logits_.shape, (batch_size, clas.num_classes)) - self.assertEqual(pred_.shape, (batch_size, )) - - def test_binary(self): - """Tests binary classification. 
- """ - max_time = 8 - batch_size = 16 - emb_dim = 100 - inputs = tf.random_uniform([batch_size, max_time, emb_dim], - maxval=1., dtype=tf.float32) - - # case 1 omittd - - # case 2 - hparams = { - "num_classes": 1, - "clas_strategy": "time_wise" - } - clas = UnidirectionalRNNClassifier(hparams=hparams) - logits, pred = clas(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run([logits, pred]) - self.assertEqual(logits_.shape, (batch_size, max_time)) - self.assertEqual(pred_.shape, (batch_size, max_time)) - - # case 3 - hparams = { - "output_layer": { - "num_layers": 1, - "layer_size": 10 - }, - "num_classes": 1, - "clas_strategy": "time_wise" - } - clas = UnidirectionalRNNClassifier(hparams=hparams) - logits, pred = clas(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run([logits, pred]) - self.assertEqual(logits_.shape, (batch_size, max_time)) - self.assertEqual(pred_.shape, (batch_size, max_time)) - - # case 4 - hparams = { - "num_classes": 1, - "clas_strategy": "all_time", - "max_seq_length": max_time - } - inputs = tf.placeholder(tf.float32, shape=[batch_size, 6, emb_dim]) - clas = UnidirectionalRNNClassifier(hparams=hparams) - logits, pred = clas(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run( - [logits, pred], - feed_dict={inputs: np.random.randn(batch_size, 6, emb_dim)}) - self.assertEqual(logits_.shape, (batch_size, )) - self.assertEqual(pred_.shape, (batch_size, )) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/modules/classifiers/xlnet_classifier.py b/texar/tf/modules/classifiers/xlnet_classifier.py deleted file mode 100644 index a7741bc0..00000000 --- a/texar/tf/modules/classifiers/xlnet_classifier.py +++ /dev/null @@ -1,327 +0,0 @@ -# Copyright 2019 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -XLNet classifiers. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from texar.tf.utils.mode import is_train_mode -from texar.tf.core.layers import get_layer, get_initializer -from texar.tf.modules.classifiers.classifier_base import ClassifierBase -from texar.tf.modules.encoders.xlnet_encoder import XLNetEncoder -from texar.tf.hyperparams import HParams -from texar.tf.modules.pretrained.xlnet import PretrainedXLNetMixin -from texar.tf.utils.utils import dict_fetch - -# pylint: disable=too-many-arguments, invalid-name, no-member, -# pylint: disable=too-many-branches, too-many-locals, too-many-statements - -__all__ = [ - "XLNetClassifier" -] - - -class XLNetClassifier(ClassifierBase, PretrainedXLNetMixin): - """Classifier based on XLNet modules. Please see - :class:`~texar.tf.modules.PretrainedXLNetMixin` for a brief description - of XLNet. 
- - This is a combination of the :class:`~texar.tf.modules.XLNetEncoder` with a - classification layer. Both step-wise classification and sequence-level - classification are supported, specified in :attr:`hparams`. - - Arguments are the same as in :class:`~texar.tf.modules.XLNetEncoder`. - - Args: - pretrained_model_name (optional): a `str`, the name - of pre-trained model (e.g., ``xlnet-based-cased``). Please refer to - :class:`~texar.tf.modules.PretrainedXLNetMixin` for - all supported models. - If `None`, the model name in :attr:`hparams` is used. - cache_dir (optional): the path to a folder in which the - pre-trained models will be cached. If `None` (default), - a default directory (``texar_data`` folder under user's home - directory) will be used. - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparameters will be set to default values. See - :meth:`default_hparams` for the hyperparameter structure - and default values. - - .. document private functions - .. automethod:: _build - """ - - def __init__(self, - pretrained_model_name=None, - cache_dir=None, - hparams=None): - super(XLNetClassifier, self).__init__(hparams=hparams) - - with tf.variable_scope(self.variable_scope): - tf.get_variable_scope().set_initializer( - get_initializer(self._hparams.initializer)) - # Creates the underlying encoder - encoder_hparams = dict_fetch( - hparams, XLNetEncoder.default_hparams()) - if encoder_hparams is not None: - encoder_hparams['name'] = "encoder" - self._encoder = XLNetEncoder( - pretrained_model_name=pretrained_model_name, - cache_dir=cache_dir, - hparams=encoder_hparams) - if self._hparams.use_projection: - self.projection = get_layer(hparams={ - "type": "Dense", - "kwargs": { - "units": self._encoder.output_size - } - }) - - # Creates an dropout layer - drop_kwargs = {"rate": self._hparams.dropout} - layer_hparams = {"type": "Dropout", "kwargs": drop_kwargs} - self._dropout_layer = get_layer(hparams=layer_hparams) - - # Creates an additional classification layer if needed - self._num_classes = self._hparams.num_classes - if self._num_classes <= 0: - self._logit_layer = None - else: - logit_kwargs = self._hparams.logit_layer_kwargs - if logit_kwargs is None: - logit_kwargs = {} - elif not isinstance(logit_kwargs, HParams): - raise ValueError( - "hparams['logit_layer_kwargs'] must be a dict.") - else: - logit_kwargs = logit_kwargs.todict() - logit_kwargs.update({"units": self._num_classes}) - if 'name' not in logit_kwargs: - logit_kwargs['name'] = "logit_layer" - - layer_hparams = {"type": "Dense", "kwargs": logit_kwargs} - self._logit_layer = get_layer(hparams=layer_hparams) - - @staticmethod - def default_hparams(): - r"""Returns a dictionary of hyperparameters with default values. - - .. code-block:: python - - { - # (1) Same hyperparameters as in XLNetEncoder - ... - # (2) Additional hyperparameters - "clas_strategy": "cls_time", - "use_projection": True, - "num_classes": 2, - "logit_layer_kwargs": None, - "name": "xlnet_classifier", - } - - Here: - - 1. Same hyperparameters as in - :class:`~texar.tf.modules.XLNetEncoder`. - See the :meth:`~texar.tf.modules.XLNetEncoder.default_hparams`. - An instance of XLNetEncoder is created for feature extraction. - - 2. Additional hyperparameters: - - `"clas_strategy"`: str - The classification strategy, one of: - - - **cls_time**: Sequence-level classification based on the - output of the last time step (which is the `CLS` token). - Each sequence has a class. 
- - **all_time**: Sequence-level classification based on - the output of all time steps. Each sequence has a class. - - **time_wise**: Step-wise classification, i.e., make - classification for each time step based on its output. - - `"use_projection"`: bool - If `True`, an additional `Dense` layer is added after the - summary step. - - `"num_classes"`: int - Number of classes: - - - If **> 0**, an additional dense layer is appended to the - encoder to compute the logits over classes. - - If **<= 0**, no dense layer is appended. The number of - classes is assumed to be the final dense layer size of the - encoder. - - `"logit_layer_kwargs"` : dict - Keyword arguments for the logit Dense layer constructor, - except for argument "units" which is set to "num_classes". - Ignored if no extra logit layer is appended. - - `"name"`: str - Name of the classifier. - """ - hparams = XLNetEncoder.default_hparams() - hparams.update({ - "num_classes": 2, - "logit_layer_kwargs": None, - "clas_strategy": "cls_time", - "dropout": 0.1, - "use_projection": True, - "name": "xlnet_classifier" - }) - return hparams - - def param_groups(self, lr=None, lr_layer_scale=1.0, - decay_base_params=False): - r"""Create parameter groups for optimizers. When - :attr:`lr_layer_decay_rate` is not 1.0, parameters from each layer form - separate groups with different base learning rates. - - This method should be called before applying gradients to the variables - through the optimizer. Particularly, after calling the optimizer's - `compute_gradients` method, the user can call this method to get - variable-specific learning rates for the network. The gradients for each - variables can then be scaled accordingly. These scaled gradients are - finally applied by calling optimizer's `apply_gradients` method. - - Args: - lr (float): The learning rate. Can be omitted if - :attr:`lr_layer_decay_rate` is 1.0. - lr_layer_scale (float): Per-layer LR scaling rate. The `i`-th layer - will be scaled by `lr_layer_scale ^ (num_layers - i - 1)`. - decay_base_params (bool): If `True`, treat non-layer parameters - (e.g. embeddings) as if they're in layer 0. If `False`, these - parameters are not scaled. - - Returns: A dict mapping tensorflow variables to their learning rates. - """ - vars_to_learning_rates = {} - if lr_layer_scale != 1.0: - if lr is None: - raise ValueError( - "lr must be specified when lr_layer_decay_rate is not 1.0") - - scope = self.variable_scope.name - projection_vars = tf.trainable_variables(scope=scope + "/dense") - logits_vars = tf.trainable_variables( - scope=self.variable_scope.name + "/logit_layer") - finetune_vars = projection_vars + logits_vars - for var in finetune_vars: - vars_to_learning_rates[var] = lr - - vars_to_learning_rates.update( - self._encoder.param_groups(lr=lr, - lr_layer_scale=lr_layer_scale, - decay_base_params=decay_base_params)) - else: - for variable in self.trainable_variables: - vars_to_learning_rates[variable] = lr - - return vars_to_learning_rates - - def _build(self, token_ids, segment_ids=None, input_mask=None, mode=None): - r"""Feeds the inputs through the network and makes classification. - - Args: - token_ids: Shape `[batch_size, max_time]`. - segment_ids: Shape `[batch_size, max_time]`. - input_mask: Float tensor of shape `[batch_size, max_time]`. Note - that positions with value 1 are masked out. - mode (optional): A tensor taking value in - :tf_main:`tf.estimator.ModeKeys `, - including `TRAIN`, `EVAL`, and `PREDICT`. Used to toggle - dropout. 
- If `None` (default), :func:`texar.tf.global_mode` is used. - - Returns: - A tuple `(logits, preds)`, containing the logits over classes and - the predictions, respectively. - - - If ``clas_strategy`` is ``cls_time`` or ``all_time``: - - - If ``num_classes`` == 1, ``logits`` and ``pred`` are both of - shape ``[batch_size]``. - - If ``num_classes`` > 1, ``logits`` is of shape - ``[batch_size, num_classes]`` and ``pred`` is of shape - ``[batch_size]``. - - - If ``clas_strategy`` is ``time_wise``: - - - ``num_classes`` == 1, ``logits`` and ``pred`` are both of - shape ``[batch_size, max_time]``. - - If ``num_classes`` > 1, ``logits`` is of shape - ``[batch_size, max_time, num_classes]`` and ``pred`` is of - shape ``[batch_size, max_time]``. - """ - is_training = is_train_mode(mode) - output, _ = self._encoder(token_ids, segment_ids, input_mask=input_mask, - mode=mode) - strategy = self._hparams.clas_strategy - if strategy == "time_wise": - summary = output - elif strategy == "cls_time": - summary = output[:, -1] - elif strategy == "all_time": - length_diff = self._hparams.max_seq_len - tf.shape(token_ids)[1] - summary_input = tf.pad(output, - paddings=[[0, 0], [0, length_diff], [0, 0]]) - summary_input_dim = \ - self._encoder.output_size * self._hparams.max_seq_len - summary = tf.reshape(summary_input, shape=[-1, summary_input_dim]) - else: - raise ValueError("Unknown classification strategy: {}" - .format(strategy)) - - if self._hparams.use_projection: - summary = tf.tanh(self.projection(summary)) - # summary: (batch_size, hidden_dim) - summary = self._dropout_layer(summary, training=is_training) - - logits = (self._logit_layer(summary) if self._logit_layer is not None - else summary) - - # Compute predictions - num_classes = self._hparams.num_classes - is_binary = num_classes == 1 or (num_classes <= 0 - and logits.shape[-1] == 1) - - if strategy == "time_wise": - if is_binary: - pred = tf.squeeze(tf.greater(logits, 0), -1) - logits = tf.squeeze(logits, -1) - else: - pred = tf.argmax(logits, axis=-1) - else: - if is_binary: - pred = tf.greater(logits, 0) - logits = tf.reshape(logits, [-1]) - else: - pred = tf.argmax(logits, axis=-1) - pred = tf.reshape(pred, [-1]) - - pred = tf.to_int64(pred) - - if not self._built: - self._add_internal_trainable_variables() - if self._logit_layer: - self._add_trainable_variable( - self._logit_layer.trainable_variables) - self._built = True - - return logits, pred diff --git a/texar/tf/modules/classifiers/xlnet_classifier_test.py b/texar/tf/modules/classifiers/xlnet_classifier_test.py deleted file mode 100644 index b85dd02a..00000000 --- a/texar/tf/modules/classifiers/xlnet_classifier_test.py +++ /dev/null @@ -1,214 +0,0 @@ -# -""" -Unit tests for XLNet classifier. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import numpy as np -import tensorflow as tf - -from texar.tf.modules.classifiers.xlnet_classifier import XLNetClassifier -from texar.tf.utils.test import pretrained_test - -# pylint: disable=too-many-locals, no-member - - -class XLNetClassifierTest(tf.test.TestCase): - """Tests :class:`~texar.tf.modules.XLNetClassifier` class. 
- """ - - @pretrained_test - def test_model_loading(self): - r"""Tests model loading functionality.""" - - inputs = tf.placeholder(dtype=tf.int32, shape=[None, None]) - - for pretrained_model_name in XLNetClassifier.available_checkpoints(): - classifier = XLNetClassifier( - pretrained_model_name=pretrained_model_name) - _, _ = classifier(inputs) - - def test_trainable_variables(self): - """Tests the functionality of automatically collecting trainable - variables. - """ - inputs = tf.placeholder(dtype=tf.int32, shape=[None, None]) - - # case 1 - hparams = { - "pretrained_model_name": None, - } - clas = XLNetClassifier(hparams=hparams) - clas(inputs) - n_xlnet_vars = 162 - n_projection_vars = 2 - n_logits_vars = 2 - self.assertEqual(len(clas.trainable_variables), - n_xlnet_vars + n_logits_vars + n_projection_vars) - - # case 2 - hparams = { - "pretrained_model_name": None, - "clas_strategy": "time_wise" - } - clas = XLNetClassifier(hparams=hparams) - clas(inputs) - self.assertEqual(len(clas.trainable_variables), - n_xlnet_vars + n_logits_vars + n_projection_vars) - - # case 3 - hparams = { - "pretrained_model_name": None, - "clas_strategy": "all_time" - } - clas = XLNetClassifier(hparams=hparams) - clas(inputs) - self.assertEqual(len(clas.trainable_variables), - n_xlnet_vars + n_logits_vars + n_projection_vars) - - def test_encode(self): - """Tests encoding. - """ - max_time = 8 - batch_size = 16 - inputs = tf.random_uniform([batch_size, max_time], - maxval=30521, dtype=tf.int32) - - # case 1 - hparams = { - "pretrained_model_name": None - } - clas = XLNetClassifier(hparams=hparams) - logits, pred = clas(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run([logits, pred]) - self.assertEqual(logits_.shape, (batch_size, - clas.hparams.num_classes)) - self.assertEqual(pred_.shape, (batch_size,)) - - # case 2 - hparams = { - "pretrained_model_name": None, - "num_classes": 10, - "clas_strategy": "time_wise" - } - clas = XLNetClassifier(hparams=hparams) - logits, pred = clas(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run([logits, pred]) - self.assertEqual(logits_.shape, - (batch_size, max_time, clas.hparams.num_classes)) - self.assertEqual(pred_.shape, (batch_size, max_time)) - - # case 3 - hparams = { - "pretrained_model_name": None, - "num_classes": 0, - "clas_strategy": "time_wise" - } - clas = XLNetClassifier(hparams=hparams) - logits, pred = clas(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run([logits, pred]) - self.assertEqual(logits_.shape, - (batch_size, max_time, clas.hparams.hidden_dim)) - self.assertEqual(pred_.shape, (batch_size, max_time)) - - # case 4 - hparams = { - "pretrained_model_name": None, - "num_classes": 3, - "clas_strategy": "all_time", - "use_projection": False, - "vocab_size": 40000 - } - inputs = tf.placeholder(tf.int32, shape=[batch_size, 6]) - clas = XLNetClassifier(hparams=hparams) - logits, pred = clas(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run( - [logits, pred], - feed_dict={inputs: np.random.randint(30521, - size=(batch_size, 6))}) - self.assertEqual(logits_.shape, (batch_size, - clas.hparams.num_classes)) - self.assertEqual(pred_.shape, (batch_size,)) - - def test_binary(self): - """Tests binary classification. 
- """ - max_time = 8 - batch_size = 16 - inputs = tf.random_uniform([batch_size, max_time], - maxval=30521, dtype=tf.int32) - - # case 1 - hparams = { - "pretrained_model_name": None, - "num_classes": 1, - "clas_strategy": "time_wise" - } - clas = XLNetClassifier(hparams=hparams) - logits, pred = clas(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run([logits, pred]) - self.assertEqual(logits_.shape, (batch_size, max_time)) - self.assertEqual(pred_.shape, (batch_size, max_time)) - - # case 2 - hparams = { - "pretrained_model_name": None, - "num_classes": 1, - "clas_strategy": "cls_time", - "max_seq_len": max_time - } - inputs = tf.placeholder(tf.int32, shape=[batch_size, 6]) - clas = XLNetClassifier(hparams=hparams) - logits, pred = clas(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run( - [logits, pred], - feed_dict={inputs: np.random.randint(30521, - size=(batch_size, 6))}) - self.assertEqual(logits_.shape, (batch_size,)) - self.assertEqual(pred_.shape, (batch_size,)) - - # case 3 - hparams = { - "pretrained_model_name": None, - "num_classes": 1, - "clas_strategy": "all_time", - "max_seq_len": max_time - } - inputs = tf.placeholder(tf.int32, shape=[batch_size, 6]) - clas = XLNetClassifier(hparams=hparams) - logits, pred = clas(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_, pred_ = sess.run( - [logits, pred], - feed_dict={inputs: np.random.randint(30521, - size=(batch_size, 6))}) - self.assertEqual(logits_.shape, (batch_size,)) - self.assertEqual(pred_.shape, (batch_size,)) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/modules/connectors/__init__.py b/texar/tf/modules/connectors/__init__.py deleted file mode 100644 index a09b904c..00000000 --- a/texar/tf/modules/connectors/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Modules of texar library connectors. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=wildcard-import - -from texar.tf.modules.connectors.connector_base import * -from texar.tf.modules.connectors.connectors import * diff --git a/texar/tf/modules/connectors/connector_base.py b/texar/tf/modules/connectors/connector_base.py deleted file mode 100644 index 33346a73..00000000 --- a/texar/tf/modules/connectors/connector_base.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Base class for connectors that transform inputs into specified output shape.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from texar.tf.module_base import ModuleBase
-
-__all__ = [
-    "ConnectorBase"
-]
-
-
-class ConnectorBase(ModuleBase):
-    """Base class inherited by all connector classes. A connector
-    transforms inputs into outputs with any specified structure and shape.
-    For example, transforming the final state of an encoder to the initial
-    state of a decoder, and performing stochastic sampling in between as
-    in Variational Autoencoders (VAEs).
-
-    Args:
-        output_size: Size of output **excluding** the batch dimension. For
-            example, set `output_size` to `dim` to generate output of
-            shape `[batch_size, dim]`.
-            Can be an `int`, a tuple of `int`, a TensorShape, or a tuple of
-            TensorShapes.
-            For example, to transform inputs to have decoder state size, set
-            `output_size=decoder.state_size`.
-        hparams (dict, optional): Hyperparameters. Missing
-            hyperparameters will be set to default values. See
-            :meth:`default_hparams` for the hyperparameter structure and
-            default values.
-    """
-
-    def __init__(self, output_size, hparams=None):
-        ModuleBase.__init__(self, hparams)
-        self._output_size = output_size
-
-    @staticmethod
-    def default_hparams():
-        """Returns a dictionary of hyperparameters with default values.
-        """
-        return {
-            "name": "connector"
-        }
-
-    def _build(self, *args, **kwargs):
-        """Transforms inputs to outputs with specified shape.
-        """
-        raise NotImplementedError
-
-    @property
-    def output_size(self):
-        """The output size.
-        """
-        return self._output_size
diff --git a/texar/tf/modules/connectors/connectors.py b/texar/tf/modules/connectors/connectors.py
deleted file mode 100644
index 06c40a47..00000000
--- a/texar/tf/modules/connectors/connectors.py
+++ /dev/null
@@ -1,779 +0,0 @@
-# Copyright 2019 The Texar Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Various connectors.
-""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -import tensorflow as tf -from tensorflow import distributions as tf_dstr -from tensorflow.python.util import nest # pylint: disable=E0611 - -from texar.tf.modules.connectors.connector_base import ConnectorBase -from texar.tf.core import layers -from texar.tf.utils.utils import get_function, check_or_get_instance - -# pylint: disable=too-many-locals, arguments-differ -# pylint: disable=too-many-arguments, invalid-name, no-member - -__all__ = [ - "ConstantConnector", - "ForwardConnector", - "MLPTransformConnector", - "ReparameterizedStochasticConnector", - "StochasticConnector", - # "ConcatConnector" -] - - -def _assert_same_size(outputs, output_size): - """Check if outputs match output_size - - Args: - outputs: A Tensor or a (nested) tuple of tensors - output_size: Can be an Integer, a TensorShape, or a (nested) tuple of - Integers or TensorShape. - """ - nest.assert_same_structure(outputs, output_size) - flat_output_size = nest.flatten(output_size) - flat_output = nest.flatten(outputs) - - for (output, size) in zip(flat_output, flat_output_size): - if isinstance(size, tf.TensorShape): - if output.shape == size: - pass - elif output[0].shape != tf.TensorShape(size): - raise ValueError( - "The output size does not match the the required output_size") - - -def _get_tensor_depth(x): - """Returns the size of a tensor excluding the first dimension - (typically the batch dimension). - - Args: - x: A tensor. - """ - return np.prod(x.get_shape().as_list()[1:]) - - -def _mlp_transform(inputs, output_size, activation_fn=tf.identity): - """Transforms inputs through a fully-connected layer that creates the output - with specified size. - - Args: - inputs: A Tensor of shape `[batch_size, d1, ..., dn]`, or a (nested) - tuple of such elements. The dimensions `d1, ..., dn` will be flatten - and transformed by a dense layer. - output_size: Can be an Integer, a TensorShape, or a (nested) tuple of - Integers or TensorShape. - activation_fn: Activation function applied to the output. - - Returns: - If :attr:`output_size` is an Integer or a TensorShape, returns a Tensor - of shape `[batch_size x output_size]`. If :attr:`output_size` is a tuple - of Integers or TensorShape, returns a tuple having the same structure as - :attr:`output_size`, where each element Tensor has the same size as - defined in :attr:`output_size`. 
- """ - # Flatten inputs - flat_input = nest.flatten(inputs) - dims = [_get_tensor_depth(x) for x in flat_input] - flat_input = [tf.reshape(x, ([-1, d])) for x, d in zip(flat_input, dims)] - concat_input = tf.concat(flat_input, 1) - - # Get output dimension - flat_output_size = nest.flatten(output_size) - if isinstance(flat_output_size[0], tf.TensorShape): - size_list = [0] * len(flat_output_size) - for (i, shape) in enumerate(flat_output_size): - size_list[i] = np.prod([dim.value for dim in shape]) - else: - size_list = flat_output_size - sum_output_size = sum(size_list) - - # fc_output = tf.contrib.layers.fully_connected( - # concat_input, sum_output_size, activation_fn=activation_fn) - fc_output = tf.layers.dense( - concat_input, sum_output_size, activation=activation_fn) - - flat_output = tf.split(fc_output, size_list, axis=1) - - if isinstance(flat_output_size[0], tf.TensorShape): - for (i, shape) in enumerate(flat_output_size): - flat_output[i] = tf.reshape(flat_output[i], [-1] + shape.as_list()) - output = nest.pack_sequence_as(structure=output_size, - flat_sequence=flat_output) - - return output - - -class ConstantConnector(ConnectorBase): - """Creates a constant Tensor or (nested) tuple of Tensors that - contains a constant value. - - Args: - output_size: Size of output **excluding** the batch dimension. For - example, set `output_size` to `dim` to generate output of - shape `[batch_size, dim]`. - Can be an `int`, a tuple of `int`, a Tensorshape, or a tuple of - TensorShapes. - For example, to transform inputs to have decoder state size, set - `output_size=decoder.state_size`. - hparams (dict, optional): Hyperparameters. Missing - hyperparamerter will be set to default values. See - :meth:`default_hparams` for the hyperparameter sturcture and - default values. - - This connector does not have trainable parameters. - See :meth:`_build` for the inputs and outputs of the connector. - - Example: - - .. code-block:: python - - connector = Connector(cell.state_size) - zero_state = connector(batch_size=64, value=0.) - one_state = connector(batch_size=64, value=1.) - - .. document private functions - .. automethod:: _build - """ - def __init__(self, output_size, hparams=None): - ConnectorBase.__init__(self, output_size, hparams) - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - .. code-block:: python - - { - "value": 0., - "name": "constant_connector" - } - - Here: - - "value": float - The constant scalar that the output tensor(s) has. Ignored if - `value` is given to :meth:`_build`. - - "name": str - Name of the connector. - """ - return { - "value": 0., - "name": "constant_connector" - } - - def _build(self, batch_size, value=None): - """Creates output tensor(s) that has the given value. - - Args: - batch_size: An `int` or `int` scalar Tensor, the batch size. - value (optional): A scalar, the value that - the output tensor(s) has. If `None`, "value" in :attr:`hparams` - is used. - - Returns: - A (structure of) tensor whose structure is the same as - :attr:`output_size`, with value speicified by - `value` or :attr:`hparams`. - """ - value_ = value - if value_ is None: - value_ = self.hparams.value - output = nest.map_structure( - lambda x: tf.constant(value_, shape=[batch_size, x]), - self._output_size) - - self._built = True - - return output - - -class ForwardConnector(ConnectorBase): - """Transforms inputs to have specified structure. - - Args: - output_size: Size of output **excluding** the batch dimension. 
For - example, set `output_size` to `dim` to generate output of - shape `[batch_size, dim]`. - Can be an `int`, a tuple of `int`, a Tensorshape, or a tuple of - TensorShapes. - For example, to transform inputs to have decoder state size, set - `output_size=decoder.state_size`. - hparams (dict, optional): Hyperparameters. Missing - hyperparamerter will be set to default values. See - :meth:`default_hparams` for the hyperparameter sturcture and - default values. - - This connector does not have trainable parameters. - See :meth:`_build` for the inputs and outputs of the connector. - - The input to the connector must have the same structure with - :attr:`output_size`, or must have the same number of elements and be - re-packable into the structure of :attr:`output_size`. Note that if input - is or contains a `dict` instance, the keys will be sorted to pack in - deterministic order (See - :tf_main:`pack_sequence_as ` - for more details). - - Example: - - .. code-block:: python - - cell = LSTMCell(num_units=256) - # cell.state_size == LSTMStateTuple(c=256, h=256) - - connector = ForwardConnector(cell.state_size) - output = connector([tensor_1, tensor_2]) - # output == LSTMStateTuple(c=tensor_1, h=tensor_2) - - .. document private functions - .. automethod:: _build - """ - - def __init__(self, output_size, hparams=None): - ConnectorBase.__init__(self, output_size, hparams) - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - .. code-block:: python - - { - "name": "forward_connector" - } - - Here: - - "name": str - Name of the connector. - """ - return { - "name": "forward_connector" - } - - def _build(self, inputs): - """Transforms inputs to have the same structure as with - :attr:`output_size`. Values of the inputs are not changed. - - :attr:`inputs` must either have the same structure, or have the same - number of elements with :attr:`output_size`. - - Args: - inputs: The input (structure of) tensor to pass forward. - - Returns: - A (structure of) tensors that re-packs `inputs` to have - the specified structure of `output_size`. - """ - output = inputs - try: - nest.assert_same_structure(inputs, self._output_size) - except (ValueError, TypeError): - flat_input = nest.flatten(inputs) - output = nest.pack_sequence_as( - self._output_size, flat_input) - - self._built = True - - return output - - -class MLPTransformConnector(ConnectorBase): - """Transforms inputs with an MLP layer and packs the results into the - specified structure and size. - - Args: - output_size: Size of output **excluding** the batch dimension. For - example, set `output_size` to `dim` to generate output of - shape `[batch_size, dim]`. - Can be an `int`, a tuple of `int`, a Tensorshape, or a tuple of - TensorShapes. - For example, to transform inputs to have decoder state size, set - `output_size=decoder.state_size`. - hparams (dict, optional): Hyperparameters. Missing - hyperparamerter will be set to default values. See - :meth:`default_hparams` for the hyperparameter sturcture and - default values. - - See :meth:`_build` for the inputs and outputs of the connector. - - The input to the connector can have arbitrary structure and size. - - Example: - - .. 
code-block:: python - - cell = LSTMCell(num_units=256) - # cell.state_size == LSTMStateTuple(c=256, h=256) - - connector = MLPTransformConnector(cell.state_size) - inputs = tf.zeros([64, 10]) - output = connector(inputs) - # output == LSTMStateTuple(c=tensor_of_shape_(64, 256), - # h=tensor_of_shape_(64, 256)) - - .. code-block:: python - - ## Use to connect encoder and decoder with different state size - encoder = UnidirectionalRNNEncoder(...) - _, final_state = encoder(inputs=...) - - decoder = BasicRNNDecoder(...) - connector = MLPTransformConnector(decoder.state_size) - - _ = decoder( - initial_state=connector(final_state), - ...) - - .. document private functions - .. automethod:: _build - """ - - def __init__(self, output_size, hparams=None): - ConnectorBase.__init__(self, output_size, hparams) - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - .. code-block:: python - - { - "activation_fn": "identity", - "name": "mlp_connector" - } - - Here: - - "activation_fn": str or callable - The activation function applied to the outputs of the MLP - transformation layer. Can - be a function, or its name or module path. - - "name": str - Name of the connector. - """ - return { - "activation_fn": "identity", - "name": "mlp_connector" - } - - def _build(self, inputs): - """Transforms inputs with an MLP layer and packs the results to have - the same structure as specified by :attr:`output_size`. - - Args: - inputs: Input (structure of) tensors to be transformed. Must be a - Tensor of shape `[batch_size, ...]` or a (nested) tuple of - such Tensors. That is, the first dimension of (each) tensor - must be the batch dimension. - - Returns: - A Tensor or a (nested) tuple of Tensors of the same structure of - `output_size`. - """ - activation_fn = layers.get_activation_fn(self.hparams.activation_fn) - - output = _mlp_transform(inputs, self._output_size, activation_fn) - - if not self._built: - self._add_internal_trainable_variables() - self._built = True - - return output - - -class ReparameterizedStochasticConnector(ConnectorBase): - """Samples from a distribution with reparameterization trick, and - transforms samples into specified size. - - Reparameterization allows gradients to be back-propagated through the - stochastic samples. Used in, e.g., Variational Autoencoders (VAEs). - - Args: - output_size: Size of output **excluding** the batch dimension. For - example, set `output_size` to `dim` to generate output of - shape `[batch_size, dim]`. - Can be an `int`, a tuple of `int`, a Tensorshape, or a tuple of - TensorShapes. - For example, to transform inputs to have decoder state size, set - `output_size=decoder.state_size`. - hparams (dict, optional): Hyperparameters. Missing - hyperparamerter will be set to default values. See - :meth:`default_hparams` for the hyperparameter sturcture and - default values. - - Example: - - .. 
code-block:: python - - cell = LSTMCell(num_units=256) - # cell.state_size == LSTMStateTuple(c=256, h=256) - - connector = ReparameterizedStochasticConnector(cell.state_size) - - kwargs = { - 'loc': tf.zeros([batch_size, 10]), - 'scale_diag': tf.ones([batch_size, 10]) - } - output, sample = connector(distribution_kwargs=kwargs) - # output == LSTMStateTuple(c=tensor_of_shape_(batch_size, 256), - # h=tensor_of_shape_(batch_size, 256)) - # sample == Tensor([batch_size, 10]) - - - kwargs = { - 'loc': tf.zeros([10]), - 'scale_diag': tf.ones([10]) - } - output_, sample_ = connector(distribution_kwargs=kwargs, - num_samples=batch_size_) - # output_ == LSTMStateTuple(c=tensor_of_shape_(batch_size_, 256), - # h=tensor_of_shape_(batch_size_, 256)) - # sample == Tensor([batch_size_, 10]) - - .. document private functions - .. automethod:: _build - """ - - def __init__(self, output_size, hparams=None): - ConnectorBase.__init__(self, output_size, hparams) - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - .. code-block:: python - - { - "activation_fn": "identity", - "name": "reparameterized_stochastic_connector" - } - - Here: - - "activation_fn": str - The activation function applied to the outputs of the MLP - transformation layer. Can - be a function, or its name or module path. - - "name": str - Name of the connector. - """ - return { - "activation_fn": "tensorflow.identity", - "name": "reparameterized_stochastic_connector" - } - - def _build(self, - distribution='MultivariateNormalDiag', - distribution_kwargs=None, - transform=True, - num_samples=None): - """Samples from a distribution and optionally performs transformation - with an MLP layer. - - The distribution must be reparameterizable, i.e., - `distribution.reparameterization_type = FULLY_REPARAMETERIZED`. - - Args: - distribution: A instance of subclass of - :tf_main:`TF Distribution `, - or :tf_hmpg:`tensorflow_probability Distribution `, - Can be a class, its name or module path, or a class instance. - distribution_kwargs (dict, optional): Keyword arguments for the - distribution constructor. Ignored if `distribution` is a - class instance. - transform (bool): Whether to perform MLP transformation of the - distribution samples. If `False`, the structure/shape of a - sample must match :attr:`output_size`. - num_samples (optional): An `int` or `int` Tensor. Number of samples - to generate. If not given, generate a single sample. Note - that if batch size has already been included in - `distribution`'s dimensionality, `num_samples` should be - left as `None`. - - Returns: - A tuple (output, sample), where - - - output: A Tensor or a (nested) tuple of Tensors with the same \ - structure and size of :attr:`output_size`. The batch dimension \ - equals :attr:`num_samples` if specified, or is determined by the \ - distribution dimensionality. If :attr:`transform` is `False`, \ - :attr:`output` will be equal to :attr:`sample`. - - sample: The sample from the distribution, prior to transformation. - - Raises: - ValueError: If distribution cannot be reparametrized. - ValueError: The output does not match :attr:`output_size`. 
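-
-        Example:
-            A sketch of the `transform=False` path (the distribution kwargs
-            below are illustrative; the sample structure must then match
-            :attr:`output_size` directly):
-
-            .. code-block:: python
-
-                connector = ReparameterizedStochasticConnector(output_size=10)
-                kwargs = {
-                    'loc': tf.zeros([64, 10]),
-                    'scale_diag': tf.ones([64, 10])
-                }
-                output, sample = connector(
-                    distribution_kwargs=kwargs, transform=False)
-                # output equals sample: Tensor of shape `[64, 10]`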
- """ - dstr = check_or_get_instance( - distribution, distribution_kwargs, - ["tensorflow.distributions", "tensorflow_probability.distributions", - "texar.tf.custom"]) - - if dstr.reparameterization_type == tf_dstr.NOT_REPARAMETERIZED: - raise ValueError( - "Distribution is not reparameterized: %s" % dstr.name) - - if num_samples: - sample = dstr.sample(num_samples) - else: - sample = dstr.sample() - - # if dstr.event_shape == []: - # sample = tf.reshape( - # sample, - # sample.shape.concatenate(tf.TensorShape(1))) - - # sample = tf.cast(sample, tf.float32) - if transform: - fn_modules = ['tensorflow', 'tensorflow.nn', 'texar.tf.custom'] - activation_fn = get_function(self.hparams.activation_fn, fn_modules) - output = _mlp_transform(sample, self._output_size, activation_fn) - else: - output = sample - - _assert_same_size(output, self._output_size) - if not self._built: - self._add_internal_trainable_variables() - self._built = True - - return output, sample - - -class StochasticConnector(ConnectorBase): - """Samples from a distribution and transforms samples into specified size. - - The connector is the same as - :class:`~texar.tf.modules.ReparameterizedStochasticConnector`, except that - here reparameterization is disabled, and thus the gradients cannot be - back-propagated through the stochastic samples. - - Args: - output_size: Size of output **excluding** the batch dimension. For - example, set `output_size` to `dim` to generate output of - shape `[batch_size, dim]`. - Can be an `int`, a tuple of `int`, a Tensorshape, or a tuple of - TensorShapes. - For example, to transform inputs to have decoder state size, set - `output_size=decoder.state_size`. - hparams (dict, optional): Hyperparameters. Missing - hyperparamerter will be set to default values. See - :meth:`default_hparams` for the hyperparameter sturcture and - default values. - - .. document private functions - .. automethod:: _build - """ - - def __init__(self, output_size, hparams=None): - ConnectorBase.__init__(self, output_size, hparams) - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - .. code-block:: python - - { - "activation_fn": "identity", - "name": "stochastic_connector" - } - - Here: - - "activation_fn": str - The activation function applied to the outputs of the MLP - transformation layer. Can - be a function, or its name or module path. - - "name": str - Name of the connector. - """ - return { - "activation_fn": "tensorflow.identity", - "name": "stochastic_connector" - } - - def _build(self, - distribution='MultivariateNormalDiag', - distribution_kwargs=None, - transform=True, - num_samples=None): - """Samples from a distribution and optionally performs transformation - with an MLP layer. - - The inputs and outputs are the same as - :class:`~texar.tf.modules.ReparameterizedStochasticConnector` except that - the distribution does not need to be reparameterizable, and gradient - cannot be back-propagate through the samples. - - Args: - distribution: A instance of subclass of - :tf_main:`TF Distribution `, - or :tf_hmpg:`tensorflow_probability Distribution `. - Can be a class, its name or module path, or a class instance. - distribution_kwargs (dict, optional): Keyword arguments for the - distribution constructor. Ignored if `distribution` is a - class instance. - transform (bool): Whether to perform MLP transformation of the - distribution samples. If `False`, the structure/shape of a - sample must match :attr:`output_size`. 
- num_samples (optional): An `int` or `int` Tensor. Number of samples - to generate. If not given, generate a single sample. Note - that if batch size has already been included in - `distribution`'s dimensionality, `num_samples` should be - left as `None`. - - Returns: - A tuple (output, sample), where - - - output: A Tensor or a (nested) tuple of Tensors with the same \ - structure and size of :attr:`output_size`. The batch dimension \ - equals :attr:`num_samples` if specified, or is determined by the \ - distribution dimensionality. If :attr:`transform` is `False`, \ - :attr:`output` will be equal to :attr:`sample`. - - sample: The sample from the distribution, prior to transformation. - - Raises: - ValueError: The output does not match :attr:`output_size`. - """ - dstr = check_or_get_instance( - distribution, distribution_kwargs, - ["tensorflow.distributions", "tensorflow_probability.distributions", - "texar.tf.custom"]) - - if num_samples: - sample = dstr.sample(num_samples) - else: - sample = dstr.sample() - - if dstr.event_shape == []: - sample = tf.reshape(sample, - sample.shape.concatenate(tf.TensorShape(1))) - - # Disable gradients through samples - sample = tf.stop_gradient(sample) - - sample = tf.cast(sample, tf.float32) - - if transform: - fn_modules = ['tensorflow', 'tensorflow.nn', 'texar.tf.custom'] - activation_fn = get_function(self.hparams.activation_fn, fn_modules) - output = _mlp_transform(sample, self._output_size, activation_fn) - else: - output = sample - - _assert_same_size(output, self._output_size) - if not self._built: - self._add_internal_trainable_variables() - self._built = True - - return output, sample - - -# class ConcatConnector(ConnectorBase): -# """Concatenates multiple connectors into one connector. Used in, e.g., -# semi-supervised variational autoencoders, disentangled representation -# learning, and other models. -# -# Args: -# output_size: Size of output excluding the batch dimension (eg. -# :attr:`output_size = p` if :attr:`output.shape` is :attr:`[N, p]`). -# Can be an int, a tuple of int, a Tensorshape, or a tuple of -# TensorShapes. -# For example, to transform to decoder state size, set -# `output_size=decoder.cell.state_size`. -# hparams (dict): Hyperparameters of the connector. -# """ -# -# def __init__(self, output_size, hparams=None): -# ConnectorBase.__init__(self, output_size, hparams) -# -# @staticmethod -# def default_hparams(): -# """Returns a dictionary of hyperparameters with default values. -# -# Returns: -# .. code-block:: python -# -# { -# "activation_fn": "tensorflow.identity", -# "name": "concat_connector" -# } -# -# Here: -# -# "activation_fn": (str or callable) -# The name or full path to the activation function applied to -# the outputs of the MLP layer. The activation functions can be: -# -# - Built-in activation functions defined in :mod:`tf` or \ -# :mod:`tf.nn`, e.g., :tf_main:`identity `. -# - User-defined activation functions in `texar.tf.custom`. -# - External activation functions. Must provide the full path, \ -# e.g., "my_module.my_activation_fn". -# -# The default value is :attr:`"identity"`, i.e., the MLP -# transformation is linear. -# -# "name": str -# Name of the connector. -# -# The default value is "concat_connector". 
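A minimal sketch, assuming TensorFlow 1.x and TensorFlow Probability, of why the `tf.stop_gradient` call above blocks back-propagation; `loc` is an illustrative trainable parameter:

.. code-block:: python

    import tensorflow as tf
    import tensorflow_probability as tfp

    loc = tf.get_variable("loc", shape=[4])
    dstr = tfp.distributions.MultivariateNormalDiag(
        loc=loc, scale_diag=tf.ones([4]))
    sample = dstr.sample()

    # Gradients flow through the raw reparameterized sample ...
    grads = tf.gradients(tf.reduce_sum(sample), loc)        # [<tf.Tensor>]
    # ... but not through a stopped sample, which is what
    # StochasticConnector._build produces.
    stopped = tf.gradients(
        tf.reduce_sum(tf.stop_gradient(sample)), loc)       # [None]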
-# """ -# return { -# "activation_fn": "tensorflow.identity", -# "name": "concat_connector" -# } -# -# def _build(self, connector_inputs, transform=True): -# """Concatenate multiple input connectors -# -# Args: -# connector_inputs: a list of connector states -# transform (bool): If `True`, then the output are automatically -# transformed to match :attr:`output_size`. -# -# Returns: -# A Tensor or a (nested) tuple of Tensors of the same structure of -# the decoder state. -# """ -# connector_inputs = [tf.cast(connector, tf.float32) -# for connector in connector_inputs] -# output = tf.concat(connector_inputs, axis=1) -# -# if transform: -# fn_modules = ['texar.tf.custom', 'tensorflow', 'tensorflow.nn'] -# activation_fn = get_function(self.hparams.activation_fn, -# fn_modules) -# output = _mlp_transform(output, self._output_size, activation_fn) -# _assert_same_size(output, self._output_size) -# -# self._add_internal_trainable_variables() -# self._built = True -# -# return output diff --git a/texar/tf/modules/connectors/connectors_test.py b/texar/tf/modules/connectors/connectors_test.py deleted file mode 100644 index 619fa6ab..00000000 --- a/texar/tf/modules/connectors/connectors_test.py +++ /dev/null @@ -1,213 +0,0 @@ -# -""" -Unit tests for connectors. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import tensorflow as tf -from tensorflow_probability import distributions as tfpd -from tensorflow.python.util import nest # pylint: disable=E0611 - -from texar.tf.core import layers -from texar.tf.modules import ConstantConnector -from texar.tf.modules import MLPTransformConnector -from texar.tf.modules import (ReparameterizedStochasticConnector, - StochasticConnector) -from texar.tf.modules.connectors.connectors import _assert_same_size - -# pylint: disable=too-many-locals, invalid-name - - -class TestConnectors(tf.test.TestCase): - """Tests various connectors. - """ - - def setUp(self): - tf.test.TestCase.setUp(self) - self._batch_size = 100 - - self._decoder_cell = layers.get_rnn_cell( - layers.default_rnn_cell_hparams()) - - def test_constant_connector(self): - """Tests the logic of - :class:`~texar.tf.modules.connectors.ConstantConnector`. - """ - connector = ConstantConnector(self._decoder_cell.state_size) - - decoder_initial_state_0 = connector(self._batch_size) - decoder_initial_state_1 = connector(self._batch_size, value=1.) - nest.assert_same_structure(decoder_initial_state_0, - self._decoder_cell.state_size) - nest.assert_same_structure(decoder_initial_state_1, - self._decoder_cell.state_size) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - s_0, s_1 = sess.run( - [decoder_initial_state_0, decoder_initial_state_1]) - self.assertEqual(nest.flatten(s_0)[0][0, 0], 0.) - self.assertEqual(nest.flatten(s_1)[0][0, 0], 1.) - - def test_forward_connector(self): - """Tests the logic of - :class:`~texar.tf.modules.connectors.ForwardConnector`. - """ - # TODO(zhiting) - pass - - def test_mlp_transform_connector(self): - """Tests the logic of - :class:`~texar.tf.modules.connectors.MLPTransformConnector`. 
- """ - connector = MLPTransformConnector(self._decoder_cell.state_size) - output = connector(tf.zeros([5, 10])) - nest.assert_same_structure(output, self._decoder_cell.state_size) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - - output_ = sess.run(output) - nest.assert_same_structure(output_, self._decoder_cell.state_size) - - def test_reparameterized_stochastic_connector(self): - """Tests the logic of - :class:`~texar.tf.modules.ReparameterizedStochasticConnector`. - """ - state_size = (10, 10) - variable_size = 100 - state_size_ts = (tf.TensorShape([10, 10]), tf.TensorShape([2, 3, 4])) - sample_num = 10 - - mu = tf.zeros([self._batch_size, variable_size]) - var = tf.ones([self._batch_size, variable_size]) - mu_vec = tf.zeros([variable_size]) - var_vec = tf.ones([variable_size]) - gauss_ds = tfpd.MultivariateNormalDiag(loc=mu, scale_diag=var) - gauss_ds_vec = tfpd.MultivariateNormalDiag(loc=mu_vec, - scale_diag=var_vec) - gauss_connector = ReparameterizedStochasticConnector(state_size) - gauss_connector_ts = ReparameterizedStochasticConnector(state_size_ts) - - output_1, _ = gauss_connector(gauss_ds) - output_2, _ = gauss_connector( - distribution="MultivariateNormalDiag", - distribution_kwargs={"loc": mu, "scale_diag": var}) - sample_ts, _ = gauss_connector_ts(gauss_ds) - - # specify sample num - sample_test_num, _ = gauss_connector( - gauss_ds_vec, num_samples=sample_num) - - # test when :attr:`transform` is False - # sample_test_no_transform = gauss_connector(gauss_ds, transform=False) - - test_list = [output_1, output_2, sample_ts, sample_test_num] - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - out_list = sess.run(test_list) - out1 = out_list[0] - out2 = out_list[1] - out_ts = out_list[2] - out_test_num = out_list[3] - - # check the same size - self.assertEqual(out_test_num[0].shape, - tf.TensorShape([sample_num, state_size[0]])) - self.assertEqual(out1[0].shape, - tf.TensorShape([self._batch_size, state_size[0]])) - self.assertEqual(out2[0].shape, - tf.TensorShape([self._batch_size, state_size[0]])) - _assert_same_size(out_ts, state_size_ts) - - # sample_mu = np.mean(sample_outputs, axis=0) - # # pylint: disable=no-member - # sample_var = np.var(sample_outputs, axis=0) - - # check if the value is approximated N(0, 1) - # for i in range(variable_size): - # self.assertAlmostEqual(0, sample_mu[i], delta=0.2) - # self.assertAlmostEqual(1, sample_var[i], delta=0.2) - - def test_stochastic_connector(self): - """Tests the logic of - :class:`~texar.tf.modules.StochasticConnector`. 
- """ - state_size = (10, 10) - variable_size = 100 - state_size_ts = tf.TensorShape([self._batch_size, variable_size]) - gauss_connector = StochasticConnector(state_size) - mu = tf.zeros([self._batch_size, variable_size]) - var = tf.ones([self._batch_size, variable_size]) - gauss_ds = tfpd.MultivariateNormalDiag(loc=mu, scale_diag=var) - output_1, _ = gauss_connector(gauss_ds) - - gauss_connector_2 = StochasticConnector(state_size_ts) - output_2, sample2 = gauss_connector_2( - distribution="MultivariateNormalDiag", - distribution_kwargs={"loc": mu, "scale_diag": var}, transform=False) - test_list = [output_1, output_2, sample2] - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - out_list = sess.run(test_list) - out1 = out_list[0] - out2 = out_list[1] - sample2 = out_list[2] - self.assertEqual(out1[0].shape, - tf.TensorShape([self._batch_size, state_size[0]])) - self.assertEqual(out2.shape, state_size_ts) - self.assertEqual(out2.shape, sample2.shape) - - # def test_concat_connector(self): # pylint: disable=too-many-locals - # """Tests the logic of - # :class:`~texar.tf.modules.connectors.ConcatConnector`. - # """ - # gauss_size = 5 - # constant_size = 7 - # variable_size = 13 - - # decoder_size1 = 16 - # decoder_size2 = (16, 32) - - # gauss_connector = StochasticConnector(gauss_size) - # categorical_connector = StochasticConnector(1) - # constant_connector = ConstantConnector(constant_size) - # concat_connector1 = ConcatConnector(decoder_size1) - # concat_connector2 = ConcatConnector(decoder_size2) - - # # pylint: disable=invalid-name - # mu = tf.zeros([self._batch_size, gauss_size]) - # var = tf.ones([self._batch_size, gauss_size]) - # categorical_prob = tf.constant( - # [[0.1, 0.2, 0.7] for _ in xrange(self._batch_size)]) - # categorical_ds = tfds.Categorical(probs = categorical_prob) - # gauss_ds = tfds.MultivariateNormalDiag(loc = mu, scale_diag = var) - - # gauss_state = gauss_connector(gauss_ds) - # categorical_state = categorical_connector(categorical_ds) - # constant_state = constant_connector(self._batch_size, value=1.) - # with tf.Session() as debug_sess: - # debug_cater = debug_sess.run(categorical_state) - - # state1 = concat_connector1( - # [gauss_state, categorical_state, constant_state]) - # state2 = concat_connector2( - # [gauss_state, categorical_state, constant_state]) - - # with self.test_session() as sess: - # sess.run(tf.global_variables_initializer()) - # [output1, output2] = sess.run([state1, state2]) - - # # check the same size - # self.assertEqual(output1.shape[1], decoder_size1) - # self.assertEqual(output2[1].shape[1], decoder_size2[1]) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/modules/decoders/__init__.py b/texar/tf/modules/decoders/__init__.py deleted file mode 100644 index 2e725d2c..00000000 --- a/texar/tf/modules/decoders/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" -Modules of texar library decoders. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=wildcard-import - -from texar.tf.modules.decoders.beam_search_decode import * -from texar.tf.modules.decoders.gpt2_decoder import * -from texar.tf.modules.decoders.rnn_decoder_base import * -from texar.tf.modules.decoders.rnn_decoders import * -from texar.tf.modules.decoders.tf_helpers import * -from texar.tf.modules.decoders.rnn_decoder_helpers import * -from texar.tf.modules.decoders.transformer_decoders import * diff --git a/texar/tf/modules/decoders/beam_search_decode.py b/texar/tf/modules/decoders/beam_search_decode.py deleted file mode 100644 index 06dbcd13..00000000 --- a/texar/tf/modules/decoders/beam_search_decode.py +++ /dev/null @@ -1,241 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Beam search decoding for RNN decoders. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf -from tensorflow.contrib.seq2seq import \ - dynamic_decode, AttentionWrapperState, AttentionWrapper, \ - BeamSearchDecoder, tile_batch - -from texar.tf.modules.decoders.rnn_decoder_base import RNNDecoderBase -# pylint: disable=too-many-arguments, protected-access, too-many-locals -# pylint: disable=invalid-name - -__all__ = [ - "beam_search_decode" -] - - -def _get_initial_state(initial_state, - tiled_initial_state, - cell, - batch_size, - beam_width, - dtype): - if tiled_initial_state is None: - if isinstance(initial_state, AttentionWrapperState): - raise ValueError( - '`initial_state` must not be an AttentionWrapperState. Use ' - 'a plain cell state instead, which will be wrapped into an ' - 'AttentionWrapperState automatically.') - if initial_state is None: - tiled_initial_state = cell.zero_state(batch_size * beam_width, - dtype) - else: - tiled_initial_state = tile_batch(initial_state, - multiplier=beam_width) - - if isinstance(cell, AttentionWrapper) and \ - not isinstance(tiled_initial_state, AttentionWrapperState): - zero_state = cell.zero_state(batch_size * beam_width, dtype) - tiled_initial_state = zero_state.clone(cell_state=tiled_initial_state) - - return tiled_initial_state - - -def beam_search_decode(decoder_or_cell, - embedding, - start_tokens, - end_token, - beam_width, - initial_state=None, - tiled_initial_state=None, - output_layer=None, - length_penalty_weight=0.0, - max_decoding_length=None, - output_time_major=False, - **kwargs): - """Performs beam search sampling decoding. - - Args: - decoder_or_cell: An instance of - subclass of :class:`~texar.tf.modules.RNNDecoderBase`, - or an instance of :tf_main:`RNNCell `. The - decoder or RNN cell to perform decoding. 
- embedding: A callable that takes a vector tensor of indexes (e.g., - an instance of a subclass of :class:`~texar.tf.modules.EmbedderBase`), - or the :attr:`params` argument for - :tf_main:`tf.nn.embedding_lookup `. - start_tokens: `int32` vector shaped `[batch_size]`, the start tokens. - end_token: `int32` scalar, the token that marks end of decoding. - beam_width (int): Python integer, the number of beams. - initial_state (optional): Initial state of decoding. If `None` - (default), zero state is used. - - The state must **not** be tiled with - :tf_main:`tile_batch `. - If you have an already-tiled initial state, use - :attr:`tiled_initial_state` instead. - - In the case of attention RNN decoder, `initial_state` must - **not** be an :tf_main:`AttentionWrapperState - `. Instead, it must be a - state of the wrapped `RNNCell`, which will be wrapped into an - `AttentionWrapperState` automatically. - - Ignored if :attr:`tiled_initial_state` is given. - tiled_initial_state (optional): Initial state that has been tiled - (typically with :tf_main:`tile_batch `) - so that the batch dimension has size `batch_size * beam_width`. - - In the case of attention RNN decoder, this can be either a state - of the wrapped `RNNCell`, or an `AttentionWrapperState`. - - If not given, :attr:`initial_state` is used. - output_layer (optional): A :tf_main:`Layer ` instance to - apply to the RNN output prior to storing the result or sampling. If - `None` and :attr:`decoder_or_cell` is a decoder, the decoder's - output layer will be used. - length_penalty_weight: Float weight to penalize length. - Disabled with `0.0` (default). - max_decoding_length (optional): An int scalar Tensor indicating the - maximum allowed number of decoding steps. If `None` (default), - decoding will continue until the end token is encountered. - output_time_major (bool): If `True`, outputs are returned as - time major tensors. If `False` (default), outputs are returned - as batch major tensors. - **kwargs: Other keyword arguments for :tf_main:`dynamic_decode - ` except argument - `maximum_iterations`, which is set to :attr:`max_decoding_length`. - - Returns: - A tuple `(outputs, final_state, sequence_length)`, where - - - outputs: An instance of :tf_main:`FinalBeamSearchDecoderOutput \ - `. - - final_state: An instance of :tf_main:`BeamSearchDecoderState \ - `. - - sequence_length: A Tensor of shape `[batch_size]` containing \ - the lengths of samples. - - Example: - - .. code-block:: python - - ## Beam search with basic RNN decoder - - embedder = WordEmbedder(vocab_size=data.vocab.size) - decoder = BasicRNNDecoder(vocab_size=data.vocab.size) - - outputs, _, _ = beam_search_decode( - decoder_or_cell=decoder, - embedding=embedder, - start_tokens=[data.vocab.bos_token_id] * 100, - end_token=data.vocab.eos_token_id, - beam_width=5, - max_decoding_length=60) - - sample_ids = sess.run(outputs.predicted_ids) - sample_text = tx.utils.map_ids_to_strs(sample_ids[:,:,0], data.vocab) - print(sample_text) - # [ - # the first sequence sample . - # the second sequence sample . - # ... - # ] - - .. code-block:: python - - ## Beam search with attention RNN decoder - - # Encodes the source - enc_embedder = WordEmbedder(data.source_vocab.size, ...) - encoder = UnidirectionalRNNEncoder(...) - - enc_outputs, enc_state = encoder( - inputs=enc_embedder(data_batch['source_text_ids']), - sequence_length=data_batch['source_length']) - - # Decodes while attending to the source - dec_embedder = WordEmbedder(vocab_size=data.target_vocab.size, ...)
- decoder = AttentionRNNDecoder( - memory=enc_outputs, - memory_sequence_length=data_batch['source_length'], - vocab_size=data.target_vocab.size) - - # Beam search - outputs, _, _ = beam_search_decode( - decoder_or_cell=decoder, - embedding=dec_embedder, - start_tokens=[data.vocab.bos_token_id] * 100, - end_token=data.vocab.eos_token_id, - beam_width=5, - initial_state=enc_state, - max_decoding_length=60) - """ - if isinstance(decoder_or_cell, RNNDecoderBase): - cell = decoder_or_cell._get_beam_search_cell(beam_width=beam_width) - elif isinstance(decoder_or_cell, tf.contrib.rnn.RNNCell): - cell = decoder_or_cell - else: - raise ValueError("`decoder` must be an instance of a subclass of " - "either `RNNDecoderBase` or `RNNCell`.") - - start_tokens = tf.convert_to_tensor( - start_tokens, dtype=tf.int32, name="start_tokens") - if start_tokens.get_shape().ndims != 1: - raise ValueError("`start_tokens` must be a vector") - batch_size = tf.size(start_tokens) - - initial_state = _get_initial_state( - initial_state, tiled_initial_state, cell, - batch_size, beam_width, tf.float32) - - if output_layer is None and isinstance(decoder_or_cell, RNNDecoderBase): - output_layer = decoder_or_cell.output_layer - - def _decode(): - beam_decoder = BeamSearchDecoder( - cell=cell, - embedding=embedding, - start_tokens=start_tokens, - end_token=end_token, - initial_state=initial_state, - beam_width=beam_width, - output_layer=None if output_layer is tf.identity else output_layer, - length_penalty_weight=length_penalty_weight) - - if 'maximum_iterations' in kwargs: - raise ValueError('Use `max_decoding_length` to set the maximum ' - 'allowed number of decoding steps.') - outputs, final_state, _ = dynamic_decode( - decoder=beam_decoder, - output_time_major=output_time_major, - maximum_iterations=max_decoding_length, - **kwargs) - - return outputs, final_state, final_state.lengths - - if isinstance(decoder_or_cell, RNNDecoderBase): - vs = decoder_or_cell.variable_scope - with tf.variable_scope(vs, reuse=tf.AUTO_REUSE): - return _decode() - else: - return _decode() diff --git a/texar/tf/modules/decoders/beam_search_decode_test.py b/texar/tf/modules/decoders/beam_search_decode_test.py deleted file mode 100644 index 77aa487c..00000000 --- a/texar/tf/modules/decoders/beam_search_decode_test.py +++ /dev/null @@ -1,235 +0,0 @@ -""" -Unit tests for beam search decoding. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import numpy as np - -import tensorflow as tf -from tensorflow.contrib.seq2seq import dynamic_decode -from tensorflow.contrib.seq2seq import BeamSearchDecoder, tile_batch - -import texar.tf as tx -from texar.tf.modules.decoders.beam_search_decode import beam_search_decode -from texar.tf import context - -# pylint: disable=no-member, too-many-instance-attributes, invalid-name -# pylint: disable=too-many-locals, too-many-arguments - - -class BeamSearchDecodeTest(tf.test.TestCase): - """Tests - :func:`texar.tf.modules.decoders.beam_search_decode.beam_search_decode`.
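A short sketch of the initial-state tiling contract documented above, assuming TF 1.x `tf.contrib.seq2seq`; the shapes are illustrative:

.. code-block:: python

    import tensorflow as tf
    from tensorflow.contrib.seq2seq import tile_batch

    batch_size, beam_width, dim = 8, 5, 256
    enc_state = tf.zeros([batch_size, dim])  # plain, untiled decoder state

    # Either pass `enc_state` as `initial_state` (beam_search_decode tiles
    # it internally), or tile it yourself and pass `tiled_initial_state`:
    tiled_state = tile_batch(enc_state, multiplier=beam_width)
    # tiled_state has shape [batch_size * beam_width, dim]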
- """ - - def setUp(self): - tf.test.TestCase.setUp(self) - self._vocab_size = 10 - self._max_time = 16 - self._batch_size = 8 - self._emb_dim = 20 - self._cell_dim = 256 - self._attention_dim = self._cell_dim - self._beam_width = 11 - self._inputs = tf.random_uniform( - [self._batch_size, self._max_time, self._emb_dim], - maxval=1., dtype=tf.float32) - self._embedding = tf.random_uniform( - [self._vocab_size, self._emb_dim], maxval=1., dtype=tf.float32) - self._encoder_output = tf.random_uniform( - [self._batch_size, self._max_time, 64]) - - def _test_beam_search( - self, decoder, initial_state=None, tiled_initial_state=None, - tf_initial_state=None, beam_width_1=1, initiated=False): - # Compare with tf built-in BeamSearchDecoder - outputs, final_state, _ = beam_search_decode( - decoder_or_cell=decoder, - embedding=self._embedding, - start_tokens=[1] * self._batch_size, - end_token=2, - beam_width=beam_width_1, - max_decoding_length=20) - - self.assertIsInstance( - outputs, tf.contrib.seq2seq.FinalBeamSearchDecoderOutput) - self.assertIsInstance( - final_state, tf.contrib.seq2seq.BeamSearchDecoderState) - - num_trainable_variables = len(tf.trainable_variables()) - _ = decoder( - decoding_strategy='infer_greedy', - embedding=self._embedding, - start_tokens=[1] * self._batch_size, - end_token=2, - max_decoding_length=20) - self.assertEqual(num_trainable_variables, len(tf.trainable_variables())) - - if tf_initial_state is None: - tf_initial_state = decoder.cell.zero_state( - self._batch_size * beam_width_1, tf.float32) - beam_decoder = BeamSearchDecoder( - cell=decoder.cell, - embedding=self._embedding, - start_tokens=[1] * self._batch_size, - end_token=2, - initial_state=tf_initial_state, - beam_width=beam_width_1, - output_layer=decoder.output_layer) - - outputs_1, final_state_1, _ = dynamic_decode( - decoder=beam_decoder, maximum_iterations=20) - - # Tests time major - outputs_2, _, _ = beam_search_decode( - decoder_or_cell=decoder, - embedding=self._embedding, - start_tokens=[1] * self._batch_size, - end_token=2, - beam_width=self._beam_width, - initial_state=initial_state, - tiled_initial_state=tiled_initial_state, - max_decoding_length=21) - outputs_3, _, _ = beam_search_decode( - decoder_or_cell=decoder, - embedding=self._embedding, - start_tokens=[1] * self._batch_size, - end_token=2, - beam_width=self._beam_width, - initial_state=initial_state, - tiled_initial_state=tiled_initial_state, - max_decoding_length=21, - output_time_major=True) - - with self.test_session() as sess: - if not initiated: - sess.run(tf.global_variables_initializer()) - - outputs_, final_state_, outputs_1_, final_state_1_ = sess.run( - [outputs, final_state, outputs_1, final_state_1], - feed_dict={context.global_mode(): - tf.estimator.ModeKeys.PREDICT}) - - np.testing.assert_array_equal( - outputs_.predicted_ids, outputs_1_.predicted_ids) - np.testing.assert_array_equal( - outputs_.beam_search_decoder_output.scores, - outputs_1_.beam_search_decoder_output.scores) - np.testing.assert_array_equal( - outputs_.beam_search_decoder_output.predicted_ids, - outputs_1_.beam_search_decoder_output.predicted_ids) - np.testing.assert_array_equal( - outputs_.beam_search_decoder_output.parent_ids, - outputs_1_.beam_search_decoder_output.parent_ids) - np.testing.assert_array_equal( - final_state_.log_probs, final_state_1_.log_probs) - np.testing.assert_array_equal( - final_state_.lengths, final_state_1_.lengths) - - outputs_2_, outputs_3_ = sess.run( - [outputs_2, outputs_3], - feed_dict={context.global_mode(): - 
tf.estimator.ModeKeys.PREDICT}) - self.assertEqual(outputs_2_.predicted_ids.shape, - tuple([self._batch_size, 21, 11])) - self.assertEqual(outputs_3_.predicted_ids.shape, - tuple([21, self._batch_size, 11])) - - def test_basic_rnn_decoder_beam_search(self): - """Tests beam search with BasicRNNDecoder. - """ - hparams = { - "rnn_cell": { - "kwargs": {"num_units": self._cell_dim} - } - } - decoder = tx.modules.BasicRNNDecoder( - vocab_size=self._vocab_size, - hparams=hparams) - - self._test_beam_search(decoder) - - self._test_beam_search( - decoder, beam_width_1=self._beam_width, initiated=True) - - def test_basic_rnn_decoder_given_initial_state(self): - """Tests beam search with BasicRNNDecoder given initial state. - """ - hparams = { - "rnn_cell": { - "kwargs": {"num_units": self._cell_dim} - } - } - decoder = tx.modules.BasicRNNDecoder( - vocab_size=self._vocab_size, - hparams=hparams) - - # (zhiting): The beam search decoder does not generate max-length - # samples if only one cell_state is created. Perhaps due to - # random seed or bugs? - cell_state = decoder.cell.zero_state(self._batch_size, tf.float32) - - self._test_beam_search(decoder, initial_state=cell_state) - - tiled_cell_state = tile_batch(cell_state, multiplier=self._beam_width) - self._test_beam_search( - decoder, tiled_initial_state=tiled_cell_state, initiated=True) - - def test_attention_decoder_beam_search(self): - """Tests beam search with RNNAttentionDecoder. - """ - seq_length = np.random.randint( - self._max_time, size=[self._batch_size]) + 1 - encoder_values_length = tf.constant(seq_length) - hparams = { - "attention": { - "kwargs": {"num_units": self._attention_dim} - }, - "rnn_cell": { - "kwargs": {"num_units": self._cell_dim} - } - } - decoder = tx.modules.AttentionRNNDecoder( - vocab_size=self._vocab_size, - memory=self._encoder_output, - memory_sequence_length=encoder_values_length, - hparams=hparams) - - self._test_beam_search(decoder) - - def test_attention_decoder_given_initial_state(self): - """Tests beam search with RNNAttentionDecoder given initial state. - """ - seq_length = np.random.randint( - self._max_time, size=[self._batch_size]) + 1 - encoder_values_length = tf.constant(seq_length) - hparams = { - "attention": { - "kwargs": {"num_units": self._attention_dim} - }, - "rnn_cell": { - "kwargs": {"num_units": self._cell_dim} - } - } - decoder = tx.modules.AttentionRNNDecoder( - vocab_size=self._vocab_size, - memory=self._encoder_output, - memory_sequence_length=encoder_values_length, - hparams=hparams) - - state = decoder.cell.zero_state(self._batch_size, tf.float32) - - cell_state = state.cell_state - self._test_beam_search(decoder, initial_state=cell_state) - - tiled_cell_state = tile_batch(cell_state, multiplier=self._beam_width) - self._test_beam_search( - decoder, tiled_initial_state=tiled_cell_state, initiated=True) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/modules/decoders/dynamic_decode.py b/texar/tf/modules/decoders/dynamic_decode.py deleted file mode 100644 index ac432ad4..00000000 --- a/texar/tf/modules/decoders/dynamic_decode.py +++ /dev/null @@ -1,338 +0,0 @@ -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# Modifications copyright (C) 2019 Texar -# ============================================================================== -""" -Utility functions for decoding. This file is modified from -`tf.contrib.seq2seq.dynamic_decode`. -""" - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division -from __future__ import unicode_literals - -# pylint: disable=invalid-name, no-member, protected-access - -import tensorflow as tf -from tensorflow.contrib.seq2seq import Decoder as TFDecoder -from tensorflow.python.framework import tensor_shape -from tensorflow.python.util import nest - - -__all__ = [ - "dynamic_decode" -] - - -def _concat(prefix, suffix, static=False): - r"""Concat that enables int, Tensor, or TensorShape values. - This function takes a size specification, which can be an integer, a - TensorShape, or a Tensor, and converts it into a concatenated Tensor - (if static = False) or a list of integers (if static = True). - - Args: - prefix: The prefix; usually the batch size (and/or time step size). - (TensorShape, int, or Tensor.) - suffix: TensorShape, int, or Tensor. - static: If `True`, return a python list with possibly unknown - dimensions. Otherwise return a `Tensor`. - - Returns: - shape: the concatenation of prefix and suffix. - - Raises: - ValueError: if `suffix` is not a scalar or vector (or TensorShape). - ValueError: if prefix or suffix was `None` and asked for dynamic - Tensors out. 
- """ - if isinstance(prefix, tf.Tensor): - p = prefix - p_static = tf.get_static_value(prefix) - if p.shape.ndims == 0: - p = tf.expand_dims(p, 0) - elif p.shape.ndims != 1: - raise ValueError("prefix tensor must be either a scalar or vector, " - "but saw tensor: %s" % p) - else: - p = tensor_shape.as_shape(prefix) - p_static = p.as_list() if p.ndims is not None else None - p = ( - tf.constant(p.as_list(), dtype=tf.int32) - if p.is_fully_defined() else None) - if isinstance(suffix, tf.Tensor): - s = suffix - s_static = tf.get_static_value(suffix) - if s.shape.ndims == 0: - s = tf.expand_dims(s, 0) - elif s.shape.ndims != 1: - raise ValueError("suffix tensor must be either a scalar or vector, " - "but saw tensor: %s" % s) - else: - s = tensor_shape.as_shape(suffix) - s_static = s.as_list() if s.ndims is not None else None - s = ( - tf.constant(s.as_list(), dtype=tf.int32) - if s.is_fully_defined() else None) - - if static: - shape = tensor_shape.as_shape(p_static).concatenate(s_static) - shape = shape.as_list() if shape.ndims is not None else None - else: - if p is None or s is None: - raise ValueError("Provided a prefix or suffix of None: %s and %s" % - (prefix, suffix)) - shape = tf.concat((p, s), 0) - return shape - - -def _zero_state_tensors(state_size, batch_size, dtype): - r"""Create tensors of zeros based on state_size, batch_size, and dtype.""" - - def get_state_shape(s): - r"""Combine s with batch_size to get a proper tensor shape.""" - - c = _concat(batch_size, s) - size = tf.zeros(c, dtype=dtype) - return size - - return nest.map_structure(get_state_shape, state_size) - - -def _create_zero_outputs(size, dtype, batch_size): - r"""Create a zero outputs Tensor structure.""" - - def _create(s, d): - return _zero_state_tensors(s, batch_size, d) - - return nest.map_structure(_create, size, dtype) - - -def _transpose_batch_time(x): - r"""Transposes the batch and time dimensions of a Tensor. - - If the input tensor has rank < 2 it returns the original tensor. Retains as - much of the static shape information as possible. - - Args: - x: A Tensor. - - Returns: - x transposed along the first two dimensions. - """ - x_static_shape = x.get_shape() - if x_static_shape.rank is not None and x_static_shape.rank < 2: - return x - - x_rank = tf.rank(x) - x_t = tf.transpose( - x, tf.concat(([1, 0], tf.range(2, x_rank)), axis=0)) - x_t.set_shape( - tensor_shape.TensorShape( - [x_static_shape.dims[1].value, - x_static_shape.dims[0].value]).concatenate(x_static_shape[2:])) - return x_t - - -def dynamic_decode(decoder, - output_time_major=False, - impute_finished=False, - maximum_iterations=None, - parallel_iterations=32, - swap_memory=False, - scope=None): - r"""Perform dynamic decoding with `decoder`. - - Calls initialize() once and step() repeatedly on the Decoder object. - - Args: - decoder: A `Decoder` instance. - output_time_major: Python boolean. Default: `False` (batch major). If - `True`, outputs are returned as time major tensors (this mode is faster). - Otherwise, outputs are returned as batch major tensors (this adds extra - time to the computation). - impute_finished: Python boolean. If `True`, then states for batch - entries which are marked as finished get copied through and the - corresponding outputs get zeroed out. This causes some slowdown at - each time step, but ensures that the final state and outputs have - the correct values and that backprop ignores time steps that were - marked as finished. - maximum_iterations: `int32` scalar, maximum allowed number of decoding - steps. 
Default is `None` (decode until the decoder is fully done). - parallel_iterations: Argument passed to `tf.while_loop`. - swap_memory: Argument passed to `tf.while_loop`. - scope: Optional variable scope to use. - - Returns: - `(final_outputs, final_state, final_sequence_lengths)`. - Raises: - TypeError: if `decoder` is not an instance of `Decoder`. - ValueError: if `maximum_iterations` is provided but is not a scalar. - """ - if not isinstance(decoder, TFDecoder): - raise TypeError("Expected decoder to be type Decoder, but saw: %s" % - type(decoder)) - - with tf.variable_scope(scope, "decoder") as varscope: - if maximum_iterations is not None: - maximum_iterations = tf.convert_to_tensor( - maximum_iterations, dtype=tf.int32, name="maximum_iterations") - if maximum_iterations.get_shape().ndims != 0: - raise ValueError("maximum_iterations must be a scalar") - - initial_finished, initial_inputs, initial_state = decoder.initialize() - - zero_outputs = _create_zero_outputs(decoder.output_size, - decoder.output_dtype, - decoder.batch_size) - - if maximum_iterations is not None: - initial_finished = tf.logical_or( - initial_finished, 0 >= maximum_iterations) - initial_sequence_lengths = tf.zeros_like( - initial_finished, dtype=tf.int32) - initial_time = tf.constant(0, dtype=tf.int32) - - def _shape(batch_size, from_shape): - if (not isinstance(from_shape, tensor_shape.TensorShape) or - from_shape.ndims == 0): - return None - else: - batch_size = tf.get_static_value( - tf.convert_to_tensor( - batch_size, name="batch_size")) - return tensor_shape.TensorShape([batch_size]).\ - concatenate(from_shape) - - dynamic_size = True - - def _create_ta(s, d): - return tf.TensorArray( - dtype=d, - size=0 if dynamic_size else maximum_iterations, - dynamic_size=dynamic_size, - element_shape=_shape(decoder.batch_size, s)) - - initial_outputs_ta = nest.map_structure(_create_ta, decoder.output_size, - decoder.output_dtype) - - def condition(unused_time, unused_outputs_ta, unused_state, unused_inputs, - finished, unused_sequence_lengths): - cond = tf.logical_not(tf.reduce_all(finished)) - cond_time = (maximum_iterations is None or - unused_time < maximum_iterations) - ret = tf.logical_and(cond, tf.convert_to_tensor(cond_time)) - return ret - - def body(time, outputs_ta, state, inputs, finished, sequence_lengths): - r"""Internal while_loop body. - - Args: - time: scalar int32 tensor. - outputs_ta: structure of TensorArray. - state: (structure of) state tensors and TensorArrays. - inputs: (structure of) input tensors. - finished: bool tensor (keeping track of what's finished). - sequence_lengths: int32 tensor (keeping track of time of finish). - - Returns: - `(time + 1, outputs_ta, next_state, next_inputs, next_finished, - next_sequence_lengths)`. - """ - (next_outputs, state) = decoder.step(time, inputs, state) - - # Check if the maximum iteration is met. If it is met, do not compute - # the next inputs. 
- reach_max = tf.equal(time + 1, maximum_iterations) - (decoder_finished, next_inputs, decoder_state) = tf.cond( - reach_max, - lambda: (tf.cast(tf.ones_like(finished), tf.bool), - inputs, state), - lambda: decoder.next_inputs(time, next_outputs, state) - ) - if decoder.tracks_own_finished: - next_finished = decoder_finished - else: - next_finished = tf.logical_or(decoder_finished, finished) - next_sequence_lengths = tf.where( - tf.logical_not(finished), - tf.fill(tf.shape(sequence_lengths), time + 1), - sequence_lengths) - - nest.assert_same_structure(state, decoder_state) - nest.assert_same_structure(outputs_ta, next_outputs) - nest.assert_same_structure(inputs, next_inputs) - - # Zero out output values past finish - if impute_finished: - emit = nest.map_structure( - lambda out, zero: tf.where(finished, zero, out), - next_outputs, - zero_outputs) - else: - emit = next_outputs - - # Copy through states past finish - def _maybe_copy_state(new, cur): - # TensorArrays and scalar states get passed through. - if isinstance(cur, tf.TensorArray): - pass_through = True - else: - new.set_shape(cur.shape) - pass_through = (new.shape.ndims == 0) - return new if pass_through else tf.where(finished, cur, new) - - if impute_finished: - next_state = nest.map_structure( - _maybe_copy_state, decoder_state, state) - else: - next_state = decoder_state - - outputs_ta = nest.map_structure(lambda ta, out: ta.write(time, out), - outputs_ta, emit) - return (time + 1, outputs_ta, next_state, next_inputs, next_finished, - next_sequence_lengths) - - res = tf.while_loop( - condition, - body, - loop_vars=( - initial_time, - initial_outputs_ta, - initial_state, - initial_inputs, - initial_finished, - initial_sequence_lengths, - ), - parallel_iterations=parallel_iterations, - maximum_iterations=maximum_iterations, - swap_memory=swap_memory) - - final_outputs_ta = res[1] - final_state = res[2] - final_sequence_lengths = res[5] - - final_outputs = nest.map_structure(lambda ta: ta.stack(), final_outputs_ta) - - try: - final_outputs, final_state = decoder.finalize( - final_outputs, final_state, final_sequence_lengths) - except NotImplementedError: - pass - - if not output_time_major: - final_outputs = nest.map_structure(_transpose_batch_time, final_outputs) - - return final_outputs, final_state, final_sequence_lengths diff --git a/texar/tf/modules/decoders/gpt2_decoder.py b/texar/tf/modules/decoders/gpt2_decoder.py deleted file mode 100644 index 9763bf20..00000000 --- a/texar/tf/modules/decoders/gpt2_decoder.py +++ /dev/null @@ -1,317 +0,0 @@ -# Copyright 2019 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -GPT2 decoders. 
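The `next_sequence_lengths` update in the loop body of `dynamic_decode` above, isolated into a minimal sketch with illustrative values:

.. code-block:: python

    import tensorflow as tf

    finished = tf.constant([True, False, False])
    sequence_lengths = tf.constant([3, 0, 0])
    time = tf.constant(3)

    # Finished entries keep their recorded length; unfinished entries
    # advance to time + 1.
    next_lengths = tf.where(tf.logical_not(finished),
                            tf.fill(tf.shape(sequence_lengths), time + 1),
                            sequence_lengths)
    # evaluates to [3, 4, 4]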
-""" - -import tensorflow as tf - -from texar.tf.modules.decoders.transformer_decoders import TransformerDecoder -from texar.tf.modules.embedders import PositionEmbedder, WordEmbedder -from texar.tf.modules.pretrained.gpt2 import PretrainedGPT2Mixin - - -__all__ = [ - "GPT2Decoder", -] - - -class GPT2Decoder(PretrainedGPT2Mixin): - r"""Raw GPT2 Transformer for decoding sequences. Please see - :class:`~texar.tf.modules.PretrainedGPT2Mixin` for a brief description - of GPT2. - - This module basically stacks - :class:`~texar.tf.modules.WordEmbedder`, - :class:`~texar.tf.modules.PositionEmbedder`, - :class:`~texar.tf.modules.TransformerDecoder`. - - This module supports the architecture first proposed - in `(Radford et al.)` GPT2. - - Args: - pretrained_model_name (optional): a `str`, the name - of pre-trained model (e.g., ``gpt2-small``). Please refer to - :class:`~texar.tf.modules.PretrainedGPT2Mixin` for - all supported models. - If `None`, the model name in :attr:`hparams` is used. - cache_dir (optional): the path to a folder in which the - pre-trained models will be cached. If `None` (default), - a default directory (``texar_data`` folder under user's home - directory) will be used. - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparameter will be set to default values. See - :meth:`default_hparams` for the hyperparameter structure - and default values. - - .. document private functions - .. automethod:: _build - """ - _IS_DECODE = True - - def __init__(self, - pretrained_model_name=None, - cache_dir=None, - hparams=None): - - super().__init__(hparams=hparams) - - self.load_pretrained_config(pretrained_model_name, cache_dir) - - with tf.variable_scope(self.variable_scope): - - # Word embedding - self.word_embedder = WordEmbedder( - vocab_size=self._hparams.vocab_size, - hparams=self._hparams.embed) - - # Position embedding - self.position_embedder = PositionEmbedder( - position_size=self._hparams.position_size, - hparams=self._hparams.position_embed) - - # The GPT2 decoder (a TransformerDecoder) - self.decoder = TransformerDecoder( - vocab_size=self._hparams.vocab_size, - output_layer=tf.transpose(self.word_embedder.embedding, (1, 0)), - hparams=self._hparams.decoder) - - def embed_tokens(self, tokens, positions): - word_embeds = self.word_embedder(tokens) - pos_embeds = self.position_embedder(positions) - return word_embeds + pos_embeds - - @staticmethod - def default_hparams(): - r"""Returns a dictionary of hyperparameters with default values. - - * The decoder arch is determined by the constructor argument - :attr:`pretrained_model_name` if it's specified. In this case, - `hparams` are ignored. - * Otherwise, the encoder arch is determined by - `hparams['pretrained_model_name']` if it's specified. All other - configurations in `hparams` are ignored. - * If the above two are `None`, the decoder arch is defined by the - configurations in `hparams` and weights are randomly initialized. - - .. 
code-block:: python - - { - "pretrained_model_name": "gpt2-small", - "vocab_size": 50257, - "context_size": 1024, - "embedding_size": 768, - "embed": { - "dim": 768, - "name": "word_embeddings" - }, - "position_size": 1024, - "position_embed": { - "dim": 768, - "name": "position_embeddings" - }, - - # hparams for TransformerDecoder - "decoder": { - "dim": 768, - "num_blocks": 12, - "use_gpt_config": True, - "embedding_dropout": 0, - "residual_dropout": 0, - "multihead_attention": { - "use_bias": True, - "num_units": 768, - "num_heads": 12, - "dropout_rate": 0.0, - "output_dim": 768 - }, - "initializer": { - "type": "variance_scaling_initializer", - "kwargs": { - "factor": 1.0, - "mode": "FAN_AVG", - "uniform": True - } - }, - "poswise_feedforward": { - "layers": [ - { - "type": "Dense", - "kwargs": { - "activation": "gelu", - "name": "intermediate", - "units": 3072, - "use_bias": True - } - }, - { - "type": "Dense", - "kwargs": { - "activation": None, - "name": "output", - "units": 768, - "use_bias": True - } - } - ], - "name": "ffn" - } - }, - "name": "gpt2_decoder", - } - - Here: - - The default parameters are values for the 124M GPT2 model. - - `"pretrained_model_name"`: str or None - The name of the pre-trained GPT2 model. If None, the model - will be randomly initialized. - - `"embed"`: dict - Hyperparameters for word embedding layer. - - `"vocab_size"`: int - The vocabulary size of `inputs` in `GPT2Model`. - - `"position_embed"`: dict - Hyperparameters for position embedding layer. - - `"position_size"`: int - The maximum sequence length that this model might ever be used with. - - `"name"`: str - Name of the module. - """ - return { - 'decoder': { - 'name': 'decoder', - 'dim': 768, - 'num_blocks': 12, - 'embedding_dropout': 0, - 'residual_dropout': 0, - 'multihead_attention': { - 'name': 'self', - 'use_bias': True, - 'num_units': 768, - 'num_heads': 12, - "dropout_rate": 0.0, - 'output_dim': 768 - }, - 'initializer': { - 'type': 'variance_scaling_initializer', - 'kwargs': { - 'factor': 1.0, - 'mode': 'FAN_AVG', - 'uniform': True - } - }, - 'poswise_feedforward': { - 'layers': [ - { - 'type': 'Dense', - 'kwargs': { - 'activation': 'gelu', - 'name': 'intermediate', - 'units': 3072, - 'use_bias': True - } - }, - { - 'type': 'Dense', - 'kwargs': { - 'activation': None, - 'name': 'output', - 'units': 768, - 'use_bias': True - } - } - ], - 'name': 'ffn', - }, - }, - - 'pretrained_model_name': 'gpt2-small', - 'vocab_size': 50257, - 'context_size': 1024, - 'embedding_size': 768, - 'embed': { - 'dim': 768, - 'name': 'word_embeddings' - }, - 'position_size': 1024, - 'position_embed': { - 'dim': 768, - 'name': 'position_embeddings' - }, - 'name': 'gpt2_decoder', - '@no_typecheck': ['pretrained_model_name'], - } - - def _build(self, - decoding_strategy='train_greedy', - inputs=None, - memory=None, - memory_sequence_length=None, - memory_attention_bias=None, - beam_width=None, - length_penalty=0., - start_tokens=None, - end_token=None, - context=None, - context_sequence_length=None, - softmax_temperature=None, - max_decoding_length=None, - impute_finished=False, - helper=None, - mode=None): - r"""Performs decoding. Has exactly the same interface as - :meth:`texar.tf.modules.TransformerDecoder._build`, except that here - `inputs` is a tensor of shape `[batch_size, max_time]`. Please refer to - it for the detailed usage.
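A brief usage sketch of the hparams precedence described above; since no pre-trained model name is given anywhere, the architecture comes from `hparams` and the weights are randomly initialized (the values are illustrative):

.. code-block:: python

    import tensorflow as tf
    from texar.tf.modules import GPT2Decoder

    decoder = GPT2Decoder(hparams={"pretrained_model_name": None,
                                   "decoder": {"num_blocks": 6}})

    inputs = tf.placeholder(dtype=tf.int32, shape=[2, 8])
    outputs = decoder(inputs=inputs)  # "train_greedy" teacher-forcing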
- """ - if inputs is not None: - batch_size, max_time = inputs.shape.as_list() - time = tf.expand_dims(tf.range(max_time), 0) - time = tf.broadcast_to(time, [batch_size, max_time]) - inputs = self.embed_tokens(inputs, time) - - outputs = self.decoder._build( - decoding_strategy=decoding_strategy, - inputs=inputs, - memory=memory, - memory_sequence_length=memory_sequence_length, - memory_attention_bias=memory_attention_bias, - beam_width=beam_width, - length_penalty=length_penalty, - start_tokens=start_tokens, - end_token=end_token, - context=context, - context_sequence_length=context_sequence_length, - softmax_temperature=softmax_temperature, - max_decoding_length=max_decoding_length, - impute_finished=impute_finished, - embedding=lambda a, b: self.embed_tokens(a, b), - helper=helper, - mode=mode) - - if not self._built: - self._add_internal_trainable_variables() - self._built = True - - self.init_pretrained_weights(self.variable_scope.name, - load_output_layer=True) - - return outputs diff --git a/texar/tf/modules/decoders/gpt2_decoder_test.py b/texar/tf/modules/decoders/gpt2_decoder_test.py deleted file mode 100644 index d12d9e74..00000000 --- a/texar/tf/modules/decoders/gpt2_decoder_test.py +++ /dev/null @@ -1,146 +0,0 @@ -""" -Unit tests for GPT2 decoder. -""" - -import tensorflow as tf - -from texar.tf.modules.decoders.gpt2_decoder import GPT2Decoder -from texar.tf.modules.decoders.transformer_decoders import \ - TransformerDecoderOutput -from texar.tf.utils.test import pretrained_test - - -class GPT2DecoderTest(tf.test.TestCase): - r"""Tests :class:`~texar.torch.modules.GPT2Decoder` - """ - - @pretrained_test - def test_hparams(self): - r"""Tests the priority of the decoder arch parameters. - """ - - inputs = tf.placeholder(dtype=tf.int32, shape=[2, 3]) - - # case 1: set "pretrained_mode_name" by constructor argument - hparams = { - "pretrained_model_name": "gpt2-medium", - } - decoder = GPT2Decoder(pretrained_model_name="gpt2-small", - hparams=hparams) - _ = decoder(inputs=inputs) - self.assertEqual(decoder.hparams.decoder.num_blocks, 12) - - # case 2: set "pretrained_mode_name" by hparams - hparams = { - "pretrained_model_name": "gpt2-small", - "decoder": { - "num_blocks": 6, - } - } - decoder = GPT2Decoder(hparams=hparams) - _ = decoder(inputs=inputs) - self.assertEqual(decoder.hparams.decoder.num_blocks, 12) - - # case 3: set to None in both hparams and constructor argument - hparams = { - "pretrained_model_name": None, - "decoder": { - "num_blocks": 6, - } - } - decoder = GPT2Decoder(hparams=hparams) - _ = decoder(inputs=inputs) - self.assertEqual(decoder.hparams.decoder.num_blocks, 6) - - # case 4: using default hparams - decoder = GPT2Decoder() - _ = decoder(inputs=inputs) - self.assertEqual(decoder.hparams.decoder.num_blocks, 12) - - @pretrained_test - def test_trainable_variables(self): - r"""Tests the functionality of automatically collecting trainable - variables. 
- """ - - inputs = tf.placeholder(dtype=tf.int32, shape=[2, 3]) - - def get_variable_num(n_layers: int) -> int: - return 1 + 1 + n_layers * 16 + 2 - - # case 1: GPT2 small - decoder = GPT2Decoder() - _ = decoder(inputs=inputs) - self.assertEqual(len(decoder.trainable_variables), get_variable_num(12)) - - # case 2: GPT2 medium - hparams = { - "pretrained_model_name": "gpt2-medium", - } - decoder = GPT2Decoder(hparams=hparams) - _ = decoder(inputs=inputs) - self.assertEqual(len(decoder.trainable_variables), get_variable_num(24)) - - # case 2: GPT2 large - hparams = { - "pretrained_model_name": "gpt2-large", - } - decoder = GPT2Decoder(hparams=hparams) - _ = decoder(inputs=inputs) - self.assertEqual(len(decoder.trainable_variables), get_variable_num(36)) - - # case 3: self-designed GPT2 - hparams = { - "pretrained_model_name": None, - "decoder": { - "num_blocks": 6, - } - } - decoder = GPT2Decoder(hparams=hparams) - _ = decoder(inputs=inputs) - self.assertEqual(len(decoder.trainable_variables), get_variable_num(6)) - - def test_decode_train(self): - r"""Tests train_greedy. - """ - hparams = { - "pretrained_model_name": None - } - decoder = GPT2Decoder(hparams=hparams) - - max_time = 8 - batch_size = 16 - inputs = tf.random_uniform([batch_size, max_time], - maxval=50257, dtype=tf.int32) - outputs = decoder(inputs=inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_ = sess.run(outputs) - self.assertEqual(outputs_.logits.shape, (batch_size, - max_time, 50257)) - self.assertEqual(outputs_.sample_id.shape, (batch_size, max_time)) - - def test_decode_infer_greedy(self): - r"""Tests infer_greedy - """ - hparams = { - "pretrained_model_name": None - } - decoder = GPT2Decoder(hparams=hparams) - - start_tokens = tf.fill([16], 1) - end_token = 2 - outputs, length = decoder(max_decoding_length=4, - start_tokens=start_tokens, - end_token=end_token, - decoding_strategy="infer_greedy") - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_ = sess.run(outputs) - self.assertIsInstance(outputs_, TransformerDecoderOutput) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/modules/decoders/rnn_decoder_base.py b/texar/tf/modules/decoders/rnn_decoder_base.py deleted file mode 100644 index 60ef5e0f..00000000 --- a/texar/tf/modules/decoders/rnn_decoder_base.py +++ /dev/null @@ -1,579 +0,0 @@ -# Copyright 2019 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Base class for RNN decoders. 
-""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=too-many-arguments, no-name-in-module -# pylint: disable=too-many-branches, protected-access, too-many-locals -# pylint: disable=arguments-differ, unused-argument - -import copy - -import tensorflow as tf -from tensorflow.contrib.seq2seq import Decoder as TFDecoder -from tensorflow.python.framework import tensor_shape -from tensorflow.python.util import nest - -from texar.tf.core import layers -from texar.tf.utils import utils -from texar.tf.utils.mode import is_train_mode, is_train_mode_py -from texar.tf.modules.decoders.dynamic_decode import dynamic_decode -from texar.tf.module_base import ModuleBase -from texar.tf.modules.decoders import rnn_decoder_helpers -from texar.tf.utils.dtypes import is_callable -from texar.tf.utils.shapes import shape_list -from texar.tf.modules.decoders import tf_helpers as tx_helper - -__all__ = [ - "RNNDecoderBase", - "_make_output_layer" -] - - -def _make_output_layer_from_tensor(output_layer_tensor, vocab_size, - output_layer_bias, variable_scope): - """Creates a dense layer from a Tensor. Used to tie word embedding - with the output layer weight. - """ - affine_bias = None - if output_layer_bias: - with tf.variable_scope(variable_scope): - affine_bias = tf.get_variable('affine_bias', [vocab_size]) - - def _outputs_to_logits(outputs): - shape = shape_list(outputs) - dim = shape[-1] - outputs = tf.reshape(outputs, [-1, dim]) - logits = tf.matmul(outputs, output_layer_tensor) - if affine_bias is not None: - logits += affine_bias - logits = tf.reshape(logits, shape[:-1] + [vocab_size]) - return logits - - return _outputs_to_logits - - -def _make_output_layer(output_layer, vocab_size, - output_layer_bias, variable_scope): - """Makes a decoder output layer. - """ - _vocab_size = vocab_size - if is_callable(output_layer): - _output_layer = output_layer - elif tf.contrib.framework.is_tensor(output_layer): - _vocab_size = shape_list(output_layer)[1] - _output_layer = _make_output_layer_from_tensor( - output_layer, _vocab_size, output_layer_bias, variable_scope) - elif output_layer is None: - if _vocab_size is None: - raise ValueError( - "Either `output_layer` or `vocab_size` must be provided. " - "Set `output_layer=tf.identity` if no output layer is " - "wanted.") - with tf.variable_scope(variable_scope): - # pylint: disable=redefined-variable-type - _output_layer = tf.layers.Dense( - units=_vocab_size, use_bias=output_layer_bias) - else: - raise ValueError( - "output_layer should be a callable layer, a tensor, or None. " - "Unsupported type: ", type(output_layer) - ) - - return _output_layer, _vocab_size - - -class RNNDecoderBase(ModuleBase, TFDecoder): - """Base class inherited by all RNN decoder classes. - See :class:`~texar.tf.modules.BasicRNNDecoder` for the argumenrts. - - See :meth:`_build` for the inputs and outputs of RNN decoders in general. - - .. document private functions - .. 
automethod:: _build - """ - - def __init__(self, - cell=None, - vocab_size=None, - output_layer=None, - cell_dropout_mode=None, - hparams=None): - ModuleBase.__init__(self, hparams) - - self._helper = None - self._initial_state = None - - # Make rnn cell - with tf.variable_scope(self.variable_scope): - if cell is not None: - self._cell = cell - else: - self._cell = layers.get_rnn_cell( - self._hparams.rnn_cell, cell_dropout_mode) - self._beam_search_cell = None - - # Make the output layer - self._output_layer, self._vocab_size = _make_output_layer( - output_layer, vocab_size, self._hparams.output_layer_bias, - self.variable_scope) - - self.max_decoding_length = None - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - The hyperparameters are the same as in - :meth:`~texar.tf.modules.BasicRNNDecoder.default_hparams` of - :class:`~texar.tf.modules.BasicRNNDecoder`, except that the default - "name" here is "rnn_decoder". - """ - return { - "rnn_cell": layers.default_rnn_cell_hparams(), - "helper_train": rnn_decoder_helpers.default_helper_train_hparams(), - "helper_infer": rnn_decoder_helpers.default_helper_infer_hparams(), - "max_decoding_length_train": None, - "max_decoding_length_infer": None, - "name": "rnn_decoder", - "output_layer_bias": True, - } - - def _build(self, - decoding_strategy="train_greedy", - initial_state=None, - inputs=None, - sequence_length=None, - embedding=None, - start_tokens=None, - end_token=None, - softmax_temperature=None, - max_decoding_length=None, - impute_finished=False, - output_time_major=False, - input_time_major=False, - helper=None, - mode=None, - **kwargs): - """Performs decoding. This is a shared interface for both - :class:`~texar.tf.modules.BasicRNNDecoder` and - :class:`~texar.tf.modules.AttentionRNNDecoder`. - - The function provides **3 ways** to specify the - decoding method, with varying flexibility: - - 1. The :attr:`decoding_strategy` argument: A string taking value of: - - - **"train_greedy"**: decoding in teacher-forcing fashion \ - (i.e., feeding \ - `ground truth` to decode the next step), and each sample is \ - obtained by taking the `argmax` of the RNN output logits. \ - Arguments :attr:`(inputs, sequence_length, input_time_major)` \ - are required for this strategy, and argument :attr:`embedding` \ - is optional. - - **"infer_greedy"**: decoding in inference fashion (i.e., feeding \ - the `generated` sample to decode the next step), and each sample\ - is obtained by taking the `argmax` of the RNN output logits.\ - Arguments :attr:`(embedding, start_tokens, end_token)` are \ - required for this strategy, and argument \ - :attr:`max_decoding_length` is optional. - - **"infer_sample"**: decoding in inference fashion, and each - sample is obtained by `random sampling` from the RNN output - distribution. Arguments \ - :attr:`(embedding, start_tokens, end_token)` are \ - required for this strategy, and argument \ - :attr:`max_decoding_length` is optional. - - This argument is used only when argument :attr:`helper` is `None`. - - Example: - - .. code-block:: python - - embedder = WordEmbedder(vocab_size=data.vocab.size) - decoder = BasicRNNDecoder(vocab_size=data.vocab.size) - - # Teacher-forcing decoding - outputs_1, _, _ = decoder( - decoding_strategy='train_greedy', - inputs=embedder(data_batch['text_ids']), - sequence_length=data_batch['length']-1) - - # Random sample decoding. 
Gets 100 sequence samples
-                outputs_2, _, sequence_length = decoder(
-                    decoding_strategy='infer_sample',
-                    start_tokens=[data.vocab.bos_token_id]*100,
-                    end_token=data.vocab.eos_token_id,
-                    embedding=embedder,
-                    max_decoding_length=60)
-
-        2. The :attr:`helper` argument: An instance of subclass of \
-           :class:`texar.tf.modules.Helper`. This provides a superset of \
-           the decoding strategies above, for example:
-
-           - :class:`~texar.tf.modules.TrainingHelper` corresponding to the \
-             "train_greedy" strategy.
-           - :class:`~texar.tf.modules.GreedyEmbeddingHelper` and \
-             :class:`~texar.tf.modules.SampleEmbeddingHelper` corresponding to \
-             the "infer_greedy" and "infer_sample" strategies, respectively.
-           - :class:`~texar.tf.modules.TopKSampleEmbeddingHelper` for Top-K \
-             sample decoding.
-           - :class:`ScheduledEmbeddingTrainingHelper` and \
-             :class:`ScheduledOutputTrainingHelper` for scheduled \
-             sampling.
-           - :class:`~texar.tf.modules.SoftmaxEmbeddingHelper` and \
-             :class:`~texar.tf.modules.GumbelSoftmaxEmbeddingHelper` for \
-             soft decoding and gradient backpropagation.
-
-           Helpers give the maximal flexibility in configuring the decoding \
-           strategy.
-
-           Example:
-
-           .. code-block:: python
-
-               embedder = WordEmbedder(vocab_size=data.vocab.size)
-               decoder = BasicRNNDecoder(vocab_size=data.vocab.size)
-
-               # Teacher-forcing decoding, same as above with
-               # `decoding_strategy='train_greedy'`
-               helper_1 = tx.modules.TrainingHelper(
-                   inputs=embedder(data_batch['text_ids']),
-                   sequence_length=data_batch['length']-1)
-               outputs_1, _, _ = decoder(helper=helper_1)
-
-               # Gumbel-softmax decoding
-               helper_2 = GumbelSoftmaxEmbeddingHelper(
-                   embedding=embedder,
-                   start_tokens=[data.vocab.bos_token_id]*100,
-                   end_token=data.vocab.eos_token_id,
-                   tau=0.1)
-               outputs_2, _, sequence_length = decoder(
-                   max_decoding_length=60, helper=helper_2)
-
-        3. :attr:`hparams["helper_train"]` and :attr:`hparams["helper_infer"]`:\
-           Specifying the helper through hyperparameters. Training and \
-           inference strategies are toggled based on :attr:`mode`. \
-           Appropriate arguments (e.g., :attr:`inputs`, :attr:`start_tokens`, \
-           etc) are selected to construct the helper. Additional arguments \
-           for the helper constructor can be provided either through \
-           :attr:`**kwargs`, or through \
-           :attr:`hparams["helper_train/infer"]["kwargs"]`.
-
-           This method is used only when both :attr:`decoding_strategy` and \
-           :attr:`helper` are `None`.
-
-           Example:
-
-           .. code-block:: python
-
-               h = {
-                   "helper_infer": {
-                       "type": "GumbelSoftmaxEmbeddingHelper",
-                       "kwargs": { "tau": 0.1 }
-                   }
-               }
-               embedder = WordEmbedder(vocab_size=data.vocab.size)
-               decoder = BasicRNNDecoder(vocab_size=data.vocab.size, hparams=h)
-
-               # Gumbel-softmax decoding
-               output, _, _ = decoder(
-                   decoding_strategy=None, # Set to `None` explicitly
-                   embedding=embedder,
-                   start_tokens=[data.vocab.bos_token_id]*100,
-                   end_token=data.vocab.eos_token_id,
-                   max_decoding_length=60,
-                   mode=tf.estimator.ModeKeys.PREDICT)
-                   # PREDICT mode also shuts down dropout
-
-        Args:
-            decoding_strategy (str): A string specifying the decoding
-                strategy. Different arguments are required based on the
-                strategy.
-                Ignored if :attr:`helper` is given.
-            initial_state (optional): Initial state of decoding.
-                If `None` (default), zero state is used.
-
-            inputs (optional): Input tensors for teacher forcing decoding.
-                Used when `decoding_strategy` is set to "train_greedy", or
-                when a `hparams`-configured helper is used.
-
-                - If :attr:`embedding` is `None`, `inputs` is directly \
-                fed to the decoder. E.g., in `"train_greedy"` strategy, \
-                `inputs` must be a 3D Tensor of shape \
-                `[batch_size, max_time, emb_dim]` (or \
-                `[max_time, batch_size, emb_dim]` if `input_time_major`==True).
-                - If `embedding` is given, `inputs` is used as indexes \
-                to look up embeddings that are fed to the decoder. \
-                E.g., if `embedding` is an instance of \
-                :class:`~texar.tf.modules.WordEmbedder`, \
-                then :attr:`inputs` is usually a 2D int Tensor \
-                `[batch_size, max_time]` (or \
-                `[max_time, batch_size]` if `input_time_major`==True) \
-                containing the token indexes.
-            sequence_length (optional): A 1D int Tensor containing the
-                sequence length of :attr:`inputs`.
-                Used when `decoding_strategy="train_greedy"` or a
-                `hparams`-configured helper is used.
-            embedding (optional): Embedding used when:
-
-                - "infer_greedy" or "infer_sample" `decoding_strategy` is \
-                used. This can be a callable or the `params` argument for \
-                :tf_main:`embedding_lookup `. \
-                If a callable, it can take a vector tensor of token `ids`, \
-                or take two arguments (`ids`, `times`), where `ids` \
-                is a vector tensor of token ids, and `times` is a vector tensor\
-                of time steps (i.e., position ids). The latter case can be used\
-                when :attr:`embedding` is a combination of word embedding and\
-                position embedding. `embedding` is required in this case.
-                - "train_greedy" `decoding_strategy` is used.\
-                This can be a callable or the `params` argument for \
-                :tf_main:`embedding_lookup `. \
-                If a callable, it can take :attr:`inputs` and return \
-                the input embedding. `embedding` is optional in this case.
-            start_tokens (optional): An int Tensor of shape `[batch_size]`,
-                the start tokens. Used when `decoding_strategy="infer_greedy"`
-                or `"infer_sample"`, or when the helper specified in `hparams`
-                is used.
-
-                Example:
-
-                    .. code-block:: python
-
-                        data = tx.data.MonoTextData(hparams)
-                        iterator = DataIterator(data)
-                        batch = iterator.get_next()
-
-                        bos_token_id = data.vocab.bos_token_id
-                        start_tokens=tf.ones_like(batch['length'])*bos_token_id
-
-            end_token (optional): An int 0D Tensor, the token that marks the
-                end of decoding.
-                Used when `decoding_strategy="infer_greedy"` or
-                `"infer_sample"`, or when the helper specified in `hparams`
-                is used.
-            softmax_temperature (optional): A float 0D Tensor, value to divide
-                the logits by before computing the softmax. Larger values
-                (above 1.0) result in more random samples. Must be > 0. If
-                `None`, 1.0 is used.
-                Used when `decoding_strategy="infer_sample"`.
-            max_decoding_length: An int scalar Tensor indicating the maximum
-                allowed number of decoding steps. If `None` (default), either
-                `hparams["max_decoding_length_train"]` or
-                `hparams["max_decoding_length_infer"]` is used
-                according to :attr:`mode`.
-            impute_finished (bool): If `True`, then states for batch
-                entries which are marked as finished get copied through and
-                the corresponding outputs get zeroed out. This causes some
-                slowdown at each time step, but ensures that the final state
-                and outputs have the correct values and that backprop ignores
-                time steps that were marked as finished.
-            output_time_major (bool): If `True`, outputs are returned as
-                time major tensors. If `False` (default), outputs are returned
-                as batch major tensors.
-            input_time_major (optional): Whether the :attr:`inputs` tensor is
-                time major.
-                Used when `decoding_strategy="train_greedy"` or a
-                `hparams`-configured helper is used.
-            helper (optional): An instance of
-                :class:`texar.tf.modules.Helper`
-                that defines the decoding strategy. If given,
-                `decoding_strategy` and helper configs in :attr:`hparams`
-                are ignored.
-            mode (str, optional): A string taking value in
-                :tf_main:`tf.estimator.ModeKeys `. If
-                `TRAIN`, training related hyperparameters are used (e.g.,
-                `hparams['max_decoding_length_train']`), otherwise,
-                inference related hyperparameters are used (e.g.,
-                `hparams['max_decoding_length_infer']`).
-                If `None` (default), `TRAIN` mode is used.
-            **kwargs: Other keyword arguments for constructing helpers
-                defined by `hparams["helper_train"]` or
-                `hparams["helper_infer"]`.
-
-        Returns:
-            `(outputs, final_state, sequence_lengths)`, where
-
-            - **`outputs`**: an object containing the decoder output on all \
-            time steps.
-            - **`final_state`**: the cell state of the final time step.
-            - **`sequence_lengths`**: an int Tensor of shape `[batch_size]` \
-            containing the length of each sample.
-        """
-        # Helper
-        if helper is not None:
-            pass
-        elif decoding_strategy is not None:
-            if decoding_strategy == "train_greedy":
-                helper = rnn_decoder_helpers._get_training_helper(
-                    inputs, sequence_length, embedding, input_time_major)
-            elif decoding_strategy == "infer_greedy":
-                helper = tx_helper.GreedyEmbeddingHelper(
-                    embedding, start_tokens, end_token)
-            elif decoding_strategy == "infer_sample":
-                helper = tx_helper.SampleEmbeddingHelper(
-                    embedding, start_tokens, end_token, softmax_temperature)
-            else:
-                raise ValueError(
-                    "Unknown decoding strategy: {}".format(decoding_strategy))
-        else:
-            if is_train_mode_py(mode):
-                kwargs_ = copy.copy(self._hparams.helper_train.kwargs.todict())
-                helper_type = self._hparams.helper_train.type
-            else:
-                kwargs_ = copy.copy(self._hparams.helper_infer.kwargs.todict())
-                helper_type = self._hparams.helper_infer.type
-            kwargs_.update({
-                "inputs": inputs,
-                "sequence_length": sequence_length,
-                "time_major": input_time_major,
-                "embedding": embedding,
-                "start_tokens": start_tokens,
-                "end_token": end_token,
-                "softmax_temperature": softmax_temperature})
-            kwargs_.update(kwargs)
-            helper = rnn_decoder_helpers.get_helper(helper_type, **kwargs_)
-        self._helper = helper
-
-        # Initial state
-        if initial_state is not None:
-            self._initial_state = initial_state
-        else:
-            self._initial_state = self.zero_state(
-                batch_size=self.batch_size, dtype=tf.float32)
-
-        # Maximum decoding length
-        max_l = max_decoding_length
-        if max_l is None:
-            max_l_train = self._hparams.max_decoding_length_train
-            if max_l_train is None:
-                max_l_train = utils.MAX_SEQ_LENGTH
-            max_l_infer = self._hparams.max_decoding_length_infer
-            if max_l_infer is None:
-                max_l_infer = utils.MAX_SEQ_LENGTH
-            max_l = tf.cond(is_train_mode(mode),
-                            lambda: max_l_train, lambda: max_l_infer)
-        self.max_decoding_length = max_l
-        # Decode
-        outputs, final_state, sequence_lengths = dynamic_decode(
-            decoder=self, impute_finished=impute_finished,
-            maximum_iterations=max_l, output_time_major=output_time_major)
-
-        if not self._built:
-            self._add_internal_trainable_variables()
-            # Add trainable variables of `self._cell` which may be
-            # constructed externally.
-            self._add_trainable_variable(
-                layers.get_rnn_cell_trainable_variables(self._cell))
-            if isinstance(self._output_layer, tf.layers.Layer):
-                self._add_trainable_variable(
-                    self._output_layer.trainable_variables)
-            # Add trainable variables of `self._beam_search_rnn_cell` which
-            # may already be constructed and used.
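-            # (For `AttentionRNNDecoder`, the beam search cell is a separate
-            # `AttentionWrapper` built under the same variable scope with
-            # `reuse=True`; see `_get_beam_search_cell`.)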
- if self._beam_search_cell is not None: - self._add_trainable_variable( - self._beam_search_cell.trainable_variables) - - self._built = True - - return outputs, final_state, sequence_lengths - - def _get_beam_search_cell(self, **kwargs): - self._beam_search_cell = self._cell - return self._cell - - def _rnn_output_size(self): - size = self._cell.output_size - if self._output_layer is tf.identity: - return size - else: - # To use layer's compute_output_shape, we need to convert the - # RNNCell's output_size entries into shapes with an unknown - # batch size. We then pass this through the layer's - # compute_output_shape and read off all but the first (batch) - # dimensions to get the output size of the rnn with the layer - # applied to the top. - output_shape_with_unknown_batch = nest.map_structure( - lambda s: tensor_shape.TensorShape([None]).concatenate(s), - size) - layer_output_shape = self._output_layer.compute_output_shape( - output_shape_with_unknown_batch) - return nest.map_structure(lambda s: s[1:], layer_output_shape) - - @property - def batch_size(self): - return self._helper.batch_size - - @property - def output_size(self): - """Output size of one step. - """ - raise NotImplementedError - - @property - def output_dtype(self): - """Types of output of one step. - """ - raise NotImplementedError - - def initialize(self, name=None): - # Inherits from TFDecoder - # All RNN decoder classes must implement this - raise NotImplementedError - - def step(self, time, inputs, state, name=None): - # Inherits from TFDecoder - # All RNN decoder classes must implement this - raise NotImplementedError - - def finalize(self, outputs, final_state, sequence_lengths): - # Inherits from TFDecoder - # All RNN decoder classes must implement this - raise NotImplementedError - - @property - def cell(self): - """The RNN cell. - """ - return self._cell - - def zero_state(self, batch_size, dtype): - """Zero state of the RNN cell. - Equivalent to :attr:`decoder.cell.zero_state`. - """ - return self._cell.zero_state( - batch_size=batch_size, dtype=dtype) - - @property - def state_size(self): - """The state size of decoder cell. - Equivalent to :attr:`decoder.cell.state_size`. - """ - return self.cell.state_size - - @property - def vocab_size(self): - """The vocab size. - """ - return self._vocab_size - - @property - def output_layer(self): - """The output layer. - """ - return self._output_layer diff --git a/texar/tf/modules/decoders/rnn_decoder_helpers.py b/texar/tf/modules/decoders/rnn_decoder_helpers.py deleted file mode 100644 index 5507d2a5..00000000 --- a/texar/tf/modules/decoders/rnn_decoder_helpers.py +++ /dev/null @@ -1,480 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Various helper classes and utilities for RNN decoders. 
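-
-Helpers encapsulate a decoding strategy: they produce the first inputs,
-sample ids from the step outputs, and compute the inputs of the next step.
-A decoder selects its helper through the ``helper_train``/``helper_infer``
-hyperparameters defined below. For example (a minimal sketch; a
-`BasicRNNDecoder` and vocabulary are assumed to be built elsewhere):
-
-.. code-block:: python
-
-    h = {
-        "helper_infer": {
-            "type": "TopKSampleEmbeddingHelper",
-            "kwargs": {"top_k": 5}
-        }
-    }
-    decoder = BasicRNNDecoder(vocab_size=vocab_size, hparams=h)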
-""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf -from tensorflow.python.ops import array_ops -from tensorflow_probability import distributions as tfpd - -from texar.tf.modules.decoders.tf_helpers import \ - Helper, TrainingHelper, GreedyEmbeddingHelper -from texar.tf.modules.embedders.embedder_utils import soft_embedding_lookup -from texar.tf.utils import utils - -# pylint: disable=not-context-manager, too-many-arguments -# pylint: disable=too-many-instance-attributes - -__all__ = [ - "default_helper_train_hparams", - "default_helper_infer_hparams", - "get_helper", - "_get_training_helper", - "TopKSampleEmbeddingHelper", - "SoftmaxEmbeddingHelper", - "GumbelSoftmaxEmbeddingHelper", -] - - -def default_helper_train_hparams(): - """Returns default hyperparameters of an RNN decoder helper in the training - phase. - - See also :meth:`~texar.tf.modules.decoders.rnn_decoder_helpers.get_helper` - for information of the hyperparameters. - - Returns: - dict: A dictionary with following structure and values: - - .. code-block:: python - - { - # The `helper_type` argument for `get_helper`, i.e., the name - # or full path to the helper class. - "type": "TrainingHelper", - - # The `**kwargs` argument for `get_helper`, i.e., additional - # keyword arguments for constructing the helper. - "kwargs": {} - } - """ - return { - "type": "TrainingHelper", - "kwargs": {} - } - - -def default_helper_infer_hparams(): - """Returns default hyperparameters of an RNN decoder helper in the inference - phase. - - See also :meth:`~texar.tf.modules.decoders.rnn_decoder_helpers.get_helper` - for information of the hyperparameters. - - Returns: - dict: A dictionary with following structure and values: - - .. code-block:: python - - { - # The `helper_type` argument for `get_helper`, i.e., the name - # or full path to the helper class. - "type": "SampleEmbeddingHelper", - - # The `**kwargs` argument for `get_helper`, i.e., additional - # keyword arguments for constructing the helper. - "kwargs": {} - } - """ - return { - "type": "SampleEmbeddingHelper", - "kwargs": {} - } - - -def get_helper(helper_type, - inputs=None, - sequence_length=None, - embedding=None, - start_tokens=None, - end_token=None, - **kwargs): - """Creates a Helper instance. - - Args: - helper_type: A :tf_main:`Helper ` class, its - name or module path, or a class instance. If a class instance - is given, it is returned directly. - inputs (optional): Inputs to the RNN decoder, e.g., ground truth - tokens for teacher forcing decoding. - sequence_length (optional): A 1D int Tensor containing the - sequence length of :attr:`inputs`. - embedding (optional): A callable that takes a vector tensor of - indexes (e.g., an instance of subclass of - :class:`~texar.tf.modules.EmbedderBase`), or the `params` argument - for `embedding_lookup` (e.g., the embedding Tensor). - start_tokens (optional): A int Tensor of shape `[batch_size]`, - the start tokens. - end_token (optional): A int 0D Tensor, the token that marks end - of decoding. - **kwargs: Additional keyword arguments for constructing the helper. - - Returns: - A helper instance. 
- """ - module_paths = [ - 'texar.tf.modules.decoders.rnn_decoder_helpers', - 'texar.tf.modules.decoders.tf_helpers', - # 'tensorflow.contrib.seq2seq', - 'texar.tf.custom'] - class_kwargs = {"inputs": inputs, - "sequence_length": sequence_length, - "embedding": embedding, - "start_tokens": start_tokens, - "end_token": end_token} - class_kwargs.update(kwargs) - return utils.check_or_get_instance_with_redundant_kwargs( - helper_type, class_kwargs, module_paths) - - -def _get_training_helper( # pylint: disable=invalid-name - inputs, sequence_length, embedding=None, time_major=False, name=None): - """Returns an instance of :tf_main:`TrainingHelper - ` given embeddings. - - Args: - inputs: If :attr:`embedding` is given, this is sequences of input - token indexes. If :attr:`embedding` is `None`, this is passed to - TrainingHelper directly. - sequence_length (1D Tensor): Lengths of input token sequences. - embedding (optional): The `params` argument of - :tf_main:`tf.nn.embedding_lookup - ` (e.g., the embedding Tensor); or a callable - that takes a vector of integer indexes and returns respective - embedding (e.g., an instance of subclass of - :class:`~texar.tf.modules.EmbedderBase`). - time_major (bool): Whether the tensors in `inputs` are time major. - If `False` (default), they are assumed to be batch major. - name (str, optional): Name scope for any created operations. - - Returns: - An instance of TrainingHelper. - - Raises: - ValueError: if `sequence_length` is not a 1D tensor. - """ - if embedding is None: - return TrainingHelper(inputs=inputs, - sequence_length=sequence_length, - time_major=time_major, - name=name) - - with tf.name_scope(name, "TrainingHelper", [embedding, inputs]): - if callable(embedding): - embedding_fn = embedding - else: - embedding_fn = ( - lambda ids: tf.nn.embedding_lookup(embedding, ids)) - emb_inputs = embedding_fn(inputs) - helper = TrainingHelper(inputs=emb_inputs, - sequence_length=sequence_length, - time_major=time_major, - name=name) - return helper - - -def _top_k_logits(logits, k): - """Adapted from - https://github.com/openai/gpt-2/blob/master/src/sample.py#L63-L77 - """ - if k == 0: - # no truncation - return logits - - def _top_k(): - values, _ = tf.nn.top_k(logits, k=k) - min_values = values[:, -1, tf.newaxis] - return tf.where( - logits < min_values, - tf.ones_like(logits, dtype=logits.dtype) * -1e10, - logits, - ) - return tf.cond( - tf.equal(k, 0), - lambda: logits, - lambda: _top_k(), - ) - - -class TopKSampleEmbeddingHelper(GreedyEmbeddingHelper): - """A helper for use during inference. - - Samples from `top_k` most likely candidates from a vocab distribution, - and passes the result through an embedding layer to get the next input. - """ - - def __init__(self, embedding, start_tokens, end_token, top_k=10, - softmax_temperature=None, seed=None): - """Initializer. - - Args: - embedding: A callable or the `params` argument for - `embedding_lookup`. If a callable, it can take a vector tensor - of token `ids`, or take two arguments (`ids`, `times`), - where `ids` is a vector - tensor of token ids, and `times` is a vector tensor of current - time steps (i.e., position ids). The latter case can be used - when attr:`embedding` is a combination of word embedding and - position embedding. - start_tokens: `int32` vector shaped `[batch_size]`, the start - tokens. - end_token: `int32` scalar, the token that marks end of decoding. - top_k: `int32` scalar tensor. Number of top candidates to sample - from. Must be `>=0`. 
If set to 0, samples from all candidates - (i.e., regular random sample decoding). - softmax_temperature (optional): `float32` scalar, value to - divide the logits by before computing the softmax. Larger values - (above 1.0) result in more random samples, while smaller values - push the sampling distribution towards the argmax. Must be - strictly greater than 0. Defaults to 1.0. - seed (optional): The sampling seed. - - Raises: - ValueError: if `start_tokens` is not a 1D tensor or `end_token` is - not a scalar. - """ - super(TopKSampleEmbeddingHelper, self).__init__( - embedding, start_tokens, end_token) - self._top_k = top_k - self._softmax_temperature = softmax_temperature - self._seed = seed - - def sample(self, time, outputs, state, name=None): - """Gets a sample for one step.""" - del time, state # unused by sample_fn - # Outputs are logits, we sample from the top_k candidates - if not isinstance(outputs, tf.Tensor): - raise TypeError("Expected outputs to be a single Tensor, got: %s" % - type(outputs)) - if self._softmax_temperature is None: - logits = outputs - else: - logits = outputs / self._softmax_temperature - - logits = _top_k_logits(logits, k=self._top_k) - - sample_id_sampler = tfpd.Categorical(logits=logits) - sample_ids = sample_id_sampler.sample(seed=self._seed) - - return sample_ids - - -class SoftmaxEmbeddingHelper(Helper): - """A helper that feeds softmax probabilities over vocabulary - to the next step. - Uses the softmax probability vector to pass through word embeddings to - get the next input (i.e., a mixed word embedding). - - A subclass of - :tf_main:`Helper `. - Used as a helper to :class:`~texar.tf.modules.RNNDecoderBase` :meth:`_build` - in inference mode. - - Args: - embedding: A callable or the `params` argument for - :tf_main:`tf.nn.embedding_lookup `. - If a callable, it can take a float tensor named `soft_ids` which is - a distribution over indexes. For example, the shape of the tensor - is typically `[batch_size, vocab_size]`. The callable can also - take two arguments (`soft_ids`, `times`), where `soft_ids` is - as above, and `times` is an int vector tensor of current - time steps (i.e., position ids). The latter case can be used - when attr:`embedding` is a combination of word embedding and - position embedding. - start_tokens: An int tensor shaped `[batch_size]`. The - start tokens. - end_token: An int scalar tensor. The token that marks end of - decoding. - tau: A float scalar tensor, the softmax temperature. - embedding_size (optional): An int scalar tensor, the number of - embedding vectors. Usually it is the vocab size. Required if - :attr:`embedding` is a callable. - stop_gradient (bool): Whether to stop the gradient backpropagation - when feeding softmax vector to the next step. - use_finish (bool): Whether to stop decoding once `end_token` is - generated. If `False`, decoding will continue until - `max_decoding_length` of the decoder is reached. 
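-
-    Example (a minimal sketch; the decoder, `embedder`, and token ids are
-    assumed to be built elsewhere):
-
-    .. code-block:: python
-
-        helper = SoftmaxEmbeddingHelper(
-            embedding=embedder,  # a callable WordEmbedder
-            start_tokens=[bos_token_id] * batch_size,
-            end_token=eos_token_id,
-            tau=0.5,
-            embedding_size=vocab_size)  # required for callable embeddings
-        outputs, _, _ = decoder(helper=helper, max_decoding_length=60)
-        # outputs.sample_id holds soft distributions of shape
-        # `[batch_size, max_time, vocab_size]`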
- """ - - def __init__(self, embedding, start_tokens, end_token, tau, - embedding_size=None, stop_gradient=False, use_finish=True): - if callable(embedding): - self._embedding_fn = embedding - - if embedding_size is None: - raise ValueError('`embedding_size` must be provided if ' - '`embedding` is a callable.') - self._embedding_size = tf.convert_to_tensor( - embedding_size, dtype=tf.int32, name="embedding_size") - else: - self._embedding_fn = ( - lambda soft_ids: soft_embedding_lookup(embedding, soft_ids)) - self._embedding_size = tf.shape(embedding)[0] - - self._start_tokens = tf.convert_to_tensor( - start_tokens, dtype=tf.int32, name="start_tokens") - self._end_token = tf.convert_to_tensor( - end_token, dtype=tf.int32, name="end_token") - if self._start_tokens.get_shape().ndims != 1: - raise ValueError("start_tokens must be a vector") - self._batch_size = array_ops.size(start_tokens) - if self._end_token.get_shape().ndims != 0: - raise ValueError("end_token must be a scalar") - - soft_start_tokens = tf.one_hot( - self._start_tokens, self._embedding_size, dtype=tf.float32) - self._embedding_args_cnt = len(utils.get_args(self._embedding_fn)) - if self._embedding_args_cnt == 1: - self._start_inputs = self._embedding_fn(soft_ids=soft_start_tokens) - elif self._embedding_args_cnt == 2: - # Position index is 0 in the beginning - times = tf.zeros([self._batch_size], dtype=tf.int32) - self._start_inputs = self._embedding_fn( - soft_ids=soft_start_tokens, times=times) - else: - raise ValueError('`embedding` should expect 1 or 2 arguments.') - - self._batch_size = tf.size(self._start_tokens) - self._tau = tau - self._stop_gradient = stop_gradient - self._use_finish = use_finish - - @property - def batch_size(self): - return self._batch_size - - @property - def sample_ids_dtype(self): - return tf.float32 - - @property - def sample_ids_shape(self): - # A trick to convert a scalar Tensor `self._embedding_size` to - # a `TensorShape` - oh = tf.one_hot(0, self._embedding_size) - return oh.get_shape()[:1] - - def initialize(self, name=None): - finished = tf.tile([False], [self._batch_size]) - return (finished, self._start_inputs) - - def sample(self, time, outputs, state, name=None): - """Returns `sample_id` which is softmax distributions over vocabulary - with temperature `tau`. 
Shape = `[batch_size, vocab_size]` - """ - sample_ids = tf.nn.softmax(outputs / self._tau) - return sample_ids - - def next_inputs(self, time, outputs, state, sample_ids, name=None): - if self._use_finish: - hard_ids = tf.argmax(sample_ids, axis=-1, output_type=tf.int32) - finished = tf.equal(hard_ids, self._end_token) - else: - finished = tf.tile([False], [self._batch_size]) - all_finished = tf.reduce_all(finished) - - if self._stop_gradient: - sample_ids = tf.stop_gradient(sample_ids) - - if self._embedding_args_cnt == 1: - del time, outputs # unused by next_inputs_fn - next_inputs = tf.cond( - all_finished, - # If we're finished, the next_inputs value doesn't matter - lambda: self._start_inputs, - lambda: self._embedding_fn(soft_ids=sample_ids)) - elif self._embedding_args_cnt == 2: - # Prepare the position embedding of the next step - times = tf.ones(self._batch_size, dtype=tf.int32) * (time + 1) - next_inputs = tf.cond( - all_finished, - # If we're finished, the next_inputs value doesn't matter - lambda: self._start_inputs, - lambda: self._embedding_fn(soft_ids=sample_ids, times=times)) - - return (finished, next_inputs, state) - - -class GumbelSoftmaxEmbeddingHelper(SoftmaxEmbeddingHelper): - """A helper that feeds gumbel softmax sample to the next step. - Uses the gumbel softmax vector to pass through word embeddings to - get the next input (i.e., a mixed word embedding). - - A subclass of - :tf_main:`Helper `. - Used as a helper to :class:`~texar.tf.modules.RNNDecoderBase` :meth:`_build` - in inference mode. - - Same as :class:`~texar.tf.modules.SoftmaxEmbeddingHelper` except that here - gumbel softmax (instead of softmax) is used. - - Args: - embedding: A callable or the `params` argument for - :tf_main:`tf.nn.embedding_lookup `. - If a callable, it can take a float tensor named `soft_ids` which is - a distribution over indexes. For example, the shape of the tensor - is typically `[batch_size, vocab_size]`. The callable can also - take two arguments (`soft_ids`, `times`), where `soft_ids` is - as above, and `times` is an int vector tensor of current - time steps (i.e., position ids). The latter case can be used - when attr:`embedding` is a combination of word embedding and - position embedding. - start_tokens: An int tensor shaped `[batch_size]`. The - start tokens. - end_token: An int scalar tensor. The token that marks end of - decoding. - tau: A float scalar tensor, the softmax temperature. - embedding_size (optional): An int scalar tensor, the number of - embedding vectors. Usually it is the vocab size. Required if - :attr:`embedding` is a callable. - straight_through (bool): Whether to use straight through gradient - between time steps. If `True`, a single token with highest - probability (i.e., greedy sample) is fed to the next step and - gradient is computed using straight through. If `False` (default), - the soft gumbel-softmax distribution is fed to the next step. - stop_gradient (bool): Whether to stop the gradient backpropagation - when feeding softmax vector to the next step. - use_finish (bool): Whether to stop decoding once `end_token` is - generated. If `False`, decoding will continue until - `max_decoding_length` of the decoder is reached. 
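-
-    Example (a minimal sketch, mirroring the
-    :class:`~texar.tf.modules.SoftmaxEmbeddingHelper` example; names are
-    assumed to be built elsewhere):
-
-    .. code-block:: python
-
-        helper = GumbelSoftmaxEmbeddingHelper(
-            embedding=embedder,
-            start_tokens=[bos_token_id] * batch_size,
-            end_token=eos_token_id,
-            tau=0.1,
-            embedding_size=vocab_size,
-            straight_through=True)  # one-hot forward pass, soft gradients
-        outputs, _, _ = decoder(helper=helper, max_decoding_length=60)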
- """ - def __init__(self, embedding, start_tokens, end_token, tau, - embedding_size=None, straight_through=False, - stop_gradient=False, use_finish=True): - super(GumbelSoftmaxEmbeddingHelper, self).__init__( - embedding, start_tokens, end_token, tau, embedding_size, - stop_gradient, use_finish) - self._straight_through = straight_through - - def sample(self, time, outputs, state, name=None): - """Returns `sample_id` of shape `[batch_size, vocab_size]`. If - `straight_through` is False, this is gumbel softmax distributions over - vocabulary with temperature `tau`. If `straight_through` is True, - this is one-hot vectors of the greedy samples. - """ - sample_ids = tf.nn.softmax(outputs / self._tau) - sample_ids = tfpd.RelaxedOneHotCategorical( - self._tau, logits=outputs).sample() - if self._straight_through: - size = tf.shape(sample_ids)[-1] - sample_ids_hard = tf.cast( - tf.one_hot(tf.argmax(sample_ids, -1), size), sample_ids.dtype) - sample_ids = tf.stop_gradient(sample_ids_hard - sample_ids) \ - + sample_ids - return sample_ids diff --git a/texar/tf/modules/decoders/rnn_decoder_helpers_test.py b/texar/tf/modules/decoders/rnn_decoder_helpers_test.py deleted file mode 100644 index 1e1c199c..00000000 --- a/texar/tf/modules/decoders/rnn_decoder_helpers_test.py +++ /dev/null @@ -1,172 +0,0 @@ -""" -Unit tests for decoder helpers. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import tensorflow as tf - -from texar.tf.modules.decoders.rnn_decoder_helpers import \ - SoftmaxEmbeddingHelper, GumbelSoftmaxEmbeddingHelper -from texar.tf.modules.decoders.tf_helpers import GreedyEmbeddingHelper -from texar.tf.modules.decoders.rnn_decoders import BasicRNNDecoder -from texar.tf.modules.embedders.embedders import WordEmbedder -from texar.tf.modules.embedders.position_embedders import PositionEmbedder - -# pylint: disable=no-member, too-many-locals, too-many-instance-attributes -# pylint: disable=too-many-arguments, protected-access, redefined-variable-type - - -class HelpersTest(tf.test.TestCase): - """Tests :class:`~texar.tf.modules.decoders.rnn_decoders.BasicRNNDecoder`. - """ - - def setUp(self): - tf.test.TestCase.setUp(self) - self._batch_size = 16 - self._vocab_size = 4 - self._start_tokens = [self._vocab_size - 2] * self._batch_size - self._end_token = self._vocab_size - 1 - self._max_time = 8 - self._emb_dim = 100 - self._inputs = tf.random_uniform( - [self._batch_size, self._max_time, self._emb_dim], - maxval=1., dtype=tf.float32) - self._embedding = tf.random_uniform( - [self._vocab_size, self._emb_dim], maxval=1., dtype=tf.float32) - self._max_seq_length = 10 - - def test_softmax_embedding_helpers(self): - """Tests softmax helpers. 
- """ - - def _test_fn(helper): - _, next_inputs, _ = helper.next_inputs( - time=1, - outputs=tf.ones([self._batch_size, self._vocab_size]), - state=None, - sample_ids=tf.ones([self._batch_size, self._vocab_size])) - - self.assertEqual(helper.sample_ids_shape, - tf.TensorShape(self._vocab_size)) - self.assertEqual(next_inputs.get_shape(), - tf.TensorShape([self._batch_size, self._emb_dim])) - - # Test in an RNN decoder - output_layer = tf.layers.Dense(self._vocab_size) - decoder = BasicRNNDecoder(vocab_size=self._vocab_size, - output_layer=output_layer) - outputs, final_state, sequence_lengths = decoder( - helper=helper, max_decoding_length=self._max_seq_length) - - cell_dim = decoder.hparams.rnn_cell.kwargs.num_units - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_, final_state_, sequence_lengths_ = sess.run( - [outputs, final_state, sequence_lengths]) - max_length = max(sequence_lengths_) - self.assertEqual( - outputs_.logits.shape, - (self._batch_size, max_length, self._vocab_size)) - self.assertEqual( - outputs_.sample_id.shape, - (self._batch_size, max_length, self._vocab_size)) - self.assertEqual(final_state_[0].shape, - (self._batch_size, cell_dim)) - - # SoftmaxEmbeddingHelper - - # case-(1) - helper = SoftmaxEmbeddingHelper( - self._embedding, self._start_tokens, self._end_token, 0.7) - _test_fn(helper) - - # case-(2) - embedder = WordEmbedder(self._embedding) - helper = SoftmaxEmbeddingHelper( - embedder, self._start_tokens, self._end_token, 0.7, - embedding_size=self._vocab_size) - _test_fn(helper) - - # case-(3) - word_embedder = WordEmbedder(self._embedding) - pos_embedder = PositionEmbedder(position_size=self._max_seq_length) - - def _emb_fn(soft_ids, times): - return word_embedder(soft_ids=soft_ids) + pos_embedder(times) - helper = SoftmaxEmbeddingHelper( - _emb_fn, self._start_tokens, self._end_token, 0.7, - embedding_size=self._vocab_size) - _test_fn(helper) - - # GumbelSoftmaxEmbeddingHelper - - # case-(1) - helper = GumbelSoftmaxEmbeddingHelper( - self._embedding, self._start_tokens, self._end_token, 0.7) - _test_fn(helper) - - def test_infer_helpers(self): - """Tests inference helpers. 
- """ - - def _test_fn(helper): - _, next_inputs, _ = helper.next_inputs( - time=1, - outputs=tf.ones([self._batch_size, self._vocab_size]), - state=None, - sample_ids=tf.ones([self._batch_size], dtype=tf.int32)) - - self.assertEqual(helper.sample_ids_shape, - tf.TensorShape([])) - self.assertEqual(next_inputs.get_shape(), - tf.TensorShape([self._batch_size, self._emb_dim])) - - # Test in an RNN decoder - output_layer = tf.layers.Dense(self._vocab_size) - decoder = BasicRNNDecoder(vocab_size=self._vocab_size, - output_layer=output_layer) - outputs, final_state, sequence_lengths = decoder( - helper=helper, max_decoding_length=self._max_seq_length) - - cell_dim = decoder.hparams.rnn_cell.kwargs.num_units - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_, final_state_, sequence_lengths_ = sess.run( - [outputs, final_state, sequence_lengths]) - max_length = max(sequence_lengths_) - self.assertEqual( - outputs_.logits.shape, - (self._batch_size, max_length, self._vocab_size)) - self.assertEqual( - outputs_.sample_id.shape, (self._batch_size, max_length)) - self.assertEqual(final_state_[0].shape, - (self._batch_size, cell_dim)) - - # case-(1) - helper = GreedyEmbeddingHelper( - self._embedding, self._start_tokens, self._end_token) - _test_fn(helper) - - # case-(2) - embedder = WordEmbedder(self._embedding) - helper = GreedyEmbeddingHelper( - embedder, self._start_tokens, self._end_token) - _test_fn(helper) - - # case-(3) - word_embedder = WordEmbedder(self._embedding) - pos_embedder = PositionEmbedder(position_size=self._max_seq_length) - - def _emb_fn(ids, times): - return word_embedder(ids) + pos_embedder(times) - helper = GreedyEmbeddingHelper( - _emb_fn, self._start_tokens, self._end_token) - _test_fn(helper) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/modules/decoders/rnn_decoders.py b/texar/tf/modules/decoders/rnn_decoders.py deleted file mode 100644 index 4f326ef5..00000000 --- a/texar/tf/modules/decoders/rnn_decoders.py +++ /dev/null @@ -1,685 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Various RNN decoders. 
-""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=no-name-in-module, too-many-arguments, too-many-locals -# pylint: disable=not-context-manager, protected-access, invalid-name - -import collections -import copy - -import tensorflow as tf -from tensorflow.contrib.seq2seq import AttentionWrapper -from tensorflow.python.util import nest -from tensorflow.contrib.seq2seq import tile_batch - -from texar.tf.modules.decoders.rnn_decoder_base import RNNDecoderBase -from texar.tf.utils import utils - -__all__ = [ - "BasicRNNDecoderOutput", - "AttentionRNNDecoderOutput", - "BasicRNNDecoder", - "AttentionRNNDecoder" -] - - -class BasicRNNDecoderOutput( - collections.namedtuple("BasicRNNDecoderOutput", - ("logits", "sample_id", "cell_output"))): - """The outputs of basic RNN decoder that include both RNN outputs and - sampled ids at each step. This is also used to store results of all the - steps after decoding the whole sequence. - - Attributes: - logits: The outputs of RNN (at each step/of all steps) by applying the - output layer on cell outputs. E.g., in - :class:`~texar.tf.modules.BasicRNNDecoder` with default - hyperparameters, this is a Tensor of - shape `[batch_size, max_time, vocab_size]` after decoding the - whole sequence. - sample_id: The sampled results (at each step/of all steps). E.g., in - BasicRNNDecoder with decoding strategy of train_greedy, - this is a Tensor - of shape `[batch_size, max_time]` containing the sampled token - indexes of all steps. - cell_output: The output of RNN cell (at each step/of all steps). - This is the results prior to the output layer. E.g., in - BasicRNNDecoder with default - hyperparameters, this is a Tensor of - shape `[batch_size, max_time, cell_output_size]` after decoding - the whole sequence. - """ - pass - - -class AttentionRNNDecoderOutput( - collections.namedtuple( - "AttentionRNNDecoderOutput", - ["logits", "sample_id", "cell_output", - "attention_scores", "attention_context"])): - """The outputs of attention RNN decoders that additionally include - attention results. - - Attributes: - logits: The outputs of RNN (at each step/of all steps) by applying the - output layer on cell outputs. E.g., in - :class:`~texar.tf.modules.AttentionRNNDecoder`, this is a Tensor of - shape `[batch_size, max_time, vocab_size]` after decoding. - sample_id: The sampled results (at each step/of all steps). E.g., in - :class:`~texar.tf.modules.AttentionRNNDecoder` with decoding strategy - of train_greedy, this - is a Tensor of shape `[batch_size, max_time]` containing the - sampled token indexes of all steps. - cell_output: The output of RNN cell (at each step/of all steps). - This is the results prior to the output layer. E.g., in - AttentionRNNDecoder with default - hyperparameters, this is a Tensor of - shape `[batch_size, max_time, cell_output_size]` after decoding - the whole sequence. - attention_scores: A single or tuple of `Tensor`(s) containing the - alignments emitted (at the previous time step/of all time steps) - for each attention mechanism. - attention_context: The attention emitted (at the previous time step/of - all time steps). - """ - pass - - -class BasicRNNDecoder(RNNDecoderBase): - """Basic RNN decoder. - - Args: - cell (RNNCell, optional): An instance of - :tf_main:`RNNCell `. If `None` - (default), a cell is created as specified in - :attr:`hparams`. 
-        cell_dropout_mode (optional): A Tensor taking value of
-            :tf_main:`tf.estimator.ModeKeys `, which
-            toggles dropout in the RNN cell (e.g., activates dropout in
-            TRAIN mode). If `None`, :func:`~texar.tf.global_mode` is used.
-            Ignored if :attr:`cell` is given.
-        vocab_size (int, optional): Vocabulary size. Required if
-            :attr:`output_layer` is `None`.
-        output_layer (optional): An output layer that transforms cell output
-            to logits. This can be:
-
-            - A callable layer, e.g., an instance \
-            of :tf_main:`tf.layers.Layer `.
-            - A tensor. A dense layer will be created using the tensor \
-            as the kernel weights. The bias of the dense layer is determined by\
-            `hparams.output_layer_bias`. This can be used to tie the output \
-            layer with the input embedding matrix, as proposed in \
-            https://arxiv.org/pdf/1608.05859.pdf
-            - `None`. A dense layer will be created based on :attr:`vocab_size`\
-            and `hparams.output_layer_bias`.
-
-            If no output layer after the cell output is needed, set \
-            `(vocab_size=None, output_layer=tf.identity)`.
-        hparams (dict, optional): Hyperparameters. Missing
-            hyperparameters will be set to default values. See
-            :meth:`default_hparams` for the hyperparameter structure and
-            default values.
-
-    See :meth:`~texar.tf.modules.RNNDecoderBase._build` for the inputs and
-    outputs of the decoder. The decoder returns
-    `(outputs, final_state, sequence_lengths)`, where `outputs` is an instance
-    of :class:`~texar.tf.modules.BasicRNNDecoderOutput`.
-
-    Example:
-
-        .. code-block:: python
-
-            embedder = WordEmbedder(vocab_size=data.vocab.size)
-            decoder = BasicRNNDecoder(vocab_size=data.vocab.size)
-
-            # Training loss
-            outputs, _, _ = decoder(
-                decoding_strategy='train_greedy',
-                inputs=embedder(data_batch['text_ids']),
-                sequence_length=data_batch['length']-1)
-
-            loss = tx.losses.sequence_sparse_softmax_cross_entropy(
-                labels=data_batch['text_ids'][:, 1:],
-                logits=outputs.logits,
-                sequence_length=data_batch['length']-1)
-
-            # Inference sample
-            outputs, _, _ = decoder(
-                decoding_strategy='infer_sample',
-                start_tokens=[data.vocab.bos_token_id]*100,
-                end_token=data.vocab.eos_token_id,
-                embedding=embedder,
-                max_decoding_length=60,
-                mode=tf.estimator.ModeKeys.PREDICT)
-
-            sample_id = sess.run(outputs.sample_id)
-            sample_text = tx.utils.map_ids_to_strs(sample_id, data.vocab)
-            print(sample_text)
-            # [
-            #   the first sequence sample .
-            #   the second sequence sample .
-            #   ...
-            # ]
-    """
-
-    def __init__(self,
-                 cell=None,
-                 cell_dropout_mode=None,
-                 vocab_size=None,
-                 output_layer=None,
-                 hparams=None):
-        RNNDecoderBase.__init__(
-            self, cell, vocab_size, output_layer, cell_dropout_mode, hparams)
-
-    @staticmethod
-    def default_hparams():
-        """Returns a dictionary of hyperparameters with default values.
-
-        .. code-block:: python
-
-            {
-                "rnn_cell": default_rnn_cell_hparams(),
-                "max_decoding_length_train": None,
-                "max_decoding_length_infer": None,
-                "helper_train": {
-                    "type": "TrainingHelper",
-                    "kwargs": {}
-                }
-                "helper_infer": {
-                    "type": "SampleEmbeddingHelper",
-                    "kwargs": {}
-                }
-                "name": "basic_rnn_decoder"
-            }
-
-        Here:
-
-        "rnn_cell": dict
-            A dictionary of RNN cell hyperparameters. Ignored if
-            :attr:`cell` is given to the decoder constructor.
-            The default value is defined in
-            :func:`~texar.tf.core.default_rnn_cell_hparams`.
-
-        "max_decoding_length_train": int or None
-            Maximum allowed number of decoding steps in training mode.
-            If `None` (default), decoding is
-            performed until fully done, e.g., encountering the <EOS> token.
-            Ignored if `max_decoding_length` is given when calling
-            the decoder.
-
-        "max_decoding_length_infer": int or None
-            Same as "max_decoding_length_train" but for inference mode.
-
-        "helper_train": dict
-            The hyperparameters of the helper used in training.
-            "type" can be a helper class, its name or module path, or a
-            helper instance. If a class name is given, the class must be
-            from module :tf_main:`tf.contrib.seq2seq `,
-            :mod:`texar.tf.modules`, or :mod:`texar.tf.custom`. This is used
-            only when both the `decoding_strategy` and `helper` arguments are
-            `None` when calling the decoder. See
-            :meth:`~texar.tf.modules.RNNDecoderBase._build` for more details.
-
-        "helper_infer": dict
-            Same as "helper_train" but during inference mode.
-
-        "name": str
-            Name of the decoder.
-
-            The default value is "basic_rnn_decoder".
-        """
-        hparams = RNNDecoderBase.default_hparams()
-        hparams["name"] = "basic_rnn_decoder"
-        return hparams
-
-    def initialize(self, name=None):
-        return self._helper.initialize() + (self._initial_state,)
-
-    def step(self, time, inputs, state, name=None):
-        cell_outputs, cell_state = self._cell(inputs, state)
-        logits = self._output_layer(cell_outputs)
-        sample_ids = self._helper.sample(
-            time=time, outputs=logits, state=cell_state)
-        outputs = BasicRNNDecoderOutput(logits, sample_ids, cell_outputs)
-        return outputs, cell_state
-
-    def next_inputs(self, time, outputs, state):
-        (finished, next_inputs, next_state) = self._helper.next_inputs(
-            time=time,
-            outputs=outputs.logits,
-            state=state,
-            sample_ids=outputs.sample_id)
-        return finished, next_inputs, next_state
-
-    def finalize(self, outputs, final_state, sequence_lengths):
-        return outputs, final_state
-
-    @property
-    def output_size(self):
-        """Output size of one step.
-        """
-        return BasicRNNDecoderOutput(
-            logits=self._rnn_output_size(),
-            sample_id=self._helper.sample_ids_shape,
-            cell_output=self._cell.output_size)
-
-    @property
-    def output_dtype(self):
-        """Types of output of one step.
-        """
-        # Assume the dtype of the cell is the output_size structure
-        # containing the input_state's first component's dtype.
-        # Return that structure and the sample_ids_dtype from the helper.
-        dtype = nest.flatten(self._initial_state)[0].dtype
-        return BasicRNNDecoderOutput(
-            logits=nest.map_structure(lambda _: dtype, self._rnn_output_size()),
-            sample_id=self._helper.sample_ids_dtype,
-            cell_output=nest.map_structure(
-                lambda _: dtype, self._cell.output_size))
-
-
-class AttentionRNNDecoder(RNNDecoderBase):
-    """RNN decoder with attention mechanism.
-
-    Args:
-        memory: The memory to query, e.g., the output of an RNN encoder. This
-            tensor should be shaped `[batch_size, max_time, dim]`.
-        memory_sequence_length (optional): A tensor of shape `[batch_size]`
-            containing the sequence lengths for the batch
-            entries in memory. If provided, the memory tensor rows are masked
-            with zeros for values past the respective sequence lengths.
-        cell (RNNCell, optional): An instance of `RNNCell`. If `None`, a cell
-            is created as specified in :attr:`hparams`.
-        cell_dropout_mode (optional): A Tensor taking value of
-            :tf_main:`tf.estimator.ModeKeys `, which
-            toggles dropout in the RNN cell (e.g., activates dropout in
-            TRAIN mode). If `None`, :func:`~texar.tf.global_mode` is used.
-            Ignored if :attr:`cell` is given.
-        vocab_size (int, optional): Vocabulary size. Required if
-            :attr:`output_layer` is `None`.
-        output_layer (optional): An output layer that transforms cell output
-            to logits. This can be:
-
-            - A callable layer, e.g., an instance \
-            of :tf_main:`tf.layers.Layer `.
-            - A tensor. A dense layer will be created using the tensor \
-            as the kernel weights. The bias of the dense layer is determined by\
-            `hparams.output_layer_bias`. This can be used to tie the output \
-            layer with the input embedding matrix, as proposed in \
-            https://arxiv.org/pdf/1608.05859.pdf
-            - `None`. A dense layer will be created based on :attr:`vocab_size`\
-            and `hparams.output_layer_bias`.
-
-            If no output layer after the cell output is needed, set \
-            `(vocab_size=None, output_layer=tf.identity)`.
-        cell_input_fn (callable, optional): A callable that produces RNN cell
-            inputs. If `None` (default), the default is used:
-            `lambda inputs, attention: tf.concat([inputs, attention], -1)`,
-            which concatenates the regular RNN cell inputs with the attention.
-        hparams (dict, optional): Hyperparameters. Missing
-            hyperparameters will be set to default values. See
-            :meth:`default_hparams` for the hyperparameter structure and
-            default values.
-
-    See :meth:`~texar.tf.modules.RNNDecoderBase._build` for the inputs and
-    outputs of the decoder. The decoder returns
-    `(outputs, final_state, sequence_lengths)`, where `outputs` is an instance
-    of :class:`~texar.tf.modules.AttentionRNNDecoderOutput`.
-
-    Example:
-
-        .. code-block:: python
-
-            # Encodes the source
-            enc_embedder = WordEmbedder(data.source_vocab.size, ...)
-            encoder = UnidirectionalRNNEncoder(...)
-
-            enc_outputs, _ = encoder(
-                inputs=enc_embedder(data_batch['source_text_ids']),
-                sequence_length=data_batch['source_length'])
-
-            # Decodes while attending to the source
-            dec_embedder = WordEmbedder(vocab_size=data.target_vocab.size, ...)
-            decoder = AttentionRNNDecoder(
-                memory=enc_outputs,
-                memory_sequence_length=data_batch['source_length'],
-                vocab_size=data.target_vocab.size)
-
-            outputs, _, _ = decoder(
-                decoding_strategy='train_greedy',
-                inputs=dec_embedder(data_batch['target_text_ids']),
-                sequence_length=data_batch['target_length']-1)
-    """
-    def __init__(self,
-                 memory,
-                 memory_sequence_length=None,
-                 cell=None,
-                 cell_dropout_mode=None,
-                 vocab_size=None,
-                 output_layer=None,
-                 # attention_layer=None, # TODO(zhiting): only valid for tf>=1.0
-                 cell_input_fn=None,
-                 hparams=None):
-        RNNDecoderBase.__init__(
-            self, cell, vocab_size, output_layer, cell_dropout_mode, hparams)
-
-        attn_hparams = self._hparams['attention']
-        attn_kwargs = attn_hparams['kwargs'].todict()
-
-        # Parse the 'probability_fn' argument
-        if 'probability_fn' in attn_kwargs:
-            prob_fn = attn_kwargs['probability_fn']
-            if prob_fn is not None and not callable(prob_fn):
-                prob_fn = utils.get_function(
-                    prob_fn,
-                    ['tensorflow.nn', 'tensorflow.contrib.sparsemax',
-                     'tensorflow.contrib.seq2seq'])
-            attn_kwargs['probability_fn'] = prob_fn
-
-        attn_kwargs.update({
-            "memory_sequence_length": memory_sequence_length,
-            "memory": memory})
-        self._attn_kwargs = attn_kwargs
-        attn_modules = ['tensorflow.contrib.seq2seq', 'texar.tf.custom']
-        # Use variable_scope to ensure all trainable variables created in
-        # the attention mechanism are collected
-        with tf.variable_scope(self.variable_scope):
-            attention_mechanism = utils.check_or_get_instance(
-                attn_hparams["type"], attn_kwargs, attn_modules,
-                classtype=tf.contrib.seq2seq.AttentionMechanism)
-
-        self._attn_cell_kwargs = {
-            "attention_layer_size": attn_hparams["attention_layer_size"],
-            "alignment_history": attn_hparams["alignment_history"],
-            "output_attention": attn_hparams["output_attention"],
-        }
-        self._cell_input_fn = cell_input_fn
-        # Use variable_scope to ensure all trainable variables created in
-        # AttentionWrapper are collected
-        with tf.variable_scope(self.variable_scope):
-            # if attention_layer is not None:
-            #     self._attn_cell_kwargs["attention_layer_size"] = None
-            attn_cell = AttentionWrapper(
-                self._cell,
-                attention_mechanism,
-                cell_input_fn=self._cell_input_fn,
-                # attention_layer=attention_layer,
-                **self._attn_cell_kwargs)
-            self._cell = attn_cell
-
-    @staticmethod
-    def default_hparams():
-        """Returns a dictionary of hyperparameters with default values:
-
-        Common hyperparameters are the same as in
-        :class:`~texar.tf.modules.BasicRNNDecoder` (see
-        :meth:`~texar.tf.modules.BasicRNNDecoder.default_hparams`).
-        Additional hyperparameters are for attention mechanism
-        configuration.
-
-        .. code-block:: python
-
-            {
-                "attention": {
-                    "type": "LuongAttention",
-                    "kwargs": {
-                        "num_units": 256,
-                    },
-                    "attention_layer_size": None,
-                    "alignment_history": False,
-                    "output_attention": True,
-                },
-                # The following hyperparameters are the same as with
-                # `BasicRNNDecoder`
-                "rnn_cell": default_rnn_cell_hparams(),
-                "max_decoding_length_train": None,
-                "max_decoding_length_infer": None,
-                "helper_train": {
-                    "type": "TrainingHelper",
-                    "kwargs": {}
-                }
-                "helper_infer": {
-                    "type": "SampleEmbeddingHelper",
-                    "kwargs": {}
-                }
-                "name": "attention_rnn_decoder"
-            }
-
-        Here:
-
-        "attention": dict
-            Attention hyperparameters, including:
-
-            "type": str or class or instance
-                The attention type. Can be an attention class, its name or
-                module path, or a class instance. The class must be a subclass
-                of :tf_main:`TF AttentionMechanism
-                `. If a class name is
-                given, the class must be from modules
-                :tf_main:`tf.contrib.seq2seq ` or
-                :mod:`texar.tf.custom`.
-
-                Example:
-
-                .. code-block:: python
-
-                    # class name
-                    "type": "LuongAttention"
-                    "type": "BahdanauAttention"
-                    # module path
-                    "type": "tf.contrib.seq2seq.BahdanauMonotonicAttention"
-                    "type": "my_module.MyAttentionMechanismClass"
-                    # class
-                    "type": tf.contrib.seq2seq.LuongMonotonicAttention
-                    # instance
-                    "type": LuongAttention(...)
-
-            "kwargs": dict
-                Keyword arguments for the attention class constructor.
-                Arguments :attr:`memory` and
-                :attr:`memory_sequence_length` should **not** be
-                specified here because they are given to the decoder
-                constructor. Ignored if "type" is an attention class
-                instance.
-
-                Example:
-
-                .. code-block:: python
-
-                    "type": "LuongAttention",
-                    "kwargs": {
-                        "num_units": 256,
-                        "probability_fn": tf.nn.softmax
-                    }
-
-                Here "probability_fn" can also be set to the string name
-                or module path to a probability function.
-
-            "attention_layer_size": int or None
-                The depth of the attention (output) layer. The context and
-                cell output are fed into the attention layer to generate
-                attention at each time step.
-                If `None` (default), use the context as attention at each
-                time step.
-
-            "alignment_history": bool
-                Whether to store alignment history from all time steps
-                in the final output state. (Stored as a time major
-                `TensorArray` on which you must call `stack()`.)
-
-            "output_attention": bool
-                If `True` (default), the output at each time step is
-                the attention value. This is the behavior of Luong-style
-                attention mechanisms. If `False`, the output at each
-                time step is the output of `cell`. This is the
-                behavior of Bahdanau-style attention mechanisms.
-                In both cases, the `attention` tensor is propagated to
-                the next time step via the state and is used there.
-                This flag only controls whether the attention mechanism
-                is propagated up to the next cell in an RNN stack or to
-                the top RNN output.
-        """
-        hparams = RNNDecoderBase.default_hparams()
-        hparams["name"] = "attention_rnn_decoder"
-        hparams["attention"] = {
-            "type": "LuongAttention",
-            "kwargs": {
-                "num_units": 256,
-            },
-            "attention_layer_size": None,
-            "alignment_history": False,
-            "output_attention": True,
-        }
-        return hparams
-
-    # pylint: disable=arguments-differ
-    def _get_beam_search_cell(self, beam_width):
-        """Returns the RNN cell for beam search decoding.
-        """
-        with tf.variable_scope(self.variable_scope, reuse=True):
-            attn_kwargs = copy.copy(self._attn_kwargs)
-
-            memory = attn_kwargs['memory']
-            attn_kwargs['memory'] = tile_batch(memory, multiplier=beam_width)
-
-            memory_seq_length = attn_kwargs['memory_sequence_length']
-            if memory_seq_length is not None:
-                attn_kwargs['memory_sequence_length'] = tile_batch(
-                    memory_seq_length, beam_width)
-
-            attn_modules = ['tensorflow.contrib.seq2seq', 'texar.tf.custom']
-            bs_attention_mechanism = utils.check_or_get_instance(
-                self._hparams.attention.type, attn_kwargs, attn_modules,
-                classtype=tf.contrib.seq2seq.AttentionMechanism)
-
-            bs_attn_cell = AttentionWrapper(
-                self._cell._cell,
-                bs_attention_mechanism,
-                cell_input_fn=self._cell_input_fn,
-                **self._attn_cell_kwargs)
-
-            self._beam_search_cell = bs_attn_cell
-
-        return bs_attn_cell
-
-    def initialize(self, name=None):
-        helper_init = self._helper.initialize()
-
-        flat_initial_state = nest.flatten(self._initial_state)
-        dtype = flat_initial_state[0].dtype
-        initial_state = self._cell.zero_state(
-            batch_size=tf.shape(flat_initial_state[0])[0], dtype=dtype)
-        initial_state = initial_state.clone(cell_state=self._initial_state)
-
-        return [helper_init[0], helper_init[1], initial_state]
-
-    def step(self, time, inputs, state, name=None):
-        wrapper_outputs, wrapper_state = self._cell(inputs, state)
-        # Essentially the same as in BasicRNNDecoder.step()
-        logits = self._output_layer(wrapper_outputs)
-        sample_ids = self._helper.sample(
-            time=time, outputs=logits, state=wrapper_state)
-
-        attention_scores = wrapper_state.alignments
-        attention_context = wrapper_state.attention
-        outputs = AttentionRNNDecoderOutput(
-            logits, sample_ids, wrapper_outputs,
-            attention_scores, attention_context)
-
-        return (outputs, wrapper_state)
-
-    def next_inputs(self, time, outputs, state):
-        (finished, next_inputs, state) = self._helper.next_inputs(
-            time=time,
-            outputs=outputs.logits,
-            state=state,
-            sample_ids=outputs.sample_id)
-        return (finished, next_inputs, state)
-
-    def finalize(self, outputs, final_state, sequence_lengths):
-        return outputs, final_state
-
-    def _alignments_size(self):
-        # Reimplementation of the alignments_size of each of
-        # AttentionWrapper.attention_mechanisms. The original implementation
-        # of `_BaseAttentionMechanism._alignments_size`:
-        #
-        #     self._alignments_size = (self._keys.shape[1].value or
-        #                              array_ops.shape(self._keys)[1])
-        #
-        # can be `None` when the sequence length of encoder outputs is a
-        # priori unknown.
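-        # Instead, this reimplementation falls back to the dynamic shape
-        # (`tf.shape`) whenever the static dimension (`.value`) is undefined: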
- alignments_size = [] - for am in self._cell._attention_mechanisms: - az = (am._keys.shape[1].value or tf.shape(am._keys)[1:-1]) - alignments_size.append(az) - return self._cell._item_or_tuple(alignments_size) - - @property - def output_size(self): - return AttentionRNNDecoderOutput( - logits=self._rnn_output_size(), - sample_id=self._helper.sample_ids_shape, - cell_output=self._cell.output_size, - attention_scores=self._alignments_size(), - attention_context=self._cell.state_size.attention) - - @property - def output_dtype(self): - """Types of output of one step. - """ - # Assume the dtype of the cell is the output_size structure - # containing the input_state's first component's dtype. - # Return that structure and the sample_ids_dtype from the helper. - dtype = nest.flatten(self._initial_state)[0].dtype - return AttentionRNNDecoderOutput( - logits=nest.map_structure(lambda _: dtype, self._rnn_output_size()), - sample_id=self._helper.sample_ids_dtype, - cell_output=nest.map_structure( - lambda _: dtype, self._cell.output_size), - attention_scores=nest.map_structure( - lambda _: dtype, self._alignments_size()), - attention_context=nest.map_structure( - lambda _: dtype, self._cell.state_size.attention)) - - def zero_state(self, batch_size, dtype): - """Returns zero state of the basic cell. - Equivalent to :attr:`decoder.cell._cell.zero_state`. - """ - return self._cell._cell.zero_state(batch_size=batch_size, dtype=dtype) - - def wrapper_zero_state(self, batch_size, dtype): - """Returns zero state of the attention-wrapped cell. - Equivalent to :attr:`decoder.cell.zero_state`. - """ - return self._cell.zero_state(batch_size=batch_size, dtype=dtype) - - @property - def state_size(self): - """The state size of the basic cell. - Equivalent to :attr:`decoder.cell._cell.state_size`. - """ - return self._cell._cell.state_size - - @property - def wrapper_state_size(self): - """The state size of the attention-wrapped cell. - Equivalent to :attr:`decoder.cell.state_size`. - """ - return self._cell.state_size diff --git a/texar/tf/modules/decoders/rnn_decoders_test.py b/texar/tf/modules/decoders/rnn_decoders_test.py deleted file mode 100644 index fd59b2bb..00000000 --- a/texar/tf/modules/decoders/rnn_decoders_test.py +++ /dev/null @@ -1,388 +0,0 @@ -""" -Unit tests for RNN decoders. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import numpy as np - -import tensorflow as tf - -from texar.tf.modules.decoders.rnn_decoders import BasicRNNDecoderOutput -from texar.tf.modules.decoders.rnn_decoders import BasicRNNDecoder -from texar.tf.modules.decoders.rnn_decoders import AttentionRNNDecoderOutput -from texar.tf.modules.decoders.rnn_decoders import AttentionRNNDecoder -from texar.tf.modules.decoders.rnn_decoder_helpers import get_helper -from texar.tf import context - -# pylint: disable=no-member, too-many-locals, too-many-instance-attributes -# pylint: disable=too-many-arguments, protected-access - - -class BasicRNNDecoderTest(tf.test.TestCase): - """Tests :class:`~texar.tf.modules.decoders.rnn_decoders.BasicRNNDecoder`. 
- """ - - def setUp(self): - tf.test.TestCase.setUp(self) - self._vocab_size = 4 - self._max_time = 8 - self._batch_size = 16 - self._emb_dim = 20 - self._inputs = tf.random_uniform( - [self._batch_size, self._max_time, self._emb_dim], - maxval=1., dtype=tf.float32) - self._embedding = tf.random_uniform( - [self._vocab_size, self._emb_dim], maxval=1., dtype=tf.float32) - - def _test_outputs(self, decoder, outputs, final_state, sequence_lengths, - test_mode=False): - # 4 trainable variables: cell-kernel, cell-bias, - # fc-layer-weights, fc-layer-bias - self.assertEqual(len(decoder.trainable_variables), 4) - - cell_dim = decoder.hparams.rnn_cell.kwargs.num_units - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - - outputs_, final_state_, sequence_lengths_ = sess.run( - [outputs, final_state, sequence_lengths], - feed_dict={context.global_mode(): tf.estimator.ModeKeys.TRAIN}) - self.assertIsInstance(outputs_, BasicRNNDecoderOutput) - if not test_mode: - self.assertEqual( - outputs_.logits.shape, - (self._batch_size, self._max_time, self._vocab_size)) - self.assertEqual( - outputs_.sample_id.shape, - (self._batch_size, self._max_time)) - np.testing.assert_array_equal( - sequence_lengths_, [self._max_time] * self._batch_size) - self.assertEqual(final_state_[0].shape, - (self._batch_size, cell_dim)) - - def test_output_layer(self): - decoder = BasicRNNDecoder(vocab_size=self._vocab_size, - output_layer=None) - self.assertIsInstance(decoder, BasicRNNDecoder) - - decoder = BasicRNNDecoder(output_layer=tf.identity) - self.assertIsInstance(decoder, BasicRNNDecoder) - - tensor = tf.random_uniform( - [self._emb_dim, self._vocab_size], maxval=1, dtype=tf.float32 - ) - decoder = BasicRNNDecoder(output_layer=tensor) - self.assertIsInstance(decoder, BasicRNNDecoder) - self.assertEqual(decoder.vocab_size, self._vocab_size) - - def test_decode_train(self): - """Tests decoding in training mode. - """ - output_layer = tf.layers.Dense(self._vocab_size) - decoder = BasicRNNDecoder(vocab_size=self._vocab_size, - output_layer=output_layer) - - helper_train = get_helper( - decoder.hparams.helper_train.type, - inputs=self._inputs, - sequence_length=[self._max_time] * self._batch_size, - **decoder.hparams.helper_train.kwargs.todict()) - outputs, final_state, sequence_lengths = decoder(helper=helper_train) - self._test_outputs(decoder, outputs, final_state, sequence_lengths) - - outputs, final_state, sequence_lengths = decoder( - inputs=self._inputs, - sequence_length=[self._max_time] * self._batch_size) - self._test_outputs(decoder, outputs, final_state, sequence_lengths) - - outputs, final_state, sequence_lengths = decoder( - decoding_strategy=None, - inputs=self._inputs, - sequence_length=[self._max_time] * self._batch_size) - self._test_outputs(decoder, outputs, final_state, sequence_lengths) - - outputs, final_state, sequence_lengths = decoder( - decoding_strategy=None, - embedding=self._embedding, - start_tokens=[1] * self._batch_size, - end_token=2, - mode=tf.estimator.ModeKeys.EVAL) - self._test_outputs(decoder, outputs, final_state, sequence_lengths, - test_mode=True) - - def test_decode_train_with_tf(self): - """Compares decoding results with TF built-in decoder. 
-        """
-        _inputs_placeholder = tf.placeholder(
-            tf.int32, [self._batch_size, self._max_time], name="inputs")
-        _embedding_placeholder = tf.placeholder(
-            tf.float32, [self._vocab_size, self._emb_dim], name="emb")
-        inputs = tf.nn.embedding_lookup(_embedding_placeholder,
-                                        _inputs_placeholder)
-
-        output_layer = tf.layers.Dense(self._vocab_size)
-        decoder = BasicRNNDecoder(vocab_size=self._vocab_size,
-                                  output_layer=output_layer)
-
-        helper_train = get_helper(
-            decoder.hparams.helper_train.type,
-            inputs=inputs,
-            sequence_length=[self._max_time] * self._batch_size,
-            **decoder.hparams.helper_train.kwargs.todict())
-
-        outputs, final_state, sequence_lengths = decoder(helper=helper_train)
-
-        tf_helper = tf.contrib.seq2seq.TrainingHelper(
-            inputs, [self._max_time] * self._batch_size)
-
-        tf_decoder = tf.contrib.seq2seq.BasicDecoder(
-            decoder.cell,
-            tf_helper,
-            decoder.cell.zero_state(self._batch_size, tf.float32),
-            output_layer=output_layer)
-
-        tf_outputs, tf_final_state, tf_sequence_lengths = \
-            tf.contrib.seq2seq.dynamic_decode(tf_decoder)
-
-        cell_dim = decoder.hparams.rnn_cell.kwargs.num_units
-        with self.test_session() as sess:
-            sess.run(tf.global_variables_initializer())
-            inputs_ = np.random.randint(
-                self._vocab_size, size=(self._batch_size, self._max_time),
-                dtype=np.int32)
-            embedding_ = np.random.randn(self._vocab_size, self._emb_dim)
-
-            outputs_, final_state_, sequence_lengths_ = sess.run(
-                [outputs, final_state, sequence_lengths],
-                feed_dict={context.global_mode(): tf.estimator.ModeKeys.TRAIN,
-                           _inputs_placeholder: inputs_,
-                           _embedding_placeholder: embedding_})
-            self.assertEqual(final_state_[0].shape,
-                             (self._batch_size, cell_dim))
-
-            tf_outputs_, tf_final_state_, tf_sequence_lengths_ = sess.run(
-                [tf_outputs, tf_final_state, tf_sequence_lengths],
-                feed_dict={context.global_mode(): tf.estimator.ModeKeys.TRAIN,
-                           _inputs_placeholder: inputs_,
-                           _embedding_placeholder: embedding_})
-
-            np.testing.assert_array_equal(outputs_.logits,
-                                          tf_outputs_.rnn_output)
-            np.testing.assert_array_equal(outputs_.sample_id,
-                                          tf_outputs_.sample_id)
-            np.testing.assert_array_equal(final_state_.c, tf_final_state_.c)
-            np.testing.assert_array_equal(final_state_.h, tf_final_state_.h)
-            np.testing.assert_array_equal(sequence_lengths_,
-                                          tf_sequence_lengths_)
-
-    def test_decode_infer(self):
-        """Tests decoding in inference mode.
-        """
-        output_layer = tf.layers.Dense(self._vocab_size)
-        decoder = BasicRNNDecoder(vocab_size=self._vocab_size,
-                                  output_layer=output_layer)
-
-        helper_infer = get_helper(
-            decoder.hparams.helper_infer.type,
-            embedding=self._embedding,
-            start_tokens=[self._vocab_size - 2] * self._batch_size,
-            end_token=self._vocab_size - 1,
-            **decoder.hparams.helper_train.kwargs.todict())
-
-        outputs, final_state, sequence_lengths = decoder(helper=helper_infer)
-
-        # 4 trainable variables: cell-kernel, cell-bias,
-        # fc-layer-weights, fc-layer-bias
-        self.assertEqual(len(decoder.trainable_variables), 4)
-
-        cell_dim = decoder.hparams.rnn_cell.kwargs.num_units
-        with self.test_session() as sess:
-            sess.run(tf.global_variables_initializer())
-            outputs_, final_state_, sequence_lengths_ = sess.run(
-                [outputs, final_state, sequence_lengths],
-                feed_dict={context.global_mode():
-                           tf.estimator.ModeKeys.PREDICT})
-            self.assertIsInstance(outputs_, BasicRNNDecoderOutput)
-            max_length = max(sequence_lengths_)
-            self.assertEqual(
-                outputs_.logits.shape,
-                (self._batch_size, max_length, self._vocab_size))
-            self.assertEqual(
-                outputs_.sample_id.shape, (self._batch_size, max_length))
-            self.assertEqual(final_state_[0].shape,
-                             (self._batch_size, cell_dim))
-
-
-class AttentionRNNDecoderTest(tf.test.TestCase):
-    """Tests :class:`~texar.tf.modules.decoders.rnn_decoders.AttentionRNNDecoder`.
-    """
-
-    def setUp(self):
-        tf.test.TestCase.setUp(self)
-        self._vocab_size = 10
-        self._max_time = 16
-        self._batch_size = 8
-        self._emb_dim = 20
-        self._attention_dim = 256
-        self._inputs = tf.random_uniform(
-            [self._batch_size, self._max_time, self._emb_dim],
-            maxval=1., dtype=tf.float32)
-        self._embedding = tf.random_uniform(
-            [self._vocab_size, self._emb_dim], maxval=1., dtype=tf.float32)
-        self._encoder_output = tf.random_uniform(
-            [self._batch_size, self._max_time, 64])
-
-    def test_decode_train(self):
-        """Tests decoding in training mode.
-        """
-        seq_length = np.random.randint(
-            self._max_time, size=[self._batch_size]) + 1
-        encoder_values_length = tf.constant(seq_length)
-        hparams = {
-            "attention": {
-                "kwargs": {
-                    "num_units": self._attention_dim,
-                    # Note: to use sparsemax in TF-CPU, it looks like
-                    # `memory_sequence_length` must equal `max_time`.
-                    # "probability_fn": "sparsemax"
-                }
-            }
-        }
-        decoder = AttentionRNNDecoder(
-            memory=self._encoder_output,
-            memory_sequence_length=encoder_values_length,
-            vocab_size=self._vocab_size,
-            hparams=hparams)
-
-        helper_train = get_helper(
-            decoder.hparams.helper_train.type,
-            inputs=self._inputs,
-            sequence_length=[self._max_time] * self._batch_size,
-            **decoder.hparams.helper_train.kwargs.todict())
-
-        outputs, final_state, sequence_lengths = decoder(helper=helper_train)
-        # 4+1 trainable variables: cell-kernel, cell-bias,
-        # fc-weight, fc-bias, and
-        # memory_layer: For LuongAttention, we only transform the memory layer;
-        # thus num_units *must* match the expected query depth.
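(Editor's note: the comment above deserves a word of explanation. With `LuongAttention`, only the memory is projected, through the single trainable `memory_layer` counted above, so the attention `num_units` must equal the depth of the query, i.e. the decoder cell output. Below is a minimal sketch of the shape constraint using TF1-style APIs; all names and sizes are illustrative and not taken from the test.)

.. code-block:: python

    import tensorflow as tf

    batch, time, memory_dim, num_units = 8, 16, 64, 256
    query = tf.random_uniform([batch, num_units])           # decoder cell output
    memory = tf.random_uniform([batch, time, memory_dim])   # encoder outputs
    # memory_layer: the one extra trainable variable counted above
    keys = tf.layers.dense(memory, num_units, use_bias=False)
    # The Luong score is a dot product, so key depth must equal query depth.
    scores = tf.squeeze(tf.matmul(keys, query[:, :, None]), axis=-1)  # [batch, time]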
- self.assertEqual(len(decoder.trainable_variables), 5) - - cell_dim = decoder.hparams.rnn_cell.kwargs.num_units - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_, final_state_, sequence_lengths_ = sess.run( - [outputs, final_state, sequence_lengths], - feed_dict={context.global_mode(): tf.estimator.ModeKeys.TRAIN}) - self.assertIsInstance(outputs_, AttentionRNNDecoderOutput) - self.assertEqual( - outputs_.logits.shape, - (self._batch_size, self._max_time, self._vocab_size)) - self.assertEqual( - outputs_.sample_id.shape, (self._batch_size, self._max_time)) - self.assertEqual(final_state_.cell_state[0].shape, - (self._batch_size, cell_dim)) - np.testing.assert_array_equal( - sequence_lengths_, [self._max_time] * self._batch_size) - - def test_decode_infer(self): - """Tests decoding in inference mode. - """ - seq_length = np.random.randint( - self._max_time, size=[self._batch_size]) + 1 - encoder_values_length = tf.constant(seq_length) - hparams = { - "attention": { - "kwargs": { - "num_units": 256, - } - } - } - decoder = AttentionRNNDecoder( - vocab_size=self._vocab_size, - memory=self._encoder_output, - memory_sequence_length=encoder_values_length, - hparams=hparams) - - helper_infer = get_helper( - decoder.hparams.helper_infer.type, - embedding=self._embedding, - start_tokens=[1] * self._batch_size, - end_token=2, - **decoder.hparams.helper_train.kwargs.todict()) - - outputs, final_state, sequence_lengths = decoder(helper=helper_infer) - - # 4+1 trainable variables: cell-kernel, cell-bias, - # fc-weight, fc-bias, and - # memory_layer: For LuongAttention, we only transform the memory layer; - # thus num_units *must* match the expected query depth. - self.assertEqual(len(decoder.trainable_variables), 5) - cell_dim = decoder.hparams.rnn_cell.kwargs.num_units - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_, final_state_, sequence_lengths_ = sess.run( - [outputs, final_state, sequence_lengths], - feed_dict={context.global_mode(): - tf.estimator.ModeKeys.PREDICT}) - self.assertIsInstance(outputs_, AttentionRNNDecoderOutput) - max_length = max(sequence_lengths_) - self.assertEqual( - outputs_.logits.shape, - (self._batch_size, max_length, self._vocab_size)) - self.assertEqual( - outputs_.sample_id.shape, (self._batch_size, max_length)) - self.assertEqual(final_state_.cell_state[0].shape, - (self._batch_size, cell_dim)) - - def test_beam_search_cell(self): - """Tests :meth:`texar.tf.modules.AttentionRNNDecoder._get_beam_search_cell` - """ - seq_length = np.random.randint( - self._max_time, size=[self._batch_size]) + 1 - encoder_values_length = tf.constant(seq_length) - hparams = { - "attention": { - "kwargs": { - "num_units": self._attention_dim, - "probability_fn": "sparsemax" - } - } - } - decoder = AttentionRNNDecoder( - memory=self._encoder_output, - memory_sequence_length=encoder_values_length, - vocab_size=self._vocab_size, - hparams=hparams) - - helper_train = get_helper( - decoder.hparams.helper_train.type, - inputs=self._inputs, - sequence_length=[self._max_time] * self._batch_size, - **decoder.hparams.helper_train.kwargs.todict()) - - _, _, _ = decoder(helper=helper_train) - - # 4+1 trainable variables: cell-kernel, cell-bias, - # fc-weight, fc-bias, and - # memory_layer: For LuongAttention, we only transform the memory layer; - # thus num_units *must* match the expected query depth. 
-        self.assertEqual(len(decoder.trainable_variables), 5)
-
-        beam_width = 3
-        beam_cell = decoder._get_beam_search_cell(beam_width)
-        cell_input = tf.random_uniform([self._batch_size * beam_width,
-                                        self._emb_dim])
-        cell_state = beam_cell.zero_state(self._batch_size * beam_width,
-                                          tf.float32)
-        _ = beam_cell(cell_input, cell_state)
-        # Check that beam_cell shares variables with the decoder cell.
-        for tvar in beam_cell.trainable_variables:
-            self.assertTrue(tvar in decoder.trainable_variables)
-
-
-if __name__ == "__main__":
-    tf.test.main()
diff --git a/texar/tf/modules/decoders/tf_helpers.py b/texar/tf/modules/decoders/tf_helpers.py
deleted file mode 100644
index 18259ded..00000000
--- a/texar/tf/modules/decoders/tf_helpers.py
+++ /dev/null
@@ -1,763 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-# Modifications copyright (C) 2019 Texar
-# ==============================================================================
-"""A library of helpers for use with Texar RNN/Transformer decoders.
-
-Adapted from the `tensorflow.contrib.seq2seq` package.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# pylint: disable=no-name-in-module
-
-import abc
-
-import six
-
-import tensorflow as tf
-from tensorflow.contrib.seq2seq.python.ops import decoder
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import embedding_ops
-from tensorflow.python.ops import gen_array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import tensor_array_ops
-from tensorflow_probability import distributions as tfpd
-from tensorflow.python.util import nest
-
-from texar.tf.utils.shapes import shape_list
-from texar.tf.utils.utils import get_args
-
-__all__ = [
-    "Helper",
-    "TrainingHelper",
-    "GreedyEmbeddingHelper",
-    "SampleEmbeddingHelper",
-    "CustomHelper",
-    "ScheduledEmbeddingTrainingHelper",
-    "ScheduledOutputTrainingHelper",
-    "InferenceHelper",
-]
-
-_transpose_batch_time = decoder._transpose_batch_time  # pylint: disable=protected-access
-
-
-def _unstack_ta(inp):
-    return tensor_array_ops.TensorArray(
-        dtype=inp.dtype, size=array_ops.shape(inp)[0],
-        element_shape=inp.get_shape()[1:]).unstack(inp)
-
-
-@six.add_metaclass(abc.ABCMeta)
-class Helper(object):
-    """Interface for implementing different decoding strategies in
-    :class:`RNN decoders <texar.tf.modules.RNNDecoderBase>` and
-    :class:`Transformer decoder <texar.tf.modules.TransformerDecoder>`.
-
-    Adapted from the `tensorflow.contrib.seq2seq` package.
-    """
-
-    @abc.abstractproperty
-    def batch_size(self):
-        """Batch size of tensor returned by `sample`.
-
-        Returns a scalar int32 tensor.
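(Editor's note: for readers new to the `Helper` interface defined here, the decoding loop consumes it roughly as follows. This is a hedged pseudocode sketch of the contract between a decoder and a helper, expressed as comments; it is not the actual `dynamic_decode` implementation.)

.. code-block:: python

    # Pseudocode of the contract (not runnable as-is; `cell`, `helper`,
    # `state`, and `time` are placeholders):
    #
    #     finished, inputs = helper.initialize()
    #     while not all(finished):
    #         cell_outputs, state = cell(inputs, state)        # one decoder step
    #         sample_ids = helper.sample(time, cell_outputs, state)
    #         finished, inputs, state = helper.next_inputs(
    #             time, cell_outputs, state, sample_ids)
    #         time += 1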
-        """
-        raise NotImplementedError("batch_size has not been implemented")
-
-    @abc.abstractproperty
-    def sample_ids_shape(self):
-        """Shape of tensor returned by `sample`, excluding the batch dimension.
-
-        Returns a `TensorShape`.
-        """
-        raise NotImplementedError("sample_ids_shape has not been implemented")
-
-    @abc.abstractproperty
-    def sample_ids_dtype(self):
-        """DType of tensor returned by `sample`.
-
-        Returns a DType.
-        """
-        raise NotImplementedError("sample_ids_dtype has not been implemented")
-
-    @abc.abstractmethod
-    def initialize(self, name=None):
-        """Returns `(initial_finished, initial_inputs)`."""
-        pass
-
-    @abc.abstractmethod
-    def sample(self, time, outputs, state, name=None):
-        """Returns `sample_ids`."""
-        pass
-
-    @abc.abstractmethod
-    def next_inputs(self, time, outputs, state, sample_ids, name=None):
-        """Returns `(finished, next_inputs, next_state)`."""
-        pass
-
-
-class CustomHelper(Helper):
-    """Base abstract class that allows the user to customize decoding."""
-
-    def __init__(self, initialize_fn, sample_fn, next_inputs_fn,
-                 sample_ids_shape=None, sample_ids_dtype=None):
-        """Initializer.
-
-        Args:
-            initialize_fn: callable that returns `(finished, next_inputs)`
-                for the first iteration.
-            sample_fn: callable that takes `(time, outputs, state)`
-                and emits tensor `sample_ids`.
-            next_inputs_fn: callable that takes
-                `(time, outputs, state, sample_ids)` and emits
-                `(finished, next_inputs, next_state)`.
-            sample_ids_shape: Either a list of integers, or a 1-D Tensor of
-                type `int32`, the shape of each value in the `sample_ids`
-                batch. Defaults to a scalar.
-            sample_ids_dtype: The dtype of the `sample_ids` tensor. Defaults
-                to int32.
-        """
-        self._initialize_fn = initialize_fn
-        self._sample_fn = sample_fn
-        self._next_inputs_fn = next_inputs_fn
-        self._batch_size = None
-        self._sample_ids_shape = tensor_shape.TensorShape(
-            sample_ids_shape or [])
-        self._sample_ids_dtype = sample_ids_dtype or dtypes.int32
-
-    @property
-    def batch_size(self):
-        if self._batch_size is None:
-            raise ValueError(
-                "batch_size accessed before initialize was called")
-        return self._batch_size
-
-    @property
-    def sample_ids_shape(self):
-        return self._sample_ids_shape
-
-    @property
-    def sample_ids_dtype(self):
-        return self._sample_ids_dtype
-
-    def initialize(self, name=None):
-        with ops.name_scope(name, "%sInitialize" % type(self).__name__):
-            (finished, next_inputs) = self._initialize_fn()
-            if self._batch_size is None:
-                self._batch_size = array_ops.size(finished)
-        return (finished, next_inputs)
-
-    def sample(self, time, outputs, state, name=None):
-        with ops.name_scope(
-                name, "%sSample" % type(self).__name__,
-                (time, outputs, state)):
-            return self._sample_fn(time=time, outputs=outputs, state=state)
-
-    def next_inputs(self, time, outputs, state, sample_ids, name=None):
-        with ops.name_scope(
-                name, "%sNextInputs" % type(self).__name__,
-                (time, outputs, state)):
-            return self._next_inputs_fn(
-                time=time, outputs=outputs, state=state,
-                sample_ids=sample_ids)
-
-
-class TrainingHelper(Helper):
-    """A helper for use during training. Performs teacher-forcing decoding.
-
-    Returned sample_ids are the argmax of the RNN output logits.
-
-    Note that for teacher-forcing decoding, Texar's decoders provide a simpler
-    interface by specifying `decoding_strategy='train_greedy'` when calling a
-    decoder (see, e.g.,
-    :meth:`RNN decoder <texar.tf.modules.RNNDecoderBase._build>`). In this
-    case, use of TrainingHelper is not necessary.
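(Editor's note: a hedged sketch of the two equivalent invocations the note above refers to; the decoder construction and tensor shapes here are illustrative, made up for the example rather than taken from this file.)

.. code-block:: python

    import tensorflow as tf
    import texar.tf as tx

    inputs = tf.random_uniform([8, 10, 20])   # [batch, time, emb_dim]
    seq_length = [10] * 8
    decoder = tx.modules.BasicRNNDecoder(vocab_size=100)

    # (1) The simpler interface: pass the strategy directly.
    outputs, _, _ = decoder(
        decoding_strategy='train_greedy',
        inputs=inputs, sequence_length=seq_length)

    # (2) Same results via an explicit helper (slower, per the docs).
    helper = tx.modules.TrainingHelper(inputs, seq_length)
    outputs, _, _ = decoder(helper=helper)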
- """ - - def __init__(self, inputs, sequence_length, time_major=False, name=None): - """Initializer. - - Args: - inputs: A (structure of) input tensors. - sequence_length: An int32 vector tensor. - time_major: Python bool. Whether the tensors in `inputs` are time major. - If `False` (default), they are assumed to be batch major. - name: Name scope for any created operations. - - Raises: - ValueError: if `sequence_length` is not a 1D tensor. - """ - with ops.name_scope(name, "TrainingHelper", [inputs, sequence_length]): - inputs = ops.convert_to_tensor(inputs, name="inputs") - self._inputs = inputs - if not time_major: - inputs = nest.map_structure(_transpose_batch_time, inputs) - - self._input_tas = nest.map_structure(_unstack_ta, inputs) - self._sequence_length = ops.convert_to_tensor( - sequence_length, name="sequence_length") - if self._sequence_length.get_shape().ndims != 1: - raise ValueError( - "Expected sequence_length to be a vector, but received shape: %s" % - self._sequence_length.get_shape()) - - self._zero_inputs = nest.map_structure( - lambda inp: array_ops.zeros_like(inp[0, :]), inputs) - self._start_inputs = self._zero_inputs - self._batch_size = shape_list(sequence_length)[0] - - @property - def inputs(self): - return self._inputs - - @property - def sequence_length(self): - return self._sequence_length - - @property - def batch_size(self): - return self._batch_size - - @property - def sample_ids_shape(self): - return tensor_shape.TensorShape([]) - - @property - def sample_ids_dtype(self): - return dtypes.int32 - - def initialize(self, name=None): - with ops.name_scope(name, "TrainingHelperInitialize"): - finished = math_ops.equal(0, self._sequence_length) - all_finished = math_ops.reduce_all(finished) - next_inputs = control_flow_ops.cond( - all_finished, lambda: self._zero_inputs, - lambda: nest.map_structure(lambda inp: inp.read(0), self._input_tas)) - return (finished, next_inputs) - - def sample(self, time, outputs, name=None, **unused_kwargs): - """Gets a sample for one step.""" - with ops.name_scope(name, "TrainingHelperSample", [time, outputs]): - sample_ids = math_ops.cast( - math_ops.argmax(outputs, axis=-1), dtypes.int32) - return sample_ids - - def next_inputs(self, time, outputs, state, name=None, **unused_kwargs): - """Gets the inputs for next step.""" - with ops.name_scope(name, "TrainingHelperNextInputs", - [time, outputs, state]): - next_time = time + 1 - finished = (next_time >= self._sequence_length) - all_finished = math_ops.reduce_all(finished) - - def read_from_ta(inp): - return inp.read(next_time) - - next_inputs = control_flow_ops.cond( - all_finished, lambda: self._zero_inputs, - lambda: nest.map_structure(read_from_ta, self._input_tas)) - return (finished, next_inputs, state) - - -class ScheduledEmbeddingTrainingHelper(TrainingHelper): - """A training helper that adds scheduled sampling. - - Returns -1s for sample_ids where no sampling took place; valid sample id - values elsewhere. - """ - - def __init__(self, inputs, sequence_length, embedding, sampling_probability, - time_major=False, seed=None, scheduling_seed=None, name=None): - """Initializer. - - Args: - inputs: A (structure of) input tensors. - sequence_length: An int32 vector tensor. - embedding: A callable or the `params` argument for `embedding_lookup`. - If a callable, it can take a vector tensor of token `ids`, - or take two arguments (`ids`, `times`), where `ids` is a vector - tensor of token ids, and `times` is a vector tensor of current - time steps (i.e., position ids). 
The latter case can be used when - attr:`embedding` is a combination of word embedding and position - embedding. - sampling_probability: A 0D `float32` tensor: the probability of sampling - categorically from the output ids instead of reading directly from the - inputs. - time_major: Python bool. Whether the tensors in `inputs` are time major. - If `False` (default), they are assumed to be batch major. - seed: The sampling seed. - scheduling_seed: The schedule decision rule sampling seed. - name: Name scope for any created operations. - - Raises: - ValueError: if `sampling_probability` is not a scalar or vector. - """ - with ops.name_scope(name, "ScheduledEmbeddingSamplingWrapper", - [embedding, sampling_probability]): - if callable(embedding): - self._embedding_fn = embedding - else: - self._embedding_fn = ( - lambda ids: embedding_ops.embedding_lookup(embedding, ids)) - - self._embedding_args_cnt = len(get_args(self._embedding_fn)) - if self._embedding_args_cnt != 1 and self._embedding_args_cnt != 2: - raise ValueError('`embedding` should expect 1 or 2 arguments.') - - self._sampling_probability = ops.convert_to_tensor( - sampling_probability, name="sampling_probability") - if self._sampling_probability.get_shape().ndims not in (0, 1): - raise ValueError( - "sampling_probability must be either a scalar or a vector. " - "saw shape: %s" % (self._sampling_probability.get_shape())) - self._seed = seed - self._scheduling_seed = scheduling_seed - super(ScheduledEmbeddingTrainingHelper, self).__init__( - inputs=inputs, - sequence_length=sequence_length, - time_major=time_major, - name=name) - - def initialize(self, name=None): - return super(ScheduledEmbeddingTrainingHelper, self).initialize( - name=name) - - def sample(self, time, outputs, state, name=None): - """Gets a sample for one step.""" - with ops.name_scope(name, "ScheduledEmbeddingTrainingHelperSample", - [time, outputs, state]): - # Return -1s where we did not sample, and sample_ids elsewhere - select_sampler = tfpd.Bernoulli( - probs=self._sampling_probability, dtype=dtypes.bool) - select_sample = select_sampler.sample( - sample_shape=self.batch_size, seed=self._scheduling_seed) - sample_id_sampler = tfpd.Categorical(logits=outputs) - return array_ops.where( - select_sample, - sample_id_sampler.sample(seed=self._seed), - gen_array_ops.fill([self.batch_size], -1)) - - def next_inputs(self, time, outputs, state, sample_ids, name=None): - """Gets the outputs for next step.""" - with ops.name_scope(name, "ScheduledEmbeddingTrainingHelperNextInputs", - [time, outputs, state, sample_ids]): - (finished, base_next_inputs, state) = ( - super(ScheduledEmbeddingTrainingHelper, self).next_inputs( - time=time, - outputs=outputs, - state=state, - sample_ids=sample_ids, - name=name)) - - def maybe_sample(): - """Perform scheduled sampling.""" - where_sampling = math_ops.cast( - array_ops.where(sample_ids > -1), dtypes.int32) - where_not_sampling = math_ops.cast( - array_ops.where(sample_ids <= -1), dtypes.int32) - sample_ids_sampling = array_ops.gather_nd(sample_ids, where_sampling) - inputs_not_sampling = array_ops.gather_nd( - base_next_inputs, where_not_sampling) - - if self._embedding_args_cnt == 1: - sampled_next_inputs = self._embedding_fn( - sample_ids_sampling) - elif self._embedding_args_cnt == 2: - # Prepare the position embedding of the next step - times = tf.ones(self._batch_size, - dtype=tf.int32) * (time + 1) - sampled_next_inputs = self._embedding_fn( - sample_ids_sampling, times) - base_shape = array_ops.shape(base_next_inputs) - 
return (array_ops.scatter_nd(indices=where_sampling, - updates=sampled_next_inputs, - shape=base_shape) - + array_ops.scatter_nd(indices=where_not_sampling, - updates=inputs_not_sampling, - shape=base_shape)) - - all_finished = math_ops.reduce_all(finished) - next_inputs = control_flow_ops.cond( - all_finished, lambda: base_next_inputs, maybe_sample) - return (finished, next_inputs, state) - - -class ScheduledOutputTrainingHelper(TrainingHelper): - """A training helper that adds scheduled sampling directly to outputs. - - Returns False for sample_ids where no sampling took place; True elsewhere. - """ - - def __init__(self, inputs, sequence_length, sampling_probability, - time_major=False, seed=None, next_inputs_fn=None, - auxiliary_inputs=None, name=None): - """Initializer. - - Args: - inputs: A (structure) of input tensors. - sequence_length: An int32 vector tensor. - sampling_probability: A 0D `float32` tensor: the probability of sampling - from the outputs instead of reading directly from the inputs. - time_major: Python bool. Whether the tensors in `inputs` are time major. - If `False` (default), they are assumed to be batch major. - seed: The sampling seed. - next_inputs_fn: (Optional) callable to apply to the RNN outputs to create - the next input when sampling. If `None` (default), the RNN outputs will - be used as the next inputs. - auxiliary_inputs: An optional (structure of) auxiliary input tensors with - a shape that matches `inputs` in all but (potentially) the final - dimension. These tensors will be concatenated to the sampled output or - the `inputs` when not sampling for use as the next input. - name: Name scope for any created operations. - - Raises: - ValueError: if `sampling_probability` is not a scalar or vector. - """ - with ops.name_scope(name, "ScheduledOutputTrainingHelper", - [inputs, auxiliary_inputs, sampling_probability]): - self._sampling_probability = ops.convert_to_tensor( - sampling_probability, name="sampling_probability") - if self._sampling_probability.get_shape().ndims not in (0, 1): - raise ValueError( - "sampling_probability must be either a scalar or a vector. 
" - "saw shape: %s" % (self._sampling_probability.get_shape())) - - if auxiliary_inputs is None: - maybe_concatenated_inputs = inputs - else: - inputs = ops.convert_to_tensor(inputs, name="inputs") - auxiliary_inputs = ops.convert_to_tensor( - auxiliary_inputs, name="auxiliary_inputs") - maybe_concatenated_inputs = nest.map_structure( - lambda x, y: array_ops.concat((x, y), -1), - inputs, auxiliary_inputs) - if not time_major: - auxiliary_inputs = nest.map_structure( - _transpose_batch_time, auxiliary_inputs) - - self._auxiliary_input_tas = ( - nest.map_structure(_unstack_ta, auxiliary_inputs) - if auxiliary_inputs is not None else None) - - self._seed = seed - - self._next_inputs_fn = next_inputs_fn - - super(ScheduledOutputTrainingHelper, self).__init__( - inputs=maybe_concatenated_inputs, - sequence_length=sequence_length, - time_major=time_major, - name=name) - - def initialize(self, name=None): - return super(ScheduledOutputTrainingHelper, self).initialize(name=name) - - def sample(self, time, outputs, state, name=None): - """Gets a sample for one step.""" - with ops.name_scope(name, "ScheduledOutputTrainingHelperSample", - [time, outputs, state]): - sampler = tfpd.Bernoulli(probs=self._sampling_probability) - return sampler.sample(sample_shape=self.batch_size, seed=self._seed) - - def next_inputs(self, time, outputs, state, sample_ids, name=None): - """Gets the next inputs for next step.""" - with ops.name_scope(name, "ScheduledOutputTrainingHelperNextInputs", - [time, outputs, state, sample_ids]): - (finished, base_next_inputs, state) = ( - super(ScheduledOutputTrainingHelper, self).next_inputs( - time=time, - outputs=outputs, - state=state, - sample_ids=sample_ids, - name=name)) - sample_ids = math_ops.cast(sample_ids, dtypes.bool) - - def maybe_sample(): - """Perform scheduled sampling.""" - - def maybe_concatenate_auxiliary_inputs(outputs_, indices=None): - """Concatenate outputs with auxiliary inputs, if they exist.""" - if self._auxiliary_input_tas is None: - return outputs_ - - next_time = time + 1 - auxiliary_inputs = nest.map_structure( - lambda ta: ta.read(next_time), self._auxiliary_input_tas) - if indices is not None: - auxiliary_inputs = array_ops.gather_nd(auxiliary_inputs, indices) - return nest.map_structure( - lambda x, y: array_ops.concat((x, y), -1), - outputs_, auxiliary_inputs) - - if self._next_inputs_fn is None: - return array_ops.where( - sample_ids, maybe_concatenate_auxiliary_inputs(outputs), - base_next_inputs) - - where_sampling = math_ops.cast( - array_ops.where(sample_ids), dtypes.int32) - where_not_sampling = math_ops.cast( - array_ops.where(math_ops.logical_not(sample_ids)), dtypes.int32) - outputs_sampling = array_ops.gather_nd(outputs, where_sampling) - inputs_not_sampling = array_ops.gather_nd(base_next_inputs, - where_not_sampling) - sampled_next_inputs = maybe_concatenate_auxiliary_inputs( - self._next_inputs_fn(outputs_sampling), where_sampling) - - base_shape = array_ops.shape(base_next_inputs) - return (array_ops.scatter_nd(indices=where_sampling, - updates=sampled_next_inputs, - shape=base_shape) - + array_ops.scatter_nd(indices=where_not_sampling, - updates=inputs_not_sampling, - shape=base_shape)) - - all_finished = math_ops.reduce_all(finished) - no_samples = math_ops.logical_not(math_ops.reduce_any(sample_ids)) - next_inputs = control_flow_ops.cond( - math_ops.logical_or(all_finished, no_samples), - lambda: base_next_inputs, maybe_sample) - return (finished, next_inputs, state) - - -class GreedyEmbeddingHelper(Helper): - """A helper for 
use during inference.
-
-    Uses the argmax of the output (treated as logits) and passes the
-    result through an embedding layer to get the next input.
-
-    Note that for greedy decoding, Texar's decoders provide a simpler
-    interface by specifying `decoding_strategy='infer_greedy'` when calling a
-    decoder (see, e.g.,
-    :meth:`RNN decoder <texar.tf.modules.RNNDecoderBase._build>`). In this
-    case, use of GreedyEmbeddingHelper is not necessary.
-    """
-
-    def __init__(self, embedding, start_tokens, end_token):
-        """Initializer.
-
-        Args:
-            embedding: A callable or the `params` argument for
-                `embedding_lookup`. If a callable, it can take a vector tensor
-                of `ids` (argmax ids), or take two arguments (`ids`, `times`),
-                where `ids` is a vector tensor of argmax ids, and `times` is a
-                vector tensor of current time steps (i.e., position ids). The
-                latter case can be used when :attr:`embedding` is a
-                combination of word embedding and position embedding.
-                The returned tensor will be returned by :meth:`next_inputs`.
-            start_tokens: `int32` vector shaped `[batch_size]`, the start
-                tokens.
-            end_token: `int32` scalar, the token that marks end of decoding.
-
-        Raises:
-            ValueError: if `start_tokens` is not a 1D tensor or `end_token`
-                is not a scalar.
-        """
-        if callable(embedding):
-            self._embedding_fn = embedding
-        else:
-            self._embedding_fn = (
-                lambda ids: embedding_ops.embedding_lookup(embedding, ids))
-
-        self._start_tokens = ops.convert_to_tensor(
-            start_tokens, dtype=dtypes.int32, name="start_tokens")
-        self._end_token = ops.convert_to_tensor(
-            end_token, dtype=dtypes.int32, name="end_token")
-        if self._start_tokens.get_shape().ndims != 1:
-            raise ValueError("start_tokens must be a vector")
-        self._batch_size = shape_list(start_tokens)[0]
-        if self._end_token.get_shape().ndims != 0:
-            raise ValueError("end_token must be a scalar")
-
-        self._embedding_args_cnt = len(get_args(self._embedding_fn))
-        if self._embedding_args_cnt == 1:
-            self._start_inputs = self._embedding_fn(self._start_tokens)
-        elif self._embedding_args_cnt == 2:
-            # Position index is 0 in the beginning
-            times = tf.zeros([self._batch_size], dtype=tf.int32)
-            self._start_inputs = self._embedding_fn(self._start_tokens, times)
-        else:
-            raise ValueError('`embedding` should expect 1 or 2 arguments.')
-
-    @property
-    def batch_size(self):
-        return self._batch_size
-
-    @property
-    def sample_ids_shape(self):
-        return tensor_shape.TensorShape([])
-
-    @property
-    def sample_ids_dtype(self):
-        return dtypes.int32
-
-    def initialize(self, name=None):
-        finished = array_ops.tile([False], [self._batch_size])
-        return finished, self._start_inputs
-
-    def sample(self, time, outputs, state, name=None):
-        """Gets a sample for one step."""
-        del time, state  # unused by sample_fn
-        # Outputs are logits, use argmax to get the most probable id
-        if not isinstance(outputs, ops.Tensor):
-            raise TypeError("Expected outputs to be a single Tensor, got: %s" %
-                            type(outputs))
-        sample_ids = math_ops.argmax(outputs, axis=-1,
-                                     output_type=dtypes.int32)
-        return sample_ids
-
-    def next_inputs(self, time, outputs, state, sample_ids, name=None):
-        """Gets the inputs for the next step."""
-        finished = math_ops.equal(sample_ids, self._end_token)
-        all_finished = math_ops.reduce_all(finished)
-
-        if self._embedding_args_cnt == 1:
-            del time, outputs  # unused by next_inputs_fn
-            next_inputs = control_flow_ops.cond(
-                all_finished,
-                # If we're finished, the next_inputs value doesn't matter
-                lambda: self._start_inputs,
-                lambda: self._embedding_fn(sample_ids))
-        elif self._embedding_args_cnt == 2:
-            del outputs
-            # Prepare the position embedding of the next step
-            times = tf.ones(self._batch_size, dtype=tf.int32) * (time + 1)
-            next_inputs = control_flow_ops.cond(
-                all_finished,
-                # If we're finished, the next_inputs value doesn't matter
-                lambda: self._start_inputs,
-                lambda: self._embedding_fn(sample_ids, times))
-
-        return finished, next_inputs, state
-
-
-class SampleEmbeddingHelper(GreedyEmbeddingHelper):
-    """A helper for use during inference.
-
-    Uses sampling (from a distribution) instead of argmax and passes the
-    result through an embedding layer to get the next input.
-
-    Note that for sample decoding, Texar's decoders provide a simpler
-    interface by specifying `decoding_strategy='infer_sample'` when calling a
-    decoder (see, e.g.,
-    :meth:`RNN decoder <texar.tf.modules.RNNDecoderBase._build>`). In this
-    case, use of SampleEmbeddingHelper is not necessary.
-    """
-
-    def __init__(self, embedding, start_tokens, end_token,
-                 softmax_temperature=None, seed=None):
-        """Initializer.
-
-        Args:
-            embedding: A callable or the `params` argument for
-                `embedding_lookup`. If a callable, it can take a vector tensor
-                of token `ids`, or take two arguments (`ids`, `times`), where
-                `ids` is a vector tensor of token ids, and `times` is a vector
-                tensor of current time steps (i.e., position ids). The latter
-                case can be used when :attr:`embedding` is a combination of
-                word embedding and position embedding.
-                The returned tensor will be returned by :meth:`next_inputs`.
-            start_tokens: `int32` vector shaped `[batch_size]`, the start
-                tokens.
-            end_token: `int32` scalar, the token that marks end of decoding.
-            softmax_temperature: (Optional) `float32` scalar, value to divide
-                the logits by before computing the softmax. Larger values
-                (above 1.0) result in more random samples, while smaller
-                values push the sampling distribution towards the argmax.
-                Must be strictly greater than 0. Defaults to 1.0.
-            seed: (Optional) The sampling seed.
-
-        Raises:
-            ValueError: if `start_tokens` is not a 1D tensor or `end_token`
-                is not a scalar.
-        """
-        super(SampleEmbeddingHelper, self).__init__(
-            embedding, start_tokens, end_token)
-        self._softmax_temperature = softmax_temperature
-        self._seed = seed
-
-    def sample(self, time, outputs, state, name=None):
-        """Gets a sample for one step."""
-        del time, state  # unused by sample_fn
-        # Outputs are logits, we sample instead of argmax (greedy).
-        if not isinstance(outputs, ops.Tensor):
-            raise TypeError("Expected outputs to be a single Tensor, got: %s" %
-                            type(outputs))
-        if self._softmax_temperature is None:
-            logits = outputs
-        else:
-            logits = outputs / self._softmax_temperature
-
-        sample_id_sampler = tfpd.Categorical(logits=logits)
-        sample_ids = sample_id_sampler.sample(seed=self._seed)
-
-        return sample_ids
-
-
-class InferenceHelper(Helper):
-    """A helper to use during inference with a custom sampling function."""
-
-    def __init__(self, sample_fn, sample_shape, sample_dtype,
-                 start_inputs, end_fn, next_inputs_fn=None):
-        """Initializer.
-
-        Args:
-            sample_fn: A callable that takes `outputs` and emits tensor
-                `sample_ids`.
-            sample_shape: Either a list of integers, or a 1-D Tensor of type
-                `int32`, the shape of each sample in the batch returned by
-                `sample_fn`.
-            sample_dtype: the dtype of the sample returned by `sample_fn`.
-            start_inputs: The initial batch of inputs.
-            end_fn: A callable that takes `sample_ids` and emits a `bool`
-                vector shaped `[batch_size]` indicating whether each sample
-                is an end token.
- next_inputs_fn: (Optional) A callable that takes `sample_ids` and returns - the next batch of inputs. If not provided, `sample_ids` is used as the - next batch of inputs. - """ - self._sample_fn = sample_fn - self._end_fn = end_fn - self._sample_shape = tensor_shape.TensorShape(sample_shape) - self._sample_dtype = sample_dtype - self._next_inputs_fn = next_inputs_fn - self._batch_size = array_ops.shape(start_inputs)[0] - self._start_inputs = ops.convert_to_tensor( - start_inputs, name="start_inputs") - - @property - def batch_size(self): - return self._batch_size - - @property - def sample_ids_shape(self): - return self._sample_shape - - @property - def sample_ids_dtype(self): - return self._sample_dtype - - def initialize(self, name=None): - finished = array_ops.tile([False], [self._batch_size]) - return (finished, self._start_inputs) - - def sample(self, time, outputs, state, name=None): - """Gets a sample for one step.""" - del time, state # unused by sample - return self._sample_fn(outputs) - - def next_inputs(self, time, outputs, state, sample_ids, name=None): - """Gets the outputs for next step.""" - del time, outputs # unused by next_inputs - if self._next_inputs_fn is None: - next_inputs = sample_ids - else: - next_inputs = self._next_inputs_fn(sample_ids) - finished = self._end_fn(sample_ids) - return (finished, next_inputs, state) diff --git a/texar/tf/modules/decoders/transformer_decoders.py b/texar/tf/modules/decoders/transformer_decoders.py deleted file mode 100644 index 6f58d39c..00000000 --- a/texar/tf/modules/decoders/transformer_decoders.py +++ /dev/null @@ -1,845 +0,0 @@ -# Copyright 2019 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Transformer decoder. 
-""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=no-name-in-module, too-many-arguments, too-many-locals -# pylint: disable=invalid-name, too-many-instance-attributes, -# pylint: disable=too-many-branches, redefined-variable-type - -import collections - -import tensorflow as tf -from tensorflow.contrib.seq2seq import Decoder as TFDecoder - -from texar.tf.core import layers -from texar.tf.module_base import ModuleBase -from texar.tf.modules.networks.networks import FeedForwardNetwork -from texar.tf.modules.encoders.transformer_encoders import \ - default_transformer_poswise_net_hparams -from texar.tf.modules.encoders.multihead_attention import \ - MultiheadAttentionEncoder -from texar.tf.modules.decoders.rnn_decoder_base import _make_output_layer -from texar.tf.modules.decoders import tf_helpers as tx_helper -from texar.tf.utils import beam_search, transformer_attentions as attn -from texar.tf.utils.shapes import shape_list -from texar.tf.utils.mode import is_train_mode -from texar.tf.modules.decoders.dynamic_decode import dynamic_decode - - -__all__ = [ - "TransformerDecoderOutput", - "TransformerDecoder" -] - - -class TransformerDecoderOutput(collections.namedtuple( - "TransformerDecoderOutput", - ("logits", "sample_id"))): - """The output of :class:`TransformerDecoder`. - - Attributes: - logits: A float Tensor of shape - `[batch_size, max_time, vocab_size]` containing the logits. - sample_id: An int Tensor of shape `[batch_size, max_time]` - containing the sampled token indexes. - """ - - -class TransformerDecoder(ModuleBase, TFDecoder): - """Transformer decoder that applies multi-head self-attention for - sequence decoding. - - It is a stack of :class:`~texar.tf.modules.encoders.MultiheadAttentionEncoder`, - :class:`~texar.tf.modules.FeedForwardNetwork` and residual connections. - - Args: - vocab_size (int, optional): Vocabulary size. Required if - :attr:`output_layer` is `None`. - output_layer (optional): An output layer that transforms cell output - to logits. This can be: - - - A callable layer, e.g., an instance \ - of :tf_main:`tf.layers.Layer `. - - A tensor. A dense layer will be created using the tensor \ - as the kernel weights. The bias of the dense layer is determined by\ - `hparams.output_layer_bias`. This can be used to tie the output \ - layer with the input embedding matrix, as proposed in \ - https://arxiv.org/pdf/1608.05859.pdf - - `None`. A dense layer will be created based on attr:`vocab_size`\ - and `hparams.output_layer_bias`. - - If no output layer in the end is needed, set \ - `(vocab_size=None, output_layer=tf.identity)`. - - .. document private functions - .. 
-    .. automethod:: _build
-    """
-
-    def __init__(self,
-                 vocab_size=None,
-                 output_layer=None,
-                 hparams=None):
-        ModuleBase.__init__(self, hparams)
-
-        with tf.variable_scope(self.variable_scope):
-            if self._hparams.initializer:
-                tf.get_variable_scope().set_initializer(
-                    layers.get_initializer(self._hparams.initializer))
-
-            # Make the output layer
-            self._output_layer, self._vocab_size = _make_output_layer(
-                output_layer, vocab_size, self._hparams.output_layer_bias,
-                self.variable_scope)
-
-            # Make attention and poswise networks
-            self.multihead_attentions = {
-                'self_att': [],
-                'encdec_att': []
-            }
-            self.poswise_networks = []
-            for i in range(self._hparams.num_blocks):
-                layer_name = 'layer_{}'.format(i)
-                with tf.variable_scope(layer_name):
-                    with tf.variable_scope("self_attention"):
-                        multihead_attention = MultiheadAttentionEncoder(
-                            self._hparams.multihead_attention)
-                        self.multihead_attentions['self_att'].append(
-                            multihead_attention)
-
-                    if self._hparams.dim != \
-                            multihead_attention.hparams.output_dim:
-                        raise ValueError('The output dimension of '
-                                         'MultiheadAttentionEncoder should be '
-                                         'equal to the dim of '
-                                         'TransformerDecoder')
-
-                    with tf.variable_scope('encdec_attention'):
-                        multihead_attention = MultiheadAttentionEncoder(
-                            self._hparams.multihead_attention)
-                        self.multihead_attentions['encdec_att'].append(
-                            multihead_attention)
-
-                    if self._hparams.dim != \
-                            multihead_attention.hparams.output_dim:
-                        raise ValueError('The output dimension of '
-                                         'MultiheadAttentionEncoder should be '
-                                         'equal to the dim of '
-                                         'TransformerDecoder')
-
-                    pw_net = FeedForwardNetwork(
-                        hparams=self._hparams['poswise_feedforward'])
-                    final_dim = pw_net.hparams.layers[-1]['kwargs']['units']
-                    if self._hparams.dim != final_dim:
-                        raise ValueError(
-                            'The output dimension of '
-                            '"poswise_feedforward" should be equal '
-                            'to the "dim" of TransformerDecoder.')
-                    self.poswise_networks.append(pw_net)
-
-        # Built in _build()
-        self.context = None
-        self.context_sequence_length = None
-        self.embedding = None
-        self._helper = None
-        self._cache = None
-        self.max_decoding_length = None
-
-    @staticmethod
-    def default_hparams():
-        """Returns a dictionary of hyperparameters with default values.
-
-        .. code-block:: python
-
-            {
-                # Same as in TransformerEncoder
-                "num_blocks": 6,
-                "dim": 512,
-                "embedding_dropout": 0.1,
-                "residual_dropout": 0.1,
-                "poswise_feedforward": default_transformer_poswise_net_hparams,
-                "multihead_attention": {
-                    'name': 'multihead_attention',
-                    'num_units': 512,
-                    'num_heads': 8,
-                    'dropout_rate': 0.1,
-                    'output_dim': 512,
-                    'use_bias': False,
-                },
-                "initializer": None,
-                "name": "transformer_decoder",
-                # Additional for TransformerDecoder
-                "embedding_tie": True,
-                "output_layer_bias": False,
-                "max_decoding_length": int(1e10),
-            }
-
-        Here:
-
-        "num_blocks": int
-            Number of stacked blocks.
-
-        "dim": int
-            Hidden dimension of the encoder.
-
-        "embedding_dropout": float
-            Dropout rate of the input word and position embeddings.
-
-        "residual_dropout": float
-            Dropout rate of the residual connections.
-
-        "poswise_feedforward": dict
-            Hyperparameters for a feed-forward network used in residual
-            connections.
-            Make sure the dimension of the output tensor is equal to `dim`.
-
-            See
-            :func:`~texar.tf.modules.default_transformer_poswise_net_hparams`
-            for details.
-
-        "multihead_attention": dict
-            Hyperparameters for the multihead attention strategy.
-            Make sure the `output_dim` in this module is equal to `dim`.
-
-            See
-            :func:`~texar.tf.modules.MultiheadAttentionEncoder.default_hparams`
-            for details.
-
-        "initializer": dict, optional
-            Hyperparameters of the default initializer that initializes
-            variables created in this module.
-            See :func:`~texar.tf.core.get_initializer` for details.
-
-        "output_layer_bias": bool
-            Whether to use bias to the output layer.
-            Used only if :attr:`output_layer` is `None` when constructing
-            the class instance.
-
-        "max_decoding_length": int
-            The maximum allowed number of decoding steps.
-            Set to a very large number to avoid the length constraint.
-            Ignored if provided in :meth:`_build` or
-            "train_greedy" decoding is used.
-
-        "name": str
-            Name of the module.
-        """
-        return {
-            "num_blocks": 6,
-            "dim": 512,
-            "embedding_tie": True,
-            "output_layer_bias": False,
-            "max_decoding_length": int(1e10),
-            "embedding_dropout": 0.1,
-            "residual_dropout": 0.1,
-            "poswise_feedforward": default_transformer_poswise_net_hparams(),
-            'multihead_attention': {
-                'name': 'multihead_attention',
-                'num_units': 512,
-                'num_heads': 8,
-                'dropout_rate': 0.1,
-                'output_dim': 512,
-                'use_bias': False,
-            },
-            "initializer": None,
-            "name": "transformer_decoder",
-        }
-
-    def _inputs_to_outputs(self, inputs, cache):
-        """The function is called in dynamic decoding.
-
-        `inputs` should be of shape `[batch_size, dim]`.
-
-        Returns outputs (i.e. logits) of shape `[batch_size, vocab_size]`
-        and the updated cache.
-        """
-        outputs = self._self_attention_stack(
-            tf.expand_dims(inputs, axis=1),
-            memory=cache.get('memory'),
-            cache=cache,
-        )
-        outputs = self._output_layer(outputs)
-        outputs = tf.squeeze(outputs, axis=[1])
-        return outputs, cache
-
-    def _input_ids_to_outputs(self, input_ids, step, cache):
-        """The function is called in beam-search decoding.
-
-        `input_ids` should be of shape `[batch_size]`.
-
-        Returns outputs (i.e. logits) of shape `[batch_size, vocab_size]`
-        and the updated cache.
-        """
-        _batch_size = shape_list(input_ids)[0]
-        times = tf.ones([_batch_size], dtype=tf.int32) * step
-        inputs = self.embedding(input_ids, times)
-
-        outputs = self._self_attention_stack(
-            tf.expand_dims(inputs, axis=1),
-            memory=cache.get('memory'),
-            cache=cache,
-        )
-        outputs = self._output_layer(outputs)
-        outputs = tf.squeeze(outputs, axis=[1])
-        return outputs, cache
-
-    def _build(self,  # pylint: disable=arguments-differ, too-many-statements
-               decoding_strategy='train_greedy',
-               inputs=None,
-               memory=None,
-               memory_sequence_length=None,
-               memory_attention_bias=None,
-               beam_width=None,
-               length_penalty=0.,
-               start_tokens=None,
-               end_token=None,
-               context=None,
-               context_sequence_length=None,
-               softmax_temperature=None,
-               max_decoding_length=None,
-               impute_finished=False,
-               embedding=None,
-               helper=None,
-               mode=None):
-        """Performs decoding.
-
-        The interface is mostly the same as that of RNN decoders
-        (see :meth:`~texar.tf.modules.RNNDecoderBase._build`). The main
-        difference is that, here, `sequence_length` is not needed, and
-        continuation generation is additionally supported.
-
-        The function provides **3 ways** to specify the decoding method, with
-        varying flexibility:
-
-        1. The :attr:`decoding_strategy` argument.
-
-           - **"train_greedy"**: decoding in teacher-forcing fashion (i.e.,
-             feeding ground truth to decode the next step), and for each step
-             sample is obtained by taking the `argmax` of logits.
-             Argument :attr:`inputs` is required for this strategy.
-           - **"infer_greedy"**: decoding in inference fashion (i.e., feeding
-             `generated` sample to decode the next step), and for each step
-             sample is obtained by taking the `argmax` of logits.
-             Arguments :attr:`(start_tokens, end_token)` are
-             required for this strategy, and argument
-             :attr:`max_decoding_length` is optional.
-           - **"infer_sample"**: decoding in inference fashion, and for each
-             step sample is obtained by `random sampling` from the logits.
-             Arguments :attr:`(start_tokens, end_token)` are required for this
-             strategy, and argument :attr:`max_decoding_length` is optional.
-
-          This argument is used only when arguments :attr:`helper` and
-          :attr:`beam_width` are both `None`.
-
-        2. The :attr:`helper` argument: An instance of subclass of
-           :class:`texar.tf.modules.Helper`.
-           This provides a superset of the decoding strategies above.
-           The interface is the same as in RNN decoders.
-           Please refer to :meth:`texar.tf.modules.RNNDecoderBase._build` for
-           detailed usage and examples.
-
-           Note that, here, though using a
-           :class:`~texar.tf.modules.TrainingHelper` corresponds to the
-           "train_greedy" strategy above and will get the same output results,
-           the implementation is *slower* than
-           directly setting `decoding_strategy="train_greedy"`.
-
-           Argument :attr:`max_decoding_length` is optional.
-
-        3. **Beam search**: set :attr:`beam_width` to use beam search decoding.
-           Arguments :attr:`(start_tokens, end_token)` are required,
-           and argument :attr:`max_decoding_length` is optional.
-
-        Args:
-            memory (optional): The memory to attend, e.g., the output of an RNN
-                encoder. A Tensor of shape `[batch_size, memory_max_time, dim]`.
-            memory_sequence_length (optional): A Tensor of shape `[batch_size]`
-                containing the sequence lengths for the batch entries in
-                memory. Used to create the attention bias if
-                :attr:`memory_attention_bias` is not given. Ignored if
-                `memory_attention_bias` is provided.
-            memory_attention_bias (optional): A Tensor of shape
-                `[batch_size, num_heads, memory_max_time, dim]`.
-                An attention bias typically sets the value of a padding
-                position to a large negative value for masking. If not given,
-                :attr:`memory_sequence_length` is used to automatically
-                create an attention bias.
-            inputs (optional): Input tensor for teacher forcing decoding, of
-                shape `[batch_size, target_max_time, emb_dim]` containing the
-                target sequence word embeddings.
-                Used when :attr:`decoding_strategy` is set to "train_greedy".
-            decoding_strategy (str): A string specifying the decoding
-                strategy, including "train_greedy", "infer_greedy",
-                "infer_sample".
-                Different arguments are required based on the
-                strategy. See above for details. Ignored if
-                :attr:`beam_width` or :attr:`helper` is set.
-            beam_width (int): Set to use beam search. If given,
-                :attr:`decoding_strategy` is ignored.
-            length_penalty (float): Length penalty coefficient used in beam
-                search decoding. Refer to https://arxiv.org/abs/1609.08144
-                for more details.
-                It should be larger if longer sentences are wanted.
-            start_tokens (optional): An int Tensor of shape `[batch_size]`,
-                containing the start tokens.
-                Used when :attr:`decoding_strategy` = "infer_greedy" or
-                "infer_sample", or :attr:`beam_width` is set.
-                Ignored when `context` is set.
-            end_token (optional): An int 0D Tensor, the token that marks end
-                of decoding.
-                Used when :attr:`decoding_strategy` = "infer_greedy" or
-                "infer_sample", or :attr:`beam_width` is set.
- context (optional): An int Tensor of shape `[batch_size, length]`, - containing the starting tokens for decoding. - If context is set, the start_tokens will be ignored. - context_sequence_length (optional): specify the length of context. - softmax_temperature (optional): A float 0D Tensor, value to divide - the logits by before computing the softmax. Larger values - (above 1.0) result in more random samples. Must > 0. If `None`, - 1.0 is used. - Used when :attr:`decoding_strategy` = "infer_sample"`. - max_decoding_length (optional): An int scalar Tensor indicating - the maximum allowed number of decoding steps. - If `None` (default), use "max_decoding_length" defined in - :attr:`hparams`. Ignored in "train_greedy" decoding. - impute_finished (bool): If `True`, then states for batch - entries which are marked as finished get copied through and - the corresponding outputs get zeroed out. This causes some - slowdown at each time step, but ensures that the final state - and outputs have the correct values and that backprop ignores - time steps that were marked as finished. Ignored in - "train_greedy" decoding. - embedding (optional): Embedding used when - "infer_greedy" or "infer_sample" `decoding_strategy`, or - beam search, is used. This can be - a callable or the `params` argument for - :tf_main:`embedding_lookup `. - If a callable, it can take a vector tensor of token `ids`, - or take two arguments (`ids`, `times`), where `ids` - is a vector tensor of token ids, and `times` is a vector tensor - of time steps (i.e., position ids). The latter case can be used - when attr:`embedding` is a combination of word embedding and - position embedding. - helper (optional): An instance of - :tf_main:`Helper ` that defines the - decoding strategy. If given, :attr:`decoding_strategy` is - ignored. - mode (optional): A tensor taking value in - :tf_main:`tf.estimator.ModeKeys `, including - `TRAIN`, `EVAL`, and `PREDICT`. Controls dropout mode. - If `None` (default), :func:`texar.tf.global_mode` - is used. - - Returns: - - - For **"train_greedy"** decoding, returns an instance of \ - :class:`~texar.tf.modules.TransformerDecoderOutput` which contains\ - `sample_id` and `logits`. - - - For **"infer_greedy"** and **"infer_sample"** decoding or\ - decoding with :attr:`helper`, returns\ - a tuple `(outputs, sequence_lengths)`, where `outputs` is an \ - instance of :class:`~texar.tf.modules.TransformerDecoderOutput` as\ - in "train_greedy", and `sequence_lengths` is a Tensor of shape\ - `[batch_size]` containing the length of each sample. - - - For **beam search** decoding, returns a `dict` containing keys\ - "sample_id" and "log_prob". - - - **"sample_id"** is an int Tensor of shape \ - `[batch_size, max_time, beam_width]` containing generated\ - token indexes. `sample_id[:,:,0]` is the highest-probable \ - sample. - - **"log_prob"** is a float Tensor of shape \ - `[batch_size, beam_width]` containing the log probability \ - of each sequence sample. 
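To make the three decoding call styles documented above concrete, here is a minimal sketch of the interface as described in this (removed) docstring. The data stand-ins (`target_ids`, `enc_outputs`, `src_lengths`) and the id/size constants are hypothetical placeholders, not part of the API:

    import tensorflow as tf
    import texar.tf as tx

    batch_size, src_time, bos_id, eos_id = 8, 12, 1, 2

    # Hypothetical stand-ins for real data and encoder states.
    target_ids = tf.random.uniform([batch_size, src_time], maxval=10000,
                                   dtype=tf.int32)
    enc_outputs = tf.random.uniform([batch_size, src_time, 512])
    src_lengths = tf.fill([batch_size], src_time)

    embedder = tx.modules.WordEmbedder(vocab_size=10000, hparams={'dim': 512})
    decoder = tx.modules.TransformerDecoder(vocab_size=10000)

    # 1. "train_greedy": teacher-forcing on ground-truth embeddings;
    #    returns a TransformerDecoderOutput with `logits` and `sample_id`.
    train_outputs = decoder(
        inputs=embedder(target_ids), memory=enc_outputs,
        memory_sequence_length=src_lengths,
        decoding_strategy='train_greedy')

    # 2. "infer_greedy": feeds generated samples back step by step;
    #    returns a tuple (outputs, sequence_lengths).
    infer_outputs, lengths = decoder(
        memory=enc_outputs, memory_sequence_length=src_lengths,
        decoding_strategy='infer_greedy', embedding=embedder,
        start_tokens=tf.fill([batch_size], bos_id), end_token=eos_id,
        max_decoding_length=50)

    # 3. Beam search: returns a dict, not a tuple.
    bs_outputs = decoder(
        memory=enc_outputs, memory_sequence_length=src_lengths,
        embedding=embedder, beam_width=5, length_penalty=0.6,
        start_tokens=tf.fill([batch_size], bos_id), end_token=eos_id,
        max_decoding_length=50)
    best_ids = bs_outputs['sample_id'][:, :, 0]  # highest-probability beam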
- """ - - if memory is not None: - if memory_attention_bias is None: - if memory_sequence_length is None: - raise ValueError( - "`memory_sequence_length` is required if " - "`memory_attention_bias` is not given.") - - enc_padding = 1 - tf.sequence_mask( - memory_sequence_length, shape_list(memory)[1], - dtype=tf.float32) - memory_attention_bias = attn.attention_bias_ignore_padding( - enc_padding) - - # record the context, which will be used in step function - # for dynamic_decode - if context is not None: - start_tokens = context[:, 0] - self.context = context[:, 1:] - self.context_sequence_length = context_sequence_length - 1 - else: - self.context = None - - self.embedding = embedding - - if helper is None and beam_width is None and \ - decoding_strategy == 'train_greedy': # Teacher-forcing - - decoder_self_attention_bias = ( - attn.attention_bias_lower_triangle( - shape_list(inputs)[1])) - - decoder_output = self._self_attention_stack( - inputs, - memory, - decoder_self_attention_bias=decoder_self_attention_bias, - memory_attention_bias=memory_attention_bias, - cache=None, - mode=mode) - logits = self._output_layer(decoder_output) - preds = tf.cast(tf.argmax(logits, axis=-1), tf.int32) - rets = TransformerDecoderOutput( - logits=logits, - sample_id=preds - ) - - else: - if max_decoding_length is None: - max_decoding_length = self._hparams.max_decoding_length - self.max_decoding_length = max_decoding_length - if beam_width is None: # Inference-like decoding - # Prepare helper - if helper is None: - if decoding_strategy == "infer_greedy": - helper = tx_helper.GreedyEmbeddingHelper( - embedding, start_tokens, end_token) - elif decoding_strategy == "infer_sample": - helper = tx_helper.SampleEmbeddingHelper( - embedding, start_tokens, end_token, - softmax_temperature) - else: - raise ValueError( - "Unknown decoding strategy: {}".format( - decoding_strategy)) - self._helper = helper - - self._cache = self._init_cache(memory, memory_attention_bias, - beam_search_decoding=False) - if context is not None: - self.context = tf.pad( - self.context, - [[0, 0], - [0, max_decoding_length - shape_list(self.context)[1]]] - ) - - outputs, _, sequence_lengths = dynamic_decode( - decoder=self, - impute_finished=impute_finished, - maximum_iterations=max_decoding_length, - output_time_major=False, - scope=self.variable_scope) - - if context is not None: - # Here the length of sample_id will be larger than that - # of logit by 1, because there will be a additional - # start_token in the returned sample_id. 
- # the start_id should be the first token of the - # given context - outputs = TransformerDecoderOutput( - logits=outputs.logits, - sample_id=tf.concat( - [tf.expand_dims(start_tokens, 1), - outputs.sample_id], - axis=1 - ) - ) - sequence_lengths = sequence_lengths + 1 - rets = outputs, sequence_lengths - - else: # Beam-search decoding - # Ignore `decoding_strategy`; Assume `helper` is not set - if helper is not None: - raise ValueError("Must not set 'beam_width' and 'helper' " - "simultaneously.") - _batch_size = shape_list(start_tokens)[0] - self._cache = self._init_cache(memory, memory_attention_bias, - beam_search_decoding=True, - batch_size=_batch_size) - - # The output format is different when running beam search - sample_id, log_prob = self._beam_decode( - start_tokens, - end_token, - beam_width=beam_width, - length_penalty=length_penalty, - decode_length=max_decoding_length, - ) - rets = { - 'sample_id': sample_id, - 'log_prob': log_prob - } - - if not self._built: - self._add_internal_trainable_variables() - self._built = True - - return rets - - def _self_attention_stack(self, - inputs, - memory, - decoder_self_attention_bias=None, - memory_attention_bias=None, - cache=None, - mode=None): - """Stacked multihead attention module. - """ - - def _layer_norm(x, scope): - return layers.layer_normalize(x, reuse=tf.AUTO_REUSE, scope=scope) - - inputs = tf.layers.dropout(inputs, - rate=self._hparams.embedding_dropout, - training=is_train_mode(mode)) - if cache is not None: - if memory is not None: - memory_attention_bias = \ - cache['memory_attention_bias'] - else: - assert decoder_self_attention_bias is not None - - x = inputs - for i in range(self._hparams.num_blocks): - layer_name = 'layer_{}'.format(i) - layer_cache = cache[layer_name] if cache is not None else None - with tf.variable_scope(layer_name) as layer_scope: - with tf.variable_scope("self_attention"): - multihead_attention = \ - self.multihead_attentions['self_att'][i] - selfatt_output = multihead_attention( - queries=_layer_norm(x, layer_scope), - memory=None, - memory_attention_bias=decoder_self_attention_bias, - cache=layer_cache, - mode=mode, - ) - x = x + tf.layers.dropout( - selfatt_output, - rate=self._hparams.residual_dropout, - training=is_train_mode(mode), - ) - if memory is not None: - with tf.variable_scope('encdec_attention') as \ - encdec_attention_scope: - multihead_attention = \ - self.multihead_attentions['encdec_att'][i] - encdec_output = multihead_attention( - queries=_layer_norm(x, encdec_attention_scope), - memory=memory, - memory_attention_bias=memory_attention_bias, - mode=mode, - ) - x = x + tf.layers.dropout( - encdec_output, - rate=self._hparams.residual_dropout, - training=is_train_mode(mode)) - poswise_network = self.poswise_networks[i] - with tf.variable_scope('past_poswise_ln') as \ - past_poswise_ln_scope: - sub_output = tf.layers.dropout( - poswise_network(_layer_norm(x, past_poswise_ln_scope)), - rate=self._hparams.residual_dropout, - training=is_train_mode(mode), - ) - x = x + sub_output - - return _layer_norm(x, scope=self.variable_scope) - - def _init_cache(self, memory, memory_attention_bias, beam_search_decoding, - batch_size=None): - """Returns an initialized cache. - - In order to support both inference-like decoding and beam-search - decoding, the elements of each layer must be initialized and extended - as different structure respectively. 
Specifically, when inference-like - decoding, tf.TensorArray is used, which satisfies the shape consistency - check in the while-loop in tf.contrib.seq2seq.dynamic_decode. When - beam-search decoding, a tf.Tensor of shape - `[batch_size, current_steps, num_units]` is maintained, where - `current_steps` is the number of steps currently decoded. - """ - if batch_size is None: - batch_size = self.batch_size - - def _shape(batch_size, from_shape): - if (not isinstance(from_shape, tf.TensorShape) or - from_shape.ndims == 0): - return tf.TensorShape(None) - - batch_size = tf.contrib.util.constant_value( - tf.convert_to_tensor( - batch_size, name="batch_size")) - return tf.TensorShape([batch_size]).concatenate(from_shape) - - def _create_ta(s, d): - return tf.TensorArray( - dtype=d, - size=0, - dynamic_size=True, - clear_after_read=False, - element_shape=_shape(batch_size, s)) - - def _create_empty_tensor(s, d): - return tf.zeros( - [batch_size, 0] + s.as_list(), - dtype=d) - - _create_fn = _create_empty_tensor if beam_search_decoding else \ - _create_ta - - s = tf.TensorShape([self._hparams.multihead_attention.num_units]) - - if memory is not None: - cache = { - 'memory': memory, - 'memory_attention_bias': memory_attention_bias, - } - for l in range(self._hparams.num_blocks): - cache['layer_{}'.format(l)] = { - 'self_keys': _create_fn(s, tf.float32), - 'self_values': _create_fn(s, tf.float32), - 'memory_keys': _create_fn(s, tf.float32), - 'memory_values': _create_fn(s, tf.float32), - } - else: - cache = {} - for l in range(self._hparams.num_blocks): - cache['layer_{}'.format(l)] = { - 'self_keys': _create_fn(s, tf.float32), - 'self_values': _create_fn(s, tf.float32), - } - - return cache - - def _beam_decode(self, - start_tokens, - end_token, - decode_length, - beam_width, - length_penalty): - def _symbols_to_logits_fn(ids, step, cache): - return self._input_ids_to_outputs( - ids[:, -1], step, cache) - - outputs, log_prob = beam_search.beam_search( - _symbols_to_logits_fn, - start_tokens, - beam_width, - decode_length, - self._vocab_size, - length_penalty, - eos_id=end_token, - states=self._cache) - - # Ignores - outputs = outputs[:, :, 1:] - # shape = [batch_size, seq_length, beam_width] - outputs = tf.transpose(outputs, [0, 2, 1]) - return (outputs, log_prob) - - @property - def batch_size(self): - return self._helper.batch_size - - @property - def output_size(self): - """Output size of one step. - """ - return TransformerDecoderOutput( - logits=tf.TensorShape([self._vocab_size]), - sample_id=self._helper.sample_ids_shape) - - @property - def output_dtype(self): - """Types of output of one step. - """ - return TransformerDecoderOutput( - logits=tf.float32, - sample_id=self._helper.sample_ids_dtype) - - def initialize(self, name=None): - """Called before any decoding iterations. - - This methods computes initial input values and initial state - (i.e. cache). - - Args: - name: Name scope for any created operations. - - Returns: - `(finished, initial_inputs, initial_state)`, representing - initial values of `finished` flags, inputs and state (i.e. cache). - """ - return self._helper.initialize() + (self._cache,) - - def step(self, time, inputs, state, name=None): - """Called per step of decoding. - - Args: - time: Scalar `int32` tensor. Current step number. - inputs: Input tensor for this time step. - state: State (i.e. cache) from previous time step. - name: Name scope for any created operations. - - Returns: - `(outputs, next_state, next_inputs, finished)`. 
`outputs` is an - object containing the decoder output, `next_state` is the state - (i.e. cache), `next_inputs` is the tensor that should be used - as input for the next step, `finished` is a boolean tensor telling - whether the sequence is complete, for each sequence in the batch. - """ - - outputs, state = self._inputs_to_outputs(inputs, state) - sample_ids = self._helper.sample( - time=time, outputs=outputs, state=state) - if self.context is not None: - _times = tf.ones([self.batch_size], dtype=tf.int32) * time - sample_ids = tf.where( - self.context_sequence_length > _times, - self.context[:, time], - sample_ids - ) - - wrapper_outputs = TransformerDecoderOutput( - logits=outputs, - sample_id=sample_ids) - return (wrapper_outputs, state) - - def next_inputs(self, time, outputs, state): - (finished, next_inputs, state) = self._helper.next_inputs( - time=time, - outputs=outputs.logits, - state=state, - sample_ids=outputs.sample_id) - return (finished, next_inputs, state) - - def finalize(self, outputs, final_state, sequence_lengths): - return outputs, final_state - - @property - def vocab_size(self): - """The vocab size. - """ - return self._vocab_size diff --git a/texar/tf/modules/decoders/transformer_decoders_test.py b/texar/tf/modules/decoders/transformer_decoders_test.py deleted file mode 100644 index b1e2fad8..00000000 --- a/texar/tf/modules/decoders/transformer_decoders_test.py +++ /dev/null @@ -1,229 +0,0 @@ -# -""" -Unit tests for Transformer decodre. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import tensorflow as tf - -from texar.tf.modules.decoders.transformer_decoders import TransformerDecoder -from texar.tf.modules.decoders.transformer_decoders import TransformerDecoderOutput -from texar.tf.modules.decoders import tf_helpers as tx_helper - -# pylint: disable=too-many-instance-attributes - - -class TransformerDecoderTest(tf.test.TestCase): - """Tests :class:`~texar.tf.modules.TransformerDecoder` - """ - - def setUp(self): - tf.test.TestCase.setUp(self) - self._vocab_size = 15 - self._batch_size = 2 - self._max_time = 10 - self._emb_dim = 512 - self._max_decode_len = 16 - self._inputs = tf.random_uniform( - [self._batch_size, self._max_time, self._emb_dim], - maxval=1, dtype=tf.float32) - - self._memory = tf.random_uniform( - [self._batch_size, self._max_time, self._emb_dim], - maxval=1, dtype=tf.float32) - self._memory_sequence_length = tf.random_uniform( - [self._batch_size], maxval=self._max_time, dtype=tf.int32) - - self._embedding = tf.random_uniform( - [self._vocab_size, self._emb_dim], maxval=1, dtype=tf.float32) - self._pos_embedding = tf.random_uniform( - [self._max_decode_len, self._emb_dim], maxval=1, dtype=tf.float32) - - def _embedding_fn(x, y): - x_emb = tf.nn.embedding_lookup(self._embedding, x) - y_emb = tf.nn.embedding_lookup(self._pos_embedding, y) - return x_emb * self._emb_dim ** 0.5 + y_emb - self._embedding_fn = _embedding_fn - - self._output_layer = tf.random_uniform( - [self._emb_dim, self._vocab_size], maxval=1, dtype=tf.float32) - - self._start_tokens = tf.fill([self._batch_size], 1) - self._end_token = 2 - self.max_decoding_length = self._max_time - - _context = [[3, 4, 5, 2, 0], [4, 3, 5, 7, 2]] - _context_length = [4, 5] - self._context = tf.Variable(_context) - self._context_length = tf.Variable(_context_length) - - def test_output_layer(self): - decoder = TransformerDecoder(vocab_size=self._vocab_size, - output_layer=None) - 
self.assertIsInstance(decoder, TransformerDecoder) - - decoder = TransformerDecoder(output_layer=tf.identity) - self.assertIsInstance(decoder, TransformerDecoder) - - tensor = tf.random_uniform( - [self._emb_dim, self._vocab_size], maxval=1, dtype=tf.float32 - ) - decoder = TransformerDecoder(output_layer=tensor) - self.assertIsInstance(decoder, TransformerDecoder) - self.assertEqual(decoder.vocab_size, self._vocab_size) - - def test_decode_train(self): - """Tests train_greedy - """ - decoder = TransformerDecoder( - vocab_size=self._vocab_size, - output_layer=self._output_layer - ) - # 6 blocks - # -self multihead_attention: 4 dense without bias + 2 layer norm vars - # -encdec multihead_attention: 4 dense without bias + 2 layer norm vars - # -poswise_network: Dense with bias, Dense with bias + 2 layer norm vars - # 2 layer norm vars - outputs = decoder(memory=self._memory, - memory_sequence_length=self._memory_sequence_length, - memory_attention_bias=None, - inputs=self._inputs, - decoding_strategy='train_greedy', - mode=tf.estimator.ModeKeys.TRAIN) - self.assertEqual(len(decoder.trainable_variables), 110) - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_ = sess.run(outputs) - - self.assertIsInstance(outputs_, TransformerDecoderOutput) - - def test_decode_infer_greedy(self): - """Tests train_greedy - """ - decoder = TransformerDecoder( - vocab_size=self._vocab_size, - output_layer=self._output_layer - ) - helper = tx_helper.GreedyEmbeddingHelper( - self._embedding_fn, self._start_tokens, self._end_token) - - outputs, length = decoder( - memory=self._memory, - memory_sequence_length=self._memory_sequence_length, - memory_attention_bias=None, - inputs=None, - helper=helper, - max_decoding_length=self._max_decode_len, - mode=tf.estimator.ModeKeys.PREDICT) - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_ = sess.run(outputs) - self.assertIsInstance(outputs_, TransformerDecoderOutput) - - def test_infer_greedy_with_context_without_memory(self): - """Tests train_greedy with context - """ - decoder = TransformerDecoder( - vocab_size=self._vocab_size, - output_layer=self._output_layer - ) - helper = tx_helper.GreedyEmbeddingHelper( - self._embedding_fn, self._start_tokens, self._end_token) - - outputs, length = decoder( - memory=None, - memory_sequence_length=None, - memory_attention_bias=None, - inputs=None, - decoding_strategy='infer_greedy', - helper=helper, - context=self._context, - context_sequence_length=self._context_length, - end_token=self._end_token, - max_decoding_length=self._max_decode_len, - mode=tf.estimator.ModeKeys.PREDICT) - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_ = sess.run(outputs) - self.assertIsInstance(outputs_, TransformerDecoderOutput) - - def test_decode_infer_sample(self): - """Tests infer_sample - """ - decoder = TransformerDecoder( - vocab_size=self._vocab_size, - output_layer=self._output_layer - ) - helper = tx_helper.SampleEmbeddingHelper( - self._embedding_fn, self._start_tokens, self._end_token) - - outputs, length = decoder( - memory=self._memory, - memory_sequence_length=self._memory_sequence_length, - memory_attention_bias=None, - inputs=None, - helper=helper, - max_decoding_length=self._max_decode_len, - mode=tf.estimator.ModeKeys.PREDICT) - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_ = sess.run(outputs) - self.assertIsInstance(outputs_, TransformerDecoderOutput) - - def 
test_beam_search(self): - """Tests beam_search - """ - decoder = TransformerDecoder( - vocab_size=self._vocab_size, - output_layer=self._output_layer - ) - - outputs = decoder( - memory=self._memory, - memory_sequence_length=self._memory_sequence_length, - memory_attention_bias=None, - inputs=None, - embedding=self._embedding_fn, - beam_width=5, - start_tokens=self._start_tokens, - end_token=self._end_token, - max_decoding_length=self._max_decode_len, - mode=tf.estimator.ModeKeys.PREDICT - ) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_ = sess.run(outputs) - self.assertEqual(outputs_['log_prob'].shape, - (self._batch_size, 5)) - self.assertEqual(outputs_['sample_id'].shape, - (self._batch_size, self._max_decode_len, 5)) - - def test_greedy_embedding_helper(self): - """Tests with tf.contrib.seq2seq.GreedyEmbeddingHelper - """ - decoder = TransformerDecoder( - vocab_size=self._vocab_size, - output_layer=self._output_layer - ) - helper = tx_helper.GreedyEmbeddingHelper( - self._embedding, self._start_tokens, self._end_token) - outputs, length = decoder( - memory=self._memory, - memory_sequence_length=self._memory_sequence_length, - memory_attention_bias=None, - helper=helper, - max_decoding_length=self._max_decode_len, - mode=tf.estimator.ModeKeys.PREDICT) - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_ = sess.run(outputs) - self.assertIsInstance(outputs_, TransformerDecoderOutput) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/modules/embedders/__init__.py b/texar/tf/modules/embedders/__init__.py index 6dc0b6de..511c4957 100644 --- a/texar/tf/modules/embedders/__init__.py +++ b/texar/tf/modules/embedders/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,8 +15,6 @@ Modules of Texar library embedders. """ -# pylint: disable=wildcard-import - from texar.tf.modules.embedders.embedder_base import * from texar.tf.modules.embedders.embedders import * from texar.tf.modules.embedders.position_embedders import * diff --git a/texar/tf/modules/embedders/embedder_base.py b/texar/tf/modules/embedders/embedder_base.py index 86591d5e..5bb113f3 100644 --- a/texar/tf/modules/embedders/embedder_base.py +++ b/texar/tf/modules/embedders/embedder_base.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,11 +17,9 @@ import tensorflow as tf +from texar.tf.core import layers from texar.tf.module_base import ModuleBase -from texar.tf.modules.embedders import embedder_utils -from texar.tf.utils.shapes import shape_list -# pylint: disable=invalid-name __all__ = [ "EmbedderBase" @@ -34,35 +32,68 @@ class EmbedderBase(ModuleBase): Args: num_embeds (int, optional): The number of embedding elements, e.g., the vocabulary size of a word embedder. + init_value (optional): A Tensor or numpy array that contains the + initial value of embeddings. It is typically of shape + ``[vocab_size] + embedding-dim``. Embedding can have dimensionality + > 1. + + If `None`, embedding is initialized as specified in + ``hparams["initializer"]``. 
Otherwise, the
+            ``"initializer"`` and ``"dim"`` hyperparameters in
+            :attr:`hparams` are ignored.
         hparams (dict or HParams, optional): Embedder hyperparameters. Missing
-            hyperparamerter will be set to default values. See
+            hyperparameters will be set to default values. See
             :meth:`default_hparams` for the hyperparameter structure and
             default values.
     """

-    def __init__(self, num_embeds=None, hparams=None):
-        ModuleBase.__init__(self, hparams)
+    def __init__(self, num_embeds=None, init_value=None, hparams=None):
+        super().__init__(hparams=hparams)
         self._num_embeds = num_embeds
+        self._init_value = init_value

     # pylint: disable=attribute-defined-outside-init
-    def _init_parameterized_embedding(self, init_value, num_embeds, hparams):
-        self._embedding = embedder_utils.get_embedding(
-            hparams, init_value, num_embeds, self.variable_scope)
-        if hparams.trainable:
-            self._add_trainable_variable(self._embedding)
+    def build(self, input_shape):
+        r"""Build embedding layer.
+        """
+        with tf.name_scope('Embedding'):
+            regularizer = layers.get_regularizer(self.hparams["regularizer"])
+            if self._init_value is None:
+                initializer = layers.get_initializer(
+                    getattr(self.hparams, "initializer", None))
+                dim = self.hparams["dim"]
+                if not isinstance(self.hparams["dim"], (list, tuple)):
+                    dim = [dim]
+                if not initializer:
+                    initializer = tf.initializers.GlorotUniform()
+                self._embedding = self.add_weight(
+                    name='w',
+                    shape=[self._num_embeds] + dim,
+                    initializer=initializer,
+                    regularizer=regularizer,
+                    trainable=self.hparams["trainable"]
+                )
+            else:
+                init_value = tf.cast(self._init_value, tf.float32)
+                self._embedding = tf.Variable(
+                    name='w',
+                    initial_value=init_value,
+                    trainable=self.hparams["trainable"])

-        self._num_embeds = shape_list(self._embedding)[0]
+            self._num_embeds = self._embedding.shape[0]

-        self._dim = shape_list(self._embedding)[1:]
-        self._dim_rank = len(self._dim)
-        if self._dim_rank == 1:
-            self._dim = self._dim[0]
+            self._dim = self._embedding.shape[1:].as_list()
+            self._dim_rank = len(self._dim)
+            if self._dim_rank == 1:
+                self._dim = self._dim[0]
+
+        super().build(input_shape)

     def _get_dropout_layer(self, hparams, ids_rank=None, dropout_input=None,
                            dropout_strategy=None):
         r"""Creates dropout layer according to dropout strategy.

-        Called in :meth:`_build`.
+        Called in :meth:`call`.
         """
         dropout_layer = None

@@ -75,14 +106,14 @@ def _get_dropout_layer(self, hparams, ids_rank=None, dropout_input=None,
         elif st == 'item':
             assert dropout_input is not None
             assert ids_rank is not None
-            noise_shape = (shape_list(dropout_input)[:ids_rank]
+            noise_shape = (dropout_input.shape[:ids_rank]
                            + [1] * self._dim_rank)
         elif st == 'item_type':
-            noise_shape = [None] + [1] * self._dim_rank  # type: ignore
+            noise_shape = [None] + [1] * self._dim_rank
         else:
             raise ValueError('Unknown dropout strategy: {}'.format(st))

-        dropout_layer = tf.layers.Dropout(
+        dropout_layer = tf.keras.layers.Dropout(
             rate=hparams.dropout_rate, noise_shape=noise_shape)

         return dropout_layer
@@ -92,7 +123,6 @@ def default_hparams():
         r"""Returns a dictionary of hyperparameters with default values.

         .. code-block:: python
-
             {
                 "name": "embedder"
             }
diff --git a/texar/tf/modules/embedders/embedder_utils.py b/texar/tf/modules/embedders/embedder_utils.py
index 3ce13182..f75b5cdd 100644
--- a/texar/tf/modules/embedders/embedder_utils.py
+++ b/texar/tf/modules/embedders/embedder_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2018 The Texar Authors. All Rights Reserved.
+# Copyright 2019 The Texar Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,8 +16,8 @@

 import tensorflow as tf

-from texar.tf.hyperparams import HParams
 from texar.tf.core import layers
+from texar.tf.hyperparams import HParams

 __all__ = [
     "default_embedding_hparams",
@@ -63,7 +63,6 @@ def default_embedding_hparams():
             example is as

             .. code-block:: python
-
                 {
                     "type": "random_uniform_initializer",
                     "kwargs": {
@@ -85,8 +84,7 @@ def default_embedding_hparams():
               :tf_main:`tf.initializers `, e.g.,
               :tf_main:`random_uniform ` (a.k.a
               :class:`tf.random_uniform_initializer`), or
-              in :mod:`tf`, e.g., :tf_main:`glorot_uniform_initializer
-              `, or in
+              in :mod:`tf`, or in
               :tf_main:`tf.keras.initializers `.
             - User-defined initializer in :mod:`texar.tf.custom`.
             - External initializer. Must provide the full path,
@@ -163,13 +161,19 @@ class can be
     }


-def get_embedding(hparams=None,
+def get_embedding(num_embeds=None,
                   init_value=None,
-                  num_embeds=None,
-                  variable_scope='Embedding'):
+                  hparams=None,
+                  name_scope='Embedding'):
     r"""Creates embedding variable if not exists.

     Args:
+        num_embeds (int, optional): The number of embedding items
+            (e.g., vocabulary size). Required if :attr:`init_value` is
+            not provided.
+        init_value (Tensor or numpy array, optional): Initial values of the
+            embedding variable. If not given, embedding is initialized as
+            specified in :attr:`hparams["initializer"]`.
         hparams (dict or HParams, optional): Embedding hyperparameters. Missing
             hyperparameters are set to default values. See
             :func:`~texar.tf.modules.default_embedding_hparams`
@@ -177,39 +181,38 @@
             If :attr:`init_value` is given, :attr:`hparams["initializer"]`,
             and :attr:`hparams["dim"]` are ignored.
-        init_value (Tensor or numpy array, optional): Initial values of the
-            embedding variable. If not given, embedding is initialized as
-            specified in :attr:`hparams["initializer"]`.
-        num_embeds (int, optional): The number of embedding items
-            (e.g., vocabulary size). Required if :attr:`init_value` is
-            not provided.
-        variable_scope (str or VariableScope, optional): Variable scope of
-            the embedding variable.
+        name_scope (str, optional): Name scope of the embedding variable.

     Returns:
-        Variable or Tensor: A 2D `Variable` or `Tensor` of the same shape with
-        :attr:`init_value` or of the shape ``[num_embeds, hparams["dim"]]``.
+        A 2D `Variable` of the same shape as :attr:`init_value` or of
+        the shape ``[num_embeds, hparams["dim"]]``.
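A short usage sketch of the reordered signature, assuming the default hyperparameters exercised by the updated tests (the `[100, 64]` initial value is a hypothetical example):

    import numpy as np
    from texar.tf.modules.embedders import embedder_utils

    # Created from hparams: a [100, dim] variable, with dim taken from
    # default_embedding_hparams() when no hparams are passed.
    emb = embedder_utils.get_embedding(num_embeds=100)

    # Created from an initial value: "initializer" and "dim" in hparams
    # are ignored, and the variable takes init_value's shape, [100, 64].
    init = np.random.uniform(-0.1, 0.1, (100, 64)).astype(np.float32)
    emb_2 = embedder_utils.get_embedding(init_value=init)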
""" - with tf.variable_scope(variable_scope): + with tf.name_scope(name_scope): if hparams is None or isinstance(hparams, dict): hparams = HParams(hparams, default_embedding_hparams()) - regularizer = layers.get_regularizer(hparams["regularizer"]) + # TODO: Add regularizer if init_value is None: - initializer = layers.get_initializer(hparams["initializer"]) + initializer = layers.get_initializer( + getattr(hparams, "initializer", None)) dim = hparams["dim"] if not isinstance(hparams["dim"], (list, tuple)): dim = [dim] - embedding = tf.get_variable(name='w', - shape=[num_embeds] + dim, - initializer=initializer, - regularizer=regularizer, - trainable=hparams["trainable"]) + if initializer: + embedding = tf.Variable( + name='w', + initial_value=initializer([num_embeds] + dim), + trainable=hparams["trainable"]) + else: + initializer = tf.initializers.GlorotUniform() + embedding = tf.Variable( + name='w', + initial_value=initializer([num_embeds] + dim), + trainable=hparams["trainable"]) else: init_value = tf.cast(init_value, tf.float32) - embedding = tf.get_variable(name='w', - initializer=init_value, - regularizer=regularizer, - trainable=hparams["trainable"]) + embedding = tf.Variable(name='w', + initial_value=init_value, + trainable=hparams["trainable"]) return embedding @@ -229,8 +232,8 @@ def soft_embedding_lookup(embedding, soft_ids): Returns: A Tensor of shape ``shape(soft_ids)[:-1] + shape(embedding)[1:]``. For example, if ``shape(soft_ids) = [batch_size, max_time, vocab_size]`` - and ``shape(embedding) = [vocab_size, emb_dim]``, then the return tensor - has shape ``[batch_size, max_time, emb_dim]``. + and ``shape(embedding) = [vocab_size, emb_dim]``, then the returned + tensor has shape ``[batch_size, max_time, emb_dim]``. Example:: diff --git a/texar/tf/modules/embedders/embedder_utils_test.py b/texar/tf/modules/embedders/embedder_utils_test.py index 9c28585c..21edbb59 100644 --- a/texar/tf/modules/embedders/embedder_utils_test.py +++ b/texar/tf/modules/embedders/embedder_utils_test.py @@ -15,23 +15,19 @@ def test_get_embedding(self): """ vocab_size = 100 emb = embedder_utils.get_embedding(num_embeds=vocab_size) - self.assertEqual(emb.shape[0].value, vocab_size) - self.assertEqual(emb.shape[1].value, + self.assertEqual(emb.shape[0], vocab_size) + self.assertEqual(emb.shape[1], embedder_utils.default_embedding_hparams()["dim"]) hparams = { "initializer": { "type": tf.random_uniform_initializer(minval=-0.1, maxval=0.1) }, - "regularizer": { - "type": tf.keras.regularizers.L1L2(0.1, 0.1) - } } emb = embedder_utils.get_embedding( - hparams=hparams, num_embeds=vocab_size, - variable_scope='embedding_2') - self.assertEqual(emb.shape[0].value, vocab_size) - self.assertEqual(emb.shape[1].value, + hparams=hparams, num_embeds=vocab_size) + self.assertEqual(emb.shape[0], vocab_size) + self.assertEqual(emb.shape[1], embedder_utils.default_embedding_hparams()["dim"]) diff --git a/texar/tf/modules/embedders/embedders.py b/texar/tf/modules/embedders/embedders.py index 91509c1b..3b66a176 100644 --- a/texar/tf/modules/embedders/embedders.py +++ b/texar/tf/modules/embedders/embedders.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,8 +17,8 @@

 import tensorflow as tf

-from texar.tf.modules.embedders.embedder_base import EmbedderBase
 from texar.tf.modules.embedders import embedder_utils
+from texar.tf.modules.embedders.embedder_base import EmbedderBase
 from texar.tf.utils.mode import is_train_mode
 from texar.tf.utils.shapes import get_rank

@@ -51,7 +51,7 @@ class WordEmbedder(EmbedderBase):
         :meth:`default_hparams` for the hyperparameter structure and
         default values.

-    See :meth:`_build` for the inputs and outputs of the embedder.
+    See :meth:`__call__` for the inputs and outputs of the embedder.

     Example:

@@ -84,31 +84,24 @@ class WordEmbedder(EmbedderBase):
         # Use pre-trained embedding
         embedder_2 = WordEmbedder(init_value=data.embedding_init_value)
         emb_2 = embedder_2(batch['text_ids'])
-
-
-    .. document private functions
-    .. automethod:: _build
     """

     def __init__(self, init_value=None, vocab_size=None, hparams=None):
-        EmbedderBase.__init__(self, hparams=hparams)
         if init_value is None and vocab_size is None:
             raise ValueError(
                 "Either `init_value` or `vocab_size` is required.")

-        self._init_parameterized_embedding(init_value, vocab_size,
-                                           self._hparams)
+        if init_value is not None and vocab_size is not None:
+            if init_value.shape[0] != vocab_size:
+                raise ValueError(
+                    'vocab_size must equal init_value.shape[0]. '
+                    'Got %d and %d' % (vocab_size, init_value.shape[0]))

-        self._vocab_size = vocab_size
-        if vocab_size is None:
-            self._vocab_size = self._num_embeds
-        if self._vocab_size != self._num_embeds:
-            raise ValueError(
-                'vocab_size must equal to init_value.shape[0].'
-                'Got %d and %d' % (self._vocab_size, self._num_embeds))
+        super().__init__(init_value=init_value,
+                         num_embeds=vocab_size, hparams=hparams)

-        self._built = True
+        self._vocab_size = vocab_size

     @staticmethod
     def default_hparams():
@@ -184,7 +177,8 @@ def default_hparams():
         hparams["name"] = "word_embedder"
         return hparams

-    def _build(self, ids=None, soft_ids=None, mode=None, **kwargs):
+    # pylint: disable=useless-super-delegation
+    def __call__(self, ids=None, soft_ids=None, mode=None, **kwargs):
         r"""Embeds (soft) ids.

         Either :attr:`ids` or :attr:`soft_ids` must be given, and they
@@ -196,8 +190,7 @@ def _build(self, ids=None, soft_ids=None, mode=None, **kwargs):
                 mix the embedding vectors.
             mode (optional): A tensor taking value in
                 :tf_main:`tf.estimator.ModeKeys `, including
-                `TRAIN`, `EVAL`, and `PREDICT`. If `None`, dropout is
-                controlled by :func:`texar.tf.global_mode`.
+                `TRAIN`, `EVAL`, and `PREDICT`.
             kwargs: Additional keyword arguments for
                 :tf_main:`tf.nn.embedding_lookup ` besides
                 :attr:`params` and :attr:`ids`.
@@ -215,6 +208,12 @@ def _build(self, ids=None, soft_ids=None, mode=None, **kwargs):
             and ``shape(embedding) = [vocab_size, emb_dim]``, then the return
             tensor has shape ``[batch_size, max_time, emb_dim]``.
         """
+        return super().__call__([ids, soft_ids], mode, **kwargs)
+
+    def call(self, inputs, mode, **kwargs):
+        r"""Embeds (soft) ids.
+ """ + ids, soft_ids = inputs if ids is not None: if soft_ids is not None: raise ValueError( @@ -231,8 +230,8 @@ def _build(self, ids=None, soft_ids=None, mode=None, **kwargs): if self._hparams.dropout_strategy == 'item_type': dropout_layer = self._get_dropout_layer(self._hparams) if dropout_layer: - embedding = dropout_layer.apply(inputs=embedding, - training=is_training) + embedding = dropout_layer(inputs=embedding, + training=is_training) if ids is not None: outputs = tf.nn.embedding_lookup(embedding, ids, **kwargs) @@ -243,8 +242,7 @@ def _build(self, ids=None, soft_ids=None, mode=None, **kwargs): dropout_layer = self._get_dropout_layer( self._hparams, ids_rank=ids_rank, dropout_input=outputs) if dropout_layer: - outputs = dropout_layer.apply( - inputs=outputs, training=is_training) + outputs = dropout_layer(inputs=outputs, training=is_training) return outputs @@ -264,4 +262,7 @@ def dim(self): def vocab_size(self): r"""The vocabulary size. """ - return self._vocab_size + if self._vocab_size: + return self._vocab_size + else: + return self._num_embeds diff --git a/texar/tf/modules/embedders/embedders_test.py b/texar/tf/modules/embedders/embedders_test.py index af5ca29f..36139ecd 100644 --- a/texar/tf/modules/embedders/embedders_test.py +++ b/texar/tf/modules/embedders/embedders_test.py @@ -7,8 +7,6 @@ import tensorflow as tf from texar.tf.modules.embedders.embedders import WordEmbedder -from texar.tf.modules.embedders.position_embedders import PositionEmbedder -from texar.tf.context import global_mode class EmbedderTest(tf.test.TestCase): @@ -41,54 +39,8 @@ def _test_word_embedder(self, hparams): self.assertEqual(embedder.vocab_size, 100) self.assertEqual(len(embedder.trainable_variables), 1) - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_, outputs_soft_ = sess.run( - [outputs, outputs_soft], - feed_dict={global_mode(): tf.estimator.ModeKeys.TRAIN}) - self.assertEqual(outputs_.shape, (64, 16) + tuple(emb_dim)) - self.assertEqual(outputs_soft_.shape, (64, 16) + tuple(emb_dim)) - - # Tests unknown input shapes - inputs = tf.placeholder(dtype=tf.int64, shape=[None, None]) - outputs = embedder(inputs) - self.assertEqual(len(outputs.get_shape()), 2 + len(hparams_dim)) - - inputs_soft = tf.placeholder(dtype=tf.int64, shape=[None, None, None]) - outputs_soft = embedder(soft_ids=inputs_soft) - self.assertEqual(len(outputs_soft.get_shape()), 2 + len(hparams_dim)) - - def _test_position_embedder(self, hparams): - """Tests :class:`texar.tf.modules.PositionEmbedder`. 
- """ - pos_size = 100 - embedder = PositionEmbedder( - position_size=pos_size, hparams=hparams) - inputs = tf.ones([64, 16], dtype=tf.int32) - outputs = embedder(inputs) - - emb_dim = embedder.dim - if not isinstance(emb_dim, (list, tuple)): - emb_dim = [emb_dim] - - hparams_dim = hparams["dim"] - if not isinstance(hparams["dim"], (list, tuple)): - hparams_dim = [hparams["dim"]] - - self.assertEqual(outputs.shape, [64, 16] + emb_dim) - self.assertEqual(emb_dim, hparams_dim) - self.assertEqual(embedder.position_size, 100) - self.assertEqual(len(embedder.trainable_variables), 1) - - seq_length = tf.random_uniform([64], maxval=pos_size, dtype=tf.int32) - outputs = embedder(sequence_length=seq_length) - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_, max_seq_length = sess.run( - [outputs, tf.reduce_max(seq_length)], - feed_dict={global_mode(): tf.estimator.ModeKeys.TRAIN}) - self.assertEqual(outputs_.shape, - (64, max_seq_length) + tuple(emb_dim)) + self.assertEqual(outputs.shape, (64, 16) + tuple(emb_dim)) + self.assertEqual(outputs_soft.shape, (64, 16) + tuple(emb_dim)) def test_embedder(self): """Tests various embedders. @@ -96,59 +48,47 @@ def test_embedder(self): # no dropout hparams = {"dim": 1024, "dropout_rate": 0} self._test_word_embedder(hparams) - self._test_position_embedder(hparams) hparams = {"dim": [1024], "dropout_rate": 0} self._test_word_embedder(hparams) - self._test_position_embedder(hparams) hparams = {"dim": [1024, 10], "dropout_rate": 0} self._test_word_embedder(hparams) - self._test_position_embedder(hparams) # dropout with default strategy hparams = {"dim": 1024, "dropout_rate": 0.3} self._test_word_embedder(hparams) - self._test_position_embedder(hparams) hparams = {"dim": [1024], "dropout_rate": 0.3} self._test_word_embedder(hparams) - self._test_position_embedder(hparams) hparams = {"dim": [1024, 10], "dropout_rate": 0.3} self._test_word_embedder(hparams) - self._test_position_embedder(hparams) # dropout with different strategies hparams = {"dim": 1024, "dropout_rate": 0.3, "dropout_strategy": "item"} self._test_word_embedder(hparams) - self._test_position_embedder(hparams) hparams = {"dim": [1024], "dropout_rate": 0.3, "dropout_strategy": "item"} self._test_word_embedder(hparams) - self._test_position_embedder(hparams) hparams = {"dim": [1024, 10], "dropout_rate": 0.3, "dropout_strategy": "item"} self._test_word_embedder(hparams) - self._test_position_embedder(hparams) hparams = {"dim": 1024, "dropout_rate": 0.3, "dropout_strategy": "item_type"} self._test_word_embedder(hparams) - self._test_position_embedder(hparams) hparams = {"dim": [1024], "dropout_rate": 0.3, "dropout_strategy": "item_type"} self._test_word_embedder(hparams) - self._test_position_embedder(hparams) hparams = {"dim": [1024, 10], "dropout_rate": 0.3, "dropout_strategy": "item_type"} self._test_word_embedder(hparams) - self._test_position_embedder(hparams) def test_embedder_multi_calls(self): """Tests embedders called by multiple times. 
@@ -183,13 +123,10 @@ def test_word_embedder_soft_ids(self):
         ids = np.array([3])
         soft_ids = np.array([[0, 0, 0, 1, 0]])

-        outputs = embedder(ids=ids)
+        outputs = embedder(ids)
         soft_outputs = embedder(soft_ids=soft_ids)

-        with self.test_session() as sess:
-            sess.run(tf.global_variables_initializer())
-            outputs_, soft_outputs_ = sess.run([outputs, soft_outputs])
-            self.assertEqual(outputs_, soft_outputs_)
+        self.assertEqual(outputs, soft_outputs)


 if __name__ == "__main__":
diff --git a/texar/tf/modules/embedders/position_embedders.py b/texar/tf/modules/embedders/position_embedders.py
index 0ae9c64a..4a3cff98 100644
--- a/texar/tf/modules/embedders/position_embedders.py
+++ b/texar/tf/modules/embedders/position_embedders.py
@@ -23,13 +23,11 @@
 from texar.tf.modules.embedders import embedder_utils
 from texar.tf.utils.mode import is_train_mode
 from texar.tf.utils.shapes import mask_sequences
-from texar.tf.utils.shapes import shape_list

-# pylint: disable=arguments-differ, invalid-name

 __all__ = [
     "PositionEmbedder",
-    "SinusoidsPositionEmbedder"
+    "SinusoidsPositionEmbedder",
 ]

@@ -44,7 +42,6 @@ class PositionEmbedder(EmbedderBase):
         init_value (optional): A Tensor or numpy array that contains the
             initial value of embeddings. It is typically of shape
             ``[position_size, embedding dim]``
-
             If `None`, embedding is initialized as specified in
             ``hparams["initializer"]``. Otherwise, the ``"initializer"`` and ``"dim"``
@@ -55,34 +52,26 @@ class PositionEmbedder(EmbedderBase):
         hparams (dict, optional): Embedder hyperparameters. If it is not
             specified, the default hyperparameter setting is used. See
             :attr:`default_hparams` for the structure and default values.
-
-
-    .. document private functions
-    .. automethod:: _build
     """

     def __init__(self, init_value=None, position_size=None, hparams=None):
-        EmbedderBase.__init__(self, hparams=hparams)
         if init_value is None and position_size is None:
             raise ValueError(
                 "Either `init_value` or `position_size` is required."
             )

-        self._init_parameterized_embedding(
-            init_value, position_size, self._hparams
-        )
+        if init_value is not None and position_size is not None:
+            if init_value.shape[0] != position_size:
+                raise ValueError(
+                    "position_size must equal init_value.shape[0]. "
+                    "Got %d and %d" % (position_size, init_value.shape[0])
+                )

-        self._position_size = position_size
-        if position_size is None:
-            self._position_size = self._num_embeds
-        if self._position_size != self._num_embeds:
-            raise ValueError(
-                "position_size must equal to init_value.shape[0]."
-                "Got %d and %d" % (self._position_size, self._num_embeds)
-            )
+        super().__init__(init_value=init_value,
+                         num_embeds=position_size, hparams=hparams)

-        self._built = True
+        self._position_size = position_size

     @staticmethod
     def default_hparams():
@@ -119,7 +108,9 @@ def default_hparams():
         hparams["name"] = "position_embedder"
         return hparams

-    def _build(self, positions=None, sequence_length=None, mode=None, **kwargs):
+    # pylint: disable=useless-super-delegation
+    def __call__(self, positions=None,
+                 sequence_length=None, mode=None, **kwargs):
         r"""Embeds the positions.

         Either :attr:`positions` or :attr:`sequence_length` is required:
@@ -147,28 +138,32 @@
         Returns:
             A `Tensor` of shape `shape(inputs) + embedding dimension`.
         """
+        return super().__call__([positions, sequence_length], mode, **kwargs)
+
+    def call(self, inputs, mode, **kwargs):
+        r"""Embeds the positions.
+        """
         # Gets embedder inputs
-        # pylint:disable=too-many-locals
-        inputs = positions
-        if positions is None:
+        position, sequence_length = inputs
+        inputs = position
+        if position is None:
             if sequence_length is None:
                 raise ValueError(
-                    "Either `positions` or `sequence_length` is required."
-                )
+                    "Either `positions` or `sequence_length` is required.")
             max_length = tf.reduce_max(sequence_length)
             single_inputs = tf.range(start=0, limit=max_length, dtype=tf.int32)
             # Expands `single_inputs` to have shape [batch_size, max_length]
             expander = tf.expand_dims(tf.ones_like(sequence_length), -1)
             inputs = expander * tf.expand_dims(single_inputs, 0)

-        ids_rank = len(inputs.shape.dims)
+        ids_rank = len(inputs.shape)
         embedding = self._embedding

         is_training = is_train_mode(mode)

         # Gets dropout strategy
         st = self._hparams.dropout_strategy
-        if positions is None and st == "item":
+        if position is None and st == "item":
             # If `inputs` is based on `sequence_length`, then dropout
             # strategies 'item' and 'item_type' have the same effect, we
             # use 'item_type' to avoid unknown noise_shape in the 'item'
@@ -181,9 +176,8 @@
             self._hparams, dropout_strategy=st
         )
         if dropout_layer:
-            embedding = dropout_layer.apply(
-                inputs=embedding, training=is_training
-            )
+            embedding = dropout_layer(inputs=embedding,
+                                      training=is_training)

         # Embeds
         outputs = tf.nn.embedding_lookup(embedding, inputs, **kwargs)
@@ -194,20 +188,13 @@
                 self._hparams,
                 ids_rank=ids_rank,
                 dropout_input=outputs,
-                dropout_strategy=st,
-            )
+                dropout_strategy=st)
             if dropout_layer:
-                outputs = dropout_layer.apply(
-                    inputs=outputs, training=is_training
-                )
+                outputs = dropout_layer(inputs=outputs, training=is_training)

         # Optionally masks
         if sequence_length is not None:
-            outputs = mask_sequences(
-                outputs,
-                sequence_length,
-                tensor_rank=len(inputs.shape.dims) + self._dim_rank,
-            )
+            outputs = mask_sequences(outputs, sequence_length)

         return outputs

@@ -227,7 +214,10 @@ def dim(self):
     def position_size(self):
         r"""The position size, i.e., maximum number of positions.
         """
-        return self._position_size
+        if self._position_size:
+            return self._position_size
+        else:
+            return self._num_embeds


 class SinusoidsPositionEmbedder(EmbedderBase):
@@ -256,13 +246,10 @@ class SinusoidsPositionEmbedder(EmbedderBase):
         sequence length. Set ``position_size=None`` and
         ``hparams['cache_embeddings']=False`` to enable infinite large or
         negative position indexes.
-
-    .. document private functions
-    .. automethod:: _build
     """

     def __init__(self, position_size, hparams=None):
-        EmbedderBase.__init__(self, hparams=hparams)
+        super().__init__(hparams=hparams)
         self._num_embeds = position_size
         self._dim = self._hparams.dim

@@ -283,9 +270,8 @@ def __init__(self, position_size, hparams=None):
         if self._cache_embeddings:
             if position_size is None:
-                raise ValueError(
-                    "'position_size' must not be None when "
-                    "'cache_embeddings' is set to True")
+                raise ValueError("'position_size' must not be None when "
+                                 "'cache_embeddings' is set to True")
             positions = tf.range(position_size, dtype=tf.float32)
             signal = self._compute_embeddings(positions)
             self.signal = signal

@@ -295,7 +281,7 @@ def default_hparams():
         r"""Returns a dictionary of hyperparameters with default values
         We use a geometric sequence of timescales starting with
         min_timescale and ending with max_timescale. The number of different
-        timescales is equal to ``dim/2``.
+ timescales is equal to ``dim / 2``. .. code-block:: python @@ -316,8 +302,7 @@ def default_hparams(): If `False`, embeddings are computed on-the-fly during lookup. Set to `False` if your application needs to handle sequences - of arbitrary length, or requires embeddings at negative - positions. + of arbitrary length, or requires embeddings at negative positions. """ hparams = { "min_timescale": 1.0, @@ -335,11 +320,12 @@ def _compute_embeddings(self, positions): signal = tf.concat( [tf.sin(scaled_time), tf.cos(scaled_time)], axis=1 ) - signal = tf.pad(signal, [[0, 0], [0, tf.mod(self._dim, 2)]]) - signal = tf.reshape(signal, shape_list(positions) + [self._dim]) + signal = tf.pad(signal, [[0, 0], [0, tf.math.mod(self._dim, 2)]]) + signal = tf.reshape(signal, positions.shape.as_list() + [self._dim]) return signal - def _build(self, positions=None, sequence_length=None): + # pylint: disable=useless-super-delegation + def __call__(self, positions=None, sequence_length=None, **kwargs): r"""Embeds. Either :attr:`positions` or :attr:`sequence_length` is required: @@ -360,7 +346,15 @@ def _build(self, positions=None, sequence_length=None): Returns: A Tensor of shape ``[batch_size, max_time, dim]``. """ - if positions is None: + return super().__call__(positions, sequence_length, **kwargs) + + def build(self, input_shape): + self.built = True # pylint: disable=attribute-defined-outside-init + + def call(self, inputs, sequence_length=None, **kwargs): + r"""Embeds. + """ + if inputs is None: if sequence_length is None: raise ValueError( "Either `positions` or `sequence_length` is required." @@ -370,8 +364,6 @@ def _build(self, positions=None, sequence_length=None): # Expands `single_inputs` to have shape [batch_size, max_length] expander = tf.expand_dims(tf.ones_like(sequence_length), -1) inputs = expander * tf.expand_dims(single_inputs, 0) - else: - inputs = positions if self._cache_embeddings: outputs = tf.nn.embedding_lookup(self.signal, inputs) @@ -379,3 +371,9 @@ def _build(self, positions=None, sequence_length=None): outputs = self._compute_embeddings(inputs) return outputs + + @property + def dim(self): + r"""The embedding dimension. + """ + return self._dim diff --git a/texar/tf/modules/embedders/position_embedders_test.py b/texar/tf/modules/embedders/position_embedders_test.py new file mode 100644 index 00000000..9174b5c0 --- /dev/null +++ b/texar/tf/modules/embedders/position_embedders_test.py @@ -0,0 +1,117 @@ +""" +Unit tests for position embedders. +""" + +import numpy as np + +import tensorflow as tf + +from texar.tf.modules.embedders.position_embedders import ( + PositionEmbedder, SinusoidsPositionEmbedder) + + +class PositionEmbedderTest(tf.test.TestCase): + """Tests position embedder. + """ + + def _test_position_embedder(self, hparams): + """Tests :class:`texar.tf.modules.PositionEmbedder`. 
+ """ + pos_size = 100 + embedder = PositionEmbedder( + position_size=pos_size, hparams=hparams) + inputs = tf.ones([64, 16], dtype=tf.int32) + outputs = embedder(inputs) + + emb_dim = embedder.dim + if not isinstance(emb_dim, (list, tuple)): + emb_dim = [emb_dim] + + hparams_dim = hparams["dim"] + if not isinstance(hparams["dim"], (list, tuple)): + hparams_dim = [hparams["dim"]] + + self.assertEqual(outputs.shape, [64, 16] + emb_dim) + self.assertEqual(emb_dim, hparams_dim) + self.assertEqual(embedder.position_size, 100) + self.assertEqual(len(embedder.trainable_variables), 1) + + seq_length = tf.random.uniform([64], maxval=pos_size, dtype=tf.int32) + outputs = embedder(sequence_length=seq_length) + self.assertEqual(outputs.shape, (64, tf.reduce_max(seq_length)) + + tuple(emb_dim)) + + def test_embedder(self): + """Tests various embedders. + """ + # no dropout + hparams = {"dim": 1024, "dropout_rate": 0} + self._test_position_embedder(hparams) + + hparams = {"dim": [1024], "dropout_rate": 0} + self._test_position_embedder(hparams) + + hparams = {"dim": [1024, 10], "dropout_rate": 0} + self._test_position_embedder(hparams) + + # dropout with default strategy + hparams = {"dim": 1024, "dropout_rate": 0.3} + self._test_position_embedder(hparams) + + hparams = {"dim": [1024], "dropout_rate": 0.3} + self._test_position_embedder(hparams) + + hparams = {"dim": [1024, 10], "dropout_rate": 0.3} + self._test_position_embedder(hparams) + + # dropout with different strategies + hparams = {"dim": 1024, "dropout_rate": 0.3, + "dropout_strategy": "item"} + self._test_position_embedder(hparams) + + hparams = {"dim": [1024], "dropout_rate": 0.3, + "dropout_strategy": "item"} + self._test_position_embedder(hparams) + + hparams = {"dim": [1024, 10], "dropout_rate": 0.3, + "dropout_strategy": "item"} + self._test_position_embedder(hparams) + + hparams = {"dim": 1024, "dropout_rate": 0.3, + "dropout_strategy": "item_type"} + self._test_position_embedder(hparams) + + hparams = {"dim": [1024], "dropout_rate": 0.3, + "dropout_strategy": "item_type"} + self._test_position_embedder(hparams) + + hparams = {"dim": [1024, 10], "dropout_rate": 0.3, + "dropout_strategy": "item_type"} + self._test_position_embedder(hparams) + + def test_sinusoids_position_embedder(self): + """Tests :class:`texar.tf.modules.SinusoidsPositionEmbedder`. + """ + position_size = 64 + input_size = (23, 18) + hparams = {'dim': 513} # use odd dimension to ensure padding correct + embedder = SinusoidsPositionEmbedder(position_size, hparams=hparams) + inputs = tf.random.uniform(shape=input_size, maxval=position_size - 1, + dtype=tf.dtypes.int64) + outputs = embedder(inputs) + self.assertEqual(outputs.shape, input_size + (hparams['dim'],)) + + embedder_no_cache = SinusoidsPositionEmbedder( + None, hparams={**hparams, 'cache_embeddings': False}) + wide_inputs = tf.random.uniform(minval=-position_size, + maxval=position_size * 2, + shape=input_size, + dtype=tf.dtypes.int64) + wide_outputs = embedder_no_cache(wide_inputs) + self.assertEqual(wide_outputs.shape, input_size + (hparams['dim'],)) + no_cache_outputs = embedder_no_cache(inputs) + np.testing.assert_array_equal(outputs, no_cache_outputs) + + +if __name__ == "__main__": + tf.test.main() diff --git a/texar/tf/modules/encoders/__init__.py b/texar/tf/modules/encoders/__init__.py index 6eb1d482..cda77cdf 100644 --- a/texar/tf/modules/encoders/__init__.py +++ b/texar/tf/modules/encoders/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. 
+# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,21 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Modules of texar library encoders. +Modules of Texar library encoders. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=wildcard-import - from texar.tf.modules.encoders.encoder_base import * -from texar.tf.modules.encoders.bert_encoder import * -from texar.tf.modules.encoders.gpt2_encoder import * -from texar.tf.modules.encoders.conv_encoders import * -from texar.tf.modules.encoders.hierarchical_encoders import * from texar.tf.modules.encoders.multihead_attention import * -from texar.tf.modules.encoders.rnn_encoders import * -from texar.tf.modules.encoders.transformer_encoders import * -from texar.tf.modules.encoders.xlnet_encoder import * +from texar.tf.modules.encoders.transformer_encoder import * diff --git a/texar/tf/modules/encoders/bert_encoder.py b/texar/tf/modules/encoders/bert_encoder.py index c8735c0a..30c0233f 100644 --- a/texar/tf/modules/encoders/bert_encoder.py +++ b/texar/tf/modules/encoders/bert_encoder.py @@ -15,14 +15,10 @@ BERT encoders. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - import tensorflow as tf from texar.tf.core.layers import get_initializer, get_layer -from texar.tf.modules.encoders.transformer_encoders import TransformerEncoder +from texar.tf.modules.encoders.transformer_encoder import TransformerEncoder from texar.tf.modules.embedders.embedders import WordEmbedder from texar.tf.modules.embedders.position_embedders import PositionEmbedder from texar.tf.modules.encoders.encoder_base import EncoderBase @@ -57,50 +53,38 @@ class BERTEncoder(EncoderBase, PretrainedBERTMixin): hyperparameter will be set to default values. See :meth:`default_hparams` for the hyperparameter structure and default values. - - .. document private functions - .. 
automethod:: _build """ def __init__(self, pretrained_model_name=None, cache_dir=None, hparams=None): - super(BERTEncoder, self).__init__(hparams=hparams) + super().__init__(hparams=hparams) self.load_pretrained_config(pretrained_model_name, cache_dir) - with tf.variable_scope(self.variable_scope): - - # Word embedding - self.word_embedder = WordEmbedder( - vocab_size=self._hparams.vocab_size, - hparams=self._hparams.embed) - - # Segment embedding for each type of tokens - self.segment_embedder = WordEmbedder( - vocab_size=self._hparams.type_vocab_size, - hparams=self._hparams.segment_embed) + # Word embedding + self.word_embedder = WordEmbedder( + vocab_size=self._hparams.vocab_size, + hparams=self._hparams.embed) - # Position embedding - self.position_embedder = PositionEmbedder( - position_size=self._hparams.position_size, - hparams=self._hparams.position_embed) + # Segment embedding for each type of tokens + self.segment_embedder = WordEmbedder( + vocab_size=self._hparams.type_vocab_size, + hparams=self._hparams.segment_embed) - # The BERT encoder (a TransformerEncoder) - self.encoder = TransformerEncoder(hparams=self._hparams.encoder) + # Position embedding + self.position_embedder = PositionEmbedder( + position_size=self._hparams.position_size, + hparams=self._hparams.position_embed) - with tf.variable_scope("pooler"): - kwargs_i = {"units": self._hparams.hidden_size, - "activation": tf.tanh} - layer_hparams = {"type": "Dense", "kwargs": kwargs_i} - self.pooler = get_layer(hparams=layer_hparams) + # The BERT encoder (a TransformerEncoder) + self.encoder = TransformerEncoder(hparams=self._hparams.encoder) - def reset_parameters(self): - with tf.variable_scope(self.variable_scope): - if self._hparams.initializer: - tf.get_variable_scope().set_initializer( - get_initializer(self._hparams.initializer)) + kwargs_i = {"units": self._hparams.hidden_size, + "activation": tf.tanh} + layer_hparams = {"type": "Dense", "kwargs": kwargs_i} + self.pooler = get_layer(hparams=layer_hparams) @staticmethod def default_hparams(): @@ -253,8 +237,7 @@ def default_hparams(): 'layers': [ { 'kwargs': { - 'activation': 'gelu', - 'name': 'intermediate', + 'activation': 'relu', # TODO: should be gelu 'units': 3072, 'use_bias': True }, @@ -263,7 +246,6 @@ def default_hparams(): { 'kwargs': { 'activation': None, - 'name': 'output', 'units': 768, 'use_bias': True }, @@ -280,13 +262,12 @@ def default_hparams(): '@no_typecheck': ['pretrained_model_name'] } - def _build(self, - inputs, - sequence_length=None, - segment_ids=None, - mode=None, - **kwargs): - """Encodes the inputs. + def __call__(self, + inputs, + sequence_length=None, + segment_ids=None, + mode=None): + r"""Encodes the inputs. Args: inputs: A 2D Tensor of shape `[batch_size, max_time]`, @@ -303,21 +284,25 @@ def _build(self, including `TRAIN`, `EVAL`, and `PREDICT`. Used to toggle dropout. If `None` (default), :func:`texar.tf.global_mode` is used. - **kwargs: Keyword arguments. Returns: A pair :attr:`(outputs, pooled_output)` - - :attr:`outputs`: A Tensor of shape \ - `[batch_size, max_time, dim]` containing the \ - encoded vectors. + - :attr:`outputs`: A Tensor of shape + `[batch_size, max_time, dim]` containing the + encoded vectors. - - :attr:`pooled_output`: A Tensor of size \ - `[batch_size, hidden_size]` which is the output of a \ - pooler berts on top of the hidden state associated \ - to the first character of the input (`CLS`), see BERT's \ - paper. 
+          - :attr:`pooled_output`: A Tensor of shape
+            `[batch_size, hidden_size]` which is the output of a
+            pooler on top of the hidden state associated
+            with the first token of the input (`CLS`); see BERT's paper.
         """
+        return super().__call__([inputs, sequence_length, segment_ids], mode)
+
+    def call(self, inputs, mode):
+        r"""Encodes the inputs.
+        """
+        inputs, sequence_length, segment_ids = inputs
         if segment_ids is None:
             segment_ids = tf.zeros_like(inputs)
@@ -338,15 +323,8 @@ def _build(self,

         output = self.encoder(input_embeds, sequence_length, mode)

-        with tf.variable_scope("pooler"):
-            # taking the hidden state corresponding to the first token.
-            first_token_tensor = tf.squeeze(output[:, 0:1, :], axis=1)
-            pooled_output = self.pooler(first_token_tensor)
-
-        if not self._built:
-            self._add_internal_trainable_variables()
-            self._built = True
-
-        self.init_pretrained_weights(self.variable_scope.name)
+        # taking the hidden state corresponding to the first token.
+        first_token_tensor = tf.squeeze(output[:, 0:1, :], axis=1)
+        pooled_output = self.pooler(first_token_tensor)

         return output, pooled_output
diff --git a/texar/tf/modules/encoders/bert_encoder_test.py b/texar/tf/modules/encoders/bert_encoder_test.py
index 082b21cd..c86dba0e 100644
--- a/texar/tf/modules/encoders/bert_encoder_test.py
+++ b/texar/tf/modules/encoders/bert_encoder_test.py
@@ -2,11 +2,6 @@
 Unit tests for BERT encoders.
 """

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
 import tensorflow as tf

 from texar.tf.modules.encoders.bert_encoder import BERTEncoder
@@ -17,91 +12,6 @@ class BERTEncoderTest(tf.test.TestCase):
     """Tests :class:`~texar.tf.modules.BERTEncoder` class.
     """

-    @pretrained_test
-    def test_model_loading(self):
-        r"""Tests model loading functionality."""
-
-        inputs = tf.placeholder(dtype=tf.int32, shape=[None, None])
-
-        for pretrained_model_name in BERTEncoder.available_checkpoints():
-            encoder = BERTEncoder(pretrained_model_name=pretrained_model_name)
-            _, _ = encoder(inputs)
-
-    @pretrained_test
-    def test_hparams(self):
-        """Tests the priority of the encoder arch parameter.
-        """
-
-        inputs = tf.placeholder(dtype=tf.int32, shape=[None, None])
-
-        # case 1: set "pretrained_mode_name" by constructor argument
-        hparams = {
-            "pretrained_model_name": "bert-large-uncased",
-        }
-        encoder = BERTEncoder(pretrained_model_name="bert-base-uncased",
-                              hparams=hparams)
-        _, _ = encoder(inputs)
-        self.assertEqual(encoder.hparams.encoder.num_blocks, 12)
-
-        # case 2: set "pretrained_mode_name" by hparams
-        hparams = {
-            "pretrained_model_name": "bert-large-uncased",
-            "encoder": {
-                "num_blocks": 6
-            }
-        }
-        encoder = BERTEncoder(hparams=hparams)
-        _, _ = encoder(inputs)
-        self.assertEqual(encoder.hparams.encoder.num_blocks, 24)
-
-        # case 3: set to None in both hparams and constructor argument
-        hparams = {
-            "pretrained_model_name": None,
-            "encoder": {
-                "num_blocks": 6
-            },
-        }
-        encoder = BERTEncoder(hparams=hparams)
-        _, _ = encoder(inputs)
-        self.assertEqual(encoder.hparams.encoder.num_blocks, 6)
-
-        # case 4: using default hparams
-        encoder = BERTEncoder()
-        _, _ = encoder(inputs)
-        self.assertEqual(encoder.hparams.encoder.num_blocks, 12)
-
-    @pretrained_test
-    def test_trainable_variables(self):
-        """Tests the functionality of automatically collecting trainable
-        variables.
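
The hunk above moves BERT pooling out of `tf.variable_scope` and into the Keras-style `call`. A minimal standalone sketch of that pooling step, assuming TF2 eager execution (sizes and variable names here are illustrative, not the library's):

.. code-block:: python

    import tensorflow as tf

    # Illustrative sketch: BERT-style pooling projects the hidden state of
    # the first ([CLS]) token through a dense layer with tanh activation.
    hidden_size = 768
    pooler = tf.keras.layers.Dense(hidden_size, activation=tf.tanh)

    output = tf.random.normal([16, 8, hidden_size])  # [batch, max_time, dim]
    first_token_tensor = tf.squeeze(output[:, 0:1, :], axis=1)
    pooled_output = pooler(first_token_tensor)       # [batch, hidden_size]
    assert pooled_output.shape == (16, hidden_size)
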
- """ - - inputs = tf.placeholder(dtype=tf.int32, shape=[None, None]) - - # case 1: bert base - encoder = BERTEncoder() - _, _ = encoder(inputs) - self.assertEqual(len(encoder.trainable_variables), 3 + 2 + 12 * 16 + 2) - - # case 2: bert large - hparams = { - "pretrained_model_name": "bert-large-uncased" - } - encoder = BERTEncoder(hparams=hparams) - _, _ = encoder(inputs) - self.assertEqual(len(encoder.trainable_variables), 3 + 2 + 24 * 16 + 2) - - # case 3: self-designed bert - hparams = { - "encoder": { - "num_blocks": 6, - }, - "pretrained_model_name": None - } - encoder = BERTEncoder(hparams=hparams) - _, _ = encoder(inputs) - self.assertEqual(len(encoder.trainable_variables), 3 + 2 + 6 * 16 + 2) - def test_encode(self): """Tests encoding. """ @@ -113,19 +23,15 @@ def test_encode(self): max_time = 8 batch_size = 16 - inputs = tf.random_uniform([batch_size, max_time], + inputs = tf.random.uniform([batch_size, max_time], maxval=30521, dtype=tf.int32) outputs, pooled_output = encoder(inputs) outputs_dim = encoder.hparams.encoder.dim pooled_output_dim = encoder.hparams.hidden_size - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_, pooled_output_ = sess.run([outputs, pooled_output]) - self.assertEqual(outputs_.shape, (batch_size, - max_time, outputs_dim)) - self.assertEqual(pooled_output_.shape, (batch_size, - pooled_output_dim)) + + self.assertEqual(outputs.shape, (batch_size, max_time, outputs_dim)) + self.assertEqual(pooled_output.shape, (batch_size, pooled_output_dim)) # case 2: self-designed bert hparams = { @@ -136,19 +42,15 @@ def test_encode(self): max_time = 8 batch_size = 16 - inputs = tf.random_uniform([batch_size, max_time], + inputs = tf.random.uniform([batch_size, max_time], maxval=30521, dtype=tf.int32) outputs, pooled_output = encoder(inputs) outputs_dim = encoder.hparams.encoder.dim pooled_output_dim = encoder.hparams.hidden_size - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_, pooled_output_ = sess.run([outputs, pooled_output]) - self.assertEqual(outputs_.shape, (batch_size, - max_time, outputs_dim)) - self.assertEqual(pooled_output_.shape, - (batch_size, pooled_output_dim)) + + self.assertEqual(outputs.shape, (batch_size, max_time, outputs_dim)) + self.assertEqual(pooled_output.shape, (batch_size, pooled_output_dim)) if __name__ == "__main__": diff --git a/texar/tf/modules/encoders/conv_encoders.py b/texar/tf/modules/encoders/conv_encoders.py deleted file mode 100644 index 3ce199b1..00000000 --- a/texar/tf/modules/encoders/conv_encoders.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Various convolutional network encoders. 
-""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from texar.tf.modules.encoders.encoder_base import EncoderBase -from texar.tf.modules.networks.conv_networks import Conv1DNetwork - -__all__ = [ - "Conv1DEncoder" -] - - -class Conv1DEncoder(Conv1DNetwork, EncoderBase): - """Simple Conv-1D encoder which consists of a sequence of conv layers - followed with a sequence of dense layers. - - Wraps :class:`~texar.tf.modules.Conv1DNetwork` to be a subclass of - :class:`~texar.tf.modules.EncoderBase`. Has exact the same functionality - with :class:`~texar.tf.modules.Conv1DNetwork`. - """ - - def __init__(self, hparams=None): # pylint: disable=super-init-not-called - Conv1DNetwork.__init__(self, hparams) - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - The same as :meth:`~texar.tf.modules.Conv1DNetwork.default_hparams` - of :class:`~texar.tf.modules.Conv1DNetwork`, except that the default name - is 'conv_encoder'. - """ - hparams = Conv1DNetwork.default_hparams() - hparams['name'] = 'conv_encoder' - return hparams diff --git a/texar/tf/modules/encoders/conv_encoders_test.py b/texar/tf/modules/encoders/conv_encoders_test.py deleted file mode 100644 index 72e93695..00000000 --- a/texar/tf/modules/encoders/conv_encoders_test.py +++ /dev/null @@ -1,117 +0,0 @@ -# -""" -Unit tests for conv encoders. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import tensorflow as tf - -import texar.tf as tx -from texar.tf.modules.encoders.conv_encoders import Conv1DEncoder - - -class Conv1DEncoderTest(tf.test.TestCase): - """Tests :class:`~texar.tf.modules.Conv1DEncoder` class. - """ - - def test_encode(self): - """Tests encode. - """ - encoder_1 = Conv1DEncoder() - self.assertEqual(len(encoder_1.layers), 4) - self.assertTrue(isinstance(encoder_1.layer_by_name("conv_pool_1"), - tx.core.MergeLayer)) - for layer in encoder_1.layers[0].layers: - self.assertTrue(isinstance(layer, tx.core.SequentialLayer)) - - inputs_1 = tf.ones([64, 16, 300], tf.float32) - outputs_1 = encoder_1(inputs_1) - self.assertEqual(outputs_1.shape, [64, 128]) - - hparams = { - # Conv layers - "num_conv_layers": 2, - "filters": 128, - "kernel_size": [[3, 4, 5], 4], - "other_conv_kwargs": {"padding": "same"}, - # Pooling layers - "pooling": "AveragePooling", - "pool_size": 2, - "pool_strides": 1, - # Dense layers - "num_dense_layers": 3, - "dense_size": [128, 128, 10], - "dense_activation": "relu", - "other_dense_kwargs": {"use_bias": False}, - # Dropout - "dropout_conv": [0, 1, 2], - "dropout_dense": 2 - } - encoder_2 = Conv1DEncoder(hparams) - # nlayers = nconv-pool + nconv + npool + ndense + ndropout + flatten - self.assertEqual(len(encoder_2.layers), 1 + 1 + 1 + 3 + 4 + 1) - self.assertTrue(isinstance(encoder_2.layer_by_name("conv_pool_1"), - tx.core.MergeLayer)) - for layer in encoder_2.layers[1].layers: - self.assertTrue(isinstance(layer, tx.core.SequentialLayer)) - - inputs_2 = tf.ones([64, 16, 300], tf.float32) - outputs_2 = encoder_2(inputs_2) - self.assertEqual(outputs_2.shape, [64, 10]) - - def test_unknown_seq_length(self): - """Tests use of pooling layer when the seq_length dimension of inputs - is `None`. 
- """ - encoder_1 = Conv1DEncoder() - inputs_1 = tf.placeholder(tf.float32, [64, None, 300]) - outputs_1 = encoder_1(inputs_1) - self.assertEqual(outputs_1.shape, [64, 128]) - - hparams = { - # Conv layers - "num_conv_layers": 2, - "filters": 128, - "kernel_size": [[3, 4, 5], 4], - # Pooling layers - "pooling": "AveragePooling", - "pool_size": [2, None], - # Dense layers - "num_dense_layers": 1, - "dense_size": 10, - } - encoder = Conv1DEncoder(hparams) - # nlayers = nconv-pool + nconv + npool + ndense + ndropout + flatten - self.assertEqual(len(encoder.layers), 1 + 1 + 1 + 1 + 1 + 1) - self.assertTrue(isinstance(encoder.layer_by_name('pool_2'), - tx.core.AverageReducePooling1D)) - - inputs = tf.placeholder(tf.float32, [64, None, 300]) - outputs = encoder(inputs) - self.assertEqual(outputs.shape, [64, 10]) - - hparams_2 = { - # Conv layers - "num_conv_layers": 1, - "filters": 128, - "kernel_size": 4, - "other_conv_kwargs": {'data_format': 'channels_first'}, - # Pooling layers - "pooling": "MaxPooling", - "other_pool_kwargs": {'data_format': 'channels_first'}, - # Dense layers - "num_dense_layers": 1, - "dense_size": 10, - } - encoder_2 = Conv1DEncoder(hparams_2) - inputs_2 = tf.placeholder(tf.float32, [64, 300, None]) - outputs_2 = encoder_2(inputs_2) - self.assertEqual(outputs_2.shape, [64, 10]) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/modules/encoders/encoder_base.py b/texar/tf/modules/encoders/encoder_base.py index fe93aed7..3b57af92 100644 --- a/texar/tf/modules/encoders/encoder_base.py +++ b/texar/tf/modules/encoders/encoder_base.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,41 +15,23 @@ Base class for encoders. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from abc import ABC from texar.tf.module_base import ModuleBase __all__ = [ - "EncoderBase" + "EncoderBase", ] -class EncoderBase(ModuleBase): - """Base class inherited by all encoder classes. +class EncoderBase(ModuleBase, ABC): + r"""Base class inherited by all encoder classes. """ - def __init__(self, hparams=None): - ModuleBase.__init__(self, hparams) - @staticmethod def default_hparams(): - """Returns a dictionary of hyperparameters with default values. + r"""Returns a dictionary of hyperparameters with default values. """ return { "name": "encoder" } - - def _build(self, inputs, *args, **kwargs): - """Encodes the inputs. - - Args: - inputs: Inputs to the encoder. - *args: Other arguments. - **kwargs: Keyword arguments. - - Returns: - Encoding results. - """ - raise NotImplementedError diff --git a/texar/tf/modules/encoders/gpt2_encoder.py b/texar/tf/modules/encoders/gpt2_encoder.py deleted file mode 100644 index d1a4bff9..00000000 --- a/texar/tf/modules/encoders/gpt2_encoder.py +++ /dev/null @@ -1,313 +0,0 @@ -# Copyright 2019 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -GPT2 encoders. -""" - -from typing import Optional - -import tensorflow as tf - -from texar.tf.core.layers import get_initializer -from texar.tf.modules.embedders.embedders import WordEmbedder -from texar.tf.modules.embedders.position_embedders import PositionEmbedder -from texar.tf.modules.encoders.encoder_base import EncoderBase -from texar.tf.modules.encoders.transformer_encoders import TransformerEncoder -from texar.tf.modules.pretrained.gpt2 import PretrainedGPT2Mixin - -__all__ = [ - "GPT2Encoder", -] - - -class GPT2Encoder(EncoderBase, PretrainedGPT2Mixin): - r"""Raw GPT2 Transformer for encoding sequences. Please see - :class:`~texar.tf.modules.PretrainedGPT2Mixin` for a brief description - of GPT2. - - This module basically stacks - :class:`~texar.tf.modules.WordEmbedder`, - :class:`~texar.tf.modules.PositionEmbedder`, - :class:`~texar.tf.modules.TransformerEncoder`. - - Args: - pretrained_model_name (optional): a `str`, the name - of pre-trained model (e.g., ``gpt2-small``). Please refer to - :class:`~texar.tf.modules.PretrainedGPT2Mixin` for - all supported models. - If `None`, the model name in :attr:`hparams` is used. - cache_dir (optional): the path to a folder in which the - pre-trained models will be cached. If `None` (default), - a default directory (``texar_data`` folder under user's home - directory) will be used. - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparameter will be set to default values. See - :meth:`default_hparams` for the hyperparameter structure - and default values. - - .. document private functions - .. automethod:: _build - """ - - def __init__(self, - pretrained_model_name: Optional[str] = None, - cache_dir: Optional[str] = None, - hparams=None): - super(GPT2Encoder, self).__init__(hparams=hparams) - - self.load_pretrained_config(pretrained_model_name, cache_dir) - - with tf.variable_scope(self.variable_scope): - - # Word embedding - self.word_embedder = WordEmbedder( - vocab_size=self._hparams.vocab_size, - hparams=self._hparams.embed) - - # Position embedding - self.position_embedder = PositionEmbedder( - position_size=self._hparams.position_size, - hparams=self._hparams.position_embed) - - # The GPT2 encoder (a TransformerEncoder) - self.encoder = TransformerEncoder(hparams=self._hparams.encoder) - - def reset_parameters(self): - with tf.variable_scope(self.variable_scope): - if self._hparams.initializer: - tf.get_variable_scope().set_initializer( - get_initializer(self._hparams.initializer)) - - @staticmethod - def default_hparams(): - r"""Returns a dictionary of hyperparameters with default values. - - * The encoder arch is determined by the constructor argument - :attr:`pretrained_model_name` if it's specified. In this case, - `hparams` are ignored. - * Otherwise, the encoder arch is determined by - `hparams['pretrained_model_name']` if it's specified. All other - configurations in `hparams` are ignored. - * If the above two are `None`, the encoder arch is defined by the - configurations in `hparams` and weights are randomly initialized. - - .. 
code-block:: python - - { - "pretrained_model_name": "gpt2-small", - "vocab_size": 50257, - "context_size": 1024, - "embedding_size": 768, - "embed": { - "dim": 768, - "name": "word_embeddings" - }, - "position_size": 1024, - "position_embed": { - "dim": 768, - "name": "position_embeddings" - }, - - "encoder": { - "dim": 768, - "num_blocks": 12, - "use_gpt_config": True, - "embedding_dropout": 0, - "residual_dropout": 0, - "multihead_attention": { - "use_bias": True, - "num_units": 768, - "num_heads": 12, - "output_dim": 768 - }, - "initializer": { - "type": "variance_scaling_initializer", - "kwargs": { - "factor": 1.0, - "mode": "FAN_AVG", - "uniform": True - } - }, - "poswise_feedforward": { - "layers": [ - { - "type": "Dense", - "kwargs": { - "activation": "gelu", - "name": "intermediate", - "units": 3072, - "use_bias": True - } - }, - { - "type": "Dense", - "kwargs": { - "activation": None, - "name": "output", - "units": 3072, - "use_bias": True - } - } - ], - "name": "ffn" - } - }, - "initializer": None, - "name": "gpt2_encoder", - } - - Here: - - The default parameters are values for 124M GPT2 model. - - `"pretrained_model_name"`: str or None - The name of the pre-trained GPT2 model. If None, the model - will be randomly initialized. - - `"embed"`: dict - Hyperparameters for word embedding layer. - - `"vocab_size"`: int - The vocabulary size of `inputs` in `GPT2Model`. - - `"position_embed"`: dict - Hyperparameters for position embedding layer. - - `"position_size"`: int - The maximum sequence length that this model might ever be used with. - - `"decoder"`: dict - Hyperparameters for the TransformerDecoder. - See :func:`~texar.torch.modules.TransformerDecoder.default_hparams` - for details. - - `"initializer"`: dict, optional - Hyperparameters of the default initializer that initializes - variables created in this module. - See :func:`~texar.torch.core.get_initializer` for details. - - `"name"`: str - Name of the module. - """ - return { - 'encoder': { - 'name': 'encoder', - 'dim': 768, - 'num_blocks': 12, - 'use_bert_config': False, - 'embedding_dropout': 0, - 'residual_dropout': 0, - 'multihead_attention': { - 'name': 'self', - 'use_bias': True, - 'num_units': 768, - 'num_heads': 12, - 'output_dim': 768 - }, - 'initializer': { - 'type': 'variance_scaling_initializer', - 'kwargs': { - 'factor': 1.0, - 'mode': 'FAN_AVG', - 'uniform': True - } - }, - 'poswise_feedforward': { - 'layers': [ - { - 'type': 'Dense', - 'kwargs': { - 'activation': 'gelu', - 'name': 'intermediate', - 'units': 3072, - 'use_bias': True - } - }, - { - 'type': 'Dense', - 'kwargs': { - 'activation': None, - 'name': 'output', - 'units': 768, - 'use_bias': True - } - } - ], - 'name': 'ffn', - }, - }, - - 'pretrained_model_name': 'gpt2-small', - 'vocab_size': 50257, - 'context_size': 1024, - 'embedding_size': 768, - 'embed': { - 'dim': 768, - 'name': 'word_embeddings' - }, - 'position_size': 1024, - 'position_embed': { - 'dim': 768, - 'name': 'position_embeddings' - }, - 'initializer': None, - 'name': 'gpt2_encoder', - '@no_typecheck': ['pretrained_model_name'], - } - - def _build(self, - inputs, - sequence_length=None, - mode=None, - **kwargs): - r"""Encodes the inputs. - - Args: - inputs: A 2D Tensor of shape `[batch_size, max_time]`, - containing the token ids of tokens in the input sequences. - sequence_length (optional): A 1D Tensor of shape `[batch_size]`. - Input tokens beyond respective sequence lengths are masked - out automatically. 
- mode (optional): A tensor taking value in - :tf_main:`tf.estimator.ModeKeys `, - including `TRAIN`, `EVAL`, and `PREDICT`. Used to toggle - dropout. - If `None` (default), :func:`texar.tf.global_mode` is used. - **kwargs: Keyword arguments. - - Returns: - outputs: A Tensor of shape - `[batch_size, max_time, dim]` containing the encoded vectors. - """ - word_embeds = self.word_embedder(inputs) - - batch_size = tf.shape(inputs)[0] - pos_length = tf.ones([batch_size], tf.int32) * tf.shape(inputs)[1] - pos_embeds = self.position_embedder(sequence_length=pos_length) - - inputs_embeds = word_embeds + pos_embeds - - if sequence_length is None: - sequence_length = tf.ones([batch_size], tf.int32) \ - * tf.shape(inputs)[1] - - output = self.encoder(inputs_embeds, sequence_length, mode) - - if not self._built: - self._add_internal_trainable_variables() - self._built = True - - self.init_pretrained_weights(self.variable_scope.name, - load_output_layer=False) - return output diff --git a/texar/tf/modules/encoders/gpt2_encoder_test.py b/texar/tf/modules/encoders/gpt2_encoder_test.py deleted file mode 100644 index e75a718b..00000000 --- a/texar/tf/modules/encoders/gpt2_encoder_test.py +++ /dev/null @@ -1,135 +0,0 @@ -""" -Unit tests for GPT2 encoder. -""" - -import tensorflow as tf - -from texar.tf.modules.encoders.gpt2_encoder import GPT2Encoder -from texar.tf.utils.test import pretrained_test - - -class GPT2EncoderTest(tf.test.TestCase): - r"""Tests :class:`~texar.torch.modules.GPT2Encoder` class. - """ - - @pretrained_test - def test_model_loading(self): - r"""Tests model loading functionality.""" - - inputs = tf.placeholder(dtype=tf.int32, shape=[None, None]) - - for pretrained_model_name in GPT2Encoder.available_checkpoints(): - encoder = GPT2Encoder(pretrained_model_name=pretrained_model_name) - _ = encoder(inputs) - - @pretrained_test - def test_hparams(self): - """Tests the priority of the encoder arch parameter. - """ - - inputs = tf.placeholder(dtype=tf.int32, shape=[None, None]) - - # case 1: set "pretrained_mode_name" by constructor argument - hparams = { - "pretrained_model_name": "gpt2-medium", - } - encoder = GPT2Encoder(pretrained_model_name="gpt2-small", - hparams=hparams) - _ = encoder(inputs) - self.assertEqual(encoder.hparams.encoder.num_blocks, 12) - - # case 2: set "pretrained_mode_name" by hparams - hparams = { - "pretrained_model_name": "gpt2-small", - "encoder": { - "num_blocks": 6 - } - } - encoder = GPT2Encoder(hparams=hparams) - _ = encoder(inputs) - self.assertEqual(encoder.hparams.encoder.num_blocks, 12) - - # case 3: set to None in both hparams and constructor argument - hparams = { - "pretrained_model_name": None, - "encoder": { - "num_blocks": 6 - }, - } - encoder = GPT2Encoder(hparams=hparams) - _ = encoder(inputs) - self.assertEqual(encoder.hparams.encoder.num_blocks, 6) - - # case 4: using default hparams - encoder = GPT2Encoder() - _ = encoder(inputs) - self.assertEqual(encoder.hparams.encoder.num_blocks, 12) - - @pretrained_test - def test_trainable_variables(self): - r"""Tests the functionality of automatically collecting trainable - variables. 
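
The deleted `GPT2Encoder._build` above sums word embeddings with learned position embeddings before the Transformer stack. A self-contained sketch of that step, with `tf.keras.layers.Embedding` standing in for Texar's embedders (all sizes illustrative):

.. code-block:: python

    import tensorflow as tf

    # Illustrative: token embeddings plus position embeddings, where the
    # positions simply enumerate 0..max_time-1 for every batch element.
    vocab_size, position_size, dim = 50257, 1024, 768
    word_embed = tf.keras.layers.Embedding(vocab_size, dim)
    pos_embed = tf.keras.layers.Embedding(position_size, dim)

    inputs = tf.random.uniform([16, 8], maxval=vocab_size, dtype=tf.int32)
    positions = tf.range(tf.shape(inputs)[1])                  # [max_time]
    inputs_embeds = word_embed(inputs) + pos_embed(positions)  # broadcasts
    assert inputs_embeds.shape == (16, 8, dim)
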
- """ - - inputs = tf.placeholder(dtype=tf.int32, shape=[None, None]) - - def get_variable_num(n_layers: int) -> int: - return 1 + 1 + n_layers * 16 + 2 - - # case 1: GPT2 small - encoder = GPT2Encoder() - _ = encoder(inputs) - self.assertEqual(len(encoder.trainable_variables), get_variable_num(12)) - - # case 2: GPT2 medium - hparams = { - "pretrained_model_name": "gpt2-medium", - } - encoder = GPT2Encoder(hparams=hparams) - _ = encoder(inputs) - self.assertEqual(len(encoder.trainable_variables), get_variable_num(24)) - - # case 3: GPT2 large - hparams = { - "pretrained_model_name": "gpt2-large", - } - encoder = GPT2Encoder(hparams=hparams) - _ = encoder(inputs) - self.assertEqual(len(encoder.trainable_variables), get_variable_num(36)) - - # case 4: self-designed GPT2 - hparams = { - "pretrained_model_name": None, - "encoder": { - "num_blocks": 6 - }, - } - encoder = GPT2Encoder(hparams=hparams) - _ = encoder(inputs) - self.assertEqual(len(encoder.trainable_variables), get_variable_num(6)) - - def test_encode(self): - r"""Tests encoding. - """ - # case 1: GPT2 small - hparams = { - "pretrained_model_name": None - } - encoder = GPT2Encoder(hparams=hparams) - - max_time = 8 - batch_size = 16 - inputs = tf.random_uniform([batch_size, max_time], - maxval=30521, dtype=tf.int32) - outputs = encoder(inputs) - - outputs_dim = encoder.hparams.encoder.dim - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_ = sess.run(outputs) - self.assertEqual(outputs_.shape, (batch_size, - max_time, outputs_dim)) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/modules/encoders/hierarchical_encoders.py b/texar/tf/modules/encoders/hierarchical_encoders.py deleted file mode 100644 index f5e47bd3..00000000 --- a/texar/tf/modules/encoders/hierarchical_encoders.py +++ /dev/null @@ -1,378 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Various encoders that encode data with hierarchical structure. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import collections - -import tensorflow as tf -from tensorflow.contrib.rnn import LSTMStateTuple -from tensorflow.python.util import nest # pylint: disable=E0611 - -from texar.tf.modules.encoders.encoder_base import EncoderBase -from texar.tf.utils import utils - -# pylint: disable=invalid-name, too-many-arguments, too-many-locals - -__all__ = [ - "HierarchicalRNNEncoder" -] - - -class HierarchicalRNNEncoder(EncoderBase): - """A hierarchical encoder that stacks basic RNN encoders into two layers. - Can be used to encode long, structured sequences, e.g. paragraphs, dialog - history, etc. - - Args: - encoder_major (optional): An instance of subclass of - :class:`~texar.tf.modules.RNNEncoderBase` - The high-level encoder taking final - states from low-level encoder as its - inputs. If not specified, an encoder - is created as specified in - :attr:`hparams["encoder_major"]`. 
- encoder_minor (optional): An instance of subclass of - :class:`~texar.tf.modules.RNNEncoderBase` - The low-level encoder. If not - specified, an encoder is created as specified - in :attr:`hparams["encoder_minor"]`. - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparamerter will be set to default values. See - :meth:`default_hparams` for the hyperparameter sturcture and - default values. - - See :meth:`_build` for the inputs and outputs of the encoder. - - .. document private functions - .. automethod:: _build - """ - - def __init__(self, encoder_major=None, encoder_minor=None, - hparams=None): - EncoderBase.__init__(self, hparams) - - encoder_major_hparams = utils.get_instance_kwargs( - None, self._hparams.encoder_major_hparams) - encoder_minor_hparams = utils.get_instance_kwargs( - None, self._hparams.encoder_minor_hparams) - - if encoder_major is not None: - self._encoder_major = encoder_major - else: - with tf.variable_scope(self.variable_scope.name): - with tf.variable_scope('encoder_major'): - self._encoder_major = utils.check_or_get_instance( - self._hparams.encoder_major_type, - encoder_major_hparams, - ['texar.tf.modules.encoders', 'texar.tf.custom']) - - if encoder_minor is not None: - self._encoder_minor = encoder_minor - elif self._hparams.config_share: - with tf.variable_scope(self.variable_scope.name): - with tf.variable_scope('encoder_minor'): - self._encoder_minor = utils.check_or_get_instance( - self._hparams.encoder_major_type, - encoder_major_hparams, - ['texar.tf.modules.encoders', 'texar.tf.custom']) - else: - with tf.variable_scope(self.variable_scope.name): - with tf.variable_scope('encoder_minor'): - self._encoder_minor = utils.check_or_get_instance( - self._hparams.encoder_minor_type, - encoder_minor_hparams, - ['texar.tf.modules.encoders', 'texar.tf.custom']) - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - .. role:: python(code) - :language: python - - .. code-block:: python - - { - "encoder_major_type": "UnidirectionalRNNEncoder", - "encoder_major_hparams": {}, - "encoder_minor_type": "UnidirectionalRNNEncoder", - "encoder_minor_hparams": {}, - "config_share": False, - "name": "hierarchical_encoder_wrapper" - } - - Here: - - "encoder_major_type": str or class or instance - The high-level encoder. Can be a RNN encoder class, its name or - module path, or a class instance. - Ignored if `encoder_major` is given to the encoder constructor. - - "encoder_major_hparams": dict - The hyperparameters for the high-level encoder. The high-level - encoder is created with - :python:`encoder_class(hparams=encoder_major_hparams)`. - Ignored if `encoder_major` is given to the encoder constructor, - or if "encoder_major_type" is an encoder instance. - - "encoder_minor_type": str or class or instance - The low-level encoder. Can be a RNN encoder class, its name or - module path, or a class instance. - Ignored if `encoder_minor` is given to the encoder constructor, - or if "config_share" is True. - - "encoder_minor_hparams": dict - The hyperparameters for the low-level encoder. The high-level - encoder is created with - :python:`encoder_class(hparams=encoder_minor_hparams)`. - Ignored if `encoder_minor` is given to the encoder constructor, - or if "config_share" is True, - or if "encoder_minor_type" is an encoder instance. - - "config_share": - Whether to use encoder_major's hyperparameters - to construct encoder_minor. - - "name": - Name of the encoder. 
- """ - hparams = { - "name": "hierarchical_encoder", - "encoder_major_type": "UnidirectionalRNNEncoder", - "encoder_major_hparams": {}, - "encoder_minor_type": "UnidirectionalRNNEncoder", - "encoder_minor_hparams": {}, - "config_share": False, - "@no_typecheck": [ - 'encoder_major_hparams', - 'encoder_minor_hparams' - ] - } - hparams.update(EncoderBase.default_hparams()) - return hparams - - def _build(self, - inputs, - order='btu', - medium=None, - sequence_length_major=None, - sequence_length_minor=None, - **kwargs): - """Encodes the inputs. - - Args: - inputs: A 4-D tensor of shape `[B, T, U, dim]`, where - - - B: batch_size - - T: the max length of high-level sequences. E.g., the max \ - number of utterances in dialog history. - - U: the max length of low-level sequences. E.g., the max \ - length of each utterance in dialog history. - - dim: embedding dimension - - The order of first three dimensions can be changed - according to :attr:`order`. - - order: A 3-char string containing 'b', 't', and 'u', - that specifies the order of inputs dimensions above. - Following four can be accepted: - - - **'btu'**: None of the encoders are time-major. - - **'utb'**: Both encoders are time-major. - - **'tbu'**: The major encoder is time-major. - - **'ubt'**: The minor encoder is time-major. - - medium (optional): A list of callables that subsequently process the - final states of minor encoder and obtain the inputs - for the major encoder. - If not specified, :meth:`flatten` is used for processing - the minor's final states. - sequence_length_major (optional): The `sequence_length` argument - sent to major encoder. This is a 1-D Tensor of shape - `[B]`. - sequence_length_minor (optional): The `sequence_length` argument - sent to minor encoder. It can be either a 1-D Tensor of shape - `[B*T]`, or a 2-D Tensor of shape `[B, T]` or `[T, B]` - according to :attr:`order`. - **kwargs: Other keyword arguments for the major and minor encoders, - such as `initial_state`, etc. - Note that `sequence_length`, and `time_major` - must not be included here. - `time_major` is derived from :attr:`order` automatically. - By default, arguments will be sent to both major and minor - encoders. To specify which encoder an argument should be sent - to, add '_minor'/'_major' as its suffix. - - Note that `initial_state_minor` must have a batch dimension - of size `B*T`. If you have an initial state of batch dimension - = `T`, use :meth:`tile_initial_state_minor` to tile it - according to `order`. - - Returns: - A tuple `(outputs, final_state)` by the major encoder. - - See - the return values of `_build()` method of respective encoder class - for details. 
- """ - - def _kwargs_split(kwargs): - kwargs_minor, kwargs_major = {}, {} - for k, v in kwargs.items(): - if len(k) >= 6 and k[-6:] == ['_minor']: - kwargs_minor[k[:-6]] = v - if len(k) >= 6 and k[-6:] == ['_major']: - kwargs_major[k[:-6]] = v - return kwargs_minor, kwargs_major - - kwargs_minor, kwargs_major = _kwargs_split(kwargs) - if sequence_length_minor is not None: - sequence_length_minor = tf.reshape(sequence_length_minor, [-1]) - kwargs_minor['sequence_length'] = sequence_length_minor - kwargs_major['sequence_length'] = sequence_length_major - - expand, shape = self._get_flatten_order( - order, kwargs_minor, kwargs_major, tf.shape(inputs)) - - inputs = tf.reshape(inputs, shape + [inputs.shape[3]]) - - _, states_minor = self._encoder_minor(inputs, **kwargs_minor) - - self.states_minor_before_medium = states_minor - - if medium is None: - states_minor = self.flatten(states_minor) - else: - if not isinstance(medium, collections.Sequence): - medium = [medium] - for fn in medium: - if isinstance(fn, str) and fn == 'flatten': - states_minor = self.flatten(states_minor) - else: - states_minor = fn(states_minor) - - self.states_minor_after_medium = states_minor - - states_minor = tf.reshape( - states_minor, tf.concat([expand, tf.shape(states_minor)[1:]], 0)) - - outputs_major, states_major = self._encoder_major(states_minor, - **kwargs_major) - - # Add trainable variables of `self._cell` which may be constructed - # externally - if not self._built: - self._add_trainable_variable( - self._encoder_minor.trainable_variables) - self._add_trainable_variable( - self._encoder_major.trainable_variables) - self._built = True - - return outputs_major, states_major - - @staticmethod - def tile_initial_state_minor(initial_state, order, inputs_shape): - """Tiles an initial state to be used for encoder minor. - - The batch dimension of :attr:`initial_state` must equal `T`. The - state will be copied for `B` times and used to start encoding each - low-level sequence. For example, the first utterance in each dialog - history in the batch will have the same initial state. - - Args: - initial_state: Initial state with the batch dimension of size `T`. - order (str): The dimension order of inputs. Must be the same as - used in :meth:`_build`. - inputs_shape: Shape of `inputs` for :meth:`_build`. Can usually - be Obtained with `tf.shape(inputs)`. 
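
Note a latent bug in the deleted `_kwargs_split` above: `k[-6:] == ['_minor']` compares a string slice against a list, so the test is always `False` and no suffixed argument is ever routed. A corrected sketch of the intended suffix routing (hypothetical helper, illustration only):

.. code-block:: python

    def kwargs_split(kwargs):
        # Route '*_minor'/'*_major' keyword arguments to the right encoder.
        kwargs_minor, kwargs_major = {}, {}
        for k, v in kwargs.items():
            if k.endswith('_minor'):
                kwargs_minor[k[:-len('_minor')]] = v
            elif k.endswith('_major'):
                kwargs_major[k[:-len('_major')]] = v
        return kwargs_minor, kwargs_major

    assert kwargs_split({'initial_state_minor': 0, 'time_major_major': 1}) \
        == ({'initial_state': 0}, {'time_major': 1})
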
- - Returns: - A tiled initial state with batch dimension of size `B*T` - """ - def _nest_tile(t, multiplier): - return nest.map_structure(lambda x: tf.tile(x, multiplier), t) - - if order == 'btu': - return _nest_tile(initial_state, inputs_shape[0]) - elif order == 'ubt': - return _nest_tile(initial_state, inputs_shape[1]) - elif order == 'utb': - return tf.contrib.seq2seq.tile_batch(initial_state, inputs_shape[2]) - elif order == 'tbu': - return tf.contrib.seq2seq.tile_batch(initial_state, inputs_shape[1]) - else: - raise ValueError('Unknown order: {}'.format(order)) - - @staticmethod - def _get_flatten_order(order, kwargs_minor, kwargs_major, shape): - if order == 'btu': - kwargs_minor.setdefault('time_major', False) - kwargs_major.setdefault('time_major', False) - expand = shape[0:2] - shape = [shape[0] * shape[1], shape[2]] - elif order == 'utb': - kwargs_minor.setdefault('time_major', True) - kwargs_major.setdefault('time_major', True) - expand = shape[1:3] - shape = [shape[0], shape[1] * shape[2]] - elif order == 'tbu': - kwargs_minor.setdefault('time_major', False) - kwargs_major.setdefault('time_major', True) - expand = shape[0:2] - shape = [shape[0] * shape[1], shape[2]] - elif order == 'ubt': - kwargs_minor.setdefault('time_major', True) - kwargs_major.setdefault('time_major', False) - expand = shape[1:3] - shape = [shape[0], shape[1] * shape[2]] - else: - raise ValueError('Unknown order: {}'.format(order)) - - return expand, shape - - @staticmethod - def flatten(x): - """Flattens a cell state by concatenating a sequence of cell - states along the last dimension. If the cell states are - :tf_main:`LSTMStateTuple `, only the - hidden `LSTMStateTuple.h` is used. - - This process is used by default if :attr:`medium` is not provided - to :meth:`_build`. - """ - if isinstance(x, LSTMStateTuple): - return x.h - if isinstance(x, collections.Sequence): - return tf.concat( - [HierarchicalRNNEncoder.flatten(v) for v in x], -1) - else: - return x - - @property - def encoder_major(self): - """The high-level encoder. - """ - return self._encoder_major - - @property - def encoder_minor(self): - """The low-level encoder. - """ - return self._encoder_minor diff --git a/texar/tf/modules/encoders/hierarchical_encoders_test.py b/texar/tf/modules/encoders/hierarchical_encoders_test.py deleted file mode 100644 index 9206af75..00000000 --- a/texar/tf/modules/encoders/hierarchical_encoders_test.py +++ /dev/null @@ -1,150 +0,0 @@ -# -""" -Unit tests for RNN encoders. 
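
The core trick of the removed `HierarchicalRNNEncoder` is a reshape: for the `'btu'` order, `[B, T, U, dim]` inputs are flattened to `[B*T, U, dim]` so the minor encoder processes each low-level sequence independently, and the minor final states are reshaped back to `[B, T, ...]` for the major encoder. A compact sketch with `tf.keras` LSTMs standing in for the Texar RNN encoders (sizes illustrative):

.. code-block:: python

    import tensorflow as tf

    B, T, U, dim = 16, 8, 6, 10
    inputs = tf.random.uniform([B, T, U, dim])

    # Minor encoder sees B*T independent low-level sequences.
    minor = tf.keras.layers.LSTM(32)
    states_minor = minor(tf.reshape(inputs, [B * T, U, dim]))    # [B*T, 32]

    # Major encoder consumes one minor state per high-level step.
    major = tf.keras.layers.LSTM(64)
    outputs_major = major(tf.reshape(states_minor, [B, T, 32]))  # [B, 64]
    assert outputs_major.shape == (B, 64)
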
-""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import tensorflow as tf - -from texar.tf.modules.encoders.hierarchical_encoders import HierarchicalRNNEncoder - -# pylint: disable=too-many-locals - - -class HierarchicalRNNEncoderTest(tf.test.TestCase): - """Tests HierarchicalRNNEncoder - """ - - def test_trainable_variables(self): - encoder = HierarchicalRNNEncoder() - - inputs = tf.random_uniform( - [3, 2, 3, 4], - maxval=1, - minval=-1, - dtype=tf.float32) - _, _ = encoder(inputs) - - self.assertEqual( - len(encoder.trainable_variables), - len(encoder.encoder_major.trainable_variables) + - len(encoder.encoder_minor.trainable_variables)) - - def test_encode(self): - encoder = HierarchicalRNNEncoder() - - batch_size = 16 - max_major_time = 8 - max_minor_time = 6 - dim = 10 - inputs = tf.random_uniform( - [batch_size, max_major_time, max_minor_time, dim], - maxval=1, - minval=-1, - dtype=tf.float32) - outputs, state = encoder(inputs) - - cell_dim = encoder.encoder_major.hparams.rnn_cell.kwargs.num_units - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_, state_ = sess.run([outputs, state]) - self.assertEqual(state_[0].shape, (batch_size, cell_dim)) - - def test_order(self): - encoder = HierarchicalRNNEncoder() - - batch_size = 16 - max_major_time = 8 - max_minor_time = 6 - dim = 10 - inputs = tf.random_uniform( - [batch_size, max_major_time, max_minor_time, dim], - maxval=1, - minval=-1, - dtype=tf.float32) - - outputs_1, state_1 = encoder(inputs, order='btu') - outputs_2, state_2 = encoder(inputs, order='utb') - outputs_3, state_3 = encoder(inputs, order='tbu') - outputs_4, state_4 = encoder(inputs, order='ubt') - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run([outputs_1, state_1, outputs_2, state_2, - outputs_3, state_3, outputs_4, state_4]) - - def test_depack(self): - hparams = { - "encoder_major_type": "BidirectionalRNNEncoder", - "encoder_major_hparams": { - "rnn_cell_fw": { - "type": "LSTMCell", - "kwargs": { - "num_units": 100 - } - } - } - } - encoder = HierarchicalRNNEncoder(hparams=hparams) - - batch_size = 16 - max_major_time = 8 - max_minor_time = 6 - dim = 10 - inputs = tf.random_uniform( - [batch_size, max_major_time, max_minor_time, dim], - maxval=1, - minval=-1, - dtype=tf.float32) - - _, _ = encoder(inputs) - - self.assertEqual( - encoder.states_minor_before_medium.h.shape[1], - encoder.states_minor_after_medium.shape[1]) - - def test_encoder_minor_as_birnn(self): - """Tests encoder_minor as a BidirectionalRNNEncoder - """ - hparams = { - "encoder_minor_type": "BidirectionalRNNEncoder", - "encoder_minor_hparams": { - "rnn_cell_fw": { - "type": "LSTMCell", - "kwargs": { - "num_units": 100 - } - } - }, - "encoder_major_hparams": { - "rnn_cell": { - "type": "LSTMCell", - "kwargs": { - "num_units": 200 - } - } - } - } - encoder = HierarchicalRNNEncoder(hparams=hparams) - - batch_size = 16 - max_major_time = 8 - max_minor_time = 6 - dim = 10 - inputs = tf.random_uniform( - [batch_size, max_major_time, max_minor_time, dim], - maxval=1, - minval=-1, - dtype=tf.float32) - - outputs, _ = encoder(inputs) - self.assertEqual(list(outputs.shape), [16, 8, 200]) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/modules/encoders/multihead_attention.py b/texar/tf/modules/encoders/multihead_attention.py index e2648260..edb374ef 100644 --- 
a/texar/tf/modules/encoders/multihead_attention.py +++ b/texar/tf/modules/encoders/multihead_attention.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,23 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Transformer encoders with multihead self attention. +Transformer encoders with multi-head self attention. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - import tensorflow as tf -from texar.tf.core import layers +from texar.tf.core.layers import get_initializer from texar.tf.modules.encoders.encoder_base import EncoderBase -from texar.tf.utils.shapes import shape_list from texar.tf.utils.mode import is_train_mode -from texar.tf.utils import transpose_batch_time +from texar.tf.utils.shapes import transpose_batch_time -# pylint: disable=too-many-locals, invalid-name, arguments-differ -# pylint: disable=too-many-arguments __all__ = [ "MultiheadAttentionEncoder" @@ -36,79 +29,76 @@ class MultiheadAttentionEncoder(EncoderBase): - """Multihead Attention Encoder + r"""Multi-head Attention Encoder Args: hparams (dict or HParams, optional): Hyperparameters. Missing hyperparamerter will be set to default values. See :meth:`default_hparams` for the hyperparameter sturcture and default values. - - .. document private functions - .. automethod:: _build """ def __init__(self, hparams=None): - EncoderBase.__init__(self, hparams) + super().__init__(hparams=hparams) use_bias = self._hparams.use_bias - - with tf.variable_scope(self.variable_scope): - if self._hparams.initializer: - tf.get_variable_scope().set_initializer( - layers.get_initializer(self._hparams.initializer)) - - self.Q_dense = tf.layers.Dense(self._hparams.num_units, - use_bias=use_bias, - name='query') - self.K_dense = tf.layers.Dense(self._hparams.num_units, - use_bias=use_bias, - name='key') - self.V_dense = tf.layers.Dense(self._hparams.num_units, - use_bias=use_bias, - name='value') - self.O_dense = tf.layers.Dense(self._hparams.output_dim, - use_bias=use_bias, - name='output') + initializer = get_initializer(self._hparams.initializer) + + self.Q_dense = tf.keras.layers.Dense(self._hparams.num_units, + use_bias=use_bias, + kernel_initializer=initializer, + bias_initializer=initializer) + self.K_dense = tf.keras.layers.Dense(self._hparams.num_units, + use_bias=use_bias, + kernel_initializer=initializer, + bias_initializer=initializer) + self.V_dense = tf.keras.layers.Dense(self._hparams.num_units, + use_bias=use_bias, + kernel_initializer=initializer, + bias_initializer=initializer) + self.O_dense = tf.keras.layers.Dense(self._hparams.output_dim, + use_bias=use_bias, + kernel_initializer=initializer, + bias_initializer=initializer) @staticmethod def default_hparams(): - """Returns a dictionary of hyperparameters with default values. + r"""Returns a dictionary of hyperparameters with default values. .. 
code-block:: python

            {
                "initializer": None,
-                'num_heads': 8,
-                'output_dim': 512,
-                'num_units': 512,
-                'dropout_rate': 0.1,
-                'use_bias': False,
+                "num_heads": 8,
+                "output_dim": 512,
+                "num_units": 512,
+                "dropout_rate": 0.1,
+                "use_bias": False,
                "name": "multihead_attention"
            }

        Here:

-        "initializer": dict, optional
+        `"initializer"`: dict, optional
            Hyperparameters of the default initializer that initializes
            variables created in this module.
            See :func:`~texar.tf.core.get_initializer` for details.

-        "num_heads": int
+        `"num_heads"`: int
            Number of heads for attention calculation.

-        "output_dim": int
+        `"output_dim"`: int
            Output dimension of the returned tensor.

-        "num_units": int
+        `"num_units"`: int
            Hidden dimension of the unsplitted attention space.
            Should be devisible by `num_heads`.

-        "dropout_rate: : float
+        `"dropout_rate"`: float
            Dropout rate in the attention.

-        "use_bias": bool
+        `"use_bias"`: bool
            Use bias when projecting the key, value and query.

-        "name": str
+        `"name"`: str
            Name of the module.
        """
        return {
@@ -121,133 +111,134 @@ def default_hparams():
            "name": "multihead_attention",
        }

-    def _build(self, queries, memory, memory_attention_bias,
-               cache=None, mode=None):
-        """Encodes the inputs.
+    def __call__(self, queries, memory, memory_attention_bias,
+                 cache=None, mode=None):
+        r"""Encodes the inputs.

        Args:
-            queries: A 3d tensor with shape of [batch, length_query,
-                depth_query].
-            memory: A 3d tensor with shape of [batch, length_key, depth_key].
+            queries: A 3d tensor with shape of
+                ``[batch, length_query, depth_query]``.
+            memory: A 3d tensor with shape of
+                ``[batch, length_key, depth_key]``.
            memory_attention_bias: A 3d tensor with shape of
-                [batch, length_key, num_units].
-            cache: Memory cache only when inferencing the sentence from sractch.
+                ``[batch, length_key, num_units]``.
+            cache: Memory cache only when inferring the sentence from scratch.
            mode (optional): A tensor taking value in
                :tf_main:`tf.estimator.ModeKeys `, including
-                `TRAIN`, `EVAL` and `PREDICT`. Controls dropout mode.
-                If `None` (default), :func:`texar.tf.global_mode` is used.
+                `TRAIN`, `EVAL`, and `PREDICT`.

        Returns:
-            A Tensor of shape `[batch_size, max_time, dim]` containing the
+            A Tensor of shape ``[batch_size, max_time, dim]`` containing the
            encoded vectors.
        """
+        return super().__call__([queries, memory, memory_attention_bias],
+                                cache, mode)

-        with tf.variable_scope(self.variable_scope):
-            num_heads = self._hparams.num_heads
-            num_units = self._hparams.num_units
-            if num_units % num_heads:
-                raise ValueError("Value depth (%d) must be divisible by "
-                                 "the number of attention heads (%d)."
-                                 % (num_units, num_heads))
-
-            def _update_and_return(layer, key):
-                if memory is None:
-                    # Self Attention
-                    out = layer(queries)
-
-                    if cache is not None:
-                        # 'decoder self attention when dynamic decoding'
-                        key = 'self_{}'.format(key)
-                        res = cache[key]
-                        if isinstance(res, tf.TensorArray):
-                            # inference-like decoding
-                            # TODO(zhiting): This writing op may cause a bug
-                            # on CPU--it looks the two TensorArray
-                            # cache['self_keys'] and cache['self_values']
-                            # will mix up starting from certain step, causing
-                            # shape mismatch. This op looks fine on GPU.
- res = res.write( - res.size(), tf.squeeze(out, axis=[1])) - out = transpose_batch_time(res.stack()) - else: - # normal decoding - res = tf.concat([res, out], axis=1) - out = res - cache[key] = res - - else: - # encoder decoder attention - if cache is not None: - key = 'memory_{}'.format(key) - res = cache[key] - if isinstance(res, tf.TensorArray): - # inference-like decoding - size = res.size() - false_fn = lambda: transpose_batch_time(res.stack()) - else: - # normal decoding - size = tf.shape(res)[1] - false_fn = lambda: res - out = tf.cond( - tf.equal(size, 0), - true_fn=lambda: layer(memory), - false_fn=false_fn) + def call(self, inputs, cache, mode): + r"""Encodes the inputs. + """ + queries, memory, memory_attention_bias = inputs + num_heads = self._hparams.num_heads + num_units = self._hparams.num_units + if num_units % num_heads: + raise ValueError("Value depth (%d) must be divisible by " + "the number of attention heads (%d)." + % (num_units, num_heads)) + + def _update_and_return(layer, key): + if memory is None: + # Self Attention + out = layer(queries) + + if cache is not None: + # 'decoder self attention when dynamic decoding' + key = 'self_{}'.format(key) + res = cache[key] + if isinstance(res, tf.TensorArray): + # inference-like decoding + # TODO(zhiting): This writing op may cause a bug + # on CPU--it looks the two TensorArray + # cache['self_keys'] and cache['self_values'] + # will mix up starting from certain step, causing + # shape mismatch. This op looks fine on GPU. + res = res.write(res.size(), tf.squeeze(out, axis=[1])) + out = transpose_batch_time(res.stack()) else: - out = layer(memory) - - return out - - Q = self.Q_dense(queries) - K = _update_and_return(self.K_dense, 'keys') - V = _update_and_return(self.V_dense, 'values') - - Q_ = self._split_heads(Q) - K_ = self._split_heads(K) - V_ = self._split_heads(V) - # [batch_size, num_heads, seq_length, memory_depth] - key_depth_per_head = num_units // num_heads - Q_ *= key_depth_per_head**-0.5 - - logits = tf.matmul(Q_, K_, transpose_b=True) - if memory_attention_bias is not None: - logits += memory_attention_bias - weights = tf.nn.softmax(logits, name="attention_weights") - weights = tf.layers.dropout(weights, - rate=self._hparams.dropout_rate, - training=is_train_mode(mode)) - outputs = tf.matmul(weights, V_) - - outputs = self._combine_heads(outputs) - outputs = self.O_dense(outputs) - # (batch_size, length_query, output_dim) + # normal decoding + res = tf.concat([res, out], axis=1) + out = res + cache[key] = res + + else: + # encoder decoder attention + if cache is not None: + key = 'memory_{}'.format(key) + res = cache[key] + if isinstance(res, tf.TensorArray): + # inference-like decoding + size = res.size() + false_fn = lambda: transpose_batch_time(res.stack()) + else: + # normal decoding + size = res.shape[1] + false_fn = lambda: res - if not self._built: - self._add_internal_trainable_variables() - self._built = True + if size == 0: + out = layer(memory) + else: + out = false_fn() + else: + out = layer(memory) + + return out + + Q = self.Q_dense(queries) + K = _update_and_return(self.K_dense, 'keys') + V = _update_and_return(self.V_dense, 'values') + + Q_ = self._split_heads(Q) + K_ = self._split_heads(K) + V_ = self._split_heads(V) + # [batch_size, num_heads, seq_length, memory_depth] + key_depth_per_head = num_units // num_heads + Q_ *= key_depth_per_head**-0.5 + + logits = tf.matmul(Q_, K_, transpose_b=True) + if memory_attention_bias is not None: + logits += memory_attention_bias + weights = 
tf.nn.softmax(logits, name="attention_weights") + if is_train_mode(mode): + weights = tf.nn.dropout(weights, + rate=self._hparams.dropout_rate) + outputs = tf.matmul(weights, V_) + + outputs = self._combine_heads(outputs) + outputs = self.O_dense(outputs) + # (batch_size, length_query, output_dim) return outputs def _split_heads(self, x): - """Split channels (dimension 2) into multiple heads, + r"""Split channels (dimension 2) into multiple heads, becomes dimension 1). - Must ensure `x.shape[-1]` can be deviced by num_heads + Must ensure `x.shape[-1]` can be divided by num_heads. """ - depth = shape_list(x)[-1] + depth = x.shape.as_list()[-1] splitted_x = tf.reshape(x, [tf.shape(x)[0], tf.shape(x)[1], self._hparams.num_heads, depth // self._hparams.num_heads]) return tf.transpose(splitted_x, [0, 2, 1, 3]) def _combine_heads(self, x): - """ + r""" Args: - x: A Tensor of shape `[batch, num_heads, seq_len, dim]` + x: A Tensor of shape ``[batch, num_heads, seq_len, dim]`` Returns: - A Tensor of shape `[batch, seq_len, num_heads * dim]` + A Tensor of shape ``[batch, seq_len, num_heads * dim]`` """ t = tf.transpose(x, [0, 2, 1, 3]) # [batch, seq_len, num_heads, dim] - num_heads, dim = shape_list(t)[-2:] + num_heads, dim = t.shape.as_list()[-2:] assert num_heads == self._hparams.num_heads return tf.reshape(t, [tf.shape(t)[0], tf.shape(t)[1], num_heads * dim]) diff --git a/texar/tf/modules/encoders/rnn_encoders.py b/texar/tf/modules/encoders/rnn_encoders.py deleted file mode 100644 index d8e6d327..00000000 --- a/texar/tf/modules/encoders/rnn_encoders.py +++ /dev/null @@ -1,921 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Various RNN encoders. 
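
For reference, the heart of the rewritten `MultiheadAttentionEncoder.call` above is scaled dot-product attention over split heads; a standalone sketch under assumed shapes (dropout is shown unconditionally here, whereas the module gates it on train mode):

.. code-block:: python

    import tensorflow as tf

    batch, heads, seq_len, depth = 4, 8, 10, 64
    Q = tf.random.normal([batch, heads, seq_len, depth]) * depth ** -0.5
    K = tf.random.normal([batch, heads, seq_len, depth])
    V = tf.random.normal([batch, heads, seq_len, depth])

    logits = tf.matmul(Q, K, transpose_b=True)   # [batch, heads, q, k]
    weights = tf.nn.softmax(logits)
    weights = tf.nn.dropout(weights, rate=0.1)   # train mode only in the module
    outputs = tf.matmul(weights, V)              # [batch, heads, q, depth]
    assert outputs.shape == (batch, heads, seq_len, depth)
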
-""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import functools -import numpy as np - -import tensorflow as tf -from tensorflow.contrib.framework import nest - -from texar.tf.modules.encoders.encoder_base import EncoderBase -from texar.tf.modules.networks.conv_networks import _to_list -from texar.tf.core import layers -from texar.tf.utils.mode import is_train_mode -from texar.tf.utils.shapes import mask_sequences -from texar.tf.hyperparams import HParams - -# pylint: disable=too-many-arguments, too-many-locals, invalid-name, no-member - -__all__ = [ - "_forward_single_output_layer", - "RNNEncoderBase", - "UnidirectionalRNNEncoder", - "BidirectionalRNNEncoder" -] - - -def _default_output_layer_hparams(): - return { - "num_layers": 0, - "layer_size": 128, - "activation": "identity", - "final_layer_activation": None, - "other_dense_kwargs": None, - "dropout_layer_ids": [], - "dropout_rate": 0.5, - "variational_dropout": False, - "@no_typecheck": ["activation", "final_layer_activation", - "layer_size", "dropout_layer_ids"] - } - - -def _build_dense_output_layer(hparams): - nlayers = hparams.num_layers - - if nlayers <= 0: - return None - - layer_size = _to_list( - hparams.layer_size, 'output_layer.layer_size', nlayers) - - other_kwargs = hparams.other_dense_kwargs or {} - if isinstance(other_kwargs, HParams): - other_kwargs = other_kwargs.todict() - if not isinstance(other_kwargs, dict): - raise ValueError( - "hparams 'output_layer.other_dense_kwargs' must be a dict.") - - dense_layers = [] - for i in range(nlayers): - if i == nlayers - 1: - activation = hparams.final_layer_activation - else: - activation = hparams.activation - - kwargs_i = {"units": layer_size[i], - "activation": activation, - "name": "dense_%d" % (i + 1)} - kwargs_i.update(other_kwargs) - - layer_hparams = {"type": "Dense", "kwargs": kwargs_i} - dense_layers.append(layers.get_layer(hparams=layer_hparams)) - - if len(dense_layers) == 1: - dense_layers = dense_layers[0] - - return dense_layers - - -def _forward_single_output_layer(inputs, input_size, output_layer): - """Forwards the input through a single output layer. - - Args: - inputs: A Tensor of shape `[batch_size, max_time] + input_size` if - :attr:`time_major=False`, or shape - `[max_time, batch_size] + input_size` if :attr:`time_major=True`. - input_size: An `int` or 1D `int` array. - """ - dim = np.prod(input_size) - inputs_flat = inputs - inputs_flat = tf.reshape(inputs_flat, [-1, dim]) - # Feed to the layer - output_flat = output_layer(inputs_flat) - output_size = output_layer.compute_output_shape([1, dim]).as_list()[1:] - output_size = np.array(output_size) - # Reshape output to [batch_size/max_time, max_time/batch_size] + output_size - output_shape = tf.concat([tf.shape(inputs)[:2], output_size], axis=0) - output = tf.reshape(output_flat, output_shape) - return output, output_size - - -def _apply_dropout(inputs, time_major, hparams, training): - """Applies dropout to the inputs. - - :attr:`inputs` is a Tensor of shape `[batch_size, max_time, dim]` - if :attr:`time_major=False`, or shape `[max_time, batch_size, dim]` - if :attr:`time_major=True`. 
- """ - noise_shape = None - if hparams.variational_dropout: - if time_major: - noise_shape = [1, None, None] - else: - noise_shape = [None, 1, None] - return tf.layers.dropout(inputs, rate=hparams.dropout_rate, - noise_shape=noise_shape, training=training) - - -def _forward_output_layers(inputs, input_size, output_layer, time_major, - hparams, mode, sequence_length=None): - """Forwards inputs through the output layers. - - Args: - inputs: A Tensor of shape `[batch_size, max_time] + input_size` if - :attr:`time_major=False`, or shape - `[max_time, batch_size] + input_size` if :attr:`time_major=True`. - - Returns: - A pair :attr:`(outputs, outputs_size), where - - - :attr:`outputs`: A Tensor of shape \ - `[batch_size, max_time] + outputs_size`. - - - :attr:`outputs_size`: An `int` or 1D `int` array representing the \ - output size. - """ - if output_layer is None: - return inputs, input_size - - if hparams is None: - # output_layer was passed in from the constructor - if isinstance(output_layer, (list, tuple)): - raise ValueError('output_layer must not be a list or tuple.') - output, output_size = _forward_single_output_layer( - inputs, input_size, output_layer) - else: - # output_layer was built based on hparams - output_layer = _to_list(output_layer) - - dropout_layer_ids = _to_list(hparams.dropout_layer_ids) - if len(dropout_layer_ids) > 0: - training = is_train_mode(mode) - - output = inputs - output_size = input_size - for i, layer in enumerate(output_layer): - if i in dropout_layer_ids: - output = _apply_dropout(output, time_major, hparams, training) - output, output_size = _forward_single_output_layer( - output, output_size, layer) - - if len(output_layer) in dropout_layer_ids: - output = _apply_dropout(output, time_major, hparams, training) - - if sequence_length is not None: - output = mask_sequences( - output, sequence_length, time_major=time_major, tensor_rank=3) - - return output, output_size - - -def _apply_rnn_encoder_output_layer(output_layer, time_major, hparams, mode, - cell_outputs, cell_output_size): - map_func = functools.partial( - _forward_output_layers, - output_layer=output_layer, - time_major=time_major, - hparams=hparams, - mode=mode) - cell_outputs_flat = nest.flatten(cell_outputs) - cell_output_size_flat = nest.flatten(cell_output_size) - o = [map_func(inputs=x, input_size=xs) - for x, xs in zip(cell_outputs_flat, cell_output_size_flat)] - outputs_flat, output_size_flat = zip(*o) - outputs = nest.pack_sequence_as(cell_outputs, outputs_flat) - output_size = nest.pack_sequence_as(cell_outputs, output_size_flat) - return outputs, output_size - - -class RNNEncoderBase(EncoderBase): - """Base class for all RNN encoder classes to inherit. - - Args: - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparamerter will be set to default values. See - :meth:`default_hparams` for the hyperparameter sturcture and - default values. - """ - - def __init__(self, hparams=None): - EncoderBase.__init__(self, hparams) - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - .. code-block:: python - - { - "name": "rnn_encoder" - } - """ - return { - "name": "rnn_encoder" - } - - def _build(self, inputs, *args, **kwargs): - """Encodes the inputs. - - Args: - inputs: Inputs to the encoder. - *args: Other arguments. - **kwargs: Keyword arguments. - - Returns: - Encoding results. - """ - raise NotImplementedError - - -class UnidirectionalRNNEncoder(RNNEncoderBase): - """One directional RNN encoder. 
- - Args: - cell: (RNNCell, optional) If not specified, - a cell is created as specified in :attr:`hparams["rnn_cell"]`. - cell_dropout_mode (optional): A Tensor taking value of - :tf_main:`tf.estimator.ModeKeys `, which - toggles dropout in the RNN cell (e.g., activates dropout in - TRAIN mode). If `None`, :func:`~texar.tf.global_mode` is used. - Ignored if :attr:`cell` is given. - output_layer (optional): An instance of - :tf_main:`tf.layers.Layer `. Applies to the RNN cell - output of each step. If `None` (default), the output layer is - created as specified in :attr:`hparams["output_layer"]`. - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparamerter will be set to default values. See - :meth:`default_hparams` for the hyperparameter sturcture and - default values. - - See :meth:`_build` for the inputs and outputs of the encoder. - - Example: - - .. code-block:: python - - # Use with embedder - embedder = WordEmbedder(vocab_size, hparams=emb_hparams) - encoder = UnidirectionalRNNEncoder(hparams=enc_hparams) - - outputs, final_state = encoder( - inputs=embedder(data_batch['text_ids']), - sequence_length=data_batch['length']) - - .. document private functions - .. automethod:: _build - """ - - def __init__(self, - cell=None, - cell_dropout_mode=None, - output_layer=None, - hparams=None): - RNNEncoderBase.__init__(self, hparams) - - # Make RNN cell - with tf.variable_scope(self.variable_scope): - if cell is not None: - self._cell = cell - else: - self._cell = layers.get_rnn_cell( - self._hparams.rnn_cell, cell_dropout_mode) - - # Make output layer - with tf.variable_scope(self.variable_scope): - if output_layer is not None: - self._output_layer = output_layer - self._output_layer_hparams = None - else: - self._output_layer = _build_dense_output_layer( - self._hparams.output_layer) - self._output_layer_hparams = self._hparams.output_layer - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - .. code-block:: python - - { - "rnn_cell": default_rnn_cell_hparams(), - "output_layer": { - "num_layers": 0, - "layer_size": 128, - "activation": "identity", - "final_layer_activation": None, - "other_dense_kwargs": None, - "dropout_layer_ids": [], - "dropout_rate": 0.5, - "variational_dropout": False - }, - "name": "unidirectional_rnn_encoder" - } - - Here: - - "rnn_cell": dict - A dictionary of RNN cell hyperparameters. Ignored if - :attr:`cell` is given to the encoder constructor. - - The default value is defined in - :func:`~texar.tf.core.default_rnn_cell_hparams`. - - "output_layer": dict - Output layer hyperparameters. Ignored if :attr:`output_layer` - is given to the encoder constructor. Includes: - - "num_layers": int - The number of output (dense) layers. Set to 0 to avoid any - output layers applied to the cell outputs.. - - "layer_size": int or list - The size of each of the output (dense) layers. - - If an `int`, each output layer will have the same size. If - a list, the length must equal to :attr:`num_layers`. - - "activation": str or callable or None - Activation function for each of the output (dense) - layer except for the final layer. This can be - a function, or its string name or module path. - If function name is given, the function must be from - module :tf_main:`tf.nn ` or :tf_main:`tf < >`. - For example - - .. 
code-block:: python - - "activation": "relu" # function name - "activation": "my_module.my_activation_fn" # module path - "activation": my_module.my_activation_fn # function - - Default is `None` which maintains a linear activation. - - "final_layer_activation": str or callable or None - The activation function for the final output layer. - - "other_dense_kwargs": dict or None - Other keyword arguments to construct each of the output - dense layers, e.g., `use_bias`. See - :tf_main:`Dense ` for the keyword arguments. - - "dropout_layer_ids": int or list - The indexes of layers (starting from `0`) whose inputs - are applied with dropout. The index = :attr:`num_layers` - means dropout applies to the final layer output. E.g., - - .. code-block:: python - - { - "num_layers": 2, - "dropout_layer_ids": [0, 2] - } - - will leads to a series of layers as - `-dropout-layer0-layer1-dropout-`. - - The dropout mode (training or not) is controlled - by the :attr:`mode` argument of :meth:`_build`. - - "dropout_rate": float - The dropout rate, between 0 and 1. E.g., - `"dropout_rate": 0.1` would drop out 10% of elements. - - "variational_dropout": bool - Whether the dropout mask is the same across all time steps. - - "name": str - Name of the encoder - """ - hparams = RNNEncoderBase.default_hparams() - hparams.update({ - "rnn_cell": layers.default_rnn_cell_hparams(), - "output_layer": _default_output_layer_hparams(), - "name": "unidirectional_rnn_encoder" - }) - return hparams - - def _build(self, - inputs, - sequence_length=None, - initial_state=None, - time_major=False, - mode=None, - return_cell_output=False, - return_output_size=False, - **kwargs): - """Encodes the inputs. - - Args: - inputs: A 3D Tensor of shape `[batch_size, max_time, dim]`. - The first two dimensions - :attr:`batch_size` and :attr:`max_time` are exchanged if - :attr:`time_major=True` is specified. - sequence_length (optional): A 1D int tensor of shape `[batch_size]`. - Sequence lengths - of the batch inputs. Used to copy-through state and zero-out - outputs when past a batch element's sequence length. - initial_state (optional): Initial state of the RNN. - time_major (bool): The shape format of the :attr:`inputs` and - :attr:`outputs` Tensors. If `True`, these tensors are of shape - `[max_time, batch_size, depth]`. If `False` (default), - these tensors are of shape `[batch_size, max_time, depth]`. - mode (optional): A tensor taking value in - :tf_main:`tf.estimator.ModeKeys `, including - `TRAIN`, `EVAL`, and `PREDICT`. Controls output layer dropout - if the output layer is specified with :attr:`hparams`. - If `None` (default), :func:`texar.tf.global_mode` - is used. - return_cell_output (bool): Whether to return the output of the RNN - cell. This is the results prior to the output layer. - return_output_size (bool): Whether to return the size of the - output (i.e., the results after output layers). - **kwargs: Optional keyword arguments of - :tf_main:`tf.nn.dynamic_rnn `, - such as `swap_memory`, `dtype`, `parallel_iterations`, etc. - - Returns: - - By default (both `return_cell_output` and \ - `return_output_size` are False), returns a pair \ - :attr:`(outputs, final_state)` - - - :attr:`outputs`: The RNN output tensor by the output layer \ - (if exists) or the RNN cell (otherwise). The tensor is of \ - shape `[batch_size, max_time, output_size]` if \ - `time_major` is False, or \ - `[max_time, batch_size, output_size]` if \ - `time_major` is True. 
\ - If RNN cell output is a (nested) tuple of Tensors, then the \ - :attr:`outputs` will be a (nested) tuple having the same \ - nest structure as the cell output. - - - :attr:`final_state`: The final state of the RNN, which is a \ - Tensor of shape `[batch_size] + cell.state_size` or \ - a (nested) tuple of Tensors if `cell.state_size` is a (nested)\ - tuple. - - - If `return_cell_output` is True, returns a triple \ - :attr:`(outputs, final_state, cell_outputs)` - - - :attr:`cell_outputs`: The outputs by the RNN cell prior to \ - the \ - output layer, having the same structure with :attr:`outputs` \ - except for the `output_dim`. - - - If `return_output_size` is `True`, returns a tuple \ - :attr:`(outputs, final_state, output_size)` - - - :attr:`output_size`: A (possibly nested tuple of) int \ - representing the size of :attr:`outputs`. If a single int or \ - an int array, then `outputs` has shape \ - `[batch/time, time/batch] + output_size`. If \ - a (nested) tuple, then `output_size` has the same \ - structure as with `outputs`. - - - If both `return_cell_output` and \ - `return_output_size` are True, returns \ - :attr:`(outputs, final_state, cell_outputs, output_size)`. - """ - if ('dtype' not in kwargs) and (initial_state is None): - cell_outputs, state = tf.nn.dynamic_rnn( - cell=self._cell, - inputs=inputs, - sequence_length=sequence_length, - initial_state=initial_state, - time_major=time_major, - dtype=tf.float32, - **kwargs) - else: - cell_outputs, state = tf.nn.dynamic_rnn( - cell=self._cell, - inputs=inputs, - sequence_length=sequence_length, - initial_state=initial_state, - time_major=time_major, - **kwargs) - - outputs, output_size = _apply_rnn_encoder_output_layer( - self._output_layer, time_major, self._output_layer_hparams, - mode, cell_outputs, self._cell.output_size) - - if not self._built: - self._add_internal_trainable_variables() - # Add trainable variables of `self._cell` and `self._output_layer` - # which may be constructed externally. - self._add_trainable_variable( - layers.get_rnn_cell_trainable_variables(self._cell)) - if self._output_layer and \ - not isinstance(self._output_layer, (list, tuple)): - self._add_trainable_variable( - self._output_layer.trainable_variables) - self._built = True - - rets = (outputs, state) - if return_cell_output: - rets += (cell_outputs, ) - if return_output_size: - rets += (output_size, ) - return rets - - @property - def cell(self): - """The RNN cell. - """ - return self._cell - - @property - def state_size(self): - """The state size of encoder cell. - - Same as :attr:`encoder.cell.state_size`. - """ - return self.cell.state_size - - @property - def output_layer(self): - """The output layer. - """ - return self._output_layer - - -class BidirectionalRNNEncoder(RNNEncoderBase): - """Bidirectional forward-backward RNN encoder. - - Args: - cell_fw (RNNCell, optional): The forward RNN cell. If not given, - a cell is created as specified in :attr:`hparams["rnn_cell_fw"]`. - cell_bw (RNNCell, optional): The backward RNN cell. If not given, - a cell is created as specified in :attr:`hparams["rnn_cell_bw"]`. - cell_dropout_mode (optional): A tensor taking value of - :tf_main:`tf.estimator.ModeKeys `, which - toggles dropout in the RNN cells (e.g., activates dropout in - TRAIN mode). If `None`, :func:`~texar.tf.global_mode()` is - used. Ignored if respective cell is given. - output_layer_fw (optional): An instance of - :tf_main:`tf.layers.Layer `. Apply to the forward - RNN cell output of each step. 
If `None` (default), the output - layer is created as specified in :attr:`hparams["output_layer_fw"]`. - output_layer_bw (optional): An instance of - :tf_main:`tf.layers.Layer `. Apply to the backward - RNN cell output of each step. If `None` (default), the output - layer is created as specified in :attr:`hparams["output_layer_bw"]`. - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparamerter will be set to default values. See - :meth:`default_hparams` for the hyperparameter sturcture and - default values. - - See :meth:`_build` for the inputs and outputs of the encoder. - - Example: - - .. code-block:: python - - # Use with embedder - embedder = WordEmbedder(vocab_size, hparams=emb_hparams) - encoder = BidirectionalRNNEncoder(hparams=enc_hparams) - - outputs, final_state = encoder( - inputs=embedder(data_batch['text_ids']), - sequence_length=data_batch['length']) - # outputs == (outputs_fw, outputs_bw) - # final_state == (final_state_fw, final_state_bw) - - .. document private functions - .. automethod:: _build - """ - - def __init__(self, - cell_fw=None, - cell_bw=None, - cell_dropout_mode=None, - output_layer_fw=None, - output_layer_bw=None, - hparams=None): - RNNEncoderBase.__init__(self, hparams) - - # Make RNN cells - with tf.variable_scope(self.variable_scope): - if cell_fw is not None: - self._cell_fw = cell_fw - else: - self._cell_fw = layers.get_rnn_cell( - self._hparams.rnn_cell_fw, cell_dropout_mode) - - if cell_bw is not None: - self._cell_bw = cell_bw - elif self._hparams.rnn_cell_share_config: - self._cell_bw = layers.get_rnn_cell( - self._hparams.rnn_cell_fw, cell_dropout_mode) - else: - self._cell_bw = layers.get_rnn_cell( - self._hparams.rnn_cell_bw, cell_dropout_mode) - - # Make output layers - with tf.variable_scope(self.variable_scope): - if output_layer_fw is not None: - self._output_layer_fw = output_layer_fw - self._output_layer_hparams_fw = None - else: - self._output_layer_fw = _build_dense_output_layer( - self._hparams.output_layer_fw) - self._output_layer_hparams_fw = self._hparams.output_layer_fw - - if output_layer_bw is not None: - self._output_layer_bw = output_layer_bw - self._output_layer_hparams_bw = None - elif self._hparams.output_layer_share_config: - self._output_layer_bw = _build_dense_output_layer( - self._hparams.output_layer_fw) - self._output_layer_hparams_bw = self._hparams.output_layer_fw - else: - self._output_layer_bw = _build_dense_output_layer( - self._hparams.output_layer_bw) - self._output_layer_hparams_bw = self._hparams.output_layer_bw - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - .. code-block:: python - - { - "rnn_cell_fw": default_rnn_cell_hparams(), - "rnn_cell_bw": default_rnn_cell_hparams(), - "rnn_cell_share_config": True, - "output_layer_fw": { - "num_layers": 0, - "layer_size": 128, - "activation": "identity", - "final_layer_activation": None, - "other_dense_kwargs": None, - "dropout_layer_ids": [], - "dropout_rate": 0.5, - "variational_dropout": False - }, - "output_layer_bw": { - # Same hyperparams and default values as "output_layer_fw" - # ... - }, - "output_layer_share_config": True, - "name": "bidirectional_rnn_encoder" - } - - Here: - - "rnn_cell_fw": dict - Hyperparameters of the forward RNN cell. - Ignored if :attr:`cell_fw` is given to the encoder constructor. - - The default value is defined in - :func:`~texar.tf.core.default_rnn_cell_hparams`. - - "rnn_cell_bw": dict - Hyperparameters of the backward RNN cell. 
- Ignored if :attr:`cell_bw` is given to the encoder constructor - , or if :attr:`"rnn_cell_share_config"` is `True`. - - The default value is defined in - :meth:`~texar.tf.core.default_rnn_cell_hparams`. - - "rnn_cell_share_config": bool - Whether share hyperparameters of the backward cell with the - forward cell. Note that the cell parameters (variables) are not - shared. - - "output_layer_fw": dict - Hyperparameters of the forward output layer. Ignored if - :attr:`output_layer_fw` is given to the constructor. - See the "output_layer" field of - :meth:`~texar.tf.modules.UnidirectionalRNNEncoder.default_hparams` for - details. - - "output_layer_bw": dict - Hyperparameters of the backward output layer. Ignored if - :attr:`output_layer_bw` is given to the constructor. Have the - same structure and defaults with :attr:`"output_layer_fw"`. - - Ignored if :attr:`"output_layer_share_config"` is True. - - "output_layer_share_config": bool - Whether share hyperparameters of the backward output layer - with the forward output layer. Note that the layer parameters - (variables) are not shared. - - "name": str - Name of the encoder - """ - hparams = RNNEncoderBase.default_hparams() - hparams.update({ - "rnn_cell_fw": layers.default_rnn_cell_hparams(), - "rnn_cell_bw": layers.default_rnn_cell_hparams(), - "rnn_cell_share_config": True, - "output_layer_fw": _default_output_layer_hparams(), - "output_layer_bw": _default_output_layer_hparams(), - "output_layer_share_config": True, - "name": "bidirectional_rnn_encoder" - }) - return hparams - - def _build(self, - inputs, - sequence_length=None, - initial_state_fw=None, - initial_state_bw=None, - time_major=False, - mode=None, - return_cell_output=False, - return_output_size=False, - **kwargs): - """Encodes the inputs. - - Args: - inputs: A 3D Tensor of shape `[batch_size, max_time, dim]`. - The first two dimensions - `batch_size` and `max_time` may be exchanged if - `time_major=True` is specified. - sequence_length (optional): A 1D int tensor of shape `[batch_size]`. - Sequence lengths - of the batch inputs. Used to copy-through state and zero-out - outputs when past a batch element's sequence length. - initial_state (optional): Initial state of the RNN. - time_major (bool): The shape format of the :attr:`inputs` and - :attr:`outputs` Tensors. If `True`, these tensors are of shape - `[max_time, batch_size, depth]`. If `False` (default), - these tensors are of shape `[batch_size, max_time, depth]`. - mode (optional): A tensor taking value in - :tf_main:`tf.estimator.ModeKeys `, including - `TRAIN`, `EVAL`, and `PREDICT`. Controls output layer dropout - if the output layer is specified with :attr:`hparams`. - If `None` (default), :func:`texar.tf.global_mode()` - is used. - return_cell_output (bool): Whether to return the output of the RNN - cell. This is the results prior to the output layer. - **kwargs: Optional keyword arguments of - :tf_main:`tf.nn.dynamic_rnn `, - such as `swap_memory`, `dtype`, `parallel_iterations`, etc. - - Returns: - - By default (both `return_cell_output` and `return_output_size` \ - are False), returns a pair :attr:`(outputs, final_state)` - - - :attr:`outputs`: A tuple `(outputs_fw, outputs_bw)` \ - containing \ - the forward and the backward RNN outputs, each of which is of \ - shape `[batch_size, max_time, output_dim]` if \ - `time_major` is False, or \ - `[max_time, batch_size, output_dim]` if \ - `time_major` is True. 
\ - If RNN cell output is a (nested) tuple of Tensors, then \ - `outputs_fw` and `outputs_bw` will be a (nested) tuple having \ - the same structure as the cell output. - - - :attr:`final_state`: A tuple \ - `(final_state_fw, final_state_bw)` \ - containing the final states of the forward and backward \ - RNNs, each of which is a \ - Tensor of shape `[batch_size] + cell.state_size`, or \ - a (nested) tuple of Tensors if `cell.state_size` is a (nested)\ - tuple. - - - If `return_cell_output` is True, returns a triple \ - :attr:`(outputs, final_state, cell_outputs)` where - - - :attr:`cell_outputs`: A tuple \ - `(cell_outputs_fw, cell_outputs_bw)` containting the outputs \ - by the forward and backward RNN cells prior to the \ - output layers, having the same structure with :attr:`outputs` \ - except for the `output_dim`. - - - If `return_output_size` is True, returns a tuple \ - :attr:`(outputs, final_state, output_size)` where - - - :attr:`output_size`: A tupple \ - `(output_size_fw, output_size_bw)` containing the size of \ - `outputs_fw` and `outputs_bw`, respectively. \ - Take `*_fw` for example, \ - `output_size_fw` is a (possibly nested tuple of) int. \ - If a single int or an int array, then `outputs_fw` has shape \ - `[batch/time, time/batch] + output_size_fw`. If \ - a (nested) tuple, then `output_size_fw` has the same \ - structure as with `outputs_fw`. The same applies to \ - `output_size_bw`. - - - If both `return_cell_output` and \ - `return_output_size` are True, returns \ - :attr:`(outputs, final_state, cell_outputs, output_size)`. - """ - no_initial_state = initial_state_fw is None and initial_state_bw is None - if ('dtype' not in kwargs) and no_initial_state: - cell_outputs, states = tf.nn.bidirectional_dynamic_rnn( - cell_fw=self._cell_fw, - cell_bw=self._cell_bw, - inputs=inputs, - sequence_length=sequence_length, - initial_state_fw=initial_state_fw, - initial_state_bw=initial_state_bw, - time_major=time_major, - dtype=tf.float32, - **kwargs) - else: - cell_outputs, states = tf.nn.bidirectional_dynamic_rnn( - cell_fw=self._cell_fw, - cell_bw=self._cell_bw, - inputs=inputs, - sequence_length=sequence_length, - initial_state_fw=initial_state_fw, - initial_state_bw=initial_state_bw, - time_major=time_major, - **kwargs) - - outputs_fw, output_size_fw = _apply_rnn_encoder_output_layer( - self._output_layer_fw, time_major, self._output_layer_hparams_fw, - mode, cell_outputs[0], self._cell_fw.output_size) - - outputs_bw, output_size_bw = _apply_rnn_encoder_output_layer( - self._output_layer_bw, time_major, self._output_layer_hparams_bw, - mode, cell_outputs[1], self._cell_bw.output_size) - - outputs = (outputs_fw, outputs_bw) - output_size = (output_size_fw, output_size_bw) - - if not self._built: - self._add_internal_trainable_variables() - # Add trainable variables of cells and output layers - # which may be constructed externally. 
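The return structure documented above is per-direction throughout. As a hedged sketch of the common downstream pattern under the removed TF1-era API (the concatenation step is illustrative, not part of the module):

```python
import tensorflow as tf
import texar.tf as tx

# Sketch only: uses the TF1-era API of the encoder removed above.
encoder = tx.modules.BidirectionalRNNEncoder()
inputs = tf.random_uniform([16, 8, 100], maxval=1., dtype=tf.float32)

# Per the docstring above, both return values are per-direction pairs.
(outputs_fw, outputs_bw), (state_fw, state_bw) = encoder(inputs)

# A common follow-up: concatenate the two directions along the feature
# axis to get a single vector per time step.
outputs = tf.concat([outputs_fw, outputs_bw], axis=-1)
# outputs: [batch_size, max_time, 2 * cell_dim]
```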
- self._add_trainable_variable( - layers.get_rnn_cell_trainable_variables(self._cell_fw)) - self._add_trainable_variable( - layers.get_rnn_cell_trainable_variables(self._cell_bw)) - if self._output_layer_fw and \ - not isinstance(self._output_layer_fw, (list, tuple)): - self._add_trainable_variable( - self._output_layer_fw.trainable_variables) - if self._output_layer_bw and \ - not isinstance(self._output_layer_bw, (list, tuple)): - self._add_trainable_variable( - self._output_layer_bw.trainable_variables) - self._built = True - - returns = (outputs, states) - if return_cell_output: - returns += (cell_outputs, ) - if return_output_size: - returns += (output_size, ) - return returns - - @property - def cell_fw(self): - """The forward RNN cell. - """ - return self._cell_fw - - @property - def cell_bw(self): - """The backward RNN cell. - """ - return self._cell_bw - - @property - def state_size_fw(self): - """The state size of the forward encoder cell. - - Same as :attr:`encoder.cell_fw.state_size`. - """ - return self.cell_fw.state_size - - @property - def state_size_bw(self): - """The state size of the backward encoder cell. - - Same as :attr:`encoder.cell_bw.state_size`. - """ - return self.cell_bw.state_size - - @property - def output_layer_fw(self): - """The output layer of the forward RNN. - """ - return self._output_layer_fw - - @property - def output_layer_bw(self): - """The output layer of the backward RNN. - """ - return self._output_layer_bw diff --git a/texar/tf/modules/encoders/rnn_encoders_test.py b/texar/tf/modules/encoders/rnn_encoders_test.py deleted file mode 100644 index 567e9c24..00000000 --- a/texar/tf/modules/encoders/rnn_encoders_test.py +++ /dev/null @@ -1,267 +0,0 @@ -# -""" -Unit tests for RNN encoders. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import tensorflow as tf - -from texar.tf.modules.encoders.rnn_encoders import UnidirectionalRNNEncoder -from texar.tf.modules.encoders.rnn_encoders import BidirectionalRNNEncoder -from texar.tf.modules.embedders.embedders import WordEmbedder - -# pylint: disable=too-many-locals - - -class UnidirectionalRNNEncoderTest(tf.test.TestCase): - """Tests :class:`~texar.tf.modules.UnidirectionalRNNEncoder` class. - """ - - def test_trainable_variables(self): - """Tests the functionality of automatically collecting trainable - variables. - """ - inputs = tf.placeholder(dtype=tf.float32, shape=[None, None, 100]) - - # case 1 - encoder = UnidirectionalRNNEncoder() - _, _ = encoder(inputs) - self.assertEqual(len(encoder.trainable_variables), 2) - - # case 2 - hparams = { - "rnn_cell": { - "dropout": { - "input_keep_prob": 0.5 - } - } - } - encoder = UnidirectionalRNNEncoder(hparams=hparams) - _, _ = encoder(inputs) - self.assertEqual(len(encoder.trainable_variables), 2) - - # case 3 - hparams = { - "output_layer": { - "num_layers": 2, - "layer_size": [100, 6], - "activation": "relu", - "final_layer_activation": "identity", - "dropout_layer_ids": [0, 1, 2], - "variational_dropout": False - } - } - encoder = UnidirectionalRNNEncoder(hparams=hparams) - _, _ = encoder(inputs) - self.assertEqual(len(encoder.trainable_variables), 2 + 2 + 2) - _, _ = encoder(inputs) - self.assertEqual(len(encoder.trainable_variables), 2 + 2 + 2) - - def test_encode(self): - """Tests encoding. 
- """ - # case 1 - encoder = UnidirectionalRNNEncoder() - - max_time = 8 - batch_size = 16 - emb_dim = 100 - inputs = tf.random_uniform([batch_size, max_time, emb_dim], - maxval=1., dtype=tf.float32) - outputs, state = encoder(inputs) - - cell_dim = encoder.hparams.rnn_cell.kwargs.num_units - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_, state_ = sess.run([outputs, state]) - self.assertEqual(outputs_.shape, (batch_size, max_time, cell_dim)) - self.assertEqual(state_[0].shape, (batch_size, cell_dim)) - - # case 2: with output layers - hparams = { - "output_layer": { - "num_layers": 2, - "layer_size": [100, 6], - "dropout_layer_ids": [0, 1, 2], - "variational_dropout": True - } - } - encoder = UnidirectionalRNNEncoder(hparams=hparams) - - max_time = 8 - batch_size = 16 - emb_dim = 100 - inputs = tf.random_uniform([batch_size, max_time, emb_dim], - maxval=1., dtype=tf.float32) - outputs, state, cell_outputs, output_size = encoder( - inputs, return_cell_output=True, return_output_size=True) - - self.assertEqual(output_size[0], 6) - self.assertEqual(cell_outputs.shape[-1], encoder.cell.output_size) - - out_dim = encoder.hparams.output_layer.layer_size[-1] - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_ = sess.run(outputs) - self.assertEqual(outputs_.shape, (batch_size, max_time, out_dim)) - - def test_encode_with_embedder(self): - """Tests encoding companioned with :mod:`texar.tf.modules.embedders`. - """ - embedder = WordEmbedder(vocab_size=20, hparams={"dim": 100}) - inputs = tf.ones([64, 16], dtype=tf.int32) - - encoder = UnidirectionalRNNEncoder() - outputs, state = encoder(embedder(inputs)) - - cell_dim = encoder.hparams.rnn_cell.kwargs.num_units - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_, state_ = sess.run([outputs, state]) - self.assertEqual(outputs_.shape, (64, 16, cell_dim)) - self.assertEqual(state_[0].shape, (64, cell_dim)) - - -class BidirectionalRNNEncoderTest(tf.test.TestCase): - """Tests :class:`~texar.tf.modules.BidirectionalRNNEncoder` class. - """ - - def test_trainable_variables(self): - """Tests the functionality of automatically collecting trainable - variables. - """ - inputs = tf.placeholder(dtype=tf.float32, shape=[None, None, 100]) - - # case 1 - encoder = BidirectionalRNNEncoder() - _, _ = encoder(inputs) - self.assertEqual(len(encoder.trainable_variables), 4) - - # case 2 - hparams = { - "rnn_cell_fw": { - "dropout": { - "input_keep_prob": 0.5 - } - } - } - encoder = BidirectionalRNNEncoder(hparams=hparams) - _, _ = encoder(inputs) - self.assertEqual(len(encoder.trainable_variables), 4) - - # case 3 - hparams = { - "output_layer_fw": { - "num_layers": 2, - "layer_size": [100, 6], - "activation": "relu", - "final_layer_activation": "identity", - "dropout_layer_ids": [0, 1, 2], - "variational_dropout": False - }, - "output_layer_bw": { - "num_layers": 3, - "other_dense_kwargs": {"use_bias": False} - }, - "output_layer_share_config": False - } - encoder = BidirectionalRNNEncoder(hparams=hparams) - _, _ = encoder(inputs) - self.assertEqual(len(encoder.trainable_variables), 4 + 4 + 3) - _, _ = encoder(inputs) - self.assertEqual(len(encoder.trainable_variables), 4 + 4 + 3) - - def test_encode(self): - """Tests encoding. 
- """ - # case 1 - encoder = BidirectionalRNNEncoder() - - max_time = 8 - batch_size = 16 - emb_dim = 100 - inputs = tf.random_uniform([batch_size, max_time, emb_dim], - maxval=1., dtype=tf.float32) - outputs, state = encoder(inputs) - - cell_dim = encoder.hparams.rnn_cell_fw.kwargs.num_units - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_, state_ = sess.run([outputs, state]) - self.assertEqual(outputs_[0].shape, - (batch_size, max_time, cell_dim)) - self.assertEqual(state_[0][0].shape, (batch_size, cell_dim)) - - # case 2: with output layers - hparams = { - "output_layer_fw": { - "num_layers": 2, - "layer_size": [100, 6], - "dropout_layer_ids": [0, 1, 2], - "variational_dropout": True - } - } - encoder = BidirectionalRNNEncoder(hparams=hparams) - - max_time = 8 - batch_size = 16 - emb_dim = 100 - inputs = tf.random_uniform([batch_size, max_time, emb_dim], - maxval=1., dtype=tf.float32) - outputs, state, cell_outputs, output_size = encoder( - inputs, return_cell_output=True, return_output_size=True) - - self.assertEqual(output_size[0][0], 6) - self.assertEqual(output_size[1][0], 6) - self.assertEqual(cell_outputs[0].shape[-1], encoder.cell_fw.output_size) - self.assertEqual(cell_outputs[1].shape[-1], encoder.cell_bw.output_size) - - out_dim = encoder.hparams.output_layer_fw.layer_size[-1] - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_ = sess.run(outputs) - self.assertEqual(outputs_[0].shape, (batch_size, max_time, out_dim)) - self.assertEqual(outputs_[1].shape, (batch_size, max_time, out_dim)) - -# TODO(zhiting): not completed yet -# class HierarchicalForwardRNNEncoderTest(tf.test.TestCase): -# """Tests HierarchicalForwardRNNEncoder class. -# """ -# -# def test_trainable_variables(self): -# """Tests the functionality of automatically collecting trainable -# variables. -# """ -# encoder = HierarchicalForwardRNNEncoder(vocab_size=2) -# inputs = [[[1, 0], [0, 1], [0, 1]]] -# _, _ = encoder(inputs) -# self.assertEqual(len(encoder.trainable_variables), 5) -# -# def test_encode(self): -# """Tests encoding. -# """ -# vocab_size = 4 -# encoder = HierarchicalForwardRNNEncoder(vocab_size=vocab_size) -# -# max_major_time = 8 -# max_minor_time = 6 -# batch_size = 16 -# inputs = tf.random_uniform([batch_size, max_major_time, max_minor_time], -# maxval=vocab_size, -# dtype=tf.int32) -# outputs, state = encoder(inputs) -# -# cell_dim = encoder.hparams.rnn_cell.kwargs.num_units -# with self.test_session() as sess: -# sess.run(tf.global_variables_initializer()) -# outputs_, state_ = sess.run([outputs, state]) -# self.assertEqual(outputs_.shape, (batch_size, max_major_time, cell_dim)) -# self.assertEqual(state_[0].shape, (batch_size, cell_dim)) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/modules/encoders/transformer_encoders.py b/texar/tf/modules/encoders/transformer_encoder.py similarity index 55% rename from texar/tf/modules/encoders/transformer_encoders.py rename to texar/tf/modules/encoders/transformer_encoder.py index 647579af..fa1ba528 100644 --- a/texar/tf/modules/encoders/transformer_encoders.py +++ b/texar/tf/modules/encoders/transformer_encoder.py @@ -12,25 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Transformer encoders with multihead self attention. +Transformer encoders with multi-head self attention. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - import tensorflow as tf -from texar.tf.core import layers from texar.tf.utils import transformer_attentions as attn, transformer_utils from texar.tf.modules.encoders.encoder_base import EncoderBase -from texar.tf.modules.encoders.multihead_attention import MultiheadAttentionEncoder +from texar.tf.modules.encoders.multihead_attention import \ + MultiheadAttentionEncoder from texar.tf.modules.networks.networks import FeedForwardNetwork -from texar.tf.utils.shapes import shape_list from texar.tf.utils.mode import is_train_mode -# pylint: disable=too-many-locals, invalid-name -# pylint: disable=arguments-differ, too-many-branches, too-many-statements __all__ = [ "default_transformer_poswise_net_hparams", @@ -39,7 +32,7 @@ def default_transformer_poswise_net_hparams(output_dim=512): - """Returns default hyperparameters of a + r"""Returns default hyperparameters of a :class:`~texar.tf.modules.FeedForwardNetwork` as a pos-wise network used in :class:`~texar.tf.modules.TransformerEncoder` and :class:`~texar.tf.modules.TransformerDecoder`. @@ -85,7 +78,6 @@ def default_transformer_poswise_net_hparams(output_dim=512): { "type": "Dense", "kwargs": { - "name": "conv1", "units": output_dim * 4, "activation": "relu", "use_bias": True, @@ -100,7 +92,6 @@ def default_transformer_poswise_net_hparams(output_dim=512): { "type": "Dense", "kwargs": { - "name": "conv2", "units": output_dim, "use_bias": True, } @@ -111,7 +102,7 @@ def default_transformer_poswise_net_hparams(output_dim=512): class TransformerEncoder(EncoderBase): - """Transformer encoder that applies multi-head self attention for encoding + r"""Transformer encoder that applies multi-head self attention for encoding sequences. This module basically stacks @@ -128,49 +119,62 @@ class TransformerEncoder(EncoderBase): Args: hparams (dict or HParams, optional): Hyperparameters. Missing hyperparameter will be set to default values. See - :meth:`default_hparams` for the hyperparameter sturcture and + :meth:`default_hparams` for the hyperparameter structure and default values. - - .. document private functions - .. 
automethod:: _build """ def __init__(self, hparams=None): - EncoderBase.__init__(self, hparams) - - with tf.variable_scope(self.variable_scope): - if self._hparams.initializer: - tf.get_variable_scope().set_initializer( - layers.get_initializer(self._hparams.initializer)) - - self.multihead_attention_list = [] - self.poswise_networks = [] - for i in range(self._hparams.num_blocks): - with tf.variable_scope("layer_{}".format(i)): - - with tf.variable_scope('attention'): - mh_attn = MultiheadAttentionEncoder( - self._hparams.multihead_attention) - self.multihead_attention_list.append(mh_attn) - - if self._hparams.dim != mh_attn.hparams.output_dim: - raise ValueError( - 'The "dim" in the hparams of ' - '"multihead_attention" should be equal to the ' - '"dim" of TransformerEncoder') - - pw_net = FeedForwardNetwork( - hparams=self._hparams['poswise_feedforward']) - final_dim = pw_net.hparams.layers[-1]['kwargs']['units'] - if self._hparams.dim != final_dim: - raise ValueError( - 'The output dimenstion of ' - '"poswise_feedforward" should be equal ' - 'to the "dim" of TransformerEncoder.') - self.poswise_networks.append(pw_net) + super().__init__(hparams=hparams) + + self.multihead_attention_list = [] + self.poswise_networks = [] + + if not self._hparams.use_bert_config: + self.self_attn_layer_norm = [] + else: + self.output_layer_norm = [] + self.poswise_layer_norm = [] + + for i in range(self._hparams.num_blocks): + with tf.name_scope("layer_{}".format(i)): + + with tf.name_scope("attention"): + mh_attn = MultiheadAttentionEncoder( + self._hparams.multihead_attention) + self.multihead_attention_list.append(mh_attn) + + if not self._hparams.use_bert_config: + self.self_attn_layer_norm.append( + tf.keras.layers.LayerNormalization()) + + if self._hparams.dim != mh_attn.hparams.output_dim: + raise ValueError( + 'The "dim" in the hparams of ' + '"multihead_attention" should be equal to the ' + '"dim" of TransformerEncoder') + + pw_net = FeedForwardNetwork( + hparams=self._hparams['poswise_feedforward']) + final_dim = pw_net.hparams.layers[-1]['kwargs']['units'] + if self._hparams.dim != final_dim: + raise ValueError( + 'The output dimension of ' + '"poswise_feedforward" should be equal ' + 'to the "dim" of TransformerEncoder.') + self.poswise_networks.append(pw_net) + self.poswise_layer_norm.append( + tf.keras.layers.LayerNormalization()) + if self._hparams.use_bert_config: + self.output_layer_norm.append( + tf.keras.layers.LayerNormalization()) + + if self._hparams.use_bert_config: + self.input_normalizer = tf.keras.layers.LayerNormalization() + else: + self.final_layer_norm = tf.keras.layers.LayerNormalization() @staticmethod def default_hparams(): - """Returns a dictionary of hyperparameters with default values. + r"""Returns a dictionary of hyperparameters with default values. .. code-block:: python @@ -196,13 +200,13 @@ def default_hparams(): Here: - "num_blocks": int + `"num_blocks"`: int Number of stacked blocks. - "dim": int + `"dim"`: int Hidden dimension of the encoders. - "use_bert_config": bool + `"use_bert_config"`: bool If `False`, apply the standard Transformer Encoder architecture from the original paper `(Vaswani et al.) "Attention is All You Need"`. If `True`, apply the Transformer Encoder architecture used in BERT @@ -210,44 +214,46 @@ def default_hparams(): The differences lie in: - 1. The standard arch restricts the word embedding of PAD token \ + 1. The standard arch restricts the word embedding of the PAD token to all zeros. The BERT arch does not. - 2.
The attention bias for padding tokens: \ - The standard arch uses `-1e8` for nagative attention mask. \ - BERT uses `-1e4` instead. + 2. The attention bias for padding tokens: + The standard arch uses ``-1e8`` as the negative attention mask. + BERT uses ``-1e4`` instead. - 3. The residual connections between internal tensors: \ - In BERT, a residual layer connects the tensors *after* \ - layer normalization. In the standard arch, the tensors are \ + 3. The residual connections between internal tensors: + In BERT, a residual layer connects the tensors *after* + layer normalization. In the standard arch, the tensors are connected *before* layer normalization. - "embedding_dropout": float + `"embedding_dropout"`: float Dropout rate of the input embedding. - "residual_dropout": float + `"residual_dropout"`: float Dropout rate of the residual connections. - "poswise_feedforward": dict + `"poswise_feedforward"`: dict Hyperparameters for a feed-forward network used in residual connections. - Make sure the dimension of the output tensor is equal to `dim`. + Make sure the dimension of the output tensor is equal to ``"dim"``. - See :func:`~texar.tf.modules.default_transformer_poswise_net_hparams` + See + :func:`~texar.tf.modules.default_transformer_poswise_net_hparams` for details. - "multihead_attention": dict - Hyperparameters for the multihead attention strategy. - Make sure the "output_dim" in this module is equal to "dim". - See :func:`~texar.tf.modules.MultiheadAttentionEncoder.default_harams` + `"multihead_attention"`: dict + Hyperparameters for the multi-head attention strategy. + Make sure the ``"output_dim"`` in this module is equal to ``"dim"``. + See + :func:`~texar.tf.modules.MultiheadAttentionEncoder.default_hparams` + for details. - "initializer": dict, optional + `"initializer"`: dict, optional Hyperparameters of the default initializer that initializes variables created in this module. See :func:`~texar.tf.core.get_initializer` for details. - "name": str + `"name"`: str Name of the module. """ return { @@ -269,28 +275,32 @@ def default_hparams(): 'name': 'transformer_encoder', } - def _build(self, inputs, sequence_length, mode=None): - """Encodes the inputs. + def __call__(self, inputs, sequence_length, mode=None): + r"""Encodes the inputs. Args: - inputs: A 3D Tensor of shape `[batch_size, max_time, dim]`, + inputs: A 3D Tensor of shape ``[batch_size, max_time, dim]``, containing the embedding of input sequences. Note that the embedding dimension `dim` must equal "dim" in :attr:`hparams`. The input embedding is typically an aggregation of word embedding and position embedding. - sequence_length: A 1D Tensor of shape `[batch_size]`. Input tokens + sequence_length: A 1D Tensor of shape ``[batch_size]``. Input tokens beyond respective sequence lengths are masked out automatically. mode (optional): A tensor taking value in - :tf_main:`tf.estimator.ModeKeys `, - including `TRAIN`, `EVAL`, and `PREDICT`. Used to toggle - dropout. - If `None` (default), :func:`texar.tf.global_mode` is used. + :tf_main:`tf.estimator.ModeKeys `, including + `TRAIN`, `EVAL`, and `PREDICT`. Returns: - A Tensor of shape `[batch_size, max_time, dim]` containing the + A Tensor of shape ``[batch_size, max_time, dim]`` containing the encoded vectors. """ + return super().__call__([inputs, sequence_length], mode) + + def call(self, inputs, mode): + r"""Encodes the inputs.
+ """ + inputs, sequence_length = inputs # Multiply input embedding with the sqrt of its dimension for # normalization @@ -305,16 +315,18 @@ def _build(self, inputs, sequence_length, mode=None): encoder_self_attention_bias = ignore_padding input_embedding = inputs + is_training = is_train_mode(mode) if self._hparams.use_bert_config: - x = layers.layer_normalize(input_embedding) - x = tf.layers.dropout(x, - rate=self._hparams.embedding_dropout, - training=is_train_mode(mode)) + x = self.input_normalizer(input_embedding) + if is_training: + x = tf.nn.dropout(x, rate=self._hparams.embedding_dropout) else: - x = tf.layers.dropout(input_embedding, - rate=self._hparams.embedding_dropout, - training=is_train_mode(mode)) + if is_training: + x = tf.nn.dropout(input_embedding, + rate=self._hparams.embedding_dropout) + else: + x = input_embedding # Just to keep consistent with BERT, actually makes no difference if self._hparams.use_bert_config: @@ -323,63 +335,62 @@ def _build(self, inputs, sequence_length, mode=None): pad_remover = transformer_utils.PadRemover(inputs_padding) for i in range(self._hparams.num_blocks): - with tf.variable_scope("layer_{}".format(i)): - multihead_attention = self.multihead_attention_list[i] - - # trivial difference between BERT and original Transformer - if self._hparams.use_bert_config: - _queries_input = x - else: - _queries_input = layers.layer_normalize(x) - - attention_output = multihead_attention( - queries=_queries_input, - memory=_queries_input, - memory_attention_bias=encoder_self_attention_bias, - mode=mode, - ) - attention_output = tf.layers.dropout( + multihead_attention = self.multihead_attention_list[i] + + # trivial difference between BERT and original Transformer + if self._hparams.use_bert_config: + _queries_input = x + else: + _queries_input = self.self_attn_layer_norm[i](x) + + attention_output = multihead_attention( + queries=_queries_input, + memory=_queries_input, + memory_attention_bias=encoder_self_attention_bias, + mode=mode, + ) + + if is_training: + attention_output = tf.nn.dropout( attention_output, - rate=self._hparams.residual_dropout, - training=is_train_mode(mode), - ) - x = x + attention_output - with tf.variable_scope('output'): - if self._hparams.use_bert_config: - x = layers.layer_normalize(x) - y = x - else: - y = layers.layer_normalize(x) - - poswise_network = self.poswise_networks[i] - with tf.variable_scope(poswise_network.variable_scope): - original_shape = shape_list(y) - y = tf.reshape(y, [-1, self._hparams.dim]) - if pad_remover: - y = tf.expand_dims(pad_remover.remove(y), axis=0) - # [1, batch_size*seq_length, hidden_dim] - layer_output = poswise_network(y, mode=mode) - sub_output = tf.layers.dropout( - layer_output, - rate=self._hparams.residual_dropout, - training=is_train_mode(mode) - ) - if pad_remover: - sub_output = tf.reshape( - pad_remover.restore(tf.squeeze(sub_output, axis=0)), - original_shape) - else: - sub_output = tf.reshape(sub_output, original_shape) - - x = x + sub_output - if self._hparams.use_bert_config: - x = layers.layer_normalize(x) + rate=self._hparams.residual_dropout) + + x = x + attention_output + + poswise_normalizer = self.poswise_layer_norm[i] + + if self._hparams.use_bert_config: + x = poswise_normalizer(x) + y = x + else: + y = poswise_normalizer(x) + + poswise_network = self.poswise_networks[i] + + original_shape = y.shape.as_list() + y = tf.reshape(y, [-1, self._hparams.dim]) + if pad_remover: + y = tf.expand_dims(pad_remover.remove(y), axis=0) + # [1, batch_size*seq_length, hidden_dim] + 
layer_output = poswise_network(y, mode=mode) + if is_training: + sub_output = tf.nn.dropout( + layer_output, + rate=self._hparams.residual_dropout) + else: + sub_output = layer_output + if pad_remover: + sub_output = tf.reshape( + pad_remover.restore(tf.squeeze(sub_output, axis=0)), + original_shape) + else: + sub_output = tf.reshape(sub_output, original_shape) + + x = x + sub_output + if self._hparams.use_bert_config: + x = self.output_layer_norm[i](x) if not self._hparams.use_bert_config: - x = layers.layer_normalize(x) - - if not self._built: - self._add_internal_trainable_variables() - self._built = True + x = self.final_layer_norm(x) return x diff --git a/texar/tf/modules/encoders/transformer_encoder_test.py b/texar/tf/modules/encoders/transformer_encoder_test.py new file mode 100644 index 00000000..35705da0 --- /dev/null +++ b/texar/tf/modules/encoders/transformer_encoder_test.py @@ -0,0 +1,73 @@ +""" +Unit tests for Transformer encoder. +""" + +import tensorflow as tf + +from texar.tf.modules.encoders.transformer_encoder import TransformerEncoder + + +class TransformerEncoderTest(tf.test.TestCase): + + def setUp(self): + self._batch_size = 2 + self._emb_dim = 512 + self._max_time = 7 + + def test_trainable_variables(self): + inputs = tf.random.uniform( + [self._batch_size, self._max_time, self._emb_dim], + dtype=tf.float32) + + sequence_length = tf.random.uniform([self._batch_size], + maxval=self._max_time, + dtype=tf.int32) + + encoder = TransformerEncoder() + + _ = encoder(inputs=inputs, sequence_length=sequence_length) + + # 6 blocks + # -self multihead_attention: 4 dense without bias + 2 layer norm vars + # -poswise_network: Dense with bias, Dense with bias + 2 layer norm vars + # 2 output layer norm vars + + self.assertEqual(len(encoder.trainable_variables), 74) + + hparams = {"use_bert_config": True} + encoder = TransformerEncoder(hparams=hparams) + + # 6 blocks + # -self multihead_attention: 4 dense without bias + 2 layer norm vars + # -poswise_network: Dense with bias, Dense with bias + 2 layer norm vars + # -output: 2 layer norm vars + # 2 input layer norm vars + _ = encoder(inputs=inputs, sequence_length=sequence_length) + + self.assertEqual(len(encoder.trainable_variables), 74) + + def test_encode(self): + inputs = tf.random.uniform( + [self._batch_size, self._max_time, self._emb_dim], + dtype=tf.float32) + + sequence_length = tf.random.uniform([self._batch_size], + maxval=self._max_time, + dtype=tf.int32) + + encoder = TransformerEncoder() + outputs = encoder(inputs=inputs, sequence_length=sequence_length) + self.assertEqual(outputs.shape.as_list(), [self._batch_size, + self._max_time, + self._emb_dim]) + + hparams = {"use_bert_config": True} + encoder = TransformerEncoder(hparams=hparams) + outputs = encoder(inputs=inputs, sequence_length=sequence_length) + self.assertEqual(outputs.shape.as_list(), [self._batch_size, + self._max_time, + self._emb_dim]) + + +if __name__ == "__main__": + tf.test.main() diff --git a/texar/tf/modules/encoders/xlnet_encoder.py b/texar/tf/modules/encoders/xlnet_encoder.py deleted file mode 100644 index de57b7e7..00000000 --- a/texar/tf/modules/encoders/xlnet_encoder.py +++ /dev/null @@ -1,623 +0,0 @@ -# Copyright 2019 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
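The rewritten `call` above squeezes padded positions out of the batch before running the position-wise network and re-inserts them afterwards. A rough sketch of that remove/restore round trip, assuming `PadRemover` keeps the `remove`/`restore` interface used above (toy shapes; the real code derives the padding mask from `sequence_length`):

```python
import tensorflow as tf
from texar.tf.utils import transformer_utils

batch, max_time, dim = 2, 4, 8          # toy sizes (assumed)
x = tf.random.uniform([batch, max_time, dim])
lengths = tf.constant([4, 2])           # second sequence has 2 pad slots

# 1.0 at padded positions, 0.0 elsewhere -- the mask PadRemover expects.
padding = 1.0 - tf.sequence_mask(lengths, max_time, dtype=tf.float32)
pad_remover = transformer_utils.PadRemover(padding)

flat = tf.reshape(x, [-1, dim])         # [batch * max_time, dim]
packed = pad_remover.remove(flat)       # pad rows dropped: [num_nonpad, dim]
# ... the position-wise feed-forward network runs on `packed` here ...
restored = pad_remover.restore(packed)  # zero rows re-inserted at pad slots
y = tf.reshape(restored, [batch, max_time, dim])
```

The payoff is that the feed-forward dense layers only process real tokens, which matters for batches with long padding tails.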
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -XLNet encoders. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from texar.tf.utils.mode import is_train_mode - -from texar.tf.core.layers import get_initializer, get_layer -from texar.tf.modules.embedders.embedders import WordEmbedder -from texar.tf.modules.encoders.encoder_base import EncoderBase -from texar.tf.modules.pretrained.xlnet import PretrainedXLNetMixin -from texar.tf.modules.pretrained.xlnet_utils import \ - (PositionWiseFF, RelativePositionalEncoding, RelativeMutiheadAttention) -from texar.tf.utils.utils import dict_fetch - -__all__ = [ - "XLNetEncoder" -] - - -class XLNetEncoder(EncoderBase, PretrainedXLNetMixin): - r"""Raw XLNet module for encoding sequences. Please see - :class:`~texar.tf.modules.PretrainedXLNetMixin` for a brief description - of XLNet. - - Args: - pretrained_model_name (optional): a `str`, the name - of pre-trained model (e.g., ``xlnet-based-cased``). Please refer to - :class:`~texar.tf.modules.PretrainedXLNetMixin` for - all supported models. - If `None`, the model name in :attr:`hparams` is used. - cache_dir (optional): the path to a folder in which the - pre-trained models will be cached. If `None` (default), - a default directory (``texar_data`` folder under user's home - directory) will be used. - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparameter will be set to default values. See - :meth:`default_hparams` for the hyperparameter structure - and default values. - - .. document private functions - .. 
automethod:: _build - """ - - def __init__(self, - pretrained_model_name=None, - cache_dir=None, - hparams=None): - super(XLNetEncoder, self).__init__(hparams=hparams) - - self.load_pretrained_config(pretrained_model_name, cache_dir) - - num_layers = self._hparams.num_layers - use_segments = self._hparams.use_segments - untie_r = self._hparams.untie_r - - with tf.variable_scope(self.variable_scope): - - if untie_r: - self.r_w_bias = tf.get_variable('r_w_bias', - [num_layers, - self._hparams.num_heads, - self._hparams.head_dim], - dtype=tf.float32) - self.r_r_bias = tf.get_variable('r_r_bias', - [num_layers, - self._hparams.num_heads, - self._hparams.head_dim], - dtype=tf.float32) - else: - self.r_w_bias = tf.get_variable('r_w_bias', - [self._hparams.num_heads, - self._hparams.head_dim], - dtype=tf.float32) - self.r_r_bias = tf.get_variable('r_r_bias', - [self._hparams.num_heads, - self._hparams.head_dim], - dtype=tf.float32) - - if use_segments: - self.segment_embed = tf.get_variable('seg_embed', - [num_layers, 2, - self._hparams.num_heads, - self._hparams.head_dim], - dtype=tf.float32) - self.r_s_bias = (tf.get_variable('r_s_bias', - [num_layers, - self._hparams.num_heads, - self._hparams.head_dim], - dtype=tf.float32) if untie_r - else tf.get_variable('r_s_bias', - [self._hparams.num_heads, - self._hparams.head_dim], - dtype=tf.float32)) - else: - self.segment_embed = None - self.r_s_bias = None - - # Word embedding - self.word_embedder = WordEmbedder( - vocab_size=self._hparams.vocab_size, - hparams={"dim": self._hparams.hidden_dim}) - - # Position embedding - self.pos_embed = RelativePositionalEncoding(hparams={ - "dim": self._hparams.hidden_dim, - "max_seq_len": self._hparams.max_seq_len - }) - - self.attn_layers = [] - self.ff_layers = [] - rel_attn_hparams = dict_fetch( - self._hparams, RelativeMutiheadAttention.default_hparams()) - rel_attn_hparams["name"] = "rel_attn" - - ff_hparams = dict_fetch( - self._hparams, PositionWiseFF.default_hparams()) - ff_hparams["name"] = "ff" - - for i in range(num_layers): - with tf.variable_scope("layer_{}".format(i)): - if self._hparams.untie_r: - if use_segments: - self.attn_layers.append(RelativeMutiheadAttention( - self.r_r_bias[i], self.r_w_bias[i], - self.r_s_bias[i], - self.segment_embed[i], - hparams=rel_attn_hparams)) - else: - self.attn_layers.append(RelativeMutiheadAttention( - self.r_r_bias[i], self.r_w_bias[i], - hparams=rel_attn_hparams)) - else: - if use_segments: - self.attn_layers.append(RelativeMutiheadAttention( - self.r_r_bias, self.r_w_bias, - self.r_s_bias, - self.segment_embed[i], - hparams=rel_attn_hparams)) - else: - self.attn_layers.append(RelativeMutiheadAttention( - self.r_r_bias, self.r_w_bias, - hparams=rel_attn_hparams)) - self.ff_layers.append(PositionWiseFF(hparams=ff_hparams)) - - dropout_hparams = { - "type": "Dropout", - "kwargs": { - "rate": self._hparams.dropout - } - } - self.dropout = get_layer(hparams=dropout_hparams) - - self.mask_embed = tf.get_variable( - 'mask_emb', [1, 1, self.hparams.hidden_dim], dtype=tf.float32) - - def reset_parameters(self): - with tf.variable_scope(self.variable_scope): - if self._hparams.initializer: - tf.get_variable_scope().set_initializer( - get_initializer(self._hparams.initializer)) - - @staticmethod - def default_hparams(): - r"""Returns a dictionary of hyperparameters with default values. - - * The encoder arch is determined by the constructor argument - :attr:`pretrained_model_name` if it's specified. In this case, - hparams are ignored. 
-        * Otherwise, the encoder architecture is determined by
-          `hparams['pretrained_model_name']` if it's specified. All other
-          configs in hparams are ignored.
-        * If the above two are `None`, the encoder architecture is defined
-          by the configs in hparams and weights are randomly initialized.
-
-        .. code-block:: python
-
-            {
-                "name": "xlnet_encoder",
-                "pretrained_model_name": "xlnet-base-cased",
-                "untie_r": True,
-                "num_layers": 12,
-                "mem_len": 0,
-                "reuse_len": 0,
-                "initializer": None,
-                "num_heads": 12,
-                "hidden_dim": 768,
-                "head_dim": 64,
-                "dropout": 0.1,
-                "attention_dropout": 0.1,
-                "use_segments": True,
-                "ffn_inner_dim": 3072,
-                "activation": 'gelu',
-                "vocab_size": 32000,
-                "max_seq_len": 512,
-            }
-
-        Here:
-
-        The default parameters are the values of the cased XLNet-Base model.
-
-        "pretrained_model_name": str or None
-            The name of the pre-trained XLNet model. If `None`, the model
-            will be randomly initialized.
-
-        "untie_r": bool
-            Whether the biases should be untied across all layers.
-
-        "num_layers": int
-            Number of layers in the network.
-
-        "mem_len": int
-            Length of the memory to be used during attention score
-            calculation.
-
-        "reuse_len": int
-            Length of the memory that can be re-used.
-
-        "initializer": dict, optional
-            Hyperparameters of the default initializer that initializes
-            variables created in this module.
-            See :func:`~texar.tf.core.get_initializer` for details.
-
-        "num_heads": int
-            Number of attention heads.
-
-        "hidden_dim": int
-            Hidden dimension of the embeddings.
-
-        "head_dim": int
-            Size of the vectors after head projection.
-
-        "dropout": float
-            Dropout rate for layers.
-
-        "attention_dropout": float
-            Dropout rate for attention layers.
-
-        "use_segments": bool
-            Whether the input has segments.
-
-        "ffn_inner_dim": int
-            Dimension of the hidden layer in the position-wise feed-forward
-            network.
-
-        "activation": str or callable
-            Activation function applied to the output of the position-wise
-            feed-forward network.
-            See :func:`~texar.tf.core.get_activation_fn` for more details.
-
-        "vocab_size": int
-            The vocabulary size of `inputs` in `XLNet`.
-
-        "max_seq_len": int
-            Maximum length of the sequence allowed in one segment.
-
-        "name": str
-            Name of the module.
-        """
-
-        return {
-            "name": "xlnet_encoder",
-            'pretrained_model_name': 'xlnet-base-cased',
-            "untie_r": True,
-            "num_layers": 12,
-            "mem_len": 0,
-            "reuse_len": 0,
-            # initializer
-            "initializer": None,
-            # layer
-            "num_heads": 12,
-            "hidden_dim": 768,
-            "head_dim": 64,
-            "dropout": 0.1,
-            "attention_dropout": 0.1,
-            "use_segments": True,
-            # ffn
-            "ffn_inner_dim": 3072,
-            "activation": 'gelu',
-            # embedding
-            "vocab_size": 32000,
-            "max_seq_len": 512,
-            '@no_typecheck': ['pretrained_model_name']
-        }
-
-    def param_groups(self, lr=None, lr_layer_scale=1.0,
-                     decay_base_params=False):
-        r"""Create parameter groups for optimizers. When
-        :attr:`lr_layer_scale` is not 1.0, parameters from each layer form
-        separate groups with different base learning rates.
-
-        This method should be called before applying gradients to the
-        variables through the optimizer. Particularly, after calling the
-        optimizer's `compute_gradients` method, the user can call this
-        method to get variable-specific learning rates for the network.
-        The gradient for each variable can then be scaled accordingly.
-        The scaled gradients are finally applied by calling the optimizer's
-        `apply_gradients` method.
-
-        Example:
-
-        .. code-block:: python
-
-            grads_and_vars = optimizer.compute_gradients(loss)
-
-            vars_to_grads = {var: grad for grad, var in grads_and_vars}
-
-            vars_to_learning_rates = xlnet_encoder.param_groups(
-                lr=1, lr_layer_scale=0.75)
-
-            for var in vars_to_grads:
-                vars_to_grads[var] *= vars_to_learning_rates[var]
-
-            train_op = optimizer.apply_gradients(
-                [(grad, var) for var, grad in vars_to_grads.items()])
-
-        Args:
-            lr (float): The learning rate. Can be omitted if
-                :attr:`lr_layer_scale` is 1.0.
-            lr_layer_scale (float): Per-layer LR scaling rate. The `i`-th
-                layer will be scaled by
-                `lr_layer_scale ^ (num_layers - i - 1)`.
-            decay_base_params (bool): If `True`, treat non-layer parameters
-                (e.g. embeddings) as if they're in layer 0. If `False`, these
-                parameters are not scaled.
-
-        Returns: A dict mapping TensorFlow variables to their learning rates.
-        """
-        vars_to_learning_rates = {}
-        if lr_layer_scale != 1.0:
-            if lr is None:
-                raise ValueError(
-                    "lr must be specified when lr_layer_scale is not 1.0")
-
-            num_layers = self._hparams.num_layers
-            scope = self.variable_scope.name
-            base_var_names = ['r_w_bias', 'r_r_bias', 'word_embedder']
-
-            if self._hparams.use_segments:
-                base_var_names.extend(['r_s_bias', 'seg_embed'])
-
-            for var in base_var_names:
-                tf_variable = tf.trainable_variables(scope=scope + "/" + var)[0]
-                vars_to_learning_rates[tf_variable] = \
-                    lr * (lr_layer_scale ** num_layers if decay_base_params
-                          else 1.0)
-
-            for idx in range(num_layers):
-                decay_rate = lr_layer_scale ** (num_layers - idx - 1)
-                layer_variables = tf.trainable_variables(
-                    scope=scope + "/" + "layer_{}".format(idx))
-                for variable in layer_variables:
-                    vars_to_learning_rates[variable] = lr * decay_rate
-        else:
-            for variable in self.trainable_variables:
-                vars_to_learning_rates[variable] = lr
-
-        return vars_to_learning_rates
-
-    @property
-    def output_size(self):
-        r"""The last dimension of the encoder output.
-
-        Note: :meth:`_build` returns two tensors of shapes
-        `[batch_size, max_time, hidden_dim]` and
-        `[batch_size, cache_len, hidden_dim]`. `output_size` here equals
-        `hidden_dim`.
-        """
-        return self._hparams.hidden_dim
-
-    @staticmethod
-    def _cache_mem(curr_out, prev_mem, mem_len, reuse_len=None):
-        r"""Cache hidden states into memory."""
-        assert mem_len > 0
-
-        if reuse_len is not None and reuse_len > 0:
-            curr_out = curr_out[:reuse_len]
-
-        if prev_mem is None:
-            new_mem = curr_out[-mem_len:]
-        else:
-            new_mem = tf.concat([prev_mem, curr_out], 0)[-mem_len:]
-
-        return tf.stop_gradient(new_mem)
-
-    def _create_mask(self, qlen, mlen, dtype=tf.float32, same_length=False):
-        r"""Create causal attention mask."""
-        attn_mask = tf.ones([qlen, qlen], dtype=dtype)
-        mask_u = tf.matrix_band_part(attn_mask, 0, -1)
-        mask_dia = tf.matrix_band_part(attn_mask, 0, 0)
-        attn_mask_pad = tf.zeros([qlen, mlen], dtype=dtype)
-        ret = tf.concat([attn_mask_pad, mask_u - mask_dia], axis=1)
-        if same_length:
-            mask_l = tf.matrix_band_part(attn_mask, -1, 0)
-            ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]],
-                            axis=1)
-
-        return ret
-
-    def _build(self, token_ids, segment_ids=None, input_mask=None,
-               memory=None, permute_mask=None, target_mapping=None,
-               bi_data=False, clamp_len=None, cache_len=0, same_length=False,
-               attn_type='bi', two_stream=False, mode=None):
-        r"""Compute XLNet representations for the input.
-
-        Args:
-            token_ids: Shape `[batch_size, max_time]`.
-            segment_ids: Shape `[batch_size, max_time]`.
-            input_mask: Float tensor of shape `[batch_size, max_time]`.
Note that - positions with value 1 are masked out. - memory: Memory from previous batches. A list of length `num_layers`, - each tensor of shape `[batch_size, mem_len, hidden_dim]`. - permute_mask: The permutation mask. Float tensor of shape - `[batch_size, max_time, max_time]`. - A value of 0 for ``permute_mask[i, j, k]`` indicates that - position `i` attends to position `j` in batch `k`. - target_mapping: The target token mapping. Float tensor of shape - `[batch_size, num_targets, max_time]`. - A value of 1 for ``target_mapping[i, j, k]`` indicates that - the `i`-th target token (in order of permutation) in batch `k` - is the token at position `j`. - Each row ``target_mapping[i, :, k]`` can have no more than one - value of 1. - bi_data (bool): Whether to use bidirectional data input pipeline. - clamp_len (int): Clamp all relative distances larger than - :attr:`clamp_len`. A value of -1 means no clamping. - cache_len (int): Length of memory (number of tokens) to cache. - same_length (bool): Whether to use the same attention length for - each token. - attn_type (str): Attention type. Supported values are `"uni"` - and `"bi"`. - two_stream (bool): Whether to use two-stream attention. Only set to - `True` when pre-training or generating text. Defaults to - `False`. - - Returns: A tuple of `(output, new_memory)`: - - - **output**: The final layer output representations. Shape - `[batch_size, max_time, hidden_dim]`. - - **new_memory**: The memory of the current batch. - If `cache_len` is 0, then `new_memory` is `None`. Otherwise, it is - a list of length `num_layers`, each tensor of shape - `[batch_size, cache_len, hidden_dim]`. - This can be used as the :attr:`memory` argument in the next batch. - """ - return self._execute(self.word_embedder(token_ids), - segment_ids=segment_ids, input_mask=input_mask, - memory=memory, permute_mask=permute_mask, - target_mapping=target_mapping, bi_data=bi_data, - clamp_len=clamp_len, cache_len=cache_len, - same_length=same_length, attn_type=attn_type, - two_stream=two_stream, mode=mode) - - def _execute(self, word_embed, segment_ids=None, # noqa: C901 - input_mask=None, memory=None, permute_mask=None, - target_mapping=None, bi_data=False, clamp_len=None, - cache_len=0, same_length=False, attn_type='bi', - two_stream=False, mode=None): - r"""Compute XLNet representations for the input. This layer exists - because :class:`XLNetDecoder` compute embeddings in the decoder helper. - `word_embed` has shape `[batch_size, max_time, word_embed_dim]`. - Please refer to :meth:`_build` for the detailed information of other - arguments. 
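-
-        Example (a minimal usage sketch; the shapes, the `cache_len` value,
-        and the `token_ids`/`next_token_ids` placeholders are illustrative
-        only):
-
-        .. code-block:: python
-
-            encoder = XLNetEncoder(pretrained_model_name="xlnet-base-cased")
-
-            # token_ids: int32 Tensor of shape [batch_size, max_time]
-            output, new_memory = encoder(token_ids, cache_len=128)
-            # output:     [batch_size, max_time, hidden_dim]
-            # new_memory: list of `num_layers` tensors, each of shape
-            #             [batch_size, 128, hidden_dim]
-
-            # Feed the cached memory back in when encoding the next segment.
-            output, new_memory = encoder(
-                next_token_ids, memory=new_memory, cache_len=128)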
- """ - # word_embed: [max_time, batch_size, word_embed_dim] - word_embed = tf.transpose(word_embed, perm=[1, 0, 2]) - # segment_ids: [max_time, batch_size] - if segment_ids is not None: - segment_ids = tf.transpose(segment_ids, perm=[1, 0]) - # input_mask: [max_time, batch_size] - if input_mask is not None: - input_mask = tf.transpose(input_mask, perm=[1, 0]) - # memory: A list of length num_layers - # each tensor of shape [mem_len, batch_size, hidden_dim] - if memory is not None: - memory = [tf.transpose(m, perm=[1, 0, 2]) for m in memory] - # permute_mask: [max_time, max_time, batch_size] - if permute_mask is not None: - permute_mask = tf.transpose(permute_mask, perm=[1, 2, 0]) - # target_mapping: [num_targets, max_time, batch_size] - if target_mapping is not None: - target_mapping = tf.transpose(target_mapping, perm=[1, 2, 0]) - - max_time = tf.shape(word_embed)[0] - batch_size = tf.shape(word_embed)[1] - mem_len = tf.shape(memory[0])[0] if memory is not None else 0 - tot_len = max_time + mem_len - reuse_len = self._hparams.reuse_len - is_training = is_train_mode(mode) - - # Attention mask - # causal attention mask - if attn_type == 'uni': - attn_mask = self._create_mask(max_time, mem_len, tf.float32, - same_length) - attn_mask = attn_mask[:, :, None, None] - elif attn_type == 'bi': - attn_mask = None - else: - raise ValueError('Unsupported attention type: {}'.format(attn_type)) - - # data mask: input mask & perm mask - if input_mask is not None and permute_mask is not None: - data_mask = input_mask[None] + permute_mask - elif input_mask is not None and permute_mask is None: - data_mask = input_mask[None] - elif input_mask is None and permute_mask is not None: - data_mask = permute_mask - else: - data_mask = None - - if data_mask is not None: - # all mems can be attended to - mems_mask = tf.zeros([tf.shape(data_mask)[0], mem_len, batch_size], - dtype=tf.float32) - data_mask = tf.concat([mems_mask, data_mask], 1) - if attn_mask is None: - attn_mask = data_mask[:, :, :, None] - else: - attn_mask += data_mask[:, :, :, None] - - if attn_mask is not None: - attn_mask = tf.cast(attn_mask > 0, dtype=tf.float32) - - if attn_mask is not None: - non_tgt_mask = -tf.eye(max_time, dtype=tf.float32) - non_tgt_mask = tf.concat([tf.zeros([max_time, mem_len], - dtype=tf.float32), - non_tgt_mask], axis=-1) - non_tgt_mask = tf.cast( - (attn_mask + non_tgt_mask[:, :, None, None]) > 0, - dtype=tf.float32) - else: - non_tgt_mask = None - - # Segment embedding - if segment_ids is not None: - mem_pad = tf.zeros([mem_len, batch_size], dtype=tf.int32) - cat_ids = tf.concat([mem_pad, segment_ids], 0) - segment_matrix = tf.cast( - tf.logical_not( - tf.equal(segment_ids[:, None], cat_ids[None, :])), - tf.int32) - segment_matrix = tf.one_hot(segment_matrix, 2, dtype=tf.float32) - else: - segment_matrix = None - - # Position embedding - pos_embed = self.pos_embed( - batch_size, max_time, tot_len, clamp_len, attn_type, bi_data) - pos_embed = self.dropout(pos_embed, - training=is_training) - - states_h = self.dropout(word_embed, - training=is_training) - - if two_stream: - if target_mapping is not None: - word_embed_q = tf.tile( - self.mask_embed, [tf.shape(target_mapping)[0], - batch_size, 1]) - else: - word_embed_q = word_embed - states_g = self.dropout(word_embed_q) - else: - states_g = None - - new_memory = [] - num_layers = self._hparams.num_layers - for i in range(num_layers): - cur_memory = memory[i] if memory is not None else None - if cache_len > 0: - new_memory.append( - self._cache_mem(states_h, cur_memory, 
cache_len, reuse_len)) - states_h, states_g = self.attn_layers[i]( - states_h=states_h, pos_embed=pos_embed, states_g=states_g, - segment_mat=segment_matrix, attn_mask_h=non_tgt_mask, - attn_mask_g=attn_mask, target_mapping=None, memory=cur_memory, - mode=mode) - ff_layer = self.ff_layers[i] - states_h = ff_layer(states_h, mode=mode) - - if states_g is not None: - states_g = ff_layer(states_g, mode=mode) - - output = self.dropout(states_h if states_g is None else states_g, - training=is_training) - - # Now output: [max_time, batch_size, hidden_dim] - # new_memory: None or A list of length num_layers, - # each tensor of shape [cache_len, batch_size, hidden_dim] - output = tf.transpose(output, perm=[1, 0, 2]) - if new_memory is not None: - new_memory = [tf.transpose(m, perm=[1, 0, 2]) for m in new_memory] - - if not self._built: - self._add_internal_trainable_variables() - self._built = True - - if self.pretrained_model_dir: - self.init_pretrained_weights(self.variable_scope.name) - - if cache_len == 0: - return output, None - - return output, new_memory diff --git a/texar/tf/modules/encoders/xlnet_encoder_test.py b/texar/tf/modules/encoders/xlnet_encoder_test.py deleted file mode 100644 index 6ce7a6a5..00000000 --- a/texar/tf/modules/encoders/xlnet_encoder_test.py +++ /dev/null @@ -1,167 +0,0 @@ -# -""" -Unit tests for XLNet encoders. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import tensorflow as tf - -from texar.tf.modules.encoders.xlnet_encoder import XLNetEncoder -from texar.tf.utils.test import pretrained_test - - -class XLNetEncoderTest(tf.test.TestCase): - """Tests :class:`~texar.tf.modules.XLNetEncoder` class. - """ - - @pretrained_test - def test_model_loading(self): - r"""Tests model loading functionality.""" - - inputs = tf.placeholder(dtype=tf.int32, shape=[None, None]) - - for pretrained_model_name in XLNetEncoder.available_checkpoints(): - encoder = XLNetEncoder(pretrained_model_name=pretrained_model_name) - _ = encoder(inputs) - - @pretrained_test - def test_hparams(self): - """Tests the priority of the encoder architecture parameter. - """ - - inputs = tf.placeholder(dtype=tf.int32, shape=[None, None]) - - # case 1: set "pretrained_mode_name" by constructor argument - encoder = XLNetEncoder(pretrained_model_name="xlnet-large-cased", - hparams={}) - encoder(inputs) - self.assertEqual(len(encoder.attn_layers), 24) - self.assertEqual(len(encoder.ff_layers), 24) - - # case 2: set "pretrained_mode_name" by hparams - hparams = { - "pretrained_model_name": "xlnet-base-cased" - } - encoder = XLNetEncoder(hparams=hparams) - encoder(inputs) - self.assertEqual(len(encoder.attn_layers), 12) - self.assertEqual(len(encoder.ff_layers), 12) - - # case 3: set to None in both hparams and constructor argument - # load no pre-trained model - hparams = { - "pretrained_model_name": None, - "num_layers": 16 - } - encoder = XLNetEncoder(hparams=hparams) - encoder(inputs) - self.assertEqual(len(encoder.attn_layers), 16) - self.assertEqual(len(encoder.ff_layers), 16) - - # case 4: using default hparams - encoder = XLNetEncoder() - encoder(inputs) - self.assertEqual(len(encoder.attn_layers), 12) - self.assertEqual(len(encoder.ff_layers), 12) - - @pretrained_test - def test_trainable_variables(self): - """Tests the functionality of automatically collecting trainable - variables. 
- """ - - inputs = tf.placeholder(dtype=tf.int32, shape=[None, None]) - - # case 1: XLNet with no pre-trained model - encoder = XLNetEncoder(hparams={ - "pretrained_model_name": None, - "untie_r": False - }) - encoder(inputs) - - n_word_embed_vars = 1 - n_mask_embed_vars = 1 - n_bias_vars = 3 # r_r_bias, r_w_bias, r_s_bias - n_pos_wise_ff_vars = 6 # 2 kernels + 2 bias + beta + gamma - n_rel_multi_head_vars = 7 # q,k,v,r,o + beta + gamma - n_segment_embed_vars = 1 - n_layers = encoder.hparams.num_layers - n_trainable_variables = \ - n_word_embed_vars + n_segment_embed_vars + n_mask_embed_vars + \ - n_layers * (n_rel_multi_head_vars + n_pos_wise_ff_vars) + \ - n_bias_vars - self.assertEqual(len(encoder.trainable_variables), - n_trainable_variables) - - # case 2: XLNet with pre-trained model - hparams = { - "pretrained_model_name": "xlnet-large-cased" - } - encoder = XLNetEncoder(hparams=hparams) - encoder(inputs) - n_segment_embed_vars = 1 - n_layers = encoder.hparams.num_layers - n_trainable_variables = \ - n_word_embed_vars + n_segment_embed_vars + n_mask_embed_vars + \ - n_layers * (n_rel_multi_head_vars + n_pos_wise_ff_vars) \ - + n_bias_vars - self.assertEqual(len(encoder.trainable_variables), - n_trainable_variables) - - def test_encode(self): - """Tests encoding. - """ - # case 1: XLNet pre-trained - hparams = { - "pretrained_model_name": None, - "untie_r": False - } - encoder = XLNetEncoder(hparams=hparams) - - max_time = 8 - batch_size = 128 - inputs = tf.random_uniform([batch_size, max_time], - maxval=30521, dtype=tf.int32) - outputs, _ = encoder(inputs) - - outputs_dim = encoder.hparams.hidden_dim - with self.session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_ = sess.run(outputs) - self.assertEqual(outputs_.shape, - (batch_size, max_time, outputs_dim)) - - # case 2: XLNet pre-trained, untie_r=True - hparams = { - "pretrained_model_name": None, - "untie_r": True - } - - encoder = XLNetEncoder(hparams=hparams) - outputs, _ = encoder(inputs) - with self.session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_ = sess.run(outputs) - self.assertEqual(outputs_.shape, - (batch_size, max_time, outputs_dim)) - - # case 3: XLNet with no pre-trained model - hparams = { - "pretrained_model_name": None - } - encoder = XLNetEncoder(hparams=hparams) - outputs_dim = encoder.hparams.hidden_dim - outputs, _ = encoder(inputs) - with self.session() as sess: - sess.run(tf.global_variables_initializer()) - outputs_ = sess.run(outputs) - self.assertEqual(outputs_.shape, - (batch_size, max_time, outputs_dim)) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/modules/memory/__init__.py b/texar/tf/modules/memory/__init__.py deleted file mode 100644 index 9c58cf0a..00000000 --- a/texar/tf/modules/memory/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Memory modules. 
-""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=wildcard-import - -from texar.tf.modules.memory.memory_network import * -from texar.tf.modules.memory.embed_fns import * diff --git a/texar/tf/modules/memory/embed_fns.py b/texar/tf/modules/memory/embed_fns.py deleted file mode 100644 index 4be60499..00000000 --- a/texar/tf/modules/memory/embed_fns.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Some embed_fn s used in :class:`~texar.tf.modules.memory.MemNetBase` and its -subclasses. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=invalid-name, too-many-arguments - -__all__ = [ - 'default_memnet_embed_fn_hparams', -] - - -def default_memnet_embed_fn_hparams(): - """Returns a dictionary of hyperparameters with default hparams for - :func:`~texar.tf.modules.memory.default_embed_fn` - - .. code-block:: python - - { - "embedding": { - "dim": 100 - }, - "temporal_embedding": { - "dim": 100 - }, - "combine_mode": "add" - } - - Here: - - "embedding": dict, optional - Hyperparameters for embedding operations. See - :meth:`~texar.tf.modules.WordEmbedder.default_hparams` of - :class:`~texar.tf.modules.WordEmbedder` for details. If `None`, the - default hyperparameters are used. - - "temporal_embedding": dict, optional - Hyperparameters for temporal embedding operations. See - :meth:`~texar.tf.modules.PositionEmbedder.default_hparams` of - :class:`~texar.tf.modules.PositionEmbedder` for details. If `None`, the - default hyperparameters are used. - - "combine_mode": str - Either **'add'** or **'concat'**. If 'add', memory - embedding and temporal embedding are added up. In this case the two - embedders must have the same dimension. If 'concat', the two - embeddings are concated. - """ - return { - "embedding": { - "dim": 100 - }, - "temporal_embedding": { - "dim": 100 - }, - "combine_mode": "add" - } diff --git a/texar/tf/modules/memory/memory_network.py b/texar/tf/modules/memory/memory_network.py deleted file mode 100644 index f8a53c9e..00000000 --- a/texar/tf/modules/memory/memory_network.py +++ /dev/null @@ -1,620 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -End-to-end memory network described in -(Sukhbaatar et al.) 
End-To-End Memory Networks -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from texar.tf.module_base import ModuleBase -from texar.tf.modules.embedders import WordEmbedder, PositionEmbedder -from texar.tf.utils.mode import switch_dropout -from texar.tf.modules.memory.embed_fns import default_memnet_embed_fn_hparams - -# pylint: disable=invalid-name, too-many-instance-attributes, too-many-arguments -# pylint: disable=too-many-locals - -__all__ = [ - 'MemNetBase', - 'MemNetRNNLike', -] - - -class MemNetSingleLayer(ModuleBase): - """An A-C layer for memory network. - - Args: - H (optional): The matrix :attr:`H` multiplied to :attr:`o` at the end. - hparams (dict or HParams, optional): Memory network single layer - hyperparameters. If it is not specified, the default hyperparameter - setting is used. See :attr:`default_hparams` for the structure and - default values. - """ - - def __init__(self, H=None, hparams=None): - ModuleBase.__init__(self, hparams) - - self._H = H - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - .. code-block:: python - - { - "name": "memnet_single_layer" - } - - Here: - - "name": str - Name of the memory network single layer. - """ - return { - "name": "memnet_single_layer" - } - - def _build(self, u, m, c, **kwargs): - """An A-C operation with memory and query vector. - - Args: - u (Tensor): The input query `Tensor` of shape `[None, memory_dim]`. - m (Tensor): Output of A operation. Should be in shape - `[None, memory_size, memory_dim]`. - c (Tensor): Output of C operation. Should be in shape - `[None, memory_size, memory_dim]`. - - Returns: - A `Tensor` of shape same as :attr:`u`. - """ - # Input memory representation - p = tf.matmul(m, tf.expand_dims(u, axis=2)) - p = tf.transpose(p, perm=[0, 2, 1]) - - p = tf.nn.softmax(p) # equ. (1) - - # Output memory representation - o = tf.matmul(p, c) # equ. (2) - o = tf.squeeze(o, axis=[1]) - - if self._H: - u = tf.matmul(u, self._H) # RNN-like style - u_ = tf.add(u, o) # u^{k+1} = H u^k + o^k - - if not self._built: - self._add_internal_trainable_variables() - if self._H: - self._add_trainable_variable(self._H) - self._built = True - - return u_ - - -class MemNetBase(ModuleBase): - """Base class inherited by all memory network classes. - - Args: - raw_memory_dim (int): Dimension size of raw memory entries - (before embedding). For example, - if a raw memory entry is a word, this is the **vocabulary size** - (imagine a one-hot representation of word). If a raw memory entry - is a dense vector, this is the dimension size of the vector. - input_embed_fn (optional): A callable that embeds raw memory entries - as inputs. - This corresponds to the `A` embedding operation in - (Sukhbaatar et al.) - If not provided, a default embedding operation is created as - specified in :attr:`hparams`. See - :meth:`~texar.tf.modules.MemNetBase.get_default_embed_fn` - for details. - output_embed_fn (optional): A callable that embeds raw memory entries - as outputs. - This corresponds to the `C` embedding operation in - (Sukhbaatar et al.) - If not provided, a default embedding operation is created as - specified in :attr:`hparams`. See - :meth:`~texar.tf.modules.MemNetBase.get_default_embed_fn` - for details. - query_embed_fn (optional): A callable that embeds query. - This corresponds to the `B` embedding operation in - (Sukhbaatar et al.). 
If not provided and "use_B" is True - in :attr:`hparams`, a default embedding operation is created as - specified in :attr:`hparams`. See - :meth:`~texar.tf.modules.MemNetBase.get_default_embed_fn` - for details. - Notice: If you'd like to customize this callable, please follow - the same number and style of dimensions as in `input_embed_fn` or - `output_embed_fn`, and assume that the 2nd dimension of its - input and output (which corresponds to `memory_size`) is 1. - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparamerter will be set to default values. See - :meth:`default_hparams` for the hyperparameter sturcture and - default values. - """ - - def __init__(self, - raw_memory_dim, - input_embed_fn=None, - output_embed_fn=None, - query_embed_fn=None, - hparams=None): - ModuleBase.__init__(self, hparams) - - self._raw_memory_dim = raw_memory_dim - - self._n_hops = self._hparams.n_hops - self._relu_dim = self._hparams.relu_dim - self._memory_size = self._hparams.memory_size - - with tf.variable_scope(self.variable_scope): - self._A, self._C, self._B, self._memory_dim = self._build_embed_fn( - input_embed_fn, output_embed_fn, query_embed_fn) - - self.H = None - if self.hparams.use_H: - self.H = tf.get_variable( - name="H", shape=[self._memory_dim, self._memory_dim]) - - def _build_embed_fn(self, input_embed_fn, output_embed_fn, query_embed_fn): - # Optionally creates embed_fn's - memory_dim = self.hparams.memory_dim - mdim_A, mdim_C, mdim_B = None, None, None - - A = input_embed_fn - if input_embed_fn is None: - A, mdim_A = self.get_default_embed_fn( - self._memory_size, self._hparams.A) - memory_dim = mdim_A - - C = output_embed_fn - if output_embed_fn is None: - C, mdim_C = self.get_default_embed_fn( - self._memory_size, self._hparams.C) - if mdim_A is not None and mdim_A != mdim_C: - raise ValueError('Embedding config `A` and `C` must have ' - 'the same output dimension.') - memory_dim = mdim_C - - B = query_embed_fn - if query_embed_fn is None and self._hparams.use_B: - B, mdim_B = self.get_default_embed_fn(1, self._hparams.B) - if mdim_A is not None and mdim_A != mdim_B: - raise ValueError('Embedding config `A` and `B` must have ' - 'the same output dimension.') - if mdim_C is not None and mdim_C != mdim_B: - raise ValueError('Embedding config `C` and `B` must have ' - 'the same output dimension.') - memory_dim = mdim_B - - return A, C, B, memory_dim - - def get_default_embed_fn(self, memory_size, embed_fn_hparams): - """Creates a default embedding function. Can be used for A, C, or B - operation. - - For B operation (i.e., query_embed_fn), :attr:`memory_size` must be 1. - - The function is a combination of both memory embedding and temporal - embedding, with the combination method specified by "combine_mode" in - the `embed_fn_hparams`. - - .. role:: python(code) - :language: python - - Args: - embed_fn_hparams (dict or HParams): Hyperparameter of the - embedding function. See - :func:`~texar.tf.modules.default_memnet_embed_fn` for details. - - Returns: - A tuple `(embed_fn, memory_dim)`, where - - - **`memory_dim`** is the dimension of memory entry embedding, \ - inferred from :attr:`embed_fn_hparams`. - - - If `combine_mode` == 'add', `memory_dim` is the \ - embedder dimension. - - If `combine_mode` == 'concat', `memory_dim` is the sum \ - of the memory embedder dimension and the temporal embedder \ - dimension. - - - **`embed_fn`** is an embedding function that takes in memory \ - and returns memory embedding. 
\ - Specifically, the function has signature \ - :python:`memory_embedding= embed_fn(memory=None, soft_memory=None)`\ - where one of `memory` and `soft_memory` is provided (but not both). - - Args: - memory: An `int` Tensor of shape - `[batch_size, memory_size]` - containing memory indexes used for embedding lookup. - soft_memory: A Tensor of shape - `[batch_size, memory_size, raw_memory_dim]` - containing soft weights used to mix the embedding vectors. - - Returns: - A Tensor of shape `[batch_size, memory_size, memory_dim]` - containing the memory entry embeddings. - - """ - # memory embedder - embedder = WordEmbedder( - vocab_size=self._raw_memory_dim, - hparams=embed_fn_hparams["embedding"] - ) - # temporal embedder - temporal_embedder = PositionEmbedder( - position_size=memory_size, - hparams=embed_fn_hparams["temporal_embedding"] - ) - - combine = embed_fn_hparams['combine_mode'] - if combine == 'add': - if embedder.dim != temporal_embedder.dim: - raise ValueError('`embedding` and `temporal_embedding` must ' - 'have the same dimension for "add" ' - 'combination.') - memory_dim = embedder.dim - elif combine == 'concat': - memory_dim = embedder.dim + temporal_embedder.dim - - def _embed_fn(memory, soft_memory, mode=None): - if memory is None and soft_memory is None: - raise ValueError( - "Either `memory` or `soft_memory` is required.") - if memory is not None and soft_memory is not None: - raise ValueError( - "Must not specify `memory` and `soft_memory` at the " - "same time.") - - embedded_memory = embedder( - ids=memory, soft_ids=soft_memory, mode=mode) - temporal_embedded = temporal_embedder( - sequence_length=tf.constant([memory_size]), mode=mode) - temporal_embedded = tf.tile( - temporal_embedded, [tf.shape(embedded_memory)[0], 1, 1]) - - if combine == 'add': - return tf.add(embedded_memory, temporal_embedded) - elif combine == 'concat': - return tf.concat([embedded_memory, temporal_embedded], axis=-1) - else: - raise ValueError('Unknown combine method: {}'.format(combine)) - - return _embed_fn, memory_dim - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - .. code-block:: python - - { - "n_hops": 1, - "memory_dim": 100, - "relu_dim": 50, - "memory_size": 100, - "A": default_embed_fn_hparams, - "C": default_embed_fn_hparams, - "B": default_embed_fn_hparams, - "use_B": False, - "use_H": False, - "dropout_rate": 0, - "variational": False, - "name": "memnet", - } - - Here: - - "n_hops": int - Number of hops. - - "memory_dim": int - Memory dimension, i.e., the dimension size of a memory entry - embedding. Ignored if at least one of the embedding functions is - created according to :attr:`hparams`. In this case - :attr:`memory_dim` is inferred from the created embed_fn. - - "relu_dim": int - Number of elements in :attr:`memory_dim` that have relu at the end - of each hop. - Should be not less than 0 and not more than :attr`memory_dim`. - - "memory_size": int - Number of entries in memory. - - For example, the number of sentences {x_i} in Fig.1(a) of - (Sukhbaatar et al.) End-To-End Memory Networks. - - "use_B": bool - Whether to create the query embedding function. Ignored if - `query_embed_fn` is given to the constructor. - - "use_H": bool - Whether to perform a linear transformation with matrix `H` at - the end of each A-C layer. - - "dropout_rate": float - The dropout rate to apply to the output of each hop. Should - be between 0 and 1. - E.g., `dropout_rate=0.1` would drop out 10% of the units. 
- - "variational": bool - Whether to share dropout masks after each hop. - """ - return { - "n_hops": 1, - "memory_dim": 100, - "relu_dim": 50, - "memory_size": 100, - "A": default_memnet_embed_fn_hparams(), - "C": default_memnet_embed_fn_hparams(), - "B": default_memnet_embed_fn_hparams(), - "use_B": False, - "use_H": False, - "dropout_rate": 0, - "variational": False, - "name": "memnet", - } - - def _build(self, memory, query, **kwargs): - raise NotImplementedError - - @property - def memory_size(self): - """The memory size. - """ - return self._memory_size - - @property - def raw_memory_dim(self): - """The dimension of memory element (or vocabulary size). - """ - return self._raw_memory_dim - - @property - def memory_dim(self): - """The dimension of embedded memory and all vectors in hops. - """ - return self._memory_dim - - -class MemNetRNNLike(MemNetBase): - """An implementation of multi-layer end-to-end memory network, - with RNN-like weight tying described in - (Sukhbaatar et al.) End-To-End Memory Networks . - - See :meth:`~texar.tf.modules.MemNetBase.get_default_embed_fn` for default - embedding functions. Customized embedding functions must follow - the same signature. - - Args: - raw_memory_dim (int): Dimension size of raw memory entries - (before embedding). For example, - if a raw memory entry is a word, this is the **vocabulary size** - (imagine a one-hot representation of word). If a raw memory entry - is a dense vector, this is the dimension size of the vector. - input_embed_fn (optional): A callable that embeds raw memory entries - as inputs. - This corresponds to the `A` embedding operation in - (Sukhbaatar et al.) - If not provided, a default embedding operation is created as - specified in :attr:`hparams`. See - :meth:`~texar.tf.modules.MemNetBase.get_default_embed_fn` - for details. - output_embed_fn (optional): A callable that embeds raw memory entries - as outputs. - This corresponds to the `C` embedding operation in - (Sukhbaatar et al.) - If not provided, a default embedding operation is created as - specified in :attr:`hparams`. See - :meth:`~texar.tf.modules.MemNetBase.get_default_embed_fn` - for details. - query_embed_fn (optional): A callable that embeds query. - This corresponds to the `B` embedding operation in - (Sukhbaatar et al.). If not provided and "use_B" is True - in :attr:`hparams`, a default embedding operation is created as - specified in :attr:`hparams`. See - :meth:`~texar.tf.modules.MemNetBase.get_default_embed_fn` - for details. - For customized query_embed_fn, note that the function must follow - the signature of the default embed_fn where `memory_size` must - be 1. - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparamerter will be set to default values. See - :meth:`default_hparams` for the hyperparameter sturcture and - default values. - """ - - def __init__(self, - raw_memory_dim, - input_embed_fn=None, - output_embed_fn=None, - query_embed_fn=None, - hparams=None): - MemNetBase.__init__(self, raw_memory_dim, input_embed_fn, - output_embed_fn, query_embed_fn, hparams) - - with tf.variable_scope(self.variable_scope): - self._AC = MemNetSingleLayer( - self.H, - hparams={"name": "AC"}) - - self._W = tf.layers.Dense( - units=raw_memory_dim, - use_bias=False, - name="W") - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - .. 
code-block:: python - - { - "n_hops": 1, - "memory_dim": 100, - "relu_dim": 50, - "memory_size": 100, - "A": default_embed_fn_hparams, - "C": default_embed_fn_hparams, - "B": default_embed_fn_hparams, - "use_B": False, - "use_H": True, - "dropout_rate": 0, - "variational": False, - "name": "memnet_rnnlike", - } - - Here: - - "n_hops": int - Number of hops. - - "memory_dim": int - Memory dimension, i.e., the dimension size of a memory entry - embedding. Ignored if at least one of the embedding functions is - created according to :attr:`hparams`. In this case - :attr:`memory_dim` is inferred from the created embed_fn. - - "relu_dim": int - Number of elements in :attr:`memory_dim` that have relu at the end - of each hop. - Should be not less than 0 and not more than :attr`memory_dim`. - - "memory_size": int - Number of entries in memory. - - For example, the number of sentences {x_i} in Fig.1(a) of - (Sukhbaatar et al.) End-To-End Memory Networks. - - "use_B": bool - Whether to create the query embedding function. Ignored if - `query_embed_fn` is given to the constructor. - - "use_H": bool - Whether to perform a linear transformation with matrix `H` at - the end of each A-C layer. - - "dropout_rate": float - The dropout rate to apply to the output of each hop. Should - be between 0 and 1. - E.g., `dropout_rate=0.1` would drop out 10% of the units. - - "variational": bool - Whether to share dropout masks after each hop. - """ - hparams = MemNetBase.default_hparams() - hparams.update({ - "use_H": True, - "name": "memnet_rnnlike" - }) - return hparams - - def _build(self, memory=None, query=None, soft_memory=None, soft_query=None, - mode=None, **kwargs): - """Pass the :attr:`memory` and :attr:`query` through the memory network - and return the :attr:`logits` after the final matrix. - - Only one of :attr:`memory` and :attr:`soft_memory` can be specified. - They should not be specified at the same time. - - Args: - memory (optional): Memory used in A/C operations. By default, it - should be an integer tensor of shape - `[batch_size, memory_size]`, - containing the ids to embed if provided. - query (optional): Query vectors as the intial input of the memory - network. - If you'd like to apply some transformation (e.g., embedding) - on it before it's fed into the network, please set `use_B` to - True and add `query_embed_fn` when constructing this instance. - If `query_embed_fn` is set to - :meth:`~texar.tf.modules.MemNetBase.get_default_embed_fn`, - it should be of shape `[batch_size]`. - If `use_B` is not set, it should be of shape - `[batch_size, memory_dim]`. - soft_memory (optional): Soft memory used in A/C operations. By - default, it should be a tensor of shape - `[batch_size, memory_size, raw_memory_dim]`, - containing the weights used to mix the embedding vectors. - If you'd like to apply a matrix multiplication on the memory, - this option can also be used. - soft_query (optional): Query vectors as the intial input of the - memory network. - If you'd like to apply some transformation (e.g., embedding) - on it before it's fed into the network, please set `use_B` to - True and add `query_embed_fn` when constructing this instance. - Similar to :attr:`soft_memory`, if `query_embed_fn` is set to - :meth:`~texar.tf.modules.MemNetBase.get_default_embed_fn`, - then it must be of shape `[batch_size, raw_memory_dim]`. - Ignored if `use_B` is not set. - mode (optional): A tensor taking value in - :tf_main:`tf.estimator.ModeKeys `, including - `TRAIN`, `EVAL`, and `PREDICT`. 
If `None`, dropout is - controlled by :func:`texar.tf.global_mode`. - """ - if self._B is not None: - def _unsqueeze(x): - return x if x is None else tf.expand_dims(x, 1) - query = tf.squeeze( - self._B(_unsqueeze(query), _unsqueeze(soft_query), mode=mode), - 1) - self._u = [query] - self._m = self._A(memory, soft_memory, mode=mode) - self._c = self._C(memory, soft_memory, mode=mode) - - keep_prob = switch_dropout(1 - self.hparams.dropout_rate, mode=mode) - if self.hparams.variational: - with tf.variable_scope("variational_dropout"): - noise = tf.random_uniform(tf.shape(self._u[-1])) - random_tensor = keep_prob + noise - binary_tensor = tf.floor(random_tensor) - - def _variational_dropout(val): - return tf.math.div(val, keep_prob) * binary_tensor - - for _ in range(self._n_hops): - u_ = self._AC(self._u[-1], self._m, self._c) - if self._relu_dim == 0: - pass - elif self._relu_dim == self._memory_dim: - u_ = tf.nn.relu(u_) - elif 0 < self._relu_dim < self._memory_dim: - linear_part = u_[:, : self._memory_dim - self._relu_dim] - relu_part = u_[:, self._memory_dim - self._relu_dim:] - relued_part = tf.nn.relu(relu_part) - u_ = tf.concat(axis=1, values=[linear_part, relued_part]) - else: - raise ValueError( - "relu_dim = {} is illegal".format(self._relu_dim)) - if self.hparams.variational: - u_ = _variational_dropout(u_) - else: - u_ = tf.nn.dropout(u_, keep_prob) - self._u.append(u_) - - logits = self._W(self._u[-1]) - - if not self._built: - self._add_internal_trainable_variables() - self._built = True - - return logits diff --git a/texar/tf/modules/memory/memory_network_test.py b/texar/tf/modules/memory/memory_network_test.py deleted file mode 100644 index 363e3727..00000000 --- a/texar/tf/modules/memory/memory_network_test.py +++ /dev/null @@ -1,100 +0,0 @@ -""" -Unit tests for memory networks. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from texar.tf.modules.memory.memory_network import MemNetRNNLike -from texar.tf import context - -# pylint: disable=no-member, too-many-locals, too-many-instance-attributes -# pylint: disable=too-many-arguments, protected-access - - -class MemNetRNNLikeTest(tf.test.TestCase): - """Tests :class:`~texar.tf.modules.memory.memory_network.MemNetRNNLike`. - """ - - def _test_memory_dim(self, combine_mode='add', soft_memory=False, - soft_query=False, use_B=False): - """Tests :attr:`memory_dim` in the :attr:`combine_mode` and soft - options. 
- """ - print('testing: combine_mode={}, soft_memory={}, soft_query={}, ' - 'use_B={}'.format(combine_mode, soft_memory, soft_query, use_B)) - - n_hops = 3 - if combine_mode == 'add' or combine_mode is None: - memory_dim = 19 - embedding_dim = memory_dim - temporal_embedding_dim = memory_dim - elif combine_mode == 'concat': - embedding_dim = 19 - temporal_embedding_dim = 17 - memory_dim = embedding_dim + temporal_embedding_dim - else: - raise ValueError( - "combine_mode = {} is not recognized".format(combine_mode)) - relu_dim = 13 - memory_size = 7 - raw_memory_dim = 11 - batch_size = 2 - embed_hparams = { - "embedding": { - "dim": embedding_dim, - }, - "temporal_embedding": { - "dim": temporal_embedding_dim, - }, - "combine_mode": combine_mode, - } - memnet_hparams = { - "n_hops": n_hops, - "relu_dim": relu_dim, - "memory_size": memory_size, - "A": embed_hparams, - "C": embed_hparams, - "B": embed_hparams, - "use_B": use_B, - } - memnet = MemNetRNNLike(raw_memory_dim=raw_memory_dim, - hparams=memnet_hparams) - kwargs = {} - if soft_memory: - kwargs['soft_memory'] = tf.random_uniform( - [batch_size, memory_size, raw_memory_dim]) - else: - kwargs['memory'] = tf.tile(tf.expand_dims( - tf.range(memory_size, dtype=tf.int32), 0), [batch_size, 1]) - if use_B: - if soft_query: - kwargs['soft_query'] = tf.random_uniform( - [batch_size, raw_memory_dim]) - else: - kwargs['query'] = tf.random_uniform( - [batch_size], maxval=raw_memory_dim, dtype=tf.int32) - else: - kwargs['query'] = tf.random_uniform([batch_size, memory_dim]) - logits = memnet(**kwargs) - self.assertEqual(memnet.memory_dim, memory_dim) - self.assertEqual(logits.shape[0], batch_size) - self.assertEqual(logits.shape[1], raw_memory_dim) - - def test_memory_dim(self): - """Tests :attr:`memory_dim` in different :attr:`combine_mode` and - different soft options. - """ - for combine_mode in ['add', 'concat']: - for soft_memory in [False, True]: - for use_B in [False, True]: - for soft_query in ([False, True] if use_B else [False]): - self._test_memory_dim(combine_mode, soft_memory, - soft_query, use_B) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/modules/networks/__init__.py b/texar/tf/modules/networks/__init__.py index 81717afe..66a31fba 100644 --- a/texar/tf/modules/networks/__init__.py +++ b/texar/tf/modules/networks/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,12 +15,5 @@ Modules of networks. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=wildcard-import - from texar.tf.modules.networks.network_base import * from texar.tf.modules.networks.networks import * -from texar.tf.modules.networks.conv_networks import * diff --git a/texar/tf/modules/networks/conv_networks.py b/texar/tf/modules/networks/conv_networks.py deleted file mode 100644 index bf55d224..00000000 --- a/texar/tf/modules/networks/conv_networks.py +++ /dev/null @@ -1,482 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Various convolutional networks.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from texar.tf.modules.networks.network_base import FeedForwardNetworkBase
-from texar.tf.modules.networks.network_base import _build_layers
-from texar.tf.core.layers import get_pooling_layer_hparams, get_activation_fn
-from texar.tf.utils.utils import uniquify_str
-from texar.tf.utils.shapes import mask_sequences
-from texar.tf.hyperparams import HParams
-
-# pylint: disable=too-many-arguments, too-many-locals
-
-__all__ = [
-    "_to_list",
-    "Conv1DNetwork"
-]
-
-
-def _to_list(value, name=None, list_length=None):
-    """Converts hparam value into a list.
-
-    If :attr:`list_length` is given,
-    then the canonicalized :attr:`value` must be of
-    length :attr:`list_length`.
-    """
-    if not isinstance(value, (list, tuple)):
-        if list_length is not None:
-            value = [value] * list_length
-        else:
-            value = [value]
-    if list_length is not None and len(value) != list_length:
-        name = '' if name is None else name
-        raise ValueError("hparams '%s' must be a list of length %d"
-                         % (name, list_length))
-    return value
-
-
-class Conv1DNetwork(FeedForwardNetworkBase):
-    """Simple Conv-1D network which consists of a sequence of conv layers
-    followed by a sequence of dense layers.
-
-    Args:
-        hparams (dict, optional): Hyperparameters. Missing
-            hyperparameters will be set to default values. See
-            :meth:`default_hparams` for the hyperparameter structure and
-            default values.
-
-    See :meth:`_build` for the inputs and outputs. The inputs must be a
-    3D Tensor of shape `[batch_size, length, channels]` (default), or
-    `[batch_size, channels, length]` (if `data_format` is set to
-    `'channels_first'` through :attr:`hparams`). For example, for sequence
-    classification, `length` corresponds to time steps, and `channels`
-    corresponds to embedding dim.
-
-    Example:
-
-    .. code-block:: python
-
-        nn = Conv1DNetwork()  # Use the default structure
-
-        inputs = tf.random_uniform([64, 20, 256])
-        outputs = nn(inputs)
-        # outputs == Tensor of shape [64, 128], because the final dense
-        # layer has size 128.
-
-    .. document private functions
-    .. automethod:: _build
-    """
-
-    def __init__(self, hparams=None):
-        FeedForwardNetworkBase.__init__(self, hparams)
-
-        with tf.variable_scope(self.variable_scope):
-            layer_hparams = self._build_layer_hparams()
-            _build_layers(self, layers=None, layer_hparams=layer_hparams)
-
-    @staticmethod
-    def default_hparams():
-        """Returns a dictionary of hyperparameters with default values.
-
-        ..
code-block:: python - - { - # (1) Conv layers - "num_conv_layers": 1, - "filters": 128, - "kernel_size": [3, 4, 5], - "conv_activation": "relu", - "conv_activation_kwargs": None, - "other_conv_kwargs": None, - # (2) Pooling layers - "pooling": "MaxPooling1D", - "pool_size": None, - "pool_strides": 1, - "other_pool_kwargs": None, - # (3) Dense layers - "num_dense_layers": 1, - "dense_size": 128, - "dense_activation": "identity", - "dense_activation_kwargs": None, - "final_dense_activation": None, - "final_dense_activation_kwargs": None, - "other_dense_kwargs": None, - # (4) Dropout - "dropout_conv": [1], - "dropout_dense": [], - "dropout_rate": 0.75, - # (5) Others - "name": "conv1d_network", - } - - Here: - - 1. For **convolutional** layers: - - "num_conv_layers": int - Number of convolutional layers. - - "filters": int or list - The number of filters in the convolution, i.e., the - dimensionality - of the output space. If "num_conv_layers" > 1, "filters" must be - a list of "num_conv_layers" integers. - - "kernel_size": int or list - Lengths of 1D convolution windows. - - - If "num_conv_layers" == 1, this can be a list of arbitrary \ - number\ - of `int` denoting different sized conv windows. The number of \ - filters of each size is specified by "filters". For example,\ - the default values will create 3 sets of filters, each of which\ - has kernel size of 3, 4, and 5, respectively, and has filter\ - number 128. - - If "num_conv_layers" > 1, this must be a list of length \ - "num_conv_layers". Each element can be an `int` or a list \ - of arbitrary number of `int` denoting the kernel size of \ - respective layer. - - "conv_activation": str or callable - Activation function applied to the output of the convolutional - layers. Set to "indentity" to maintain a linear activation. - See :func:`~texar.tf.core.get_activation_fn` for more details. - - "conv_activation_kwargs": dict, optional - Keyword arguments for conv layer activation functions. - See :func:`~texar.tf.core.get_activation_fn` for more details. - - "other_conv_kwargs": dict, optional - Other keyword arguments for - :tf_main:`tf.layers.Conv1D ` constructor, e.g., - "data_format", "padding", etc. - - 2. For **pooling** layers: - - "pooling": str or class or instance - Pooling layer after each of the convolutional layer(s). Can - a pooling layer class, its name or module path, or a class - instance. - - "pool_size": int or list, optional - Size of the pooling window. If an `int`, all pooling layer - will have the same pool size. If a list, the list length must - equal "num_conv_layers". If `None` and the pooling type - is either - :tf_main:`MaxPooling ` or - :tf_main:`AveragePooling `, the - pool size will be set to input size. That is, the output of - the pooling layer is a single unit. - - "pool_strides": int or list, optional - Strides of the pooling operation. If an `int`, all pooling layer - will have the same stride. If a list, the list length must - equal "num_conv_layers". - - "other_pool_kwargs": dict, optional - Other keyword arguments for pooling layer class constructor. - - 3. For **dense** layers (note that here dense layers always follow conv - and pooling layers): - - "num_dense_layers": int - Number of dense layers. - - "dense_size": int or list - Number of units of each dense layers. If an `int`, all dense - layers will have the same size. If a list of `int`, the list - length must equal "num_dense_layers". 
- - "dense_activation": str or callable - Activation function applied to the output of the dense - layers **except** the last dense layer output . Set to - "indentity" to maintain a linear activation. - See :func:`~texar.tf.core.get_activation_fn` for more details. - - "dense_activation_kwargs": dict, optional - Keyword arguments for dense layer activation functions before - the last dense layer. - See :func:`~texar.tf.core.get_activation_fn` for more details. - - "final_dense_activation": str or callable - Activation function applied to the output of the **last** dense - layer. Set to `None` or - "indentity" to maintain a linear activation. - See :func:`~texar.tf.core.get_activation_fn` for more details. - - "final_dense_activation_kwargs": dict, optional - Keyword arguments for the activation function of last - dense layer. - See :func:`~texar.tf.core.get_activation_fn` for more details. - - "other_dense_kwargs": dict, optional - Other keyword arguments for - :tf_main:`Dense ` - layer class constructor. - - 4. For **dropouts**: - - "dropout_conv": int or list - The indexes of conv layers (starting from `0`) whose **inputs** - are applied with dropout. The index = :attr:`num_conv_layers` - means dropout applies to the final conv layer output. E.g., - - .. code-block:: python - - { - "num_conv_layers": 2, - "dropout_conv": [0, 2] - } - - will leads to a series of layers as - `-dropout-conv0-conv1-dropout-`. - - The dropout mode (training or not) is controlled - by the :attr:`mode` argument of :meth:`_build`. - - "dropout_dense": int or list - Same as "dropout_conv" but applied to dense layers (index - starting from `0`). - - "dropout_rate": float - The dropout rate, between 0 and 1. E.g., - `"dropout_rate": 0.1` would drop out 10% of elements. - - 5. Others: - - "name": str - Name of the network. 
- """ - return { - # Conv layers - "num_conv_layers": 1, - "filters": 128, - "kernel_size": [3, 4, 5], - "conv_activation": "relu", - "conv_activation_kwargs": None, - "other_conv_kwargs": None, - # Pooling layers - "pooling": "MaxPooling1D", - "pool_size": None, - "pool_strides": 1, - "other_pool_kwargs": None, - # Dense layers - "num_dense_layers": 1, - "dense_size": 128, - "dense_activation": "identity", - "dense_activation_kwargs": None, - "final_dense_activation": None, - "final_dense_activation_kwargs": None, - "other_dense_kwargs": None, - # Dropout - "dropout_conv": [1], - "dropout_dense": [], - "dropout_rate": 0.75, - # Others - "name": "conv1d_network", - "@no_typecheck": ["filters", "kernel_size", "conv_activation", - "pool_size", "pool_strides", - "dense_size", "dense_activation", - "dropout_conv", "dropout_dense"] - } - - def _build_pool_hparams(self): - pool_type = self._hparams.pooling - if pool_type == "MaxPooling": - pool_type = "MaxPooling1D" - elif pool_type == "AveragePooling": - pool_type = "AveragePooling1D" - - npool = self._hparams.num_conv_layers - pool_size = _to_list(self._hparams.pool_size, "pool_size", npool) - strides = _to_list(self._hparams.pool_strides, "pool_strides", npool) - - other_kwargs = self._hparams.other_pool_kwargs or {} - if isinstance(other_kwargs, HParams): - other_kwargs = other_kwargs.todict() - if not isinstance(other_kwargs, dict): - raise ValueError("hparams['other_pool_kwargs'] must be a dict.") - - pool_hparams = [] - for i in range(npool): - kwargs_i = {"pool_size": pool_size[i], "strides": strides[i], - "name": "pool_%d" % (i + 1)} - kwargs_i.update(other_kwargs) - pool_hparams_ = get_pooling_layer_hparams({"type": pool_type, - "kwargs": kwargs_i}) - pool_hparams.append(pool_hparams_) - - return pool_hparams - - def _build_conv1d_hparams(self, pool_hparams): - """Creates the hparams for each of the conv layers usable for - :func:`texar.tf.core.layers.get_layer`. 
- """ - nconv = self._hparams.num_conv_layers - if len(pool_hparams) != nconv: - raise ValueError("`pool_hparams` must be of length %d" % nconv) - - filters = _to_list(self._hparams.filters, 'filters', nconv) - - if nconv == 1: - kernel_size = _to_list(self._hparams.kernel_size) - if not isinstance(kernel_size[0], (list, tuple)): - kernel_size = [kernel_size] - elif nconv > 1: - kernel_size = _to_list(self._hparams.kernel_size, - 'kernel_size', nconv) - kernel_size = [_to_list(ks) for ks in kernel_size] - - other_kwargs = self._hparams.other_conv_kwargs or {} - if isinstance(other_kwargs, HParams): - other_kwargs = other_kwargs.todict() - if not isinstance(other_kwargs, dict): - raise ValueError("hparams['other_conv_kwargs'] must be a dict.") - - conv_pool_hparams = [] - activation_fn = get_activation_fn( - self._hparams.conv_activation, - self._hparams.conv_activation_kwargs) - for i in range(nconv): - hparams_i = [] - names = [] - for ks_ij in kernel_size[i]: - name = uniquify_str("conv_%d" % (i + 1), names) - names.append(name) - conv_kwargs_ij = { - "filters": filters[i], - "kernel_size": ks_ij, - "activation": activation_fn, - "name": name - } - conv_kwargs_ij.update(other_kwargs) - hparams_i.append( - {"type": "Conv1D", "kwargs": conv_kwargs_ij}) - if len(hparams_i) == 1: - conv_pool_hparams.append([hparams_i[0], pool_hparams[i]]) - else: # creates MergeLayer - mrg_kwargs_layers = [] - for hparams_ij in hparams_i: - seq_kwargs_j = {"layers": [hparams_ij, pool_hparams[i]]} - mrg_kwargs_layers.append( - {"type": "SequentialLayer", "kwargs": seq_kwargs_j}) - mrg_hparams = {"type": "MergeLayer", - "kwargs": {"layers": mrg_kwargs_layers, - "name": "conv_pool_%d" % (i + 1)}} - conv_pool_hparams.append(mrg_hparams) - - return conv_pool_hparams - - def _build_dense_hparams(self): - ndense = self._hparams.num_dense_layers - dense_size = _to_list(self._hparams.dense_size, 'dense_size', ndense) - - other_kwargs = self._hparams.other_dense_kwargs or {} - if isinstance(other_kwargs, HParams): - other_kwargs = other_kwargs.todict() - if not isinstance(other_kwargs, dict): - raise ValueError("hparams['other_dense_kwargs'] must be a dict.") - - dense_hparams = [] - activation_fn = get_activation_fn( - self._hparams.dense_activation, - self._hparams.dense_activation_kwargs) - for i in range(ndense): - if i == ndense - 1: - activation_fn = get_activation_fn( - self._hparams.final_dense_activation, - self._hparams.final_dense_activation_kwargs) - - kwargs_i = {"units": dense_size[i], - "activation": activation_fn, - "name": "dense_%d" % (i + 1)} - kwargs_i.update(other_kwargs) - - dense_hparams.append({"type": "Dense", "kwargs": kwargs_i}) - - return dense_hparams - - def _build_layer_hparams(self): - pool_hparams = self._build_pool_hparams() - conv_pool_hparams = self._build_conv1d_hparams(pool_hparams) - dense_hparams = self._build_dense_hparams() - - def _dropout_hparams(layer_id): - return {"type": "Dropout", - "kwargs": {"rate": self._hparams.dropout_rate, - "name": "dropout_%d" % layer_id}} - dropout_conv = _to_list(self._hparams.dropout_conv) - dropout_dense = _to_list(self._hparams.dropout_dense) - - layers_hparams = [] - nconv = self._hparams.num_conv_layers - for conv_i in range(nconv): - if conv_i in dropout_conv: - layers_hparams.append(_dropout_hparams(conv_i)) - if isinstance(conv_pool_hparams[conv_i], (list, tuple)): - layers_hparams += conv_pool_hparams[conv_i] - else: - layers_hparams.append(conv_pool_hparams[conv_i]) - if nconv in dropout_conv: - 
layers_hparams.append(_dropout_hparams(nconv)) - - ndense = self._hparams.num_dense_layers - if ndense > 0: # Add flatten layers before dense layers - layers_hparams.append({"type": "Flatten"}) - for dense_i in range(ndense): - if dense_i in dropout_dense: - layers_hparams.append(_dropout_hparams(dense_i + nconv)) - layers_hparams.append(dense_hparams[dense_i]) - if ndense in dropout_dense: - layers_hparams.append(_dropout_hparams(ndense + nconv)) - - return layers_hparams - - def _build(self, # pylint: disable=arguments-differ - inputs, - sequence_length=None, - dtype=None, - mode=None): - """Feeds forward inputs through the network layers and returns outputs. - - Args: - inputs: The inputs to the network, which is a 3D tensor. - sequence_length (optional): An int tensor of shape `[batch_size]` - containing the length of each element in :attr:`inputs`. - If given, time steps beyond the length will first be masked out - before feeding to the layers. - dtype (optional): Type of the inputs. If not provided, infers - from inputs automatically. - mode (optional): A tensor taking value in - :tf_main:`tf.estimator.ModeKeys `, including - `TRAIN`, `EVAL`, and `PREDICT`. If `None`, - :func:`texar.tf.global_mode` is used. - - Returns: - The output of the final layer. - """ - if sequence_length is not None: - inputs = mask_sequences( - inputs, sequence_length, dtype=dtype, time_major=False, - tensor_rank=3) - return super(Conv1DNetwork, self)._build(inputs, mode=mode) diff --git a/texar/tf/modules/networks/conv_networks_test.py b/texar/tf/modules/networks/conv_networks_test.py deleted file mode 100644 index 1d5a60af..00000000 --- a/texar/tf/modules/networks/conv_networks_test.py +++ /dev/null @@ -1,126 +0,0 @@ -# -""" -Unit tests for conv networks. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import tensorflow as tf - -import texar.tf as tx -from texar.tf.modules.networks.conv_networks import Conv1DNetwork - - -class Conv1DNetworkTest(tf.test.TestCase): - """Tests :class:`~texar.tf.modules.Conv1DNetwork` class. - """ - - def test_feedforward(self): - """Tests feed forward. 
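For reference, a minimal end-to-end sketch of the `Conv1DNetwork` interface exercised here (a sketch under the pre-patch TF1 graph-mode API; shapes mirror the surrounding unit test):

.. code-block:: python

    import tensorflow as tf
    import texar.tf as tx

    network = tx.modules.Conv1DNetwork()  # all-default hparams
    inputs = tf.ones([64, 16, 300])       # [batch_size, time_steps, feature_dim]
    outputs = network(inputs)             # Tensor of shape [64, 128]

    # With sequence_length, time steps beyond each length are masked out
    # before the first conv layer (cf. test_mask_input below).
    outputs = network(inputs, sequence_length=[16] * 64)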
- """ - network_1 = Conv1DNetwork() - self.assertEqual(len(network_1.layers), 4) - self.assertTrue(isinstance(network_1.layer_by_name("conv_pool_1"), - tx.core.MergeLayer)) - for layer in network_1.layers[0].layers: - self.assertTrue(isinstance(layer, tx.core.SequentialLayer)) - - inputs_1 = tf.ones([64, 16, 300], tf.float32) - outputs_1 = network_1(inputs_1) - self.assertEqual(outputs_1.shape, [64, 128]) - - hparams = { - # Conv layers - "num_conv_layers": 2, - "filters": 128, - "kernel_size": [[3, 4, 5], 4], - "other_conv_kwargs": {"padding": "same"}, - # Pooling layers - "pooling": "AveragePooling", - "pool_size": 2, - "pool_strides": 1, - # Dense layers - "num_dense_layers": 3, - "dense_size": [128, 128, 10], - "dense_activation": "relu", - "other_dense_kwargs": {"use_bias": False}, - # Dropout - "dropout_conv": [0, 1, 2], - "dropout_dense": 2 - } - network_2 = Conv1DNetwork(hparams) - # nlayers = nconv-pool + nconv + npool + ndense + ndropout + flatten - self.assertEqual(len(network_2.layers), 1 + 1 + 1 + 3 + 4 + 1) - self.assertTrue(isinstance(network_2.layer_by_name("conv_pool_1"), - tx.core.MergeLayer)) - for layer in network_2.layers[1].layers: - self.assertTrue(isinstance(layer, tx.core.SequentialLayer)) - - inputs_2 = tf.ones([64, 16, 300], tf.float32) - outputs_2 = network_2(inputs_2) - self.assertEqual(outputs_2.shape, [64, 10]) - - def test_unknown_seq_length(self): - """Tests use of pooling layer when the seq_length dimension of inputs - is `None`. - """ - network_1 = Conv1DNetwork() - inputs_1 = tf.placeholder(tf.float32, [64, None, 300]) - outputs_1 = network_1(inputs_1) - self.assertEqual(outputs_1.shape, [64, 128]) - - hparams = { - # Conv layers - "num_conv_layers": 2, - "filters": 128, - "kernel_size": [[3, 4, 5], 4], - # Pooling layers - "pooling": "AveragePooling", - "pool_size": [2, None], - # Dense layers - "num_dense_layers": 1, - "dense_size": 10, - } - network = Conv1DNetwork(hparams) - # nlayers = nconv-pool + nconv + npool + ndense + ndropout + flatten - self.assertEqual(len(network.layers), 1 + 1 + 1 + 1 + 1 + 1) - self.assertTrue(isinstance(network.layer_by_name('pool_2'), - tx.core.AverageReducePooling1D)) - - inputs = tf.placeholder(tf.float32, [64, None, 300]) - outputs = network(inputs) - self.assertEqual(outputs.shape, [64, 10]) - - hparams_2 = { - # Conv layers - "num_conv_layers": 1, - "filters": 128, - "kernel_size": 4, - "other_conv_kwargs": {'data_format': 'channels_first'}, - # Pooling layers - "pooling": "MaxPooling", - "other_pool_kwargs": {'data_format': 'channels_first'}, - # Dense layers - "num_dense_layers": 1, - "dense_size": 10, - } - network_2 = Conv1DNetwork(hparams_2) - inputs_2 = tf.placeholder(tf.float32, [64, 300, None]) - outputs_2 = network_2(inputs_2) - self.assertEqual(outputs_2.shape, [64, 10]) - - def test_mask_input(self): - """Tests masked inputs. - """ - network_1 = Conv1DNetwork() - inputs_1 = tf.ones([3, 16, 300], tf.float32) - seq_length = [10, 15, 1] - outputs_1 = network_1(inputs_1, sequence_length=seq_length) - self.assertEqual(outputs_1.shape, [3, 128]) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/modules/networks/network_base.py b/texar/tf/modules/networks/network_base.py index 4be3cce9..8188f847 100644 --- a/texar/tf/modules/networks/network_base.py +++ b/texar/tf/modules/networks/network_base.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,19 +15,13 @@ Base class for feed forward neural networks. """ -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - import tensorflow as tf from texar.tf.module_base import ModuleBase -from texar.tf.utils import TexarError from texar.tf.core.layers import get_layer -from texar.tf.utils.utils import uniquify_str from texar.tf.utils.mode import is_train_mode +from texar.tf.utils.utils import uniquify_str -# pylint: disable=too-many-instance-attributes, arguments-differ # pylint: disable=protected-access __all__ = [ @@ -37,29 +31,28 @@ def _build_layers(network, layers=None, layer_hparams=None): - """Builds layers. + r"""Builds layers. Either :attr:`layer_hparams` or :attr:`layers` must be provided. If both are given, :attr:`layers` will be used. Args: network: An instance of a subclass of - :class:`~texar.tf.modules.networks.network_base.FeedForwardNetworkBase` + :class:`~texar.tf.modules.networks.FeedForwardNetworkBase` layers (optional): A list of layer instances. layer_hparams (optional): A list of layer hparams, each to which is fed to :func:`~texar.tf.core.layers.get_layer` to create the layer instance. """ - with tf.variable_scope(network.variable_scope): - if layers is not None: - network._layers = layers - else: - if layer_hparams is None: - raise ValueError( - 'Either `layer` or `layer_hparams` is required.') - network._layers = [] - for _, hparams in enumerate(layer_hparams): - network._layers.append(get_layer(hparams=hparams)) + if layers is not None: + network._layers = layers + else: + if layer_hparams is None: + raise ValueError( + 'Either `layer` or `layer_hparams` is required.') + network._layers = [] + for _, hparams in enumerate(layer_hparams): + network._layers.append(get_layer(hparams=hparams)) for layer in network._layers: layer_name = uniquify_str(layer.name, network._layer_names) @@ -68,7 +61,7 @@ def _build_layers(network, layers=None, layer_hparams=None): class FeedForwardNetworkBase(ModuleBase): - """Base class inherited by all feed-forward network classes. + r"""Base class inherited by all feed-forward network classes. Args: hparams (dict, optional): Hyperparameters. Missing @@ -76,11 +69,11 @@ class FeedForwardNetworkBase(ModuleBase): :meth:`default_hparams` for the hyperparameter sturcture and default values. - See :meth:`_build` for the inputs and outputs. + See :meth:`call` for the inputs and outputs. """ def __init__(self, hparams=None): - ModuleBase.__init__(self, hparams) + super().__init__(hparams=hparams) self._layers = [] self._layer_names = [] @@ -90,7 +83,7 @@ def __init__(self, hparams=None): @staticmethod def default_hparams(): - """Returns a dictionary of hyperparameters with default values. + r"""Returns a dictionary of hyperparameters with default values. .. code-block:: python @@ -102,8 +95,8 @@ def default_hparams(): "name": "NN" } - def _build(self, inputs, mode=None): - """Feeds forward inputs through the network layers and returns outputs. + def call(self, inputs, mode=None): + r"""Feeds forward inputs through the network layers and returns outputs. Args: inputs: The inputs to the network. The requirements on inputs @@ -111,8 +104,7 @@ def _build(self, inputs, mode=None): network. mode (optional): A tensor taking value in :tf_main:`tf.estimator.ModeKeys `, including - `TRAIN`, `EVAL`, and `PREDICT`. 
If `None`, - :func:`texar.tf.global_mode` is used. + `TRAIN`, `EVAL`, and `PREDICT`. Returns: The output of the network. @@ -121,8 +113,8 @@ def _build(self, inputs, mode=None): prev_outputs = inputs for layer_id, layer in enumerate(self._layers): - if isinstance(layer, tf.layers.Dropout) or \ - isinstance(layer, tf.layers.BatchNormalization): + if isinstance(layer, (tf.keras.layers.Dropout, + tf.keras.layers.BatchNormalization)): outputs = layer(prev_outputs, training=training) else: outputs = layer(prev_outputs) @@ -130,39 +122,30 @@ def _build(self, inputs, mode=None): self._layer_outputs_by_name[self._layer_names[layer_id]] = outputs prev_outputs = outputs - if not self._built: - self._add_internal_trainable_variables() - # Add trainable variables of `self._layers` which may be - # constructed externally. - for layer in self._layers: - self._add_trainable_variable(layer.trainable_variables) - self._built = True - return outputs def append_layer(self, layer): - """Appends a layer to the end of the network. The method is only - feasible before :attr:`_build` is called. + r"""Appends a layer to the end of the network. The method is only + feasible before :attr:`call` is called. Args: - layer: A :tf_main:`tf.layers.Layer ` instance, or - a dict of layer hyperparameters. + layer: A :tf_main:`tf.keras.layers.Layer ` instance, + or a dict of layer hyperparameters. """ - if self._built: - raise TexarError("`FeedForwardNetwork.append_layer` can be " - "called only before `_build` is called.") - - with tf.variable_scope(self.variable_scope): - layer_ = layer - if not isinstance(layer_, tf.layers.Layer): - layer_ = get_layer(hparams=layer_) - self._layers.append(layer_) - layer_name = uniquify_str(layer_.name, self._layer_names) - self._layer_names.append(layer_name) - self._layers_by_name[layer_name] = layer_ + if self.built: + raise ValueError("`FeedForwardNetwork.append_layer` can be " + "called only before `call` is called.") + + layer_ = layer + if not isinstance(layer_, tf.keras.layers.Layer): + layer_ = get_layer(hparams=layer_) + self._layers.append(layer_) + layer_name = uniquify_str(layer_.name, self._layer_names) + self._layer_names.append(layer_name) + self._layers_by_name[layer_name] = layer_ def has_layer(self, layer_name): - """Returns `True` if the network with the name exists. Returns `False` + r"""Returns `True` if the network with the name exists. Returns `False` otherwise. Args: @@ -171,7 +154,7 @@ def has_layer(self, layer_name): return layer_name in self._layers_by_name def layer_by_name(self, layer_name): - """Returns the layer with the name. Returns 'None' if the layer name + r"""Returns the layer with the name. Returns 'None' if the layer name does not exist. Args: @@ -181,24 +164,24 @@ def layer_by_name(self, layer_name): @property def layers_by_name(self): - """A dictionary mapping layer names to the layers. + r"""A dictionary mapping layer names to the layers. """ return self._layers_by_name @property def layers(self): - """A list of the layers. + r"""A list of the layers. """ return self._layers @property def layer_names(self): - """A list of uniquified layer names. + r"""A list of uniquified layer names. """ return self._layer_names def layer_outputs_by_name(self, layer_name): - """Returns the output tensors of the layer with the specified name. + r"""Returns the output tensors of the layer with the specified name. Returns `None` if the layer name does not exist. 
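A small sketch of the layer-management API above, assuming the post-patch `tf.keras`-based layers; the layer names shown are illustrative, since actual names are uniquified from each layer's own `name`:

.. code-block:: python

    import tensorflow as tf
    import texar.tf as tx

    nn = tx.modules.FeedForwardNetwork(hparams={"layers": []})
    nn.append_layer({"type": "Dense", "kwargs": {"units": 256, "name": "dense_1"}})
    nn.append_layer(tf.keras.layers.Dropout(rate=0.5, name="drop_1"))

    nn.has_layer("dense_1")      # -> True
    nn.layer_by_name("dense_1")  # -> the Dense layer, or None if absent
    nn.layer_names               # -> ["dense_1", "drop_1"]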
        Args:
@@ -208,6 +191,6 @@ def layer_outputs_by_name(self, layer_name):
     @property
     def layer_outputs(self):
-        """A list containing output tensors of each layer.
+        r"""A list containing output tensors of each layer.
         """
         return self._layer_outputs
diff --git a/texar/tf/modules/networks/networks.py b/texar/tf/modules/networks/networks.py
index 2a02292f..a6a9fb18 100644
--- a/texar/tf/modules/networks/networks.py
+++ b/texar/tf/modules/networks/networks.py
@@ -1,4 +1,4 @@
-# Copyright 2018 The Texar Authors. All Rights Reserved.
+# Copyright 2019 The Texar Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,12 +15,6 @@
 Various neural networks and related utilities.
 """
-from __future__ import absolute_import
-from __future__ import print_function
-from __future__ import division
-
-import tensorflow as tf
-
 from texar.tf.modules.networks.network_base import FeedForwardNetworkBase
 from texar.tf.modules.networks.network_base import _build_layers
@@ -30,7 +24,7 @@
 class FeedForwardNetwork(FeedForwardNetworkBase):
-    """Feed-forward neural network that consists of a sequence of layers.
+    r"""Feed-forward neural network that consists of a sequence of layers.
     Args:
         layers (list, optional): A list of :tf_main:`Layer `
@@ -41,8 +35,9 @@ class FeedForwardNetwork(FeedForwardNetworkBase):
        :meth:`default_hparams` for the hyperparameter structure and
        default values.
-    See :meth:`~texar.tf.modules.RNNDecoderBase._build` of
-    :class:`~texar.tf.modules.FeedForwardNetworkBase` for the inputs and outputs.
+    See :meth:`~texar.tf.modules.FeedForwardNetworkBase.call` of
+    :class:`~texar.tf.modules.FeedForwardNetworkBase` for the inputs and
+    outputs.
     Example:
@@ -56,21 +51,19 @@ class FeedForwardNetwork(FeedForwardNetworkBase):
             }
             nn = FeedForwardNetwork(hparams=hparams)
-            inputs = tf.random_uniform([64, 100])
+            inputs = tf.random.uniform([64, 100])
             outputs = nn(inputs)
             # outputs == Tensor of shape [64, 10]
     """
     def __init__(self, layers=None, hparams=None):
-        FeedForwardNetworkBase.__init__(self, hparams)
+        super().__init__(hparams=hparams)
-        with tf.variable_scope(self.variable_scope):
-            _build_layers(
-                self, layers=layers, layer_hparams=self._hparams.layers)
+        _build_layers(self, layers=layers, layer_hparams=self._hparams.layers)
     @staticmethod
     def default_hparams():
-        """Returns a dictionary of hyperparameters with default values.
+        r"""Returns a dictionary of hyperparameters with default values.
         .. code-block:: python
            {
@@ -82,8 +75,9 @@ def default_hparams():
         Here:
         "layers": list
-            A list of layer hyperparameters. See :func:`~texar.tf.core.get_layer`
-            for the details of layer hyperparameters.
+            A list of layer hyperparameters. See
+            :func:`~texar.tf.core.get_layer` for the details of layer
+            hyperparameters.
         "name": str
             Name of the network.
diff --git a/texar/tf/modules/networks/networks_test.py b/texar/tf/modules/networks/networks_test.py
index 5ebf052c..67052d99 100644
--- a/texar/tf/modules/networks/networks_test.py
+++ b/texar/tf/modules/networks/networks_test.py
@@ -2,24 +2,18 @@
 Unit tests for feed forward neural networks.
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - import tensorflow as tf from texar.tf.modules.networks.networks import FeedForwardNetwork -# pylint: disable=no-member, invalid-name - class FeedForwardNetworkTest(tf.test.TestCase): - """Tests the class + r"""Tests the class :class:`~texar.tf.modules.networks.networks.FeedForwardNetwork`. """ def test_feedforward(self): - """Tests feed-forward. + r"""Tests feed-forward. """ hparams = { "layers": [ diff --git a/texar/tf/modules/policies/__init__.py b/texar/tf/modules/policies/__init__.py deleted file mode 100644 index 5ce3ea88..00000000 --- a/texar/tf/modules/policies/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Modules of Texar policies. -""" - -# pylint: disable=wildcard-import - -from texar.tf.modules.policies.policy_nets import * diff --git a/texar/tf/modules/policies/policy_nets.py b/texar/tf/modules/policies/policy_nets.py deleted file mode 100644 index fbe2dd6b..00000000 --- a/texar/tf/modules/policies/policy_nets.py +++ /dev/null @@ -1,333 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Policy models based on feed forward networks. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -import tensorflow as tf -from tensorflow_probability import distributions as tfpd - -from texar.tf.module_base import ModuleBase -from texar.tf.agents.agent_utils import Space -from texar.tf.utils import utils -from texar.tf.utils.dtypes import get_tf_dtype - -# pylint: disable=no-member - -__all__ = [ - 'PolicyNetBase', - 'CategoricalPolicyNet' -] - - -class PolicyNetBase(ModuleBase): - """Policy net that takes in states and outputs actions. - - Args: - network (optional): A network that takes in state and returns - outputs for generating actions. For example, an instance of subclass - of :class:`~texar.tf.modules.FeedForwardNetworkBase`. If `None`, - a network is created as specified in :attr:`hparams`. - network_kwargs (dict, optional): Keyword arguments for network - constructor. - Note that the `hparams` argument for network - constructor is specified in the "network_hparams" field of - :attr:`hparams` and should not be included in `network_kwargs`. - Ignored if :attr:`network` is given. 
- hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparamerter will be set to default values. See - :meth:`default_hparams` for the hyperparameter sturcture and - default values. - """ - def __init__(self, - network=None, - network_kwargs=None, - hparams=None): - ModuleBase.__init__(self, hparams=hparams) - - with tf.variable_scope(self.variable_scope): - self._build_network(network, network_kwargs) - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - .. role:: python(code) - :language: python - - .. code-block:: python - - { - 'network_type': 'FeedForwardNetwork', - 'network_hparams': { - 'layers': [ - { - 'type': 'Dense', - 'kwargs': {'units': 256, 'activation': 'relu'} - }, - { - 'type': 'Dense', - 'kwargs': {'units': 256, 'activation': 'relu'} - }, - ] - }, - 'distribution_kwargs': None, - 'name': 'policy_net', - } - - Here: - - "network_type": str or class or instance - A network that takes in state and returns outputs for - generating actions. This can be a class, its name or module path, - or a class instance. Ignored if `network` is given to the - constructor. - - "network_hparams": dict - Hyperparameters for the network. With the :attr:`network_kwargs` - argument to the constructor, a network is created with - :python:`network_class(**network_kwargs, hparams=network_hparams)`. - - For example, the default values creates a two-layer dense network. - - "distribution_kwargs": dict, optional - Keyword arguments for distribution constructor. A distribution - would be created for action sampling. - - "name": str - Name of the policy. - """ - return { - 'network_type': 'FeedForwardNetwork', - 'network_hparams': { - 'layers': [ - { - 'type': 'Dense', - 'kwargs': {'units': 256, 'activation': 'relu'} - }, - { - 'type': 'Dense', - 'kwargs': {'units': 256, 'activation': 'relu'} - }, - ] - }, - 'distribution_kwargs': None, - 'name': 'policy_net', - '@no_typecheck': ['network_type', 'network_hparams'] - } - - def _build_network(self, network, kwargs): - if network is not None: - self._network = network - else: - kwargs = utils.get_instance_kwargs( - kwargs, self._hparams.network_hparams) - self._network = utils.check_or_get_instance( - self._hparams.network_type, - kwargs, - module_paths=['texar.tf.modules', 'texar.tf.custom']) - - def _build(self, inputs, mode=None): # pylint: disable=arguments-differ - raise NotImplementedError - - @property - def network(self): - """The network. - """ - return self._network - - -# TODO(zhiting): Allow structured discrete actions. -class CategoricalPolicyNet(PolicyNetBase): - """Policy net with Categorical distribution for discrete scalar actions. - - This is a combination of a network with a top-layer distribution for - action sampling. - - Args: - action_space (optional): An instance of :class:`~texar.tf.agents.Space` - specifying the action space. If not given, an discrete action space - `[0, high]` is created with `high` specified in :attr:`hparams`. - network (optional): A network that takes in state and returns - outputs for generating actions. For example, an instance of subclass - of :class:`~texar.tf.modules.FeedForwardNetworkBase`. If `None`, - a network is created as specified in :attr:`hparams`. - network_kwargs (dict, optional): Keyword arguments for network - constructor. - Note that the `hparams` argument for network - constructor is specified in the "network_hparams" field of - :attr:`hparams` and should not be included in `network_kwargs`. 
- Ignored if :attr:`network` is given. - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparamerter will be set to default values. See - :meth:`default_hparams` for the hyperparameter sturcture and - default values. - - .. document private functions - .. automethod:: _build - """ - - def __init__(self, - action_space=None, - network=None, - network_kwargs=None, - hparams=None): - PolicyNetBase.__init__(self, hparams=hparams) - - with tf.variable_scope(self.variable_scope): - if action_space is None: - action_space = Space( - low=0, high=self._hparams.action_space, dtype=np.int32) - self._action_space = action_space - self._append_output_layer() - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - - .. code-block:: python - - { - 'network_type': 'FeedForwardNetwork', - 'network_hparams': { - 'layers': [ - { - 'type': 'Dense', - 'kwargs': {'units': 256, 'activation': 'relu'} - }, - { - 'type': 'Dense', - 'kwargs': {'units': 256, 'activation': 'relu'} - }, - ] - }, - 'distribution_kwargs': { - 'dtype': 'int32', - 'validate_args': False, - 'allow_nan_stats': True - }, - 'action_space': 2, - 'make_output_layer': True, - 'name': 'categorical_policy_net' - } - - Here: - - "distribution_kwargs": dict - Keyword arguments for the :tf_main:`Categorical - ` distribution constructor. Arguments - `logits` and `probs` should not be included as they are inferred - from the inputs. Argument `dtype` can be a string (e.g., `int32`) - and will be converted to a corresponding tf dtype. - - "action_space": int - Upper bound of the action space. The resulting action space is - all discrete scalar numbers between 0 and the upper bound specified - here (both inclusive). - - "make_output_layer": bool - Whether to append a dense layer to the network to transform - features to logits for action sampling. If `False`, the final layer - output of network must match the action space. - - See :class:`~texar.tf.modules.PolicyNetBase.default_hparams` for details - of other hyperparameters. - """ - hparams = PolicyNetBase.default_hparams() - hparams.update({ - 'distribution_kwargs': { - 'dtype': 'int32', - 'validate_args': False, - 'allow_nan_stats': True - }, - 'action_space': 2, - 'make_output_layer': True, - 'name': 'categorical_policy_net' - }) - return hparams - - def _append_output_layer(self): - if not self._hparams.make_output_layer: - return - - if self._action_space.shape != (): - raise ValueError('Only scalar discrete action is supported.') - else: - output_size = self._action_space.high - self._action_space.low - - layer_hparams = { - 'type': 'Dense', - 'kwargs': {'units': output_size} - } - self._network.append_layer(layer_hparams) - - def _build(self, inputs, mode=None): - """Takes in states and outputs actions. - - Args: - inputs: Inputs to the policy network with the first dimension - the batch dimension. - mode (optional): A tensor taking value in - :tf_main:`tf.estimator.ModeKeys `, including - `TRAIN`, `EVAL`, and `PREDICT`. If `None`, - :func:`texar.tf.global_mode` is used. - - Returns - A `dict` including fields `"logits"`, `"action"`, and `"dist"`, - where - - - **"logits"**: A Tensor of shape \ - `[batch_size] + action_space size` used for categorical \ - distribution sampling. - - **"action"**: A Tensor of shape \ - `[batch_size] + action_space.shape`. - - **"dist"**: The \ - :tf_main:`Categorical ` based on the \ - logits. 
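The removed unit test exercised this return contract directly; a condensed sketch (TF1 graph mode, `tensorflow_probability` required; the default output layer has `high - low = 2` units, i.e. two discrete actions):

.. code-block:: python

    import tensorflow as tf
    from texar.tf.modules.policies.policy_nets import CategoricalPolicyNet

    policy = CategoricalPolicyNet()
    states = tf.random_uniform(shape=[64, 4])  # batch of 4-dim observations
    outputs = policy(states)

    outputs["logits"]  # Tensor of shape [64, 2], parameterizes the Categorical
    outputs["action"]  # Tensor of shape [64], one sampled action per state
    outputs["dist"]    # the tfp.distributions.Categorical instance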
- """ - logits = self._network(inputs, mode=mode) - - dkwargs = self._hparams.distribution_kwargs.todict() - dkwargs['dtype'] = get_tf_dtype(dkwargs['dtype']) - dist = tfpd.Categorical(logits=logits, **dkwargs) - - action = dist.sample() - to_shape = [-1] # for batch dimension - to_shape.extend(list(self._action_space.shape)) - action = tf.reshape(action, to_shape) - - outputs = { - "logits": logits, - "action": action, - "dist": dist - } - - if not self._built: - self._add_internal_trainable_variables() - self._add_trainable_variable(self._network.trainable_variables) - self._built = True - - return outputs - - @property - def action_space(self): - """An instance of :class:`~texar.tf.agents.Space` specifiying the - action space. - """ - return self._action_space diff --git a/texar/tf/modules/policies/policy_nets_test.py b/texar/tf/modules/policies/policy_nets_test.py deleted file mode 100644 index ff4a04da..00000000 --- a/texar/tf/modules/policies/policy_nets_test.py +++ /dev/null @@ -1,41 +0,0 @@ -# -""" -Tests policy nets. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import tensorflow as tf -from tensorflow_probability import distributions as tfpd - -from texar.tf.modules.policies.policy_nets import CategoricalPolicyNet - - -class CategoricalPolicyNetTest(tf.test.TestCase): - """Tests :class:`texar.tf.modules.CategoricalPolicyNet`. - """ - - def test_categorical_policy(self): - """Tests logics. - """ - policy = CategoricalPolicyNet() - - inputs = tf.random_uniform(shape=[1, 4]) - outputs = policy(inputs=inputs) - self.assertEqual(list(outputs['action'].shape[1:]), - list(policy.action_space.shape)) - self.assertIsInstance(outputs['dist'], - tfpd.Categorical) - - inputs = tf.random_uniform(shape=[64, 4]) - outputs = policy(inputs=inputs) - self.assertEqual(list(outputs['action'].shape[1:]), - list(policy.action_space.shape)) - self.assertEqual(int(outputs['action'].shape[0]), 64) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/modules/pretrained/__init__.py b/texar/tf/modules/pretrained/__init__.py index 7e272000..432ec8fe 100644 --- a/texar/tf/modules/pretrained/__init__.py +++ b/texar/tf/modules/pretrained/__init__.py @@ -17,5 +17,3 @@ from texar.tf.modules.pretrained.pretrained_base import * from texar.tf.modules.pretrained.bert import * -from texar.tf.modules.pretrained.gpt2 import * -from texar.tf.modules.pretrained.xlnet import * diff --git a/texar/tf/modules/pretrained/bert.py b/texar/tf/modules/pretrained/bert.py index b1598604..ad52a0d4 100644 --- a/texar/tf/modules/pretrained/bert.py +++ b/texar/tf/modules/pretrained/bert.py @@ -15,19 +15,10 @@ Utils of BERT Modules. 
""" -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division -from __future__ import unicode_literals - -import collections import json import os -import re - -from abc import ABCMeta -import tensorflow as tf +from abc import ABC from texar.tf.modules.pretrained.pretrained_base import PretrainedMixin @@ -36,45 +27,91 @@ ] _BERT_PATH = "https://storage.googleapis.com/bert_models/" +_BIOBERT_PATH = "https://github.com/naver/biobert-pretrained/releases/download/" +_SCIBERT_PATH = "https://s3-us-west-2.amazonaws.com/ai2-s2-research/" \ + "scibert/tensorflow_models/" -class PretrainedBERTMixin(PretrainedMixin): +class PretrainedBERTMixin(PretrainedMixin, ABC): r"""A mixin class to support loading pre-trained checkpoints for modules that implement the BERT model. - The BERT model was proposed in (`Devlin et al`. 2018) - `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ - . A bidirectional Transformer language model pre-trained on large text - corpora. Available model names include: - - * ``bert-base-uncased``: 12-layer, 768-hidden, 12-heads, - 110M parameters. - * ``bert-large-uncased``: 24-layer, 1024-hidden, 16-heads, - 340M parameters. - * ``bert-base-cased``: 12-layer, 768-hidden, 12-heads , 110M parameters. - * ``bert-large-cased``: 24-layer, 1024-hidden, 16-heads, - 340M parameters. - * ``bert-base-multilingual-uncased``: 102 languages, 12-layer, - 768-hidden, 12-heads, 110M parameters. - * ``bert-base-multilingual-cased``: 104 languages, 12-layer, 768-hidden, - 12-heads, 110M parameters. - * ``bert-base-chinese``: Chinese Simplified and Traditional, 12-layer, - 768-hidden, 12-heads, 110M parameters. + Both standard BERT models and many domain specific BERT-based models are + supported. You can specify the :attr:`pretrained_model_name` argument to + pick which pre-trained BERT model to use. All available categories of + pre-trained models (and names) include: + + * **Standard BERT**: proposed in (`Devlin et al`. 2018) + `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ + . A bidirectional Transformer language model pre-trained on large text + corpora. Available model names include: + + * ``bert-base-uncased``: 12-layer, 768-hidden, 12-heads, + 110M parameters. + * ``bert-large-uncased``: 24-layer, 1024-hidden, 16-heads, + 340M parameters. + * ``bert-base-cased``: 12-layer, 768-hidden, 12-heads , 110M parameters. + * ``bert-large-cased``: 24-layer, 1024-hidden, 16-heads, + 340M parameters. + * ``bert-base-multilingual-uncased``: 102 languages, 12-layer, + 768-hidden, 12-heads, 110M parameters. + * ``bert-base-multilingual-cased``: 104 languages, 12-layer, 768-hidden, + 12-heads, 110M parameters. + * ``bert-base-chinese``: Chinese Simplified and Traditional, 12-layer, + 768-hidden, 12-heads, 110M parameters. + + * **BioBERT**: proposed in (`Lee et al`. 2019) + `BioBERT: a pre-trained biomedical language representation model for biomedical text mining`_ + . A domain specific language representation model pre-trained on + large-scale biomedical corpora. Based on the BERT architecture, BioBERT + effectively transfers the knowledge from a large amount of biomedical + texts to biomedical text mining models with minimal task-specific + architecture modifications. Available model names include: + + * ``biobert-v1.0-pmc``: BioBERT v1.0 (+ PMC 270K) - based on + BERT-base-Cased (same vocabulary). 
+      * ``biobert-v1.0-pubmed-pmc``: BioBERT v1.0 (+ PubMed 200K + PMC 270K) -
+        based on BERT-base-Cased (same vocabulary).
+      * ``biobert-v1.0-pubmed``: BioBERT v1.0 (+ PubMed 200K) - based on
+        BERT-base-Cased (same vocabulary).
+      * ``biobert-v1.1-pubmed``: BioBERT v1.1 (+ PubMed 1M) - based on
+        BERT-base-Cased (same vocabulary).
+
+    * **SciBERT**: proposed in (`Beltagy et al`. 2019)
+      `SciBERT: A Pretrained Language Model for Scientific Text`_. A BERT model
+      trained on scientific text. SciBERT leverages unsupervised pre-training
+      on a large multi-domain corpus of scientific publications to improve
+      performance on downstream scientific NLP tasks. Available model
+      names include:
+
+      * ``scibert-scivocab-uncased``: Uncased version of the model trained
+        on its own vocabulary.
+      * ``scibert-scivocab-cased``: Cased version of the model trained on
+        its own vocabulary.
+      * ``scibert-basevocab-uncased``: Uncased version of the model trained
+        on the original BERT vocabulary.
+      * ``scibert-basevocab-cased``: Cased version of the model trained on
+        the original BERT vocabulary.
     We provide the following BERT classes:
-    * :class:`~texar.tf.modules.BERTEncoder` for text encoding.
-    * :class:`~texar.tf.modules.BERTClassifier` for text classification and
+    * :class:`~texar.tf.modules.BERTEncoder` for text encoding.
+    * :class:`~texar.tf.modules.BERTClassifier` for text classification and
       sequence tagging.
     .. _`BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`:
        https://arxiv.org/abs/1810.04805
-    """
-    __metaclass__ = ABCMeta
+    .. _`BioBERT: a pre-trained biomedical language representation model for biomedical text mining`:
+       https://arxiv.org/abs/1901.08746
+
+    .. _`SciBERT: A Pretrained Language Model for Scientific Text`:
+       https://arxiv.org/abs/1903.10676
+    """
     _MODEL_NAME = "BERT"
     _MODEL2URL = {
+        # Standard BERT
         'bert-base-uncased':
             _BERT_PATH + "2018_10_18/uncased_L-12_H-768_A-12.zip",
         'bert-large-uncased':
@@ -89,6 +126,48 @@ class PretrainedBERTMixin(PretrainedMixin):
             _BERT_PATH + "2018_11_03/multilingual_L-12_H-768_A-12.zip",
         'bert-base-chinese':
             _BERT_PATH + "2018_11_03/chinese_L-12_H-768_A-12.zip",
+
+        # BioBERT
+        'biobert-v1.0-pmc':
+            _BIOBERT_PATH + 'v1.0-pmc/biobert_v1.0_pmc.tar.gz',
+        'biobert-v1.0-pubmed-pmc':
+            _BIOBERT_PATH + 'v1.0-pubmed-pmc/biobert_v1.0_pubmed_pmc.tar.gz',
+        'biobert-v1.0-pubmed':
+            _BIOBERT_PATH + 'v1.0-pubmed/biobert_v1.0_pubmed.tar.gz',
+        'biobert-v1.1-pubmed':
+            _BIOBERT_PATH + 'v1.1-pubmed/biobert_v1.1_pubmed.tar.gz',
+
+        # SciBERT
+        'scibert-scivocab-uncased':
+            _SCIBERT_PATH + 'scibert_scivocab_uncased.tar.gz',
+        'scibert-scivocab-cased':
+            _SCIBERT_PATH + 'scibert_scivocab_cased.tar.gz',
+        'scibert-basevocab-uncased':
+            _SCIBERT_PATH + 'scibert_basevocab_uncased.tar.gz',
+        'scibert-basevocab-cased':
+            _SCIBERT_PATH + 'scibert_basevocab_cased.tar.gz',
+    }
+    _MODEL2CKPT = {
+        # Standard BERT
+        'bert-base-uncased': 'bert_model.ckpt',
+        'bert-large-uncased': 'bert_model.ckpt',
+        'bert-base-cased': 'bert_model.ckpt',
+        'bert-large-cased': 'bert_model.ckpt',
+        'bert-base-multilingual-uncased': 'bert_model.ckpt',
+        'bert-base-multilingual-cased': 'bert_model.ckpt',
+        'bert-base-chinese': 'bert_model.ckpt',
+
+        # BioBERT
+        'biobert-v1.0-pmc': 'biobert_model.ckpt',
+        'biobert-v1.0-pubmed-pmc': 'biobert_model.ckpt',
+        'biobert-v1.0-pubmed': 'biobert_model.ckpt',
+        'biobert-v1.1-pubmed': 'model.ckpt-1000000',
+
+        # SciBERT
+        'scibert-scivocab-uncased': 'bert_model.ckpt',
+        'scibert-scivocab-cased': 'bert_model.ckpt',
+
'scibert-basevocab-uncased': 'bert_model.ckpt', + 'scibert-basevocab-cased': 'bert_model.ckpt', } @classmethod @@ -166,88 +245,5 @@ def _transform_config(cls, pretrained_model_name, cache_dir): return configs def _init_from_checkpoint(self, pretrained_model_name, - cache_dir, scope_name, **kwargs): - tvars = tf.trainable_variables() - init_checkpoint = os.path.abspath(os.path.join(cache_dir, - 'bert_model.ckpt')) - if init_checkpoint: - assignment_map, initialized_variable_names = \ - self._get_assignment_map_from_checkpoint( - tvars, init_checkpoint, scope_name) - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - - def _get_assignment_map_from_checkpoint(self, tvars, init_checkpoint, - scope_name): - r"""`https://github.com/google-research/bert/blob/master/modeling.py` - - Compute the union of the current variables and checkpoint variables. - Because the variable scope of the original BERT and Texar - implementation, we need to build a assignment map to match the - variables. - """ - initialized_variable_names = {} - - name_to_variable = collections.OrderedDict() - for var in tvars: - name = var.name - m = re.match("^(.*):\\d+$", name) - if m is not None: - name = m.group(1) - name_to_variable[name] = var - - init_vars = tf.train.list_variables(init_checkpoint) - - assignment_map = { - 'bert/embeddings/word_embeddings': - scope_name + '/word_embeddings/w', - 'bert/embeddings/token_type_embeddings': - scope_name + '/token_type_embeddings/w', - 'bert/embeddings/position_embeddings': - scope_name + '/position_embeddings/w', - 'bert/embeddings/LayerNorm/beta': - scope_name + '/encoder/LayerNorm/beta', - 'bert/embeddings/LayerNorm/gamma': - scope_name + '/encoder/LayerNorm/gamma', - } - for check_name, model_name in assignment_map.items(): - initialized_variable_names[model_name] = 1 - initialized_variable_names[model_name + ":0"] = 1 - - for check_name, _ in init_vars: - if check_name.startswith('bert'): - if check_name.startswith('bert/embeddings'): - continue - check_name_scope = check_name.replace("bert/", scope_name + '/') - model_name = re.sub( - 'layer_\\d+/output/dense', - lambda x: x.group(0).replace('output/dense', 'ffn/output'), - check_name_scope) - if model_name == check_name_scope: - model_name = re.sub( - 'layer_\\d+/output/LayerNorm', - lambda x: x.group(0).replace('output/LayerNorm', - 'ffn/LayerNorm'), - check_name_scope) - if model_name == check_name_scope: - model_name = re.sub( - 'layer_\\d+/intermediate/dense', - lambda x: x.group(0).replace('intermediate/dense', - 'ffn/intermediate'), - check_name_scope) - if model_name == check_name_scope: - model_name = re.sub('attention/output/dense', - 'attention/self/output', - check_name_scope) - if model_name == check_name_scope: - model_name = check_name_scope.replace( - 'attention/output/LayerNorm', 'output/LayerNorm') - - if model_name in name_to_variable.keys(): - assignment_map[check_name] = model_name - initialized_variable_names[model_name] = 1 - initialized_variable_names[model_name + ":0"] = 1 - else: - tf.logging.info( - 'model name:{} not exist'.format(model_name)) - - return assignment_map, initialized_variable_names + cache_dir, **kwargs): + return diff --git a/texar/tf/modules/pretrained/bert_test.py b/texar/tf/modules/pretrained/bert_test.py index 8a221386..78b2eafd 100644 --- a/texar/tf/modules/pretrained/bert_test.py +++ b/texar/tf/modules/pretrained/bert_test.py @@ -2,11 +2,6 @@ Unit tests for BERT utils. 
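A short sketch of fetching any of the checkpoints listed above (requires network access; `available_checkpoints` is assumed to list the `_MODEL2URL` keys):

.. code-block:: python

    from texar.tf.modules.pretrained.bert import PretrainedBERTMixin

    print(PretrainedBERTMixin.available_checkpoints())
    # e.g. ['bert-base-uncased', ..., 'biobert-v1.1-pubmed',
    #       'scibert-scivocab-uncased', ...]

    cache_dir = PretrainedBERTMixin.download_checkpoint(
        pretrained_model_name='scibert-scivocab-uncased')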
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - import os import tensorflow as tf diff --git a/texar/tf/modules/pretrained/gpt2.py b/texar/tf/modules/pretrained/gpt2.py deleted file mode 100644 index 446afa10..00000000 --- a/texar/tf/modules/pretrained/gpt2.py +++ /dev/null @@ -1,347 +0,0 @@ -# Copyright 2019 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Utils of GPT2 Modules. -""" - -import collections -import json -import os -import re -import warnings - -from abc import ABC -from typing import Any, Dict - -import tensorflow as tf -import numpy as np - -from texar.tf.modules.pretrained.pretrained_base import PretrainedMixin - -__all__ = [ - "PretrainedGPT2Mixin", -] - -_GPT2_PATH = "https://storage.googleapis.com/gpt-2/models/" -_CHECKPOINT_FILES = [ - "checkpoint", "encoder.json", "hparams.json", "vocab.bpe", - "model.ckpt.data-00000-of-00001", "model.ckpt.index", "model.ckpt.meta"] - - -class PretrainedGPT2Mixin(PretrainedMixin, ABC): - r"""A mixin class to support loading pre-trained checkpoints for modules - that implement the GPT2 model. - - The GPT2 model was proposed in - `Language Models are Unsupervised Multitask Learners`_ - by `Radford et al.` from OpenAI. It is a unidirectional Transformer model - pre-trained using the vanilla language modeling objective on a large corpus. - - The available GPT2 models are as follows: - - * ``gpt2-small``: Small version of GPT-2, 124M parameters. - * ``gpt2-medium``: Medium version of GPT-2, 355M parameters. - * ``gpt2-large``: Large version of GPT-2, 774M parameters. - * ``gpt2-xl``: XL version of GPT-2, 1558M parameters. - - We provide the following GPT2 classes: - - * :class:`~texar.tf.modules.GPT2Encoder` for text encoding. - * :class:`~texar.tf.modules.GPT2Decoder` for text generation and - decoding. - * :class:`~texar.tf.modules.GPT2Classifier` for text classification and - sequence tagging. - - .. 
_`Language Models are Unsupervised Multitask Learners`: - https://openai.com/blog/better-language-models/ - """ - _IS_DECODE = False - _MODEL_NAME = "GPT2" - _MODEL2URL = { - 'gpt2-small': [_GPT2_PATH + f"124M/{file}" - for file in _CHECKPOINT_FILES], - 'gpt2-medium': [_GPT2_PATH + f"355M/{file}" - for file in _CHECKPOINT_FILES], - 'gpt2-large': [_GPT2_PATH + f"774M/{file}" - for file in _CHECKPOINT_FILES], - 'gpt2-xl': [_GPT2_PATH + f"1558M/{file}" - for file in _CHECKPOINT_FILES], - } - - # Raise warning for the deprecated pre-trained model names - class MyDict(dict): - def __contains__(self, key): - if key == '117M': - warnings.warn("Pre-trained model name '117M' is deprecated, " - "use 'gpt2-small' instead.", UserWarning) - return True - elif key == '345M': - warnings.warn("Pre-trained model name '345M' is deprecated, " - "use 'gpt2-medium' instead.", UserWarning) - return True - else: - return super().__contains__(key) - - _DEPRECATED_MODEL2URL = { - '117M': [_GPT2_PATH + f"124M/{file}" for file in _CHECKPOINT_FILES], - '345M': [_GPT2_PATH + f"355M/{file}" for file in _CHECKPOINT_FILES], - } - _MODEL2URL.update(_DEPRECATED_MODEL2URL) - _MODEL2URL = MyDict(_MODEL2URL) # type: ignore - - def _transform_config(self, pretrained_model_name: str, - cache_dir: str) -> Dict[str, Any]: - info = list(os.walk(cache_dir)) - root, _, files = info[0] - config_path = None - for file in files: - if file.endswith('hparams.json'): - config_path = os.path.join(root, file) - if config_path is None: - raise ValueError(f"Cannot find the config file in {cache_dir}") - - with open(config_path) as f: - config_gpt = json.loads(f.read()) - - hidden_dim = config_gpt["n_embd"] - configs = { - "vocab_size": config_gpt["n_vocab"], - "context_size": config_gpt["n_ctx"], - "embedding_size": config_gpt["n_embd"], "embed": { - "dim": hidden_dim, - }, - "position_size": config_gpt["n_ctx"], - "position_embed": { - "dim": hidden_dim - } - } - - module_name = "decoder" if self._IS_DECODE else "encoder" - configs.update({module_name: { - "dim": hidden_dim, - "num_blocks": config_gpt["n_layer"], - "embedding_dropout": 0, - "residual_dropout": 0, - "multihead_attention": { - "use_bias": True, - "num_units": hidden_dim, - "num_heads": config_gpt["n_head"], - "output_dim": hidden_dim, - }, - "initializer": { - "type": "variance_scaling_initializer", - "kwargs": { - 'factor': 1.0, - 'mode': 'FAN_AVG', - 'uniform': True - }, - }, - "poswise_feedforward": { - "layers": [ - { - "type": "Dense", - "kwargs": { - 'name': 'intermediate', - 'activation': 'gelu', - "units": hidden_dim * 4, - "use_bias": True, - } - }, - { - "type": "Dense", - "kwargs": { - 'activation': None, - 'name': 'output', - "units": hidden_dim, - "use_bias": True, - } - } - ], - }, - }}) - return configs - - def _init_from_checkpoint(self, pretrained_model_name, cache_dir, - scope_name, load_output_layer=True, **kwargs): - r"""Initialize model parameters from weights stored in the pre-trained - checkpoint. - - Args: - pretrained_model_name (str): Name of the pre-trained model. - cache_dir (str): Path to the cache directory. - scope_name (str): Scope name of the model. - load_output_layer (bool): If `False`, will not load weights of the - output layer. Set this argument to `False` when loading weights - into a GPT2 encoder. Defaults to `True`. 
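As a sketch of how this flag is meant to be used, with constructor signatures assumed from this release's GPT2 modules (a decoder loads the full LM, while an encoder initializes without the output-layer weights):

.. code-block:: python

    import texar.tf as tx

    # Full language model, including the output layer.
    decoder = tx.modules.GPT2Decoder(pretrained_model_name="gpt2-small")

    # Encoder variant; checkpoint loading skips the output layer.
    encoder = tx.modules.GPT2Encoder(pretrained_model_name="gpt2-small")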
- """ - init_checkpoint = os.path.abspath(os.path.join(cache_dir, - 'model.ckpt')) - ckpt = tf.train.load_checkpoint(init_checkpoint) - ckpt_params = {key: ckpt.get_tensor(key) for key in - ckpt.get_variable_to_shape_map().keys()} - - tvars = tf.trainable_variables() - name_to_variable = collections.OrderedDict() - for var in tvars: - name = var.name - m = re.match("^(.*):\\d+$", name) - if m is not None: - name = m.group(1) - name_to_variable[name] = var - - if load_output_layer: - global_tensor_map = { - 'model/wte': scope_name + '/word_embeddings/w', - 'model/wpe': scope_name + '/position_embeddings/w', - 'model/ln_f/b': scope_name + '/decoder/beta', - 'model/ln_f/g': scope_name + '/decoder/gamma', - } - - layer_tensor_map = { - "ln_1/b": scope_name + '/layer_{}/beta', - "ln_1/g": scope_name + '/layer_{}/gamma', - "ln_2/b": scope_name + '/layer_{}/past_poswise_ln/beta', - "ln_2/g": scope_name + '/layer_{}/past_poswise_ln/gamma', - "mlp/c_fc/b": scope_name + '/decoder/layer_{}' - '/ffn/intermediate/bias', - "mlp/c_fc/w": scope_name + '/decoder/layer_{}' - '/ffn/intermediate/kernel', - "mlp/c_proj/b": scope_name + '/decoder/layer_{}/ffn/output/' - 'bias', - "mlp/c_proj/w": scope_name + '/decoder/layer_{}/ffn/output/' - 'kernel', - "attn/c_attn/b": None, - "attn/c_attn/w": None, - "attn/c_proj/b": scope_name + '/decoder/layer_{}' - '/self_attention/self/output/' - 'bias', - "attn/c_proj/w": scope_name + '/decoder/layer_{}' - '/self_attention/self/output/' - 'kernel', - } - else: - global_tensor_map = { - 'model/wte': scope_name + '/word_embeddings/w', - 'model/wpe': scope_name + '/position_embeddings/w', - 'model/ln_f/b': scope_name + '/encoder/LayerNorm/beta', - 'model/ln_f/g': scope_name + '/encoder/LayerNorm/gamma', - } - - layer_tensor_map = { - "ln_1/b": scope_name + '/encoder/layer_{}/LayerNorm/beta', - "ln_1/g": scope_name + '/encoder/layer_{}/LayerNorm/gamma', - "ln_2/b": scope_name + '/encoder/layer_{}/output/' - 'LayerNorm/beta', - "ln_2/g": scope_name + '/encoder/layer_{}/output/' - 'LayerNorm/gamma', - "mlp/c_fc/b": scope_name + '/encoder/layer_{}' - '/ffn/intermediate/bias', - "mlp/c_fc/w": scope_name + '/encoder/layer_{}' - '/ffn/intermediate/kernel', - "mlp/c_proj/b": scope_name + '/encoder/layer_{}/ffn/output/' - 'bias', - "mlp/c_proj/w": scope_name + '/encoder/layer_{}/ffn/output/' - 'kernel', - "attn/c_attn/b": None, - "attn/c_attn/w": None, - "attn/c_proj/b": scope_name + '/encoder/layer_{}' - '/attention/self/output/bias', - "attn/c_proj/w": scope_name + '/encoder/layer_{}' - '/attention/self/output/kernel', - } - - for name, array in ckpt_params.items(): - if name in global_tensor_map: - v_name = global_tensor_map[name] - pointer = name_to_variable[v_name] - pointer._initializer_op = tf.assign(pointer._variable, array) - else: - name_tmp = name.split("/") - layer_no = name_tmp[1][1:] - name = "/".join(name_tmp[2:]) - - if name in layer_tensor_map: - if name == "attn/c_attn/b": - if load_output_layer: - K = name_to_variable[ - scope_name + '/decoder/layer_' + layer_no + - '/self_attention/self/key/bias'] - Q = name_to_variable[ - scope_name + '/decoder/layer_' + layer_no + - '/self_attention/self/query/bias'] - V = name_to_variable[ - scope_name + '/decoder/layer_' + layer_no + - '/self_attention/self/value/bias'] - else: - K = name_to_variable[ - scope_name + '/encoder/layer_' + layer_no + - '/attention/self/key/bias'] - Q = name_to_variable[ - scope_name + '/encoder/layer_' + layer_no + - '/attention/self/query/bias'] - V = name_to_variable[ - scope_name + 
'/encoder/layer_' + layer_no + - '/attention/self/value/bias'] - - index_d = array.shape[-1] // 3 - - Q_w = array[:index_d] - K_w = array[index_d: 2 * index_d] - V_w = array[2 * index_d:] - - K._initializer_op = tf.assign(K._variable, K_w) - Q._initializer_op = tf.assign(Q._variable, Q_w) - V._initializer_op = tf.assign(V._variable, V_w) - elif name == "attn/c_attn/w": - if load_output_layer: - K = name_to_variable[ - scope_name + '/decoder/layer_' + layer_no + - '/self_attention/self/key/kernel'] - Q = name_to_variable[ - scope_name + '/decoder/layer_' + layer_no + - '/self_attention/self/query/kernel'] - V = name_to_variable[ - scope_name + '/decoder/layer_' + layer_no + - '/self_attention/self/value/kernel'] - else: - K = name_to_variable[ - scope_name + '/encoder/layer_' + layer_no + - '/attention/self/key/kernel'] - Q = name_to_variable[ - scope_name + '/encoder/layer_' + layer_no + - '/attention/self/query/kernel'] - V = name_to_variable[ - scope_name + '/encoder/layer_' + layer_no + - '/attention/self/value/kernel'] - - index_d = array.shape[-1] // 3 - - Q_w = np.transpose(array[0, :, :index_d]) - K_w = np.transpose(array[0, :, index_d: 2 * index_d]) - V_w = np.transpose(array[0, :, 2 * index_d:]) - - K._initializer_op = tf.assign(K._variable, K_w) - Q._initializer_op = tf.assign(Q._variable, Q_w) - V._initializer_op = tf.assign(V._variable, V_w) - elif (name == "attn/c_proj/w" or name == "mlp/c_fc/w" or - name == "mlp/c_proj/w"): - v_name = layer_tensor_map[name] - pointer = name_to_variable[v_name.format(layer_no)] - pointer._initializer_op = tf.assign(pointer._variable, - array[0]) - else: - v_name = layer_tensor_map[name] - pointer = name_to_variable[v_name.format(layer_no)] - pointer._initializer_op = tf.assign(pointer._variable, - array) diff --git a/texar/tf/modules/pretrained/gpt2_test.py b/texar/tf/modules/pretrained/gpt2_test.py deleted file mode 100644 index 06c3f699..00000000 --- a/texar/tf/modules/pretrained/gpt2_test.py +++ /dev/null @@ -1,95 +0,0 @@ -""" -Unit tests for GPT2 utils. -""" - -import os -import tensorflow as tf - -from texar.tf.modules.pretrained.gpt2 import * -from texar.tf.utils.test import pretrained_test - - -class GPT2UtilsTest(tf.test.TestCase): - r"""Tests GPT2 utils. 
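The fused-kernel surgery above can be illustrated standalone: GPT-2 checkpoints store each attention projection as one `[1, d, 3d]` tensor ("attn/c_attn/w"), which is cut into query/key/value thirds and transposed to the target kernel layout. A toy numpy sketch:

.. code-block:: python

    import numpy as np

    d = 4                                 # toy hidden size
    fused = np.random.randn(1, d, 3 * d)  # mimics ckpt tensor "attn/c_attn/w"
    index_d = fused.shape[-1] // 3

    q_w = np.transpose(fused[0, :, :index_d])
    k_w = np.transpose(fused[0, :, index_d: 2 * index_d])
    v_w = np.transpose(fused[0, :, 2 * index_d:])
    assert q_w.shape == k_w.shape == v_w.shape == (d, d)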
- """ - - @pretrained_test - def test_load_pretrained_gpt2_AND_transform_gpt2_to_texar_config(self): - pretrained_model_dir = PretrainedGPT2Mixin.download_checkpoint( - pretrained_model_name="gpt2-small") - - info = list(os.walk(pretrained_model_dir)) - _, _, files = info[0] - self.assertIn('checkpoint', files) - self.assertIn('encoder.json', files) - self.assertIn('hparams.json', files) - self.assertIn('model.ckpt.data-00000-of-00001', files) - self.assertIn('model.ckpt.index', files) - self.assertIn('model.ckpt.meta', files) - self.assertIn('vocab.bpe', files) - - model_config = PretrainedGPT2Mixin._transform_config( - pretrained_model_name="gpt2-small", - cache_dir=pretrained_model_dir) - - exp_config = { - 'vocab_size': 50257, - 'context_size': 1024, - 'embedding_size': 768, - 'embed': { - 'dim': 768 - }, - 'position_size': 1024, - 'position_embed': { - 'dim': 768 - }, - - 'encoder': { - 'dim': 768, - 'num_blocks': 12, - 'embedding_dropout': 0, - 'residual_dropout': 0, - 'multihead_attention': { - 'use_bias': True, - 'num_units': 768, - 'num_heads': 12, - 'output_dim': 768 - }, - 'initializer': { - 'type': 'variance_scaling_initializer', - 'kwargs': { - 'factor': 1.0, - 'mode': 'FAN_AVG', - 'uniform': True - } - }, - 'poswise_feedforward': { - 'layers': [ - { - "type": "Dense", - "kwargs": { - 'name': 'intermediate', - 'activation': 'gelu', - "units": 3072, - "use_bias": True, - } - }, - { - "type": "Dense", - "kwargs": { - 'activation': None, - 'name': 'output', - "units": 768, - "use_bias": True, - } - } - ] - } - } - } - - self.assertDictEqual(model_config, exp_config) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/modules/pretrained/pretrained_base.py b/texar/tf/modules/pretrained/pretrained_base.py index bbbbeba9..e778c6db 100644 --- a/texar/tf/modules/pretrained/pretrained_base.py +++ b/texar/tf/modules/pretrained/pretrained_base.py @@ -14,21 +14,17 @@ """ Base class for Pre-trained Modules. """ - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division -from __future__ import unicode_literals - import os import sys -from abc import ABCMeta, abstractmethod +from abc import ABC, abstractmethod from pathlib import Path +from typing import Dict, Optional from texar.tf.data.data_utils import maybe_download from texar.tf.hyperparams import HParams -from texar.tf.module_base import ModuleBase +from texar.tf.utils.types import MaybeList + __all__ = [ "default_download_dir", @@ -39,7 +35,7 @@ _default_texar_download_dir = None -def default_download_dir(name): +def default_download_dir(name: str) -> Path: r"""Return the directory to which packages will be downloaded by default. """ global _default_texar_download_dir # pylint: disable=global-statement @@ -49,13 +45,13 @@ def default_download_dir(name): home_dir = Path(os.environ['APPDATA']) else: # Otherwise, install in the user's home directory. - home_dir = Path(os.environ["HOME"]) + home_dir = Path.home() - if os.access(str(home_dir), os.W_OK): + if os.access(home_dir, os.W_OK): _default_texar_download_dir = home_dir / 'texar_data' else: - raise ValueError("The path {} is not writable. Please manually " - "specify the download directory".format(home_dir)) + raise ValueError(f"The path {home_dir} is not writable. 
Please " + f"manually specify the download directory") if not _default_texar_download_dir.exists(): _default_texar_download_dir.mkdir(parents=True) @@ -69,23 +65,22 @@ def set_default_download_dir(path): elif not isinstance(path, Path): raise ValueError("`path` must be a string or a pathlib.Path object") - if not os.access(str(path), os.W_OK): + if not os.access(path, os.W_OK): raise ValueError( - "The specified download directory {} is not writable".format(path)) + f"The specified download directory {path} is not writable") global _default_texar_download_dir # pylint: disable=global-statement _default_texar_download_dir = path -class PretrainedMixin(ModuleBase): +class PretrainedMixin(ABC): r"""A mixin class for all pre-trained classes to inherit. """ - __metaclass__ = ABCMeta - _MODEL_NAME = None - _MODEL2URL = None + _MODEL_NAME: str + _MODEL2URL: Dict[str, MaybeList[str]] - pretrained_model_dir = None + pretrained_model_dir: Optional[str] @classmethod def available_checkpoints(cls): @@ -145,11 +140,11 @@ def load_pretrained_config(self, self._hparams = HParams( pretrained_model_hparams, self._hparams.todict()) - def init_pretrained_weights(self, scope_name, **kwargs): + def init_pretrained_weights(self, *args, **kwargs): if self.pretrained_model_dir: self._init_from_checkpoint( self.pretrained_model_name, - self.pretrained_model_dir, scope_name, **kwargs) + self.pretrained_model_dir, *args, **kwargs) else: self.reset_parameters() @@ -193,7 +188,7 @@ def download_checkpoint(cls, pretrained_model_name, cache_dir=None): download_path = cls._MODEL2URL[pretrained_model_name] else: raise ValueError( - "Pre-trained model not found: {}".format(pretrained_model_name)) + f"Pre-trained model not found: {pretrained_model_name}") if cache_dir is None: cache_path = default_download_dir(cls._MODEL_NAME) @@ -202,12 +197,9 @@ def download_checkpoint(cls, pretrained_model_name, cache_dir=None): cache_path = cache_path / pretrained_model_name if not cache_path.exists(): - if isinstance(download_path, list): - for path in download_path: - maybe_download(path, str(cache_path)) - else: + if isinstance(download_path, str): filename = download_path.split('/')[-1] - maybe_download(download_path, str(cache_path), extract=True) + maybe_download(download_path, cache_path, extract=True) folder = None for file in cache_path.iterdir(): if file.is_dir(): @@ -217,11 +209,14 @@ def download_checkpoint(cls, pretrained_model_name, cache_dir=None): for file in folder.iterdir(): file.rename(file.parents[1] / file.name) folder.rmdir() - print("Pre-trained {} checkpoint {} cached to {}".format( - cls._MODEL_NAME, pretrained_model_name, cache_path)) + else: + for path in download_path: + maybe_download(path, cache_path) + print(f"Pre-trained {cls._MODEL_NAME} checkpoint " + f"{pretrained_model_name} cached to {cache_path}") else: - print("Using cached pre-trained {} checkpoint from {}.".format( - cls._MODEL_NAME, cache_path)) + print(f"Using cached pre-trained {cls._MODEL_NAME} checkpoint " + f"from {cache_path}.") return str(cache_path) @@ -241,15 +236,13 @@ def _transform_config(cls, pretrained_model_name, cache_dir): raise NotImplementedError @abstractmethod - def _init_from_checkpoint(self, pretrained_model_name, cache_dir, - scope_name, **kwargs): + def _init_from_checkpoint(self, pretrained_model_name, cache_dir, **kwargs): r"""Initialize model parameters from weights stored in the pre-trained checkpoint. Args: pretrained_model_name (str): Name of the pre-trained model. cache_dir (str): Path to the cache directory. 
- scope_name: Variable scope. **kwargs: Additional arguments for specific models. """ raise NotImplementedError diff --git a/texar/tf/modules/pretrained/xlnet.py b/texar/tf/modules/pretrained/xlnet.py deleted file mode 100644 index 5fa6d29a..00000000 --- a/texar/tf/modules/pretrained/xlnet.py +++ /dev/null @@ -1,177 +0,0 @@ -# Copyright 2019 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Utils of XLNet Modules. -""" - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division -from __future__ import unicode_literals - -import collections -import json -import os -import re - -from abc import ABCMeta - -import tensorflow as tf - -from texar.tf.modules.pretrained.pretrained_base import PretrainedMixin - -__all__ = [ - "PretrainedXLNetMixin", -] - -_XLNET_PATH = "https://storage.googleapis.com/xlnet/released_models/" - - -class PretrainedXLNetMixin(PretrainedMixin): - r"""A mixin class to support loading pre-trained checkpoints for modules - that implement the XLNet model. - - The XLNet model was proposed in - `XLNet: Generalized Autoregressive Pretraining for Language Understanding`_ - by `Yang et al.` It is based on the Transformer-XL model, pre-trained on a - large corpus using a language modeling objective that considers all - permutations of the input sentence. - - The available XLNet models are as follows: - - * ``xlnet-based-cased``: 12-layer, 768-hidden, 12-heads. This model is - trained on full data (different from the one in the paper). - * ``xlnet-large-cased``: 24-layer, 1024-hidden, 16-heads. - - We provide the following XLNet classes: - - * :class:`~texar.torch.modules.XLNetEncoder` for text encoding. - * :class:`~texar.torch.modules.XLNetDecoder` for text generation and - decoding. - * :class:`~texar.torch.modules.XLNetClassifier` for text classification - and sequence tagging. - * :class:`~texar.torch.modules.XLNetRegressor` for text regression. - - .. 
_`XLNet: Generalized Autoregressive Pretraining for Language Understanding`:
-        http://arxiv.org/abs/1906.08237
-    """
-
-    __metaclass__ = ABCMeta
-
-    _MODEL_NAME = "XLNet"
-    _MODEL2URL = {
-        'xlnet-base-cased':
-            _XLNET_PATH + "cased_L-12_H-768_A-12.zip",
-        'xlnet-large-cased':
-            _XLNET_PATH + "cased_L-24_H-1024_A-16.zip",
-    }
-
-    @classmethod
-    def _transform_config(cls, pretrained_model_name, cache_dir):
-        info = list(os.walk(cache_dir))
-        root, _, files = info[0]
-        config_path = None
-        for file in files:
-            if file.endswith('config.json'):
-                config_path = os.path.join(root, file)
-        if config_path is None:
-            raise ValueError("Cannot find the config file in {}".format(
-                cache_dir))
-
-        with open(config_path) as f:
-            config_ckpt = json.loads(f.read())
-
-        configs = {
-            "head_dim": config_ckpt["d_head"],
-            "ffn_inner_dim": config_ckpt["d_inner"],
-            "hidden_dim": config_ckpt["d_model"],
-            "activation": config_ckpt["ff_activation"],
-            "num_heads": config_ckpt["n_head"],
-            "num_layers": config_ckpt["n_layer"],
-            "vocab_size": config_ckpt["n_token"],
-            "untie_r": config_ckpt["untie_r"]
-        }
-
-        return configs
-
-    def _init_from_checkpoint(self, pretrained_model_name,
-                              cache_dir, scope_name, **kwargs):
-
-        tvars = tf.trainable_variables()
-        init_checkpoint = os.path.join(cache_dir, 'xlnet_model.ckpt')
-        if init_checkpoint:
-            assignment_map, initialized_variable_names = \
-                self._get_assignment_map_from_checkpoint(
-                    tvars, init_checkpoint, scope_name)
-            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
-
-    def _get_assignment_map_from_checkpoint(self, tvars, init_checkpoint,
-                                            scope_name):
-        r"""
-        Compute the union of the current variables and checkpoint variables.
-        Because of the variable scopes of the original XLNet and Texar
-        implementations, we need to build an assignment map to match the
-        variables.
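# For concreteness, the resulting assignment map renames checkpoint
# scopes to Texar scopes, e.g. (with a hypothetical scope_name of
# "xlnet_encoder"):
#
#   model/transformer/word_embedding/lookup_table
#       -> xlnet_encoder/word_embedder/w
#   model/transformer/layer_0/rel_attn/q/kernel
#       -> xlnet_encoder/layer_0/rel_attn/q/kernel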
- """ - assignment_map = {} - initialized_variable_names = {} - - name_to_variable = collections.OrderedDict() - for var in tvars: - name = var.name - m = re.match("^(.*):\\d+$", name) - if m is not None: - name = m.group(1) - name_to_variable[name] = var - - init_vars = tf.train.list_variables(init_checkpoint) - - for check_name, _ in init_vars: - check_name_scope = check_name.replace( - 'model/transformer/', scope_name + '/') - model_name = check_name_scope - if check_name.startswith('model/lm_loss/bias'): - model_name = scope_name + '/lm_loss/bias' - elif check_name.startswith('model/transformer/mask_emb'): - model_name = check_name_scope.replace( - 'mask_emb/mask_emb', 'mask_emb') - elif check_name.startswith('model/transformer/word_embedding'): - model_name = scope_name + '/word_embedder/w' - elif re.match('model/transformer/r_[r,s,w]_bias', check_name): - model_name = check_name_scope - elif re.match('model/transformer/seg_embed', check_name): - model_name = check_name_scope - elif re.match('model/transformer/layer_\\d+/rel_attn/[q,k,v,r,o]', - check_name): - model_name = check_name_scope - elif re.match('model/transformer/layer_\\d+/rel_attn/LayerNorm', - check_name): - model_name = check_name_scope.replace('LayerNorm/', '') - elif re.match('model/transformer/layer_\\d+/ff/layer_[1,2]', - check_name): - model_name = check_name_scope.replace('ff/layer_1', 'ff/dense') - if model_name == check_name_scope: - model_name = check_name_scope.replace( - 'ff/layer_2', 'ff/dense_1') - elif re.match('model/transformer/layer_\\d+/ff/LayerNorm', - check_name): - model_name = check_name_scope.replace('LayerNorm/', '') - - if model_name in name_to_variable.keys(): - assignment_map[check_name] = model_name - initialized_variable_names[model_name] = 1 - initialized_variable_names[model_name + ":0"] = 1 - else: - tf.logging.info('model name:{} not exist'.format(model_name)) - - return assignment_map, initialized_variable_names diff --git a/texar/tf/modules/pretrained/xlnet_test.py b/texar/tf/modules/pretrained/xlnet_test.py deleted file mode 100644 index 34c89386..00000000 --- a/texar/tf/modules/pretrained/xlnet_test.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Unit tests for xlnet utils. -""" - - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import os -import tensorflow as tf - -from texar.tf.modules.pretrained.xlnet import * -from texar.tf.utils.test import pretrained_test - - -class XLNetUtilsTest(tf.test.TestCase): - r"""Tests XLNet utils. 
- """ - - @pretrained_test - def test_load_pretrained_model_AND_transform_xlnet_to_texar_config(self): - - pretrained_model_dir = PretrainedXLNetMixin.download_checkpoint( - pretrained_model_name="xlnet-base-cased") - - info = list(os.walk(pretrained_model_dir)) - _, _, files = info[0] - self.assertIn('spiece.model', files) - self.assertIn('xlnet_model.ckpt.meta', files) - self.assertIn('xlnet_model.ckpt.data-00000-of-00001', files) - self.assertIn('xlnet_model.ckpt.index', files) - self.assertIn('xlnet_config.json', files) - - model_config = PretrainedXLNetMixin._transform_config( - pretrained_model_name="xlnet-base-cased", - cache_dir=pretrained_model_dir) - - expected_config = { - 'head_dim': 64, - 'ffn_inner_dim': 3072, - 'hidden_dim': 768, - 'activation': 'gelu', - 'num_heads': 12, - 'num_layers': 12, - 'vocab_size': 32000, - 'untie_r': True - } - - self.assertDictEqual(model_config, expected_config) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/modules/pretrained/xlnet_utils.py b/texar/tf/modules/pretrained/xlnet_utils.py deleted file mode 100644 index 3894f105..00000000 --- a/texar/tf/modules/pretrained/xlnet_utils.py +++ /dev/null @@ -1,560 +0,0 @@ -# Copyright 2019 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Model Utils of XLNet Modules. -Adapted from -https://github.com/zihangdai/xlnet/blob/master/modeling.py -""" - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division -from __future__ import unicode_literals - -import tensorflow as tf - -from texar.tf.core import layers -from texar.tf.module_base import ModuleBase -from texar.tf.utils.mode import is_train_mode - - -__all__ = [ - 'PositionWiseFF', - 'PositionalEmbedding', - 'RelativePositionalEncoding', - 'RelativeMutiheadAttention' -] - - -class PositionWiseFF(ModuleBase): - r"""Position Wise feed forward.""" - def __init__(self, hparams=None): - ModuleBase.__init__(self, hparams) - - hidden_dim = self._hparams.hidden_dim - ffn_inner_dim = self._hparams.ffn_inner_dim - dropout = self._hparams.dropout - activation = self._hparams.activation - if activation == 'gelu': - activation = layers.gelu - - with tf.variable_scope(self.variable_scope): - tf.get_variable_scope().set_initializer( - layers.get_initializer(self._hparams.initializer)) - l1_hparams = { - "type": "Dense", - "kwargs": { - "units": ffn_inner_dim, - "activation": activation - } - } - self.linear1 = layers.get_layer(hparams=l1_hparams) - dropout_hparams = { - "type": "Dropout", - "kwargs": { - "rate": dropout - } - } - self.dropout = layers.get_layer(hparams=dropout_hparams) - l2_hparams = { - "type": "Dense", - "kwargs": { - "units": hidden_dim - } - } - self.linear2 = layers.get_layer(hparams=l2_hparams) - - @staticmethod - def default_hparams(): - r"""Returns a dictionary of hyperparameters with default values. - - .. 
code-block:: python - - { - "hidden_dim": 768, - "ffn_inner_dim": 3072, - "dropout": 0.1, - "activation": 'gelu' - } - - Here - - `"hidden_dim"`: int - Dimension of the layer fed as input to feed forward network - - `"ffn_inner_dim"`: int - Inner dimension of the feed forward layer - - `"dropout"`: float - Dropout rate for layers - - `"activation"`: str or callable - Activation function applied to the output of the PositionWise FF. - See :func:`~texar.tf.core.get_activation_fn` for more details. - """ - return { - "name": "ff", - "initializer": None, - "hidden_dim": 768, - "ffn_inner_dim": 3072, - "dropout": 0.1, - "activation": 'gelu', - } - - def _build(self, input, mode=None): - r"""Compute feed forward for the input. - - Args: - input: Input tensor of size `(max_time, batch_size, hidden_dim)` - mode (optional): A tensor taking value in - :tf_main:`tf.estimator.ModeKeys `, including - `TRAIN`, `EVAL`, and `PREDICT`. If `None`, dropout is - controlled by :func:`texar.tf.global_mode`. - - :returns: A tensor output of the position wise feed forward network - """ - is_training = is_train_mode(mode) - output = self.linear1(input) - output = self.dropout(output, training=is_training) - output = self.linear2(output) - output = self.dropout(output, training=is_training) - - # residual + layer norm - output = tf.contrib.layers.layer_norm( - input + output, begin_norm_axis=-1, scope=self.variable_scope, - reuse=tf.AUTO_REUSE) - - return output - - -class PositionalEmbedding(ModuleBase): - r"""Sinosoidal Positional Embedding. - """ - - # TODO(avinash) : See if this can be merged with Sinosoidal Position - # Embedder - def __init__(self, embed_dim): - ModuleBase.__init__(self) - freq_seq = tf.range(0.0, embed_dim, 2.0) - self.inv_freq = 1 / (10000 ** (freq_seq / embed_dim)) - - def _build(self, pos_seq): - r"""Compute sinosoidal positional embeddings. - - Args: - pos_seq: A 1D tensor of position sequences - - :returns: A 2D tensor of sinosoidal embeddings for the sequence. - """ - pos_seq = tf.dtypes.cast(pos_seq, dtype=self.inv_freq.dtype) - sinusoid_inp = tf.einsum('i,d->id', pos_seq, self.inv_freq) - pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1) - return pos_emb - - -class RelativePositionalEncoding(ModuleBase): - r"""Relative positional encodings.""" - def __init__(self, hparams=None): - ModuleBase.__init__(self, hparams) - self.sinusoid_embed = PositionalEmbedding(self._hparams.dim) - - @staticmethod - def default_hparams(): - r"""Returns a dictionary of hyperparameters with default values. - - .. code-block:: python - - { - "dim": 768, - "max_seq_len": 512 - } - - Here - - `"dim"`: int - Dimension size of the positional embedding - - `"max_seq_len"`: int - Maximum size of the sequence length - """ - return { - "name": "relative_positional_encoder", - "dim": 768, - "max_seq_len": 512 - } - - def _create_positional_embedding(self, start, end, step, batch_size, - clamp_len=None): - pos_seq = tf.range(start, end, step) - if clamp_len is not None: - pos_seq = tf.clip_by_value(pos_seq, -clamp_len, clamp_len) - pos_emb = self.sinusoid_embed(pos_seq) - pos_emb = pos_emb[:, None, :] - - if batch_size is not None: - pos_emb = tf.tile(pos_emb, [1, batch_size, 1]) - - return pos_emb - - def _build(self, batch_size, max_time, total_len, clamp_len=None, - attn_type='bi', bi_data=True): - r"""Compute relative positional encoding. 
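# A quick pure-Python check of the ranges this method uses: with toy
# numbers total_len=4 and max_time=2, the forward stream of the
# bidirectional case walks positions range(total_len, -max_time, -1):
assert list(range(4, -2, -1)) == [4, 3, 2, 1, 0, -1]
# i.e. total_len + max_time positions, matching the returned shape
# `[total_len + max_time, batch_size, dim]` documented below.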
-
-        Args:
-            batch_size: int
-                Batch size of the input
-
-            max_time: int
-                Sequence length of the input
-
-            total_len: int
-                Sequence length + Memory length
-
-            clamp_len (optional): int
-                Clamp all relative distances larger than clamp_len.
-                None means no clamping.
-
-            attn_type (optional): str
-                Attention type. Supported values are `"uni"` and `"bi"`.
-
-            bi_data (optional): bool
-                Whether to use bidirectional data input pipeline. Usually set
-                to True during pretraining and False during finetuning.
-
-        :returns: A tensor of shape `[total_len + max_time, batch_size, dim]`
-            (if attn_type == `"bi"`) or of shape `[total_len, batch_size, dim]`
-            (if attn_type == `"uni"`) representing relative positional encoding
-            of the sequence.
-        """
-        if attn_type == 'bi':
-            start, end = total_len, -max_time
-        elif attn_type == 'uni':
-            start, end = total_len, -1
-        else:
-            raise ValueError("Unknown `attn_type` {}".format(attn_type))
-
-        if bi_data:
-            if batch_size % 2 != 0:
-                raise ValueError("`batch_size` must be an even number")
-            fwd_pos_embed = self._create_positional_embedding(
-                start, end, -1, batch_size // 2, clamp_len)
-            bwd_pos_embed = self._create_positional_embedding(
-                -start, -end, 1, batch_size // 2, clamp_len)
-            pos_embed = tf.concat([fwd_pos_embed, bwd_pos_embed], axis=1)
-        else:
-            pos_embed = self._create_positional_embedding(
-                start, end, -1, batch_size, clamp_len)
-        return pos_embed
-
-
-class RelativeMutiheadAttention(ModuleBase):
-    r"""Compute relative multi-head attention for the XLNet encoder.
-
-    This module computes relative multi-head attention as explained in
-    `Transformer-XL (Dai et al.)` and in `XLNet (Yang et al.)`.
-
-    Args:
-        r_r_bias: A tensor of shape `(num_heads, head_dim)`.
-            The bias value added to query head while computing position based
-            attention score.
-
-        r_w_bias: A tensor of shape `(num_heads, head_dim)`.
-            The bias value added to query head while computing content based
-            attention score.
-
-        r_s_bias (optional): A tensor of shape `(num_heads, head_dim)`.
-            The bias value added to query head while computing segment based
-            attention score.
-
-        segment_embed (optional): A tensor of shape `(2, num_heads, head_dim)`
-            if use_segments is True. Otherwise, this is set to None.
-
-        hparams (dict or HParams, optional): Hyperparameters. Missing
-            hyperparameters will be set to default values. See
-            :meth:`default_hparams` for the hyperparameter structure
-            and default values.
-    """
-    def __init__(self, r_r_bias, r_w_bias, r_s_bias=None, segment_embed=None,
-                 hparams=None):
-        ModuleBase.__init__(self, hparams=hparams)
-
-        self.num_heads = self._hparams.num_heads
-        self.head_dim = self._hparams.head_dim
-        hidden_dim = self._hparams.hidden_dim
-
-        with tf.variable_scope(self.variable_scope):
-            if self._hparams.initializer:
-                tf.get_variable_scope().set_initializer(
-                    layers.get_initializer(self._hparams.initializer))
-
-            # Official implementation creates these head variables.
-            # If we create dense layers instead, there would be dimension
-            # mismatch while loading the tensors
-            # TODO(avinash) : Can we reshape tensors while loading the ckpt?
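# (Note on the shapes below: each kernel keeps the checkpoint's 3-D
# layout [hidden_dim, num_heads, head_dim] and is applied with
# tf.einsum('ibh,hnd->ibnd', ...) in _build, rather than as a Dense
# layer, which would store an equivalent 2-D kernel of shape
# [hidden_dim, num_heads * head_dim].)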
- self.q_head = tf.get_variable( - 'q/kernel', [hidden_dim, self.num_heads, self.head_dim]) - - self.k_head = tf.get_variable( - 'k/kernel', [hidden_dim, self.num_heads, self.head_dim]) - - self.v_head = tf.get_variable( - 'v/kernel', [hidden_dim, self.num_heads, self.head_dim]) - - self.k_head_r = tf.get_variable( - 'r/kernel', [hidden_dim, self.num_heads, self.head_dim]) - - self.dropout = layers.get_layer(hparams={ - "type": "Dropout", - "kwargs": { - "rate": self._hparams.dropout - } - }) - - self.dropout_attn = layers.get_layer(hparams={ - "type": "Dropout", - "kwargs": { - "rate": self._hparams.attention_dropout - } - }) - - self.output_projection = tf.get_variable( - 'o/kernel', [hidden_dim, self.num_heads, self.head_dim]) - - self.r_r_bias = r_r_bias - self.r_w_bias = r_w_bias - - if self._hparams.use_segments: - self.segment_embed = segment_embed - self.r_s_bias = r_s_bias - - self.scale = 1 / (self.head_dim ** 0.5) - - @staticmethod - def default_hparams(): - r"""Returns a dictionary of hyperparameters with default values. - - .. code-block:: python - - { - "name": "rel_attn", - "initializer": None, - "num_heads": 12, - "hidden_dim": 768, - "head_dim": 64, - "dropout": 0.1, - "attention_dropout": 0.1, - "use_segments": True - } - - - - Here: - - The default parameters are values for cased XLNet-Base model. - - "initializer": dict, optional - Hyperparameters of the default initializer that initializes - variables created in this module. - See :func:`~texar.tf.core.get_initializer` for details. - - "num_heads": int - Number of heads in the attention - - "hidden_dim": int - Hidden dimension of the embeddings - - "head_dim": int - Size of the vectors after head projection. - - "dropout": float - Dropout rate for layers - - "attention_dropout": float - Dropout rate for attention layers - - "use_segments": bool - Boolean to indicate if the input has segments - - "name": str - Name of the module. - """ - return { - "name": "rel_attn", - "initializer": None, - "num_heads": 12, - "hidden_dim": 768, - "head_dim": 64, - "dropout": 0.1, - "attention_dropout": 0.1, - "use_segments": True, - } - - @staticmethod - def _rel_shift(x, klen=-1): - """Perform relative shift to form the relative attention score.""" - x_size = tf.shape(x) - - x = tf.reshape(x, [x_size[1], x_size[0], x_size[2], x_size[3]]) - x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1]) - x = tf.reshape(x, [x_size[0], x_size[1] - 1, x_size[2], x_size[3]]) - x = tf.slice(x, [0, 0, 0, 0], [-1, klen, -1, -1]) - - return x - - def _compute_attention_score(self, q_head, k_head_h, v_head_h, k_head_r, - segment_mat, attn_mask=None, mode=None): - is_training = is_train_mode(mode) - - # Content based attention score. - q_head_rw = q_head + self.r_w_bias - # attn_ac: (max_time, tot_len, batch_size, n_head) - attn_ac = tf.einsum('ibnd,jbnd->ijbn', q_head_rw, k_head_h) - - # Position based attention score. - q_head_rr = q_head + self.r_r_bias - # attn_bd: (max_time, tot_len, batch_size, n_head) - attn_bd = tf.einsum('ibnd,jbnd->ijbn', q_head_rr, k_head_r) - attn_bd = self._rel_shift(attn_bd, klen=tf.shape(attn_ac)[1]) - - # Segment based attention score. - if segment_mat is None: - attn_ef = 0 - else: - q_head_rs = q_head + self.r_s_bias - attn_ef = tf.einsum( - 'ibnd,snd->ibns', q_head_rs, self.segment_embed) - attn_ef = tf.einsum('ijbs,ibns->ijbn', segment_mat, attn_ef) - - # Merge attention scores and perform masking. 
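# (The subtraction below implements masking additively: 1e30 * attn_mask
# pushes masked positions to a very large negative logit, so their
# probability is effectively zero after the softmax that follows; the
# multiplicative variant is kept in the comment underneath.)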
- # attn_score: (max_time, tot_len, batch_size, n_head) - attn_score = (attn_ac + attn_bd + attn_ef) * self.scale - if attn_mask is not None: - # attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask - attn_score = attn_score - 1e30 * attn_mask - - # attention probability - attn_prob = tf.nn.softmax(attn_score, 1) - attn_prob = self.dropout_attn(attn_prob, training=is_training) - - # attention output - attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, v_head_h) - - return attn_vec - - def _post_attention(self, attn_vec, mode=None): - is_training = is_train_mode(mode) - attn_out = tf.einsum('ibnd,hnd->ibh', attn_vec, self.output_projection) - attn_out = self.dropout(attn_out, training=is_training) - return attn_out - - def _build(self, states_h, pos_embed, states_g=None, segment_mat=None, - attn_mask_h=None, attn_mask_g=None, target_mapping=None, - memory=None, mode=None): - r"""Compute relative multi-head attention with relative positional - encoding. - - Args: - states_h: A content representation tensor of shape - `[max_time, batch_size, hidden_dim]` - - pos_embed: Position embedding tensor of shape - `[max_time, batch_size, hidden_dim]`. - - states_g (optional): A query representation tensor of shape - `[max_time, batch_size, hidden_dim]`. This tensor is set during - decoding. - - segment_mat (optional): A tensor of size - `[max_time, tot_len, batch_size]` indicating if tokens are in the - same seqment. A value at `(i, j, k)` of `1` indicates tokens at - `i` and `j` are not in the same sequence in batch k. - - attn_mask_h (optional): A tensor of shape - `[max_time, max_time, batch_size, 1]` Attention mask used while - computing attention score for `states_h` - - attn_mask_g (optional): A tensor of shape - `[max_time, max_time, batch_size, 1]` Attention mask used while - computing attention score for `states_g` - - target_mapping (optional): The target token mapping. Float tensor of - shape `[num_targets, max_time, batch_size]`. - A value of 1 for ``target_mapping[i, j, k]`` indicates that - the `i`-th target token (in order of permutation) in batch `k` - is the token at position `j`. - Each row ``target_mapping[i, :, k]`` can have no more than one - value of 1. - - memory (optional): Memory from previous batches. A list of length - `num_layers`, each a tensor of shape - `[mem_len, batch_size, hidden_dim]`. - - mode (optional): A tensor taking value in - :tf_main:`tf.estimator.ModeKeys `, including - `TRAIN`, `EVAL`, and `PREDICT`. If `None`, dropout is - controlled by :func:`texar.tf.global_mode`. - - :returns: Returns output states for `states_h` and `states_g` - (`states_g` is not None) - """ - batch_size = tf.shape(states_h)[1] - - if memory is not None and memory.shape.ndims > 1: - concat_input = tf.concat([memory, states_h], axis=0) - else: - concat_input = states_h - - # Content heads. - q_head_h = tf.einsum('ibh,hnd->ibnd', states_h, self.q_head) - k_head_h = tf.einsum('ibh,hnd->ibnd', concat_input, self.k_head) - v_head_h = tf.einsum('ibh,hnd->ibnd', concat_input, self.v_head) - - # Positional heads. - k_head_r = tf.einsum('ibh,hnd->ibnd', pos_embed, self.k_head_r) - - # Core attention ops. - attn_vec_h = self._compute_attention_score( - q_head_h, k_head_h, v_head_h, k_head_r, segment_mat, attn_mask_h, - mode) - - # Post attention processing. 
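# (In the two-stream case handled below, the query stream `states_g`
# reuses the content stream's k/v heads; only its q head is recomputed,
# and `target_mapping`, when given, gathers and scatters the per-target
# positions around the attention op.)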
- attn_out_h = self._post_attention(attn_vec_h, mode=mode) - - output_h = tf.contrib.layers.layer_norm( - attn_out_h + states_h, begin_norm_axis=-1, - scope=self.variable_scope, reuse=tf.AUTO_REUSE) - - if states_g is not None: - q_head_g = tf.einsum('ibh,hnd->ibnd', states_g, self.q_head) - shape = tf.shape(q_head_g) - q_head_g = tf.reshape( - q_head_g, - shape=(shape[0], batch_size, self.num_heads, self.head_dim)) - if target_mapping is not None: - q_head_g = tf.einsum( - 'mbnd,mlb->lbnd', q_head_g, target_mapping) - attn_vec_g = self._compute_attention_score( - q_head_g, k_head_h, v_head_h, k_head_r, - segment_mat, attn_mask_g, mode) - if target_mapping is not None: - attn_vec_g = tf.einsum( - 'lbnd,mlb->mbnd', attn_vec_g, target_mapping) - attn_out_g = self._post_attention(attn_vec_g, mode=mode) - output_g = tf.contrib.layers.layer_norm( - attn_out_g + states_g, begin_norm_axis=-1, - scope=self.variable_scope, reuse=tf.AUTO_REUSE) - else: - output_g = None - - return output_h, output_g diff --git a/texar/tf/modules/pretrained/xlnet_utils_test.py b/texar/tf/modules/pretrained/xlnet_utils_test.py deleted file mode 100644 index f52f8bbc..00000000 --- a/texar/tf/modules/pretrained/xlnet_utils_test.py +++ /dev/null @@ -1,89 +0,0 @@ -""" -Unit tests for xlnet model utils. -""" -import tensorflow as tf - -from texar.tf.modules.pretrained.xlnet_utils import \ - PositionWiseFF, RelativePositionalEncoding, RelativeMutiheadAttention - - -class XLNetModelUtilsTest(tf.test.TestCase): - r"""Tests xlnet model utils. - """ - - def test_PositionWiseFF(self): - - # Case 1 - model = PositionWiseFF() - inputs = tf.random_uniform(shape=(32, model.hparams.hidden_dim)) - outputs = model(inputs) - self.assertEqual(outputs.shape, [32, model._hparams.hidden_dim]) - - # Case 2 - hparams = { - "hidden_dim": 16, - "ffn_inner_dim": 32, - "dropout": 0.1, - "activation": 'relu', - } - model = PositionWiseFF(hparams=hparams) - inputs = tf.random_uniform(shape=(32, 16)) - outputs = model(inputs) - self.assertEqual(outputs.shape, [32, 16]) - - # Case 3 - hparams = { - "hidden_dim": 16, - "ffn_inner_dim": 32, - "dropout": 0.1, - "activation": 'gelu', - } - model = PositionWiseFF(hparams=hparams) - inputs = tf.random_uniform(shape=(32, 16)) - outputs = model(inputs) - self.assertEqual(outputs.shape, [32, 16]) - - def test_RelativeMultiheadAttention(self): - num_heads = 12 - head_dim = 64 - - r_r_bias = tf.random_normal(shape=(num_heads, head_dim)) - r_w_bias = tf.random_normal(shape=(num_heads, head_dim)) - - model = RelativeMutiheadAttention(r_r_bias=r_r_bias, r_w_bias=r_w_bias) - - states_h = tf.random_uniform(shape=(16, 32, model._hparams.hidden_dim)) - pos_embed = tf.random_uniform(shape=(24, 32, model._hparams.hidden_dim)) - - output_h, output_g = model(states_h=states_h, pos_embed=pos_embed) - - self.assertEqual(output_h.shape, - [16, 32, model._hparams.hidden_dim]) - self.assertEqual(output_g, None) - - def test_RelativePositionalEncoding(self): - - batch_size = 16 - max_time = 8 - total_len = 32 - - # Case 1 - model = RelativePositionalEncoding() - pos_embed = model(batch_size=batch_size, - max_time=max_time, - total_len=total_len) - self.assertEqual(pos_embed.shape, - [40, 16, model._hparams.dim]) - - # Case 2 - model = RelativePositionalEncoding() - pos_embed = model(batch_size=batch_size, - max_time=max_time, - total_len=total_len, - attn_type='uni') - self.assertEqual(pos_embed.shape, - [33, 16, model._hparams.dim]) - - -if __name__ == "__main__": - tf.test.main() diff --git 
a/texar/tf/modules/qnets/__init__.py b/texar/tf/modules/qnets/__init__.py
deleted file mode 100644
index 7b7f1840..00000000
--- a/texar/tf/modules/qnets/__init__.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright 2018 The Texar Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Modules of texar library qnets.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# pylint: disable=wildcard-import
-
-from texar.tf.modules.qnets.qnets import *
diff --git a/texar/tf/modules/qnets/qnets.py b/texar/tf/modules/qnets/qnets.py
deleted file mode 100644
index b03ebd3a..00000000
--- a/texar/tf/modules/qnets/qnets.py
+++ /dev/null
@@ -1,282 +0,0 @@
-# Copyright 2018 The Texar Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Q networks for RL.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-import tensorflow as tf
-
-from texar.tf.module_base import ModuleBase
-from texar.tf.agents.agent_utils import Space
-from texar.tf.utils import utils
-
-# pylint: disable=no-member
-
-__all__ = [
-    'QNetBase',
-    'CategoricalQNet'
-]
-
-
-class QNetBase(ModuleBase):
-    """Base class inherited by all Q net classes. A Q net takes in states
-    and outputs Q values of actions.
-
-    Args:
-        network (optional): A network that takes in state and returns
-            Q values. For example, an instance of subclass
-            of :class:`~texar.tf.modules.FeedForwardNetworkBase`. If `None`,
-            a network is created as specified in :attr:`hparams`.
-        network_kwargs (dict, optional): Keyword arguments for network
-            constructor.
-            Note that the `hparams` argument for network
-            constructor is specified in the "network_hparams" field of
-            :attr:`hparams` and should not be included in `network_kwargs`.
-            Ignored if :attr:`network` is given.
-        hparams (dict or HParams, optional): Hyperparameters. Missing
-            hyperparameters will be set to default values. See
-            :meth:`default_hparams` for the hyperparameter structure and
-            default values.
-    """
-    def __init__(self,
-                 network=None,
-                 network_kwargs=None,
-                 hparams=None):
-        ModuleBase.__init__(self, hparams=hparams)
-
-        with tf.variable_scope(self.variable_scope):
-            self._build_network(network, network_kwargs)
-
-    @staticmethod
-    def default_hparams():
-        """Returns a dictionary of hyperparameters with default values.
-
-        .. role:: python(code)
-           :language: python
-
-        .. code-block:: python
-
-            {
-                'network_type': 'FeedForwardNetwork',
-                'network_hparams': {
-                    'layers': [
-                        {
-                            'type': 'Dense',
-                            'kwargs': {'units': 256, 'activation': 'relu'}
-                        },
-                        {
-                            'type': 'Dense',
-                            'kwargs': {'units': 256, 'activation': 'relu'}
-                        },
-                    ]
-                },
-                'name': 'q_net',
-            }
-
-        Here:
-
-        "network_type": str or class or instance
-            A network that takes in state and returns outputs for
-            generating actions. This can be a class, its name or module path,
-            or a class instance. Ignored if `network` is given to the
-            constructor.
-
-        "network_hparams": dict
-            Hyperparameters for the network. With the :attr:`network_kwargs`
-            argument to the constructor, a network is created with
-            :python:`network_class(**network_kwargs, hparams=network_hparams)`.
-
-            For example, the default values create a two-layer dense network.
-
-        "name": str
-            Name of the Q net.
-        """
-        return {
-            'network_type': 'FeedForwardNetwork',
-            'network_hparams': {
-                'layers': [
-                    {
-                        'type': 'Dense',
-                        'kwargs': {'units': 256, 'activation': 'relu'}
-                    },
-                    {
-                        'type': 'Dense',
-                        'kwargs': {'units': 256, 'activation': 'relu'}
-                    },
-                ]
-            },
-            'name': 'q_net',
-            '@no_typecheck': ['network_type', 'network_hparams']
-        }
-
-    def _build_network(self, network, kwargs):
-        if network is not None:
-            self._network = network
-        else:
-            kwargs = utils.get_instance_kwargs(
-                kwargs, self._hparams.network_hparams)
-            self._network = utils.check_or_get_instance(
-                self._hparams.network_type,
-                kwargs,
-                module_paths=['texar.tf.modules', 'texar.tf.custom'])
-
-    def _build(self, inputs, mode=None):  # pylint: disable=arguments-differ
-        raise NotImplementedError
-
-    @property
-    def network(self):
-        """The network.
-        """
-        return self._network
-
-
-class CategoricalQNet(QNetBase):
-    """Q net with categorical scalar action space.
-
-    Args:
-        action_space (optional): An instance of :class:`~texar.tf.agents.Space`
-            specifying the action space. If not given, a discrete action space
-            `[0, high]` is created with `high` specified in :attr:`hparams`.
-        network (optional): A network that takes in state and returns
-            Q values. For example, an instance of subclass
-            of :class:`~texar.tf.modules.FeedForwardNetworkBase`. If `None`,
-            a network is created as specified in :attr:`hparams`.
-        network_kwargs (dict, optional): Keyword arguments for network
-            constructor.
-            Note that the `hparams` argument for network
-            constructor is specified in the "network_hparams" field of
-            :attr:`hparams` and should not be included in `network_kwargs`.
-            Ignored if :attr:`network` is given.
-        hparams (dict or HParams, optional): Hyperparameters. Missing
-            hyperparameters will be set to default values. See
-            :meth:`default_hparams` for the hyperparameter structure and
-            default values.
-
-    .. document private functions
-    .. automethod:: _build
-    """
-    def __init__(self,
-                 action_space=None,
-                 network=None,
-                 network_kwargs=None,
-                 hparams=None):
-        QNetBase.__init__(self, hparams=hparams)
-
-        with tf.variable_scope(self.variable_scope):
-            if action_space is None:
-                action_space = Space(
-                    low=0, high=self._hparams.action_space, dtype=np.int32)
-            self._action_space = action_space
-            self._append_output_layer()
-
-    @staticmethod
-    def default_hparams():
-        """Returns a dictionary of hyperparameters with default values.
-
-        .. 
code-block:: python - - { - 'network_type': 'FeedForwardNetwork', - 'network_hparams': { - 'layers': [ - { - 'type': 'Dense', - 'kwargs': {'units': 256, 'activation': 'relu'} - }, - { - 'type': 'Dense', - 'kwargs': {'units': 256, 'activation': 'relu'} - }, - ] - }, - 'action_space': 2, - 'make_output_layer': True, - 'name': 'q_net' - } - - Here: - - "action_space": int - Upper bound of the action space. The resulting action space is - all discrete scalar numbers between 0 and the upper bound specified - here (both inclusive). - - "make_output_layer": bool - Whether to append a dense layer to the network to transform - features to Q values. If `False`, the final layer - output of network must match the action space. - - See :class:`~texar.tf.modules.QNetBase.default_hparams` for details - of other hyperparameters. - """ - hparams = QNetBase.default_hparams() - hparams.update({ - 'action_space': 2, - 'make_output_layer': True}) - return hparams - - def _append_output_layer(self): - if not self._hparams.make_output_layer: - return - - if self._action_space.shape != (): - raise ValueError('Only scalar discrete action is supported.') - else: - output_size = self._action_space.high - self._action_space.low - - layer_hparams = { - 'type': 'Dense', - 'kwargs': {'units': output_size}} - self._network.append_layer(layer_hparams) - - def _build(self, inputs, mode=None): - """Takes in states and outputs Q values. - - Args: - inputs: Inputs to the Q net with the first dimension - the batch dimension. - mode (optional): A tensor taking value in - :tf_main:`tf.estimator.ModeKeys `, including - `TRAIN`, `EVAL`, and `PREDICT`. If `None`, - :func:`texar.tf.global_mode` is used. - - Returns - A `dict` including fields `"qvalues"`. - where - - - **"qvalues"**: A Tensor of shape \ - `[batch_size] + action_space size` containing Q values of all\ - possible actions. - """ - outputs = { - "qvalues": self._network(inputs, mode=mode) - } - - if not self._built: - self._add_internal_trainable_variables() - self._add_trainable_variable(self._network.trainable_variables) - self._built = True - - return outputs - - @property - def action_space(self): - """An instance of :class:`~texar.tf.agents.Space` specifiying the - action space. - """ - return self._action_space diff --git a/texar/tf/modules/regressors/regressor_base.py b/texar/tf/modules/regressors/regressor_base.py deleted file mode 100644 index 1e2aadcc..00000000 --- a/texar/tf/modules/regressors/regressor_base.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright 2019 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Base class for Regressors. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from texar.tf.module_base import ModuleBase - -__all__ = [ - "RegressorBase" -] - - -class RegressorBase(ModuleBase): - """Base class inherited by all regressor classes. 
- """ - - def __init__(self, hparams=None): - ModuleBase.__init__(self, hparams) - - @staticmethod - def default_hparams(): - """Returns a dictionary of hyperparameters with default values. - """ - return { - "name": "regressor" - } - - def _build(self, inputs, *args, **kwargs): - """Runs regressors on inputs. - - Args: - inputs: Inputs to the regressor. - *args: Other arguments. - **kwargs: Keyword arguments. - - Returns: - Regression output. - """ - raise NotImplementedError diff --git a/texar/tf/modules/regressors/xlnet_regressor.py b/texar/tf/modules/regressors/xlnet_regressor.py deleted file mode 100644 index 98fe007d..00000000 --- a/texar/tf/modules/regressors/xlnet_regressor.py +++ /dev/null @@ -1,280 +0,0 @@ -# Copyright 2019 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -XLNet Regressor. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from texar.tf.utils.mode import is_train_mode -from texar.tf.core.layers import get_layer, get_initializer -from texar.tf.modules.regressors.regressor_base import RegressorBase -from texar.tf.modules.encoders.xlnet_encoder import XLNetEncoder -from texar.tf.hyperparams import HParams -from texar.tf.modules.pretrained.xlnet import PretrainedXLNetMixin -from texar.tf.utils.utils import dict_fetch - -# pylint: disable=too-many-arguments, invalid-name, no-member, -# pylint: disable=too-many-branches, too-many-locals, too-many-statements - -__all__ = [ - "XLNetRegressor" -] - - -class XLNetRegressor(RegressorBase, PretrainedXLNetMixin): - """Regressor based on XLNet modules. Please see - :class:`~texar.tf.modules.PretrainedXLNetMixin` for a brief description - of XLNet. - - This is a combination of the :class:`~texar.tf.modules.XLNetEncoder` with a - classification layer. Both step-wise classification and sequence-level - classification are supported, specified in :attr:`hparams`. - - Arguments are the same as in :class:`~texar.tf.modules.XLNetEncoder`. - - Args: - pretrained_model_name (optional): a `str`, the name - of pre-trained model (e.g., ``xlnet-based-cased``). Please refer to - :class:`~texar.tf.modules.PretrainedXLNetMixin` for - all supported models. - If `None`, the model name in :attr:`hparams` is used. - cache_dir (optional): the path to a folder in which the - pre-trained models will be cached. If `None` (default), - a default directory (``texar_data`` folder under user's home - directory) will be used. - hparams (dict or HParams, optional): Hyperparameters. Missing - hyperparameters will be set to default values. See - :meth:`default_hparams` for the hyperparameter structure - and default values. - - .. document private functions - .. 
automethod:: _build - """ - - def __init__(self, - pretrained_model_name=None, - cache_dir=None, - hparams=None): - RegressorBase.__init__(self, hparams) - - with tf.variable_scope(self.variable_scope): - tf.get_variable_scope().set_initializer( - get_initializer(self._hparams.initializer)) - # Creates the underlying encoder - encoder_hparams = dict_fetch( - hparams, XLNetEncoder.default_hparams()) - if encoder_hparams is not None: - encoder_hparams['name'] = "encoder" - self._encoder = XLNetEncoder( - pretrained_model_name=pretrained_model_name, - cache_dir=cache_dir, - hparams=encoder_hparams) - if self._hparams.use_projection: - self.projection = get_layer(hparams={ - "type": "Dense", - "kwargs": { - "units": self._encoder.output_size - } - }) - - # Creates an dropout layer - drop_kwargs = {"rate": self._hparams.dropout} - layer_hparams = {"type": "Dropout", "kwargs": drop_kwargs} - self._dropout_layer = get_layer(hparams=layer_hparams) - - logit_kwargs = self._hparams.logit_layer_kwargs - if logit_kwargs is None: - logit_kwargs = {} - elif not isinstance(logit_kwargs, HParams): - raise ValueError( - "hparams['logit_layer_kwargs'] must be a dict.") - else: - logit_kwargs = logit_kwargs.todict() - logit_kwargs.update({"units": 1}) - if 'name' not in logit_kwargs: - logit_kwargs['name'] = "logit_layer" - - layer_hparams = {"type": "Dense", "kwargs": logit_kwargs} - self._logit_layer = get_layer(hparams=layer_hparams) - - @staticmethod - def default_hparams(): - r"""Returns a dictionary of hyperparameters with default values. - - .. code-block:: python - - { - # (1) Same hyperparameters as in XLNetEncoder - ... - # (2) Additional hyperparameters - "regr_strategy": "cls_time", - "use_projection": True, - "logit_layer_kwargs": None, - "name": "xlnet_regressor", - } - - Here: - - 1. Same hyperparameters as in - :class:`~texar.tf.modules.XLNetEncoder`. - See the :meth:`~texar.tf.modules.XLNetEncoder.default_hparams`. - An instance of XLNetEncoder is created for feature extraction. - - 2. Additional hyperparameters: - - `"regr_strategy"`: str - The regression strategy, one of: - - - **cls_time**: Sequence-level regression based on the - output of the first time step (which is the `CLS` token). - Each sequence has a prediction. - - **all_time**: Sequence-level regression based on - the output of all time steps. Each sequence has a prediction. - - **time_wise**: Step-wise regression, i.e., make - regression for each time step based on its output. - - `"logit_layer_kwargs"` : dict - Keyword arguments for the logit Dense layer constructor, - except for argument "units" which is set to "num_classes". - Ignored if no extra logit layer is appended. - - `"use_projection"`: bool - If `True`, an additional dense layer is added after - the summary step. - - `"name"`: str - Name of the regressor. - """ - hparams = XLNetEncoder.default_hparams() - hparams.update({ - "logit_layer_kwargs": None, - "regr_strategy": "cls_time", - "dropout": 0.1, - "use_projection": True, - "name": "xlnet_regressor" - }) - return hparams - - def param_groups(self, lr=None, lr_layer_scale=1.0, - decay_base_params=False): - r"""Create parameter groups for optimizers. When - :attr:`lr_layer_decay_rate` is not 1.0, parameters from each layer form - separate groups with different base learning rates. - - This method should be called before applying gradients to the variables - through the optimizer. 
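        For example (a sketch; ``opt``, ``loss`` and the base learning
        rate are assumptions, not part of this class)::

            base_lr = 1e-4
            grads_and_vars = opt.compute_gradients(loss)
            var_lr = regressor.param_groups(lr=base_lr, lr_layer_scale=0.75)
            scaled = [(g * (var_lr[v] / base_lr), v)
                      for g, v in grads_and_vars if g is not None]
            train_op = opt.apply_gradients(scaled)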
Particularly, after calling the optimizer's - `compute_gradients` method, the user can call this method to get - variable-specific learning rates for the network. The gradients for each - variables can then be scaled accordingly. These scaled gradients are - finally applied by calling optimizer's `apply_gradients` method. - - Args: - lr (float): The learning rate. Can be omitted if - :attr:`lr_layer_decay_rate` is 1.0. - lr_layer_scale (float): Per-layer LR scaling rate. The `i`-th layer - will be scaled by `lr_layer_scale ^ (num_layers - i - 1)`. - decay_base_params (bool): If `True`, treat non-layer parameters - (e.g. embeddings) as if they're in layer 0. If `False`, these - parameters are not scaled. - - Returns: A dict mapping tensorflow variables to their learning rates. - """ - vars_to_learning_rates = {} - if lr_layer_scale != 1.0: - if lr is None: - raise ValueError( - "lr must be specified when lr_layer_decay_rate is not 1.0") - - scope = self.variable_scope.name - projection_vars = tf.trainable_variables(scope=scope + "/dense") - logits_vars = tf.trainable_variables( - scope=self.variable_scope.name + "/logit_layer") - finetune_vars = projection_vars + logits_vars - for var in finetune_vars: - vars_to_learning_rates[var] = lr - - vars_to_learning_rates.update( - self._encoder.param_groups(lr=lr, - lr_layer_scale=lr_layer_scale, - decay_base_params=decay_base_params)) - else: - for variable in self.trainable_variables: - vars_to_learning_rates[variable] = lr - - return vars_to_learning_rates - - def _build(self, token_ids, segment_ids=None, input_mask=None, mode=None): - r"""Feeds the inputs through the network and makes regression. - - Args: - token_ids: Shape `[batch_size, max_time]`. - segment_ids: Shape `[batch_size, max_time]`. - input_mask: Float tensor of shape `[batch_size, max_time]`. Note - that positions with value 1 are masked out. - mode (optional): A tensor taking value in - :tf_main:`tf.estimator.ModeKeys `, - including `TRAIN`, `EVAL`, and `PREDICT`. Used to toggle - dropout. - If `None` (default), :func:`texar.tf.global_mode` is used. - - Returns: - Regression predictions. - - - If ``regr_strategy`` is ``cls_time`` or ``all_time``, predictions - have shape `[batch_size]`. - - - If ``clas_strategy`` is ``time_wise``, predictions have shape - `[batch_size, max_time]`. - """ - is_training = is_train_mode(mode) - output, _ = self._encoder(token_ids, segment_ids, input_mask=input_mask, - mode=mode) - - strategy = self._hparams.regr_strategy - if strategy == "time_wise": - summary = output - elif strategy == "cls_time": - summary = output[:, -1] - elif strategy == "all_time": - length_diff = self._hparams.max_seq_len - tf.shape(token_ids)[1] - summary_input = tf.pad(output, - paddings=[[0, 0], [0, length_diff], [0, 0]]) - summary_input_dim = \ - self._encoder.output_size * self._hparams.max_seq_len - summary = tf.reshape(summary_input, shape=[-1, summary_input_dim]) - else: - raise ValueError("Unknown classification strategy: {}". 
- format(strategy)) - - if self._hparams.use_projection: - summary = tf.tanh(self.projection(summary)) - - # summary: (batch_size, hidden_dim) - summary = self._dropout_layer(summary, training=is_training) - - logits = tf.squeeze(self._logit_layer(summary), -1) - - if not self._built: - self._add_internal_trainable_variables() - if self._logit_layer: - self._add_trainable_variable( - self._logit_layer.trainable_variables) - self._built = True - - return logits diff --git a/texar/tf/modules/regressors/xlnet_regressor_test.py b/texar/tf/modules/regressors/xlnet_regressor_test.py deleted file mode 100644 index ff96e837..00000000 --- a/texar/tf/modules/regressors/xlnet_regressor_test.py +++ /dev/null @@ -1,160 +0,0 @@ -# -""" -Unit tests for XLNet regressor. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import numpy as np -import tensorflow as tf - -from texar.tf.modules.regressors.xlnet_regressor import XLNetRegressor -from texar.tf.utils.test import pretrained_test - -# pylint: disable=too-many-locals, no-member - - -class XLNetRegressorTest(tf.test.TestCase): - """Tests :class:`~texar.tf.modules.XLNetRegressor` class. - """ - - @pretrained_test - def test_model_loading(self): - r"""Tests model loading functionality.""" - - inputs = tf.placeholder(dtype=tf.int32, shape=[None, None]) - - for pretrained_model_name in XLNetRegressor.available_checkpoints(): - regressor = XLNetRegressor( - pretrained_model_name=pretrained_model_name) - _ = regressor(inputs) - - def test_trainable_variables(self): - """Tests the functionality of automatically collecting trainable - variables. - """ - inputs = tf.placeholder(dtype=tf.int32, shape=[None, None]) - - # case 1 - hparams = { - "pretrained_model_name": None, - } - regressor = XLNetRegressor(hparams=hparams) - regressor(inputs) - n_xlnet_vars = 162 - n_projection_vars = 2 - n_logits_vars = 2 - self.assertEqual(len(regressor.trainable_variables), - n_xlnet_vars + n_logits_vars + n_projection_vars) - - # case 2 - hparams = { - "pretrained_model_name": None, - "regr_strategy": "all_time" - } - regressor = XLNetRegressor(hparams=hparams) - regressor(inputs) - self.assertEqual(len(regressor.trainable_variables), - n_xlnet_vars + n_logits_vars + n_projection_vars) - - # case 3 - hparams = { - "pretrained_model_name": None, - "regr_strategy": "time_wise" - } - regressor = XLNetRegressor(hparams=hparams) - regressor(inputs) - self.assertEqual(len(regressor.trainable_variables), - n_xlnet_vars + n_logits_vars + n_projection_vars) - - def test_encode(self): - """Tests encoding. 
- """ - max_time = 8 - batch_size = 16 - inputs = tf.random_uniform([batch_size, max_time], - maxval=30521, dtype=tf.int32) - - # case 1 - hparams = { - "pretrained_model_name": None, - } - regressor = XLNetRegressor(hparams=hparams) - logits = regressor(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_ = sess.run(logits) - self.assertEqual(logits_.shape, (batch_size,)) - - # case 2 - hparams = { - "pretrained_model_name": None, - "regr_strategy": "cls_time" - } - regressor = XLNetRegressor(hparams=hparams) - logits = regressor(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_ = sess.run(logits) - self.assertEqual(logits_.shape, (batch_size,)) - - # case 3 - hparams = { - "pretrained_model_name": None, - "regr_strategy": "time_wise" - } - regressor = XLNetRegressor(hparams=hparams) - logits = regressor(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_ = sess.run(logits) - self.assertEqual(logits_.shape, - (batch_size, max_time)) - - # case 4 - hparams = { - "pretrained_model_name": None, - "regr_strategy": "all_time", - "max_seq_len": max_time - } - inputs = tf.placeholder(tf.int32, shape=[batch_size, 6]) - regressor = XLNetRegressor(hparams=hparams) - logits = regressor(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_ = sess.run( - logits, - feed_dict={inputs: np.random.randint(30521, - size=(batch_size, 6))}) - self.assertEqual(logits_.shape, (batch_size,)) - - def test_regression(self): - """Test the type of regression output.""" - batch_size = 8 - - hparams = { - "pretrained_model_name": None, - "regr_strategy": "cls_time" - } - inputs = tf.placeholder(tf.int32, shape=[batch_size, 6]) - regressor = XLNetRegressor(hparams=hparams) - logits = regressor(inputs) - - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - logits_ = sess.run( - logits, - feed_dict={inputs: np.random.randint(30521, - size=(batch_size, 6))}) - self.assertEqual(logits_.dtype, np.float32) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/run/__init__.py b/texar/tf/run/__init__.py deleted file mode 100644 index 4294ede0..00000000 --- a/texar/tf/run/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Modules of texar library run. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=wildcard-import - -from texar.tf.run.executor import * diff --git a/texar/tf/run/executor.py b/texar/tf/run/executor.py deleted file mode 100644 index 9c22be5e..00000000 --- a/texar/tf/run/executor.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -A class that executes training, evaluation, prediction, export of estimators. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from texar.tf.utils.dtypes import maybe_hparams_to_dict - -# pylint: disable=too-many-instance-attributes, too-many-arguments - -__all__ = [ - "Executor" -] - - -class Executor(object): - """Class that executes training, evaluation, prediction, export, and other - actions of :tf_main:`Estimator `. - - Args: - model: An instance of a subclass of - :class:`~texar.tf.models.model_base.ModelBase`. - data_hparams: A `dict` or an instance of :class:`~texar.tf.hparams.HParams` - containing the hyperparameters of data. It must contain `train` - and/or `eval` fields for relevant processes. For example, for - :meth:`train_and_evaluate`, both fields are required. - config: An instance of - :tf_main:`tf.estimator.RunConfig `, used as - the :attr:`config` argument of - :tf_main:`Estimator `. - model_hparams (optional): A `dict` or an instance of - :class:`~texar.tf.hparams.HParams` containing the hyperparameters of - the model. If `None`, uses :attr:`model.hparams`. Used as - the :attr:`params` argument of - :tf_main:`Estimator `. - train_hooks (optional): Iterable of :tf_main:`tf.train.SessionRunHook - ` objects to run during training. - eval_hooks (optional): Iterable of :tf_main:`tf.train.SessionRunHook - ` objects to run during evaluation. - session_config (optional): An instance of - :tf_main:`tf.ConfigProto `, used as the :attr:`config` - argument of :tf_main:`tf session `. - - Example: - - .. code-block:: python - - model = BasicSeq2seq(data_hparams, model_hparams) - exor = Executor( - model=model, - data_hparams=data_hparams, - config=run_config) - exor.train_and_evaluate( - max_train_steps=10000, - eval_steps=100) - - See `bin/train.py` for the usage in detail. 
- """ - - def __init__(self, - model, - data_hparams, - config, - model_hparams=None, - train_hooks=None, - eval_hooks=None, - session_config=None): - self._model = model - self._data_hparams = maybe_hparams_to_dict(data_hparams) - self._config = config - self._train_hooks = train_hooks - self._eval_hooks = eval_hooks - self._session_config = session_config - - if model_hparams is None: - model_hparams = model.hparams - self._model_hparams = maybe_hparams_to_dict(model_hparams) - - self._estimator = tf.estimator.Estimator( - model_fn=self._model, config=config, params=self._model_hparams) - - def _get_train_spec(self, max_steps=None): - if 'train' not in self._data_hparams: - raise ValueError('`data_hparams` must contain field `train` for ' - 'training data config.') - input_fn = self._model.get_input_fn( - mode=tf.estimator.ModeKeys.TRAIN, - hparams=self._data_hparams['train']) - return tf.estimator.TrainSpec( - input_fn=input_fn, - max_steps=max_steps, - hooks=self._train_hooks) - - def _get_eval_spec(self, steps): - if 'eval' not in self._data_hparams: - raise ValueError('`data_hparams` must contain field `eval` for ' - 'evaluation data config.') - input_fn = self._model.get_input_fn( - mode=tf.estimator.ModeKeys.EVAL, - hparams=self._data_hparams['eval']) - return tf.estimator.EvalSpec( - input_fn=input_fn, - steps=steps, - hooks=self._eval_hooks) - - def train(self, max_steps=None): - """Trains the model. See :tf_main:`tf.estimator.Estimator.train - ` for more details. - - Args: - max_steps (int, optional): Total number of steps for which - to train model. If `None`, train forever or until the train - data generates the OutOfRange exception. If OutOfRange occurs - in the middle, training stops before :attr:`max_steps` steps. - """ - train_spec = self._get_train_spec(max_steps=max_steps) - self._estimator.train( - input_fn=train_spec.input_fn, - hooks=train_spec.hooks, - max_steps=train_spec.max_steps) - - def evaluate(self, steps=None, checkpoint_path=None): - """Evaluates the model. See :tf_main:`tf.estimator.Estimator.evaluate - ` for more details. - - Args: - steps (int, optional): Number of steps for which to evaluate - model. If `None`, evaluates until the eval data raises an - OutOfRange exception. - checkpoint_path (str, optional): Path of a specific checkpoint to - evaluate. If `None`, the the latest checkpoint in - :attr:`config.model_dir` is used. If there are no checkpoints - in :attr:`model_dir`, evaluation is run with newly initialized - variables instead of restored from checkpoint. - """ - eval_spec = self._get_eval_spec(steps=steps) - self._estimator.evaluate( - input_fn=eval_spec.input_fn, - steps=eval_spec.steps, - hooks=eval_spec.hooks, - checkpoint_path=checkpoint_path) - - def train_and_evaluate(self, max_train_steps=None, eval_steps=None): - """Trains and evaluates the model. See - :tf_main:`tf.estimator.train_and_evaluate - ` for more details. - - Args: - max_train_steps (int, optional): Total number of steps for which - to train model. If `None`, train forever or until the train - data generates the OutOfRange exception. If OutOfRange occurs - in the middle, training stops before :attr:`max_steps` steps. - eval_steps (int, optional): Number of steps for which to evaluate - model. If `None`, evaluates until the eval data raises an - OutOfRange exception. 
- """ - train_spec = self._get_train_spec(max_steps=max_train_steps) - eval_spec = self._get_eval_spec(steps=eval_steps) - tf.estimator.train_and_evaluate(self._estimator, train_spec, eval_spec) diff --git a/texar/tf/run/executor_test.py b/texar/tf/run/executor_test.py deleted file mode 100644 index beb722df..00000000 --- a/texar/tf/run/executor_test.py +++ /dev/null @@ -1,85 +0,0 @@ -# -*- coding: utf-8 -*- -# -""" -Unit tests for executor. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import tempfile -import shutil - -import tensorflow as tf - -from texar.tf.run.executor import Executor -from texar.tf.models.seq2seq.basic_seq2seq import BasicSeq2seq - - -class ExecutorTest(tf.test.TestCase): - """Tests :class:`texar.tf.run.executor.Executor` - """ - - def setUp(self): - tf.test.TestCase.setUp(self) - - # Create data - vocab_list = ['This', 'is', 'a', 'word', '词'] - vocab_file = tempfile.NamedTemporaryFile() - vocab_file.write('\n'.join(vocab_list).encode("utf-8")) - vocab_file.flush() - self._vocab_file = vocab_file - self._vocab_size = len(vocab_list) - - src_text = ['This is a sentence from source .', '词 词 。 source'] - src_text_file = tempfile.NamedTemporaryFile() - src_text_file.write('\n'.join(src_text).encode("utf-8")) - src_text_file.flush() - self._src_text_file = src_text_file - - tgt_text = ['This is a sentence from target .', '词 词 。 target'] - tgt_text_file = tempfile.NamedTemporaryFile() - tgt_text_file.write('\n'.join(tgt_text).encode("utf-8")) - tgt_text_file.flush() - self._tgt_text_file = tgt_text_file - - self._data_hparams = { - "num_epochs": 20, - "batch_size": 2, - "source_dataset": { - "files": [self._src_text_file.name], - "vocab_file": self._vocab_file.name, - }, - "target_dataset": { - "files": self._tgt_text_file.name, - "vocab_share": True, - } - } - - def test_execute_seq2seq(self): - """Tests running seq2seq with Executor. - """ - seq2seq = BasicSeq2seq(self._data_hparams) - data_hparams = {'train': self._data_hparams, 'eval': self._data_hparams} - - model_dir = tempfile.mkdtemp() - config = tf.estimator.RunConfig( - model_dir=model_dir, - save_summary_steps=10, - save_checkpoints_steps=10, - save_checkpoints_secs=None) - - exor = Executor(model=seq2seq, data_hparams=data_hparams, config=config) - - exor.train_and_evaluate(max_train_steps=20, eval_steps=5) - - exor.train(max_steps=20) - exor.evaluate(steps=5) - - shutil.rmtree(model_dir) - - -if __name__ == "__main__": - tf.test.main() diff --git a/texar/tf/utils/__init__.py b/texar/tf/utils/__init__.py index 7a947e93..5e9385be 100644 --- a/texar/tf/utils/__init__.py +++ b/texar/tf/utils/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,20 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Modules of texar library utils. +Modules of Texar library utils. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=wildcard-import - -from texar.tf.utils.utils import * -from texar.tf.utils.exceptions import * -from texar.tf.utils.shapes import * from texar.tf.utils.dtypes import * -from texar.tf.utils.variables import * from texar.tf.utils.mode import * -from texar.tf.utils.average_recorder import * +from texar.tf.utils.shapes import * +from texar.tf.utils.transformer_attentions import * +from texar.tf.utils.transformer_utils import * +from texar.tf.utils.utils import * from texar.tf.utils.utils_io import * diff --git a/texar/tf/utils/average_recorder.py b/texar/tf/utils/average_recorder.py deleted file mode 100644 index 05fd9a91..00000000 --- a/texar/tf/utils/average_recorder.py +++ /dev/null @@ -1,321 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Utilities for maintaining moving average. -""" - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -from collections import deque - -# pylint: disable=invalid-name - -__all__ = [ - "_SingleAverageRecorder", - "AverageRecorder" -] - - -class _SingleAverageRecorder(object): - """Maintains the moving average (i.e., the average of the latest N records) - of a single metric. - - Args: - size (int, optional): The window size of moving average. If `None`, - the average of all added records is maintained. - name (str, optional): name of the recorder. Used when printing. - """ - - def __init__(self, size=None, name=None): - if size is not None and size <= 0: - raise ValueError("`size` must be > 0 or `None`.") - self._size = size - self._q = deque([]) - self._w = deque([]) - self._sum = 0. - self._w_sum = 0 - self._name = name - - def add(self, record, weight=None): - """Appends a new record. - - Args: - record: A scalar; the new record to append. - weight (optional): A scalar, weight of the new record for - calculating a weighted average. If `None`, weight is set to `1`. - For example, :attr:`weight` can be set to batch size and - :attr:`record` the average value of certain metric on the batch - in order to calculate the average metric value on a whole - dataset. - - Returns: - The (moving) average after appending the record. - """ - w = weight if weight is not None else 1 - self._w_sum += w - self._sum += record * w - - if self._size is not None: - if len(self._q) == self._size: - w_pop = self._w.popleft() - self._sum -= self._q.popleft() * w_pop - self._w_sum -= w_pop - self._q.append(record) - self._w.append(w) - - return self.avg() - - def avg(self): - """Returns the (moving) average. - """ - if self._w_sum == 0: - return 0. - return self._sum / self._w_sum - - def reset(self): - """Cleans all records. - """ - self._q.clear() - self._w.clear() - self._sum = 0. - self._w_sum = 0 - - def to_str(self, precision=None): - """Returns a string of the average value. 
-
-        Args:
-            precision (int, optional): The number of decimal places to keep in
-                the returned string. E.g., for an average value of `0.1234`,
-                :attr:`precision = 2` leads to `'0.12'`.
-
-        Returns:
-            A string of the average value. If :attr:`name` is given, the
-            string is of a format like `'name: 0.1234'`, otherwise
-            the string is of a format like `'0.1234'`.
-        """
-        prec_str = "{}"
-        if precision is not None:
-            prec_str = "{:.%df}" % precision
-
-        avg_str = prec_str.format(self.avg())
-        if self._name is not None:
-            avg_str = "{}: {}".format(self._name, avg_str)
-
-        return avg_str
-
-    @property
-    def name(self):
-        """The name of the recorder.
-        """
-        return self._name
-
-
-class AverageRecorder(object):
-    """Maintains the moving averages (i.e., the average of the latest N
-    records) of (possibly multiple) fields.
-
-    Fields are determined by the first call of :meth:`add`.
-
-    Args:
-        size (int, optional): The window size of moving average. If `None`,
-            the average of all added records is maintained.
-
-    Example:
-
-        .. code-block:: python
-
-            ## Use to maintain moving average of training loss
-            avg_rec = AverageRecorder(size=10) # average over latest 10 records
-            while training:
-                loss_0, loss_1 = ...
-                avg_rec.add([loss_0, loss_1])
-                # avg_rec.avg() == [0.12343452, 0.567800323]
-                # avg_rec.avg(0) == 0.12343452
-                # avg_rec.to_str(precision=2) == '0.12 0.57'
-
-            ## Use to maintain average of test metrics on the whole test set
-            avg_rec = AverageRecorder() # average over ALL records
-            while test:
-                metric_0, metric_1 = ...
-                avg_rec.add({'m0': metric_0, 'm1': metric_1}) # dict is allowed
-            print(avg_rec.to_str(precision=4, delimiter=' , '))
-            # 'm0: 0.1234 , m1: 0.5678'
-            #
-            # avg_rec.avg() == {'m0': 0.12343452, 'm1': 0.567800323}
-            # avg_rec.avg(0) == 0.12343452
-
-    """
-
-    def __init__(self, size=None):
-        if size is not None and size <= 0:
-            raise ValueError("`size` must be > 0 or `None`.")
-        self._size = size
-        self._recorders = None
-        self._default_metric_name = "metric"
-        self._record_type = None
-
-    def _to_dict(self, record):
-        if isinstance(record, dict):
-            record_dict = record
-        elif isinstance(record, (list, tuple)):
-            record_dict = {i: vi for i, vi in enumerate(record)}
-        else:
-            record_dict = {self._default_metric_name: record}
-        return record_dict
-
-    def add(self, record, weight=None):
-        """Appends a new record.
-
-        :attr:`record` can be a `list`, `dict`, or a single scalar. The
-        record type is determined the first time :meth:`add` is called.
-        All subsequent calls to :meth:`add` must have the same type of
-        :attr:`record`.
-
-        :attr:`record` in subsequent calls to :meth:`add` can contain only
-        a subset of the fields given in the first call to :meth:`add`.
-
-        Example:
-
-            .. code-block:: python
-
-                recorder.add({'1': 0.2, '2': 0.2}) # 1st call to `add`
-                x = recorder.add({'1': 0.4}) # 2nd call to `add`
-                # x == {'1': 0.3, '2': 0.2}
-
-        Args:
-            record: A single scalar, a list of scalars, or a dict of scalars.
-            weight (optional): A scalar, weight of the new record for
-                calculating a weighted average. If `None`, weight is set to
-                `1`. For example, :attr:`weight` can be set to batch size and
-                :attr:`record` the average value of certain metrics on the
-                batch in order to calculate the average metric values on a
-                whole dataset.
-
-        Returns:
-            The (moving) average after appending the record, with the same
-            type as :attr:`record`.
- """ - if self._record_type is None: - self._record_type = type(record) - elif self._record_type != type(record): - raise ValueError('The type of `record` is not consistent. ' - 'Expect type `{}`'.format(self._record_type)) - - record_dict = self._to_dict(record) - if self._recorders is None: - self._recorders = { - name: _SingleAverageRecorder( - self._size, name if self._record_type == dict else None) - for name in record_dict.keys() - } - - for name, val in record_dict.items(): - self._recorders[name].add(val, weight=weight) - - return self.avg() - - def avg(self, id_or_name=None): - """Returns the (moving) average. - - Args: - id_or_name (optional): A list of or a single element. - Each element is the index (if the record type is `list`) or - name (if the record type is `dict`) of the field for which - the average is calculated. If not given, the average of all - fields are returned. - - Returns: - The average value(s). If :attr:`id_or_name` is a single element - (not a list), then returns the average value of the corresponding - field. Otherwise, if :attr:`id_or_name` is a list of element(s), - then returns average value(s) in the same type as :attr:`record` - of :meth:`add`. - """ - if self._recorders is None: - return 0. - - keys = id_or_name - if keys is None: - keys = list(self._recorders.keys()) - - if not isinstance(keys, (list, tuple)): - return self._recorders[keys].avg() - - avg = {key: self._recorders[key].avg() for key in keys} - if self._record_type in {list, tuple}: - ret_avg = [] - for k, v in avg.items(): - if k in keys: - ret_avg.append(v) - return self._record_type(ret_avg) - elif self._record_type == dict: - return avg - else: - return avg[self._default_metric_name] - - def reset(self, id_or_name=None): - """Resets the record. - - Args: - id_or_name (optional): A list or a single element. Each element is - the index (if the record type is `list`) or name (if the - record type is `dict`) of the field to reset. - If `None`, all fields are reset. - """ - keys = id_or_name - if keys is None: - keys = list(self._recorders.keys()) - elif not isinstance(keys, (list, tuple)): - keys = [keys] - - for key in keys: - self._recorders[key].reset() - - def to_str(self, precision=None, delimiter=' '): - """Returns a string of the average values of the records. - - Args: - precision (int, optional): The number of decimal places to keep in - the returned string. E.g., for an average value of `0.1234`, - :attr:`precision = 2` leads to `'0.12'`. - delimiter (str): The delimiter string that separates between - fields. - - Returns: - A string of the average values. - - If record is of type `dict`, the string is a concatenation of - 'field_name: average_value', delimited with :attr:`delimiter`. - E.g., `'field_name_1: 0.1234 field_name_2: 0.5678 ...'`. - - Otherwise, the string is of a concatenation of 'average_value'. 
-            E.g., `'0.1234 0.5678 ...'`
-        """
-        strs = {name: rec.to_str(precision=precision)
-                for name, rec in self._recorders.items()}
-        str_list = []
-        if self._record_type in {list, tuple}:
-            for i in range(len(strs)):
-                # Enumerates the keys in order, which are the indexes
-                str_list.append(strs[i])
-        elif self._record_type == dict:
-            str_list = list(strs.values())
-        else:
-            str_list = [strs[self._default_metric_name]]
-
-        avg_str = delimiter.join(str_list)
-
-        return avg_str
diff --git a/texar/tf/utils/average_recorder_test.py b/texar/tf/utils/average_recorder_test.py
deleted file mode 100644
index c01b4d9f..00000000
--- a/texar/tf/utils/average_recorder_test.py
+++ /dev/null
@@ -1,72 +0,0 @@
-"""
-Unit tests for the average recorder.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from texar.tf.utils.average_recorder import _SingleAverageRecorder, AverageRecorder
-
-
-class AverageRecorderTest(tf.test.TestCase):
-    """Tests the average recorder.
-    """
-
-    def test_single_average_recoder(self):
-        """Tests :class:`~texar.tf.utils._SingleAverageRecorder`
-        """
-        recoder = _SingleAverageRecorder(5)
-        for i in range(100):
-            self.assertEqual(recoder.add(1), 1.)
-            self.assertEqual(recoder.avg(), 1.)
-
-        recoder = _SingleAverageRecorder()
-        for i in range(100):
-            self.assertEqual(recoder.add(1), 1.)
-            self.assertEqual(recoder.avg(), 1.)
-
-        def _cal_ground_truth(n):
-            """Calculates ((n-4)^2 + ... + n^2) / ((n-4) + ... + n)
-            """
-            lb = max(n - 4, 0)
-            _sum = 0
-            _w = 0
-            for i in range(lb, n + 1):
-                _sum += i * i
-                _w += i
-            if _w == 0:
-                return 0
-            return _sum / _w
-
-        recoder = _SingleAverageRecorder(5)
-        for i in range(100):
-            self.assertEqual(recoder.add(i, i), _cal_ground_truth(i))
-            self.assertEqual(recoder.avg(), _cal_ground_truth(i))
-
-    def test_average_recorder(self):
-        """Tests :class:`~texar.tf.utils.AverageRecorder`
-        """
-        recorder = AverageRecorder(5)
-        for i in range(100):
-            self.assertEqual(recorder.add([1., 2.]), [1., 2.])
-            self.assertEqual(recorder.add([1.]), [1., 2.])
-            self.assertEqual(recorder.avg(), [1., 2.])
-            self.assertEqual(recorder.avg(0), 1.)
-            self.assertEqual(recorder.avg(1), 2.)
-            self.assertEqual(recorder.avg([0, 1]), [1., 2.])
-
-        recorder = AverageRecorder()
-        for i in range(100):
-            self.assertEqual(recorder.add({'1': 1, '2': 2}), {'1': 1., '2': 2.})
-            self.assertEqual(recorder.add({'1': 1}), {'1': 1., '2': 2.})
-            self.assertEqual(recorder.avg(), {'1': 1., '2': 2.})
-            self.assertEqual(recorder.avg('1'), 1.)
-            self.assertEqual(recorder.avg('2'), 2.)
-            self.assertEqual(recorder.avg(['1', '2']), {'1': 1., '2': 2.})
-
-
-if __name__ == "__main__":
-    tf.test.main()
diff --git a/texar/tf/utils/beam_search.py b/texar/tf/utils/beam_search.py
deleted file mode 100644
index 112abdbc..00000000
--- a/texar/tf/utils/beam_search.py
+++ /dev/null
@@ -1,606 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Tensor2Tensor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Modifications copyright (C) 2019 Texar
-# ==============================================================================
-"""
-Implementation of beam search with penalties.
-Adapted from the tensor2tensor repository.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from tensorflow.python.util import nest
-from texar.tf.utils.shapes import shape_list
-
-# Default value for INF
-INF = 1. * 1e7
-
-
-def _merge_beam_dim(tensor):
-    """Reshapes the first two dimensions into a single dimension.
-
-    Args:
-        tensor: Tensor to reshape of shape [A, B, ...]
-
-    Returns:
-        Reshaped tensor of shape [A*B, ...]
-    """
-    if not isinstance(tensor, tf.Tensor):
-        return tensor
-    shape = shape_list(tensor)
-    shape[0] *= shape[1]  # batch -> batch * beam_size
-    shape.pop(1)  # Remove beam dim
-    return tf.reshape(tensor, shape)
-
-
-def _unmerge_beam_dim(tensor, batch_size, beam_size):
-    """Reshapes the first dimension back to [batch_size, beam_size].
-
-    Args:
-        tensor: Tensor to reshape of shape [batch_size*beam_size, ...]
-        batch_size: Tensor, original batch size.
-        beam_size: int, original beam size.
-
-    Returns:
-        Reshaped tensor of shape [batch_size, beam_size, ...]
-    """
-    if not isinstance(tensor, tf.Tensor):
-        return tensor
-    shape = shape_list(tensor)
-    new_shape = [batch_size] + [beam_size] + shape[1:]
-    return tf.reshape(tensor, new_shape)
-
-
-def _expand_to_beam_size(tensor, beam_size):
-    """Tiles a given tensor by beam_size.
-
-    Args:
-        tensor: tensor to tile [batch_size, ...]
-        beam_size: How much to tile the tensor by.
-
-    Returns:
-        Tiled tensor [batch_size, beam_size, ...]
-    """
-    if not isinstance(tensor, tf.Tensor):
-        return tensor
-    tensor = tf.expand_dims(tensor, axis=1)
-    tile_dims = [1] * tensor.shape.ndims
-    tile_dims[1] = beam_size
-
-    return tf.tile(tensor, tile_dims)
-
-
-def get_state_shape_invariants(tensor):
-    """Returns the shape of the tensor but sets middle dims to None."""
-    shape = tensor.shape.as_list()
-    for i in range(1, len(shape) - 1):
-        shape[i] = None
-    return tf.TensorShape(shape)
-
-
-def log_prob_from_logits(logits):
-    return logits - tf.reduce_logsumexp(logits, axis=-1, keepdims=True)
-
-
-def compute_batch_indices(batch_size, beam_size):
-    """Computes the i-th coordinate that contains the batch index for
-    gathers.
-
-    Batch pos is a tensor like [[0,0,0,0],[1,1,1,1],...]. It says which
-    batch the beam item is in. This will create the i of the i,j coordinate
-    needed for the gather.
-
-    Args:
-        batch_size: Batch size
-        beam_size: Size of the beam.
-
-    Returns:
-        batch_pos: [batch_size, beam_size] tensor of ids
-    """
-    batch_pos = tf.range(batch_size * beam_size) // beam_size
-    batch_pos = tf.reshape(batch_pos, [batch_size, beam_size])
-    return batch_pos
-
-
-def compute_topk_scores_and_seq(sequences, scores, scores_to_gather, flags,
-                                beam_size, batch_size, prefix="default",
-                                states_to_gather=None):
-    """Given sequences and scores, will gather the top k=beam_size
-    sequences.
-
-    This function is used to grow alive and finished. It takes sequences,
-    scores, and flags, and returns the top k from sequences,
-    scores_to_gather, and flags based on the values in scores.
-
-    This method permits easy introspection using tfdbg. It adds three
-    named ops that are prefixed by `prefix`:
-        - _topk_seq: the tensor for topk_seq returned by this method.
-        - _topk_flags: the tensor for topk_finished_flags returned by this
-          method.
-        - _topk_scores: the tensor for topk_gathered_scores returned by
-          this method.
-
-    Args:
-        sequences: Tensor of sequences that we need to gather from.
-            [batch_size, beam_size, seq_length]
-        scores: Tensor of scores for each sequence in sequences.
-            [batch_size, beam_size]. We will use these to compute the topk.
-        scores_to_gather: Tensor of scores for each sequence in sequences.
-            [batch_size, beam_size]. We will return the gathered scores
-            from here.
-            Scores to gather is different from scores because for
-            grow_alive, we will need to return log_probs, while for
-            grow_finished, we will need to return the length penalized
-            scores.
-        flags: Tensor of bools for sequences that say whether a sequence
-            has reached EOS or not
-        beam_size: int
-        batch_size: int
-        prefix: string that will prefix unique names for the ops run.
-        states_to_gather: dict (possibly nested) of decoding states.
-
-    Returns:
-        Tuple of
-        (topk_seq [batch_size, beam_size, decode_length],
-         topk_gathered_scores [batch_size, beam_size],
-         topk_finished_flags [batch_size, beam_size])
-    """
-    _, topk_indexes = tf.nn.top_k(scores, k=beam_size)
-    # The next three steps are to create coordinates for tf.gather_nd to
-    # pull out the topk sequences from sequences based on scores.
-    # batch pos is a tensor like [[0,0,0,0],[1,1,1,1],...]. It says which
-    # batch the beam item is in. This will create the i of the i,j
-    # coordinate needed for the gather
-    batch_pos = compute_batch_indices(batch_size, beam_size)
-
-    # top coordinates will give us the actual coordinates to do the gather.
-    # stacking will create a tensor of dimension batch * beam * 2, where
-    # the last dimension contains the i,j gathering coordinates.
-    top_coordinates = tf.stack([batch_pos, topk_indexes], axis=2)
-
-    # Gather up the highest scoring sequences. For each operation
-    # added, give it a concrete name to simplify observing these
-    # operations with tfdbg. Clients can capture these tensors by watching
-    # these node names.
-    def gather(tensor, name):
-        if not isinstance(tensor, tf.Tensor):
-            return tensor
-        return tf.gather_nd(tensor, top_coordinates, name=(prefix + name))
-    topk_seq = gather(sequences, "_topk_seq")
-    topk_flags = gather(flags, "_topk_flags")
-    topk_gathered_scores = gather(scores_to_gather, "_topk_scores")
-    if states_to_gather:
-        topk_gathered_states = nest.map_structure(
-            lambda state: gather(state, "_topk_states"), states_to_gather)
-    else:
-        topk_gathered_states = states_to_gather
-    return topk_seq, topk_gathered_scores, topk_flags, topk_gathered_states
-
-
-def beam_search(symbols_to_logits_fn,
-                initial_ids,
-                beam_size,
-                decode_length,
-                vocab_size,
-                alpha,
-                eos_id,
-                states=None,
-                stop_early=True):
-    """Beam search with length penalties.
-
-    Requires a function that can take the currently decoded symbols and
-    return the logits for the next symbol. The implementation is inspired
-    by https://arxiv.org/abs/1609.08144.
-
-    When running, the beam search steps can be visualized by using tfdbg to
-    watch the operations generating the output ids for each beam step.
-    These operations have the pattern:
-        (alive|finished)_topk_(seq,scores)
-
-    Operations marked `alive` represent the new beam sequences that will be
-    processed in the next step. Operations marked `finished` represent
-    the completed beam sequences, which may be padded with 0s if no beams
-    finished.
-
-    Operations marked `seq` store the full beam sequence for the time step.
-    Operations marked `scores` store the sequence's final log scores.
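A minimal sketch of the calling convention described in this docstring (the toy `symbols_to_logits_fn` returns uniform logits and ignores the decoded prefix; all sizes are illustrative, and the call follows the TF1-style API being removed here):

.. code-block:: python

    import tensorflow as tf

    batch_size, vocab_size, beam_size = 2, 6, 4

    def symbols_to_logits_fn(decoded_ids):
        # Toy model: uniform logits regardless of the prefix.
        return tf.zeros([tf.shape(decoded_ids)[0], vocab_size])

    initial_ids = tf.zeros([batch_size], dtype=tf.int32)
    seqs, scores = beam_search(
        symbols_to_logits_fn, initial_ids,
        beam_size=beam_size, decode_length=5,
        vocab_size=vocab_size, alpha=0.6, eos_id=1)
    # seqs: [batch_size, beam_size, <= decode_length + 1]
    # scores: [batch_size, beam_size]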
-
-    The beam search steps will be processed sequentially in order, so when
-    capturing the tensors observed from these operations, clients can make
-    assumptions about which step is being recorded.
-
-    WARNING: Assumes the 2nd dimension of tensors in `states` is not
-    invariant; this means that the shape of the 2nd dimension of these
-    tensors will not be available (i.e. set to None) inside
-    symbols_to_logits_fn.
-
-    Args:
-        symbols_to_logits_fn: Interface to the model, to provide logits.
-            Should take [batch_size, decoded_ids] and return
-            [batch_size, vocab_size]
-        initial_ids: Ids to start off the decoding, this will be the first
-            thing handed to symbols_to_logits_fn (after expanding to beam
-            size) [batch_size]
-        beam_size: Size of the beam.
-        decode_length: Number of steps to decode for.
-        vocab_size: Size of the vocab, must equal the size of the logits
-            returned by symbols_to_logits_fn
-        alpha: alpha for length penalty.
-        states: dict (possibly nested) of decoding states.
-        eos_id: ID for end of sentence.
-        stop_early: a boolean - stop once the best sequence is provably
-            determined.
-
-    Returns:
-        Tuple of
-        (decoded beams [batch_size, beam_size, decode_length]
-         decoding probabilities [batch_size, beam_size])
-    """
-    batch_size = shape_list(initial_ids)[0]
-
-    # Assume initial_ids are prob 1.0
-    initial_log_probs = tf.constant([[0.] + [-float("inf")] * (
-        beam_size - 1)])
-    # Expand to beam_size (batch_size, beam_size)
-    alive_log_probs = tf.tile(initial_log_probs, [batch_size, 1])
-
-    # Expand each batch and state to beam_size
-    alive_seq = _expand_to_beam_size(initial_ids, beam_size)
-    alive_seq = tf.expand_dims(alive_seq, axis=2)
-
-    # (batch_size, beam_size, 1)
-    if states:
-        states = nest.map_structure(
-            lambda state: _expand_to_beam_size(state, beam_size), states)
-    else:
-        states = {}
-
-    # Finished will keep track of all the sequences that have finished so
-    # far
-    # Finished log probs will be negative infinity in the beginning
-    # finished_flags will keep track of booleans
-    finished_seq = tf.zeros(shape_list(alive_seq), tf.int32)
-    # Setting the scores of the initial to negative infinity.
-    finished_scores = tf.ones([batch_size, beam_size]) * -INF
-    finished_flags = tf.zeros([batch_size, beam_size], tf.bool)
-
-    def grow_finished(finished_seq, finished_scores, finished_flags,
-                      curr_seq, curr_scores, curr_finished):
-        """Given sequences and scores, will gather the top k=beam_size
-        sequences.
-
-        Args:
-            finished_seq: Current finished sequences.
-                [batch_size, beam_size, current_decoded_length]
-            finished_scores: scores for each of these sequences.
-                [batch_size, beam_size]
-            finished_flags: finished bools for each of these sequences.
-                [batch_size, beam_size]
-            curr_seq: current topk sequence that has been grown by one
-                position.
-                [batch_size, beam_size, current_decoded_length]
-            curr_scores: scores for each of these sequences. [batch_size,
-                beam_size]
-            curr_finished: Finished flags for each of these sequences.
-                [batch_size, beam_size]
-
-        Returns:
-            Tuple of
-            (Topk sequences based on scores,
-             log probs of these sequences,
-             Finished flags of these sequences)
-        """
-        # First append a column of 0-ids to finished_seq to make it the
-        # same length as curr_seq
-        finished_seq = tf.concat(
-            [finished_seq,
-             tf.zeros([batch_size, beam_size, 1], tf.int32)], axis=2)
-
-        # Set the scores of the unfinished seq in curr_seq to large
-        # negative values
-        curr_scores += (1. -
-                        tf.cast(curr_finished, tf.float32)) * -INF
-        # concatenating the sequences and scores along beam axis
-        curr_finished_seq = tf.concat([finished_seq, curr_seq], axis=1)
-        curr_finished_scores = tf.concat([finished_scores, curr_scores],
-                                         axis=1)
-        curr_finished_flags = tf.concat([finished_flags, curr_finished],
-                                        axis=1)
-        return compute_topk_scores_and_seq(
-            curr_finished_seq, curr_finished_scores, curr_finished_scores,
-            curr_finished_flags, beam_size, batch_size,
-            "grow_finished")
-
-    def grow_alive(curr_seq, curr_scores, curr_log_probs, curr_finished,
-                   states):
-        """Given sequences and scores, will gather the top k=beam_size
-        sequences.
-
-        Args:
-            curr_seq: current topk sequence that has been grown by one
-                position.
-                [batch_size, beam_size, i+1]
-            curr_scores: scores for each of these sequences. [batch_size,
-                beam_size]
-            curr_log_probs: log probs for each of these sequences.
-                [batch_size, beam_size]
-            curr_finished: Finished flags for each of these sequences.
-                [batch_size, beam_size]
-            states: dict (possibly nested) of decoding states.
-
-        Returns:
-            Tuple of
-            (Topk sequences based on scores,
-             log probs of these sequences,
-             Finished flags of these sequences)
-        """
-        # Set the scores of the finished seq in curr_seq to large negative
-        # values
-        curr_scores += tf.cast(curr_finished, tf.float32) * -INF
-        return compute_topk_scores_and_seq(curr_seq, curr_scores,
-                                           curr_log_probs, curr_finished,
-                                           beam_size, batch_size,
-                                           "grow_alive", states)
-
-    def grow_topk(i, alive_seq, alive_log_probs, states):
-        r"""Inner beam search loop.
-
-        This function takes the current alive sequences, and grows them to
-        topk sequences where k = 2*beam. We use 2*beam because we could
-        have beam_size number of sequences that might hit EOS, and then
-        there would be no alive sequences to continue. With 2*beam_size,
-        this will not happen. This relies on the assumption that the vocab
-        size is > beam size. If this is true, we'll have at least beam_size
-        non-EOS extensions if we extract the next top 2*beam words.
-        Length penalty is given by = ((5 + len(decode)) / 6) ^ -\alpha.
-        Please refer to https://arxiv.org/abs/1609.08144.
-
-        Args:
-            i: loop index
-            alive_seq: Topk sequences decoded so far [batch_size,
-                beam_size, i+1]
-            alive_log_probs: probabilities of these sequences.
-                [batch_size, beam_size]
-            states: dict (possibly nested) of decoding states.
-
-        Returns:
-            Tuple of
-            (Topk sequences extended by the next word,
-             The log probs of these sequences,
-             The scores with length penalty of these sequences,
-             Flags indicating which of these sequences have finished
-             decoding, dict of transformed decoding states)
-        """
-        # Get the logits for all the possible next symbols
-        flat_ids = tf.reshape(alive_seq, [batch_size * beam_size, -1])
-
-        # (batch_size * beam_size, decoded_length)
-        if states:
-            flat_states = nest.map_structure(_merge_beam_dim, states)
-            flat_logits, flat_states = symbols_to_logits_fn(flat_ids, i,
                                                            flat_states)
-            states = nest.map_structure(
-                lambda t: _unmerge_beam_dim(t, batch_size, beam_size),
-                flat_states)
-        else:
-            flat_logits = symbols_to_logits_fn(flat_ids)
-        logits = tf.reshape(flat_logits, [batch_size, beam_size, -1])
-
-        # Convert logits to normalized log probs
-        candidate_log_probs = log_prob_from_logits(logits)
-
-        # Multiply the probabilities by the current probabilities of the
-        # beam.
-        # (batch_size, beam_size, vocab_size) + (batch_size, beam_size, 1)
-        log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs,
-                                                         axis=2)
-        i_p = tf.cast(i + 1, tf.float32)
-        length_penalty = tf.pow(((5. + i_p) / 6.), alpha)
-
-        curr_scores = log_probs / length_penalty
-        # Flatten out (beam_size, vocab_size) probs into a list of
-        # possibilities
-        flat_curr_scores = tf.reshape(curr_scores,
-                                      [-1, beam_size * vocab_size])
-
-        topk_scores, topk_ids = tf.nn.top_k(flat_curr_scores,
-                                            k=beam_size * 2)
-
-        # Recovering the log probs because we will need to send them back
-        topk_log_probs = topk_scores * length_penalty
-
-        # Work out what beam the top probs are in.
-        topk_beam_index = topk_ids // vocab_size
-        topk_ids %= vocab_size  # Unflatten the ids
-
-        # The next three steps are to create coordinates for tf.gather_nd
-        # to pull out the correct sequences from the ids that we need to
-        # grow. We will also use the coordinates to gather the booleans of
-        # the beam items that survived.
-        batch_pos = compute_batch_indices(batch_size, beam_size * 2)
-
-        # top beams will give us the actual coordinates to do the gather.
-        # stacking will create a tensor of dimension batch * beam * 2,
-        # where the last dimension contains the i,j gathering coordinates.
-        topk_coordinates = tf.stack([batch_pos, topk_beam_index], axis=2)
-
-        # Gather up the most probable 2*beams both for the ids and
-        # finished_in_alive bools
-        topk_seq = tf.gather_nd(alive_seq, topk_coordinates)
-        if states:
-            states = nest.map_structure(
-                lambda state: tf.gather_nd(state, topk_coordinates), states)
-
-        # Append the most probable alive
-        topk_seq = tf.concat([topk_seq, tf.expand_dims(topk_ids, axis=2)],
-                             axis=2)
-
-        topk_finished = tf.equal(topk_ids, eos_id)
-
-        return topk_seq, topk_log_probs, topk_scores, topk_finished, states
-
-    def inner_loop(i, alive_seq, alive_log_probs, finished_seq,
-                   finished_scores, finished_flags, states):
-        """Inner beam search loop.
-
-        There are three groups of tensors: alive, finished, and topk.
-        The alive group contains information about the current alive
-        sequences. The topk group contains information about alive + topk
-        current decoded words. The finished group contains information
-        about finished sentences, that is, the ones that have decoded to
-        EOS. These are what we return.
-        The general beam search algorithm is as follows:
-        While we haven't terminated (please see the termination condition):
-            1. Grow the current alive to get beam*2 topk sequences
-            2. Among the topk, keep the top beam_size ones that haven't
-               reached EOS in alive
-            3. Among the topk, keep the top beam_size ones that have
-               reached EOS in finished
-        Repeat
-        To make things simple with using fixed size tensors, we will end
-        up inserting unfinished sequences into finished in the beginning.
-        To stop that, we add negative INF to the score of the unfinished
-        sequences so that when a true finished sequence does appear, it
-        will have a higher score than all the unfinished ones.
-
-        Args:
-            i: loop index
-            alive_seq: Topk sequences decoded so far [batch_size,
-                beam_size, i+1]
-            alive_log_probs: probabilities of the beams. [batch_size,
-                beam_size]
-            finished_seq: Current finished sequences.
-                [batch_size, beam_size, i+1]
-            finished_scores: scores for each of these sequences.
-                [batch_size, beam_size]
-            finished_flags: finished bools for each of these sequences.
-                [batch_size, beam_size]
-            states: dict (possibly nested) of decoding states.
-
-        Returns:
-            Tuple of
-            (Incremented loop index,
-             New alive sequences,
-             Log probs of the alive sequences,
-             New finished sequences,
-             Scores of the new finished sequences,
-             Flags indicating which sequences in finished have reached
-             EOS,
-             dict of final decoding states)
-        """
-
-        # Each inner loop, we carry out three steps:
-        # 1. Get the current topk items.
-        # 2. Extract the ones that have finished and haven't finished
-        # 3. Recompute the contents of finished based on scores.
-        topk_seq, topk_log_probs, topk_scores, topk_finished, states =\
-            grow_topk(i, alive_seq, alive_log_probs, states)
-        alive_seq, alive_log_probs, _, states = grow_alive(
-            topk_seq, topk_scores, topk_log_probs, topk_finished, states)
-        finished_seq, finished_scores, finished_flags, _ = grow_finished(
-            finished_seq, finished_scores, finished_flags, topk_seq,
-            topk_scores, topk_finished)
-
-        return (i + 1, alive_seq, alive_log_probs, finished_seq,
-                finished_scores, finished_flags, states)
-
-    def _is_finished(i, unused_alive_seq, alive_log_probs,
-                     unused_finished_seq, finished_scores,
-                     finished_in_finished, unused_states):
-        """Checks the termination condition.
-
-        We terminate when we have decoded up to decode_length or the
-        lowest-scoring item in finished has a greater score than the
-        highest-probability item in alive divided by the max length
-        penalty.
-
-        Args:
-            i: loop index
-            alive_log_probs: probabilities of the beams. [batch_size,
-                beam_size]
-            finished_scores: scores for each of these sequences.
-                [batch_size, beam_size]
-            finished_in_finished: finished bools for each of these
-                sequences. [batch_size, beam_size]
-
-        Returns:
-            Bool.
-        """
-        if not stop_early:
-            return tf.less(i, decode_length)
-        max_length_penalty = tf.pow(
-            ((5. + tf.cast(decode_length, tf.float32)) / 6.), alpha)
-        # The best possible score of the most likely alive sequence
-        lower_bound_alive_scores = alive_log_probs[:, 0] /\
-            max_length_penalty
-
-        # Now to compute the lowest score of a finished sequence in
-        # finished
-        # If the sequence isn't finished, we multiply its score by 0.
-        # Since scores are all negative, taking the min will give us the
-        # score of the lowest finished item.
-        lowest_score_of_finished_in_finished = tf.reduce_min(
-            finished_scores * tf.cast(finished_in_finished, tf.float32),
-            axis=1)
-        # If none of the sequences have finished, then the min will be 0,
-        # and we have to replace it by negative INF if it is. The score of
-        # any seq in alive will be much higher than negative INF and the
-        # termination condition will not be met.
-        lowest_score_of_finished_in_finished += (
-            (1. -
-             tf.cast(tf.reduce_any(finished_in_finished,
-                                   1), tf.float32)) * -INF)
-
-        bound_is_met = tf.reduce_all(
-            tf.greater(lowest_score_of_finished_in_finished,
-                       lower_bound_alive_scores))
-
-        return tf.logical_and(
-            tf.less(i, decode_length), tf.logical_not(bound_is_met))
-
-    (_, alive_seq, alive_log_probs, finished_seq, finished_scores,
-     finished_flags, _) = tf.while_loop(
-        _is_finished,
-        inner_loop, [
-            tf.constant(0), alive_seq, alive_log_probs, finished_seq,
-            finished_scores, finished_flags, states
-        ],
-        shape_invariants=[
-            tf.TensorShape([]),
-            tf.TensorShape([None, None, None]),
-            alive_log_probs.get_shape(),
-            tf.TensorShape([None, None, None]),
-            finished_scores.get_shape(),
-            finished_flags.get_shape(),
-            nest.map_structure(get_state_shape_invariants, states),
-        ],
-        parallel_iterations=1,
-        back_prop=False)
-
-    alive_seq.set_shape((None, beam_size, None))
-    finished_seq.set_shape((None, beam_size, None))
-
-    # Accounting for corner case: It's possible that no sequence in alive
-    # for a particular batch item ever reached EOS. In that case, we
-    # should just copy the contents of alive for that batch item. If
-    # tf.reduce_any(finished_flags, 1) is 0, no sequence for that batch
-    # index had reached EOS. We need to do the same for the scores as
-    # well.
-    finished_seq = tf.where(
-        tf.reduce_any(finished_flags, 1), finished_seq, alive_seq)
-    finished_scores = tf.where(
-        tf.reduce_any(finished_flags, 1), finished_scores, alive_log_probs)
-    return finished_seq, finished_scores
diff --git a/texar/tf/utils/dtypes.py b/texar/tf/utils/dtypes.py
index e4b3b325..f27903e1 100644
--- a/texar/tf/utils/dtypes.py
+++ b/texar/tf/utils/dtypes.py
@@ -1,4 +1,4 @@
-# Copyright 2018 The Texar Authors. All Rights Reserved.
+# Copyright 2019 The Texar Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,14 +15,6 @@
 Utility functions related to data types.
 """
 
-from __future__ import absolute_import
-from __future__ import print_function
-from __future__ import division
-from __future__ import unicode_literals
-
-# pylint: disable=invalid-name, no-member, protected-access
-
-import six
 import numpy as np
 
 import tensorflow as tf
@@ -31,14 +23,12 @@
     "get_tf_dtype",
     "is_callable",
     "is_str",
-    "is_placeholder",
-    "maybe_hparams_to_dict",
-    "compat_as_text"
+    "compat_as_text",
 ]
 
 
-def get_tf_dtype(dtype):  # pylint: disable=too-many-return-statements
-    """Returns equivalent tf dtype.
+def get_tf_dtype(dtype):
+    r"""Returns equivalent tf dtype.
 
     Args:
         dtype: A str, python numeric or string type, numpy data type, or
@@ -75,42 +65,16 @@ def get_tf_dtype(dtype):
 
 
 def is_callable(x):
-    """Return `True` if :attr:`x` is callable.
+    r"""Return `True` if :attr:`x` is callable.
     """
-    try:
-        _is_callable = callable(x)
-    except BaseException:  # pylint: disable=bare-except
-        _is_callable = hasattr(x, '__call__')
-    return _is_callable
+    return callable(x)
 
 
 def is_str(x):
-    """Returns `True` if :attr:`x` is either a str or unicode. Returns `False`
-    otherwise.
-    """
-    return isinstance(x, six.string_types)
-
-
-def is_placeholder(x):
-    """Returns `True` if :attr:`x` is a :tf_main:`tf.placeholder `
-    or :tf_main:`tf.placeholder_with_default `.
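The slimmed-down `dtypes` helpers above keep the Python 3-only contract; a tiny illustrative sketch (behavior follows directly from the one-line implementations shown in this hunk):

.. code-block:: python

    from texar.tf.utils.dtypes import is_callable, is_str

    assert is_str("tokens")           # Python 3 str
    assert not is_str(b"bytes")       # bytes are not str
    assert is_callable(lambda x: x)   # thin wrapper over callable()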
- """ - try: - return x._ops.type in ['Placeholder', 'PlaceholderWithDefault'] - except BaseException: # pylint: disable=bare-except - return False - - -def maybe_hparams_to_dict(hparams): - """If :attr:`hparams` is an instance of :class:`~texar.tf.HParams`, - converts it to a `dict` and returns. If :attr:`hparams` is a `dict`, - returns as is. + r"""Returns `True` if :attr:`x` is either a str or unicode. + Returns `False` otherwise. """ - if hparams is None: - return None - if isinstance(hparams, dict): - return hparams - return hparams.todict() + return isinstance(x, str) def _maybe_list_to_array(str_list, dtype_as): @@ -123,10 +87,10 @@ def _maybe_list_to_array(str_list, dtype_as): def compat_as_text(str_): - """Converts strings into `unicode` (Python 2) or `str` (Python 3). + r"""Converts strings into ``unicode`` (Python 2) or ``str`` (Python 3). Args: - str_: A string or other data types convertible to string, or an + str\_: A string or other data types convertible to string, or an `n`-D numpy array or (possibly nested) list of such elements. Returns: diff --git a/texar/tf/utils/exceptions.py b/texar/tf/utils/exceptions.py deleted file mode 100644 index afa64dad..00000000 --- a/texar/tf/utils/exceptions.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Texar defined exceptions. -""" - - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -__all__ = [ - "TexarError" -] - - -class TexarError(Exception): - """ - Texar error. - """ - pass diff --git a/texar/tf/utils/mode.py b/texar/tf/utils/mode.py index 08b6519a..84ea7daa 100644 --- a/texar/tf/utils/mode.py +++ b/texar/tf/utils/mode.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,71 +15,26 @@ Utility functions related to mode. """ -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - import tensorflow as tf -from texar.tf import context - __all__ = [ - "maybe_global_mode", - "is_train_mode", - "is_eval_mode", - "is_predict_mode", - "is_train_mode_py", - "is_eval_mode_py", - "is_predict_mode_py", - "switch_dropout" + 'valid_modes', + 'is_train_mode', + 'is_eval_mode', + 'is_predict_mode', ] -def maybe_global_mode(mode): - """Returns :func:`texar.tf.global_mode` if :attr:`mode` is `None`, - otherwise returns :attr:`mode` as-is. - """ - if mode is None: - return context.global_mode() - else: - return mode - - -def is_train_mode(mode): - """Returns a bool Tensor indicating whether the global mode is TRAIN. - If :attr:`mode` is `None`, the mode is determined by - :func:`texar.tf.global_mode`. +def valid_modes(): + r"""Returns a set of possible values of mode. 
""" - if mode is None: - return context.global_mode_train() - else: - return tf.equal(mode, tf.estimator.ModeKeys.TRAIN) - + return {tf.estimator.ModeKeys.TRAIN, + tf.estimator.ModeKeys.EVAL, + tf.estimator.ModeKeys.PREDICT} -def is_eval_mode(mode): - """Returns a bool Tensor indicating whether the global mode is EVAL. - If :attr:`mode` is `None`, the mode is determined by - :func:`texar.tf.global_mode`. - """ - if mode is None: - return context.global_mode_eval() - else: - return tf.equal(mode, tf.estimator.ModeKeys.EVAL) - -def is_predict_mode(mode): - """Returns a bool Tensor indicating whether the global mode is PREDICT. - If :attr:`mode` is `None`, the mode is determined by - :func:`texar.tf.global_mode`. - """ - if mode is None: - return context.global_mode_predict() - else: - return tf.equal(mode, tf.estimator.ModeKeys.PREDICT) - - -def is_train_mode_py(mode, default=True): - """Returns a python boolean indicating whether the mode is TRAIN. +def is_train_mode(mode, default=True): + r"""Returns a python boolean indicating whether the mode is TRAIN. Args: mode: A string taking value in @@ -93,13 +48,13 @@ def is_train_mode_py(mode, default=True): """ if mode is None: return default - if mode not in context.valid_modes(): + if mode not in valid_modes(): raise ValueError('Unknown mode: {}'.format(mode)) return mode == tf.estimator.ModeKeys.TRAIN -def is_eval_mode_py(mode, default=False): - """Returns a python boolean indicating whether the mode is EVAL. +def is_eval_mode(mode, default=False): + r"""Returns a python boolean indicating whether the mode is EVAL. Args: mode: A string taking value in @@ -113,13 +68,13 @@ def is_eval_mode_py(mode, default=False): """ if mode is None: return default - if mode not in context.valid_modes(): + if mode not in valid_modes(): raise ValueError('Unknown mode: {}'.format(mode)) return mode == tf.estimator.ModeKeys.EVAL -def is_predict_mode_py(mode, default=False): - """Returns a python boolean indicating whether the mode is PREDICT. +def is_predict_mode(mode, default=False): + r"""Returns a python boolean indicating whether the mode is PREDICT. Args: mode: A string taking value in @@ -133,25 +88,6 @@ def is_predict_mode_py(mode, default=False): """ if mode is None: return default - if mode not in context.valid_modes(): + if mode not in valid_modes(): raise ValueError('Unknown mode: {}'.format(mode)) return mode == tf.estimator.ModeKeys.PREDICT - - -def switch_dropout(dropout_keep_prob, mode=None): - """Turns off dropout when not in training mode. - - Args: - dropout_keep_prob: Dropout keep probability in training mode - mode (optional): A Tensor taking values of - :tf_main:`tf.estimator.ModeKeys `. - Dropout is activated if :attr:`mode` is `TRAIN`. - If `None`, the mode is inferred from - :func:`texar.tf.global_mode`. - - Returns: - A unit Tensor that equals the dropout keep probability in `TRAIN` mode, - and `1.0` in other modes. - """ - return 1. - (1. - dropout_keep_prob) \ - * tf.cast(is_train_mode(mode), tf.float32) diff --git a/texar/tf/utils/mode_test.py b/texar/tf/utils/mode_test.py index 72158abb..1d959808 100644 --- a/texar/tf/utils/mode_test.py +++ b/texar/tf/utils/mode_test.py @@ -1,46 +1,39 @@ - """ Unit tests for mode-related utility functions. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - import tensorflow as tf from texar.tf.utils import mode -from texar.tf import context class UtilsTest(tf.test.TestCase): - """Tests utility functions. + r"""Tests utility functions. 
""" def test_mode(self): - """ Tests mode related utilities. + r""" Tests mode related utilities. """ training = mode.is_train_mode(None) - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - training_ = sess.run(training) - self.assertTrue(training_) - - training_ = sess.run( - training, - feed_dict={context.global_mode(): tf.estimator.ModeKeys.TRAIN}) - self.assertTrue(training_) - - training_ = sess.run( - training, - feed_dict={context.global_mode(): tf.estimator.ModeKeys.EVAL}) - self.assertFalse(training_) - - training = mode.is_train_mode(tf.estimator.ModeKeys.TRAIN) - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - training_ = sess.run(training) - self.assertTrue(training_) + self.assertTrue(training) + + training = mode.is_train_mode('train') + self.assertTrue(training) + + training = mode.is_train_mode('eval') + self.assertFalse(training) + + infering = mode.is_eval_mode(None) + self.assertFalse(infering) + + infering = mode.is_eval_mode('eval') + self.assertTrue(infering) + + infering = mode.is_predict_mode(None) + self.assertFalse(infering) + + infering = mode.is_predict_mode('infer') + self.assertTrue(infering) if __name__ == "__main__": diff --git a/texar/tf/utils/shapes.py b/texar/tf/utils/shapes.py index 8616a485..506777a3 100644 --- a/texar/tf/utils/shapes.py +++ b/texar/tf/utils/shapes.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,79 +15,55 @@ Utility functions related to tensor shapes. """ -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -# pylint: disable=no-name-in-module, protected-access, no-member, invalid-name - import numpy as np - import tensorflow as tf -from tensorflow.python.util import nest -from tensorflow.python.ops import rnn -from tensorflow.python.framework import ops + __all__ = [ "transpose_batch_time", "get_batch_size", "get_rank", "mask_sequences", - "_mask_sequences_tensor", - "_mask_sequences_py", - "reduce_with_weights", - "flatten", - "shape_list", - "pad_and_concat", - "varlength_concat", - "varlength_concat_py", - "varlength_roll" ] def transpose_batch_time(inputs): - """Transposes inputs between time-major and batch-major. + r"""Transposes inputs between time-major and batch-major. Args: - inputs: A Tensor of shape `[batch_size, max_time, ...]` (batch-major) - or `[max_time, batch_size, ...]` (time-major), or a (possibly + inputs: A Tensor of shape ``[batch_size, max_time, ...]`` (batch-major) + or ``[max_time, batch_size, ...]`` (time-major), or a (possibly nested) tuple of such elements. Returns: A (possibly nested tuple of) Tensor with transposed batch and time dimensions of inputs. """ - flat_input = nest.flatten(inputs) - flat_input = [ops.convert_to_tensor(input_) for input_ in flat_input] - # pylint: disable=protected-access - flat_input = [rnn._transpose_batch_time(input_) for input_ in flat_input] - return nest.pack_sequence_as(structure=inputs, flat_sequence=flat_input) + rank = get_rank(inputs) + perm = [1, 0] + [i for i in range(2, rank)] + return tf.transpose(inputs, perm=perm) def get_batch_size(tensor): - """Returns a unit `Tensor` representing the batch size, i.e., + r"""Returns an ``int`` representing the batch size, i.e., the size of the 1st dimension of :attr:`tensor`. 
""" - return tf.shape(tensor)[0] + return tensor.shape[0] def get_rank(tensor): - """Returns the tensor rank as a python `int`. The input tensor can also be - a python array. + r"""Returns the tensor rank as a python ``int``. The input tensor can also + be a python array. Args: tensor: A Tensor or python array. Returns: - A python `int` representing the rank of :attr:`tensor`. Returns + A python ``int`` representing the rank of :attr:`tensor`. Returns `None` if the rank cannot be determined. """ - if tf.contrib.framework.is_tensor(tensor): - shape = tensor.shape - try: - rank = len(shape.as_list()) - except ValueError: # when `shape==TensorShape(None)` - rank = None + if tf.is_tensor(tensor): + rank = len(tensor.shape) else: array = np.asarray(tensor) rank = array.ndim @@ -97,9 +73,8 @@ def get_rank(tensor): def mask_sequences(sequence, sequence_length, dtype=None, - time_major=False, - tensor_rank=2): - """Masks out sequence entries that are beyond the respective sequence + time_major=False): + r"""Masks out sequence entries that are beyond the respective sequence lengths. Masks along the time dimension. :attr:`sequence` and :attr:`sequence_length` can either be python @@ -108,23 +83,19 @@ def mask_sequences(sequence, Args: sequence: A Tensor or python array of sequence values. - If `time_major==False` (default), this must be a Tensor of shape - `[batch_size, max_time, ...]`. The batch and time dimension is - exchanged if `time_major==True`. - sequence_length: A Tensor or python array of shape `[batch_size]`. + If ``time_major==False`` (default), this must be a Tensor of shape + ``[batch_size, max_time, ...]``. The batch and time dimension is + exchanged if ``time_major==True``. + sequence_length: A Tensor or python array of shape ``[batch_size]``. Time steps beyond the respective sequence lengths will be made zero. dtype (dtype): Type of :attr:`sequence`. If `None`, infer from :attr:`sequence` automatically. time_major (bool): The shape format of the inputs. If `True`, :attr:`sequence` must have shape - `[max_time, batch_size, ...]`. + ``[max_time, batch_size, ...]``. If `False` (default), :attr:`sequence` must have - shape `[batch_size, max_time, ...]`. - tensor_rank (int): The number of dimensions of :attr:`sequence`. - Default is 2, i.e., :attr:`sequence` is a 2D Tensor consisting - of batch and time dimensions. Ignored if both :attr:`sequence` - and :attr:`sequence_length` are python arrays. + shape ``[batch_size, max_time, ...]``. Returns: The masked sequence, i.e., a Tensor or python array of the same shape @@ -133,559 +104,23 @@ def mask_sequences(sequence, If both :attr:`sequence` and :attr:`sequence_length` are python arrays, the returned value is a python array as well. """ - is_tensor = tf.contrib.framework.is_tensor - if is_tensor(sequence) or is_tensor(sequence_length): - return _mask_sequences_tensor( - sequence, sequence_length, dtype, time_major, tensor_rank) - else: - return _mask_sequences_py( - sequence, sequence_length, dtype, time_major) - - -def _mask_sequences_tensor(sequence, - sequence_length, - dtype=None, - time_major=False, - tensor_rank=2): - """Masks out sequence entries that are beyond the respective sequence - lengths. Masks along the time dimension. - - Args: - sequence: A Tensor of sequence values. - - If `time_major=False` (default), this must be a Tensor of shape: - `[batch_size, max_time, d_2, ..., d_rank]`, where the rank of - the Tensor is specified with :attr:`tensor_rank`. 
+ if not tf.is_tensor(sequence): + sequence = tf.convert_to_tensor(sequence, dtype=dtype) - If `time_major=True`, this must be a Tensor of shape: - `[max_time, batch_size, d_2, ..., d_rank].` - sequence_length: A Tensor of shape `[batch_size]`. Time steps beyond - the respective sequence lengths will be made zero. - dtype (dtype): Type of :attr:`sequence`. If `None`, infer from - :attr:`sequence` automatically. - time_major (bool): The shape format of the inputs. If `True`, - :attr:`sequence` must have shape - `[max_time, batch_size, d_2, ..., d_rank]`. - If `False` (default), :attr:`sequence` must have - shape `[batch_size, max_time, d_2, ..., d_rank]`. - tensor_rank (int): The number of dimensions of :attr:`sequence`. - Default is 2, i.e., :attr:`sequence` is a 2D Tensor consisting - of batch and time dimensions. + rank = get_rank(sequence) + if rank < 2: + raise ValueError("`sequence` must be 2D or higher order.") - Returns: - The masked sequence, i.e., a Tensor of the same shape as - :attr:`sequence` but with masked-out entries (set to zero). - """ - if tensor_rank is None: - tensor_rank = 2 - if tensor_rank < 2: - raise ValueError( - "tensor_rank must be > 2. Got tensor_rank = {}".format(tensor_rank)) if time_major: - sequence = rnn._transpose_batch_time(sequence) - max_time = tf.cast(tf.shape(sequence)[1], tf.int32) + sequence = transpose_batch_time(sequence) + max_time = sequence.shape[1] if dtype is None: dtype = sequence.dtype mask = tf.sequence_mask( tf.cast(sequence_length, tf.int32), max_time, dtype=dtype) - for _ in range(2, tensor_rank): - mask = tf.expand_dims(mask, axis=-1) - sequence = sequence * mask - if time_major: - sequence = rnn._transpose_batch_time(sequence) - return sequence - - -def _mask_sequences_py(sequence, - sequence_length, - dtype=None, - time_major=False): - """Masks out sequence entries that are beyond the respective sequence - lengths. Masks along the time dimension. - - This is the numpy version of :func:`texar.tf.utils.mask_sequences`. - - Args: - sequence: An python array of sequence values. - - If `time_major=False` (default), this must be an array of shape: - `[batch_size, max_time, ...]` - - If `time_major=True`, this must be a Tensor of shape: - `[max_time, batch_size, ...].` - sequence_length: An array of shape `[batch_size]`. Time steps beyond - the respective sequence lengths will be made zero. - dtype (dtype): Type of :attr:`sequence`. If `None`, infer from - :attr:`sequence` automatically. - time_major (bool): The shape format of the inputs. If `True`, - :attr:`sequence` must have shape - `[max_time, batch_size, ...]`. - If `False` (default), :attr:`sequence` must have - shape `[batch_size, max_time, ...]`. - - Returns: - The masked sequence, i.e., an array of the same shape as - :attr:`sequence` but with masked-out entries (set to zero). 
- """ - sequence = np.array(sequence) - sequence_length = np.array(sequence_length) - - rank = sequence.ndim - if rank < 2: - raise ValueError("`sequence` must be 2D or higher order.") - batch_size = sequence.shape[0] - max_time = sequence.shape[1] - dtype = dtype or sequence.dtype - - if time_major: - sequence = np.transpose(sequence, axes=[1, 0, 2]) - - steps = np.tile(np.arange(max_time), [batch_size, 1]) - mask = np.asarray(steps < sequence_length[:, None], dtype=dtype) for _ in range(2, rank): - mask = np.expand_dims(mask, -1) - + mask = tf.expand_dims(mask, axis=-1) sequence = sequence * mask - if time_major: - sequence = np.transpose(sequence, axes=[1, 0, 2]) - + sequence = transpose_batch_time(sequence) return sequence - - -def reduce_with_weights(tensor, - weights=None, - average_across_batch=True, - average_across_remaining=False, - sum_over_batch=False, - sum_over_remaining=True, - tensor_rank=None): - """Weights and reduces tensor. - - Args: - tensor: A Tensor to weight and reduce, of shape - `[batch_size, ...]`. - weights (optional): A Tensor of the same shape and dtype with - :attr:`tensor`. For example, this is can be a 0-1 tensor - for masking values of :attr:`tensor``. - average_across_batch (bool): If set, average the tensor across the - batch dimension. Must not set `average_across_batch`' - and `sum_over_batch` at the same time. - average_across_remaining (bool): If set, average the - tensor across the - remaining dimensions. Must not set `average_across_remaining`' - and `sum_over_remaining` at the same time. - If :attr:`weights` is given, this is a weighted average. - sum_over_batch (bool): If set, sum the tensor across the - batch dimension. Must not set `average_across_batch` - and `sum_over_batch` at the same time. - sum_over_remaining (bool): If set, sum the tensor - across the - remaining dimension. Must not set `average_across_remaining` - and `sum_over_remaining` at the same time. - If :attr:`weights` is given, this is a weighted sum. - tensor_rank (int, optional): The number of dimensions of - :attr:`tensor`. If not given, inferred from :attr:`tensor` - automatically. - - Returns: - A Tensor. - - Example: - .. code-block:: python - - x = tf.constant([[10, 10, 2, 2], - [20, 2, 2, 2]]) - mask = tf.constant([[1, 1, 0, 0], - [1, 0, 0, 0]]) - - z = reduce_with_weights(x, weights=mask) - # z == 20 - # (all 2 in x are masked) - """ - if tensor_rank is None: - tensor_rank = get_rank(tensor) - if tensor_rank is None: - raise ValueError('Unable to infer the rank of `tensor`. 
' - 'Please set `tensor_rank` explicitly.') - - if weights is not None: - tensor = tensor * weights - - if tensor_rank > 1: - if average_across_remaining and sum_over_remaining: - raise ValueError("Only one of `average_across_remaining` and " - "`sum_over_remaining` can be set.") - if average_across_remaining: - if weights is None: - tensor = tf.reduce_mean(tensor, axis=np.arange(1, tensor_rank)) - else: - tensor = tf.reduce_sum(tensor, axis=np.arange(1, tensor_rank)) - weights = tf.reduce_sum(weights, axis=np.arange(1, tensor_rank)) - tensor = tensor / weights - elif sum_over_remaining: - tensor = tf.reduce_sum(tensor, axis=np.arange(1, tensor_rank)) - - if average_across_batch and sum_over_batch: - raise ValueError("Only one of `average_across_batch` and " - "`sum_over_batch` can be set.") - if sum_over_batch: - tensor = tf.reduce_sum(tensor, axis=[0]) - elif average_across_batch: - tensor = tf.reduce_mean(tensor, axis=[0]) - - return tensor - - -def flatten(tensor, preserve_dims, flattened_dim=None): - """Flattens a tensor whiling keeping several leading dimensions. - - :attr:`preserve_dims` must < tensor's rank - - Args: - tensor: A Tensor to flatten. - preserve_dims (int): The number of leading dimensions to preserve. - flatterned_dim (int, optional): The size of the resulting flattened - dimension. If not given, infer automatically, which can cause - a statically unknown dimension size. - - Returns: - A Tensor with rank :attr:`perserve_dims` + 1. - - Example: - .. code-block:: python - - x = tf.ones(shape=[d_1, d_2, d_3, d_4]) - y = flatten(x, 2) # y.shape == [d_1, d_2, d_3 * d_4] - """ - if flattened_dim is None: - flattened_dim = -1 - shape = tf.concat([tf.shape(tensor)[:preserve_dims], [flattened_dim]], - axis=0) - tensor_ = tf.reshape(tensor, shape) - return tensor_ - - -def shape_list(x): - r"""Returns **static** shape of the input Tensor whenever possible. - - Args: - x: A Tensor. - - Returns: - - If the rank of `x` is unknown, returns the dynamic shape - ``tf.shape(x)`` - - - Otherwise, returns a list of dims, each of which is either an `int` - whenever it can be statically determined, or a scalar Tensor - otherwise. - """ - x = tf.convert_to_tensor(x) - # If unknown rank, return dynamic shape - if x.get_shape().dims is None: - return tf.shape(x) - static = x.get_shape().as_list() - shape = tf.shape(x) - ret = [] - for i, dim in enumerate(static): - if dim is None: - dim = shape[i] - ret.append(dim) - return ret - - -def pad_and_concat(values, axis, rank=None, pad_axis=None, - pad_constant_values=0): - """Concats tensors along one dimension. Pads each of other dimensions of - the tensors to the corresponding maximum size if necessary. - - Args: - values: A list of Tensors of the same rank. - axis (int): A Python int. Dimension along which to concatenate. - rank (int, optional): Rank of the tensors. If `None`, inferred - automatically from :attr:`values`. - pad_axis (int or list, optional): A Python int or a list of int. - Dimensions to pad. Paddings are only added to the end of - corresponding dimensions. If `None`, all dimensions except the - :attr:`axis` dimension are padded. - pad_constant_values: The scalar pad value to use. Must be same type - as the tensors. - - Returns: - A `Tensor` resulting from padding and concatenation of the input - tensors. - - Raises: - ValueError: If :attr:`rank` is `None` and cannot be inferred from - :attr:`values`. - - - Example: - - .. 
code-block:: python - - a = tf.ones([1, 2]) - b = tf.ones([2, 3]) - - c = pad_and_concat([a,b], 0) - # c.shape == [3, 3] - # c == [[1, 1, 0], - # [1, 1, 1], - # [1, 1, 1]] - - d = pad_and_concat([a,b], 1) - # d.shape == [2, 5] - # d == [[1, 1, 1, 1, 1] - # [0, 0, 1, 1, 1]] - """ - if rank is None: - for value in values: - rank = get_rank(value) - if rank is not None: - break - if rank is None: - raise ValueError('Cannot determine the rank of the tensors') - - def _pad_to_size(value, axis_, size): - """Pads the :attr:`axis_` of a tensor :attr:`value` to the given - :attr:`size`. Only pads to the end. - - Args: - value: A Tensor. - axis_: A Python int. - size: A scalar int Tensor or Python int. - """ - paddings = np.zeros([rank, 2], dtype=np.int32) - paddings[axis_, 1] = 1 - paddings = paddings * (size - tf.shape(value)[axis_]) - return tf.pad(value, paddings, mode='CONSTANT', - constant_values=pad_constant_values) - - if pad_axis is None: - pad_axis = [r for r in range(rank) if r != axis] - - pad_axis = pad_axis if isinstance(pad_axis, (list, tuple)) else [pad_axis] - - for pa in pad_axis: - max_dim_size = tf.reduce_max([tf.shape(v)[pa] for v in values]) - for i, v in enumerate(values): - values[i] = _pad_to_size(v, pa, max_dim_size) - - return tf.concat(values, axis) - - -def varlength_concat(x, y, x_length, dtype=None, tensor_rank=None): - """Concatenates rows of `x` and `y` where each row of - `x` has a variable length. - - Both `x` and `y` are of numeric dtypes, such as `tf.int32` and `tf.float32`, - with mask value `0`. The two tensors must be of the same dtype. - - Args: - x: A tensor of shape `[batch_size, x_dim_2, other_dims]`. - y: A tensor of shape `[batch_size, y_dim_2, other_dims]`. - All dimensions except the 2nd dimension must be the same - with those of `x`. - x_length: A 1D int tensor of shape `[batch_size]` containing - the length of each `x` row. - Elements beyond the respective lengths will be - made zero. - dtype: Type of :attr:`x`. If `None`, inferred from - :attr:`x` automatically. - tensor_rank (int, optional): The number of dimensions of - :attr:`x`. If not given, inferred from :attr:`x` - automatically. - - Returns: - A Tensor of shape `[batch_size, x_dim_2 + y_dim_2, other_dims]`. - - Example: - .. code-block:: python - - x = tf.constant([[1, 1, 0, 0], - [1, 1, 1, 0]]) - x_length = [2, 3] - y = tf.constant([[2, 2, 0], - [2, 2, 2]]) - - out = varlength_concat(x, y, x_length) - # out = [[1, 1, 2, 2, 0, 0, 0] - # [1, 1, 1, 2, 2, 2, 0]] - """ - x = tf.convert_to_tensor(x) - y = tf.convert_to_tensor(y) - x_length = tf.convert_to_tensor(x_length) - - if tensor_rank is None: - tensor_rank = get_rank(x) or get_rank(y) - if tensor_rank is None: - raise ValueError('Unable to infer the rank of `x`. 
' - 'Please set `tensor_rank` explicitly.') - - x_masked = mask_sequences(x, x_length, dtype=dtype, tensor_rank=tensor_rank) - zeros_y = tf.zeros_like(y) - x_aug = tf.concat([x_masked, zeros_y], axis=1) - - zeros_x = tf.zeros_like(x) - y_aug = tf.concat([zeros_x, y], axis=1) - - # Now, x_aug.shape == y_aug.shape - - max_length_x = tf.shape(x)[1] - batch_size = tf.shape(x)[0] - - initial_index = tf.constant(0, dtype=tf.int32) - initial_outputs_ta = tf.TensorArray( - dtype=dtype or x.dtype, - size=0, - dynamic_size=True) - - def _cond(index, _): - return tf.less(index, batch_size) - - def _body(index, outputs_ta): - y_aug_i_rolled = tf.roll( - input=y_aug[index], - shift=x_length[index] - max_length_x, # shift to left - axis=0) - xy = x_aug[index] + y_aug_i_rolled - return [index + 1, outputs_ta.write(index, xy)] - - res = tf.while_loop(_cond, _body, [initial_index, initial_outputs_ta]) - - return res[1].stack() - - -def varlength_concat_py(x, y, x_length, dtype=None): - """Concatenates rows of `x` and `y` where each row of - `x` has a variable length. - - The function has the same semantic as :func:`varlength_concat`, - except that this function is for numpy arrays instead of TF tensors. - - Both `x` and `y` are of numeric dtypes, such as `int32` and `float32`, - with mask value `0`. The two arrays must be of the same dtype. - - Args: - x: A array of shape `[batch_size, x_dim_2, other_dims]`. - y: A array of shape `[batch_size, y_dim_2, other_dims]`. - All dimensions except the 2nd dimension must be the same - with those of `x`. - x_length: A 1D int array of shape `[batch_size]` containing - the length of each `x` row. - Elements beyond the respective lengths will be - made zero. - dtype: Type of :attr:`x`. If `None`, inferred from - :attr:`x` automatically. - - Returns: - An array of shape `[batch_size, x_dim_2 + y_dim_2, other_dims]`. - - Example: - .. code-block:: python - - x = np.asarray([[1, 1, 0, 0], - [1, 1, 1, 0]]) - x_length = [2, 3] - y = np.asarray([[2, 2, 0], - [2, 2, 2]]) - - out = varlength_concat_py(x, y, x_length) - # out = [[1, 1, 2, 2, 0, 0, 0] - # [1, 1, 1, 2, 2, 2, 0]] - """ - x = np.asarray(x, dtype=dtype) - y = np.asarray(y, dtype=dtype) - - x_masked = mask_sequences(x, x_length, dtype=dtype) - zeros_y = np.zeros_like(y) - x_aug = np.concatenate([x_masked, zeros_y], axis=1) - - zeros_x = np.zeros_like(x) - y_aug = np.concatenate([zeros_x, y], axis=1) - - # Now, x_aug.shape == y_aug.shape - - max_length_x = x.shape[1] - batch_size = x.shape[0] - - for index in np.arange(batch_size): - y_aug_i_rolled = np.roll( - a=y_aug[index], - shift=x_length[index] - max_length_x, - axis=0) - x_aug[index] += y_aug_i_rolled - - return x_aug - - -def varlength_roll(input, shift, axis=1, dtype=None): - """Rolls the elements of *each row* of a tensor along an axis for - variable steps. - - This is a `tf.while_loop` wrapper of :tf_main:`tf.roll `. Note the - different definition of :attr:`shift` and :attr:`axis` here compared - to :tf_main:`tf.roll `. - - Args: - input: A tensor of shape `[batch_size, other_dims]` where - `other_dims` can be multiple dimensions. - shift: A 1D int tensor of shape `[batch_size]` containing - the steps for which each row in the batch are rolled. - Positive shifts will roll towards larger indices, while - negative shifts will roll towards smaller indices. - axis: A scalar int tensor > 0. The dimension that the roll - should occur. - dtype: Type of :attr:`input`. If `None`, inferred from - :attr:`input` automatically. 
- - Returns: - A Tensor of the same shape/dtype as :attr:`input`. - - Example: - .. code-block:: python - - x = tf.constant([[0, 0, 1, 0], - [0, 1, 1, 1]]) - shift = [-2, -1] - - out = varlength_roll(x, shift) - # out = [[1, 0, 0, 0] - # [1, 1, 1, 0]] - - - .. code-block:: python - - x = tf.constant([[1, 2, 3, 4], - [5, 6, 7, 8]]) - shift = [1, -1] - - out = varlength_roll(x, shift) - # out = [[4, 1, 2, 3] - # [6, 7, 8, 5]] - """ - x = tf.convert_to_tensor(input) - # x = input - shift = tf.convert_to_tensor(shift) - - batch_size = tf.shape(x)[0] - - initial_index = tf.constant(0, dtype=tf.int32) - initial_outputs_ta = tf.TensorArray( - dtype=dtype or x.dtype, - size=0, - dynamic_size=True) - - def _cond(index, _): - return tf.less(index, batch_size) - - def _body(index, outputs_ta): - x_i_rolled = tf.roll( - input=x[index], - shift=shift[index], - axis=axis - 1) - return [index + 1, outputs_ta.write(index, x_i_rolled)] - - res = tf.while_loop(_cond, _body, [initial_index, initial_outputs_ta]) - - return res[1].stack() diff --git a/texar/tf/utils/shapes_test.py b/texar/tf/utils/shapes_test.py index 656c42eb..ede2876a 100644 --- a/texar/tf/utils/shapes_test.py +++ b/texar/tf/utils/shapes_test.py @@ -2,12 +2,6 @@ Unit tests for shape-related utility functions. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=no-member - import numpy as np import tensorflow as tf @@ -30,136 +24,6 @@ def test_mask_sequences(self): seq_sum = np.sum(masked_seq, axis=(1, 2)) np.testing.assert_array_equal(seq_sum, seq_length * 3) - def test_reduce_with_weights(self): - """Tests :func:`texar.tf.utils.shapes.reduce_with_weights` - """ - x = np.asarray([[10, 10, 2, 2], - [20, 2, 2, 2]]) - x = tf.constant(x) - w = np.asarray([[1, 1, 0, 0], - [1, 0, 0, 0]]) - - z = shapes.reduce_with_weights(x, weights=w) - - with self.test_session() as sess: - z_ = sess.run(z) - np.testing.assert_array_equal(z_, 20) - - def test_pad_and_concat(self): - """Test :func:`texar.tf.utils.shapes.pad_and_concat`. - """ - a = tf.ones([3, 10, 2]) - b = tf.ones([4, 20, 3]) - c = tf.ones([5, 1, 4]) - - t = shapes.pad_and_concat([a, b, c], 0) - self.assertEqual(t.shape, [3 + 4 + 5, 20, 4]) - t = shapes.pad_and_concat([a, b, c], 1) - self.assertEqual(t.shape, [5, 10 + 20 + 1, 4]) - t = shapes.pad_and_concat([a, b, c], 2) - self.assertEqual(t.shape, [5, 20, 2 + 3 + 4]) - - d = tf.placeholder(dtype=tf.float32, shape=[6, None, 1]) - t = shapes.pad_and_concat([a, b, c, d], 0) - with self.test_session() as sess: - t_ = sess.run(t, feed_dict={d: np.ones([6, 2, 1])}) - self.assertEqual(list(t_.shape), [3 + 4 + 5 + 6, 20, 4]) - - def test_varlength_concat(self): - """ - Tests :func:`texar.tf.utils.shapes.varlength_concat`. 
- """ - # 2D - x = np.asarray( - [[1, 1, 0, 0], - [1, 0, 0, 0], - [1, 1, 1, 1]], dtype=np.int32) - x_length = np.asarray([2, 1, 4], dtype=np.int32) - y = np.asarray( - [[2, 2, 2, 0], - [2, 2, 2, 2], - [2, 2, 0, 0]], dtype=np.int32) - - z_true = np.asarray( - [[1, 1, 2, 2, 2, 0, 0, 0], - [1, 2, 2, 2, 2, 0, 0, 0], - [1, 1, 1, 1, 2, 2, 0, 0]], dtype=np.int32) - - # py - z = shapes.varlength_concat_py(x, y, x_length) - np.testing.assert_array_equal(z, z_true) - - # tf - z = shapes.varlength_concat(x, y, x_length) - with self.test_session() as sess: - z_ = sess.run(z) - np.testing.assert_array_equal(z_, z_true) - - # 3D - x = np.asarray( - [[[1], [1], [0], [0]], - [[1], [0], [0], [0]], - [[1], [1], [1], [1]]], dtype=np.int32) - x_length = [2, 1, 4] - y = np.asarray( - [[[2], [2], [2], [0]], - [[2], [2], [2], [2]], - [[2], [2], [0], [0]]], dtype=np.int32) - z_true = np.asarray( - [[[1], [1], [2], [2], [2], [0], [0], [0]], - [[1], [2], [2], [2], [2], [0], [0], [0]], - [[1], [1], [1], [1], [2], [2], [0], [0]]], dtype=np.int32) - - # py - z = shapes.varlength_concat_py(x, y, x_length) - np.testing.assert_array_equal(z, z_true) - - # tf - z = shapes.varlength_concat(x, y, x_length) - with self.test_session() as sess: - z_ = sess.run(z) - np.testing.assert_array_equal(z_, z_true) - - def test_varlength_roll(self): - """ - Tests :func:`texar.tf.utils.shapes.varlength_roll`. - """ - # 2D - x = np.asarray( - [[1, 1, 0, 0], - [1, 0, 0, 0], - [1, 1, 1, 1]], dtype=np.int32) - x_length = [-2, -1, -4] - z = shapes.varlength_roll(x, x_length) - - with self.test_session() as sess: - z_ = sess.run(z) - - z_true = np.asarray( - [[0, 0, 1, 1], - [0, 0, 0, 1], - [1, 1, 1, 1]], dtype=np.int32) - - np.testing.assert_array_equal(z_, z_true) - - # 3D - x = np.asarray( - [[[1], [1], [0], [0]], - [[1], [0], [0], [0]], - [[1], [1], [1], [1]]], dtype=np.int32) - x_length = [-2, -1, -4] - z = shapes.varlength_roll(x, x_length) - - with self.test_session() as sess: - z_ = sess.run(z) - - z_true = np.asarray( - [[[0], [0], [1], [1]], - [[0], [0], [0], [1]], - [[1], [1], [1], [1]]], dtype=np.int32) - - np.testing.assert_array_equal(z_, z_true) - if __name__ == "__main__": tf.test.main() diff --git a/texar/tf/utils/transformer_attentions.py b/texar/tf/utils/transformer_attentions.py index fa281f0c..88f4c0b6 100644 --- a/texar/tf/utils/transformer_attentions.py +++ b/texar/tf/utils/transformer_attentions.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,18 +14,11 @@ """Attentions specific to Transformer. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - import numpy as np import tensorflow as tf -# pylint: disable=too-many-arguments, invalid-name, no-member - __all__ = [ 'attention_bias_lower_triangle', 'attention_bias_ignore_padding', @@ -34,20 +27,21 @@ def attention_bias_lower_triangle(length, bias_value=-1e18): - """Create an bias tensor to be added to attention logits. + r"""Create an bias tensor to be added to attention logits. Allows a query to attend to all positions up to and including its own. Args: length: a scalar. + bias_value: value to fill the bias tensor with. Returns: - a `Tensor` with shape [1, 1, length, length]. + a ``Tensor`` with shape ``[1, 1, length, length]``. 
""" return attention_bias_local(length, -1, 0, bias_value) def attention_bias_local(length, max_backward, max_forward, bias_value=-1e18): - """Create an bias tensor to be added to attention logits. + r"""Create an bias tensor to be added to attention logits. A position may attend to positions at most max_distance from it, forward and backwards. @@ -59,9 +53,10 @@ def attention_bias_local(length, max_backward, max_forward, bias_value=-1e18): values indicate unlimited. max_forward: int, maximum distance forward to attend. Negative values indicate unlimited. + bias_value: value to fill the bias tensor with. Returns: - a `Tensor` with shape [1, 1, length, length]. + a ``Tensor`` with shape [1, 1, length, length]. [batch_size, num_heads, queri_len, queri_len] """ band = _ones_matrix_band_part( @@ -74,13 +69,14 @@ def attention_bias_local(length, max_backward, max_forward, bias_value=-1e18): def attention_bias_ignore_padding(memory_padding, bias_value=-1e18): - """Create an bias tensor to be added to attention logits. + r"""Create an bias tensor to be added to attention logits. Args: memory_padding: a float `Tensor` with shape [batch, memory_length]. + bias_value: value to fill the bias tensor with. Returns: - a `Tensor` with shape [batch, 1, 1, memory_length]. + a ``Tensor`` with shape [batch, 1, 1, memory_length]. each dim corresponding to batch_size, num_heads, queries_len, memory_length """ @@ -88,12 +84,10 @@ def attention_bias_ignore_padding(memory_padding, bias_value=-1e18): return tf.expand_dims(tf.expand_dims(ret, axis=1), axis=1) -def _ones_matrix_band_part(rows, cols, num_lower, num_upper, - out_shape=None): - """Matrix band part of ones. +def _ones_matrix_band_part(rows, cols, num_lower, num_upper, out_shape=None): + r"""Matrix band part of ones. """ - if all([isinstance(el, int) for el in [rows, cols, num_lower, - num_upper]]): + if all([isinstance(el, int) for el in [rows, cols, num_lower, num_upper]]): # Needed info is constant, so we construct in numpy if num_lower < 0: num_lower = rows - 1 @@ -106,7 +100,7 @@ def _ones_matrix_band_part(rows, cols, num_lower, num_upper, band = band.reshape(out_shape) band = tf.constant(band, tf.float32) else: - band = tf.matrix_band_part(tf.ones([rows, cols]), + band = tf.linalg.band_part(tf.ones([rows, cols]), tf.cast(num_lower, tf.int64), tf.cast(num_upper, tf.int64)) if out_shape: diff --git a/texar/tf/utils/transformer_utils.py b/texar/tf/utils/transformer_utils.py index a1459827..11c6a917 100644 --- a/texar/tf/utils/transformer_utils.py +++ b/texar/tf/utils/transformer_utils.py @@ -12,27 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. # -# Modifications copyright (C) 2018 Texar +# Modifications copyright (C) 2019 Texar # ============================================================================== """ This script is adapted from the tensor2tensor repository. """ -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division -from __future__ import unicode_literals - import tensorflow as tf from tensorflow_probability import distributions as tfpd -# pylint: disable=invalid-name, too-many-arguments, too-many-locals - -class PadRemover(object): - """Helper to remove padding from a tensor before sending to the experts. +class PadRemover: + r"""Helper to remove padding from a tensor before sending to the experts. 
The padding is computed for one reference tensor containing the padding mask - and then can be applied to any other tensor of shape [dim_origin,...]. + and then can be applied to any other tensor of shape ``[dim_origin, ...]``. Example:: @@ -52,7 +45,7 @@ class PadRemover(object): """ def __init__(self, pad_mask): - """Compute and store the location of the padding. + r"""Compute and store the location of the padding. Args: pad_mask (tf.Tensor): Reference padding tensor of shape @@ -74,13 +67,13 @@ def __init__(self, pad_mask): self.dim_origin = tf.shape(pad_mask)[:1] def remove(self, x): - """Remove padding from the given tensor. + r"""Remove padding from the given tensor. Args: - x: A Tensor of shape [dim_origin,...] + x: A Tensor of shape ``[dim_origin, ...]`` Returns: - A tensor of shape [dim_compressed,...] with dim_compressed + A tensor of shape ``[dim_compressed, ...]`` with dim_compressed <= dim_origin """ with tf.name_scope("pad_reduce/remove"): @@ -96,7 +89,7 @@ def remove(self, x): return x def restore(self, x): - """Add padding back to the given tensor. + r"""Add padding back to the given tensor. Args: x: A Tensor of shape [dim_compressed,...] @@ -116,7 +109,7 @@ def restore(self, x): def embedding_to_padding(emb): - """Calculates the padding mask based on which embeddings are all zero. + r"""Calculates the padding mask based on which embeddings are all zero. We have hacked symbol_modality to return all-zero embeddings for padding. @@ -136,7 +129,7 @@ def smoothing_cross_entropy(logits, confidence, gaussian=False, zero_pad=True): - """Cross entropy with label smoothing to limit over-confidence. + r"""Cross entropy with label smoothing to limit over-confidence. Args: logits: Tensor of size [batch_size, ?, vocab_size] @@ -146,7 +139,7 @@ def smoothing_cross_entropy(logits, smoothing. If `gaussian` is true, `confidence` is the variance to the gaussian distribution. gaussian: Uses a gaussian distribution for label smoothing - zero_pad: use 0 as the probabitlity of the padding + zero_pad: use 0 as the probability of the padding in the smoothed labels. By setting this, we replicate the numeric calculation of tensor2tensor, which doesn't set the token in the vocabulary. diff --git a/texar/tf/modules/regressors/__init__.py b/texar/tf/utils/types.py similarity index 60% rename from texar/tf/modules/regressors/__init__.py rename to texar/tf/utils/types.py index 2347b7db..e4a00087 100644 --- a/texar/tf/modules/regressors/__init__.py +++ b/texar/tf/utils/types.py @@ -12,13 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Modules of texar library regressors. +Type annotation helpers. """ +import os +from typing import Dict, List, Sequence, Tuple, TypeVar, Union -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +__all__ = [ + 'MaybeTuple', + 'MaybeList', + 'MaybeSeq', + 'MaybeDict', + 'PathLike', +] -# pylint: disable=wildcard-import - -from texar.tf.modules.regressors.xlnet_regressor import * +T = TypeVar('T') +MaybeTuple = Union[T, Tuple[T, ...]] +MaybeList = Union[T, List[T]] +MaybeSeq = Union[T, Sequence[T]] +MaybeDict = Union[T, Dict[str, T]] +PathLike = TypeVar('PathLike', str, os.PathLike) diff --git a/texar/tf/utils/utils.py b/texar/tf/utils/utils.py index 791c7d14..8b211bda 100644 --- a/texar/tf/utils/utils.py +++ b/texar/tf/utils/utils.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. 
All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,104 +15,38 @@ Miscellaneous Utility functions. """ -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -# pylint: disable=invalid-name, no-member, no-name-in-module, protected-access -# pylint: disable=redefined-outer-name, too-many-arguments - -from typing import List, Union - import inspect -import funcsigs from pydoc import locate -import copy -import collections -import numpy as np - -import tensorflow as tf - -from texar.tf.hyperparams import HParams -from texar.tf.utils.dtypes import is_str, is_callable, compat_as_text, \ - _maybe_list_to_array - -# pylint: disable=anomalous-backslash-in-string -MAX_SEQ_LENGTH = np.iinfo(np.int32).max +import funcsigs -# Some modules cannot be imported directly, -# e.g., `import tensorflow.train` fails. -# Such modules are treated in a special way in utils like `get_class` as below. -# _unimportable_modules = { -# 'tensorflow.train', 'tensorflow.keras.regularizers' -# } +import numpy as np __all__ = [ - "_inspect_getargspec", "get_args", - "get_default_arg_values", "check_or_get_class", "get_class", "check_or_get_instance", "get_instance", - "check_or_get_instance_with_redundant_kwargs", - "get_instance_with_redundant_kwargs", "get_function", - "call_function_with_redundant_kwargs", - "get_instance_kwargs", - "dict_patch", "dict_lookup", - "dict_fetch", - "dict_pop", - "flatten_dict", - "strip_token", - "strip_eos", - "strip_bos", - "strip_special_tokens", - "str_join", - "map_ids_to_strs", "default_str", "uniquify_str", "ceildiv", - "straight_through", "truncate_seq_pair", ] -# TODO(zhiting): complete this -def _expand_name(name): - """Replaces common shorthands with respective full names. - - "tf.xxx" --> "tensorflow.xxx" - "tx.xxx" --> "texar.tf.xxx" - """ - return name - - -def _inspect_getargspec(fn): - """Returns `inspect.getargspec(fn)` for Py2 and `inspect.getfullargspec(fn)` - for Py3 - """ - try: - return inspect.getfullargspec(fn) - except AttributeError: - try: - return inspect.getargspec(fn) - except TypeError: - return inspect.getargspec(fn.__call__) - - def get_args(fn): - """Gets the arguments of a function. + r"""Gets the arguments of a function. Args: fn (callable): The function to inspect. Returns: - list: A list of argument names (str) of the function. + list: A list of argument names (``str``) of the function. """ - argspec = _inspect_getargspec(fn) + argspec = inspect.getfullargspec(fn) args = argspec.args # Empty args can be because `fn` is decorated. Use `funcsigs.signature` @@ -124,27 +58,8 @@ def get_args(fn): return args -def get_default_arg_values(fn): - """Gets the arguments and respective default values of a function. - - Only arguments with default values are included in the output dictionary. - - Args: - fn (callable): The function to inspect. - - Returns: - dict: A dictionary that maps argument names (str) to their default - values. The dictionary is empty if no arguments have default values. - """ - argspec = _inspect_getargspec(fn) - if argspec.defaults is None: - return {} - num_defaults = len(argspec.defaults) - return dict(zip(argspec.args[-num_defaults:], argspec.defaults)) - - -def check_or_get_class(class_or_name, module_path=None, superclass=None): - """Returns the class and checks if the class inherits :attr:`superclass`. 
+def check_or_get_class(class_or_name, module_paths=None, superclass=None): + r"""Returns the class and checks if the class inherits :attr:`superclass`. Args: class_or_name: Name or full path to the class, or the class itself. @@ -165,8 +80,8 @@ def check_or_get_class(class_or_name, module_path=None, superclass=None): TypeError: If class does not inherits :attr:`superclass`. """ class_ = class_or_name - if is_str(class_): - class_ = get_class(class_, module_path) + if isinstance(class_, str): + class_ = get_class(class_, module_paths) if superclass is not None: if not issubclass(class_, superclass): raise TypeError( @@ -176,13 +91,13 @@ def check_or_get_class(class_or_name, module_path=None, superclass=None): def get_class(class_name, module_paths=None): - """Returns the class based on class name. + r"""Returns the class based on class name. Args: class_name (str): Name or full path to the class. module_paths (list): Paths to candidate modules to search for the class. This is used if the class cannot be located solely based on - `class_name`. The first module in the list that contains the class + ``class_name``. The first module in the list that contains the class is used. Returns: @@ -195,17 +110,9 @@ def get_class(class_name, module_paths=None): class_ = locate(class_name) if (class_ is None) and (module_paths is not None): for module_path in module_paths: - # if module_path in _unimportable_modules: - # Special treatment for unimportable modules by directly - # accessing the class class_ = locate('.'.join([module_path, class_name])) if class_ is not None: break - # else: - # module = importlib.import_module(module_path) - # if class_name in dir(module): - # class_ = getattr(module, class_name) - # break if class_ is None: raise ValueError( @@ -216,18 +123,17 @@ def get_class(class_name, module_paths=None): def check_or_get_instance(ins_or_class_or_name, kwargs, module_paths=None, classtype=None): - """Returns a class instance and checks types. + r"""Returns a class instance and checks types. Args: ins_or_class_or_name: Can be of 3 types: - A class to instantiate. - - A string of the name or full path to a class to \ - instantiate. + - A string of the name or full path to a class to instantiate. - The class instance to check types. kwargs (dict): Keyword arguments for the class constructor. Ignored - if `ins_or_class_or_name` is a class instance. + if ``ins_or_class_or_name`` is a class instance. module_paths (list, optional): Paths to candidate modules to search for the class. This is used if the class cannot be located solely based on :attr:`class_name`. The first module @@ -244,7 +150,7 @@ def check_or_get_instance(ins_or_class_or_name, kwargs, module_paths=None, :attr:`classtype`. """ ret = ins_or_class_or_name - if is_str(ret) or isinstance(ret, type): + if isinstance(ret, (str, type)): ret = get_instance(ret, kwargs, module_paths) if classtype is not None: if not isinstance(ret, classtype): @@ -254,7 +160,7 @@ def check_or_get_instance(ins_or_class_or_name, kwargs, module_paths=None, def get_instance(class_or_name, kwargs, module_paths=None): - """Creates a class instance. + r"""Creates a class instance. 
Args: class_or_name: A class, or its name or full path to a class to @@ -276,7 +182,7 @@ def get_instance(class_or_name, kwargs, module_paths=None): """ # Locate the class class_ = class_or_name - if is_str(class_): + if isinstance(class_, str): class_ = get_class(class_, module_paths) # Check validity of arguments @@ -293,87 +199,8 @@ def get_instance(class_or_name, kwargs, module_paths=None): return class_(**kwargs) -def check_or_get_instance_with_redundant_kwargs( - ins_or_class_or_name, kwargs, module_paths=None, classtype=None): - """Returns a class instance and checks types. - - Only those keyword arguments in :attr:`kwargs` that are included in the - class construction method are used. - - Args: - ins_or_class_or_name: Can be of 3 types: - - - A class to instantiate. - - A string of the name or module path to a class to \ - instantiate. - - The class instance to check types. - - kwargs (dict): Keyword arguments for the class constructor. - module_paths (list, optional): Paths to candidate modules to - search for the class. This is used if the class cannot be - located solely based on :attr:`class_name`. The first module - in the list that contains the class is used. - classtype (optional): A (list of) classes of which the instance must - be an instantiation. - - Raises: - ValueError: If class is not found based on :attr:`class_name` and - :attr:`module_paths`. - ValueError: If :attr:`kwargs` contains arguments that are invalid - for the class construction. - TypeError: If the instance is not an instantiation of - :attr:`classtype`. - """ - ret = ins_or_class_or_name - if is_str(ret) or isinstance(ret, type): - ret = get_instance_with_redundant_kwargs(ret, kwargs, module_paths) - if classtype is not None: - if not isinstance(ret, classtype): - raise TypeError( - "An instance of {} is expected. Got: {}".format(classtype, ret)) - return ret - - -def get_instance_with_redundant_kwargs( - class_name, kwargs, module_paths=None): - """Creates a class instance. - - Only those keyword arguments in :attr:`kwargs` that are included in the - class construction method are used. - - Args: - class_name (str): A class or its name or module path. - kwargs (dict): A dictionary of arguments for the class constructor. It - may include invalid arguments which will be ignored. - module_paths (list of str): A list of paths to candidate modules to - search for the class. This is used if the class cannot be located - solely based on :attr:`class_name`. The first module in the list - that contains the class is used. - - Returns: - A class instance. - - Raises: - ValueError: If class is not found based on :attr:`class_name` and - :attr:`module_paths`. - """ - # Locate the class - class_ = get_class(class_name, module_paths) - - # Select valid arguments - selected_kwargs = {} - class_args = set(get_args(class_.__init__)) - if kwargs is None: - kwargs = {} - for key, value in kwargs.items(): - if key in class_args: - selected_kwargs[key] = value - - return class_(**selected_kwargs) - - def get_function(fn_or_name, module_paths=None): - """Returns the function of specified name and module. + r"""Returns the function of specified name and module. Args: fn_or_name (str or callable): Name or full path to a function, or the @@ -385,21 +212,19 @@ def get_function(fn_or_name, module_paths=None): Returns: A function. + + Raises: + ValueError: If method with name as :attr:`fn_or_name` is not found. 
""" - if is_callable(fn_or_name): + if callable(fn_or_name): return fn_or_name fn = locate(fn_or_name) if (fn is None) and (module_paths is not None): for module_path in module_paths: - # if module_path in _unimportable_modules: fn = locate('.'.join([module_path, fn_or_name])) if fn is not None: break - # module = importlib.import_module(module_path) - # if fn_name in dir(module): - # fn = getattr(module, fn_name) - # break if fn is None: raise ValueError( @@ -408,92 +233,8 @@ def get_function(fn_or_name, module_paths=None): return fn -def call_function_with_redundant_kwargs(fn, kwargs): - """Calls a function and returns the results. - - Only those keyword arguments in :attr:`kwargs` that are included in the - function's argument list are used to call the function. - - Args: - fn (function): A callable. If :attr:`fn` is not a python function, - :attr:`fn.__call__` is called. - kwargs (dict): A `dict` of arguments for the callable. It - may include invalid arguments which will be ignored. - - Returns: - The returned results by calling :attr:`fn`. - """ - try: - fn_args = set(get_args(fn)) - except TypeError: - fn_args = set(get_args(fn.__cal__)) - - if kwargs is None: - kwargs = {} - - # Select valid arguments - selected_kwargs = {} - for key, value in kwargs.items(): - if key in fn_args: - selected_kwargs[key] = value - - return fn(**selected_kwargs) - - -def get_instance_kwargs(kwargs, hparams): - """Makes a dict of keyword arguments with the following structure: - - `kwargs_ = {'hparams': dict(hparams), **kwargs}`. - - This is typically used for constructing a module which takes a set of - arguments as well as a argument named `hparams`. - - Args: - kwargs (dict): A dict of keyword arguments. Can be `None`. - hparams: A dict or an instance of :class:`~texar.tf.HParams` Can be `None`. - - Returns: - A `dict` that contains the keyword arguments in :attr:`kwargs`, and - an additional keyword argument named `hparams`. - """ - if hparams is None or isinstance(hparams, dict): - kwargs_ = {'hparams': hparams} - elif isinstance(hparams, HParams): - kwargs_ = {'hparams': hparams.todict()} - else: - raise ValueError( - '`hparams` must be a dict, an instance of HParams, or a `None`.') - kwargs_.update(kwargs or {}) - return kwargs_ - - -def dict_patch(tgt_dict, src_dict): - """Recursively patch :attr:`tgt_dict` by adding items from :attr:`src_dict` - that do not exist in :attr:`tgt_dict`. - - If respective items in :attr:`src_dict` and :attr:`tgt_dict` are both - `dict`, the :attr:`tgt_dict` item is patched recursively. - - Args: - tgt_dict (dict): Target dictionary to patch. - src_dict (dict): Source dictionary. - - Return: - dict: The new :attr:`tgt_dict` that is patched. - """ - if src_dict is None: - return tgt_dict - - for key, value in src_dict.items(): - if key not in tgt_dict: - tgt_dict[key] = copy.deepcopy(value) - elif isinstance(value, dict) and isinstance(tgt_dict[key], dict): - tgt_dict[key] = dict_patch(tgt_dict[key], value) - return tgt_dict - - def dict_lookup(dict_, keys, default=None): - """Looks up :attr:`keys` in the dict, returns the corresponding values. + r"""Looks up :attr:`keys` in the dict, returns the corresponding values. The :attr:`default` is used for keys not present in the dict. @@ -513,109 +254,31 @@ def dict_lookup(dict_, keys, default=None): return np.vectorize(lambda x: dict_.get(x, default))(keys) -def dict_fetch(src_dict, tgt_dict_or_keys): - """Fetches a sub dict of :attr:`src_dict` with the keys in - :attr:`tgt_dict_or_keys`. 
- - Args: - src_dict: A dict or instance of :class:`~texar.tf.HParams`. - The source dict to fetch values from. - tgt_dict_or_keys: A dict, instance of :class:`~texar.tf.HParams`, - or a list (or a dict_keys) of keys to be included in the output - dict. - - Returns: - A new dict that is a subdict of :attr:`src_dict`. - """ - if src_dict is None: - return src_dict - - if isinstance(tgt_dict_or_keys, HParams): - tgt_dict_or_keys = tgt_dict_or_keys.todict() - if isinstance(tgt_dict_or_keys, dict): - tgt_dict_or_keys = tgt_dict_or_keys.keys() - keys = list(tgt_dict_or_keys) - - if isinstance(src_dict, HParams): - src_dict = src_dict.todict() - - return {k: src_dict[k] for k in keys if k in src_dict} - - -def dict_pop(dict_, pop_keys, default=None): - """Removes keys from a dict and returns their values. - - Args: - dict_ (dict): A dictionary from which items are removed. - pop_keys: A key or a list of keys to remove and return respective - values or :attr:`default`. - default (optional): Value to be returned when a key is not in - :attr:`dict_`. The default value is `None`. - - Returns: - A `dict` of the items removed from :attr:`dict_`. - """ - if not isinstance(pop_keys, (list, tuple)): - pop_keys = [pop_keys] - ret_dict = {key: dict_.pop(key, default) for key in pop_keys} - return ret_dict - - -def flatten_dict(dict_, parent_key="", sep="."): - """Flattens a nested dictionary. Namedtuples within the dictionary are - converted to dicts. - - Adapted from: - https://github.com/google/seq2seq/blob/master/seq2seq/models/model_base.py - - Args: - dict_ (dict): The dictionary to flatten. - parent_key (str): A prefix to prepend to each key. - sep (str): Separator that intervenes between parent and child keys. - E.g., if `sep` == '.', then `{ "a": { "b": 3 } }` is converted - into `{ "a.b": 3 }`. - - Returns: - A new flattened `dict`. - """ - items = [] - for key, value in dict_.items(): - key_ = parent_key + sep + key if parent_key else key - if isinstance(value, collections.MutableMapping): - items.extend(flatten_dict(value, key_, sep=sep).items()) - elif isinstance(value, tuple) and hasattr(value, "_asdict"): - dict_items = collections.OrderedDict(zip(value._fields, value)) - items.extend(flatten_dict(dict_items, key_, sep=sep).items()) - else: - items.append((key_, value)) - return dict(items) - - -def default_str(str_, default_str): - """Returns :attr:`str_` if it is not `None` or empty, otherwise returns - :attr:`default_str`. +def default_str(str_, default_str_): + r"""Returns :attr:`str_` if it is not `None` or empty, otherwise returns + :attr:`default_str_`. Args: str_: A string. - default_str: A string. + default_str_: A string. Returns: - Either :attr:`str_` or :attr:`default_str`. + Either :attr:`str_` or :attr:`default_str_`. """ if str_ is not None and str_ != "": return str_ else: - return default_str + return default_str_ def uniquify_str(str_, str_set): - """Uniquifies :attr:`str_` if :attr:`str_` is included in :attr:`str_set`. + r"""Uniquifies :attr:`str_` if :attr:`str_` is included in :attr:`str_set`. This is done by appending a number to :attr:`str_`. Returns :attr:`str_` directly if it is not included in :attr:`str_set`. Args: - str_ (string): A string to uniquify. + str\_ (string): A string to uniquify. str_set (set, dict, or list): A collection of strings. The returned string is guaranteed to be different from the elements in the collection. 
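
``uniquify_str`` keeps appending the smallest unused ``_%d`` suffix, as
the retained unit test below exercises:

.. code-block:: python

    from texar.tf.utils.utils import uniquify_str

    uniquify_str('str', ['str'])                    # -> 'str_1'
    uniquify_str('str', ['str', 'str_1', 'str_2'])  # -> 'str_3'
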
@@ -639,344 +302,11 @@ def uniquify_str(str_, str_set): unique_str = str_ + "_%d" % i if unique_str not in str_set: return unique_str - raise ValueError("Fails to uniquify string: " + str_) - - -def _recur_split(s, dtype_as): - """Splits (possibly nested list of) strings recursively. - """ - if is_str(s): - return _maybe_list_to_array(s.split(), dtype_as) - else: - s_ = [_recur_split(si, dtype_as) for si in s] - return _maybe_list_to_array(s_, s) - - -def strip_token(str_, token, is_token_list=False, compat=True): - """Returns a copy of strings with leading and trailing tokens removed. - - Note that besides :attr:`token`, all leading and trailing whitespace - characters are also removed. - - If :attr:`is_token_list` is False, then the function assumes tokens in - :attr:`str_` are separated with whitespace character. - - Args: - str_: A `str`, or an `n`-D numpy array or (possibly nested) - list of `str`. - token (str): The token to strip, e.g., the '' token defined in - :class:`~texar.tf.data.SpecialTokens`.PAD - is_token_list (bool): Whether each sentence in :attr:`str_` is a list - of tokens. If False, each sentence in :attr:`str_` is assumed to - contain tokens separated with space character. - compat (bool): Whether to convert tokens into `unicode` (Python 2) - or `str` (Python 3). - - Returns: - The stripped strings of the same structure/shape as :attr:`str_`. - - Example: - - .. code-block:: python - - str_ = ' a sentence ' - str_stripped = strip_token(str_, '') - # str_stripped == 'a sentence' - - str_ = ['', 'a', 'sentence', '', '', '', ''] - str_stripped = strip_token(str_, '', is_token_list=True) - # str_stripped == 'a sentence' - """ - def _recur_strip(s): - if is_str(s): - if token == "": - return ' '.join(s.strip().split()) - else: - return ' '.join(s.strip().split()).\ - replace(' ' + token, '').replace(token + ' ', '') - else: - s_ = [_recur_strip(si) for si in s] - return _maybe_list_to_array(s_, s) - - s = str_ - - if compat: - s = compat_as_text(s) - - if is_token_list: - s = str_join(s, compat=False) - - strp_str = _recur_strip(s) - - if is_token_list: - strp_str = _recur_split(strp_str, str_) - - return strp_str - - -def strip_eos(str_, eos_token='', is_token_list=False, compat=True): - """Remove the EOS token and all subsequent tokens. - - If :attr:`is_token_list` is False, then the function assumes tokens in - :attr:`str_` are separated with whitespace character. - - Args: - str_: A `str`, or an `n`-D numpy array or (possibly nested) - list of `str`. - eos_token (str): The EOS token. Default is '' as defined in - :class:`~texar.tf.data.SpecialTokens`.EOS - is_token_list (bool): Whether each sentence in :attr:`str_` is a list - of tokens. If False, each sentence in :attr:`str_` is assumed to - contain tokens separated with space character. - compat (bool): Whether to convert tokens into `unicode` (Python 2) - or `str` (Python 3). - - Returns: - Strings of the same structure/shape as :attr:`str_`. 
- """ - def _recur_strip(s): - if is_str(s): - s_tokens = s.split() - if eos_token in s_tokens: - return ' '.join(s_tokens[:s_tokens.index(eos_token)]) - else: - return s - else: - s_ = [_recur_strip(si) for si in s] - return _maybe_list_to_array(s_, s) - - s = str_ - - if compat: - s = compat_as_text(s) - - if is_token_list: - s = str_join(s, compat=False) - - strp_str = _recur_strip(s) - - if is_token_list: - strp_str = _recur_split(strp_str, str_) - - return strp_str - - -_strip_eos_ = strip_eos - - -def strip_bos(str_, bos_token='', is_token_list=False, compat=True): - """Remove all leading BOS tokens. - - Note that besides :attr:`bos_token`, all leading and trailing whitespace - characters are also removed. - - If :attr:`is_token_list` is False, then the function assumes tokens in - :attr:`str_` are separated with whitespace character. - - Args: - str_: A `str`, or an `n`-D numpy array or (possibly nested) - list of `str`. - bos_token (str): The BOS token. Default is '' as defined in - :class:`~texar.tf.data.SpecialTokens`.BOS - is_token_list (bool): Whether each sentence in :attr:`str_` is a list - of tokens. If False, each sentence in :attr:`str_` is assumed to - contain tokens separated with space character. - compat (bool): Whether to convert tokens into `unicode` (Python 2) - or `str` (Python 3). - - Returns: - Strings of the same structure/shape as :attr:`str_`. - """ - def _recur_strip(s): - if is_str(s): - if bos_token == '': - return ' '.join(s.strip().split()) - else: - return ' '.join(s.strip().split()).replace(bos_token + ' ', '') - else: - s_ = [_recur_strip(si) for si in s] - return _maybe_list_to_array(s_, s) - - s = str_ - - if compat: - s = compat_as_text(s) - - if is_token_list: - s = str_join(s, compat=False) - - strp_str = _recur_strip(s) - - if is_token_list: - strp_str = _recur_split(strp_str, str_) - - return strp_str - - -_strip_bos_ = strip_bos - - -def strip_special_tokens(str_, strip_pad='', strip_bos='', - strip_eos='', is_token_list=False, compat=True): - """Removes special tokens in strings, including: - - - Removes EOS and all subsequent tokens - - Removes leading and and trailing PAD tokens - - Removes leading BOS tokens - - Note that besides the special tokens, all leading and trailing whitespace - characters are also removed. - - This is a joint function of :func:`strip_eos`, :func:`strip_pad`, and - :func:`strip_bos` - - Args: - str_: A `str`, or an `n`-D numpy array or (possibly nested) - list of `str`. - strip_pad (str): The PAD token to strip from the strings (i.e., remove - the leading and trailing PAD tokens of the strings). Default - is '' as defined in - :class:`~texar.tf.data.SpecialTokens`.PAD. - Set to `None` or `False` to disable the stripping. - strip_bos (str): The BOS token to strip from the strings (i.e., remove - the leading BOS tokens of the strings). - Default is '' as defined in - :class:`~texar.tf.data.SpecialTokens`.BOS. - Set to `None` or `False` to disable the stripping. - strip_eos (str): The EOS token to strip from the strings (i.e., remove - the EOS tokens and all subsequent tokens of the strings). - Default is '' as defined in - :class:`~texar.tf.data.SpecialTokens`.EOS. - Set to `None` or `False` to disable the stripping. - is_token_list (bool): Whether each sentence in :attr:`str_` is a list - of tokens. If False, each sentence in :attr:`str_` is assumed to - contain tokens separated with space character. - compat (bool): Whether to convert tokens into `unicode` (Python 2) - or `str` (Python 3). 
- - Returns: - Strings of the same shape of :attr:`str_` with special tokens stripped. - """ - s = str_ - - if compat: - s = compat_as_text(s) - - if is_token_list: - s = str_join(s, compat=False) - - if strip_eos is not None and strip_eos is not False: - s = _strip_eos_(s, strip_eos, is_token_list=False, compat=False) - - if strip_pad is not None and strip_pad is not False: - s = strip_token(s, strip_pad, is_token_list=False, compat=False) - - if strip_bos is not None and strip_bos is not False: - s = _strip_bos_(s, strip_bos, is_token_list=False, compat=False) - - if is_token_list: - s = _recur_split(s, str_) - - return s - - -def str_join(tokens, sep=' ', compat=True): - """Concats :attr:`tokens` along the last dimension with intervening - occurrences of :attr:`sep`. - - Args: - tokens: An `n`-D numpy array or (possibly nested) list of `str`. - sep (str): The string intervening between the tokens. - compat (bool): Whether to convert tokens into `unicode` (Python 2) - or `str` (Python 3). - - Returns: - An `(n-1)`-D numpy array (or list) of `str`. - """ - def _recur_join(s): - if len(s) == 0: - return '' - elif is_str(s[0]): - return sep.join(s) - else: - s_ = [_recur_join(si) for si in s] - return _maybe_list_to_array(s_, s) - - if compat: - tokens = compat_as_text(tokens) - - str_ = _recur_join(tokens) - - return str_ - - -def map_ids_to_strs(ids, vocab, join=True, strip_pad='', - strip_bos='', strip_eos='', compat=True): - """Transforms `int` indexes to strings by mapping ids to tokens, - concatenating tokens into sentences, and stripping special tokens, etc. - - Args: - ids: An n-D numpy array or (possibly nested) list of `int` indexes. - vocab: An instance of :class:`~texar.tf.data.Vocab`. - join (bool): Whether to concat along the last dimension of the - the tokens into a string separated with a space character. - strip_pad (str): The PAD token to strip from the strings (i.e., remove - the leading and trailing PAD tokens of the strings). Default - is '' as defined in - :class:`~texar.tf.data.SpecialTokens`.PAD. - Set to `None` or `False` to disable the stripping. - strip_bos (str): The BOS token to strip from the strings (i.e., remove - the leading BOS tokens of the strings). - Default is '' as defined in - :class:`~texar.tf.data.SpecialTokens`.BOS. - Set to `None` or `False` to disable the stripping. - strip_eos (str): The EOS token to strip from the strings (i.e., remove - the EOS tokens and all subsequent tokens of the strings). - Default is '' as defined in - :class:`~texar.tf.data.SpecialTokens`.EOS. - Set to `None` or `False` to disable the stripping. - - Returns: - If :attr:`join` is True, returns a `(n-1)`-D numpy array (or list) of - concatenated strings. If :attr:`join` is False, returns an `n`-D numpy - array (or list) of str tokens. - - Example: - - .. 
code-block:: python - - text_ids = [[1, 9, 6, 2, 0, 0], [1, 28, 7, 8, 2, 0]] - - text = map_ids_to_strs(text_ids, data.vocab) - # text == ['a sentence', 'parsed from ids'] - - text = map_ids_to_strs( - text_ids, data.vocab, join=False, - strip_pad=None, strip_bos=None, strip_eos=None) - # text == [['', 'a', 'sentence', '', '', ''], - # ['', 'parsed', 'from', 'ids', '', '']] - """ - tokens = vocab.map_ids_to_tokens_py(ids) - if isinstance(ids, (list, tuple)): - tokens = tokens.tolist() - - if compat: - tokens = compat_as_text(tokens) - - str_ = str_join(tokens, compat=False) - - str_ = strip_special_tokens( - str_, strip_pad=strip_pad, strip_bos=strip_bos, strip_eos=strip_eos, - compat=False) - - if join: - return str_ - else: - return _recur_split(str_, ids) + raise ValueError("Failed to uniquify string: " + str_) def ceildiv(a, b): - """Divides with ceil. + r"""Divides with ceil. E.g., `5 / 2 = 2.5`, `ceildiv(5, 2) = 3`. @@ -990,24 +320,7 @@ def ceildiv(a, b): return -(-a // b) -def straight_through(fw_tensor, bw_tensor): - """Use a tensor in forward pass while backpropagating gradient to another. - - Args: - fw_tensor: A tensor to be used in the forward pass. - bw_tensor: A tensor to which gradient is backpropagated. Must have the - same shape and type with :attr:`fw_tensor`. - - Returns: - A tensor of the same shape and value with :attr:`fw_tensor` but will - direct gradient to bw_tensor. - """ - return tf.stop_gradient(fw_tensor) + bw_tensor - tf.stop_gradient(bw_tensor) - - -def truncate_seq_pair(tokens_a: Union[List[int], List[str]], - tokens_b: Union[List[int], List[str]], - max_length: int): +def truncate_seq_pair(tokens_a, tokens_b, max_length): r"""Truncates a sequence pair in place to the maximum length. This is a simple heuristic which will always truncate the longer sequence @@ -1017,9 +330,6 @@ def truncate_seq_pair(tokens_a: Union[List[int], List[str]], longer sequence. Example: - - .. code-block:: python - tokens_a = [1, 2, 3, 4, 5] tokens_b = [6, 7] truncate_seq_pair(tokens_a, tokens_b, 5) diff --git a/texar/tf/utils/utils_io.py b/texar/tf/utils/utils_io.py index fb8b9bdf..9230d0f6 100644 --- a/texar/tf/utils/utils_io.py +++ b/texar/tf/utils/utils_io.py @@ -1,5 +1,4 @@ -# -*- coding: utf-8 -*- -# Copyright 2018 The Texar Authors. All Rights Reserved. +# Copyright 2019 The Texar Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,248 +15,93 @@ Utility functions related to input/output. """ -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division -from __future__ import unicode_literals - -# pylint: disable=invalid-name, redefined-builtin, too-many-arguments - -from io import open import os -import importlib -import yaml - -import tensorflow as tf - -as_text = tf.compat.as_text __all__ = [ - "load_config_single", - "load_config", "write_paired_text", "maybe_create_dir", - "get_files" ] -# def get_tf_logger(fname, -# verbosity=tf.logging.INFO, -# to_stdio=False, -# stdio_verbosity=None): -# """Creates TF logger that allows to specify log filename and whether to -# print to stdio at the same time. -# -# Args: -# fname (str): The log filename. -# verbosity: The threshold for what messages will be logged. Default is -# `INFO`. Other options include `DEBUG`, `ERROR`, `FATAL`, and `WARN`. -# See :tf_main:`tf.logging `. -# to_stdio (bool): Whether to print messages to stdio at the same time. 
-# stido_verbosity (optional): The verbosity level when printing to stdio. -# If `None` (default), the level is set to be the same as -# :attr:`verbosity`. Ignored if :attr:`to_stdio` is False. -# -# Returns: -# The TF logger. -# """ - - -def _load_config_python(fname): - config = {} - - config_module = importlib.import_module(fname.rstrip('.py')) - for key in dir(config_module): - if not (key.startswith('__') and key.endswith('__')): - config[key] = getattr(config_module, key) - - return config - - -def _load_config_yaml(fname): - with tf.gfile.GFile(fname) as config_file: - config = yaml.load(config_file) - return config - - -def load_config_single(fname, config=None): - """Loads config from a single file. - - The config file can be either a Python file (with suffix '.py') - or a YAML file. If the filename is not suffixed with '.py', the file is - parsed as YAML. - - Args: - fname (str): The config file name. - config (dict, optional): A config dict to which new configurations are - added. If `None`, a new config dict is created. - - Returns: - A `dict` of configurations. - """ - if fname.endswith('.py'): - new_config = _load_config_python(fname) - else: - new_config = _load_config_yaml(fname) - - if config is None: - config = new_config - else: - for key, value in new_config.items(): - if key in config: - if isinstance(config[key], dict): - config[key].update(value) - else: - config[key] = value - else: - config[key] = value - - return config - - -def load_config(config_path, config=None): - """Loads configs from (possibly multiple) file(s). - - A config file can be either a Python file (with suffix '.py') - or a YAML file. If the filename is not suffixed with '.py', the file is - parsed as YAML. - - Args: - config_path: Paths to configuration files. This can be a `list` of - config file names, or a path to a directory in which all files - are loaded, or a string of multiple file names separated by commas. - config (dict, optional): A config dict to which new configurations are - added. If `None`, a new config dict is created. - - Returns: - A `dict` of configurations. - """ - fnames = [] - if isinstance(config_path, (list, tuple)): - fnames = list(config_path) - elif tf.gfile.IsDirectory(config_path): - for fname in tf.gfile.ListDirectory(config_path): - fname = os.path.join(config_path, fname) - if not tf.gfile.IsDirectory(fname): - fnames.append(fname) - else: - for fname in config_path.split(","): - fname = fname.strip() - if not fname: - continue - fnames.append(fname) - - if config is None: - config = {} - - for fname in fnames: - config = load_config_single(fname, config) - return config - - -# pylint: disable=too-many-locals def write_paired_text(src, tgt, fname, append=False, mode='h', sep='\t', src_fname_suffix='src', tgt_fname_suffix='tgt'): - """Writes paired text to a file. + r"""Writes paired text to a file. Args: - src: A list (or array) of `str` source text. - tgt: A list (or array) of `str` target text. + src: A list (or array) of ``str`` source text. + tgt: A list (or array) of ``str`` target text. fname (str): The output filename. append (bool): Whether append content to the end of the file if exists. mode (str): The mode of writing, with the following options: - - **'h'**: The "horizontal" mode. Each source target pair is \ - written in one line, intervened with :attr:`sep`, e.g.:: + - **'h'**: The "horizontal" mode. 
Each source target pair is + written in one line, intervened with :attr:`sep`, e.g.:: - source_1 target_1 - source_2 target_2 + source_1 target_1 + source_2 target_2 - - **'v'**: The "vertical" mode. Each source target pair is \ - written in two consecutive lines, e.g:: + - **'v'**: The ``"vertical"`` mode. Each source target pair is + written in two consecutive lines, e.g:: - source_1 - target_1 - source_2 - target_2 + source_1 + target_1 + source_2 + target_2 - - **'s'**: The "separate" mode. Each source target pair is \ - written in corresponding lines of two files named \ - as `"{fname}.{src_fname_suffix}"` \ - and `"{fname}.{tgt_fname_suffix}"`, respectively. + - **'s'**: The "separate" mode. Each source target pair is + written in corresponding lines of two files named + as ``"{fname}.{src_fname_suffix}"`` + and ``"{fname}.{tgt_fname_suffix}"``, respectively. sep (str): The string intervening between source and target. Used - when :attr:`mode` is set to 'h'. - src_fname_suffix (str): Used when :attr:`mode` is 's'. The suffix to - the source output filename. E.g., with - `(fname='output', src_fname_suffix='src')`, the output source file - is named as `output.src`. - tgt_fname_suffix (str): Used when :attr:`mode` is 's'. The suffix to - the target output filename. + when :attr:`mode` is set to ``"h"``. + src_fname_suffix (str): Used when :attr:`mode` is ``"s"``. The suffix + to the source output filename. For example, with + ``(fname='output', src_fname_suffix='src')``, the output source + file is named as ``output.src``. + tgt_fname_suffix (str): Used when :attr:`mode` is ``"s"``. The suffix + to the target output filename. Returns: - The fileanme(s). If `mode` == 'h' or 'v', returns - :attr:`fname`. If `mode` == 's', returns a list of filenames - `["{fname}.src", "{fname}.tgt"]`. + The filename(s). If ``mode`` == ``"h"`` or ``"v"``, returns + :attr:`fname`. If ``mode`` == ``"s"``, returns a list of filenames + ``["{fname}.src", "{fname}.tgt"]``. """ fmode = 'a' if append else 'w' if mode == 's': fn_src = '{}.{}'.format(fname, src_fname_suffix) fn_tgt = '{}.{}'.format(fname, tgt_fname_suffix) with open(fn_src, fmode, encoding='utf-8') as fs: - fs.write(as_text('\n'.join(src))) + fs.write('\n'.join(src)) fs.write('\n') with open(fn_tgt, fmode, encoding='utf-8') as ft: - ft.write(as_text('\n'.join(tgt))) + ft.write('\n'.join(tgt)) ft.write('\n') return fn_src, fn_tgt else: with open(fname, fmode, encoding='utf-8') as f: for s, t in zip(src, tgt): if mode == 'h': - text = '{}{}{}\n'.format(as_text(s), sep, as_text(t)) - f.write(as_text(text)) + text = '{}{}{}\n'.format(s, sep, t) + f.write(text) elif mode == 'v': - text = '{}\n{}\n'.format(as_text(s), as_text(t)) - f.write(as_text(text)) + text = '{}\n{}\n'.format(s, t) + f.write(text) else: raise ValueError('Unknown mode: {}'.format(mode)) return fname -def maybe_create_dir(dirname): - """Creates directory if doesn't exist - """ - if not tf.gfile.IsDirectory(dirname): - tf.gfile.MakeDirs(dirname) - return True - return False - - -def get_files(file_paths): - """Gets a list of file paths given possibly a pattern :attr:`file_paths`. - - Adapted from `tf.contrib.slim.data.parallel_reader.get_data_files`. +def maybe_create_dir(dirname: str) -> bool: + r"""Creates directory if it does not exist. Args: - file_paths: A (list of) path to the files. The path can be a pattern, - e.g., /path/to/train*, /path/to/train[12] + dirname (str): Path to the directory. Returns: - A list of file paths. 
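
A usage sketch of the ``write_paired_text`` modes documented above; the
filenames are illustrative:

.. code-block:: python

    src = ['source_1', 'source_2']
    tgt = ['target_1', 'target_2']

    # 'h': one "source<sep>target" pair per line of a single file.
    write_paired_text(src, tgt, 'out.txt', mode='h', sep='\t')

    # 's': parallel files 'out.src' and 'out.tgt', one line per pair.
    write_paired_text(src, tgt, 'out', mode='s')
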
- - Raises: - ValueError: If no files are not found + bool: Whether a new directory is created. """ - if isinstance(file_paths, (list, tuple)): - files = [] - for f in file_paths: - files += get_files(f) - else: - if '*' in file_paths or '?' in file_paths or '[' in file_paths: - files = tf.gfile.Glob(file_paths) - else: - files = [file_paths] - if not files: - raise ValueError('No data files found in %s' % (file_paths,)) - return files + if not os.path.isdir(dirname): + os.makedirs(dirname) + return True + return False diff --git a/texar/tf/utils/utils_test.py b/texar/tf/utils/utils_test.py index b35a3299..a6b87731 100644 --- a/texar/tf/utils/utils_test.py +++ b/texar/tf/utils/utils_test.py @@ -1,138 +1,19 @@ -# -*- coding: utf-8 -*- -# """ Unit tests for utility functions. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import tempfile -import numpy as np - import tensorflow as tf from texar.tf.utils import utils -from texar.tf.data.vocabulary import Vocab class UtilsTest(tf.test.TestCase): """Tests utility functions. """ - - def test_dict_patch(self): - """Tests :meth:`texar.tf.utils.dict_patch`. - """ - src_dict = { - "k1": "k1", - "k_dict_1": { - "kd1_k1": "kd1_k1", - "kd1_k2": "kd1_k2" - }, - "k_dict_2": { - "kd2_k1": "kd2_k1" - } - } - tgt_dict = { - "k1": "k1_tgt", - "k_dict_1": { - "kd1_k1": "kd1_k1" - }, - "k_dict_2": "kd2_not_dict" - } - - patched_dict = utils.dict_patch(tgt_dict, src_dict) - self.assertEqual(patched_dict["k1"], tgt_dict["k1"]) - self.assertEqual(patched_dict["k_dict_1"], src_dict["k_dict_1"]) - self.assertEqual(patched_dict["k_dict_2"], tgt_dict["k_dict_2"]) - - def test_strip_token(self): - """Tests :func:`texar.tf.utils.strip_token` - """ - str_ = " \t i am \t \t" - self.assertEqual(utils.strip_token(str_, ""), "i am") - self.assertEqual(utils.strip_token(str_, ""), - " i am ") - self.assertEqual(utils.strip_token([str_], ""), ["i am"]) - self.assertEqual( - utils.strip_token(np.asarray([str_]), ""), - ["i am"]) - self.assertEqual(type(utils.strip_token(np.asarray([str_]), "")), - np.ndarray) - self.assertEqual( - utils.strip_token([[[str_]], ['']], ""), - [[["i am"]], ['']]) - - str_ = str_.split() - self.assertEqual(utils.strip_token(str_, "", is_token_list=True), - ["i", "am"]) - self.assertEqual(utils.strip_token([str_], "", is_token_list=True), - [["i", "am"]]) - - def test_strip_bos(self): - """Tests :func:`texar.tf.utils.strip_bos` - """ - str_ = " i am" - self.assertEqual(utils.strip_bos(str_, ""), "i am") - self.assertEqual(utils.strip_bos(str_, ""), " i am") - self.assertEqual(utils.strip_bos([str_], ""), ["i am"]) - - str_ = str_.split() - self.assertEqual(utils.strip_bos(str_, "", is_token_list=True), - ["i", "am"]) - self.assertEqual(utils.strip_bos([str_], "", is_token_list=True), - [["i", "am"]]) - - def test_strip_eos(self): - """Tests :func:`texar.tf.utils.strip_eos` - """ - str_ = "i am " - self.assertEqual(utils.strip_eos(str_, ""), "i am") - self.assertEqual(utils.strip_eos([str_], ""), ["i am"]) - - str_ = str_.split() - self.assertEqual(utils.strip_eos(str_, "", is_token_list=True), - ["i", "am"]) - self.assertEqual(utils.strip_eos([str_], "", is_token_list=True), - [["i", "am"]]) - - def test_strip_special_tokens(self): - """Test :func:`texar.tf.utils.strip_special_tokens` - """ - str_ = " i am " - self.assertEqual(utils.strip_special_tokens(str_), "i am") - self.assertEqual(utils.strip_special_tokens([str_]), ["i am"]) - - 
str_ = str_.split() - self.assertEqual(utils.strip_special_tokens(str_, is_token_list=True), - ["i", "am"]) - self.assertEqual(utils.strip_special_tokens([str_], is_token_list=True), - [["i", "am"]]) - - def test_str_join(self): - """Tests :func:`texar.tf.utils.str_join` - """ - tokens = np.ones([2, 2, 3], dtype='str') - - str_ = utils.str_join(tokens) - np.testing.assert_array_equal( - str_, np.asarray([['1 1 1', '1 1 1'], ['1 1 1', '1 1 1']])) - self.assertIsInstance(str_, np.ndarray) - - str_ = utils.str_join(tokens.tolist()) - np.testing.assert_array_equal( - str_, [['1 1 1', '1 1 1'], ['1 1 1', '1 1 1']]) - self.assertIsInstance(str_, list) - - tokens = [[], ['1', '1']] - str_ = utils.str_join(tokens) - np.testing.assert_array_equal(str_, ['', '1 1']) - def test_uniquify_str(self): """Tests :func:`texar.tf.utils.uniquify_str`. """ + str_set = ['str'] unique_str = utils.uniquify_str('str', str_set) self.assertEqual(unique_str, 'str_1') @@ -142,26 +23,6 @@ def test_uniquify_str(self): unique_str = utils.uniquify_str('str', str_set) self.assertEqual(unique_str, 'str_3') - def test_map_ids_to_strs(self): - """Tests :func:`texar.tf.utils.map_ids_to_strs`. - """ - vocab_list = ['word', '词'] - vocab_file = tempfile.NamedTemporaryFile() - vocab_file.write('\n'.join(vocab_list).encode("utf-8")) - vocab_file.flush() - vocab = Vocab(vocab_file.name) - - text = [['', 'word', '词', '', ''], - ['word', '词', 'word', '词', '']] - text = np.asarray(text) - ids = vocab.map_tokens_to_ids_py(text) - - ids = ids.tolist() - text_ = utils.map_ids_to_strs(ids, vocab) - - self.assertEqual(text_[0], 'word 词') - self.assertEqual(text_[1], 'word 词 word 词') - def test_truncate_seq_pair(self): tokens_a = [1, 2, 3] diff --git a/texar/tf/utils/variables.py b/texar/tf/utils/variables.py deleted file mode 100644 index d4b2b10e..00000000 --- a/texar/tf/utils/variables.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright 2018 The Texar Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Utility functions related to variables. -""" - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -# pylint: disable=invalid-name - -import tensorflow as tf - -__all__ = [ - "get_unique_named_variable_scope", - "add_variable", - "collect_trainable_variables" -] - - -def get_unique_named_variable_scope(base_name): - """Returns a variable scope with a unique name. - - Args: - base_name (str): The base name to uniquified. - - Returns: - An instance of :tf_main:`variable_scope `. - - Example: - - .. code-block:: python - - vs = get_unique_named_variable_scope('base_name') - with tf.variable_scope(vs): - .... - """ - with tf.variable_scope(None, default_name=base_name) as vs: - return vs - - -def add_variable(variable, var_list): - """Adds variable to a given list. - - Args: - variable: A (list of) variable(s). - var_list (list): The list where the :attr:`variable` are added to. 
- """ - if isinstance(variable, (list, tuple)): - for var in variable: - add_variable(var, var_list) - else: - if variable not in var_list: - var_list.append(variable) - - -def collect_trainable_variables(modules): - """Collects all trainable variables of modules. - - Trainable variables included in multiple modules occur only once in the - returned list. - - Args: - modules: A (list of) instance of the subclasses of - :class:`~texar.tf.modules.ModuleBase`. - - Returns: - A list of trainable variables in the modules. - """ - if not isinstance(modules, (list, tuple)): - modules = [modules] - - var_list = [] - for mod in modules: - add_variable(mod.trainable_variables, var_list) - - return var_list diff --git a/texar/tf/version.py b/texar/tf/version.py index 89e9c67a..0fd9a2e5 100644 --- a/texar/tf/version.py +++ b/texar/tf/version.py @@ -13,8 +13,8 @@ # limitations under the License. _MAJOR = "0" -_MINOR = "2" -_REVISION = "4-unreleased" +_MINOR = "4" +_REVISION = "0-unreleased" VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR) VERSION = "{0}.{1}.{2}".format(_MAJOR, _MINOR, _REVISION)