rnn.py

#   Copyright 2023 Sicong Zang
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
#
#   P.S. We thank Ha and Eck [1] for their public source codes.
#        And the details about their work can be found below.
#
#       [1] https://github.com/tensorflow/magenta/tree/master/magenta/models/sketch_rnn
#
""" Modules for RNN layers"""

import numpy as np
import tensorflow as tf


def orthogonal(shape):
  """Orthogonal initilaizer."""
  flat_shape = (shape[0], np.prod(shape[1:]))
  a = np.random.normal(0.0, 1.0, flat_shape)
  u, _, v = np.linalg.svd(a, full_matrices=False)
  q = u if u.shape == flat_shape else v
  return q.reshape(shape)


def orthogonal_initializer(scale=1.0):
  """Orthogonal initializer."""
  def _initializer(shape, dtype=tf.float32, partition_info=None):  # pylint: disable=unused-argument
    return tf.constant(orthogonal(shape) * scale, dtype)

  return _initializer


def lstm_ortho_initializer(scale=1.0):
  """LSTM orthogonal initializer."""
  def _initializer(shape, dtype=tf.float32, partition_info=None):  # pylint: disable=unused-argument
    size_x = int(shape[0])
    size_h = int(shape[1] / 4)  # Assumes lstm.
    t = np.zeros(shape)
    t[:, :size_h] = orthogonal([size_x, size_h]) * scale
    t[:, size_h:size_h * 2] = orthogonal([size_x, size_h]) * scale
    t[:, size_h * 2:size_h * 3] = orthogonal([size_x, size_h]) * scale
    t[:, size_h * 3:] = orthogonal([size_x, size_h]) * scale
    return tf.constant(t, dtype)

  return _initializer


class LSTMCell(tf.contrib.rnn.RNNCell):
  """ Vanilla LSTM cell.
  Uses ortho initializer, and also recurrent dropout without memory loss
  (https://arxiv.org/abs/1603.05118)
  """

  def __init__(self,
               num_units,
               forget_bias=1.0,
               use_recurrent_dropout=False,
               dropout_keep_prob=0.9):
    self.num_units = num_units
    self.forget_bias = forget_bias
    self.use_recurrent_dropout = use_recurrent_dropout
    self.dropout_keep_prob = dropout_keep_prob

  @property
  def state_size(self):
    return 2 * self.num_units

  @property
  def output_size(self):
    return self.num_units

  def get_output(self, state):
    unused_c, h = tf.split(state, 2, 1)
    return h

  def __call__(self, x, state, scope=None):
    with tf.variable_scope(scope or type(self).__name__):
      c, h = tf.split(state, 2, 1)

      x_size = x.get_shape().as_list()[1]

      w_init = None  # Uniform

      h_init = lstm_ortho_initializer(1.0)

      # Keep W_xh and W_hh separate here as well to use different init methods.
      w_xh = tf.get_variable(
          'W_xh', [x_size, 4 * self.num_units], initializer=w_init)
      w_hh = tf.get_variable(
          'W_hh', [self.num_units, 4 * self.num_units], initializer=h_init)
      bias = tf.get_variable(
          'bias', [4 * self.num_units],
          initializer=tf.constant_initializer(0.0))

      concat = tf.concat([x, h], 1)
      w_full = tf.concat([w_xh, w_hh], 0)
      hidden = tf.matmul(concat, w_full) + bias

      i, j, f, o = tf.split(hidden, 4, 1)

      if self.use_recurrent_dropout:
        g = tf.nn.dropout(tf.tanh(j), self.dropout_keep_prob)
      else:
        g = tf.tanh(j)

      new_c = c * tf.sigmoid(f + self.forget_bias) + tf.sigmoid(i) * g
      new_h = tf.tanh(new_c) * tf.sigmoid(o)

      return new_h, tf.concat([new_c, new_h], 1)  # fuk tuples.


def layer_norm_all(h,
                   batch_size,
                   base,
                   num_units,
                   scope='layer_norm',
                   reuse=False,
                   gamma_start=1.0,
                   epsilon=1e-3,
                   use_bias=True):
  """Layer Norm (faster version, but not using defun)."""
  # Performs layer norm on multiple base at once (ie, i, g, j, o for lstm)
  # Reshapes h in to perform layer norm in parallel
  h_reshape = tf.reshape(h, [batch_size, base, num_units])
  mean = tf.reduce_mean(h_reshape, [2], keep_dims=True)
  var = tf.reduce_mean(tf.square(h_reshape - mean), [2], keep_dims=True)
  epsilon = tf.constant(epsilon)
  rstd = tf.rsqrt(var + epsilon)
  h_reshape = (h_reshape - mean) * rstd
  # Reshape back to original
  h = tf.reshape(h_reshape, [batch_size, base * num_units])
  with tf.variable_scope(scope):
    if reuse:
      tf.get_variable_scope().reuse_variables()
    gamma = tf.get_variable(
        'ln_gamma', [4 * num_units],
        initializer=tf.constant_initializer(gamma_start))
    if use_bias:
      beta = tf.get_variable(
          'ln_beta', [4 * num_units], initializer=tf.constant_initializer(0.0))
  if use_bias:
    return gamma * h + beta
  return gamma * h


def layer_norm(x,
               num_units,
               scope='layer_norm',
               reuse=False,
               gamma_start=1.0,
               epsilon=1e-3,
               use_bias=True):
  """Calculate layer norm."""
  axes = [1]
  mean = tf.reduce_mean(x, axes, keep_dims=True)
  x_shifted = x - mean
  var = tf.reduce_mean(tf.square(x_shifted), axes, keep_dims=True)
  inv_std = tf.rsqrt(var + epsilon)
  with tf.variable_scope(scope):
    if reuse is True:
      tf.get_variable_scope().reuse_variables()
    gamma = tf.get_variable(
        'ln_gamma', [num_units],
        initializer=tf.constant_initializer(gamma_start))
    if use_bias:
      beta = tf.get_variable(
          'ln_beta', [num_units], initializer=tf.constant_initializer(0.0))
  output = gamma * (x_shifted) * inv_std
  if use_bias:
    output += beta
  return output


def raw_layer_norm(x, epsilon=1e-3):
  axes = [1]
  mean = tf.reduce_mean(x, axes, keep_dims=True)
  std = tf.sqrt(
      tf.reduce_mean(tf.square(x - mean), axes, keep_dims=True) + epsilon)
  output = (x - mean) / (std)
  return output


def xavier_init(size):
    in_dim = size[0]
    xavier_stddev = 1. / tf.sqrt(in_dim / 2.)
    return tf.random_normal(shape=size, stddev=xavier_stddev)


def super_linear(x,
                 output_size,
                 scope=None,
                 reuse=False,
                 init_w='ortho',
                 weight_start=0.0,
                 use_bias=True,
                 bias_start=0.0,
                 input_size=None):
  """Performs linear operation. Uses ortho init defined earlier."""
  shape = x.get_shape().as_list()
  with tf.variable_scope(scope or 'linear'):
    if reuse is True:
      tf.get_variable_scope().reuse_variables()

    w_init = None  # Uniform
    if input_size is None:
      x_size = shape[1]
    else:
      x_size = input_size

    if init_w == 'zeros':
      w_init = tf.constant_initializer(0.0)
    elif init_w == 'constant':
      w_init = tf.constant_initializer(weight_start)
    elif init_w == 'gaussian':
      w_init = tf.random_normal_initializer(stddev=weight_start)
    elif init_w == 'ortho':
      w_init = lstm_ortho_initializer(1.0)

    # w = tf.get_variable('super_linear_w', dtype=tf.float32, initializer=xavier_init([x_size, output_size]))
    w = tf.get_variable('super_linear_w', [x_size, output_size], tf.float32, initializer=w_init)
    if use_bias:
      b = tf.get_variable(
          'super_linear_b', [output_size],
          tf.float32,
          initializer=tf.constant_initializer(bias_start))
      return tf.matmul(x, w) + b
    return tf.matmul(x, w)


class LayerNormLSTMCell(tf.contrib.rnn.RNNCell):
  """Layer-Norm, with Ortho Init. and Recurrent Dropout without Memory Loss.

  https://arxiv.org/abs/1607.06450 - Layer Norm
  https://arxiv.org/abs/1603.05118 - Recurrent Dropout without Memory Loss
  """

  def __init__(self,
               num_units,
               forget_bias=1.0,
               use_recurrent_dropout=False,
               dropout_keep_prob=0.90):
    """Initialize the Layer Norm LSTM cell.

    Args:
      num_units: int, The number of units in the LSTM cell.
      forget_bias: float, The bias added to forget gates (default 1.0).
      use_recurrent_dropout: Whether to use Recurrent Dropout (default False)
      dropout_keep_prob: float, dropout keep probability (default 0.90)
    """
    self.num_units = num_units
    self.forget_bias = forget_bias
    self.use_recurrent_dropout = use_recurrent_dropout
    self.dropout_keep_prob = dropout_keep_prob

  @property
  def input_size(self):
    return self.num_units

  @property
  def output_size(self):
    return self.num_units

  @property
  def state_size(self):
    return 2 * self.num_units

  def get_output(self, state):
    h, unused_c = tf.split(state, 2, 1)
    return h

  def __call__(self, x, state, timestep=0, scope=None):
    with tf.variable_scope(scope or type(self).__name__):
      h, c = tf.split(state, 2, 1)

      h_size = self.num_units
      x_size = x.get_shape().as_list()[1]
      batch_size = x.get_shape().as_list()[0]

      w_init = None  # uniform

      h_init = lstm_ortho_initializer(1.0)

      w_xh = tf.get_variable(
          'W_xh', [x_size, 4 * self.num_units], initializer=w_init)
      w_hh = tf.get_variable(
          'W_hh', [self.num_units, 4 * self.num_units], initializer=h_init)

      concat = tf.concat([x, h], 1)  # Concat for speed
      w_full = tf.concat([w_xh, w_hh], 0)
      concat = tf.matmul(concat, w_full)  #+ bias # live life without garbage

      # i = input_gate, j = new_input, f = forget_gate, o = output_gate
      concat = layer_norm_all(concat, batch_size, 4, h_size, 'ln_all')
      i, j, f, o = tf.split(concat, 4, 1)

      if self.use_recurrent_dropout:
        g = tf.nn.dropout(tf.tanh(j), self.dropout_keep_prob)
      else:
        g = tf.tanh(j)

      new_c = c * tf.sigmoid(f + self.forget_bias) + tf.sigmoid(i) * g
      new_h = tf.tanh(layer_norm(new_c, h_size, 'ln_c')) * tf.sigmoid(o)

    return new_h, tf.concat([new_h, new_c], 1)


class HyperLSTMCell(tf.contrib.rnn.RNNCell):
  """HyperLSTM with Ortho Init, Layer Norm, Recurrent Dropout, no Memory Loss.

  https://arxiv.org/abs/1609.09106
  http://blog.otoro.net/2016/09/28/hyper-networks/
  """

  def __init__(self,
               num_units,
               forget_bias=1.0,
               use_recurrent_dropout=False,
               dropout_keep_prob=0.90,
               use_layer_norm=True,
               hyper_num_units=256,
               hyper_embedding_size=32,
               hyper_use_recurrent_dropout=False):
    """Initialize the Layer Norm HyperLSTM cell.

    Args:
      num_units: int, The number of units in the LSTM cell.
      forget_bias: float, The bias added to forget gates (default 1.0).
      use_recurrent_dropout: Whether to use Recurrent Dropout (default False)
      dropout_keep_prob: float, dropout keep probability (default 0.90)
      use_layer_norm: boolean. (default True)
        Controls whether we use LayerNorm layers in main LSTM & HyperLSTM cell.
      hyper_num_units: int, number of units in HyperLSTM cell.
        (default is 128, recommend experimenting with 256 for larger tasks)
      hyper_embedding_size: int, size of signals emitted from HyperLSTM cell.
        (default is 16, recommend trying larger values for large datasets)
      hyper_use_recurrent_dropout: boolean. (default False)
        Controls whether HyperLSTM cell also uses recurrent dropout.
        Recommend turning this on only if hyper_num_units becomes large (>= 512)
    """
    self.num_units = num_units
    self.forget_bias = forget_bias
    self.use_recurrent_dropout = use_recurrent_dropout
    self.dropout_keep_prob = dropout_keep_prob
    self.use_layer_norm = use_layer_norm
    self.hyper_num_units = hyper_num_units
    self.hyper_embedding_size = hyper_embedding_size
    self.hyper_use_recurrent_dropout = hyper_use_recurrent_dropout

    self.total_num_units = self.num_units + self.hyper_num_units

    if self.use_layer_norm:
      cell_fn = LayerNormLSTMCell
    else:
      cell_fn = LSTMCell
    self.hyper_cell = cell_fn(
        hyper_num_units,
        use_recurrent_dropout=hyper_use_recurrent_dropout,
        dropout_keep_prob=dropout_keep_prob)

  @property
  def input_size(self):
    return self._input_size

  @property
  def output_size(self):
    return self.num_units

  @property
  def state_size(self):
    return 2 * self.total_num_units

  def get_output(self, state):
    total_h, unused_total_c = tf.split(state, 2, 1)
    h = total_h[:, 0:self.num_units]
    return h

  def hyper_norm(self, layer, scope='hyper', use_bias=True):
    num_units = self.num_units
    embedding_size = self.hyper_embedding_size
    # Recurrent batch norm init trick (https://arxiv.org/abs/1603.09025).
    init_gamma = 0.10
    with tf.variable_scope(scope):
      zw = super_linear(
          self.hyper_output,
          embedding_size,
          init_w='constant',
          weight_start=0.00,
          use_bias=True,
          bias_start=1.0,
          scope='zw')
      alpha = super_linear(
          zw,
          num_units,
          init_w='constant',
          weight_start=init_gamma / embedding_size,
          use_bias=False,
          scope='alpha')
      result = tf.multiply(alpha, layer)
      if use_bias:
        zb = super_linear(
            self.hyper_output,
            embedding_size,
            init_w='gaussian',
            weight_start=0.01,
            use_bias=False,
            bias_start=0.0,
            scope='zb')
        beta = super_linear(
            zb,
            num_units,
            init_w='constant',
            weight_start=0.00,
            use_bias=False,
            scope='beta')
        result += beta
    return result

  def __call__(self, x, state, timestep=0, scope=None):
    with tf.variable_scope(scope or type(self).__name__):
      total_h, total_c = tf.split(state, 2, 1)
      h = total_h[:, 0:self.num_units]
      c = total_c[:, 0:self.num_units]
      self.hyper_state = tf.concat(
          [total_h[:, self.num_units:], total_c[:, self.num_units:]], 1)

      batch_size = x.get_shape().as_list()[0]
      x_size = x.get_shape().as_list()[1]
      self._input_size = x_size

      w_init = None  # Uniform

      h_init = lstm_ortho_initializer(1.0)

      w_xh = tf.get_variable(
          'W_xh', [x_size, 4 * self.num_units], initializer=w_init)
      w_hh = tf.get_variable(
          'W_hh', [self.num_units, 4 * self.num_units], initializer=h_init)
      bias = tf.get_variable(
          'bias', [4 * self.num_units],
          initializer=tf.constant_initializer(0.0))

      # Concatenate the input and hidden states for hyperlstm input
      hyper_input = tf.concat([x, h], 1)
      hyper_output, hyper_new_state = self.hyper_cell(hyper_input, self.hyper_state)
      self.hyper_output = hyper_output
      self.hyper_state = hyper_new_state

      xh = tf.matmul(x, w_xh)
      hh = tf.matmul(h, w_hh)

      # Split Wxh contributions
      ix, jx, fx, ox = tf.split(xh, 4, 1)
      ix = self.hyper_norm(ix, 'hyper_ix', use_bias=False)
      jx = self.hyper_norm(jx, 'hyper_jx', use_bias=False)
      fx = self.hyper_norm(fx, 'hyper_fx', use_bias=False)
      ox = self.hyper_norm(ox, 'hyper_ox', use_bias=False)

      # Split Whh contributions
      ih, jh, fh, oh = tf.split(hh, 4, 1)
      ih = self.hyper_norm(ih, 'hyper_ih', use_bias=True)
      jh = self.hyper_norm(jh, 'hyper_jh', use_bias=True)
      fh = self.hyper_norm(fh, 'hyper_fh', use_bias=True)
      oh = self.hyper_norm(oh, 'hyper_oh', use_bias=True)

      # Split bias
      ib, jb, fb, ob = tf.split(bias, 4, 0)  # Bias is to be broadcasted.

      # i = input_gate, j = new_input, f = forget_gate, o = output_gate
      i = ix + ih + ib
      j = jx + jh + jb
      f = fx + fh + fb
      o = ox + oh + ob

      if self.use_layer_norm:
        concat = tf.concat([i, j, f, o], 1)
        concat = layer_norm_all(concat, batch_size, 4, self.num_units, 'ln_all')
        i, j, f, o = tf.split(concat, 4, 1)

      if self.use_recurrent_dropout:
        g = tf.nn.dropout(tf.tanh(j), self.dropout_keep_prob)
      else:
        g = tf.tanh(j)

      new_c = c * tf.sigmoid(f + self.forget_bias) + tf.sigmoid(i) * g
      new_h = tf.tanh(layer_norm(new_c, self.num_units, 'ln_c')) * tf.sigmoid(o)

      hyper_h, hyper_c = tf.split(hyper_new_state, 2, 1)
      new_total_h = tf.concat([new_h, hyper_h], 1)
      new_total_c = tf.concat([new_c, hyper_c], 1)
      new_total_state = tf.concat([new_total_h, new_total_c], 1)
    return new_h, new_total_state