-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathsgnn.py
228 lines (186 loc) · 8.13 KB
/
sgnn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# Copyright 2020 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Builds SGNN model.
[1] Sujith Ravi and Zornitsa Kozareva. 2018. "Self-governing neural networks for
on-device short text
classification." In Proceedings of the 2018 Conference on Empirical Methods in
Natural Language
Processing, pages 887-893. Association for Computational Linguistics
The model is constructed as follows:
* Text is projected to float features whose size is defined by projection_size.
* A fully connected layer predicts the class from the projected features.
"""
import collections
import tensorflow.compat.v2 as tf
import tensorflow_text as tf_text
from tensorflow_lite_support.custom_ops.python import tflite_text_api
# Named tuple bundling the tunable hyperparameters of the model.
# Fields:
#   learning_rate: float learning rate passed to the optimizer.
Hparams = collections.namedtuple('Hparams', ['learning_rate'])
def preprocess(text):
  """Case-folds the input text and splits it into whitespace tokens.

  Args:
    text: a string tensor, expected shape [batch_size, 1].

  Returns:
    A string ragged tensor of tokens, shape [batch_size, num_token].
  """
  shape = text.get_shape().as_list()
  assert len(shape) == 2
  assert shape[-1] == 1
  # Drop the trailing singleton dimension, then normalize case (UTF-8 aware).
  flattened = tf.reshape(text, [-1])
  folded = tf_text.case_fold_utf8(flattened)
  return tflite_text_api.WhitespaceTokenizer().tokenize(folded)
def get_ngrams(tokens, n):
  """Generates character ngrams from tokens.

  Each token is wrapped in '^'/'$' boundary markers before splitting into
  characters, so ngrams can capture word starts and ends.

  Args:
    tokens: A string ragged tensor for tokens, in shape of [batch_size,
      num_token].
    n: ngram size for char ngrams.

  Returns:
    A string ragged tensor for ngrams, in shape of [batch_size, num_token,
    ngrams].
  """
  marked = '^' + tokens + '$'
  split_chars = tf.strings.unicode_split(marked, 'UTF-8')
  joined = tflite_text_api.ngrams(
      split_chars,
      width=n,
      axis=-1,
      reduction_type=tf_text.Reduction.STRING_JOIN,
      string_separator='')
  # Collapse the per-token nesting level: translate the outer row_splits
  # through the inner row_splits so the result is [batch, token, ngram].
  outer_splits = tf.nn.embedding_lookup(joined.values.row_splits,
                                        joined.row_splits)
  return tf.RaggedTensor.from_row_splits(joined.values.values, outer_splits)
def project(ngrams, hash_seed, buckets):
  """Projects a ngram RaggedTensor to float tensor.

  Args:
    ngrams: A string ragged tensor, in shape of [batch_size, num_token, ngrams].
    hash_seed: A python int list, in shape of [num_hash].
    buckets: An int for the max value of projected integers.

  Returns:
    A float tensor that projects ngrams to the space represented by hash_seed,
    in shape of [batch_size, num_hash].
  """
  num_hash = len(hash_seed)
  half_bucket = buckets >> 1
  # Hash each ngram string to an int64 signature.
  signatures = tf.ragged.map_flat_values(tf.strings.to_hash_bucket_fast,
                                         ngrams, buckets)
  # Each signature is scaled by every hash seed, reduced mod buckets, then
  # re-centered around zero:
  #   v = |signature * seed| mod buckets
  #   if v > buckets / 2: v -= buckets
  seeds = tf.constant(hash_seed, dtype=tf.int64)
  scaled = tf.abs(signatures.values * tf.reshape(seeds, [-1, 1]))
  centered = tf.math.floormod(scaled, buckets)
  centered = (centered -
              tf.cast(tf.greater(centered, half_bucket), tf.int64) * buckets)
  # Re-wrap the flat values into a [num_hash, batch, ngram] ragged layout so
  # output_i,j = mean(value_i,j,k) for k-th ngram in i-th text, computed with
  # j-th hash seed, can be taken with a single reduce_mean.
  lengths = tf.repeat(
      tf.reshape(signatures.row_lengths(), [1, -1]), num_hash, axis=0)
  lengths = tf.cast(tf.reshape(lengths, [-1]), tf.int32)
  nested = tf.RaggedTensor.from_row_lengths(
      tf.RaggedTensor.from_row_lengths(tf.reshape(centered, [-1]), lengths),
      tf.repeat(tf.shape(signatures.row_lengths()), num_hash))
  # Normalize means into [-1, 1] and transpose to [batch_size, num_hash].
  means = tf.reduce_mean(nested, 2) / half_bucket
  return tf.transpose(tf.reshape(means.values, [num_hash, -1]))
def fused_project(ngrams, hash_seed, buckets):
  """A wrapper to fuse project method when converting to TFLite model.

  The wrapper annotates the inner tf.function with experimental_implements so
  the TFLite converter can recognize and fuse it as the custom
  SgnnProjection op.

  Args:
    ngrams: A string ragged tensor, in shape of [batch_size, num_token, ngrams].
    hash_seed: A python int list, in shape of [num_hash].
    buckets: An int for the max value of projected integers.

  Returns:
    A float tensor that projects ngrams to the space represented by hash_seed,
    in shape of [batch_size, num_hash].
  """
  seed_attr = ' '.join(['i: %d' % seed for seed in hash_seed])
  implements_spec = ' '.join([
      'name: "tftext:custom:SgnnProjection"',
      'attr { key: "hash_seed" value { list {%s} } }' % seed_attr,
      'attr { key: "buckets" value { i: %d } }' % buckets,
  ])

  @tf.function(experimental_implements=implements_spec)
  def _fused(flat_values, *nested_row_splits):
    # Rebuild the ragged tensor from its flat components; ragged tensors
    # cannot cross the tf.function boundary directly for fusing.
    rebuilt = tf.RaggedTensor.from_nested_row_splits(
        flat_values=flat_values, nested_row_splits=nested_row_splits)
    return project(rebuilt, hash_seed, buckets)

  return _fused(ngrams.flat_values, *ngrams.nested_row_splits)
def sgnn(texts, hash_seed, ngram_size):
  """Projects the string text to float features.

  It first generates ngrams (sizes 1..ngram_size) of the tokens from the given
  text, then projects each ngram tensor with a partition of the seeds. Larger
  ngram sizes get proportionally larger partitions: ngram size k receives
  k * partition_size seeds.

  Args:
    texts: a string tensor, in shape of [batch_size].
    hash_seed: a list of integers, in shape of [projection_size].
    ngram_size: max size of ngram to generate features.

  Returns:
    A float tensor that projects ngrams to the space represented by hash_seed,
    in shape of [batch_size, projection_size].

  Raises:
    ValueError: if projection_size is too small to give every ngram size a
      non-empty seed partition.
  """
  projection_size = len(hash_seed)
  # Seeds are split into 1 + 2 + ... + ngram_size shares; exact integer
  # arithmetic (ngram_size * (ngram_size + 1) is always even).
  partition_size = projection_size // (ngram_size * (ngram_size + 1) // 2)
  if partition_size == 0:
    raise ValueError(
        'projection size %d is not enough for %d ngram partitions' %
        (projection_size, ngram_size))
  # indices[i] is the start offset of the seed partition for (i+1)-grams:
  # the k-th partition holds k * partition_size seeds, so offsets follow
  # triangular numbers.
  indices = [(i * (i + 1) // 2) * partition_size for i in range(ngram_size)]
  indices.append(projection_size)
  # Max int32 value used as the hash bucket count for the projection.
  hash_buckets = 0x7FFFFFFF
  projection_layer = []
  tokens = preprocess(texts)
  for i in range(ngram_size):
    ngram = get_ngrams(tokens, i + 1)
    projection = fused_project(ngram, hash_seed[indices[i]:indices[i + 1]],
                               hash_buckets)
    projection_layer.append(projection)
  return tf.cast(tf.concat(projection_layer, -1), tf.float32)
class ProjectLayer(tf.keras.layers.Layer):
  """Keras layer that projects input texts to fixed-size float features.

  Wraps sgnn() so the projection participates in a keras model.

  Attributes:
    seed: list of ints used as hash seeds for the projection.
    ngram_size: maximum char-ngram size used to generate features.
  """

  def __init__(self, seed, ngram_size, **kwargs):
    self.seed = seed
    self.ngram_size = ngram_size
    super(ProjectLayer, self).__init__(**kwargs)

  def get_config(self):
    # Include the base Layer config (name, dtype, ...) so the layer can be
    # round-tripped through keras serialization; the original implementation
    # dropped it, breaking model reconstruction from config.
    config = super(ProjectLayer, self).get_config()
    config.update({
        'seed': self.seed,
        'ngram_size': self.ngram_size,
    })
    return config

  def call(self, x):
    return sgnn(x, self.seed, self.ngram_size)

  def compute_output_shape(self, input_shape):
    # One output feature per hash seed.
    return (input_shape[0], len(self.seed))
def keras_model(hash_seed, ngram_size, fc_size_list, hparams):
  """Compiles a keras model from projected features to labels.

  The model is a ProjectLayer followed by one Dense layer per entry of
  fc_size_list; the last Dense layer uses a softmax activation and its size
  is the number of output classes.

  Args:
    hash_seed: a list of int used to project the feature.
    ngram_size: maximum size of ngram to generate features from texts.
    fc_size_list: a list of int, sizes of each fully connected layer.
    hparams: hyper parameters for the model.

  Returns:
    A compiled keras model that predicts the language id.

  Raises:
    ValueError: if fc_size_list is empty.
  """
  if not fc_size_list:
    raise ValueError(
        'Must specify one or more fully connected layers via fc_size_list')
  model = tf.keras.Sequential()
  model.add(ProjectLayer(hash_seed, ngram_size))
  for size in fc_size_list[:-1]:
    model.add(tf.keras.layers.Dense(size))
  model.add(tf.keras.layers.Dense(fc_size_list[-1], activation='softmax'))
  model.compile(
      # 'learning_rate' is the supported argument name; the 'lr' alias is
      # deprecated and removed in newer keras releases.
      optimizer=tf.keras.optimizers.Adam(learning_rate=hparams.learning_rate),
      loss=tf.keras.losses.SparseCategoricalCrossentropy(),
      metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
  return model