From 06af6b7b34db37e603d4199eba25ad65dfbe5941 Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Wed, 10 May 2023 09:33:17 -0400 Subject: [PATCH 01/26] automodel first pass --- .../huggingface_language_modeling.py | 194 ++++++++++++++++++ .../ml/inference/huggingface_inference.py | 104 ++++++++++ .../huggingface_inference_it_test.py | 73 +++++++ .../huggingface_tests_requirements.txt | 25 +++ 4 files changed, 396 insertions(+) create mode 100644 sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py create mode 100644 sdks/python/apache_beam/ml/inference/huggingface_inference.py create mode 100644 sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py create mode 100644 sdks/python/apache_beam/ml/inference/huggingface_tests_requirements.txt diff --git a/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py b/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py new file mode 100644 index 0000000000000..075dc3fdfa9bd --- /dev/null +++ b/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py @@ -0,0 +1,194 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""""A pipeline that uses RunInference to perform Language Modeling with Bert. + +This pipeline takes sentences from a custom text file, converts the last word +of the sentence into a [MASK] token, and then uses the BertForMaskedLM from +Hugging Face to predict the best word for the masked token given all the words +already in the sentence. The pipeline then writes the prediction to an output +file in which users can then compare against the original sentence. 
+""" + +import argparse +import logging +from typing import Dict +from typing import Iterable +from typing import Iterator +from typing import Tuple + +import torch + +import apache_beam as beam +from apache_beam.ml.inference.huggingface_inference import HuggingFaceModelHandler +from apache_beam.ml.inference.base import KeyedModelHandler +from apache_beam.ml.inference.base import PredictionResult +from apache_beam.ml.inference.base import RunInference +from apache_beam.options.pipeline_options import PipelineOptions +from apache_beam.options.pipeline_options import SetupOptions +from apache_beam.runners.runner import PipelineResult + +from transformers import BertForMaskedLM +from transformers import BertTokenizer + + +def add_mask_to_last_word(text: str) -> Tuple[str, str]: + text_list = text.split() + return text, ' '.join(text_list[:-2] + ['[MASK]', text_list[-1]]) + + +def tokenize_sentence( + text_and_mask: Tuple[str, str], + bert_tokenizer: BertTokenizer) -> Tuple[str, Dict[str, torch.Tensor]]: + text, masked_text = text_and_mask + tokenized_sentence = bert_tokenizer.encode_plus( + masked_text, return_tensors="pt") + + # Workaround to manually remove batch dim until we have the feature to + # add optional batching flag. + # TODO(https://github.com/apache/beam/issues/21863): Remove once optional + # batching flag added + return text, { + k: torch.squeeze(v) + for k, v in dict(tokenized_sentence).items() + } + + +def filter_empty_lines(text: str) -> Iterator[str]: + if len(text.strip()) > 0: + yield text + + +class PostProcessor(beam.DoFn): + """Processes the PredictionResult to get the predicted word. + + The logits are the output of the BERT Model. After applying a softmax + activation function to the logits, we get probabilistic distributions for each + of the words in BERT’s vocabulary. We can get the word with the highest + probability of being a candidate replacement word by taking the argmax. + """ + def __init__(self, bert_tokenizer: BertTokenizer): + super().__init__() + self.bert_tokenizer = bert_tokenizer + + def process(self, element: Tuple[str, PredictionResult]) -> Iterable[str]: + text, prediction_result = element + inputs = prediction_result.example + logits = prediction_result.inference['logits'] + mask_token_index = ( + inputs['input_ids'] == self.bert_tokenizer.mask_token_id).nonzero( + as_tuple=True)[0] + predicted_token_id = logits[mask_token_index].argmax(axis=-1) + decoded_word = self.bert_tokenizer.decode(predicted_token_id) + yield text + ';' + decoded_word + + +def parse_known_args(argv): + """Parses args for the workflow.""" + parser = argparse.ArgumentParser() + parser.add_argument( + '--input', + dest='input', + help='Path to the text file containing sentences.') + parser.add_argument( + '--output', + dest='output', + required=True, + help='Path of file in which to save the output predictions.') + parser.add_argument( + '--bert_tokenizer', + dest='bert_tokenizer', + default='bert-base-uncased', + help='bert uncased model. This can be base model or large model') + parser.add_argument( + '--model_name', + dest='model_name', + required=True, + help="Name of the model from Hugging Face") + return parser.parse_known_args(argv) + + +def run( + argv=None, + model_class=None, + save_main_session=True, + test_pipeline=None) -> PipelineResult: + """ + Args: + argv: Command line arguments defined for this example. + model_class: Reference to the class definition of the model. + If None, BertForMaskedLM will be used as default . 
+ model_params: Parameters passed to the constructor of the model_class. + These will be used to instantiate the model object in the + RunInference API. + save_main_session: Used for internal testing. + test_pipeline: Used for internal testing. + """ + known_args, pipeline_args = parse_known_args(argv) + pipeline_options = PipelineOptions(pipeline_args) + pipeline_options.view_as(SetupOptions).save_main_session = save_main_session + + if not model_class: + model_class = BertForMaskedLM + + pipeline = test_pipeline + if not test_pipeline: + pipeline = beam.Pipeline(options=pipeline_options) + + bert_tokenizer = BertTokenizer.from_pretrained(known_args.bert_tokenizer) + + model_handler = HuggingFaceModelHandler(known_args.model_name) + if not known_args.input: + text = (pipeline | 'CreateSentences' >> beam.Create([ + 'The capital of France is Paris .', + 'It is raining cats and dogs .', + 'He looked up and saw the sun and stars .', + 'Today is Monday and tomorrow is Tuesday .', + 'There are 5 coconuts on this palm tree .', + 'The richest person in the world is not here .', + 'Malls are amazing places to shop because you can find everything you need under one roof .', # pylint: disable=line-too-long + 'This audiobook is sure to liquefy your brain .', + 'The secret ingredient to his wonderful life was gratitude .', + 'The biggest animal in the world is the whale .', + ])) + else: + text = ( + pipeline | 'ReadSentences' >> beam.io.ReadFromText(known_args.input)) + text_and_tokenized_text_tuple = ( + text + | 'FilterEmptyLines' >> beam.ParDo(filter_empty_lines) + | 'AddMask' >> beam.Map(add_mask_to_last_word) + | 'TokenizeSentence' >> + beam.Map(lambda x: tokenize_sentence(x, bert_tokenizer))) + output = ( + text_and_tokenized_text_tuple + | 'PyTorchRunInference' >> RunInference(KeyedModelHandler(model_handler)) + | 'ProcessOutput' >> beam.ParDo( + PostProcessor(bert_tokenizer=bert_tokenizer))) + output | "WriteOutput" >> beam.io.WriteToText( # pylint: disable=expression-not-assigned + known_args.output, + shard_name_template='', + append_trailing_newlines=True) + + result = pipeline.run() + result.wait_until_finish() + return result + + +if __name__ == '__main__': + logging.getLogger().setLevel(logging.INFO) + run() diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py new file mode 100644 index 0000000000000..7e54cbec7e331 --- /dev/null +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -0,0 +1,104 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from collections import defaultdict +from typing import Any +from typing import Dict +from typing import Iterable +from typing import Optional +from typing import Sequence + +import torch +from transformers import AutoModel + +from apache_beam.ml.inference.base import ModelHandler +from apache_beam.ml.inference.base import PredictionResult +from apache_beam.ml.inference import utils + + +class HuggingFaceModelHandler(ModelHandler[Dict[str, torch.Tensor], + PredictionResult, + Any]): + def __init__( + self, + model_name: str, + model_download_args: Dict[str, Any] = None, + min_batch_size: Optional[int] = None, + max_batch_size: Optional[int] = None, + **kwargs): + self._model_handler = None + self._model_name = model_name + self._model_path = None + self._model = None + self._model_download_args = model_download_args if model_download_args else {} # pylint: disable=line-too-long + self._batching_kwargs = {} + self._env_vars = kwargs.get('env_vars', {}) + if min_batch_size is not None: + self._batching_kwargs['min_batch_size'] = min_batch_size + if max_batch_size is not None: + self._batching_kwargs['max_batch_size'] = max_batch_size + + def load_model(self): + """Loads and initializes a model for processing.""" + self._model = AutoModel.from_pretrained(self._model_name) + return self._model + + def update_model_path(self, model_path: Optional[str] = None): + self._model_path = model_path if model_path else self._model_path + + def run_inference( + self, + batch: Sequence[Dict[str, torch.Tensor]], + model: Any, + inference_args: Optional[Dict[str, Any]] = None + ) -> Iterable[PredictionResult]: + key_to_tensor_list = defaultdict(list) + + inference_args = {} if not inference_args else inference_args + # torch.no_grad() mitigates GPU memory issues + # https://github.com/apache/beam/issues/22811 + + with torch.no_grad(): + for example in batch: + for key, tensor in example.items(): + key_to_tensor_list[key].append(tensor) + key_to_batched_tensors = {} + for key in key_to_tensor_list: + batched_tensors = torch.stack(key_to_tensor_list[key]) + # batched_tensors = key_to_tensor_list[key] + key_to_batched_tensors[key] = batched_tensors + predictions = model(**key_to_batched_tensors, **inference_args) + + return utils._convert_to_result(batch, predictions) + + def get_num_bytes(self, batch: Sequence[torch.Tensor]) -> int: + """ + Returns: + The number of bytes of data for a batch. + """ + return sum( + (el.element_size() for tensor in batch for el in tensor.values())) + + def get_metrics_namespace(self) -> str: + """ + Returns: + A namespace for metrics collected by the RunInference transform. + """ + return 'BeamML_HuggingFace_Tensor' + + def batch_elements_kwargs(self): + return self._batching_kwargs diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py b/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py new file mode 100644 index 0000000000000..47664aaee063c --- /dev/null +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py @@ -0,0 +1,73 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""End-to-End test for Pytorch Inference""" + +import logging +import unittest +import uuid +import pytest +from apache_beam.io.filesystems import FileSystems +from apache_beam.testing.test_pipeline import TestPipeline +from apache_beam.examples.inference import huggingface_language_modeling + + +def process_outputs(filepath): + with FileSystems().open(filepath) as f: + lines = f.readlines() + lines = [l.decode('utf-8').strip('\n') for l in lines] + return lines + + +class HuggingFaceInference(unittest.TestCase): + @pytest.mark.timeout(4200) + def test_hf_imagenet_image_segmentation(self): + test_pipeline = TestPipeline(is_integration_test=True) + # Path to text file containing some sentences + file_of_sentences = 'gs://apache-beam-ml/datasets/custom/sentences.txt' # pylint: disable=line-too-long + output_file_dir = 'gs://apache-beam-ml/testing/predictions' + output_file = '/'.join([output_file_dir, str(uuid.uuid4()), 'result.txt']) + + model_name = 'bert-base-uncased' + extra_opts = { + 'input': file_of_sentences, + 'output': output_file, + 'model_name': model_name, + } + huggingface_language_modeling.run( + test_pipeline.get_full_options_as_args(**extra_opts), + save_main_session=False) + + self.assertEqual(FileSystems().exists(output_file), True) + predictions = process_outputs(filepath=output_file) + actuals_file = 'gs://apache-beam-ml/testing/expected_outputs/test_torch_run_inference_bert_for_masked_lm_actuals.txt' # pylint: disable=line-too-long + actuals = process_outputs(filepath=actuals_file) + + predictions_dict = {} + for prediction in predictions: + text, predicted_text = prediction.split(';') + predictions_dict[text] = predicted_text + + for actual in actuals: + text, actual_predicted_text = actual.split(';') + predicted_predicted_text = predictions_dict[text] + self.assertEqual(actual_predicted_text, predicted_predicted_text) + + +if __name__ == '__main__': + logging.getLogger().setLevel(logging.DEBUG) + unittest.main() diff --git a/sdks/python/apache_beam/ml/inference/huggingface_tests_requirements.txt b/sdks/python/apache_beam/ml/inference/huggingface_tests_requirements.txt new file mode 100644 index 0000000000000..aa8f4ea953e40 --- /dev/null +++ b/sdks/python/apache_beam/ml/inference/huggingface_tests_requirements.txt @@ -0,0 +1,25 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +torch>=1.7.1 +torchvision>=0.8.2 +pillow>=8.0.0 +transformers>=4.18.0 +huggingface-hub==0.14.0 +tensorflow>=2.12.0 +tensorflow_hub>=0.10.0 +Pillow>=9.0.0 \ No newline at end of file From 416166dcc028f30aa11c55d6aba90522d7da89d4 Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Tue, 16 May 2023 11:03:31 -0400 Subject: [PATCH 02/26] new model --- .../inference/huggingface_language_modeling.py | 11 ++--------- .../ml/inference/huggingface_inference_it_test.py | 5 +++-- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py b/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py index 075dc3fdfa9bd..e2736a0842c6e 100644 --- a/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py +++ b/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py @@ -42,7 +42,6 @@ from apache_beam.options.pipeline_options import SetupOptions from apache_beam.runners.runner import PipelineResult -from transformers import BertForMaskedLM from transformers import BertTokenizer @@ -88,7 +87,7 @@ def __init__(self, bert_tokenizer: BertTokenizer): def process(self, element: Tuple[str, PredictionResult]) -> Iterable[str]: text, prediction_result = element inputs = prediction_result.example - logits = prediction_result.inference['logits'] + logits = prediction_result.inference.logits mask_token_index = ( inputs['input_ids'] == self.bert_tokenizer.mask_token_id).nonzero( as_tuple=True)[0] @@ -123,10 +122,7 @@ def parse_known_args(argv): def run( - argv=None, - model_class=None, - save_main_session=True, - test_pipeline=None) -> PipelineResult: + argv=None, save_main_session=True, test_pipeline=None) -> PipelineResult: """ Args: argv: Command line arguments defined for this example. 
@@ -142,9 +138,6 @@ def run( pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = save_main_session - if not model_class: - model_class = BertForMaskedLM - pipeline = test_pipeline if not test_pipeline: pipeline = beam.Pipeline(options=pipeline_options) diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py b/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py index 47664aaee063c..6d2dc9ce68f84 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py @@ -42,15 +42,16 @@ def test_hf_imagenet_image_segmentation(self): output_file_dir = 'gs://apache-beam-ml/testing/predictions' output_file = '/'.join([output_file_dir, str(uuid.uuid4()), 'result.txt']) - model_name = 'bert-base-uncased' + model_name = 'againeureka/my_awesome_eli5_mlm_model' extra_opts = { 'input': file_of_sentences, 'output': output_file, 'model_name': model_name, + '--bert_tokenizer': model_name } huggingface_language_modeling.run( test_pipeline.get_full_options_as_args(**extra_opts), - save_main_session=False) + save_main_session=True) self.assertEqual(FileSystems().exists(output_file), True) predictions = process_outputs(filepath=output_file) From 6f063e51c026f2779f0465b4073dda762abeadbe Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Wed, 21 Jun 2023 14:55:29 -0400 Subject: [PATCH 03/26] updated model handler api --- .../ml/inference/huggingface_inference.py | 338 ++++++++++++++++-- 1 file changed, 308 insertions(+), 30 deletions(-) diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index 7e54cbec7e331..3501ed971c760 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -16,12 +16,16 @@ # from collections import defaultdict +import sys from typing import Any from typing import Dict from typing import Iterable from typing import Optional from typing import Sequence +from typing import Union +import numpy +import tensorflow as tf import torch from transformers import AutoModel @@ -30,21 +34,77 @@ from apache_beam.ml.inference import utils -class HuggingFaceModelHandler(ModelHandler[Dict[str, torch.Tensor], - PredictionResult, - Any]): +def _run_inference_torch_keyed_tensor( + batch: Sequence[torch.Tensor], model: Any, + inference_args: Dict[str, Any]) -> Iterable[PredictionResult]: + key_to_tensor_list = defaultdict(list) + with torch.no_grad(): + for example in batch: + for key, tensor in example.items(): + key_to_tensor_list[key].append(tensor) + key_to_batched_tensors = {} + for key in key_to_tensor_list: + batched_tensors = torch.stack(key_to_tensor_list[key]) + batched_tensors = key_to_tensor_list[key] + key_to_batched_tensors[key] = batched_tensors + return utils._convert_to_result( + batch, model(**key_to_batched_tensors, **inference_args)) + + +def _run_inference_tensorflow_keyed_tensor( + batch: Sequence[tf.Tensor], model: Any, + inference_args: Dict[str, Any]) -> Iterable[PredictionResult]: + key_to_tensor_list = defaultdict(list) + for example in batch: + for key, tensor in example.items(): + key_to_tensor_list[key].append(tensor) + key_to_batched_tensors = {} + for key in key_to_tensor_list: + batched_tensors = torch.stack(key_to_tensor_list[key]) + batched_tensors = key_to_tensor_list[key] + key_to_batched_tensors[key] = 
batched_tensors + return utils._convert_to_result( + batch, model(**key_to_batched_tensors, **inference_args)) + + +class HuggingFaceModelHandlerKeyedTensor(ModelHandler[Dict[str, + Union[tf.Tensor, + torch.Tensor]], + PredictionResult, + Any]): def __init__( self, - model_name: str, - model_download_args: Dict[str, Any] = None, + model_uri: str, + model_config_args: Dict[str, Any] = None, + inference_args: Optional[Dict[str, Any]] = None, min_batch_size: Optional[int] = None, max_batch_size: Optional[int] = None, **kwargs): - self._model_handler = None - self._model_name = model_name - self._model_path = None - self._model = None - self._model_download_args = model_download_args if model_download_args else {} # pylint: disable=line-too-long + """Implementation of the ModelHandler interface for HuggingFace with + Keyed Tensors for PyTorch/Tensorflow backend. + + Depending on the type of tensors, + the model framework is determined automatically. + + Example Usage model:: + pcoll | RunInference(HuggingFaceModelHandlerKeyedTensor( + model_uri="bert-base-uncased")) + + Args: + model_uri (str): path to the pretrained model on the hugging face + models hub. + model_config_args (Dict[str, Any]): keyword arguments to provide load + options while loading from Hugging Face Hub. Defaults to None. + inference_args (Optional[Dict[str, Any]]): Non-batchable arguments + required as inputs to the model's forward() function. Unlike Tensors in + `batch`, these parameters will not be dynamically batched. + Defaults to None. + min_batch_size: the minimum batch size to use when batching inputs. + max_batch_size: the maximum batch size to use when batching inputs. + """ + self._model_uri = model_uri + self._model_config_args = model_config_args if model_config_args else {} + self._inference_args = inference_args if inference_args else {} self._batching_kwargs = {} self._env_vars = kwargs.get('env_vars', {}) if min_batch_size is not None: @@ -53,42 +113,52 @@ def __init__( self._batching_kwargs['max_batch_size'] = max_batch_size def load_model(self): - """Loads and initializes a model for processing.""" - self._model = AutoModel.from_pretrained(self._model_name) - return self._model + """Loads and initializes the model for processing.""" + return AutoModel.from_pretrained( + self._model_name, **self._model_config_args) def update_model_path(self, model_path: Optional[str] = None): self._model_path = model_path if model_path else self._model_path def run_inference( self, - batch: Sequence[Dict[str, torch.Tensor]], + batch: Sequence[Dict[str, Union[tf.Tensor, torch.Tensor]]], model: Any, inference_args: Optional[Dict[str, Any]] = None ) -> Iterable[PredictionResult]: - key_to_tensor_list = defaultdict(list) + """ + Runs inferences on a batch of Keyed Tensors and returns an Iterable of + Tensors Predictions. + This method stacks the list of Tensors in a vectorized format to optimize + the inference call. + + Args: + batch: A sequence of Keyed Tensors. These Tensors should be batchable, + as this method will call `tf.stack()`/`torch.stack()` and pass in + batched Tensors with dimensions (batch_size, n_features, etc.) into the + model's predict() function. + model: A Tensorflow/PyTorch model. + inference_args: Non-batchable arguments required as inputs to the model's + forward() function. Unlike Tensors in `batch`, these parameters will + not be dynamically batched + Returns: + An Iterable of type PredictionResult. 
+ """ inference_args = {} if not inference_args else inference_args # torch.no_grad() mitigates GPU memory issues # https://github.com/apache/beam/issues/22811 + if isinstance(batch[0], tf.Tensor): + return _run_inference_tensorflow_keyed_tensor( + batch, model, inference_args) + else: + return _run_inference_torch_keyed_tensor(batch, model, inference_args) - with torch.no_grad(): - for example in batch: - for key, tensor in example.items(): - key_to_tensor_list[key].append(tensor) - key_to_batched_tensors = {} - for key in key_to_tensor_list: - batched_tensors = torch.stack(key_to_tensor_list[key]) - # batched_tensors = key_to_tensor_list[key] - key_to_batched_tensors[key] = batched_tensors - predictions = model(**key_to_batched_tensors, **inference_args) - - return utils._convert_to_result(batch, predictions) - - def get_num_bytes(self, batch: Sequence[torch.Tensor]) -> int: + def get_num_bytes( + self, batch: Sequence[Union[tf.Tensor, torch.Tensor]]) -> int: """ Returns: - The number of bytes of data for a batch. + The number of bytes of data for the Tensors batch. """ return sum( (el.element_size() for tensor in batch for el in tensor.values())) @@ -102,3 +172,211 @@ def get_metrics_namespace(self) -> str: def batch_elements_kwargs(self): return self._batching_kwargs + + +class HuggingFaceModelHandlerTensor(ModelHandler[Union[tf.Tensor, torch.Tensor], + PredictionResult, + Any]): + def __init__( + self, + model_uri: str, + model_config_args: Dict[str, Any] = None, + inference_args: Optional[Dict[str, Any]] = None, + min_batch_size: Optional[int] = None, + max_batch_size: Optional[int] = None, + **kwargs): + """Implementation of the ModelHandler interface for HuggingFace with + Tensors for PyTorch/Tensorflow backend. + + Depending on the type of tensors, + the model framework is determined automatically. + + Example Usage model: + pcoll | RunInference(HuggingFaceModelHandlerTensor( + model_uri="bert-base-uncased")) + + Args: + model_uri (str): path to the pretrained model on the + Hugging Face models hub. + model_config_args (Dict[str, Any]): keyword arguments to provide load + options while loading from Hugging Face Hub. Defaults to None. + inference_args (Optional[Dict[str, Any]]): Non-batchable arguments + required as inputs to the model's forward() function. Unlike Tensors in + `batch`, these parameters will not be dynamically batched. + Defaults to None. + min_batch_size: the minimum batch size to use when batching inputs. + max_batch_size: the maximum batch size to use when batching inputs. 
+ """ + self._model_uri = model_uri + self._model_config_args = model_config_args if model_config_args else {} + self._inference_args = inference_args if inference_args else {} + self._batching_kwargs = {} + self._framework = "torch" + self._env_vars = kwargs.get('env_vars', {}) + if min_batch_size is not None: + self._batching_kwargs['min_batch_size'] = min_batch_size + if max_batch_size is not None: + self._batching_kwargs['max_batch_size'] = max_batch_size + + def load_model(self): + """Loads and initializes the model for processing.""" + return AutoModel.from_pretrained( + self._model_name, **self._model_config_args) + + def update_model_path(self, model_path: Optional[str] = None): + self._model_path = model_path if model_path else self._model_path + + def run_inference( + self, + batch: Sequence[Union[tf.Tensor, torch.Tensor]], + model: Any, + inference_args: Optional[Dict[str, Any]] = None + ) -> Iterable[PredictionResult]: + """ + Runs inferences on a batch of Tensors and returns an Iterable of + Tensors Predictions. + + This method stacks the list of Tensors in a vectorized format to optimize + the inference call. + + Args: + batch: A sequence of Tensors. These Tensors should be batchable, as this + method will call `tf.stack()`/`torch.stack()` and pass in batched + Tensors with dimensions (batch_size, n_features, etc.) into the model's + predict() function. + model: A Tensorflow/PyTorch model. + inference_args: Non-batchable arguments required as inputs to the model's + forward() function. Unlike Tensors in `batch`, these parameters will + not be dynamically batched + Returns: + An Iterable of type PredictionResult. + """ + inference_args = {} if not inference_args else inference_args + + if isinstance(batch[0], tf.Tensor): + self._framework = "tf" + predictions = model(batch, **inference_args) + else: + # torch.no_grad() mitigates GPU memory issues + # https://github.com/apache/beam/issues/22811 + with torch.no_grad(): + predictions = model(batch, **inference_args) + + return utils._convert_to_result(batch, predictions) + + def get_num_bytes( + self, batch: Sequence[Union[tf.Tensor, torch.Tensor]]) -> int: + """ + Returns: + The number of bytes of data for a batch. + """ + if self._framework == "tf": + return sum(sys.getsizeof(element) for element in batch) + else: + return sum( + (el.element_size() for tensor in batch for el in tensor.values())) + + def get_metrics_namespace(self) -> str: + """ + Returns: + A namespace for metrics collected by the RunInference transform. + """ + return 'BeamML_HuggingFaceModelHandler_Tensor' + + def batch_elements_kwargs(self): + return self._batching_kwargs + + +class HuggingFaceModelHandlerNumpy(ModelHandler[numpy.ndarray, + PredictionResult, + Any]): + def __init__( + self, + model_uri: str, + model_config_args: Dict[str, Any] = None, + inference_args: Optional[Dict[str, Any]] = None, + min_batch_size: Optional[int] = None, + max_batch_size: Optional[int] = None, + **kwargs): + """Implementation of the ModelHandler interface for HuggingFace with + numpy.ndarray for PyTorch/Tensorflow backend. + + Example Usage model:: + pcoll | RunInference(HuggingFaceModelHandlerNumpy( + model_uri="bert-base-uncased")) + + Args: + model_uri (str): path to the pretrained model + on the Hugging Face Models hub. + model_config_args (Dict[str, Any]): keyword arguments to provide load + options while loading from Hugging Face Hub. Defaults to None. 
+ inference_args (Optional[Dict[str, Any]]): Non-batchable arguments + required as inputs to the model's forward() function. Unlike numpy + ndarray in `batch`, these parameters will not be dynamically batched. + Defaults to None. + min_batch_size: the minimum batch size to use when batching inputs. + max_batch_size: the maximum batch size to use when batching inputs. + """ + self._model_uri = model_uri + self._model_config_args = model_config_args if model_config_args else {} + self._inference_args = inference_args if inference_args else {} + self._batching_kwargs = {} + self._env_vars = kwargs.get('env_vars', {}) + if min_batch_size is not None: + self._batching_kwargs['min_batch_size'] = min_batch_size + if max_batch_size is not None: + self._batching_kwargs['max_batch_size'] = max_batch_size + + def load_model(self): + """Loads and initializes the model for processing.""" + return AutoModel.from_pretrained( + self._model_name, **self._model_config_args) + + def update_model_path(self, model_path: Optional[str] = None): + self._model_path = model_path if model_path else self._model_path + + def run_inference( + self, + batch: Sequence[numpy.ndarray], + model: Any, + inference_args: Optional[Dict[str, Any]] = None + ) -> Iterable[PredictionResult]: + """ + Runs inferences on a batch of numpy ndarray and returns an Iterable of + numpy Predictions. + + This method stacks the list of numpy ndarray in a vectorized format to + optimize the inference call. + + Args: + batch: A sequence of numpy ndarray. These arrays should be batchable, as + this method will call `numpy.stack()` and pass in batched arrays with + dimensions (batch_size, n_features, etc.) into the model's + predict() function. + model: A pretrained model compatible with numpy input. + inference_args: Non-batchable arguments required as inputs to the model's + forward() function. Unlike Tensors in `batch`, these parameters will + not be dynamically batched + Returns: + An Iterable of type PredictionResult. + """ + inference_args = {} if not inference_args else inference_args + predictions = model(batch, **inference_args) + return utils._convert_to_result(batch, predictions) + + def get_num_bytes(self, batch: Sequence[numpy.ndarray]) -> int: + """ + Returns: + The number of bytes of data for a batch. + """ + return sum(sys.getsizeof(element) for element in batch) + + def get_metrics_namespace(self) -> str: + """ + Returns: + A namespace for metrics collected by the RunInference transform. 
+ """ + return 'BeamML_HuggingFaceModelHandler_Tensor' + + def batch_elements_kwargs(self): + return self._batching_kwargs From df87366d6272933af0601e56be6d7e8f8743a47c Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Fri, 23 Jun 2023 00:52:45 -0400 Subject: [PATCH 04/26] add model_class param --- .../huggingface_language_modeling.py | 46 +++++++++---------- .../ml/inference/huggingface_inference.py | 32 +++++++++---- .../huggingface_inference_it_test.py | 29 ++++++------ 3 files changed, 56 insertions(+), 51 deletions(-) diff --git a/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py b/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py index e2736a0842c6e..67a6a2f68021d 100644 --- a/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py +++ b/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py @@ -34,7 +34,7 @@ import torch import apache_beam as beam -from apache_beam.ml.inference.huggingface_inference import HuggingFaceModelHandler +from apache_beam.ml.inference.huggingface_inference import HuggingFaceModelHandlerKeyedTensor from apache_beam.ml.inference.base import KeyedModelHandler from apache_beam.ml.inference.base import PredictionResult from apache_beam.ml.inference.base import RunInference @@ -42,20 +42,19 @@ from apache_beam.options.pipeline_options import SetupOptions from apache_beam.runners.runner import PipelineResult -from transformers import BertTokenizer +from transformers import AutoTokenizer def add_mask_to_last_word(text: str) -> Tuple[str, str]: text_list = text.split() - return text, ' '.join(text_list[:-2] + ['[MASK]', text_list[-1]]) + return text, ' '.join(text_list[:-2] + ['', text_list[-1]]) def tokenize_sentence( text_and_mask: Tuple[str, str], - bert_tokenizer: BertTokenizer) -> Tuple[str, Dict[str, torch.Tensor]]: + tokenizer: AutoTokenizer) -> Tuple[str, Dict[str, torch.Tensor]]: text, masked_text = text_and_mask - tokenized_sentence = bert_tokenizer.encode_plus( - masked_text, return_tensors="pt") + tokenized_sentence = tokenizer.encode_plus(masked_text, return_tensors="pt") # Workaround to manually remove batch dim until we have the feature to # add optional batching flag. @@ -75,14 +74,14 @@ def filter_empty_lines(text: str) -> Iterator[str]: class PostProcessor(beam.DoFn): """Processes the PredictionResult to get the predicted word. - The logits are the output of the BERT Model. After applying a softmax + The logits are the output of the Model. After applying a softmax activation function to the logits, we get probabilistic distributions for each - of the words in BERT’s vocabulary. We can get the word with the highest + of the words in the model's vocabulary. We can get the word with the highest probability of being a candidate replacement word by taking the argmax. """ - def __init__(self, bert_tokenizer: BertTokenizer): + def __init__(self, tokenizer: AutoTokenizer): super().__init__() - self.bert_tokenizer = bert_tokenizer + self.bert_tokenizer = tokenizer def process(self, element: Tuple[str, PredictionResult]) -> Iterable[str]: text, prediction_result = element @@ -108,15 +107,15 @@ def parse_known_args(argv): dest='output', required=True, help='Path of file in which to save the output predictions.') - parser.add_argument( - '--bert_tokenizer', - dest='bert_tokenizer', - default='bert-base-uncased', - help='bert uncased model. 
This can be base model or large model') parser.add_argument( '--model_name', dest='model_name', required=True, + help='bert uncased model. This can be base model or large model') + parser.add_argument( + '--model_class', + dest='model_class', + required=True, help="Name of the model from Hugging Face") return parser.parse_known_args(argv) @@ -127,10 +126,7 @@ def run( Args: argv: Command line arguments defined for this example. model_class: Reference to the class definition of the model. - If None, BertForMaskedLM will be used as default . - model_params: Parameters passed to the constructor of the model_class. - These will be used to instantiate the model object in the - RunInference API. + model_name: Name of the pretrained model to be loaded. save_main_session: Used for internal testing. test_pipeline: Used for internal testing. """ @@ -142,9 +138,10 @@ def run( if not test_pipeline: pipeline = beam.Pipeline(options=pipeline_options) - bert_tokenizer = BertTokenizer.from_pretrained(known_args.bert_tokenizer) + tokenizer = AutoTokenizer.from_pretrained(known_args.model_name) - model_handler = HuggingFaceModelHandler(known_args.model_name) + model_handler = HuggingFaceModelHandlerKeyedTensor( + model_uri=known_args.model_name, model_class=known_args.model_class) if not known_args.input: text = (pipeline | 'CreateSentences' >> beam.Create([ 'The capital of France is Paris .', @@ -165,13 +162,12 @@ def run( text | 'FilterEmptyLines' >> beam.ParDo(filter_empty_lines) | 'AddMask' >> beam.Map(add_mask_to_last_word) - | 'TokenizeSentence' >> - beam.Map(lambda x: tokenize_sentence(x, bert_tokenizer))) + | + 'TokenizeSentence' >> beam.Map(lambda x: tokenize_sentence(x, tokenizer))) output = ( text_and_tokenized_text_tuple | 'PyTorchRunInference' >> RunInference(KeyedModelHandler(model_handler)) - | 'ProcessOutput' >> beam.ParDo( - PostProcessor(bert_tokenizer=bert_tokenizer))) + | 'ProcessOutput' >> beam.ParDo(PostProcessor(bert_tokenizer=tokenizer))) output | "WriteOutput" >> beam.io.WriteToText( # pylint: disable=expression-not-assigned known_args.output, shard_name_template='', diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index 3501ed971c760..8b660f8c5acbc 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -75,6 +75,7 @@ class HuggingFaceModelHandlerKeyedTensor(ModelHandler[Dict[str, def __init__( self, model_uri: str, + model_class: AutoModel, model_config_args: Dict[str, Any] = None, inference_args: Optional[Dict[str, Any]] = None, min_batch_size: Optional[int] = None, @@ -103,6 +104,8 @@ def __init__( max_batch_size: the maximum batch size to use when batching inputs. 
""" self._model_uri = model_uri + self._model_class = model_class + self._model_path = model_uri self._model_config_args = model_config_args if model_config_args else {} self._inference_args = inference_args if inference_args else {} self._batching_kwargs = {} @@ -114,8 +117,8 @@ def __init__( def load_model(self): """Loads and initializes the model for processing.""" - return AutoModel.from_pretrained( - self._model_name, **self._model_config_args) + return self._model_class.from_pretrained( + self._model_uri, **self._model_config_args) def update_model_path(self, model_path: Optional[str] = None): self._model_path = model_path if model_path else self._model_path @@ -160,15 +163,18 @@ def get_num_bytes( Returns: The number of bytes of data for the Tensors batch. """ - return sum( - (el.element_size() for tensor in batch for el in tensor.values())) + if self._framework == "tf": + return sum(sys.getsizeof(element) for element in batch) + else: + return sum( + (el.element_size() for tensor in batch for el in tensor.values())) def get_metrics_namespace(self) -> str: """ Returns: A namespace for metrics collected by the RunInference transform. """ - return 'BeamML_HuggingFace_Tensor' + return 'BeamML_HuggingFaceModelHandler_KeyedTensor' def batch_elements_kwargs(self): return self._batching_kwargs @@ -180,6 +186,7 @@ class HuggingFaceModelHandlerTensor(ModelHandler[Union[tf.Tensor, torch.Tensor], def __init__( self, model_uri: str, + model_class: AutoModel, model_config_args: Dict[str, Any] = None, inference_args: Optional[Dict[str, Any]] = None, min_batch_size: Optional[int] = None, @@ -208,6 +215,8 @@ def __init__( max_batch_size: the maximum batch size to use when batching inputs. """ self._model_uri = model_uri + self._model_class = model_class + self._model_path = model_uri self._model_config_args = model_config_args if model_config_args else {} self._inference_args = inference_args if inference_args else {} self._batching_kwargs = {} @@ -220,8 +229,8 @@ def __init__( def load_model(self): """Loads and initializes the model for processing.""" - return AutoModel.from_pretrained( - self._model_name, **self._model_config_args) + return self._model_class.from_pretrained( + self._model_uri, **self._model_config_args) def update_model_path(self, model_path: Optional[str] = None): self._model_path = model_path if model_path else self._model_path @@ -293,6 +302,7 @@ class HuggingFaceModelHandlerNumpy(ModelHandler[numpy.ndarray, def __init__( self, model_uri: str, + model_class: AutoModel, model_config_args: Dict[str, Any] = None, inference_args: Optional[Dict[str, Any]] = None, min_batch_size: Optional[int] = None, @@ -318,6 +328,8 @@ def __init__( max_batch_size: the maximum batch size to use when batching inputs. 
""" self._model_uri = model_uri + self._model_class = model_class + self._model_path = model_uri self._model_config_args = model_config_args if model_config_args else {} self._inference_args = inference_args if inference_args else {} self._batching_kwargs = {} @@ -329,8 +341,8 @@ def __init__( def load_model(self): """Loads and initializes the model for processing.""" - return AutoModel.from_pretrained( - self._model_name, **self._model_config_args) + return self._model_class.from_pretrained( + self._model_uri, **self._model_config_args) def update_model_path(self, model_path: Optional[str] = None): self._model_path = model_path if model_path else self._model_path @@ -376,7 +388,7 @@ def get_metrics_namespace(self) -> str: Returns: A namespace for metrics collected by the RunInference transform. """ - return 'BeamML_HuggingFaceModelHandler_Tensor' + return 'BeamML_HuggingFaceModelHandler_Numpy' def batch_elements_kwargs(self): return self._batching_kwargs diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py b/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py index 6d2dc9ce68f84..d48b5d7a86a2d 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py @@ -20,43 +20,40 @@ import logging import unittest import uuid -import pytest -from apache_beam.io.filesystems import FileSystems -from apache_beam.testing.test_pipeline import TestPipeline -from apache_beam.examples.inference import huggingface_language_modeling +from transformers import AutoModelForMaskedLM -def process_outputs(filepath): - with FileSystems().open(filepath) as f: - lines = f.readlines() - lines = [l.decode('utf-8').strip('\n') for l in lines] - return lines +from apache_beam.examples.inference import huggingface_language_modeling +from apache_beam.io.filesystems import FileSystems +from apache_beam.ml.inference import pytorch_inference_it_test +from apache_beam.testing.test_pipeline import TestPipeline class HuggingFaceInference(unittest.TestCase): - @pytest.mark.timeout(4200) - def test_hf_imagenet_image_segmentation(self): + def test_hf_language_modeling(self): test_pipeline = TestPipeline(is_integration_test=True) # Path to text file containing some sentences file_of_sentences = 'gs://apache-beam-ml/datasets/custom/sentences.txt' # pylint: disable=line-too-long output_file_dir = 'gs://apache-beam-ml/testing/predictions' output_file = '/'.join([output_file_dir, str(uuid.uuid4()), 'result.txt']) - model_name = 'againeureka/my_awesome_eli5_mlm_model' + model_name = 'stevhliu/my_awesome_eli5_mlm_model' + model_class = AutoModelForMaskedLM extra_opts = { 'input': file_of_sentences, 'output': output_file, 'model_name': model_name, - '--bert_tokenizer': model_name + 'model_class': model_class, } huggingface_language_modeling.run( test_pipeline.get_full_options_as_args(**extra_opts), - save_main_session=True) + save_main_session=False) self.assertEqual(FileSystems().exists(output_file), True) - predictions = process_outputs(filepath=output_file) + predictions = pytorch_inference_it_test.process_outputs( + filepath=output_file) actuals_file = 'gs://apache-beam-ml/testing/expected_outputs/test_torch_run_inference_bert_for_masked_lm_actuals.txt' # pylint: disable=line-too-long - actuals = process_outputs(filepath=actuals_file) + actuals = pytorch_inference_it_test.process_outputs(filepath=actuals_file) predictions_dict = {} for prediction in predictions: From 
025cc5239e5ec9b5de65103f1ddbdbe9c00c07bb Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Mon, 26 Jun 2023 11:49:10 -0400 Subject: [PATCH 05/26] update doc comments --- .../huggingface_language_modeling.py | 19 +++++++++---------- .../ml/inference/huggingface_inference.py | 12 ++++++------ .../huggingface_inference_it_test.py | 12 +++++++----- .../huggingface_tests_requirements.txt | 7 +------ 4 files changed, 23 insertions(+), 27 deletions(-) diff --git a/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py b/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py index 67a6a2f68021d..cebdb67211ace 100644 --- a/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py +++ b/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py @@ -15,10 +15,11 @@ # limitations under the License. # -""""A pipeline that uses RunInference to perform Language Modeling with Bert. +""""A pipeline that uses RunInference to perform Language Modeling with +model from Hugging Face. This pipeline takes sentences from a custom text file, converts the last word -of the sentence into a [MASK] token, and then uses the BertForMaskedLM from +of the sentence into a token, and then uses the AutoModelForMaskedLM from Hugging Face to predict the best word for the masked token given all the words already in the sentence. The pipeline then writes the prediction to an output file in which users can then compare against the original sentence. @@ -81,17 +82,17 @@ class PostProcessor(beam.DoFn): """ def __init__(self, tokenizer: AutoTokenizer): super().__init__() - self.bert_tokenizer = tokenizer + self.tokenizer = tokenizer def process(self, element: Tuple[str, PredictionResult]) -> Iterable[str]: text, prediction_result = element inputs = prediction_result.example logits = prediction_result.inference.logits mask_token_index = ( - inputs['input_ids'] == self.bert_tokenizer.mask_token_id).nonzero( + inputs['input_ids'] == self.tokenizer.mask_token_id).nonzero( as_tuple=True)[0] predicted_token_id = logits[mask_token_index].argmax(axis=-1) - decoded_word = self.bert_tokenizer.decode(predicted_token_id) + decoded_word = self.tokenizer.decode(predicted_token_id) yield text + ';' + decoded_word @@ -166,12 +167,10 @@ def run( 'TokenizeSentence' >> beam.Map(lambda x: tokenize_sentence(x, tokenizer))) output = ( text_and_tokenized_text_tuple - | 'PyTorchRunInference' >> RunInference(KeyedModelHandler(model_handler)) - | 'ProcessOutput' >> beam.ParDo(PostProcessor(bert_tokenizer=tokenizer))) + | 'RunInference' >> RunInference(KeyedModelHandler(model_handler)) + | 'ProcessOutput' >> beam.ParDo(PostProcessor(tokenizer=tokenizer))) output | "WriteOutput" >> beam.io.WriteToText( # pylint: disable=expression-not-assigned - known_args.output, - shard_name_template='', - append_trailing_newlines=True) + known_args.output, shard_name_template='', append_trailing_newlines=True) result = pipeline.run() result.wait_until_finish() diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index 8b660f8c5acbc..ea3a2cfed435b 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -71,7 +71,7 @@ class HuggingFaceModelHandlerKeyedTensor(ModelHandler[Dict[str, Union[tf.Tensor, torch.Tensor]], PredictionResult, - Any]): + AutoModel]): def __init__( self, model_uri: str, @@ -126,7 +126,7 @@ def 
update_model_path(self, model_path: Optional[str] = None): def run_inference( self, batch: Sequence[Dict[str, Union[tf.Tensor, torch.Tensor]]], - model: Any, + model: AutoModel, inference_args: Optional[Dict[str, Any]] = None ) -> Iterable[PredictionResult]: """ @@ -182,7 +182,7 @@ def batch_elements_kwargs(self): class HuggingFaceModelHandlerTensor(ModelHandler[Union[tf.Tensor, torch.Tensor], PredictionResult, - Any]): + AutoModel]): def __init__( self, model_uri: str, @@ -238,7 +238,7 @@ def update_model_path(self, model_path: Optional[str] = None): def run_inference( self, batch: Sequence[Union[tf.Tensor, torch.Tensor]], - model: Any, + model: AutoModel, inference_args: Optional[Dict[str, Any]] = None ) -> Iterable[PredictionResult]: """ @@ -298,7 +298,7 @@ def batch_elements_kwargs(self): class HuggingFaceModelHandlerNumpy(ModelHandler[numpy.ndarray, PredictionResult, - Any]): + AutoModel]): def __init__( self, model_uri: str, @@ -350,7 +350,7 @@ def update_model_path(self, model_path: Optional[str] = None): def run_inference( self, batch: Sequence[numpy.ndarray], - model: Any, + model: AutoModel, inference_args: Optional[Dict[str, Any]] = None ) -> Iterable[PredictionResult]: """ diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py b/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py index d48b5d7a86a2d..5cfa4bcba7c62 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py @@ -15,12 +15,12 @@ # limitations under the License. # -"""End-to-End test for Pytorch Inference""" +"""End-to-End test for Hugging Face Inference""" import logging import unittest import uuid - +import pytest from transformers import AutoModelForMaskedLM from apache_beam.examples.inference import huggingface_language_modeling @@ -29,12 +29,14 @@ from apache_beam.testing.test_pipeline import TestPipeline +@pytest.mark.it_postcommit class HuggingFaceInference(unittest.TestCase): + @pytest.mark.timeout(1800) def test_hf_language_modeling(self): test_pipeline = TestPipeline(is_integration_test=True) # Path to text file containing some sentences - file_of_sentences = 'gs://apache-beam-ml/datasets/custom/sentences.txt' # pylint: disable=line-too-long - output_file_dir = 'gs://apache-beam-ml/testing/predictions' + file_of_sentences = 'gs://clouddfe-riteshghorse/hf/datasets/custom/sentences.txt' # pylint: disable=line-too-long + output_file_dir = 'gs://clouddfe-riteshghorse/hf/testing/predictions' output_file = '/'.join([output_file_dir, str(uuid.uuid4()), 'result.txt']) model_name = 'stevhliu/my_awesome_eli5_mlm_model' @@ -52,7 +54,7 @@ def test_hf_language_modeling(self): self.assertEqual(FileSystems().exists(output_file), True) predictions = pytorch_inference_it_test.process_outputs( filepath=output_file) - actuals_file = 'gs://apache-beam-ml/testing/expected_outputs/test_torch_run_inference_bert_for_masked_lm_actuals.txt' # pylint: disable=line-too-long + actuals_file = 'gs://clouddfe-riteshghorse/hf/testing/expected_outputs/test_torch_run_inference_bert_for_masked_lm_actuals.txt' # pylint: disable=line-too-long actuals = pytorch_inference_it_test.process_outputs(filepath=actuals_file) predictions_dict = {} diff --git a/sdks/python/apache_beam/ml/inference/huggingface_tests_requirements.txt b/sdks/python/apache_beam/ml/inference/huggingface_tests_requirements.txt index aa8f4ea953e40..09c1fa8ca90c6 100644 --- 
a/sdks/python/apache_beam/ml/inference/huggingface_tests_requirements.txt +++ b/sdks/python/apache_beam/ml/inference/huggingface_tests_requirements.txt @@ -16,10 +16,5 @@ # torch>=1.7.1 -torchvision>=0.8.2 -pillow>=8.0.0 transformers>=4.18.0 -huggingface-hub==0.14.0 -tensorflow>=2.12.0 -tensorflow_hub>=0.10.0 -Pillow>=9.0.0 \ No newline at end of file +tensorflow>=2.12.0 \ No newline at end of file From 2c671abace0d245068bfee313c2d5e8c5da5ba69 Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Mon, 26 Jun 2023 15:48:46 -0400 Subject: [PATCH 06/26] updated integration test and example --- .../huggingface_language_modeling.py | 36 +++++++++---------- .../ml/inference/huggingface_inference.py | 32 ++++++++++------- .../huggingface_inference_it_test.py | 12 +++---- 3 files changed, 43 insertions(+), 37 deletions(-) diff --git a/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py b/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py index cebdb67211ace..92eb801c29667 100644 --- a/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py +++ b/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py @@ -44,6 +44,7 @@ from apache_beam.runners.runner import PipelineResult from transformers import AutoTokenizer +from transformers import AutoModelForMaskedLM def add_mask_to_last_word(text: str) -> Tuple[str, str]: @@ -87,10 +88,9 @@ def __init__(self, tokenizer: AutoTokenizer): def process(self, element: Tuple[str, PredictionResult]) -> Iterable[str]: text, prediction_result = element inputs = prediction_result.example - logits = prediction_result.inference.logits - mask_token_index = ( - inputs['input_ids'] == self.tokenizer.mask_token_id).nonzero( - as_tuple=True)[0] + logits = prediction_result.inference['logits'] + mask_token_index = torch.where( + inputs["input_ids"] == self.tokenizer.mask_token_id)[0] predicted_token_id = logits[mask_token_index].argmax(axis=-1) decoded_word = self.tokenizer.decode(predicted_token_id) yield text + ';' + decoded_word @@ -116,7 +116,7 @@ def parse_known_args(argv): parser.add_argument( '--model_class', dest='model_class', - required=True, + default=AutoModelForMaskedLM, help="Name of the model from Hugging Face") return parser.parse_known_args(argv) @@ -142,20 +142,20 @@ def run( tokenizer = AutoTokenizer.from_pretrained(known_args.model_name) model_handler = HuggingFaceModelHandlerKeyedTensor( - model_uri=known_args.model_name, model_class=known_args.model_class) + model_uri=known_args.model_name, + model_class=known_args.model_class, + max_batch_size=1) if not known_args.input: - text = (pipeline | 'CreateSentences' >> beam.Create([ - 'The capital of France is Paris .', - 'It is raining cats and dogs .', - 'He looked up and saw the sun and stars .', - 'Today is Monday and tomorrow is Tuesday .', - 'There are 5 coconuts on this palm tree .', - 'The richest person in the world is not here .', - 'Malls are amazing places to shop because you can find everything you need under one roof .', # pylint: disable=line-too-long - 'This audiobook is sure to liquefy your brain .', - 'The secret ingredient to his wonderful life was gratitude .', - 'The biggest animal in the world is the whale .', - ])) + text = ( + pipeline | 'CreateSentences' >> beam.Create([ + 'The capital of France is Paris .', + 'It is raining cats and dogs .', + 'Today is Monday and tomorrow is Tuesday .', + 'There are 5 coconuts on this palm tree .', + 'The strongest person in the world is not famous .', + 'The secret 
ingredient to his wonderful life was gratitude .', + 'The biggest animal in the world is the whale .', + ])) else: text = ( pipeline | 'ReadSentences' >> beam.io.ReadFromText(known_args.input)) diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index ea3a2cfed435b..379aeb9e8b1c0 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -28,14 +28,17 @@ import tensorflow as tf import torch from transformers import AutoModel +from transformers import TFAutoModel from apache_beam.ml.inference.base import ModelHandler from apache_beam.ml.inference.base import PredictionResult from apache_beam.ml.inference import utils +from apache_beam.ml.inference.pytorch_inference import _convert_to_device def _run_inference_torch_keyed_tensor( - batch: Sequence[torch.Tensor], model: Any, + batch: Sequence[Dict[str, Union[tf.Tensor, torch.Tensor]]], + model: Any, inference_args: Dict[str, Any]) -> Iterable[PredictionResult]: key_to_tensor_list = defaultdict(list) with torch.no_grad(): @@ -45,14 +48,15 @@ def _run_inference_torch_keyed_tensor( key_to_batched_tensors = {} for key in key_to_tensor_list: batched_tensors = torch.stack(key_to_tensor_list[key]) - batched_tensors = key_to_tensor_list[key] + batched_tensors = _convert_to_device(batched_tensors, torch.device('cpu')) key_to_batched_tensors[key] = batched_tensors return utils._convert_to_result( batch, model(**key_to_batched_tensors, **inference_args)) def _run_inference_tensorflow_keyed_tensor( - batch: Sequence[tf.Tensor], model: Any, + batch: Sequence[Dict[str, Union[tf.Tensor, torch.Tensor]]], + model: Any, inference_args: Dict[str, Any]) -> Iterable[PredictionResult]: key_to_tensor_list = defaultdict(list) for example in batch: @@ -71,11 +75,12 @@ class HuggingFaceModelHandlerKeyedTensor(ModelHandler[Dict[str, Union[tf.Tensor, torch.Tensor]], PredictionResult, - AutoModel]): + Union[AutoModel, + TFAutoModel]]): def __init__( self, model_uri: str, - model_class: AutoModel, + model_class: Union[AutoModel, TFAutoModel], model_config_args: Dict[str, Any] = None, inference_args: Optional[Dict[str, Any]] = None, min_batch_size: Optional[int] = None, @@ -126,7 +131,7 @@ def update_model_path(self, model_path: Optional[str] = None): def run_inference( self, batch: Sequence[Dict[str, Union[tf.Tensor, torch.Tensor]]], - model: AutoModel, + model: Union[AutoModel, TFAutoModel], inference_args: Optional[Dict[str, Any]] = None ) -> Iterable[PredictionResult]: """ @@ -152,9 +157,11 @@ def run_inference( # torch.no_grad() mitigates GPU memory issues # https://github.com/apache/beam/issues/22811 if isinstance(batch[0], tf.Tensor): + self._framework = "tf" return _run_inference_tensorflow_keyed_tensor( batch, model, inference_args) else: + self._framework = "torch" return _run_inference_torch_keyed_tensor(batch, model, inference_args) def get_num_bytes( @@ -182,11 +189,12 @@ def batch_elements_kwargs(self): class HuggingFaceModelHandlerTensor(ModelHandler[Union[tf.Tensor, torch.Tensor], PredictionResult, - AutoModel]): + Union[AutoModel, + TFAutoModel]]): def __init__( self, model_uri: str, - model_class: AutoModel, + model_class: Union[AutoModel, TFAutoModel], model_config_args: Dict[str, Any] = None, inference_args: Optional[Dict[str, Any]] = None, min_batch_size: Optional[int] = None, @@ -238,7 +246,7 @@ def update_model_path(self, model_path: Optional[str] = None): def run_inference( 
self, batch: Sequence[Union[tf.Tensor, torch.Tensor]], - model: AutoModel, + model: Union[AutoModel, TFAutoModel], inference_args: Optional[Dict[str, Any]] = None ) -> Iterable[PredictionResult]: """ @@ -298,11 +306,11 @@ def batch_elements_kwargs(self): class HuggingFaceModelHandlerNumpy(ModelHandler[numpy.ndarray, PredictionResult, - AutoModel]): + Union[AutoModel, TFAutoModel]]): def __init__( self, model_uri: str, - model_class: AutoModel, + model_class: Union[AutoModel, TFAutoModel], model_config_args: Dict[str, Any] = None, inference_args: Optional[Dict[str, Any]] = None, min_batch_size: Optional[int] = None, @@ -350,7 +358,7 @@ def update_model_path(self, model_path: Optional[str] = None): def run_inference( self, batch: Sequence[numpy.ndarray], - model: AutoModel, + model: Union[AutoModel, TFAutoModel], inference_args: Optional[Dict[str, Any]] = None ) -> Iterable[PredictionResult]: """ diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py b/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py index 5cfa4bcba7c62..37a67ff979644 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py @@ -21,7 +21,6 @@ import unittest import uuid import pytest -from transformers import AutoModelForMaskedLM from apache_beam.examples.inference import huggingface_language_modeling from apache_beam.io.filesystems import FileSystems @@ -35,17 +34,16 @@ class HuggingFaceInference(unittest.TestCase): def test_hf_language_modeling(self): test_pipeline = TestPipeline(is_integration_test=True) # Path to text file containing some sentences - file_of_sentences = 'gs://clouddfe-riteshghorse/hf/datasets/custom/sentences.txt' # pylint: disable=line-too-long - output_file_dir = 'gs://clouddfe-riteshghorse/hf/testing/predictions' + file_of_sentences = 'gs://apache-beam-ml/datasets/custom/hf_sentences.txt' # pylint: disable=line-too-long + output_file_dir = 'gs://apache-beam-ml/testing/predictions' output_file = '/'.join([output_file_dir, str(uuid.uuid4()), 'result.txt']) model_name = 'stevhliu/my_awesome_eli5_mlm_model' - model_class = AutoModelForMaskedLM + extra_opts = { 'input': file_of_sentences, 'output': output_file, 'model_name': model_name, - 'model_class': model_class, } huggingface_language_modeling.run( test_pipeline.get_full_options_as_args(**extra_opts), @@ -54,13 +52,13 @@ def test_hf_language_modeling(self): self.assertEqual(FileSystems().exists(output_file), True) predictions = pytorch_inference_it_test.process_outputs( filepath=output_file) - actuals_file = 'gs://clouddfe-riteshghorse/hf/testing/expected_outputs/test_torch_run_inference_bert_for_masked_lm_actuals.txt' # pylint: disable=line-too-long + actuals_file = 'gs://apache-beam-ml/testing/expected_outputs/test_hf_run_inference_for_masked_lm_actuals.txt' # pylint: disable=line-too-long actuals = pytorch_inference_it_test.process_outputs(filepath=actuals_file) predictions_dict = {} for prediction in predictions: text, predicted_text = prediction.split(';') - predictions_dict[text] = predicted_text + predictions_dict[text] = predicted_text.strip().lower() for actual in actuals: text, actual_predicted_text = actual.split(';') From abaeb2a0e382c29469aba32a26c9c79c5e91d770 Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Tue, 27 Jun 2023 01:08:21 -0400 Subject: [PATCH 07/26] unit test, modified params --- .../ml/inference/huggingface_inference.py | 325 +++++++++++------- 
.../inference/huggingface_inference_test.py | 129 +++++++ 2 files changed, 321 insertions(+), 133 deletions(-) create mode 100644 sdks/python/apache_beam/ml/inference/huggingface_inference_test.py diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index 379aeb9e8b1c0..7d2e22f106c74 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -16,15 +16,16 @@ # from collections import defaultdict +import logging import sys from typing import Any +from typing import Callable from typing import Dict from typing import Iterable from typing import Optional from typing import Sequence from typing import Union -import numpy import tensorflow as tf import torch from transformers import AutoModel @@ -35,12 +36,55 @@ from apache_beam.ml.inference import utils from apache_beam.ml.inference.pytorch_inference import _convert_to_device +__all__ = [ + 'HuggingFaceModelHandlerTensor', + 'HuggingFaceModelHandlerKeyedTensor', +] + +TensorInferenceFn = Callable[[ + Sequence[Union[torch.Tensor, tf.Tensor]], + Union[AutoModel, TFAutoModel], + torch.device, + Optional[Dict[str, Any]], + Optional[str] +], + Iterable[PredictionResult]] + +KeyedTensorInferenceFn = Callable[[ + Sequence[Dict[str, Union[torch.Tensor, tf.Tensor]]], + Union[AutoModel, TFAutoModel], + torch.device, + Optional[Dict[str, Any]], + Optional[str] +], + Iterable[PredictionResult]] + + +def _validate_constructor_args(model_uri, model_class): + message = ( + "Please provide both model class and model uri to load the model." + "Got params as model_uri={model_uri} and " + "model_class={model_class}.") + if not model_uri and not model_class: + raise RuntimeError( + message.format(model_uri=model_uri, model_class=model_class)) + elif not model_uri: + raise RuntimeError( + message.format(model_uri=model_uri, model_class=model_class)) + elif not model_class: + raise RuntimeError( + message.format(model_uri=model_uri, model_class=model_class)) + def _run_inference_torch_keyed_tensor( batch: Sequence[Dict[str, Union[tf.Tensor, torch.Tensor]]], - model: Any, - inference_args: Dict[str, Any]) -> Iterable[PredictionResult]: + model: AutoModel, + device, + inference_args: Dict[str, Any], + model_id: Optional[str] = None) -> Iterable[PredictionResult]: key_to_tensor_list = defaultdict(list) + # torch.no_grad() mitigates GPU memory issues + # https://github.com/apache/beam/issues/22811 with torch.no_grad(): for example in batch: for key, tensor in example.items(): @@ -48,7 +92,7 @@ def _run_inference_torch_keyed_tensor( key_to_batched_tensors = {} for key in key_to_tensor_list: batched_tensors = torch.stack(key_to_tensor_list[key]) - batched_tensors = _convert_to_device(batched_tensors, torch.device('cpu')) + batched_tensors = _convert_to_device(batched_tensors, device) key_to_batched_tensors[key] = batched_tensors return utils._convert_to_result( batch, model(**key_to_batched_tensors, **inference_args)) @@ -56,8 +100,10 @@ def _run_inference_torch_keyed_tensor( def _run_inference_tensorflow_keyed_tensor( batch: Sequence[Dict[str, Union[tf.Tensor, torch.Tensor]]], - model: Any, - inference_args: Dict[str, Any]) -> Iterable[PredictionResult]: + model: TFAutoModel, + device, + inference_args: Dict[str, Any], + model_id: Optional[str] = None) -> Iterable[PredictionResult]: key_to_tensor_list = defaultdict(list) for example in batch: for key, tensor in example.items(): @@ -81,10 +127,14 @@ 
def __init__( self, model_uri: str, model_class: Union[AutoModel, TFAutoModel], - model_config_args: Dict[str, Any] = None, + device: str = 'CPU', + *, + inference_fn: KeyedTensorInferenceFn = None, + load_model_args: Optional[Dict[str, Any]] = None, inference_args: Optional[Dict[str, Any]] = None, min_batch_size: Optional[int] = None, max_batch_size: Optional[int] = None, + large_model: bool = False, **kwargs): """Implementation of the ModelHandler interface for HuggingFace with Keyed Tensors for PyTorch/Tensorflow backend. @@ -99,7 +149,13 @@ def __init__( Args: model_uri (str): path to the pretrained model on the hugging face models hub. - model_config_args (Dict[str, Any]): keyword arguments to provide load + model_class: model class to load the repository from model_uri. + device: For torch tensors, specify device on which you wish to + run the model. Defaults to CPU. + inference_fn: the inference function to use during RunInference. + Default is _run_inference_torch_keyed_tensor or + _run_inference_tensorflow_keyed_tensor depending on the input type. + load_model_args (Dict[str, Any]): keyword arguments to provide load options while loading from Hugging Face Hub. Defaults to None. inference_args (Optional[Dict[str, Any]]): Non-batchable arguments required as inputs to the model's forward() function. Unlike Tensors in @@ -107,11 +163,24 @@ def __init__( Defaults to None. min_batch_size: the minimum batch size to use when batching inputs. max_batch_size: the maximum batch size to use when batching inputs. + large_model: set to true if your model is large enough to run into + memory pressure if you load multiple copies. Given a model that + consumes N memory and a machine with W cores and M memory, you should + set this to True if N*W > M. + kwargs: 'env_vars' can be used to set environment variables + before loading the model. + + **Supported Versions:** RunInference APIs in Apache Beam + supports transformers>=4.18.0. """ self._model_uri = model_uri self._model_class = model_class - self._model_path = model_uri - self._model_config_args = model_config_args if model_config_args else {} + if device == 'GPU': + self._device = torch.device('cuda') + else: + self._device = torch.device('cpu') + self._inference_fn = inference_fn + self._model_config_args = load_model_args if load_model_args else {} self._inference_args = inference_args if inference_args else {} self._batching_kwargs = {} self._env_vars = kwargs.get('env_vars', {}) @@ -119,14 +188,27 @@ def __init__( self._batching_kwargs['min_batch_size'] = min_batch_size if max_batch_size is not None: self._batching_kwargs['max_batch_size'] = max_batch_size + self._large_model = large_model + self._framework = None + + _validate_constructor_args( + model_uri=self._model_uri, model_class=self._model_class) def load_model(self): """Loads and initializes the model for processing.""" - return self._model_class.from_pretrained( + model = self._model_class.from_pretrained( self._model_uri, **self._model_config_args) + if self._device == torch.device('cuda'): + if not torch.cuda.is_available(): + logging.warning( + "Model handler specified a 'GPU' device, " + "but GPUs are not available. 
Switching to CPU.") + self._device = torch.device('cpu') + model.to(self._device) + return model def update_model_path(self, model_path: Optional[str] = None): - self._model_path = model_path if model_path else self._model_path + self._model_path = model_path if model_path else self._model_uri def run_inference( self, @@ -154,15 +236,22 @@ def run_inference( An Iterable of type PredictionResult. """ inference_args = {} if not inference_args else inference_args - # torch.no_grad() mitigates GPU memory issues - # https://github.com/apache/beam/issues/22811 - if isinstance(batch[0], tf.Tensor): - self._framework = "tf" + if not self._framework: + self._framework = "tf" if isinstance(batch[0], tf.Tensor) else "torch" + + if self._inference_fn: + return self._inference_fn( + batch, model, self._device, inference_args, self._model_uri) + + if not self._framework: + self._framework = "tf" if isinstance(batch[0], tf.Tensor) else "torch" + + if self._framework == "tf": return _run_inference_tensorflow_keyed_tensor( - batch, model, inference_args) + batch, model, self._device, inference_args, self._model_uri) else: - self._framework = "torch" - return _run_inference_torch_keyed_tensor(batch, model, inference_args) + return _run_inference_torch_keyed_tensor( + batch, model, self._device, inference_args, self._model_uri) def get_num_bytes( self, batch: Sequence[Union[tf.Tensor, torch.Tensor]]) -> int: @@ -186,6 +275,36 @@ def get_metrics_namespace(self) -> str: def batch_elements_kwargs(self): return self._batching_kwargs + def share_model_across_processes(self) -> bool: + return self._large_model + + +def _default_inference_fn_torch( + batch: Sequence[Union[tf.Tensor, torch.Tensor]], + model: Union[AutoModel, TFAutoModel], + device, + inference_args: Dict[str, Any] = None, + model_id: Optional[str] = None) -> Iterable[PredictionResult]: + # torch.no_grad() mitigates GPU memory issues + # https://github.com/apache/beam/issues/22811 + with torch.no_grad(): + batched_tensors = torch.stack(batch) + batched_tensors = _convert_to_device(batched_tensors, device) + predictions = model(batched_tensors, **inference_args) + + return utils._convert_to_result(batch, predictions, model_id) + + +def _default_inference_fn_tensorflow( + batch: Sequence[Union[tf.Tensor, torch.Tensor]], + model: Union[AutoModel, TFAutoModel], + device, + inference_args: Dict[str, Any], + model_id: Optional[str] = None) -> Iterable[PredictionResult]: + batched_tensors = tf.stack(batch, axis=0) + predictions = model(batched_tensors, **inference_args) + return utils._convert_to_result(batch, predictions, model_id) + class HuggingFaceModelHandlerTensor(ModelHandler[Union[tf.Tensor, torch.Tensor], PredictionResult, @@ -195,10 +314,14 @@ def __init__( self, model_uri: str, model_class: Union[AutoModel, TFAutoModel], - model_config_args: Dict[str, Any] = None, + device: str = 'CPU', + *, + inference_fn: TensorInferenceFn = None, + load_model_args: Optional[Dict[str, Any]] = None, inference_args: Optional[Dict[str, Any]] = None, min_batch_size: Optional[int] = None, max_batch_size: Optional[int] = None, + large_model: bool = False, **kwargs): """Implementation of the ModelHandler interface for HuggingFace with Tensors for PyTorch/Tensorflow backend. @@ -211,9 +334,14 @@ def __init__( model_uri="bert-base-uncased")) Args: - model_uri (str): path to the pretrained model on the - Hugging Face models hub. 
- model_config_args (Dict[str, Any]): keyword arguments to provide load + model_uri (str): path to the pretrained model on the hugging face + models hub. + model_class: model class to load the repository from model_uri. + device: For torch tensors, specify device on which you wish to + run the model. Defaults to CPU. + inference_fn: the inference function to use during RunInference. + Default is _default_inference_fn_tensor. + load_model_args (Dict[str, Any]): keyword arguments to provide load options while loading from Hugging Face Hub. Defaults to None. inference_args (Optional[Dict[str, Any]]): Non-batchable arguments required as inputs to the model's forward() function. Unlike Tensors in @@ -221,24 +349,49 @@ def __init__( Defaults to None. min_batch_size: the minimum batch size to use when batching inputs. max_batch_size: the maximum batch size to use when batching inputs. + large_model: set to true if your model is large enough to run into + memory pressure if you load multiple copies. Given a model that + consumes N memory and a machine with W cores and M memory, you should + set this to True if N*W > M. + kwargs: 'env_vars' can be used to set environment variables + before loading the model. + + **Supported Versions:** RunInference APIs in Apache Beam + supports transformers>=4.18.0. """ self._model_uri = model_uri self._model_class = model_class - self._model_path = model_uri - self._model_config_args = model_config_args if model_config_args else {} + if device == 'GPU': + self._device = torch.device('cuda') + else: + self._device = torch.device('cpu') + self._inference_fn = inference_fn + self._model_config_args = load_model_args if load_model_args else {} self._inference_args = inference_args if inference_args else {} self._batching_kwargs = {} - self._framework = "torch" self._env_vars = kwargs.get('env_vars', {}) if min_batch_size is not None: self._batching_kwargs['min_batch_size'] = min_batch_size if max_batch_size is not None: self._batching_kwargs['max_batch_size'] = max_batch_size + self._large_model = large_model + self._framework = None + + _validate_constructor_args( + model_uri=self._model_uri, model_class=self._model_class) def load_model(self): """Loads and initializes the model for processing.""" - return self._model_class.from_pretrained( + model = self._model_class.from_pretrained( self._model_uri, **self._model_config_args) + if self._device == torch.device('cuda'): + if not torch.cuda.is_available(): + logging.warning( + "Model handler specified a 'GPU' device, " + "but GPUs are not available. Switching to CPU.") + self._device = torch.device('cpu') + model.to(self._device) + return model def update_model_path(self, model_path: Optional[str] = None): self._model_path = model_path if model_path else self._model_path @@ -269,17 +422,18 @@ def run_inference( An Iterable of type PredictionResult. 
""" inference_args = {} if not inference_args else inference_args + if not self._framework: + self._framework = "tf" if isinstance(batch[0], tf.Tensor) else "torch" + if self._inference_fn: + return self._inference_fn( + batch, model, inference_args, inference_args, self._model_uri) - if isinstance(batch[0], tf.Tensor): - self._framework = "tf" - predictions = model(batch, **inference_args) + if self._framework == "tf": + return _default_inference_fn_tensorflow( + batch, model, self._device, inference_args, self._model_uri) else: - # torch.no_grad() mitigates GPU memory issues - # https://github.com/apache/beam/issues/22811 - with torch.no_grad(): - predictions = model(batch, **inference_args) - - return utils._convert_to_result(batch, predictions) + return _default_inference_fn_torch( + batch, model, self._device, inference_args, self._model_uri) def get_num_bytes( self, batch: Sequence[Union[tf.Tensor, torch.Tensor]]) -> int: @@ -303,100 +457,5 @@ def get_metrics_namespace(self) -> str: def batch_elements_kwargs(self): return self._batching_kwargs - -class HuggingFaceModelHandlerNumpy(ModelHandler[numpy.ndarray, - PredictionResult, - Union[AutoModel, TFAutoModel]]): - def __init__( - self, - model_uri: str, - model_class: Union[AutoModel, TFAutoModel], - model_config_args: Dict[str, Any] = None, - inference_args: Optional[Dict[str, Any]] = None, - min_batch_size: Optional[int] = None, - max_batch_size: Optional[int] = None, - **kwargs): - """Implementation of the ModelHandler interface for HuggingFace with - numpy.ndarray for PyTorch/Tensorflow backend. - - Example Usage model:: - pcoll | RunInference(HuggingFaceModelHandlerNumpy( - model_uri="bert-base-uncased")) - - Args: - model_uri (str): path to the pretrained model - on the Hugging Face Models hub. - model_config_args (Dict[str, Any]): keyword arguments to provide load - options while loading from Hugging Face Hub. Defaults to None. - inference_args (Optional[Dict[str, Any]]): Non-batchable arguments - required as inputs to the model's forward() function. Unlike numpy - ndarray in `batch`, these parameters will not be dynamically batched. - Defaults to None. - min_batch_size: the minimum batch size to use when batching inputs. - max_batch_size: the maximum batch size to use when batching inputs. - """ - self._model_uri = model_uri - self._model_class = model_class - self._model_path = model_uri - self._model_config_args = model_config_args if model_config_args else {} - self._inference_args = inference_args if inference_args else {} - self._batching_kwargs = {} - self._env_vars = kwargs.get('env_vars', {}) - if min_batch_size is not None: - self._batching_kwargs['min_batch_size'] = min_batch_size - if max_batch_size is not None: - self._batching_kwargs['max_batch_size'] = max_batch_size - - def load_model(self): - """Loads and initializes the model for processing.""" - return self._model_class.from_pretrained( - self._model_uri, **self._model_config_args) - - def update_model_path(self, model_path: Optional[str] = None): - self._model_path = model_path if model_path else self._model_path - - def run_inference( - self, - batch: Sequence[numpy.ndarray], - model: Union[AutoModel, TFAutoModel], - inference_args: Optional[Dict[str, Any]] = None - ) -> Iterable[PredictionResult]: - """ - Runs inferences on a batch of numpy ndarray and returns an Iterable of - numpy Predictions. - - This method stacks the list of numpy ndarray in a vectorized format to - optimize the inference call. - - Args: - batch: A sequence of numpy ndarray. 
These arrays should be batchable, as - this method will call `numpy.stack()` and pass in batched arrays with - dimensions (batch_size, n_features, etc.) into the model's - predict() function. - model: A pretrained model compatible with numpy input. - inference_args: Non-batchable arguments required as inputs to the model's - forward() function. Unlike Tensors in `batch`, these parameters will - not be dynamically batched - Returns: - An Iterable of type PredictionResult. - """ - inference_args = {} if not inference_args else inference_args - predictions = model(batch, **inference_args) - return utils._convert_to_result(batch, predictions) - - def get_num_bytes(self, batch: Sequence[numpy.ndarray]) -> int: - """ - Returns: - The number of bytes of data for a batch. - """ - return sum(sys.getsizeof(element) for element in batch) - - def get_metrics_namespace(self) -> str: - """ - Returns: - A namespace for metrics collected by the RunInference transform. - """ - return 'BeamML_HuggingFaceModelHandler_Numpy' - - def batch_elements_kwargs(self): - return self._batching_kwargs + def share_model_across_processes(self) -> bool: + return self._large_model diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference_test.py b/sdks/python/apache_beam/ml/inference/huggingface_inference_test.py new file mode 100644 index 0000000000000..cc84bf219fc11 --- /dev/null +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference_test.py @@ -0,0 +1,129 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import pytest +import shutil +import tempfile +from typing import Any +from typing import Dict +from typing import Iterable +from typing import Optional +from typing import Sequence +from typing import Union +import unittest + +import tensorflow as tf +import torch +from transformers import AutoModel +from transformers import TFAutoModel + +from apache_beam.ml.inference import utils +from apache_beam.ml.inference.base import PredictionResult +from apache_beam.ml.inference.huggingface_inference import HuggingFaceModelHandlerTensor +from apache_beam.ml.inference.tensorflow_inference_test import _compare_tensor_prediction_result +from apache_beam.ml.inference.tensorflow_inference_test import FakeTFTensorModel + + +def fake_inference_fn_tensor( + batch: Sequence[Union[tf.Tensor, torch.Tensor]], + model: Union[AutoModel, TFAutoModel], + device, + inference_args: Dict[str, Any], + model_id: Optional[str] = None) -> Iterable[PredictionResult]: + predictions = model.predict(batch, **inference_args) + return utils._convert_to_result(batch, predictions, model_id) + + +class FakeTorchModel: + def predict(self, input: torch.Tensor): + return input + + +@pytest.mark.uses_transformers +class HuggingFaceInferenceTest(unittest.TestCase): + def setUp(self) -> None: + self.tmpdir = tempfile.mkdtemp() + + def tearDown(self) -> None: + shutil.rmtree(self.tmpdir) + + def test_predict_tensor(self): + fake_model = FakeTFTensorModel() + inference_runner = HuggingFaceModelHandlerTensor( + model_uri='unused', + model_class=TFAutoModel, + inference_fn=fake_inference_fn_tensor) + batched_examples = [tf.constant([1]), tf.constant([10]), tf.constant([100])] + expected_predictions = [ + PredictionResult(ex, pred) for ex, + pred in zip( + batched_examples, + [tf.math.multiply(n, 10) for n in batched_examples]) + ] + + inferences = inference_runner.run_inference(batched_examples, fake_model) + for actual, expected in zip(inferences, expected_predictions): + self.assertTrue(_compare_tensor_prediction_result(actual, expected)) + + def test_predict_tensor_with_inference_args(self): + fake_model = FakeTFTensorModel() + inference_runner = HuggingFaceModelHandlerTensor( + model_uri='unused', + model_class=TFAutoModel, + inference_fn=fake_inference_fn_tensor, + inference_args={"add": True}) + batched_examples = [tf.constant([1]), tf.constant([10]), tf.constant([100])] + expected_predictions = [ + PredictionResult(ex, pred) for ex, + pred in zip( + batched_examples, [ + tf.math.add(tf.math.multiply(n, 10), 10) + for n in batched_examples + ]) + ] + + inferences = inference_runner.run_inference( + batched_examples, fake_model, inference_args={"add": True}) + + for actual, expected in zip(inferences, expected_predictions): + self.assertTrue(_compare_tensor_prediction_result(actual, expected)) + + def test_framework_detection_torch(self): + fake_model = FakeTorchModel() + inference_runner = HuggingFaceModelHandlerTensor( + model_uri='unused', + model_class=TFAutoModel, + inference_fn=fake_inference_fn_tensor) + batched_examples = [torch.tensor(1), torch.tensor(10), torch.tensor(100)] + inference_runner.run_inference(batched_examples, fake_model) + self.assertEqual(inference_runner._framework, "torch") + + def test_framework_detection_tensorflow(self): + fake_model = FakeTFTensorModel() + inference_runner = HuggingFaceModelHandlerTensor( + model_uri='unused', + model_class=TFAutoModel, + inference_fn=fake_inference_fn_tensor, + inference_args={"add": True}) + batched_examples = [tf.constant([1]), tf.constant([10]), 
tf.constant([100])] + inference_runner.run_inference( + batched_examples, fake_model, inference_args={"add": True}) + self.assertEqual(inference_runner._framework, "tf") + + +if __name__ == '__main__': + unittest.main() From d5e1cf3dc6b62fc2be52ac8ab89ac8b497964e43 Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Tue, 27 Jun 2023 10:54:49 -0400 Subject: [PATCH 08/26] add test setup for hugging face tests --- .../apache_beam/examples/inference/README.md | 73 +++++++++++++++++++ .../ml/inference/huggingface_inference.py | 3 +- .../huggingface_inference_it_test.py | 2 + .../inference/huggingface_inference_test.py | 18 +++-- sdks/python/pytest.ini | 13 ++-- sdks/python/test-suites/direct/common.gradle | 30 +++++++- sdks/python/test-suites/tox/py38/build.gradle | 13 ++++ sdks/python/tox.ini | 19 +++++ 8 files changed, 157 insertions(+), 14 deletions(-) diff --git a/sdks/python/apache_beam/examples/inference/README.md b/sdks/python/apache_beam/examples/inference/README.md index 326ec4b4a0968..1653f3a9699ff 100644 --- a/sdks/python/apache_beam/examples/inference/README.md +++ b/sdks/python/apache_beam/examples/inference/README.md @@ -42,6 +42,7 @@ The RunInference API supports the Tensorflow framework. To use Tensorflow locall pip install tensorflow==2.12.0 ``` + ### PyTorch dependencies The following installation requirements are for the files used in these examples. @@ -65,6 +66,21 @@ For installation of the `torch` dependency on a distributed runner such as Dataf [PyPI dependency instructions](https://beam.apache.org/documentation/sdks/python-pipeline-dependencies/#pypi-dependencies). +### Transformers dependencies + +The following installation requirement is for the Hugging Face model handler examples. + +The RunInference API supports loading models from the Hugging Face Hub. To use it, first install `transformers`. +``` +pip install transformers==4.30.0 +``` +Additional dependicies for PyTorch and TensorFlow may need to be installed separately: +``` +pip install tensorflow==2.12.0 +pip install torch==1.10.0 +``` + + ### TensorRT dependencies The RunInference API supports TensorRT SDK for high-performance deep learning inference with NVIDIA GPUs. @@ -687,3 +703,60 @@ MilkQualityAggregation(bad_quality_measurements=6, medium_quality_measurements=4 MilkQualityAggregation(bad_quality_measurements=3, medium_quality_measurements=3, high_quality_measurements=3) MilkQualityAggregation(bad_quality_measurements=1, medium_quality_measurements=2, high_quality_measurements=1) ``` + +--- +## Language modeling with Hugging Face Hub + +[`huggingface_language_modeling.py`](./huggingface_language_modeling.py) contains an implementation for a RunInference pipeline that performs masked language modeling (that is, decoding a masked token in a sentence) using the `AutoModelForMaskedLM` architecture from Hugging Face. + +The pipeline reads sentences, performs basic preprocessing to convert the last word into a `` token, passes the masked sentence to the Hugging Face implementation of RunInference, and then writes the predictions to a text file. + +### Dataset and model for language modeling + +To use this transform, you need a dataset and model for language modeling. + +1. Choose a checkpoint to load from Hugging Face Hub, eg:[MaskedLanguageModel](https://huggingface.co/stevhliu/my_awesome_eli5_mlm_model). +2. (Optional) Create a file named `SENTENCES.txt` that contains sentences to feed into the model. The content of the file should be similar to the following example: +``` +The capital of France is Paris . 
+He looked up and saw the sun and stars . +... +``` + +### Running `huggingface_language_modeling.py` + +To run the language modeling pipeline locally, use the following command: +```sh +python -m apache_beam.examples.inference.huggingface_language_modeling \ + --input SENTENCES \ + --output OUTPUT \ + --model_name REPOSITORY_ID +``` +The `input` argument is optional. If none is provided, it will run the pipeline with some +example sentences. + +For example, if you've followed the naming conventions recommended above: +```sh +python -m apache_beam.examples.inference.huggingface_language_modeling \ + --input SENTENCES.txt \ + --output predictions.csv \ + --model_name "stevhliu/my_awesome_eli5_mlm_model" +``` +Or, using the default example sentences: +```sh +python -m apache_beam.examples.inference.huggingface_language_modeling \ + --output predictions.csv \ + --model_name "stevhliu/my_awesome_eli5_mlm_model" +``` + +This writes the output to the `predictions.csv` with contents like: +``` +The capital of France is Paris .;paris +He looked up and saw the sun and stars .;moon +... +``` +Each line has data separated by a semicolon ";". +The first item is the input sentence. The model masks the last word and tries to predict it; +the second item is the word that the model predicts for the mask. + +--- \ No newline at end of file diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index 7d2e22f106c74..0f4085e8992a5 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -15,6 +15,8 @@ # limitations under the License. # +# pytype: skip-file + from collections import defaultdict import logging import sys @@ -30,7 +32,6 @@ import torch from transformers import AutoModel from transformers import TFAutoModel - from apache_beam.ml.inference.base import ModelHandler from apache_beam.ml.inference.base import PredictionResult from apache_beam.ml.inference import utils diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py b/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py index 37a67ff979644..06530330c753b 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py @@ -20,6 +20,7 @@ import logging import unittest import uuid + import pytest from apache_beam.examples.inference import huggingface_language_modeling @@ -28,6 +29,7 @@ from apache_beam.testing.test_pipeline import TestPipeline +@pytest.mark.uses_transformers @pytest.mark.it_postcommit class HuggingFaceInference(unittest.TestCase): @pytest.mark.timeout(1800) diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference_test.py b/sdks/python/apache_beam/ml/inference/huggingface_inference_test.py index cc84bf219fc11..03c0d752c6108 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference_test.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference_test.py @@ -15,6 +15,8 @@ # limitations under the License. 
# +# pytype: skip-file + import pytest import shutil import tempfile @@ -26,17 +28,21 @@ from typing import Union import unittest -import tensorflow as tf -import torch -from transformers import AutoModel -from transformers import TFAutoModel - from apache_beam.ml.inference import utils from apache_beam.ml.inference.base import PredictionResult -from apache_beam.ml.inference.huggingface_inference import HuggingFaceModelHandlerTensor from apache_beam.ml.inference.tensorflow_inference_test import _compare_tensor_prediction_result from apache_beam.ml.inference.tensorflow_inference_test import FakeTFTensorModel +# pylint: disable=ungrouped-imports +try: + import tensorflow as tf + import torch + from transformers import AutoModel + from transformers import TFAutoModel + from apache_beam.ml.inference.huggingface_inference import HuggingFaceModelHandlerTensor +except ImportError: + raise unittest.SkipTest('Transformers dependencies are not installed.') + def fake_inference_fn_tensor( batch: Sequence[Union[tf.Tensor, torch.Tensor]], diff --git a/sdks/python/pytest.ini b/sdks/python/pytest.ini index 6e93c5f96e7fa..7c564235c581c 100644 --- a/sdks/python/pytest.ini +++ b/sdks/python/pytest.ini @@ -47,15 +47,16 @@ markers = # as enabling save_main_session. no_xdist: run without pytest-xdist plugin # We run these tests with multiple major pyarrow versions (BEAM-11211) - uses_pyarrow: tests that utilize pyarrow in some way + uses_pyarrow: tests that utilize pyarrow in some way. # ML tests - uses_pytorch: tests that utilize pytorch in some way - uses_sklearn: tests that utilize scikit-learn in some way - uses_tensorflow: tests that utilize tensorflow in some way + uses_pytorch: tests that utilize pytorch in some way. + uses_sklearn: tests that utilize scikit-learn in some way. + uses_tensorflow: tests that utilize tensorflow in some way. uses_tft: tests that utilizes tensorflow transforms in some way. - uses_xgboost: tests that utilize xgboost in some way + uses_xgboost: tests that utilize xgboost in some way. uses_onnx: tests that utilizes onnx in some way. - uses_tf: tests that utilize tensorflow + uses_tf: tests that utilize tensorflow. + uses_transformers: tests that utilize transformers in some way. # Default timeout intended for unit tests. # If certain tests need a different value, please see the docs on how to diff --git a/sdks/python/test-suites/direct/common.gradle b/sdks/python/test-suites/direct/common.gradle index 27e91b4733dd5..aebdb4cfa009f 100644 --- a/sdks/python/test-suites/direct/common.gradle +++ b/sdks/python/test-suites/direct/common.gradle @@ -337,13 +337,41 @@ task xgboostInferenceTest { } +// Transformers RunInference IT tests +task transformersInferenceTest { + dependsOn 'installGcpTest' + dependsOn ':sdks:python:sdist' + def requirementsFile = "${rootDir}/sdks/python/apache_beam/ml/inference/huggingface_tests_requirements.txt" + doFirst { + exec { + executable 'sh' + args '-c', ". ${envdir}/bin/activate && pip install -r $requirementsFile" + } + } + doLast { + def testOpts = basicTestOpts + def argMap = [ + "test_opts": testOpts, + "suite": "postCommitIT-direct-py${pythonVersionSuffix}", + "collect": "uses_transformers and it_postcommit" , + "runner": "TestDirectRunner" + ] + def cmdArgs = mapToArgString(argMap) + exec { + executable 'sh' + args '-c', ". ${envdir}/bin/activate && ${runScriptsDir}/run_integration_test.sh $cmdArgs" + } + } +} + // Add all the RunInference framework IT tests to this gradle task that runs on Direct Runner Post commit suite. 
project.tasks.register("inferencePostCommitIT") { dependsOn = [ 'torchInferenceTest', 'sklearnInferenceTest', 'tensorflowInferenceTest', - 'xgboostInferenceTest' + 'xgboostInferenceTest', + 'transformersInferenceTest' // (TODO) https://github.com/apache/beam/issues/25799 // uncomment tfx bsl tests once tfx supports protobuf 4.x // 'tfxInferenceTest', diff --git a/sdks/python/test-suites/tox/py38/build.gradle b/sdks/python/test-suites/tox/py38/build.gradle index a96e1c5c9f6b3..43ce2e0801d4f 100644 --- a/sdks/python/test-suites/tox/py38/build.gradle +++ b/sdks/python/test-suites/tox/py38/build.gradle @@ -126,6 +126,19 @@ toxTask "testPy38tensorflow-212", "py38-tensorflow-212", "${posargs}" test.dependsOn "testPy38tensorflow-212" preCommitPyCoverage.dependsOn "testPy38tensorflow-212" +// Create a test task for each minor version of transformers +toxTask "testPy38transformers-428", "py38-transformers-428", "${posargs}" +test.dependsOn "testPy38transformers-428" +preCommitPyCoverage.dependsOn "testPy38transformers-428" + +toxTask "testPy38transformers-429", "py38-transformers-429", "${posargs}" +test.dependsOn "testPy38transformers-429" +preCommitPyCoverage.dependsOn "testPy38transformers-429" + +toxTask "testPy38transformers-430", "py38-transformers-430", "${posargs}" +test.dependsOn "testPy38transformers-430" +preCommitPyCoverage.dependsOn "testPy38transformers-430" + toxTask "whitespacelint", "whitespacelint", "${posargs}" task archiveFilesToLint(type: Zip) { diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index b2f784aada505..499beed2967af 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -163,6 +163,7 @@ deps = torch xgboost datatable==1.0.0 + transformers commands = time {toxinidir}/scripts/generate_pydoc.sh @@ -400,3 +401,21 @@ commands = # Run all XGBoost unit tests # Allow exit code 5 (no tests run) so that we can run this command safely on arbitrary subdirectories. /bin/sh -c 'pytest -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml -n 6 -m uses_xgboost {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' + +[testenv:py{38,39,310}-transformers-{428,429,430}] +deps = + -r build-requirements.txt + 428: transformers>=4.28.0,<4.29.0 + 429: transformers>=4.29.0,<4.30.0 + 430: transformers>=4.30.0,<4.31.0 + torch>=1.9.0<1.14.0 + tensorflow==2.12.0 +extras = test,gcp +commands = + # Log transformers and its dependencies version for debugging + /bin/sh -c "pip freeze | grep -E transformers" + /bin/sh -c "pip freeze | grep -E torch" + /bin/sh -c "pip freeze | grep -E tensorflow" + # Run all Transformers unit tests + # Allow exit code 5 (no tests run) so that we can run this command safely on arbitrary subdirectories. 
+ /bin/sh -c 'pytest -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml -n 6 -m uses_transformers {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' \ No newline at end of file From 4177c096d919b89991df8a354747d4f0111ad821 Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Tue, 27 Jun 2023 12:57:18 -0400 Subject: [PATCH 09/26] fix lints --- .../inference/huggingface_language_modeling.py | 8 +++----- .../apache_beam/ml/inference/huggingface_inference.py | 6 +++--- .../ml/inference/huggingface_inference_it_test.py | 11 +++++++++-- .../ml/inference/huggingface_inference_test.py | 7 ++++--- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py b/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py index 92eb801c29667..19d0d3f255d3c 100644 --- a/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py +++ b/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py @@ -32,19 +32,17 @@ from typing import Iterator from typing import Tuple -import torch - import apache_beam as beam -from apache_beam.ml.inference.huggingface_inference import HuggingFaceModelHandlerKeyedTensor +import torch from apache_beam.ml.inference.base import KeyedModelHandler from apache_beam.ml.inference.base import PredictionResult from apache_beam.ml.inference.base import RunInference +from apache_beam.ml.inference.huggingface_inference import HuggingFaceModelHandlerKeyedTensor from apache_beam.options.pipeline_options import PipelineOptions from apache_beam.options.pipeline_options import SetupOptions from apache_beam.runners.runner import PipelineResult - -from transformers import AutoTokenizer from transformers import AutoModelForMaskedLM +from transformers import AutoTokenizer def add_mask_to_last_word(text: str) -> Tuple[str, str]: diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index 0f4085e8992a5..8d10b2df7545d 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -17,9 +17,9 @@ # pytype: skip-file -from collections import defaultdict import logging import sys +from collections import defaultdict from typing import Any from typing import Callable from typing import Dict @@ -30,12 +30,12 @@ import tensorflow as tf import torch -from transformers import AutoModel -from transformers import TFAutoModel from apache_beam.ml.inference.base import ModelHandler from apache_beam.ml.inference.base import PredictionResult from apache_beam.ml.inference import utils from apache_beam.ml.inference.pytorch_inference import _convert_to_device +from transformers import AutoModel +from transformers import TFAutoModel __all__ = [ 'HuggingFaceModelHandlerTensor', diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py b/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py index 06530330c753b..6aefb533327b8 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py @@ -23,11 +23,18 @@ import pytest -from apache_beam.examples.inference import huggingface_language_modeling from apache_beam.io.filesystems import FileSystems -from apache_beam.ml.inference import pytorch_inference_it_test from apache_beam.testing.test_pipeline import TestPipeline +try: + from 
apache_beam.examples.inference import huggingface_language_modeling + from apache_beam.ml.inference import pytorch_inference_it_test +except ImportError: + raise unittest.SkipTest( + "transformers dependencies are not installed. " + "Check if transformers, torch, and tensorflow " + "is installed.") + @pytest.mark.uses_transformers @pytest.mark.it_postcommit diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference_test.py b/sdks/python/apache_beam/ml/inference/huggingface_inference_test.py index 03c0d752c6108..a7f52dace73ad 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference_test.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference_test.py @@ -17,21 +17,22 @@ # pytype: skip-file -import pytest import shutil import tempfile +import unittest from typing import Any from typing import Dict from typing import Iterable from typing import Optional from typing import Sequence from typing import Union -import unittest + +import pytest from apache_beam.ml.inference import utils +from apache_beam.ml.inference.tensorflow_inference_test import FakeTFTensorModel from apache_beam.ml.inference.base import PredictionResult from apache_beam.ml.inference.tensorflow_inference_test import _compare_tensor_prediction_result -from apache_beam.ml.inference.tensorflow_inference_test import FakeTFTensorModel # pylint: disable=ungrouped-imports try: From 6324752b029a5879af95f3050022df4ebec737f7 Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Tue, 27 Jun 2023 16:28:41 -0400 Subject: [PATCH 10/26] fix import order --- sdks/python/apache_beam/ml/inference/huggingface_inference.py | 2 +- .../apache_beam/ml/inference/huggingface_inference_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index 8d10b2df7545d..1024700ae954f 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -30,9 +30,9 @@ import tensorflow as tf import torch +from apache_beam.ml.inference import utils from apache_beam.ml.inference.base import ModelHandler from apache_beam.ml.inference.base import PredictionResult -from apache_beam.ml.inference import utils from apache_beam.ml.inference.pytorch_inference import _convert_to_device from transformers import AutoModel from transformers import TFAutoModel diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference_test.py b/sdks/python/apache_beam/ml/inference/huggingface_inference_test.py index a7f52dace73ad..763d5ee8d36fc 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference_test.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference_test.py @@ -30,8 +30,8 @@ import pytest from apache_beam.ml.inference import utils -from apache_beam.ml.inference.tensorflow_inference_test import FakeTFTensorModel from apache_beam.ml.inference.base import PredictionResult +from apache_beam.ml.inference.tensorflow_inference_test import FakeTFTensorModel from apache_beam.ml.inference.tensorflow_inference_test import _compare_tensor_prediction_result # pylint: disable=ungrouped-imports From 30029d36eef9fcd0155fa217e0ad6999e5c139c4 Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Wed, 28 Jun 2023 11:15:18 -0400 Subject: [PATCH 11/26] refactor, doc, lints --- .../huggingface_language_modeling.py | 2 +- .../ml/inference/huggingface_inference.py | 291 ++++++++---------- 
.../huggingface_inference_it_test.py | 6 +- sdks/python/tox.ini | 2 +- 4 files changed, 139 insertions(+), 162 deletions(-) diff --git a/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py b/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py index 19d0d3f255d3c..39cd5cc302d82 100644 --- a/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py +++ b/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py @@ -167,7 +167,7 @@ def run( text_and_tokenized_text_tuple | 'RunInference' >> RunInference(KeyedModelHandler(model_handler)) | 'ProcessOutput' >> beam.ParDo(PostProcessor(tokenizer=tokenizer))) - output | "WriteOutput" >> beam.io.WriteToText( # pylint: disable=expression-not-assigned + _ = output | "WriteOutput" >> beam.io.WriteToText( known_args.output, shard_name_template='', append_trailing_newlines=True) result = pipeline.run() diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index 1024700ae954f..13935a1504d6f 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -17,6 +17,7 @@ # pytype: skip-file +from abc import ABC import logging import sys from collections import defaultdict @@ -31,13 +32,17 @@ import tensorflow as tf import torch from apache_beam.ml.inference import utils +from apache_beam.ml.inference.base import ExampleT from apache_beam.ml.inference.base import ModelHandler +from apache_beam.ml.inference.base import ModelT from apache_beam.ml.inference.base import PredictionResult +from apache_beam.ml.inference.base import PredictionT from apache_beam.ml.inference.pytorch_inference import _convert_to_device from transformers import AutoModel from transformers import TFAutoModel __all__ = [ + 'HuggingFaceModelHandler', 'HuggingFaceModelHandlerTensor', 'HuggingFaceModelHandlerKeyedTensor', ] @@ -78,7 +83,7 @@ def _validate_constructor_args(model_uri, model_class): def _run_inference_torch_keyed_tensor( - batch: Sequence[Dict[str, Union[tf.Tensor, torch.Tensor]]], + batch: Sequence[Dict[str, torch.Tensor]], model: AutoModel, device, inference_args: Dict[str, Any], @@ -95,12 +100,12 @@ def _run_inference_torch_keyed_tensor( batched_tensors = torch.stack(key_to_tensor_list[key]) batched_tensors = _convert_to_device(batched_tensors, device) key_to_batched_tensors[key] = batched_tensors - return utils._convert_to_result( - batch, model(**key_to_batched_tensors, **inference_args)) + predictions = model(**key_to_batched_tensors, **inference_args) + return utils._convert_to_result(batch, predictions, model_id) def _run_inference_tensorflow_keyed_tensor( - batch: Sequence[Dict[str, Union[tf.Tensor, torch.Tensor]]], + batch: Sequence[Dict[str, tf.Tensor]], model: TFAutoModel, device, inference_args: Dict[str, Any], @@ -111,41 +116,35 @@ def _run_inference_tensorflow_keyed_tensor( key_to_tensor_list[key].append(tensor) key_to_batched_tensors = {} for key in key_to_tensor_list: - batched_tensors = torch.stack(key_to_tensor_list[key]) - batched_tensors = key_to_tensor_list[key] + batched_tensors = tf.stack(key_to_tensor_list[key], axis=0) key_to_batched_tensors[key] = batched_tensors - return utils._convert_to_result( - batch, model(**key_to_batched_tensors, **inference_args)) + predictions = model(**key_to_batched_tensors, **inference_args) + return utils._convert_to_result(batch, predictions, model_id) -class 
HuggingFaceModelHandlerKeyedTensor(ModelHandler[Dict[str, - Union[tf.Tensor, - torch.Tensor]], - PredictionResult, - Union[AutoModel, - TFAutoModel]]): +class HuggingFaceModelHandler(ModelHandler[ExampleT, PredictionT, ModelT], ABC): def __init__( self, model_uri: str, model_class: Union[AutoModel, TFAutoModel], device: str = 'CPU', *, - inference_fn: KeyedTensorInferenceFn = None, + inference_fn: Union[ + KeyedTensorInferenceFn, + TensorInferenceFn] = _run_inference_torch_keyed_tensor, load_model_args: Optional[Dict[str, Any]] = None, inference_args: Optional[Dict[str, Any]] = None, min_batch_size: Optional[int] = None, max_batch_size: Optional[int] = None, large_model: bool = False, **kwargs): - """Implementation of the ModelHandler interface for HuggingFace with - Keyed Tensors for PyTorch/Tensorflow backend. - - Depending on the type of tensors, - the model framework is determined automatically. + """Implementation of the abstract base class of ModelHandler interface + for Hugging Face. This class shouldn't be instantiated directly. + Use HuggingFaceModelHandlerKeyedTensor or HuggingFaceModelHandlerTensor. - Example Usage model:: - pcoll | RunInference(HuggingFaceModelHandlerKeyedTensor( - model_uri="bert-base-uncased")) + Example Usage model:: + pcoll | RunInference(HuggingFaceModelHandlerKeyedTensor( + model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) Args: model_uri (str): path to the pretrained model on the hugging face @@ -158,7 +157,7 @@ def __init__( _run_inference_tensorflow_keyed_tensor depending on the input type. load_model_args (Dict[str, Any]): keyword arguments to provide load options while loading from Hugging Face Hub. Defaults to None. - inference_args (Optional[Dict[str, Any]]): Non-batchable arguments + inference_args [Dict[str, Any]]: Non-batchable arguments required as inputs to the model's forward() function. Unlike Tensors in `batch`, these parameters will not be dynamically batched. Defaults to None. @@ -190,7 +189,7 @@ def __init__( if max_batch_size is not None: self._batching_kwargs['max_batch_size'] = max_batch_size self._large_model = large_model - self._framework = None + self._framework = "" _validate_constructor_args( model_uri=self._model_uri, model_class=self._model_class) @@ -209,8 +208,68 @@ def load_model(self): return model def update_model_path(self, model_path: Optional[str] = None): - self._model_path = model_path if model_path else self._model_uri + self._model_uri = model_path if model_path else self._model_uri + + def get_num_bytes( + self, batch: Sequence[Union[tf.Tensor, torch.Tensor]]) -> int: + """ + Returns: + The number of bytes of data for the Tensors batch. + """ + if self._framework == "tf": + return sum(sys.getsizeof(element) for element in batch) + else: + return sum( + (el.element_size() for tensor in batch for el in tensor.values())) + + def batch_elements_kwargs(self): + return self._batching_kwargs + def share_model_across_processes(self) -> bool: + return self._large_model + + +class HuggingFaceModelHandlerKeyedTensor( + HuggingFaceModelHandler[Dict[str, Union[tf.Tensor, torch.Tensor]], + PredictionResult, + Union[AutoModel, TFAutoModel]]): + """Implementation of the ModelHandler interface for HuggingFace with + Keyed Tensors for PyTorch/Tensorflow backend. + + Depending on the type of tensors, + the model framework is determined automatically. 
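(Editorial aside, not part of the patch: the automatic framework selection described above amounts to a type check on the incoming batch. A minimal standalone sketch is below; the helper name `detect_framework` and the example tensors are invented for illustration.)

```python
# Illustrative only: mirrors the isinstance() check run_inference performs on
# the first element of a batch to decide between TensorFlow and PyTorch.
import tensorflow as tf
import torch


def detect_framework(batch):
    return "tf" if isinstance(batch[0], tf.Tensor) else "torch"


print(detect_framework([tf.constant([1, 2, 3])]))   # -> "tf"
print(detect_framework([torch.tensor([1, 2, 3])]))  # -> "torch"
```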
+ + Example Usage model:: + pcoll | RunInference(HuggingFaceModelHandlerKeyedTensor( + model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) + + Args: + model_uri (str): path to the pretrained model on the hugging face + models hub. + model_class: model class to load the repository from model_uri. + device: For torch tensors, specify device on which you wish to + run the model. Defaults to CPU. + inference_fn: the inference function to use during RunInference. + Default is _run_inference_torch_keyed_tensor or + _run_inference_tensorflow_keyed_tensor depending on the input type. + load_model_args (Dict[str, Any]): keyword arguments to provide load + options while loading from Hugging Face Hub. Defaults to None. + inference_args ([Dict[str, Any]]): Non-batchable arguments + required as inputs to the model's forward() function. Unlike Tensors in + `batch`, these parameters will not be dynamically batched. + Defaults to None. + min_batch_size: the minimum batch size to use when batching inputs. + max_batch_size: the maximum batch size to use when batching inputs. + large_model: set to true if your model is large enough to run into + memory pressure if you load multiple copies. Given a model that + consumes N memory and a machine with W cores and M memory, you should + set this to True if N*W > M. + kwargs: 'env_vars' can be used to set environment variables + before loading the model. + + **Supported Versions:** RunInference APIs in Apache Beam + supports transformers>=4.18.0. + """ def run_inference( self, batch: Sequence[Dict[str, Union[tf.Tensor, torch.Tensor]]], @@ -240,13 +299,12 @@ def run_inference( if not self._framework: self._framework = "tf" if isinstance(batch[0], tf.Tensor) else "torch" - if self._inference_fn: + # default is always torch keyed tensor. We check if user has provided their + # own or we move to infer it with input type. + if self._inference_fn != _run_inference_torch_keyed_tensor: return self._inference_fn( batch, model, self._device, inference_args, self._model_uri) - if not self._framework: - self._framework = "tf" if isinstance(batch[0], tf.Tensor) else "torch" - if self._framework == "tf": return _run_inference_tensorflow_keyed_tensor( batch, model, self._device, inference_args, self._model_uri) @@ -254,18 +312,6 @@ def run_inference( return _run_inference_torch_keyed_tensor( batch, model, self._device, inference_args, self._model_uri) - def get_num_bytes( - self, batch: Sequence[Union[tf.Tensor, torch.Tensor]]) -> int: - """ - Returns: - The number of bytes of data for the Tensors batch. 
- """ - if self._framework == "tf": - return sum(sys.getsizeof(element) for element in batch) - else: - return sum( - (el.element_size() for tensor in batch for el in tensor.values())) - def get_metrics_namespace(self) -> str: """ Returns: @@ -273,12 +319,6 @@ def get_metrics_namespace(self) -> str: """ return 'BeamML_HuggingFaceModelHandler_KeyedTensor' - def batch_elements_kwargs(self): - return self._batching_kwargs - - def share_model_across_processes(self) -> bool: - return self._large_model - def _default_inference_fn_torch( batch: Sequence[Union[tf.Tensor, torch.Tensor]], @@ -307,96 +347,48 @@ def _default_inference_fn_tensorflow( return utils._convert_to_result(batch, predictions, model_id) -class HuggingFaceModelHandlerTensor(ModelHandler[Union[tf.Tensor, torch.Tensor], - PredictionResult, - Union[AutoModel, - TFAutoModel]]): - def __init__( - self, - model_uri: str, - model_class: Union[AutoModel, TFAutoModel], - device: str = 'CPU', - *, - inference_fn: TensorInferenceFn = None, - load_model_args: Optional[Dict[str, Any]] = None, - inference_args: Optional[Dict[str, Any]] = None, - min_batch_size: Optional[int] = None, - max_batch_size: Optional[int] = None, - large_model: bool = False, - **kwargs): - """Implementation of the ModelHandler interface for HuggingFace with - Tensors for PyTorch/Tensorflow backend. - - Depending on the type of tensors, - the model framework is determined automatically. - - Example Usage model: - pcoll | RunInference(HuggingFaceModelHandlerTensor( - model_uri="bert-base-uncased")) - - Args: - model_uri (str): path to the pretrained model on the hugging face - models hub. - model_class: model class to load the repository from model_uri. - device: For torch tensors, specify device on which you wish to - run the model. Defaults to CPU. - inference_fn: the inference function to use during RunInference. - Default is _default_inference_fn_tensor. - load_model_args (Dict[str, Any]): keyword arguments to provide load - options while loading from Hugging Face Hub. Defaults to None. - inference_args (Optional[Dict[str, Any]]): Non-batchable arguments - required as inputs to the model's forward() function. Unlike Tensors in - `batch`, these parameters will not be dynamically batched. - Defaults to None. - min_batch_size: the minimum batch size to use when batching inputs. - max_batch_size: the maximum batch size to use when batching inputs. - large_model: set to true if your model is large enough to run into - memory pressure if you load multiple copies. Given a model that - consumes N memory and a machine with W cores and M memory, you should - set this to True if N*W > M. - kwargs: 'env_vars' can be used to set environment variables - before loading the model. - - **Supported Versions:** RunInference APIs in Apache Beam - supports transformers>=4.18.0. 
- """ - self._model_uri = model_uri - self._model_class = model_class - if device == 'GPU': - self._device = torch.device('cuda') - else: - self._device = torch.device('cpu') - self._inference_fn = inference_fn - self._model_config_args = load_model_args if load_model_args else {} - self._inference_args = inference_args if inference_args else {} - self._batching_kwargs = {} - self._env_vars = kwargs.get('env_vars', {}) - if min_batch_size is not None: - self._batching_kwargs['min_batch_size'] = min_batch_size - if max_batch_size is not None: - self._batching_kwargs['max_batch_size'] = max_batch_size - self._large_model = large_model - self._framework = None - - _validate_constructor_args( - model_uri=self._model_uri, model_class=self._model_class) - - def load_model(self): - """Loads and initializes the model for processing.""" - model = self._model_class.from_pretrained( - self._model_uri, **self._model_config_args) - if self._device == torch.device('cuda'): - if not torch.cuda.is_available(): - logging.warning( - "Model handler specified a 'GPU' device, " - "but GPUs are not available. Switching to CPU.") - self._device = torch.device('cpu') - model.to(self._device) - return model - - def update_model_path(self, model_path: Optional[str] = None): - self._model_path = model_path if model_path else self._model_path - +class HuggingFaceModelHandlerTensor(HuggingFaceModelHandler[Union[tf.Tensor, + torch.Tensor], + PredictionResult, + Union[AutoModel, + TFAutoModel]] + ): + """Implementation of the ModelHandler interface for HuggingFace with + Tensors for PyTorch/Tensorflow backend. + + Depending on the type of tensors, + the model framework is determined automatically. + + Example Usage model: + pcoll | RunInference(HuggingFaceModelHandlerTensor( + model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) + + Args: + model_uri (str): path to the pretrained model on the hugging face + models hub. + model_class: model class to load the repository from model_uri. + device: For torch tensors, specify device on which you wish to + run the model. Defaults to CPU. + inference_fn: the inference function to use during RunInference. + Default is _default_inference_fn_tensor. + load_model_args (Dict[str, Any]): keyword arguments to provide load + options while loading from Hugging Face Hub. Defaults to None. + inference_args ([Dict[str, Any]]): Non-batchable arguments + required as inputs to the model's forward() function. Unlike Tensors in + `batch`, these parameters will not be dynamically batched. + Defaults to None. + min_batch_size: the minimum batch size to use when batching inputs. + max_batch_size: the maximum batch size to use when batching inputs. + large_model: set to true if your model is large enough to run into + memory pressure if you load multiple copies. Given a model that + consumes N memory and a machine with W cores and M memory, you should + set this to True if N*W > M. + kwargs: 'env_vars' can be used to set environment variables + before loading the model. + + **Supported Versions:** RunInference APIs in Apache Beam + supports transformers>=4.18.0. + """ def run_inference( self, batch: Sequence[Union[tf.Tensor, torch.Tensor]], @@ -425,7 +417,10 @@ def run_inference( inference_args = {} if not inference_args else inference_args if not self._framework: self._framework = "tf" if isinstance(batch[0], tf.Tensor) else "torch" - if self._inference_fn: + + # default is always torch keyed tensor. We check if user has provided their + # own or we move to infer it with input type. 
+ if self._inference_fn != _run_inference_torch_keyed_tensor: return self._inference_fn( batch, model, inference_args, inference_args, self._model_uri) @@ -436,27 +431,9 @@ def run_inference( return _default_inference_fn_torch( batch, model, self._device, inference_args, self._model_uri) - def get_num_bytes( - self, batch: Sequence[Union[tf.Tensor, torch.Tensor]]) -> int: - """ - Returns: - The number of bytes of data for a batch. - """ - if self._framework == "tf": - return sum(sys.getsizeof(element) for element in batch) - else: - return sum( - (el.element_size() for tensor in batch for el in tensor.values())) - def get_metrics_namespace(self) -> str: """ Returns: A namespace for metrics collected by the RunInference transform. """ return 'BeamML_HuggingFaceModelHandler_Tensor' - - def batch_elements_kwargs(self): - return self._batching_kwargs - - def share_model_across_processes(self) -> bool: - return self._large_model diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py b/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py index 6aefb533327b8..0a810a3713104 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py @@ -43,8 +43,8 @@ class HuggingFaceInference(unittest.TestCase): def test_hf_language_modeling(self): test_pipeline = TestPipeline(is_integration_test=True) # Path to text file containing some sentences - file_of_sentences = 'gs://apache-beam-ml/datasets/custom/hf_sentences.txt' # pylint: disable=line-too-long - output_file_dir = 'gs://apache-beam-ml/testing/predictions' + file_of_sentences = 'gs://clouddfe-riteshghorse/hf/datasets/custom/hf_sentences.txt' # pylint: disable=line-too-long + output_file_dir = 'gs://clouddfe-riteshghorse/hf/testing/predictions' output_file = '/'.join([output_file_dir, str(uuid.uuid4()), 'result.txt']) model_name = 'stevhliu/my_awesome_eli5_mlm_model' @@ -61,7 +61,7 @@ def test_hf_language_modeling(self): self.assertEqual(FileSystems().exists(output_file), True) predictions = pytorch_inference_it_test.process_outputs( filepath=output_file) - actuals_file = 'gs://apache-beam-ml/testing/expected_outputs/test_hf_run_inference_for_masked_lm_actuals.txt' # pylint: disable=line-too-long + actuals_file = 'gs://clouddfe-riteshghorse/hf/testing/expected_outputs/test_hf_run_inference_for_masked_lm_actuals.txt' # pylint: disable=line-too-long actuals = pytorch_inference_it_test.process_outputs(filepath=actuals_file) predictions_dict = {} diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index 499beed2967af..d223d6a5a7010 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -402,7 +402,7 @@ commands = # Allow exit code 5 (no tests run) so that we can run this command safely on arbitrary subdirectories. 
/bin/sh -c 'pytest -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml -n 6 -m uses_xgboost {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' -[testenv:py{38,39,310}-transformers-{428,429,430}] +[testenv:py{38,39,310,311}-transformers-{428,429,430}] deps = -r build-requirements.txt 428: transformers>=4.28.0,<4.29.0 From c60d312fcfe996798c84aae29402f07534f0185a Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Thu, 29 Jun 2023 11:55:37 -0400 Subject: [PATCH 12/26] refactor, doc comments --- .../ml/inference/huggingface_inference.py | 80 +++++++++++-------- 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index 13935a1504d6f..cad728f4db336 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -82,12 +82,35 @@ def _validate_constructor_args(model_uri, model_class): message.format(model_uri=model_uri, model_class=model_class)) +def no_gpu_available_warning(): + logging.warning( + "Model handler specified a 'GPU' device, but GPUs are not available. " + "Switching to CPU.") + + +def is_gpu_available_torch(device): + if device == 'GPU' and torch.cuda.is_available(): + return True + no_gpu_available_warning() + return False + + +def is_gpu_available_tensorflow(device): + gpu_devices = tf.config.list_physical_devices(device) + if len(gpu_devices) == 0: + no_gpu_available_warning() + return False + return True + + def _run_inference_torch_keyed_tensor( batch: Sequence[Dict[str, torch.Tensor]], model: AutoModel, device, inference_args: Dict[str, Any], model_id: Optional[str] = None) -> Iterable[PredictionResult]: + device = torch.device('cuda') if is_gpu_available_torch( + device) else torch.device('cpu') key_to_tensor_list = defaultdict(list) # torch.no_grad() mitigates GPU memory issues # https://github.com/apache/beam/issues/22811 @@ -110,6 +133,7 @@ def _run_inference_tensorflow_keyed_tensor( device, inference_args: Dict[str, Any], model_id: Optional[str] = None) -> Iterable[PredictionResult]: + is_gpu_available_tensorflow() key_to_tensor_list = defaultdict(list) for example in batch: for key, tensor in example.items(): @@ -129,9 +153,7 @@ def __init__( model_class: Union[AutoModel, TFAutoModel], device: str = 'CPU', *, - inference_fn: Union[ - KeyedTensorInferenceFn, - TensorInferenceFn] = _run_inference_torch_keyed_tensor, + inference_fn: Optional[Callable[..., PredictionT]] = None, load_model_args: Optional[Dict[str, Any]] = None, inference_args: Optional[Dict[str, Any]] = None, min_batch_size: Optional[int] = None, @@ -156,9 +178,9 @@ def __init__( Default is _run_inference_torch_keyed_tensor or _run_inference_tensorflow_keyed_tensor depending on the input type. load_model_args (Dict[str, Any]): keyword arguments to provide load - options while loading from Hugging Face Hub. Defaults to None. + options while loading models from Hugging Face Hub. Defaults to None. inference_args [Dict[str, Any]]: Non-batchable arguments - required as inputs to the model's forward() function. Unlike Tensors in + required as inputs to the model's inference function. Unlike Tensors in `batch`, these parameters will not be dynamically batched. Defaults to None. min_batch_size: the minimum batch size to use when batching inputs. @@ -170,15 +192,12 @@ def __init__( kwargs: 'env_vars' can be used to set environment variables before loading the model. 
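
The constructor options documented in the hunk above are easier to picture with a concrete call. The following is a small illustrative sketch, not part of this patch; the checkpoint name, revision, batch bounds, and cache path are example values only::

    from apache_beam.ml.inference.huggingface_inference import (
        HuggingFaceModelHandlerKeyedTensor)
    from transformers import AutoModelForMaskedLM

    # Every literal below is a hypothetical example value.
    handler = HuggingFaceModelHandlerKeyedTensor(
        model_uri="bert-base-uncased",
        model_class=AutoModelForMaskedLM,
        load_model_args={"revision": "main"},   # forwarded to from_pretrained()
        min_batch_size=2,                       # surfaced via batch_elements_kwargs()
        max_batch_size=16,
        large_model=False,                      # True shares one copy across processes
        env_vars={"HF_HOME": "/tmp/hf_cache"})  # set before the model is loaded
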
- **Supported Versions:** RunInference APIs in Apache Beam - supports transformers>=4.18.0. + **Supported Versions:** HuggingFaceModelHandler supports + transformers>=4.18.0. """ self._model_uri = model_uri self._model_class = model_class - if device == 'GPU': - self._device = torch.device('cuda') - else: - self._device = torch.device('cpu') + self._device = device self._inference_fn = inference_fn self._model_config_args = load_model_args if load_model_args else {} self._inference_args = inference_args if inference_args else {} @@ -198,13 +217,8 @@ def load_model(self): """Loads and initializes the model for processing.""" model = self._model_class.from_pretrained( self._model_uri, **self._model_config_args) - if self._device == torch.device('cuda'): - if not torch.cuda.is_available(): - logging.warning( - "Model handler specified a 'GPU' device, " - "but GPUs are not available. Switching to CPU.") - self._device = torch.device('cpu') - model.to(self._device) + if is_gpu_available_torch(self._device): + model.to(torch.device('cuda')) return model def update_model_path(self, model_path: Optional[str] = None): @@ -253,9 +267,9 @@ class HuggingFaceModelHandlerKeyedTensor( Default is _run_inference_torch_keyed_tensor or _run_inference_tensorflow_keyed_tensor depending on the input type. load_model_args (Dict[str, Any]): keyword arguments to provide load - options while loading from Hugging Face Hub. Defaults to None. + options while loading models from Hugging Face Hub. Defaults to None. inference_args ([Dict[str, Any]]): Non-batchable arguments - required as inputs to the model's forward() function. Unlike Tensors in + required as inputs to the model's inference function. Unlike Tensors in `batch`, these parameters will not be dynamically batched. Defaults to None. min_batch_size: the minimum batch size to use when batching inputs. @@ -267,8 +281,7 @@ class HuggingFaceModelHandlerKeyedTensor( kwargs: 'env_vars' can be used to set environment variables before loading the model. - **Supported Versions:** RunInference APIs in Apache Beam - supports transformers>=4.18.0. + **Supported Versions:** HuggingFaceModelHandler supports transformers>=4.18.0. """ def run_inference( self, @@ -290,7 +303,7 @@ def run_inference( model's predict() function. model: A Tensorflow/PyTorch model. inference_args: Non-batchable arguments required as inputs to the model's - forward() function. Unlike Tensors in `batch`, these parameters will + inference function. Unlike Tensors in `batch`, these parameters will not be dynamically batched Returns: An Iterable of type PredictionResult. @@ -299,9 +312,7 @@ def run_inference( if not self._framework: self._framework = "tf" if isinstance(batch[0], tf.Tensor) else "torch" - # default is always torch keyed tensor. We check if user has provided their - # own or we move to infer it with input type. 
- if self._inference_fn != _run_inference_torch_keyed_tensor: + if self._inference_fn: return self._inference_fn( batch, model, self._device, inference_args, self._model_uri) @@ -326,13 +337,14 @@ def _default_inference_fn_torch( device, inference_args: Dict[str, Any] = None, model_id: Optional[str] = None) -> Iterable[PredictionResult]: + device = torch.device('cuda') if is_gpu_available_torch( + device) else torch.device('cpu') # torch.no_grad() mitigates GPU memory issues # https://github.com/apache/beam/issues/22811 with torch.no_grad(): batched_tensors = torch.stack(batch) batched_tensors = _convert_to_device(batched_tensors, device) predictions = model(batched_tensors, **inference_args) - return utils._convert_to_result(batch, predictions, model_id) @@ -342,6 +354,7 @@ def _default_inference_fn_tensorflow( device, inference_args: Dict[str, Any], model_id: Optional[str] = None) -> Iterable[PredictionResult]: + is_gpu_available_tensorflow() batched_tensors = tf.stack(batch, axis=0) predictions = model(batched_tensors, **inference_args) return utils._convert_to_result(batch, predictions, model_id) @@ -372,9 +385,9 @@ class HuggingFaceModelHandlerTensor(HuggingFaceModelHandler[Union[tf.Tensor, inference_fn: the inference function to use during RunInference. Default is _default_inference_fn_tensor. load_model_args (Dict[str, Any]): keyword arguments to provide load - options while loading from Hugging Face Hub. Defaults to None. + options while loading models from Hugging Face Hub. Defaults to None. inference_args ([Dict[str, Any]]): Non-batchable arguments - required as inputs to the model's forward() function. Unlike Tensors in + required as inputs to the model's inference function. Unlike Tensors in `batch`, these parameters will not be dynamically batched. Defaults to None. min_batch_size: the minimum batch size to use when batching inputs. @@ -386,8 +399,7 @@ class HuggingFaceModelHandlerTensor(HuggingFaceModelHandler[Union[tf.Tensor, kwargs: 'env_vars' can be used to set environment variables before loading the model. - **Supported Versions:** RunInference APIs in Apache Beam - supports transformers>=4.18.0. + **Supported Versions:** HuggingFaceModelHandler supports transformers>=4.18.0. """ def run_inference( self, @@ -409,7 +421,7 @@ def run_inference( predict() function. model: A Tensorflow/PyTorch model. inference_args: Non-batchable arguments required as inputs to the model's - forward() function. Unlike Tensors in `batch`, these parameters will + inference function. Unlike Tensors in `batch`, these parameters will not be dynamically batched Returns: An Iterable of type PredictionResult. @@ -418,9 +430,7 @@ def run_inference( if not self._framework: self._framework = "tf" if isinstance(batch[0], tf.Tensor) else "torch" - # default is always torch keyed tensor. We check if user has provided their - # own or we move to infer it with input type. 
- if self._inference_fn != _run_inference_torch_keyed_tensor: + if self._inference_fn: return self._inference_fn( batch, model, inference_args, inference_args, self._model_uri) From a52536f4b4f0e16365762c045fef4086f8a62ee6 Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Thu, 29 Jun 2023 11:56:36 -0400 Subject: [PATCH 13/26] change test file --- .../ml/inference/huggingface_inference_it_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py b/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py index 0a810a3713104..6aefb533327b8 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py @@ -43,8 +43,8 @@ class HuggingFaceInference(unittest.TestCase): def test_hf_language_modeling(self): test_pipeline = TestPipeline(is_integration_test=True) # Path to text file containing some sentences - file_of_sentences = 'gs://clouddfe-riteshghorse/hf/datasets/custom/hf_sentences.txt' # pylint: disable=line-too-long - output_file_dir = 'gs://clouddfe-riteshghorse/hf/testing/predictions' + file_of_sentences = 'gs://apache-beam-ml/datasets/custom/hf_sentences.txt' # pylint: disable=line-too-long + output_file_dir = 'gs://apache-beam-ml/testing/predictions' output_file = '/'.join([output_file_dir, str(uuid.uuid4()), 'result.txt']) model_name = 'stevhliu/my_awesome_eli5_mlm_model' @@ -61,7 +61,7 @@ def test_hf_language_modeling(self): self.assertEqual(FileSystems().exists(output_file), True) predictions = pytorch_inference_it_test.process_outputs( filepath=output_file) - actuals_file = 'gs://clouddfe-riteshghorse/hf/testing/expected_outputs/test_hf_run_inference_for_masked_lm_actuals.txt' # pylint: disable=line-too-long + actuals_file = 'gs://apache-beam-ml/testing/expected_outputs/test_hf_run_inference_for_masked_lm_actuals.txt' # pylint: disable=line-too-long actuals = pytorch_inference_it_test.process_outputs(filepath=actuals_file) predictions_dict = {} From 496d205b2fa31f96a196d5d7eb02aad380a61050 Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Fri, 7 Jul 2023 10:06:31 -0400 Subject: [PATCH 14/26] update types --- .../apache_beam/ml/inference/huggingface_inference.py | 8 ++++---- .../ml/inference/huggingface_inference_it_test.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index cad728f4db336..8d0285869a14d 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -50,7 +50,7 @@ TensorInferenceFn = Callable[[ Sequence[Union[torch.Tensor, tf.Tensor]], Union[AutoModel, TFAutoModel], - torch.device, + str, Optional[Dict[str, Any]], Optional[str] ], @@ -59,7 +59,7 @@ KeyedTensorInferenceFn = Callable[[ Sequence[Dict[str, Union[torch.Tensor, tf.Tensor]]], Union[AutoModel, TFAutoModel], - torch.device, + str, Optional[Dict[str, Any]], Optional[str] ], @@ -84,8 +84,8 @@ def _validate_constructor_args(model_uri, model_class): def no_gpu_available_warning(): logging.warning( - "Model handler specified a 'GPU' device, but GPUs are not available. " - "Switching to CPU.") + "HuggingFaceModelHandler specified a 'GPU' device, " + "but GPUs are not available. 
Switching to CPU.") def is_gpu_available_torch(device): diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py b/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py index 6aefb533327b8..ed442a4b801aa 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference_it_test.py @@ -43,7 +43,7 @@ class HuggingFaceInference(unittest.TestCase): def test_hf_language_modeling(self): test_pipeline = TestPipeline(is_integration_test=True) # Path to text file containing some sentences - file_of_sentences = 'gs://apache-beam-ml/datasets/custom/hf_sentences.txt' # pylint: disable=line-too-long + file_of_sentences = 'gs://apache-beam-ml/datasets/custom/hf_sentences.txt' output_file_dir = 'gs://apache-beam-ml/testing/predictions' output_file = '/'.join([output_file_dir, str(uuid.uuid4()), 'result.txt']) From d7fd7776bb142f88693d96df876964414432786f Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Tue, 11 Jul 2023 13:11:39 -0400 Subject: [PATCH 15/26] update tox, doc, lints --- .../ml/inference/huggingface_inference.py | 53 ++----------------- sdks/python/tox.ini | 2 +- 2 files changed, 5 insertions(+), 50 deletions(-) diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index 8d0285869a14d..69f15bbc25413 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -17,9 +17,9 @@ # pytype: skip-file -from abc import ABC import logging import sys +from abc import ABC from collections import defaultdict from typing import Any from typing import Callable @@ -41,6 +41,8 @@ from transformers import AutoModel from transformers import TFAutoModel +_LOGGER = logging.getLogger(__name__) + __all__ = [ 'HuggingFaceModelHandler', 'HuggingFaceModelHandlerTensor', @@ -83,7 +85,7 @@ def _validate_constructor_args(model_uri, model_class): def no_gpu_available_warning(): - logging.warning( + _LOGGER.warning( "HuggingFaceModelHandler specified a 'GPU' device, " "but GPUs are not available. Switching to CPU.") @@ -257,30 +259,6 @@ class HuggingFaceModelHandlerKeyedTensor( pcoll | RunInference(HuggingFaceModelHandlerKeyedTensor( model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) - Args: - model_uri (str): path to the pretrained model on the hugging face - models hub. - model_class: model class to load the repository from model_uri. - device: For torch tensors, specify device on which you wish to - run the model. Defaults to CPU. - inference_fn: the inference function to use during RunInference. - Default is _run_inference_torch_keyed_tensor or - _run_inference_tensorflow_keyed_tensor depending on the input type. - load_model_args (Dict[str, Any]): keyword arguments to provide load - options while loading models from Hugging Face Hub. Defaults to None. - inference_args ([Dict[str, Any]]): Non-batchable arguments - required as inputs to the model's inference function. Unlike Tensors in - `batch`, these parameters will not be dynamically batched. - Defaults to None. - min_batch_size: the minimum batch size to use when batching inputs. - max_batch_size: the maximum batch size to use when batching inputs. - large_model: set to true if your model is large enough to run into - memory pressure if you load multiple copies. 
Given a model that - consumes N memory and a machine with W cores and M memory, you should - set this to True if N*W > M. - kwargs: 'env_vars' can be used to set environment variables - before loading the model. - **Supported Versions:** HuggingFaceModelHandler supports transformers>=4.18.0. """ def run_inference( @@ -376,29 +354,6 @@ class HuggingFaceModelHandlerTensor(HuggingFaceModelHandler[Union[tf.Tensor, pcoll | RunInference(HuggingFaceModelHandlerTensor( model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) - Args: - model_uri (str): path to the pretrained model on the hugging face - models hub. - model_class: model class to load the repository from model_uri. - device: For torch tensors, specify device on which you wish to - run the model. Defaults to CPU. - inference_fn: the inference function to use during RunInference. - Default is _default_inference_fn_tensor. - load_model_args (Dict[str, Any]): keyword arguments to provide load - options while loading models from Hugging Face Hub. Defaults to None. - inference_args ([Dict[str, Any]]): Non-batchable arguments - required as inputs to the model's inference function. Unlike Tensors in - `batch`, these parameters will not be dynamically batched. - Defaults to None. - min_batch_size: the minimum batch size to use when batching inputs. - max_batch_size: the maximum batch size to use when batching inputs. - large_model: set to true if your model is large enough to run into - memory pressure if you load multiple copies. Given a model that - consumes N memory and a machine with W cores and M memory, you should - set this to True if N*W > M. - kwargs: 'env_vars' can be used to set environment variables - before loading the model. - **Supported Versions:** HuggingFaceModelHandler supports transformers>=4.18.0. 
""" def run_inference( diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index d223d6a5a7010..7dbbfa5dc107f 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -408,7 +408,7 @@ deps = 428: transformers>=4.28.0,<4.29.0 429: transformers>=4.29.0,<4.30.0 430: transformers>=4.30.0,<4.31.0 - torch>=1.9.0<1.14.0 + torch>=1.9.0,<1.14.0 tensorflow==2.12.0 extras = test,gcp commands = From 854405136e5dd3337256555c91c83cbb43bce8e5 Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Tue, 11 Jul 2023 15:14:18 -0400 Subject: [PATCH 16/26] fix lints --- .../ml/inference/huggingface_inference.py | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index 69f15bbc25413..58f6060d116ef 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -135,7 +135,7 @@ def _run_inference_tensorflow_keyed_tensor( device, inference_args: Dict[str, Any], model_id: Optional[str] = None) -> Iterable[PredictionResult]: - is_gpu_available_tensorflow() + is_gpu_available_tensorflow(device) key_to_tensor_list = defaultdict(list) for example in batch: for key, tensor in example.items(): @@ -155,7 +155,7 @@ def __init__( model_class: Union[AutoModel, TFAutoModel], device: str = 'CPU', *, - inference_fn: Optional[Callable[..., PredictionT]] = None, + inference_fn: Optional[Callable[..., Iterable[PredictionT]]] = None, load_model_args: Optional[Dict[str, Any]] = None, inference_args: Optional[Dict[str, Any]] = None, min_batch_size: Optional[int] = None, @@ -247,7 +247,7 @@ def share_model_across_processes(self) -> bool: class HuggingFaceModelHandlerKeyedTensor( HuggingFaceModelHandler[Dict[str, Union[tf.Tensor, torch.Tensor]], - PredictionResult, + Iterable[PredictionResult], Union[AutoModel, TFAutoModel]]): """Implementation of the ModelHandler interface for HuggingFace with Keyed Tensors for PyTorch/Tensorflow backend. @@ -313,7 +313,7 @@ def _default_inference_fn_torch( batch: Sequence[Union[tf.Tensor, torch.Tensor]], model: Union[AutoModel, TFAutoModel], device, - inference_args: Dict[str, Any] = None, + inference_args: Dict[str, Any], model_id: Optional[str] = None) -> Iterable[PredictionResult]: device = torch.device('cuda') if is_gpu_available_torch( device) else torch.device('cpu') @@ -332,18 +332,16 @@ def _default_inference_fn_tensorflow( device, inference_args: Dict[str, Any], model_id: Optional[str] = None) -> Iterable[PredictionResult]: - is_gpu_available_tensorflow() + is_gpu_available_tensorflow(device) batched_tensors = tf.stack(batch, axis=0) predictions = model(batched_tensors, **inference_args) return utils._convert_to_result(batch, predictions, model_id) -class HuggingFaceModelHandlerTensor(HuggingFaceModelHandler[Union[tf.Tensor, - torch.Tensor], - PredictionResult, - Union[AutoModel, - TFAutoModel]] - ): +class HuggingFaceModelHandlerTensor( + HuggingFaceModelHandler[Union[tf.Tensor, torch.Tensor], + Iterable[PredictionResult], + Union[AutoModel, TFAutoModel]]): """Implementation of the ModelHandler interface for HuggingFace with Tensors for PyTorch/Tensorflow backend. 
From 20b1af28857c72907373ac469647b4296ed5a315 Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Tue, 11 Jul 2023 15:45:28 -0400 Subject: [PATCH 17/26] pr type --- .../ml/inference/huggingface_inference.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index 58f6060d116ef..bb4686c03170b 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -247,7 +247,7 @@ def share_model_across_processes(self) -> bool: class HuggingFaceModelHandlerKeyedTensor( HuggingFaceModelHandler[Dict[str, Union[tf.Tensor, torch.Tensor]], - Iterable[PredictionResult], + PredictionResult, Union[AutoModel, TFAutoModel]]): """Implementation of the ModelHandler interface for HuggingFace with Keyed Tensors for PyTorch/Tensorflow backend. @@ -338,10 +338,12 @@ def _default_inference_fn_tensorflow( return utils._convert_to_result(batch, predictions, model_id) -class HuggingFaceModelHandlerTensor( - HuggingFaceModelHandler[Union[tf.Tensor, torch.Tensor], - Iterable[PredictionResult], - Union[AutoModel, TFAutoModel]]): +class HuggingFaceModelHandlerTensor(HuggingFaceModelHandler[Union[tf.Tensor, + torch.Tensor], + PredictionResult, + Union[AutoModel, + TFAutoModel]] + ): """Implementation of the ModelHandler interface for HuggingFace with Tensors for PyTorch/Tensorflow backend. From 85014a75a53d296c66b20989d92b28467fcbf7bf Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Wed, 12 Jul 2023 13:33:22 -0400 Subject: [PATCH 18/26] update gpu warnings --- .../ml/inference/huggingface_inference.py | 37 +++++++++++++------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index bb4686c03170b..44890a805b1db 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -90,11 +90,18 @@ def no_gpu_available_warning(): "but GPUs are not available. 
Switching to CPU.") -def is_gpu_available_torch(device): - if device == 'GPU' and torch.cuda.is_available(): +def is_gpu_available_torch(): + if torch.cuda.is_available(): return True - no_gpu_available_warning() - return False + else: + no_gpu_available_warning() + return False + + +def get_device_torch(device): + if device == 'GPU' and is_gpu_available_torch(): + return torch.device('cuda') + return torch.device('cpu') def is_gpu_available_tensorflow(device): @@ -111,8 +118,7 @@ def _run_inference_torch_keyed_tensor( device, inference_args: Dict[str, Any], model_id: Optional[str] = None) -> Iterable[PredictionResult]: - device = torch.device('cuda') if is_gpu_available_torch( - device) else torch.device('cpu') + device = get_device_torch(device) key_to_tensor_list = defaultdict(list) # torch.no_grad() mitigates GPU memory issues # https://github.com/apache/beam/issues/22811 @@ -219,8 +225,6 @@ def load_model(self): """Loads and initializes the model for processing.""" model = self._model_class.from_pretrained( self._model_uri, **self._model_config_args) - if is_gpu_available_torch(self._device): - model.to(torch.device('cuda')) return model def update_model_path(self, model_path: Optional[str] = None): @@ -288,7 +292,12 @@ def run_inference( """ inference_args = {} if not inference_args else inference_args if not self._framework: - self._framework = "tf" if isinstance(batch[0], tf.Tensor) else "torch" + if isinstance(batch[0], tf.Tensor): + self._framework = "tf" + else: + self._framework = "torch" + if self._device == 'GPU' and is_gpu_available_torch(): + model.to(torch.device('cuda')) if self._inference_fn: return self._inference_fn( @@ -315,8 +324,7 @@ def _default_inference_fn_torch( device, inference_args: Dict[str, Any], model_id: Optional[str] = None) -> Iterable[PredictionResult]: - device = torch.device('cuda') if is_gpu_available_torch( - device) else torch.device('cpu') + device = get_device_torch(device) # torch.no_grad() mitigates GPU memory issues # https://github.com/apache/beam/issues/22811 with torch.no_grad(): @@ -383,7 +391,12 @@ def run_inference( """ inference_args = {} if not inference_args else inference_args if not self._framework: - self._framework = "tf" if isinstance(batch[0], tf.Tensor) else "torch" + if isinstance(batch[0], tf.Tensor): + self._framework = "tf" + else: + self._framework = "torch" + if self._device == 'GPU' and is_gpu_available_torch(): + model.to(torch.device('cuda')) if self._inference_fn: return self._inference_fn( From 9dff2e3c6cc7edbb9839da60d8859ec9760c7927 Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Wed, 12 Jul 2023 15:34:04 -0400 Subject: [PATCH 19/26] fix pydoc --- .../apache_beam/ml/inference/huggingface_inference.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index 44890a805b1db..af83f32d29e0a 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -187,7 +187,7 @@ def __init__( _run_inference_tensorflow_keyed_tensor depending on the input type. load_model_args (Dict[str, Any]): keyword arguments to provide load options while loading models from Hugging Face Hub. Defaults to None. - inference_args [Dict[str, Any]]: Non-batchable arguments + inference_args (Dict[str, Any]): Non-batchable arguments required as inputs to the model's inference function. 
Unlike Tensors in `batch`, these parameters will not be dynamically batched. Defaults to None. @@ -383,9 +383,9 @@ def run_inference( Tensors with dimensions (batch_size, n_features, etc.) into the model's predict() function. model: A Tensorflow/PyTorch model. - inference_args: Non-batchable arguments required as inputs to the model's - inference function. Unlike Tensors in `batch`, these parameters will - not be dynamically batched + inference_args (Dict[str, Any]): Non-batchable arguments required as + inputs to the model's inference function. Unlike Tensors in `batch`, + these parameters will not be dynamically batched. Returns: An Iterable of type PredictionResult. """ From dc83ecd55b59415fdf045d41f676b5f5cc0e9689 Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Tue, 18 Jul 2023 13:38:23 -0400 Subject: [PATCH 20/26] update typos, refactor --- .../huggingface_language_modeling.py | 10 +- .../ml/inference/huggingface_inference.py | 140 +++++++++--------- 2 files changed, 73 insertions(+), 77 deletions(-) diff --git a/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py b/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py index 39cd5cc302d82..ee69d8bdea032 100644 --- a/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py +++ b/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py @@ -15,8 +15,8 @@ # limitations under the License. # -""""A pipeline that uses RunInference to perform Language Modeling with -model from Hugging Face. +"""A pipeline that uses RunInference to perform Language Modeling with +masked language model from Hugging Face. This pipeline takes sentences from a custom text file, converts the last word of the sentence into a token, and then uses the AutoModelForMaskedLM from @@ -74,9 +74,7 @@ def filter_empty_lines(text: str) -> Iterator[str]: class PostProcessor(beam.DoFn): """Processes the PredictionResult to get the predicted word. - The logits are the output of the Model. After applying a softmax - activation function to the logits, we get probabilistic distributions for each - of the words in the model's vocabulary. We can get the word with the highest + The logits are the output of the Model. We can get the word with the highest probability of being a candidate replacement word by taking the argmax. """ def __init__(self, tokenizer: AutoTokenizer): @@ -124,8 +122,6 @@ def run( """ Args: argv: Command line arguments defined for this example. - model_class: Reference to the class definition of the model. - model_name: Name of the pretrained model to be loaded. save_main_session: Used for internal testing. test_pipeline: Used for internal testing. """ diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index af83f32d29e0a..8b14dd9ec6857 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -4,7 +4,7 @@ # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at +# the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # @@ -141,7 +141,8 @@ def _run_inference_tensorflow_keyed_tensor( device, inference_args: Dict[str, Any], model_id: Optional[str] = None) -> Iterable[PredictionResult]: - is_gpu_available_tensorflow(device) + if device == 'GPU': + is_gpu_available_tensorflow(device) key_to_tensor_list = defaultdict(list) for example in batch: for key, tensor in example.items(): @@ -169,39 +170,39 @@ def __init__( large_model: bool = False, **kwargs): """Implementation of the abstract base class of ModelHandler interface - for Hugging Face. This class shouldn't be instantiated directly. - Use HuggingFaceModelHandlerKeyedTensor or HuggingFaceModelHandlerTensor. - - Example Usage model:: - pcoll | RunInference(HuggingFaceModelHandlerKeyedTensor( - model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) - - Args: - model_uri (str): path to the pretrained model on the hugging face - models hub. - model_class: model class to load the repository from model_uri. - device: For torch tensors, specify device on which you wish to - run the model. Defaults to CPU. - inference_fn: the inference function to use during RunInference. - Default is _run_inference_torch_keyed_tensor or - _run_inference_tensorflow_keyed_tensor depending on the input type. - load_model_args (Dict[str, Any]): keyword arguments to provide load - options while loading models from Hugging Face Hub. Defaults to None. - inference_args (Dict[str, Any]): Non-batchable arguments - required as inputs to the model's inference function. Unlike Tensors in - `batch`, these parameters will not be dynamically batched. - Defaults to None. - min_batch_size: the minimum batch size to use when batching inputs. - max_batch_size: the maximum batch size to use when batching inputs. - large_model: set to true if your model is large enough to run into - memory pressure if you load multiple copies. Given a model that - consumes N memory and a machine with W cores and M memory, you should - set this to True if N*W > M. - kwargs: 'env_vars' can be used to set environment variables - before loading the model. - - **Supported Versions:** HuggingFaceModelHandler supports - transformers>=4.18.0. + for Hugging Face. This class shouldn't be instantiated directly. + Use HuggingFaceModelHandlerKeyedTensor or HuggingFaceModelHandlerTensor. + + Example Usage model:: + pcoll | RunInference(HuggingFaceModelHandlerKeyedTensor( + model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) + + Args: + model_uri (str): path to the pretrained model on the hugging face + models hub. + model_class: model class to load the repository from model_uri. + device: For torch tensors, specify device on which you wish to + run the model. Defaults to CPU. + inference_fn: the inference function to use during RunInference. + Default is _run_inference_torch_keyed_tensor or + _run_inference_tensorflow_keyed_tensor depending on the input type. + load_model_args (Dict[str, Any]): keyword arguments to provide load + options while loading models from Hugging Face Hub. Defaults to None. + inference_args (Dict[str, Any]): Non-batchable arguments + required as inputs to the model's inference function. Unlike Tensors + in `batch`, these parameters will not be dynamically batched. + Defaults to None. + min_batch_size: the minimum batch size to use when batching inputs. + max_batch_size: the maximum batch size to use when batching inputs. 
+ large_model: set to true if your model is large enough to run into + memory pressure if you load multiple copies. Given a model that + consumes N memory and a machine with W cores and M memory, you should + set this to True if N*W > M. + kwargs: 'env_vars' can be used to set environment variables + before loading the model. + + **Supported Versions:** HuggingFaceModelHandler supports + transformers>=4.18.0. """ self._model_uri = model_uri self._model_class = model_class @@ -271,24 +272,23 @@ def run_inference( model: Union[AutoModel, TFAutoModel], inference_args: Optional[Dict[str, Any]] = None ) -> Iterable[PredictionResult]: - """ - Runs inferences on a batch of Keyed Tensors and returns an Iterable of - Tensors Predictions. - - This method stacks the list of Tensors in a vectorized format to optimize - the inference call. - - Args: - batch: A sequence of Keyed Tensors. These Tensors should be batchable, - as this method will call `tf.stack()`/`torch.stack()` and pass in - batched Tensors with dimensions (batch_size, n_features, etc.) into the - model's predict() function. - model: A Tensorflow/PyTorch model. - inference_args: Non-batchable arguments required as inputs to the model's - inference function. Unlike Tensors in `batch`, these parameters will - not be dynamically batched - Returns: - An Iterable of type PredictionResult. + """Runs inferences on a batch of Keyed Tensors and returns an Iterable of + Tensors Predictions. + + This method stacks the list of Tensors in a vectorized format to optimize + the inference call. + + Args: + batch: A sequence of Keyed Tensors. These Tensors should be batchable, + as this method will call `tf.stack()`/`torch.stack()` and pass in + batched Tensors with dimensions (batch_size, n_features, etc.) into + the model's predict() function. + model: A Tensorflow/PyTorch model. + inference_args: Non-batchable arguments required as inputs to the + model's inference function. Unlike Tensors in `batch`, + these parameters will not be dynamically batched. + Returns: + An Iterable of type PredictionResult. """ inference_args = {} if not inference_args else inference_args if not self._framework: @@ -340,7 +340,8 @@ def _default_inference_fn_tensorflow( device, inference_args: Dict[str, Any], model_id: Optional[str] = None) -> Iterable[PredictionResult]: - is_gpu_available_tensorflow(device) + if device == 'GPU': + is_gpu_available_tensorflow(device) batched_tensors = tf.stack(batch, axis=0) predictions = model(batched_tensors, **inference_args) return utils._convert_to_result(batch, predictions, model_id) @@ -370,24 +371,23 @@ def run_inference( model: Union[AutoModel, TFAutoModel], inference_args: Optional[Dict[str, Any]] = None ) -> Iterable[PredictionResult]: - """ - Runs inferences on a batch of Tensors and returns an Iterable of + """Runs inferences on a batch of Tensors and returns an Iterable of Tensors Predictions. - This method stacks the list of Tensors in a vectorized format to optimize - the inference call. - - Args: - batch: A sequence of Tensors. These Tensors should be batchable, as this - method will call `tf.stack()`/`torch.stack()` and pass in batched - Tensors with dimensions (batch_size, n_features, etc.) into the model's - predict() function. - model: A Tensorflow/PyTorch model. - inference_args (Dict[str, Any]): Non-batchable arguments required as - inputs to the model's inference function. Unlike Tensors in `batch`, - these parameters will not be dynamically batched. - Returns: - An Iterable of type PredictionResult. 
+ This method stacks the list of Tensors in a vectorized format to optimize + the inference call. + + Args: + batch: A sequence of Tensors. These Tensors should be batchable, as + this method will call `tf.stack()`/`torch.stack()` and pass in + batched Tensors with dimensions (batch_size, n_features, etc.) + into the model's predict() function. + model: A Tensorflow/PyTorch model. + inference_args (Dict[str, Any]): Non-batchable arguments required as + inputs to the model's inference function. Unlike Tensors in `batch`, + these parameters will not be dynamically batched. + Returns: + An Iterable of type PredictionResult. """ inference_args = {} if not inference_args else inference_args if not self._framework: From 441011fbd8825eec5e6bc2025dae92ab81ae71f0 Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Tue, 18 Jul 2023 13:59:45 -0400 Subject: [PATCH 21/26] fix docstrings --- .../ml/inference/huggingface_inference.py | 152 +++++++++--------- 1 file changed, 77 insertions(+), 75 deletions(-) diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index 8b14dd9ec6857..09722c1aace68 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -170,39 +170,40 @@ def __init__( large_model: bool = False, **kwargs): """Implementation of the abstract base class of ModelHandler interface - for Hugging Face. This class shouldn't be instantiated directly. - Use HuggingFaceModelHandlerKeyedTensor or HuggingFaceModelHandlerTensor. - - Example Usage model:: - pcoll | RunInference(HuggingFaceModelHandlerKeyedTensor( - model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) - - Args: - model_uri (str): path to the pretrained model on the hugging face - models hub. - model_class: model class to load the repository from model_uri. - device: For torch tensors, specify device on which you wish to - run the model. Defaults to CPU. - inference_fn: the inference function to use during RunInference. - Default is _run_inference_torch_keyed_tensor or - _run_inference_tensorflow_keyed_tensor depending on the input type. - load_model_args (Dict[str, Any]): keyword arguments to provide load - options while loading models from Hugging Face Hub. Defaults to None. - inference_args (Dict[str, Any]): Non-batchable arguments - required as inputs to the model's inference function. Unlike Tensors - in `batch`, these parameters will not be dynamically batched. - Defaults to None. - min_batch_size: the minimum batch size to use when batching inputs. - max_batch_size: the maximum batch size to use when batching inputs. - large_model: set to true if your model is large enough to run into - memory pressure if you load multiple copies. Given a model that - consumes N memory and a machine with W cores and M memory, you should - set this to True if N*W > M. - kwargs: 'env_vars' can be used to set environment variables - before loading the model. - - **Supported Versions:** HuggingFaceModelHandler supports - transformers>=4.18.0. + for Hugging Face. This class shouldn't be instantiated directly. + + Use HuggingFaceModelHandlerKeyedTensor or HuggingFaceModelHandlerTensor. + + Example Usage model:: + pcoll | RunInference(HuggingFaceModelHandlerKeyedTensor( + model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) + + Args: + model_uri (str): path to the pretrained model on the hugging face + models hub. 
+ model_class: model class to load the repository from model_uri. + device: For torch tensors, specify device on which you wish to + run the model. Defaults to CPU. + inference_fn: the inference function to use during RunInference. + Default is _run_inference_torch_keyed_tensor or + _run_inference_tensorflow_keyed_tensor depending on the input type. + load_model_args (Dict[str, Any]): keyword arguments to provide load + options while loading models from Hugging Face Hub. Defaults to None. + inference_args (Dict[str, Any]): Non-batchable arguments + required as inputs to the model's inference function. Unlike Tensors + in `batch`, these parameters will not be dynamically batched. + Defaults to None. + min_batch_size: the minimum batch size to use when batching inputs. + max_batch_size: the maximum batch size to use when batching inputs. + large_model: set to true if your model is large enough to run into + memory pressure if you load multiple copies. Given a model that + consumes N memory and a machine with W cores and M memory, you should + set this to True if N*W > M. + kwargs: 'env_vars' can be used to set environment variables + before loading the model. + + **Supported Versions:** HuggingFaceModelHandler supports + transformers>=4.18.0. """ self._model_uri = model_uri self._model_class = model_class @@ -255,14 +256,14 @@ class HuggingFaceModelHandlerKeyedTensor( PredictionResult, Union[AutoModel, TFAutoModel]]): """Implementation of the ModelHandler interface for HuggingFace with - Keyed Tensors for PyTorch/Tensorflow backend. + Keyed Tensors for PyTorch/Tensorflow backend. - Depending on the type of tensors, - the model framework is determined automatically. + Depending on the type of tensors, the model framework is determined + automatically. - Example Usage model:: - pcoll | RunInference(HuggingFaceModelHandlerKeyedTensor( - model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) + Example Usage model:: + pcoll | RunInference(HuggingFaceModelHandlerKeyedTensor( + model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) **Supported Versions:** HuggingFaceModelHandler supports transformers>=4.18.0. """ @@ -273,22 +274,22 @@ def run_inference( inference_args: Optional[Dict[str, Any]] = None ) -> Iterable[PredictionResult]: """Runs inferences on a batch of Keyed Tensors and returns an Iterable of - Tensors Predictions. - - This method stacks the list of Tensors in a vectorized format to optimize - the inference call. - - Args: - batch: A sequence of Keyed Tensors. These Tensors should be batchable, - as this method will call `tf.stack()`/`torch.stack()` and pass in - batched Tensors with dimensions (batch_size, n_features, etc.) into - the model's predict() function. - model: A Tensorflow/PyTorch model. - inference_args: Non-batchable arguments required as inputs to the - model's inference function. Unlike Tensors in `batch`, - these parameters will not be dynamically batched. - Returns: - An Iterable of type PredictionResult. + Tensors Predictions. + + This method stacks the list of Tensors in a vectorized format to optimize + the inference call. + + Args: + batch: A sequence of Keyed Tensors. These Tensors should be batchable, + as this method will call `tf.stack()`/`torch.stack()` and pass in + batched Tensors with dimensions (batch_size, n_features, etc.) into + the model's predict() function. + model: A Tensorflow/PyTorch model. + inference_args: Non-batchable arguments required as inputs to the + model's inference function. 
Unlike Tensors in `batch`, + these parameters will not be dynamically batched. + Returns: + An Iterable of type PredictionResult. """ inference_args = {} if not inference_args else inference_args if not self._framework: @@ -354,14 +355,14 @@ class HuggingFaceModelHandlerTensor(HuggingFaceModelHandler[Union[tf.Tensor, TFAutoModel]] ): """Implementation of the ModelHandler interface for HuggingFace with - Tensors for PyTorch/Tensorflow backend. + Tensors for PyTorch/Tensorflow backend. - Depending on the type of tensors, - the model framework is determined automatically. + Depending on the type of tensors, the model framework is determined + automatically. - Example Usage model: - pcoll | RunInference(HuggingFaceModelHandlerTensor( - model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) + Example Usage model: + pcoll | RunInference(HuggingFaceModelHandlerTensor( + model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) **Supported Versions:** HuggingFaceModelHandler supports transformers>=4.18.0. """ @@ -374,20 +375,21 @@ def run_inference( """Runs inferences on a batch of Tensors and returns an Iterable of Tensors Predictions. - This method stacks the list of Tensors in a vectorized format to optimize - the inference call. - - Args: - batch: A sequence of Tensors. These Tensors should be batchable, as - this method will call `tf.stack()`/`torch.stack()` and pass in - batched Tensors with dimensions (batch_size, n_features, etc.) - into the model's predict() function. - model: A Tensorflow/PyTorch model. - inference_args (Dict[str, Any]): Non-batchable arguments required as - inputs to the model's inference function. Unlike Tensors in `batch`, - these parameters will not be dynamically batched. - Returns: - An Iterable of type PredictionResult. + This method stacks the list of Tensors in a vectorized format to optimize + the inference call. + + Args: + batch: A sequence of Tensors. These Tensors should be batchable, as + this method will call `tf.stack()`/`torch.stack()` and pass in + batched Tensors with dimensions (batch_size, n_features, etc.) + into the model's predict() function. + model: A Tensorflow/PyTorch model. + inference_args (Dict[str, Any]): Non-batchable arguments required as + inputs to the model's inference function. Unlike Tensors in `batch`, + these parameters will not be dynamically batched. + + Returns: + An Iterable of type PredictionResult. 
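
As a stand-alone illustration of the stacking behaviour described in this docstring (none of the following is part of the patch), the vectorized call effectively does this for torch inputs::

    import torch

    # Three unkeyed example tensors, as they would arrive in `batch`.
    batch = [torch.tensor([1.0, 2.0]),
             torch.tensor([3.0, 4.0]),
             torch.tensor([5.0, 6.0])]

    stacked = torch.stack(batch)  # shape (3, 2): a new leading batch axis
    assert stacked.shape == (3, 2)
    # The model is then invoked once on `stacked` rather than once per element.
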
""" inference_args = {} if not inference_args else inference_args if not self._framework: From f7e974c345796b110bc92d97e4d102386651ff7a Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Tue, 18 Jul 2023 14:23:45 -0400 Subject: [PATCH 22/26] refactor, doc, lints --- .../ml/inference/huggingface_inference.py | 50 ++++++++++--------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index 09722c1aace68..8c49b37b0e733 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -44,9 +44,9 @@ _LOGGER = logging.getLogger(__name__) __all__ = [ - 'HuggingFaceModelHandler', - 'HuggingFaceModelHandlerTensor', - 'HuggingFaceModelHandlerKeyedTensor', + "HuggingFaceModelHandler", + "HuggingFaceModelHandlerTensor", + "HuggingFaceModelHandlerKeyedTensor", ] TensorInferenceFn = Callable[[ @@ -54,18 +54,20 @@ Union[AutoModel, TFAutoModel], str, Optional[Dict[str, Any]], - Optional[str] + Optional[str], ], - Iterable[PredictionResult]] + Iterable[PredictionResult], + ] KeyedTensorInferenceFn = Callable[[ Sequence[Dict[str, Union[torch.Tensor, tf.Tensor]]], Union[AutoModel, TFAutoModel], str, Optional[Dict[str, Any]], - Optional[str] + Optional[str], ], - Iterable[PredictionResult]] + Iterable[PredictionResult], + ] def _validate_constructor_args(model_uri, model_class): @@ -99,9 +101,9 @@ def is_gpu_available_torch(): def get_device_torch(device): - if device == 'GPU' and is_gpu_available_torch(): - return torch.device('cuda') - return torch.device('cpu') + if device == "GPU" and is_gpu_available_torch(): + return torch.device("cuda") + return torch.device("cpu") def is_gpu_available_tensorflow(device): @@ -141,7 +143,7 @@ def _run_inference_tensorflow_keyed_tensor( device, inference_args: Dict[str, Any], model_id: Optional[str] = None) -> Iterable[PredictionResult]: - if device == 'GPU': + if device == "GPU": is_gpu_available_tensorflow(device) key_to_tensor_list = defaultdict(list) for example in batch: @@ -160,7 +162,7 @@ def __init__( self, model_uri: str, model_class: Union[AutoModel, TFAutoModel], - device: str = 'CPU', + device: str = "CPU", *, inference_fn: Optional[Callable[..., Iterable[PredictionT]]] = None, load_model_args: Optional[Dict[str, Any]] = None, @@ -212,11 +214,11 @@ def __init__( self._model_config_args = load_model_args if load_model_args else {} self._inference_args = inference_args if inference_args else {} self._batching_kwargs = {} - self._env_vars = kwargs.get('env_vars', {}) + self._env_vars = kwargs.get("env_vars", {}) if min_batch_size is not None: - self._batching_kwargs['min_batch_size'] = min_batch_size + self._batching_kwargs["min_batch_size"] = min_batch_size if max_batch_size is not None: - self._batching_kwargs['max_batch_size'] = max_batch_size + self._batching_kwargs["max_batch_size"] = max_batch_size self._large_model = large_model self._framework = "" @@ -297,8 +299,8 @@ def run_inference( self._framework = "tf" else: self._framework = "torch" - if self._device == 'GPU' and is_gpu_available_torch(): - model.to(torch.device('cuda')) + if self._device == "GPU" and is_gpu_available_torch(): + model.to(torch.device("cuda")) if self._inference_fn: return self._inference_fn( @@ -314,9 +316,9 @@ def run_inference( def get_metrics_namespace(self) -> str: """ Returns: - A namespace for metrics collected by the RunInference transform. 
+ A namespace for metrics collected by the RunInference transform. """ - return 'BeamML_HuggingFaceModelHandler_KeyedTensor' + return "BeamML_HuggingFaceModelHandler_KeyedTensor" def _default_inference_fn_torch( @@ -341,7 +343,7 @@ def _default_inference_fn_tensorflow( device, inference_args: Dict[str, Any], model_id: Optional[str] = None) -> Iterable[PredictionResult]: - if device == 'GPU': + if device == "GPU": is_gpu_available_tensorflow(device) batched_tensors = tf.stack(batch, axis=0) predictions = model(batched_tensors, **inference_args) @@ -397,8 +399,8 @@ def run_inference( self._framework = "tf" else: self._framework = "torch" - if self._device == 'GPU' and is_gpu_available_torch(): - model.to(torch.device('cuda')) + if self._device == "GPU" and is_gpu_available_torch(): + model.to(torch.device("cuda")) if self._inference_fn: return self._inference_fn( @@ -414,6 +416,6 @@ def run_inference( def get_metrics_namespace(self) -> str: """ Returns: - A namespace for metrics collected by the RunInference transform. + A namespace for metrics collected by the RunInference transform. """ - return 'BeamML_HuggingFaceModelHandler_Tensor' + return "BeamML_HuggingFaceModelHandler_Tensor" From 467e5dd8c37481b359788a7a0f5d131c0f1355c5 Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Tue, 18 Jul 2023 14:46:30 -0400 Subject: [PATCH 23/26] pydoc --- .../ml/inference/huggingface_inference.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index 8c49b37b0e733..a7530a942dfc0 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -171,7 +171,8 @@ def __init__( max_batch_size: Optional[int] = None, large_model: bool = False, **kwargs): - """Implementation of the abstract base class of ModelHandler interface + """ + Implementation of the abstract base class of ModelHandler interface for Hugging Face. This class shouldn't be instantiated directly. Use HuggingFaceModelHandlerKeyedTensor or HuggingFaceModelHandlerTensor. @@ -257,7 +258,8 @@ class HuggingFaceModelHandlerKeyedTensor( HuggingFaceModelHandler[Dict[str, Union[tf.Tensor, torch.Tensor]], PredictionResult, Union[AutoModel, TFAutoModel]]): - """Implementation of the ModelHandler interface for HuggingFace with + """ + Implementation of the ModelHandler interface for HuggingFace with Keyed Tensors for PyTorch/Tensorflow backend. Depending on the type of tensors, the model framework is determined @@ -275,7 +277,8 @@ def run_inference( model: Union[AutoModel, TFAutoModel], inference_args: Optional[Dict[str, Any]] = None ) -> Iterable[PredictionResult]: - """Runs inferences on a batch of Keyed Tensors and returns an Iterable of + """ + Runs inferences on a batch of Keyed Tensors and returns an Iterable of Tensors Predictions. This method stacks the list of Tensors in a vectorized format to optimize @@ -356,7 +359,8 @@ class HuggingFaceModelHandlerTensor(HuggingFaceModelHandler[Union[tf.Tensor, Union[AutoModel, TFAutoModel]] ): - """Implementation of the ModelHandler interface for HuggingFace with + """ + Implementation of the ModelHandler interface for HuggingFace with Tensors for PyTorch/Tensorflow backend. 
Depending on the type of tensors, the model framework is determined @@ -374,7 +378,8 @@ def run_inference( model: Union[AutoModel, TFAutoModel], inference_args: Optional[Dict[str, Any]] = None ) -> Iterable[PredictionResult]: - """Runs inferences on a batch of Tensors and returns an Iterable of + """ + Runs inferences on a batch of Tensors and returns an Iterable of Tensors Predictions. This method stacks the list of Tensors in a vectorized format to optimize From 1a022d9040edb4c874a64329261ee091a8f1a480 Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Thu, 20 Jul 2023 09:56:13 -0400 Subject: [PATCH 24/26] fix pydoc --- .../ml/inference/huggingface_inference.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index a7530a942dfc0..936ae94be66c2 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -178,8 +178,8 @@ def __init__( Use HuggingFaceModelHandlerKeyedTensor or HuggingFaceModelHandlerTensor. Example Usage model:: - pcoll | RunInference(HuggingFaceModelHandlerKeyedTensor( - model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) + pcoll | RunInference(HuggingFaceModelHandlerKeyedTensor( + model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) Args: model_uri (str): path to the pretrained model on the hugging face @@ -266,8 +266,8 @@ class HuggingFaceModelHandlerKeyedTensor( automatically. Example Usage model:: - pcoll | RunInference(HuggingFaceModelHandlerKeyedTensor( - model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) + pcoll | RunInference(HuggingFaceModelHandlerKeyedTensor( + model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) **Supported Versions:** HuggingFaceModelHandler supports transformers>=4.18.0. """ @@ -367,8 +367,8 @@ class HuggingFaceModelHandlerTensor(HuggingFaceModelHandler[Union[tf.Tensor, automatically. Example Usage model: - pcoll | RunInference(HuggingFaceModelHandlerTensor( - model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) + pcoll | RunInference(HuggingFaceModelHandlerTensor( + model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) **Supported Versions:** HuggingFaceModelHandler supports transformers>=4.18.0. 
""" From 2f95adc2ac0bf71f87f5ad23777f6339d7476d97 Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Mon, 24 Jul 2023 10:43:48 -0400 Subject: [PATCH 25/26] updates to keyed model handler --- .../huggingface_language_modeling.py | 1 + .../ml/inference/huggingface_inference.py | 221 +++++++++++------- 2 files changed, 143 insertions(+), 79 deletions(-) diff --git a/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py b/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py index ee69d8bdea032..f6cb3de72b705 100644 --- a/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py +++ b/sdks/python/apache_beam/examples/inference/huggingface_language_modeling.py @@ -138,6 +138,7 @@ def run( model_handler = HuggingFaceModelHandlerKeyedTensor( model_uri=known_args.model_name, model_class=known_args.model_class, + framework='pt', max_batch_size=1) if not known_args.input: text = ( diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index 936ae94be66c2..19b8ed02fa9d8 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -19,7 +19,6 @@ import logging import sys -from abc import ABC from collections import defaultdict from typing import Any from typing import Callable @@ -32,9 +31,7 @@ import tensorflow as tf import torch from apache_beam.ml.inference import utils -from apache_beam.ml.inference.base import ExampleT from apache_beam.ml.inference.base import ModelHandler -from apache_beam.ml.inference.base import ModelT from apache_beam.ml.inference.base import PredictionResult from apache_beam.ml.inference.base import PredictionT from apache_beam.ml.inference.pytorch_inference import _convert_to_device @@ -44,7 +41,6 @@ _LOGGER = logging.getLogger(__name__) __all__ = [ - "HuggingFaceModelHandler", "HuggingFaceModelHandlerTensor", "HuggingFaceModelHandlerKeyedTensor", ] @@ -157,11 +153,17 @@ def _run_inference_tensorflow_keyed_tensor( return utils._convert_to_result(batch, predictions, model_id) -class HuggingFaceModelHandler(ModelHandler[ExampleT, PredictionT, ModelT], ABC): +class HuggingFaceModelHandlerKeyedTensor(ModelHandler[Dict[str, + Union[tf.Tensor, + torch.Tensor]], + PredictionResult, + Union[AutoModel, + TFAutoModel]]): def __init__( self, model_uri: str, model_class: Union[AutoModel, TFAutoModel], + framework: str, device: str = "CPU", *, inference_fn: Optional[Callable[..., Iterable[PredictionT]]] = None, @@ -172,27 +174,29 @@ def __init__( large_model: bool = False, **kwargs): """ - Implementation of the abstract base class of ModelHandler interface - for Hugging Face. This class shouldn't be instantiated directly. - - Use HuggingFaceModelHandlerKeyedTensor or HuggingFaceModelHandlerTensor. + Implementation of the ModelHandler interface for HuggingFace with + Keyed Tensors for PyTorch/Tensorflow backend. Example Usage model:: pcoll | RunInference(HuggingFaceModelHandlerKeyedTensor( - model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) + model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM, + framework='pt')) Args: model_uri (str): path to the pretrained model on the hugging face models hub. model_class: model class to load the repository from model_uri. + framework (str): Framework to use for the model. 'tf' for TensorFlow and + 'pt' for PyTorch. device: For torch tensors, specify device on which you wish to run the model. 
Defaults to CPU. inference_fn: the inference function to use during RunInference. Default is _run_inference_torch_keyed_tensor or _run_inference_tensorflow_keyed_tensor depending on the input type. - load_model_args (Dict[str, Any]): keyword arguments to provide load - options while loading models from Hugging Face Hub. Defaults to None. - inference_args (Dict[str, Any]): Non-batchable arguments + load_model_args (Dict[str, Any]): (Optional) Keyword arguments to provide + load options while loading models from Hugging Face Hub. + Defaults to None. + inference_args (Dict[str, Any]): (Optional) Non-batchable arguments required as inputs to the model's inference function. Unlike Tensors in `batch`, these parameters will not be dynamically batched. Defaults to None. @@ -221,7 +225,7 @@ def __init__( if max_batch_size is not None: self._batching_kwargs["max_batch_size"] = max_batch_size self._large_model = large_model - self._framework = "" + self._framework = framework _validate_constructor_args( model_uri=self._model_uri, model_class=self._model_class) @@ -230,47 +234,11 @@ def load_model(self): """Loads and initializes the model for processing.""" model = self._model_class.from_pretrained( self._model_uri, **self._model_config_args) + if self._framework == 'pt': + if self._device == "GPU" and is_gpu_available_torch: + model.to(torch.device("cuda")) return model - def update_model_path(self, model_path: Optional[str] = None): - self._model_uri = model_path if model_path else self._model_uri - - def get_num_bytes( - self, batch: Sequence[Union[tf.Tensor, torch.Tensor]]) -> int: - """ - Returns: - The number of bytes of data for the Tensors batch. - """ - if self._framework == "tf": - return sum(sys.getsizeof(element) for element in batch) - else: - return sum( - (el.element_size() for tensor in batch for el in tensor.values())) - - def batch_elements_kwargs(self): - return self._batching_kwargs - - def share_model_across_processes(self) -> bool: - return self._large_model - - -class HuggingFaceModelHandlerKeyedTensor( - HuggingFaceModelHandler[Dict[str, Union[tf.Tensor, torch.Tensor]], - PredictionResult, - Union[AutoModel, TFAutoModel]]): - """ - Implementation of the ModelHandler interface for HuggingFace with - Keyed Tensors for PyTorch/Tensorflow backend. - - Depending on the type of tensors, the model framework is determined - automatically. - - Example Usage model:: - pcoll | RunInference(HuggingFaceModelHandlerKeyedTensor( - model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) - - **Supported Versions:** HuggingFaceModelHandler supports transformers>=4.18.0. - """ def run_inference( self, batch: Sequence[Dict[str, Union[tf.Tensor, torch.Tensor]]], @@ -297,13 +265,6 @@ def run_inference( An Iterable of type PredictionResult. """ inference_args = {} if not inference_args else inference_args - if not self._framework: - if isinstance(batch[0], tf.Tensor): - self._framework = "tf" - else: - self._framework = "torch" - if self._device == "GPU" and is_gpu_available_torch(): - model.to(torch.device("cuda")) if self._inference_fn: return self._inference_fn( @@ -316,6 +277,27 @@ def run_inference( return _run_inference_torch_keyed_tensor( batch, model, self._device, inference_args, self._model_uri) + def update_model_path(self, model_path: Optional[str] = None): + self._model_uri = model_path if model_path else self._model_uri + + def get_num_bytes( + self, batch: Sequence[Union[tf.Tensor, torch.Tensor]]) -> int: + """ + Returns: + The number of bytes of data for the Tensors batch. 
+ """ + if self._framework == "tf": + return sum(sys.getsizeof(element) for element in batch) + else: + return sum( + (el.element_size() for tensor in batch for el in tensor.values())) + + def batch_elements_kwargs(self): + return self._batching_kwargs + + def share_model_across_processes(self) -> bool: + return self._large_model + def get_metrics_namespace(self) -> str: """ Returns: @@ -353,25 +335,86 @@ def _default_inference_fn_tensorflow( return utils._convert_to_result(batch, predictions, model_id) -class HuggingFaceModelHandlerTensor(HuggingFaceModelHandler[Union[tf.Tensor, - torch.Tensor], - PredictionResult, - Union[AutoModel, - TFAutoModel]] - ): - """ - Implementation of the ModelHandler interface for HuggingFace with - Tensors for PyTorch/Tensorflow backend. +class HuggingFaceModelHandlerTensor(ModelHandler[Union[tf.Tensor, torch.Tensor], + PredictionResult, + Union[AutoModel, + TFAutoModel]]): + def __init__( + self, + model_uri: str, + model_class: Union[AutoModel, TFAutoModel], + device: str = "CPU", + *, + inference_fn: Optional[Callable[..., Iterable[PredictionT]]] = None, + load_model_args: Optional[Dict[str, Any]] = None, + inference_args: Optional[Dict[str, Any]] = None, + min_batch_size: Optional[int] = None, + max_batch_size: Optional[int] = None, + large_model: bool = False, + **kwargs): + """ + Implementation of the ModelHandler interface for HuggingFace with + Tensors for PyTorch/Tensorflow backend. - Depending on the type of tensors, the model framework is determined - automatically. + Depending on the type of tensors, the model framework is determined + automatically. - Example Usage model: - pcoll | RunInference(HuggingFaceModelHandlerTensor( - model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) + Example Usage model: + pcoll | RunInference(HuggingFaceModelHandlerTensor( + model_uri="bert-base-uncased", model_class=AutoModelForMaskedLM)) + + Args: + model_uri (str): path to the pretrained model on the hugging face + models hub. + model_class: model class to load the repository from model_uri. + device: For torch tensors, specify device on which you wish to + run the model. Defaults to CPU. + inference_fn: the inference function to use during RunInference. + Default is _run_inference_torch_keyed_tensor or + _run_inference_tensorflow_keyed_tensor depending on the input type. + load_model_args (Dict[str, Any]): (Optional) keyword arguments to provide + load options while loading models from Hugging Face Hub. + Defaults to None. + inference_args (Dict[str, Any]): (Optional) Non-batchable arguments + required as inputs to the model's inference function. Unlike Tensors + in `batch`, these parameters will not be dynamically batched. + Defaults to None. + min_batch_size: the minimum batch size to use when batching inputs. + max_batch_size: the maximum batch size to use when batching inputs. + large_model: set to true if your model is large enough to run into + memory pressure if you load multiple copies. Given a model that + consumes N memory and a machine with W cores and M memory, you should + set this to True if N*W > M. + kwargs: 'env_vars' can be used to set environment variables + before loading the model. + + **Supported Versions:** HuggingFaceModelHandler supports + transformers>=4.18.0. 
+ """ + self._model_uri = model_uri + self._model_class = model_class + self._device = device + self._inference_fn = inference_fn + self._model_config_args = load_model_args if load_model_args else {} + self._inference_args = inference_args if inference_args else {} + self._batching_kwargs = {} + self._env_vars = kwargs.get("env_vars", {}) + if min_batch_size is not None: + self._batching_kwargs["min_batch_size"] = min_batch_size + if max_batch_size is not None: + self._batching_kwargs["max_batch_size"] = max_batch_size + self._large_model = large_model + self._framework = "" + + _validate_constructor_args( + model_uri=self._model_uri, model_class=self._model_class) + + def load_model(self): + """Loads and initializes the model for processing.""" + model = self._model_class.from_pretrained( + self._model_uri, **self._model_config_args) + return model - **Supported Versions:** HuggingFaceModelHandler supports transformers>=4.18.0. - """ def run_inference( self, batch: Sequence[Union[tf.Tensor, torch.Tensor]], @@ -403,9 +446,11 @@ def run_inference( if isinstance(batch[0], tf.Tensor): self._framework = "tf" else: - self._framework = "torch" - if self._device == "GPU" and is_gpu_available_torch(): - model.to(torch.device("cuda")) + self._framework = "pt" + + if (self._framework == 'pt' and self._device == "GPU" and + is_gpu_available_torch()): + model.to(torch.device("cuda")) if self._inference_fn: return self._inference_fn( @@ -418,6 +463,24 @@ def run_inference( return _default_inference_fn_torch( batch, model, self._device, inference_args, self._model_uri) + def get_num_bytes( + self, batch: Sequence[Union[tf.Tensor, torch.Tensor]]) -> int: + """ + Returns: + The number of bytes of data for the Tensors batch. + """ + if self._framework == "tf": + return sum(sys.getsizeof(element) for element in batch) + else: + return sum( + (el.element_size() for tensor in batch for el in tensor.values())) + + def batch_elements_kwargs(self): + return self._batching_kwargs + + def share_model_across_processes(self) -> bool: + return self._large_model + def get_metrics_namespace(self) -> str: """ Returns: From 4bdab80c1880dcf54d778184bcc5b921656e5992 Mon Sep 17 00:00:00 2001 From: riteshghorse Date: Mon, 24 Jul 2023 11:17:43 -0400 Subject: [PATCH 26/26] pylints --- .../python/apache_beam/ml/inference/huggingface_inference.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index 19b8ed02fa9d8..35c3a1686c70b 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -33,7 +33,6 @@ from apache_beam.ml.inference import utils from apache_beam.ml.inference.base import ModelHandler from apache_beam.ml.inference.base import PredictionResult -from apache_beam.ml.inference.base import PredictionT from apache_beam.ml.inference.pytorch_inference import _convert_to_device from transformers import AutoModel from transformers import TFAutoModel @@ -166,7 +165,7 @@ def __init__( framework: str, device: str = "CPU", *, - inference_fn: Optional[Callable[..., Iterable[PredictionT]]] = None, + inference_fn: Optional[Callable[..., Iterable[PredictionResult]]] = None, load_model_args: Optional[Dict[str, Any]] = None, inference_args: Optional[Dict[str, Any]] = None, min_batch_size: Optional[int] = None, @@ -345,7 +344,7 @@ def __init__( model_class: Union[AutoModel, TFAutoModel], device: str = 
"CPU", *, - inference_fn: Optional[Callable[..., Iterable[PredictionT]]] = None, + inference_fn: Optional[Callable[..., Iterable[PredictionResult]]] = None, load_model_args: Optional[Dict[str, Any]] = None, inference_args: Optional[Dict[str, Any]] = None, min_batch_size: Optional[int] = None,