From 2188497e03e5465d16708ae024297e4b61827b6e Mon Sep 17 00:00:00 2001
From: Ronghang Hu
Date: Wed, 14 Aug 2024 06:34:46 +0000
Subject: [PATCH] open `README.md` with unicode (to support Hugging Face emoji); fix various typos (close #217, #66, #67, #69, #91, #126, #127, #145)

---
 sam2/modeling/position_encoding.py | 2 +-
 sam2/modeling/sam2_base.py         | 2 +-
 sam2/sam2_image_predictor.py       | 2 +-
 sam2/sam2_video_predictor.py       | 6 +++---
 sam2/utils/misc.py                 | 4 ++--
 sav_dataset/sav_evaluator.py       | 2 +-
 sav_dataset/utils/sav_benchmark.py | 2 +-
 setup.py                           | 2 +-
 8 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/sam2/modeling/position_encoding.py b/sam2/modeling/position_encoding.py
index cf53e30f..52ac2267 100644
--- a/sam2/modeling/position_encoding.py
+++ b/sam2/modeling/position_encoding.py
@@ -16,7 +16,7 @@ class PositionEmbeddingSine(nn.Module):
     """
     This is a more standard version of the position embedding, very similar to the one
-    used by the Attention is all you need paper, generalized to work on images.
+    used by the Attention Is All You Need paper, generalized to work on images.
     """
 
     def __init__(
diff --git a/sam2/modeling/sam2_base.py b/sam2/modeling/sam2_base.py
index 50d1655c..224a8c1b 100644
--- a/sam2/modeling/sam2_base.py
+++ b/sam2/modeling/sam2_base.py
@@ -642,7 +642,7 @@ def _prepare_memory_conditioned_features(
                 pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W)
                 return pix_feat_with_mem
 
-            # Use a dummy token on the first frame (to avoid emtpy memory input to tranformer encoder)
+            # Use a dummy token on the first frame (to avoid empty memory input to transformer encoder)
             to_cat_memory = [self.no_mem_embed.expand(1, B, self.mem_dim)]
             to_cat_memory_pos_embed = [self.no_mem_pos_enc.expand(1, B, self.mem_dim)]
diff --git a/sam2/sam2_image_predictor.py b/sam2/sam2_image_predictor.py
index f6f9a5a1..463462c5 100644
--- a/sam2/sam2_image_predictor.py
+++ b/sam2/sam2_image_predictor.py
@@ -180,7 +180,7 @@ def predict_batch(
         normalize_coords=True,
     ) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
         """This function is very similar to predict(...), however it is used for batched mode, when the model is expected to generate predictions on multiple images.
-        It returns a tupele of lists of masks, ious, and low_res_masks_logits.
+        It returns a tuple of lists of masks, ious, and low_res_masks_logits.
         """
         assert self._is_batch, "This function should only be used when in batched mode"
         if not self._is_image_set:
diff --git a/sam2/sam2_video_predictor.py b/sam2/sam2_video_predictor.py
index 78284e24..8dd43ad5 100644
--- a/sam2/sam2_video_predictor.py
+++ b/sam2/sam2_video_predictor.py
@@ -44,7 +44,7 @@ def init_state(
         offload_state_to_cpu=False,
         async_loading_frames=False,
     ):
-        """Initialize a inference state."""
+        """Initialize an inference state."""
         compute_device = self.device  # device of the model
         images, video_height, video_width = load_video_frames(
             video_path=video_path,
@@ -589,7 +589,7 @@ def propagate_in_video_preflight(self, inference_state):
         # to `propagate_in_video_preflight`).
         consolidated_frame_inds = inference_state["consolidated_frame_inds"]
         for is_cond in [False, True]:
-            # Separately consolidate conditioning and non-conditioning temp outptus
+            # Separately consolidate conditioning and non-conditioning temp outputs
             storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
             # Find all the frames that contain temporary outputs for any objects
             # (these should be the frames that have just received clicks for mask inputs
@@ -598,7 +598,7 @@ def propagate_in_video_preflight(self, inference_state):
             for obj_temp_output_dict in temp_output_dict_per_obj.values():
                 temp_frame_inds.update(obj_temp_output_dict[storage_key].keys())
             consolidated_frame_inds[storage_key].update(temp_frame_inds)
-            # consolidate the temprary output across all objects on this frame
+            # consolidate the temporary output across all objects on this frame
             for frame_idx in temp_frame_inds:
                 consolidated_out = self._consolidate_temp_output_across_obj(
                     inference_state, frame_idx, is_cond=is_cond, run_mem_encoder=True
diff --git a/sam2/utils/misc.py b/sam2/utils/misc.py
index e2d39a08..525e8cb3 100644
--- a/sam2/utils/misc.py
+++ b/sam2/utils/misc.py
@@ -68,7 +68,7 @@ def mask_to_box(masks: torch.Tensor):
     compute bounding box given an input mask
 
     Inputs:
-    - masks: [B, 1, H, W] boxes, dtype=torch.Tensor
+    - masks: [B, 1, H, W] masks, dtype=torch.Tensor
 
     Returns:
     - box_coords: [B, 1, 4], contains (x, y) coordinates of top left and bottom right box corners, dtype=torch.Tensor
@@ -120,7 +120,7 @@ def __init__(
         self.offload_video_to_cpu = offload_video_to_cpu
         self.img_mean = img_mean
         self.img_std = img_std
-        # items in `self._images` will be loaded asynchronously
+        # items in `self.images` will be loaded asynchronously
         self.images = [None] * len(img_paths)
         # catch and raise any exceptions in the async loading thread
         self.exception = None
diff --git a/sav_dataset/sav_evaluator.py b/sav_dataset/sav_evaluator.py
index 1c319e10..d4b0ef0a 100644
--- a/sav_dataset/sav_evaluator.py
+++ b/sav_dataset/sav_evaluator.py
@@ -72,7 +72,7 @@
 parser.add_argument(
     "--do_not_skip_first_and_last_frame",
     help="In SA-V val and test, we skip the first and the last annotated frames in evaluation. "
-    "Set this to true for evaluation on settings that doen't skip first and last frames",
+    "Set this to true for evaluation on settings that don't skip first and last frames",
     action="store_true",
 )
diff --git a/sav_dataset/utils/sav_benchmark.py b/sav_dataset/utils/sav_benchmark.py
index babb330e..c4b2444f 100644
--- a/sav_dataset/utils/sav_benchmark.py
+++ b/sav_dataset/utils/sav_benchmark.py
@@ -183,7 +183,7 @@ def _seg2bmap(seg, width=None, height=None):
     assert not (
         width > w | height > h | abs(ar1 - ar2) > 0.01
-    ), "Can" "t convert %dx%d seg to %dx%d bmap." % (w, h, width, height)
+    ), "Cannot convert %dx%d seg to %dx%d bmap." % (w, h, width, height)
 
     e = np.zeros_like(seg)
     s = np.zeros_like(seg)
diff --git a/setup.py b/setup.py
index 92ee0f34..ebef97cd 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,7 @@
 LICENSE = "Apache 2.0"
 
 # Read the contents of README file
-with open("README.md", "r") as f:
+with open("README.md", "r", encoding="utf-8") as f:
     LONG_DESCRIPTION = f.read()
 
 # Required dependencies
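
Note on the setup.py hunk above: without an explicit encoding argument, open() decodes text with the platform's locale encoding, so reading a README containing the Hugging Face emoji can raise UnicodeDecodeError on non-UTF-8 locales (e.g. Windows cp1252), which is why the commit passes encoding="utf-8". The snippet below is a minimal sketch of that failure mode and the fix, not part of the patch; the file name and contents are illustrative only.

# Illustrative sketch only (not part of the commit): why encoding="utf-8" matters.
import locale
from pathlib import Path

readme = Path("demo_readme.md")  # hypothetical file standing in for README.md
readme.write_text("SAM 2 demo 🤗", encoding="utf-8")  # the emoji is multi-byte in UTF-8

print("locale preferred encoding:", locale.getpreferredencoding(False))
try:
    # Default behavior: decode with the locale's preferred encoding, which may
    # fail on platforms where that encoding is not UTF-8 (e.g. cp1252).
    readme.read_text()
except UnicodeDecodeError as err:
    print("default-encoding read failed:", err)

# Patched behavior: decode the file explicitly as UTF-8, which handles the emoji.
long_description = readme.read_text(encoding="utf-8")
print(long_description)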