Bug fix: missing attention mask in VAE encoder in ACT policy #279

Merged: alexander-soare merged 4 commits into main from thomwolf_2024_06_17_encoder_masking_bug on Jun 19, 2024.
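The PR title says the VAE encoder was missing an attention mask. As a rough illustration only (not LeRobot's actual code), the NumPy sketch below shows what a key padding mask does in scaled dot-product attention: without it, padded tokens leak into every output; with it, their attention weights are exactly zero. The function, shapes, and sequence lengths are assumptions for the demo.

```python
import numpy as np

def attention(q, k, v, key_padding_mask=None):
    # q, k, v: (seq, dim); key_padding_mask: (seq,) bool, True = padded key
    scores = q @ k.T / np.sqrt(k.shape[-1])
    if key_padding_mask is not None:
        # Padded keys get -inf score, hence exactly zero softmax weight.
        scores[:, key_padding_mask] = -np.inf
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)
    return weights @ v

rng = np.random.default_rng(0)
q = k = v = rng.normal(size=(4, 8))
mask = np.array([False, False, True, True])  # last two tokens are padding
out_masked = attention(q, k, v, key_padding_mask=mask)
out_unmasked = attention(q, k, v)
# With the mask, the result is identical to attending over only the
# two real tokens; without it, the padding tokens contaminate the output.
```

In PyTorch this corresponds to the `key_padding_mask` argument of `nn.MultiheadAttention` / `nn.TransformerEncoder`; forgetting to pass it is exactly the class of bug this PR fixes.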
New config file added (83 lines):

# @package _global_

seed: 1000
dataset_repo_id: lerobot/aloha_sim_insertion_human

override_dataset_stats:
  observation.images.top:
    # stats from imagenet, since we use a pretrained vision model
    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)

training:
  offline_steps: 80000
  online_steps: 0
  eval_freq: 10000
  save_freq: 100000
  log_freq: 250
  save_checkpoint: true

  batch_size: 8
  lr: 1e-5
  lr_backbone: 1e-5
  weight_decay: 1e-4
  grad_clip_norm: 10
  online_steps_between_rollouts: 1

  delta_timestamps:
    action: "[i / ${fps} for i in range(${policy.chunk_size})]"

eval:
  n_episodes: 50
  batch_size: 50

# See `configuration_act.py` for more details.
policy:
  name: act

  # Input / output structure.
  n_obs_steps: 1
  chunk_size: 1000
  n_action_steps: 1000

  input_shapes:
    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
    observation.images.top: [3, 480, 640]
    observation.state: ["${env.state_dim}"]
  output_shapes:
    action: ["${env.action_dim}"]

  # Normalization / Unnormalization
  input_normalization_modes:
    observation.images.top: mean_std
    observation.state: mean_std
  output_normalization_modes:
    action: mean_std

  # Architecture.
  # Vision backbone.
  vision_backbone: resnet18
  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
  replace_final_stride_with_dilation: false
  # Transformer layers.
  pre_norm: false
  dim_model: 512
  n_heads: 8
  dim_feedforward: 3200
  feedforward_activation: relu
  n_encoder_layers: 4
  # Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code
  # that means only the first layer is used. Here we match the original implementation by setting this to 1.
  # See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
  n_decoder_layers: 1
  # VAE.
  use_vae: true
  latent_dim: 32
  n_vae_encoder_layers: 4

  # Inference.
  temporal_ensemble_momentum: null

  # Training and loss computation.
  dropout: 0.1
  kl_weight: 10.0
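The `delta_timestamps` entry above is a Python list comprehension as a string; the config machinery substitutes the `${fps}` and `${policy.chunk_size}` interpolations and evaluates it to per-step action timestamps in seconds. A small sketch of what it expands to, assuming fps=50 purely for illustration (the actual value comes from the environment config):

```python
# What the delta_timestamps expression evaluates to after interpolation.
fps = 50          # illustrative; supplied by ${fps} in the real config
chunk_size = 1000 # matches policy.chunk_size in this file
delta_timestamps = [i / fps for i in range(chunk_size)]
# delta_timestamps[0] is 0.0 (the current frame); each subsequent entry
# is one frame period (1/fps seconds) later, covering the whole chunk.
```

So with a 1000-step chunk at 50 fps, the policy is supervised on 20 seconds of future actions per observation.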
New test artifacts (Git LFS pointer files, contents not shown; 3 additions and 0 deletions each):

tests/data/save_policy_to_safetensors/aloha_act_1000_actions/actions.safetensors
tests/data/save_policy_to_safetensors/aloha_act_1000_actions/grad_stats.safetensors
tests/data/save_policy_to_safetensors/aloha_act_1000_actions/output_dict.safetensors
tests/data/save_policy_to_safetensors/aloha_act_1000_actions/param_stats.safetensors
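These artifacts hold reference tensors; the regression test loads them and compares freshly computed policy outputs against the stored values. A minimal sketch of that comparison step, using NumPy dicts as a stand-in for the loaded safetensors contents (function name and tolerances are assumptions, not the repo's actual test code):

```python
import numpy as np

def assert_tensors_match(expected, actual, rtol=1e-5, atol=1e-8):
    """Compare two dicts of named arrays, as an artifact-based regression
    test would after loading a reference .safetensors file."""
    mismatched_keys = set(expected) ^ set(actual)
    if mismatched_keys:
        raise AssertionError(f"key mismatch: {mismatched_keys}")
    for name, exp in expected.items():
        if not np.allclose(exp, actual[name], rtol=rtol, atol=atol):
            raise AssertionError(f"tensor {name!r} differs from reference")

# Illustrative shapes: 1000 chunked actions of dim 14 (two 7-DoF Aloha arms).
expected = {"actions": np.zeros((1000, 14)), "loss": np.array(1.25)}
assert_tensors_match(expected, {k: v.copy() for k, v in expected.items()})
```

In the real test the `expected` dict would come from `safetensors` loading the LFS files above.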
Review conversation:

Reviewer: As you mentioned in your PR message, I think we can avoid adding a whole config file by doing [code suggestion not shown] (and in the test). Feels like a no-brainer to me, but maybe I'm missing something?

Author: Yes, in this case we still need an act_1000_actions config file...

Author: Maybe @aliberts you have an opinion on this question around tests and config files. See the "How it was tested" section in this PR's description.

Reviewer: Oh sorry, I meant also changing to using just the act.json file. Also happy to hear what @aliberts says.

Author: Oh yes, this also doesn't work, because the safetensors files override themselves (cf. my comment in the PR description, which is maybe a bit short) unless we either: [options not shown]

Reviewer: Got it, thanks. My stance on this is that it's better to add test artifacts than to add config files to the source code. That can even mean just moving the yaml file to the artifacts directory. Also happy with both of your proposals. Waiting on @aliberts.

@aliberts: IMO both are fine (the naming trick you went with is okay; although it adds artifacts, they're relatively small). I was concerned with speed too, but from what I'm seeing in the CI it doesn't increase test duration (cf. this branch vs. main).

Long-term, though, I think it will be much better to have more fine-grained tests that exercise individual components of the policies and do away with these artifacts, similar to what's done in transformers. My motivations for this are: [list not shown]

For now it's okay, because this granularity allows us to iterate faster while still having guardrails. Happy to hear your opinions on this as well.
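The fine-grained component tests argued for above would assert properties of individual policy pieces directly instead of diffing against stored artifacts. A hedged sketch of what one such test could look like; the `chunk_actions` component and its behavior are hypothetical, invented only to illustrate the testing style:

```python
import numpy as np

def chunk_actions(actions, n_action_steps):
    """Hypothetical component: split a predicted action sequence into
    fixed-size execution chunks, padding the tail by repeating the
    last action."""
    pad = (-len(actions)) % n_action_steps
    padded = np.concatenate([actions, np.repeat(actions[-1:], pad, axis=0)])
    return padded.reshape(-1, n_action_steps, actions.shape[-1])

def test_chunk_actions_shapes():
    # 1000 actions of dim 14 split into 10 chunks of 100, no padding needed.
    out = chunk_actions(np.zeros((1000, 14)), n_action_steps=100)
    assert out.shape == (10, 100, 14)

test_chunk_actions_shapes()
```

Tests like this pin down each component's contract (shapes, padding, masking) without any binary reference files to regenerate when implementations change.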