diff --git a/.gitignore b/.gitignore
index e43b0f9..73d4ca5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,161 @@
 .DS_Store
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
diff --git a/README.md b/README.md
index 3e8df5d..17a4884 100644
--- a/README.md
+++ b/README.md
@@ -59,10 +59,11 @@ export SPRINT=[SPRINT_DOWNLOAD_LOCATION]
 ```
 
 ## 2. Downloading the data
 
+### 2.1 Model Training Data
 You need to pre-train models to run zero-shot or finetuning experiments. If you don't want to pre-train a model yourself, you can skip to step 3 as you don't need the pre-training dataset file.
 
-We have two options for obtaining the ALFRED dataset---either download the data from here: [Google Drive Link](https://drive.google.com/file/d/1ZgKDgG9Fv491GVb9rxIVNJpViPNKFWMF) or set it up yourself with the instructions in the [README in the `datasets` folder](datasets/README.md).
+Download the ALFRED dataset here: [Google Drive Link](https://drive.google.com/file/d/1ZgKDgG9Fv491GVb9rxIVNJpViPNKFWMF).
 
 You can use [Gdown](https://github.com/wkentaro/gdown) to directly download the dataset to your server/computer at the desired location (18GB download):
 ```
@@ -78,6 +79,15 @@ Once the dataset is downloaded (`px_llama_13b.tar.gz`) simply untar it (36GB after untarring):
 ```
 tar -xvzf px_llama_13b.tar.gz
 ```
+### 2.2 ALFRED Evaluation Data
+To run evals and fine-tuning experiments, you must extract the ALFRED evaluation data we have processed ([Google Drive Link](https://drive.google.com/file/d/1MHDrKSRmyag-DwipyLj-i-BbKU_dxbne/view)):
+
+```
+cd [SPRINT_REPO_LOCATION]
+cd sprint/alfred/data
+gdown 1MHDrKSRmyag-DwipyLj-i-BbKU_dxbne
+tar -xvzf json_2.1.0_merge_goto.tar.gz
+```
 
 ## 3. Setting up WandB
 We log using WandB. First create a wandb account if you don't already have one [here](https://wandb.ai).
@@ -87,7 +97,7 @@ Finally, fill in `WANDB_ENTITY_NAME, WANDB_PROJECT_NAME` in the file `utils/wand
 
 ## 4. Pre-training a Model
 
-You can either pre-train a model yourself or download a pre-trained checkpoint. Pre-trained model checkpoints can be found here: [Google Drive Link](https://drive.google.com/file/d/1PDNX7Z1BBoB3pmeBTfOgNxe2I53kUoS0).
+You can either pre-train a model yourself or download a pre-trained checkpoint. Pre-trained model checkpoints can be found here: [Google Drive Link](https://drive.google.com/file/d/1PDNX7Z1BBoB3pmeBTfOgNxe2I53kUoS0/view).
 
 Otherwise, run the following command from the base SPRINT repo location to train our model, SPRINT:
 
@@ -150,8 +160,10 @@ Checkpoints are saved in `sprint_saved_rl_models/`
 
 To run SayCan zero-shot evals, pre-train the L-BC baseline above and then:
 
 ```
-TODO: coming soon
+python sprint/saycan_eval.py --model_checkpoint_dir [L-BC PATH] --env_type {eval_instruct, eval_length, eval_scene} --run_group [RUN_GROUP] --experiment_name [EXP_NAME] --llm_gpus [GPU]
 ```
+The optional `llm_gpus` flag takes a comma-separated list of GPU IDs on which to place the LLM, since it may be too large to fit on the same GPU as the model being evaluated.
+
 The 13b llama model we used is no longer available on huggingface, so to fully reproduce this you should follow the llama instructions to download LLaMA-13B. Right now this script defaults to LLaMA-7B, but the empirically the performance is very similar.
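For concreteness, the SayCan eval command introduced in the README hunk above might be invoked as follows; the checkpoint directory, run group, and experiment name are illustrative placeholders (not paths shipped with the repo), and `--llm_gpus` is given a single GPU ID here but, per the README text above, also accepts a comma-separated list:

```
python sprint/saycan_eval.py \
    --model_checkpoint_dir sprint_saved_rl_models/l_bc_baseline \
    --env_type eval_instruct \
    --run_group saycan_zero_shot \
    --experiment_name saycan_eval_instruct_llama7b \
    --llm_gpus 1
```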
diff --git a/sprint/datasets/large_language_model.py b/sprint/datasets/large_language_model.py index 7cf21ac..a8c2231 100644 --- a/sprint/datasets/large_language_model.py +++ b/sprint/datasets/large_language_model.py @@ -2,6 +2,7 @@ from tqdm import tqdm from transformers import AutoModelForCausalLM, AutoTokenizer import torch +from threading import Lock from sprint.utils.utils import process_skill_strings @@ -48,8 +49,8 @@ class LargeLanguageModel: def __init__(self, config): assert ( - "opt" in config.llm_model or "gpt" in config.llm_model - ), "No tokenizer support for non-gpt/opt models" + "opt" in config.llm_model or "gpt" in config.llm_model or "llama" in config.llm_model + ), "No tokenizer support for non-gpt/opt/llama models" self.config = config self.llm_gpus = config.llm_gpus self.llm_max_new_tokens = config.llm_max_new_tokens @@ -76,6 +77,7 @@ def __init__(self, config): pad_token_id=self.tokenizer.eos_token_id, device_map="auto", ) + self.lock = Lock() self.next_skill_top_p = 0.9 self.next_skill_temp = 0.8 self.ret_tensor_type = "pt" @@ -320,6 +322,7 @@ def _get_non_generated_logprobs_hf( ): second_skill_start_pos = second_skill_attn_mask.sum(-1) with torch.no_grad(): + # so that even if multiple threads are using it at once, our maximum batch size won't be exceeded with self.lock: logits = ( self.model( diff --git a/sprint/datasets/llm_aggregate_dataset.py b/sprint/datasets/llm_aggregate_dataset.py index 1dc2e9e..9a894dd 100644 --- a/sprint/datasets/llm_aggregate_dataset.py +++ b/sprint/datasets/llm_aggregate_dataset.py @@ -81,7 +81,7 @@ def main(): "--llm_model", type=str, default="decapoda-resaerch/llama-13b-hf", - help="which model to use for the large language model. For optimal performance, use GPT-J-6B or bigger. For speed and decent performance, opt-2.7b is fine.", + help="which model to use for the large language model.", choices=[ "facebook/opt-125m", "facebook/opt-350m", diff --git a/sprint/eval.py b/sprint/eval.py index 43ecc45..ea74c38 100644 --- a/sprint/eval.py +++ b/sprint/eval.py @@ -217,7 +217,6 @@ def main(config): ) model_checkpoint_dir = config.model_checkpoint_dir print(model_checkpoint_dir) - config.use_llm = False # set use llm to false so it doesn't load the LLM list_of_checkpoints, list_of_epochs = get_list_of_checkpoints(model_checkpoint_dir) # load one of the checkpoints' configs diff --git a/sprint/rollouts/rollout.py b/sprint/rollouts/rollout.py index a717062..71805a8 100644 --- a/sprint/rollouts/rollout.py +++ b/sprint/rollouts/rollout.py @@ -11,12 +11,9 @@ ) from sprint.utils.utils import ( get_action_from_agent, - AttrDict, load_object_class, - process_skill_strings, ) -# import random path = "."
import sys @@ -34,15 +31,7 @@ DATA_PATH = ( f"{os.environ['SPRINT']}/sprint/alfred/data/json_2.1.0_merge_goto/preprocess" ) -# VISUAL_MODEL = "resnet18" REWARD_CONFIG_PATH = f"{os.environ['SPRINT']}/sprint/alfred/models/config/rewards.json" -# DEFAULT_NUM_STEPS = 30 -# EVAL_STEP_RATIO = 2 -# TRAIN_STEP_RATIO = 2 - -# global_task_args = AttrDict() -# global_task_args.reward_config = REWARD_CONFIG -# global_task_args.visual_model = VISUAL_MODEL def run_policy( diff --git a/sprint/rollouts/saycan_rollout.py b/sprint/rollouts/saycan_rollout.py index 25fb4e3..3c12154 100644 --- a/sprint/rollouts/saycan_rollout.py +++ b/sprint/rollouts/saycan_rollout.py @@ -1,55 +1,47 @@ import numpy as np from sprint.alfred.env.thor_env import ThorEnv -from sprint.rollouts.rollout import sample_task, setup_scene import torch from sprint.models.saycan_llm import SaycanPlanner +from sprint.rollouts.gym_env import ALFREDRLEnv +from sprint.rollouts.rollout import DATA_PATH, REWARD_CONFIG_PATH, vocab_path, vocab_obj_path import revtok from sprint.utils.data_utils import ( remove_spaces_and_lower, + pad_sequence, numericalize, process_annotation, ) import os import sys -from utils import generate_video -path = "." -sys.path.append(os.path.join(path)) -sys.path.append(os.path.join(path, "gen")) -sys.path.append(os.path.join(path, "models")) -sys.path.append(os.path.join(path, "models", "eval")) from sprint.utils.utils import ( load_object_class, - AttrDict, get_action_from_agent, process_skill_strings, ) - +path = "." +import sys +sys.path.append(os.path.join(path)) +sys.path.append(os.path.join(path, "gen")) +sys.path.append(os.path.join(path, "models")) +sys.path.append(os.path.join(path, "models", "eval")) from PIL import Image from PIL import ImageFont from PIL import ImageDraw -vocab = torch.load(f"{os.environ['SPRINT']}/sprint/models/low_level_actions.vocab") -vocab_obj = torch.load(f"{os.environ['SPRINT']}/sprint/models/obj_cls.vocab") - -DATA_PATH = ( - f"{os.environ['SPRINT']}/sprint/alfred/data/json_2.1.0_merge_goto/preprocess" -) -VISUAL_MODEL = "resnet18" -REWARD_CONFIG = f"{os.environ['SPRINT']}/sprint/alfred/models/config/rewards.json" -DEFAULT_NUM_STEPS = 30 -EVAL_STEP_RATIO = 2 -TRAIN_STEP_RATIO = 2 - -global_task_args = AttrDict() -global_task_args.reward_config = REWARD_CONFIG -global_task_args.visual_model = VISUAL_MODEL +def encode(annotations, vocab_word, convert_to_tensor=False): + if convert_to_tensor: + return pad_sequence( + [process_annotation(a, vocab_word).long() for a in annotations], + batch_first=True, + padding_value=0, + ) + return [process_annotation(a, vocab_word).long() for a in annotations] def get_next_skill_from_saycan( model, saycan_planner: SaycanPlanner, - sentence_embedder, high_level_skill: str, primitive_skill_annotations: list[str], already_completed_skills: list[str], @@ -62,8 +54,8 @@ def get_next_skill_from_saycan( [high_level_skill], ) # get value logprobs - primitive_embeddings = sentence_embedder.encode( - primitive_skill_annotations, convert_to_tensor=True + primitive_embeddings = encode( + primitive_skill_annotations, model.vocab_word, convert_to_tensor=True ) values = [] for primitive_embedding in primitive_embeddings: @@ -78,8 +70,9 @@ def get_next_skill_from_saycan( combined_affordance_probs = llm_probs * values # now take the argmax next_skill_idx = torch.argmax(combined_affordance_probs).item() - feat["language_ann"] = sentence_embedder.encode( + feat["language_ann"] = encode( primitive_skill_annotations[next_skill_idx : next_skill_idx + 1], + 
model.vocab_word, convert_to_tensor=True, ).to( device @@ -88,393 +81,279 @@ def get_next_skill_from_saycan( def run_policy( - env, + env: ALFREDRLEnv, model, - saycan_planner, - sentence_embedder, + saycan_planner: SaycanPlanner, visual_preprocessor, device, - subgoal_pool, max_skill_length, - goal_states, - eval_split, - num_subgoals_in_pool, deterministic, log_video, epsilon, selected_specific_subgoal=None, - eval=True, ): # actually does the rollout. This function is a bit tricky to integrate with ALFRED's required code. model.eval() - if eval_split == "eval_instruct": - json_path_name = "train" - elif eval_split == "eval_length": - json_path_name = "train" - else: - json_path_name = "valid_unseen" - with torch.no_grad(): - eval_idx, traj_data, r_idx, actually_selected_subgoal = sample_task( - json_path_name, - subgoal_pool, - num_subgoals_in_pool, - selected_specific_subgoal, - ) - num_subgoals_to_complete = len( - subgoal_pool[actually_selected_subgoal]["primitive_skills"] - ) - num_primitive_steps_in_task = sum( - [ - len(primitive_skill["api_action"]) - for primitive_skill in subgoal_pool[actually_selected_subgoal][ - "primitive_skills" - ] + ob, info = env.reset(selected_specific_subgoal) + # build the features to give as input to the actual model + feat = {} + # initialize frames and action buffers for the transformer + ob = visual_preprocessor.featurize([Image.fromarray(ob)], batch=1) + feat["frames_buffer"] = ob.unsqueeze(0).to(device) + feat["action_traj"] = torch.zeros(1, 0).long().to(device) + feat["object_traj"] = torch.zeros(1, 0).long().to(device) + + chained_subgoal_instr = info.lang_instruction + actually_selected_subgoal = info.task + # get all primitive skills from this env + subgoal_pool = env.subgoal_pool + # subgoal info + primitive_skills_to_choose_from = process_skill_strings( + [ + subgoal_pool[actually_selected_subgoal]["primitive_skills"][0][ + "annotations" ] - ) - if eval: - MAX_STEPS = num_primitive_steps_in_task * EVAL_STEP_RATIO - else: - MAX_STEPS = num_primitive_steps_in_task * TRAIN_STEP_RATIO - - setup_scene(env, traj_data, r_idx, global_task_args) - - expert_init_actions = [ - a["discrete_action"] - for a in traj_data["plan"]["low_actions"] - if a["high_idx"] < eval_idx ] - all_instructions = process_skill_strings( - [ - subgoal_pool[actually_selected_subgoal]["primitive_skills"][0][ - "annotations" - ] - ] - + [subgoal_pool[actually_selected_subgoal]["annotation"]] - ) - primitive_skills = all_instructions[:-1] - # subgoal info - chained_subgoal_instr = all_instructions[-1] - - ann_l = revtok.tokenize(remove_spaces_and_lower(chained_subgoal_instr)) - ann_l = [w.strip().lower() for w in ann_l] - ann_token = numericalize(model.vocab_word, ann_l, train=False) - ann_token = torch.tensor(ann_token).long() - feat = {} - feat["language_ann"] = ann_token.to(device).unsqueeze(0) - if goal_states is not None: - task_name = subgoal_pool[actually_selected_subgoal]["task"] - subgoal_str = list() - for p in range(eval_idx, (eval_idx + num_subgoals_to_complete)): - subgoal_str.append(str(p)) - trial_name = task_name + "-" + "_".join(subgoal_str) - - task_index = goal_states[json_path_name]["trial_name"].index(trial_name) - trial_goal = goal_states[json_path_name]["goal_state"][task_index] - feat["state_goal"] = trial_goal.reshape([1, 5, 512, 7, 7]).to(device) - - completed_eval_idx = eval_idx + num_subgoals_to_complete - 1 - done = 0 - t = 0 - - obs = [] - acs = [] - obj_acs = [] - dones = [] - env_rewards = [] - str_act = [] - video_frames = [] - completed_skills 
= [] - predicted_skills = [] - value_predict = [] - while not done: - # break if max_steps reached - if t >= MAX_STEPS + len(expert_init_actions): - break - - if (len(expert_init_actions) == 0 and t == 0) or ( - len(expert_init_actions) != 0 and t == len(expert_init_actions) - ): - curr_image = Image.fromarray(np.uint8(env.last_event.frame)) - - curr_frame = ( - visual_preprocessor.featurize([curr_image], batch=1) - .unsqueeze(0) - .to(device) - ) - feat["frames_buffer"] = curr_frame.to(device) - feat["action_traj"] = torch.zeros(1, 0).long().to(device) - feat["object_traj"] = torch.zeros(1, 0).long().to(device) - # get first saycan planner action - saycan_selected_next_skill = get_next_skill_from_saycan( - model, - saycan_planner, - sentence_embedder, - chained_subgoal_instr, - primitive_skills, - completed_skills, - feat, - device, - ) - predicted_skills.append(saycan_selected_next_skill) - - if t < len(expert_init_actions): - # get expert action - - action = expert_init_actions[t] - # print("expert_action", action) - compressed_mask = ( - action["args"]["mask"] if "mask" in action["args"] else None - ) - mask = ( - env.decompress_mask(compressed_mask) - if compressed_mask is not None - else None - ) - - success, _, _, err, _ = env.va_interact( - action["action"], interact_mask=mask, smooth_nav=True, debug=False - ) - if not success: - print("expert initialization failed, re-sampling") - return run_policy( - env, - model, - visual_preprocessor, - device, - subgoal_pool, - max_skill_length, - eval_split, - num_subgoals_in_pool, - deterministic, - log_video, - epsilon, - selected_specific_subgoal, - eval, - ) - _, _ = env.get_transition_reward() - else: - obs.append(curr_frame.cpu().detach().squeeze(1)) - video_frames.append(np.uint8(env.last_event.frame)) - - (action, output_object, _) = get_action_from_agent( - model, - feat, - vocab, - vocab_obj, - env, - deterministic=deterministic, - epsilon=epsilon, - ret_value=False, - ) - value_output = None - if value_output != None: - value_output = value_output.squeeze().cpu().detach().numpy() - value_predict.append(value_output) - try: - _, _ = env.to_thor_api_exec(action, output_object, smooth_nav=True) - except Exception as e: - # if there's an exception from running, then we'll just try again. 
- # print(e) - # record this exception in exceptions.txt - with open("exceptions.txt", "a") as f: - f.write(str(e) + "\n") - - next_frame = np.uint8(env.last_event.frame) - next_frame = Image.fromarray(next_frame) - next_frame = ( - visual_preprocessor.featurize([next_frame], batch=1) - .unsqueeze(0) - .to(device) - ) - curr_frame = next_frame - - feat["frames_buffer"] = torch.cat( - [feat["frames_buffer"], next_frame], dim=1 - ).to(device) - # - 2 because ET dataloader had a -1 for padding reasons on the action, and we did - 1 when processing ALFRED actions to get rid of - # the extraneous END action - tensor_action = torch.tensor( - vocab["action_low"].word2index(action) - 2 - ).to(device) - feat["action_traj"] = torch.cat( - [feat["action_traj"], tensor_action.unsqueeze(0).unsqueeze(0)], - dim=1, - ).to(device) - obj_index = load_object_class(vocab_obj, output_object) - feat["object_traj"] = torch.cat( - [ - feat["object_traj"], - torch.tensor(obj_index).unsqueeze(0).unsqueeze(0).to(device), - ], - dim=1, - ) - feat["frames_buffer"] = feat["frames_buffer"][:, -max_skill_length:] - feat["action_traj"] = feat["action_traj"][:, -max_skill_length + 1 :] - feat["object_traj"] = feat["object_traj"][:, -max_skill_length + 1 :] - - acs.append(tensor_action.cpu()) - str_act.append( - dict( - action=action, - object=( - output_object.split("|")[0] - if output_object is not None - else None - ), - ) - ) - obj_acs.append(obj_index) + ) - t_success = env.last_event.metadata["lastActionSuccess"] - if not t_success: - # env error logging just in case, but we'll just continue - # these aren't critical errors, just sim errors - err = env.last_event.metadata["errorMessage"] - exception_string = f"Failed to execute action {action}. Object: {output_object}. Error: {err}" - with open("exceptions.txt", "a") as f: - f.write(exception_string + "\n") + ann_l = revtok.tokenize(remove_spaces_and_lower(chained_subgoal_instr)) + ann_l = [w.strip().lower() for w in ann_l] + ann_token = numericalize(model.vocab_word, ann_l, train=False) + ann_token = torch.tensor(ann_token).long() + + obs = [] + acs = [] + obj_acs = [] + dones = [] + env_rewards = [] + str_act = [] + video_frames = [] + value_predict = [] + predicted_skills = [] + completed_skills = [] + done, timeout = False, False + # get first saycan planner action + saycan_selected_next_skill = get_next_skill_from_saycan( + model, + saycan_planner, + chained_subgoal_instr, + primitive_skills_to_choose_from, + completed_skills, + feat, + device, + ) + predicted_skills.append(saycan_selected_next_skill) - # MUST call get_transition_reward to update the environment - _, _ = env.get_transition_reward() - curr_subgoal_idx = env.get_subgoal_idx() - partial_success = 0 - if curr_subgoal_idx == completed_eval_idx: - done = 1 - partial_success = 1 - elif curr_subgoal_idx == eval_idx: - eval_idx += 1 - partial_success = 1 - completed_skills.append(saycan_selected_next_skill) - feat["frames_buffer"] = next_frame.unsqueeze(0).to(device) - feat["action_traj"] = torch.zeros(1, 0).long().to(device) - feat["object_traj"] = torch.zeros(1, 0).long().to(device) - saycan_selected_next_skill = get_next_skill_from_saycan( - model, - saycan_planner, - sentence_embedder, - chained_subgoal_instr, - primitive_skills, - completed_skills, - feat, - device, - ) - predicted_skills.append(saycan_selected_next_skill) - env_rewards.append(partial_success) + while not (done or timeout): + obs.append(ob.cpu().detach().squeeze(1)) + video_frames.append(env.obs) - t = t + 1 - 
subgoal_last_frame_video = np.uint8(env.last_event.frame) - video_frames.append(subgoal_last_frame_video) - (*_,) = get_action_from_agent( + (action, output_object, _) = get_action_from_agent( model, feat, - vocab, - vocab_obj, + env.vocab, + env.vocab_obj, env, deterministic=deterministic, epsilon=epsilon, ret_value=False, ) - obs.append(next_frame.cpu().detach().squeeze(1)) # next obs value_output = None if value_output != None: value_output = value_output.squeeze().cpu().detach().numpy() value_predict.append(value_output) - if log_video: - value_font = ImageFont.truetype("FreeMono.ttf", 20) - action_font = ImageFont.truetype("FreeMono.ttf", 14) - gif_logs = [] - for frame_number in range(len(video_frames)): - img = video_frames[frame_number] - img = Image.fromarray(img) - draw = ImageDraw.Draw(img) - if len(value_predict) != 0: - value_log = value_predict[frame_number] - # draw.text( - # (1, 280), - # "Value: %.3f" % (value_log), - # fill=(255, 255, 255), - # font=value_font, - # ) - if frame_number != 0: - reward_log = env_rewards[frame_number - 1] - # draw.text( - # (1, 260), - # "Reward: %.1f" % (reward_log), - # fill=(255, 255, 255), - # font=value_font, - # ) - return_log = sum(env_rewards[0:frame_number]) - # draw.text( - # (150, 260), - # "Return: %.1f" % (return_log), - # fill=(255, 255, 255), - # font=value_font, - # ) - if frame_number != len(video_frames) - 1: - action_log, object_log = ( - str_act[frame_number]["action"], - str_act[frame_number]["object"], - ) - # draw.text( - # (1, 1), - # f"Action: {action_log}\nObject: {str(object_log)}", - # fill=(255, 255, 255), - # font=action_font, - # ) - - log_images = np.array(img) - gif_logs.append(log_images) - - video_frames = np.asarray(gif_logs) - video_frames = np.transpose(video_frames, (0, 3, 1, 2)) - - rewards = torch.tensor(env_rewards, dtype=torch.float) - dones = torch.zeros(len(rewards)) - dones[-1] = done - vid_caption = f"{chained_subgoal_instr[0] if isinstance(chained_subgoal_instr, list) else chained_subgoal_instr}: {'SUCCESS' if done else 'FAIL'}. Return: {rewards.sum()}/{num_subgoals_to_complete}." 
- ground_truth_sequence = " ".join(primitive_skills) - return dict( - completed_skills=" ".join(completed_skills), - predicted_skills=" ".join(predicted_skills), - ground_truth_sequence=ground_truth_sequence, - high_level_skill=chained_subgoal_instr, - rews=rewards, - dones=dones, - video_frames=video_frames if log_video else None, - video_caption=vid_caption, - chained_language_instruction=process_annotation( - chained_subgoal_instr, model.vocab_word, train=False - ).long(), - skill_length=num_subgoals_to_complete, + action_dict = dict(action=action, object=output_object) + next_ob, rew, done, info = env.step(action_dict) + timeout = info.timeout + next_ob = Image.fromarray(next_ob) + next_ob = ( + visual_preprocessor.featurize([next_ob], batch=1) + .unsqueeze(0) + .to(device) + ) + ob = next_ob + feat["frames_buffer"] = torch.cat( + [feat["frames_buffer"], next_ob], dim=1 + ).to(device) + # - 2 because ET dataloader had a -1 for padding reasons on the action, and we did - 1 when processing ALFRED actions to get rid of + # the extraneous END action + tensor_action = torch.tensor( + env.vocab["action_low"].word2index(action) - 2 + ).to(device) + feat["action_traj"] = torch.cat( + [feat["action_traj"], tensor_action.unsqueeze(0).unsqueeze(0)], + dim=1, + ).to(device) + obj_index = load_object_class(env.vocab_obj, output_object) + feat["object_traj"] = torch.cat( + [ + feat["object_traj"], + torch.tensor(obj_index).unsqueeze(0).unsqueeze(0).to(device), + ], + dim=1, + ) + # move onto the next skill predicted by saycan + if rew == 1 and not done: + completed_skills.append(saycan_selected_next_skill) + feat["frames_buffer"] = next_ob.to(device) + feat["action_traj"] = torch.zeros(1, 0).long().to(device) + feat["object_traj"] = torch.zeros(1, 0).long().to(device) + saycan_selected_next_skill = get_next_skill_from_saycan( + model, + saycan_planner, + chained_subgoal_instr, + primitive_skills_to_choose_from, + completed_skills, + feat, + device, + ) + predicted_skills.append(saycan_selected_next_skill) + feat["frames_buffer"] = feat["frames_buffer"][:, -max_skill_length:] + feat["action_traj"] = feat["action_traj"][:, -max_skill_length + 1 :] + feat["object_traj"] = feat["object_traj"][:, -max_skill_length + 1 :] + + env_rewards.append(rew) + + acs.append(tensor_action.cpu()) + str_act.append( + dict( + action=action, + object=( + output_object.split("|")[0] + if output_object is not None + else None + ), + ) ) + obj_acs.append(obj_index) + + subgoal_last_frame_video = env.obs + video_frames.append(subgoal_last_frame_video) + (*_,) = get_action_from_agent( + model, + feat, + env.vocab, + env.vocab_obj, + env, + deterministic=deterministic, + epsilon=epsilon, + ret_value=False, + ) + obs.append(ob.cpu().detach().squeeze(1)) # last next obs + value_output = None + if value_output != None: + value_output = value_output.squeeze().cpu().detach().numpy() + value_predict.append(value_output) + + if log_video: + value_font = ImageFont.truetype("FreeMono.ttf", 20) + action_font = ImageFont.truetype("FreeMono.ttf", 14) + gif_logs = [] + for frame_number in range(len(video_frames)): + img = video_frames[frame_number] + img = Image.fromarray(img) + draw = ImageDraw.Draw(img) + if len(value_predict) != 0: + value_log = value_predict[frame_number] + # draw.text( + # (1, 280), + # "Value: %.3f" % (value_log), + # fill=(255, 255, 255), + # font=value_font, + # ) + if frame_number != 0: + reward_log = env_rewards[frame_number - 1] + # draw.text( + # (1, 260), + # "Reward: %.1f" % (reward_log), + # fill=(255, 255, 
255), + # font=value_font, + # ) + return_log = sum(env_rewards[0:frame_number]) + # draw.text( + # (150, 260), + # "Return: %.1f" % (return_log), + # fill=(255, 255, 255), + # font=value_font, + # ) + if frame_number != len(video_frames) - 1: + action_log, object_log = ( + str_act[frame_number]["action"], + str_act[frame_number]["object"], + ) + # draw.text( + # (1, 1), + # f"Action: {action_log}\nObject: {str(object_log)}", + # fill=(255, 255, 255), + # font=action_font, + # ) + + log_images = np.array(img) + gif_logs.append(log_images) + + video_frames = np.asarray(gif_logs) + video_frames = np.transpose(video_frames, (0, 3, 1, 2)) + + rewards = torch.tensor(env_rewards, dtype=torch.float) + dones = torch.zeros(len(rewards)) + dones[-1] = done + vid_caption = f"{chained_subgoal_instr[0] if isinstance(chained_subgoal_instr, list) else chained_subgoal_instr}: {'SUCCESS' if done else 'FAIL'}. Return: {rewards.sum()}/{env.num_subgoals_to_complete}." + ground_truth_sequence = " ".join(primitive_skills_to_choose_from) + return dict( + completed_skills=" ".join(completed_skills), + predicted_skills=" ".join(predicted_skills), + ground_truth_sequence=ground_truth_sequence, + high_level_skill=chained_subgoal_instr, + rews=rewards, + dones=dones, + video_frames=video_frames if log_video else None, + video_caption=vid_caption, + chained_language_instruction=process_annotation( + chained_subgoal_instr, model.vocab_word, train=False + ).long(), + skill_length=env.num_subgoals_to_complete, + ) def run_policy_multi_process( ret_queue, task_queue, offline_rl_model, + saycan_planner, resnet, device, - subgoal_pool, max_skill_length, - goal_states, + task_suite, + eval_json_path, + specific_task, ): - env = ThorEnv() + env = ALFREDRLEnv( + DATA_PATH, + REWARD_CONFIG_PATH, + task_suite, + eval_json_path, + vocab_path, + vocab_obj_path, + specific_task, + ) + num_eval_tasks = env.num_tasks + # put the number of tasks into the return queue to tell the calling script thing how many rollouts to perform for evaluation + ret_queue.put(num_eval_tasks) while True: task_args = task_queue.get() if task_args is None: break - ret_queue.put( - run_policy( - env, - offline_rl_model, - resnet, - device, - subgoal_pool, - max_skill_length, - goal_states, - *task_args, + with torch.no_grad(): + ret_queue.put( + run_policy( + env, + offline_rl_model, + saycan_planner, + resnet, + device, + max_skill_length, + *task_args, + ) ) - ) env.stop() diff --git a/sprint/saycan_eval.py b/sprint/saycan_eval.py index 4898cc8..b11c6bc 100644 --- a/sprint/saycan_eval.py +++ b/sprint/saycan_eval.py @@ -40,65 +40,11 @@ WANDB_PROJECT_NAME = "p-bootstrap-llm" -class ETLanguageEncoder: - def __init__(self, vocab_word): - self.vocab_word = vocab_word - - def encode(self, annotations, convert_to_tensor=False): - if convert_to_tensor: - return pad_sequence( - [process_annotation(a, self.vocab_word).long() for a in annotations], - batch_first=True, - padding_value=0, - ) - return [process_annotation(a, self.vocab_word).long() for a in annotations] - - -def setup_mp( - result_queue, - task_queue, - saycan_planner, - sentence_embedder, - agent_model, - resnet, - config, - device, - subgoal_pool, - goal_states, -): - num_workers = config.num_rollout_workers - workers = [] - # start workers - worker_target = run_policy_multi_process - for _ in range(num_workers): - worker = threading.Thread( - target=worker_target, - args=( - result_queue, - task_queue, - saycan_planner, - sentence_embedder, - agent_model, - resnet, - device, - subgoal_pool, - 
config.max_skill_length, - goal_states, - ), - ) - worker.daemon = True # kills thread/process when parent thread terminates - worker.start() - time.sleep(0.5) - workers.append(worker) - return workers - - def setup_mp( result_queue, task_queue, agent_model, saycan_planner, - sentence_encoder, resnet, config, device, @@ -116,17 +62,20 @@ def setup_mp( task_queue, agent_model, saycan_planner, - sentence_encoder, resnet, - config, device, + config.max_skill_length, + config.env_type, + config.eval_json, + config.specific_task, ), ) worker.daemon = True # kills thread/process when parent thread terminates worker.start() time.sleep(0.5) workers.append(worker) - return workers + num_tasks = result_queue.get() + return workers, num_tasks def multithread_dataset_aggregation( @@ -145,7 +94,7 @@ def multithread_dataset_aggregation( num_env_samples = 0 num_finished_tasks = 0 num_rollouts = ( - config.num_subgoals_in_pool if eval else config.num_rollouts_per_epoch + config.num_eval_tasks if eval else config.num_rollouts_per_epoch ) with tqdm(total=num_rollouts) as pbar: while num_finished_tasks < num_rollouts: @@ -165,7 +114,7 @@ def multithread_dataset_aggregation( extra_info["ground_truth_sequence"].append( result["ground_truth_sequence"] ) - num_env_samples += result["obs"].shape[0] + num_env_samples += result["rews"].shape[0] num_finished_tasks += 1 pbar.update(1) if eval: @@ -185,32 +134,29 @@ def multiprocess_rollout( result_queue, config, epsilon, - dataset, - eval, make_video, - dataset_agg_func, ): rollout_returns = [] subgoal_successes = [] rollout_gifs = [] video_captions = [] extra_info = defaultdict(list) - num_rollouts = config.num_subgoals_in_pool if eval else config.num_rollouts_per_epoch + + num_rollouts = ( + config.num_eval_tasks if eval else config.num_rollouts_per_epoch + ) + # create tasks for MP Queue # create tasks for thread/process Queue args_func = lambda subgoal: ( - config.env_type, - config.num_subgoals_in_pool, - True if eval else config.deterministic_action, - True if eval else False, # log to video + True, + True, epsilon, - subgoal if eval else None, + subgoal ) - for i in range(num_rollouts): - eval_list = eval_skill_info_list - eval_skill_index = i - task_queue.put(args_func(eval_skill_index, eval_list)) + for subgoal in range(num_rollouts): + task_queue.put(args_func(subgoal)) num_env_samples_list = [] # use list for thread safety multithread_dataset_aggregation( @@ -223,10 +169,17 @@ def multiprocess_rollout( None, config, num_env_samples_list, - eval, + True, ) num_env_samples = num_env_samples_list[0] + # aggregate metrics + rollout_metrics = dict( + average_return=np.mean(rollout_returns), + subgoal_success=np.mean(subgoal_successes), + ) + for key, value in extra_info.items(): + rollout_metrics[key] = np.mean(value) # make a WandB table for the high level skill, ground truth sequence, predicted, completed skills saycan_completed_skill_data = [] keys = [ @@ -273,13 +226,25 @@ def multiprocess_rollout( data=saycan_completed_skill_data, ) rollout_metrics["evaluation_table"] = table - renamed_metrics = {} - for key in rollout_metrics: - renamed_metrics[f"{rollout_mode}_{key}"] = rollout_metrics[key] - return renamed_metrics, num_env_samples + return rollout_metrics, num_env_samples def main(config): + config.eval_json = ( + f"{os.environ['SPRINT']}/sprint/rollouts/{config.env_type}_ann_human.json" + ) + seed = config.seed + torch.manual_seed(seed) + np.random.seed(seed) + random.seed(seed) + run = wandb.init( + resume=config.experiment_name, + 
project=WANDB_PROJECT_NAME, + entity=WANDB_ENTITY_NAME, + notes=config.notes, + config=config, + group=config.run_group, + ) model_checkpoint_dir = config.model_checkpoint_dir print(model_checkpoint_dir) list_of_checkpoints, list_of_epochs = get_list_of_checkpoints(model_checkpoint_dir) @@ -298,25 +263,6 @@ def main(config): ): vars(config)[key] = vars(old_config)[key] - run = wandb.init( - # resume=config.experiment_name, # "allow", - # name=config.experiment_name, - id=config.experiment_name, - name=config.experiment_name, - project=WANDB_PROJECT_NAME, - entity=WANDB_ENTITY_NAME, - notes=config.notes, - config=config, - group=config.run_group, - ) - seed = config.seed - if seed is not None: - torch.manual_seed(seed) - np.random.seed(seed) - torch.backends.cudnn.deterministic = False - random.seed(seed) - torch.backends.cudnn.benchmark = False # uses a lot of gpu memory if True - os.makedirs( os.path.join( config.save_dir, @@ -329,9 +275,12 @@ def main(config): exist_ok=True, ) + if vars(config)["model"] == "sprint": + agent_model = SPRINTETIQLModel(config) + else: + raise ValueError(f"Model {config.model} not supported for SayCan") + device = torch.device(config.gpus[0]) - agent_model = SPRINTETIQLModel(config) - goal_states = None if len(config.gpus) > 1: print(f"-----Using {len(config.gpus)} GPUs-----") agent_model = nn.DataParallel( @@ -342,13 +291,10 @@ def main(config): agent_model.load_from_checkpoint(checkpoint) - # print(agent_model) - resnet_args = AttrDict() resnet_args.visual_model = "resnet18" resnet_args.gpu = config.gpus[0] resnet = Resnet(resnet_args, eval=True, use_conv_feat=True) - sentence_encoder = ETLanguageEncoder(agent_model.vocab_word) saycan_planner = SaycanPlanner(config) @@ -356,128 +302,16 @@ def main(config): task_queue = queue.SimpleQueue() result_queue = queue.SimpleQueue() - with open(config.eval_json, "r") as f: - eval_skill_info_list = json.load(f) - - # sort skill info list by num_primitive_skills, descending, for faster evaluation with multiple threads - # eval_skill_info_list.sort( - # key=lambda x: sum( - # [ - # len(primitive_skill["api_action"]) - # for primitive_skill in x["primitive_skills"] - # ] - # ), - # reverse=True, - # ) - eval_skill_info_list.sort(key=lambda x: len(x["primitive_skills"]), reverse=True) - with open(f"scene_sampling/{config.scene_type}_scene.json", "r") as f: - all_scenes_json = json.load(f) - floorplan_set = set() - all_floorplans = [ - all_scenes_json[x["primitive_skills"][0]["scene_index"]]["scene_num"] - for x in eval_skill_info_list - ] - unique_floorplans = [ - x for x in all_floorplans if not (x in floorplan_set or floorplan_set.add(x)) - ] - - if config.eval_per_task_in_json: - eval_skill_info_list = [ - x for x in eval_skill_info_list if len(x["primitive_skills"]) - ] - - sorted_task_names = [ - dict( - task=task["task"], - starting_subgoal_id=min(task["subgoal_ids"]), - repeat_id=task["repeat_id"], - ) - for task in eval_skill_info_list - ] - else: - if config.specific_task is not None: - eval_skill_info_list = eval_skill_info_list[ - config.specific_task : config.specific_task + 1 - ] - # unique so we can select a specific env - sorted_task_names = [ - count[0] - for count in Counter( - [skill_info["task"] for skill_info in eval_skill_info_list] - ).most_common() - ] - - eval_skill_info_list = [ - skill_info - for skill_info in eval_skill_info_list - if skill_info["task"] in sorted_task_names - ] - - config.num_subgoals_in_pool = len(sorted_task_names) - # step by step evaluation skill info list - 
primitive_eval_skill_info_list = make_primitive_annotation_eval_dataset( - eval_skill_info_list - ) - print( - f"Evaluating on {len(sorted_task_names)} tasks. Total {len(eval_skill_info_list)} skills" - ) - - primitive_skills_to_use = [] - task_lengths = None - if config.eval_per_task_in_json: - task_lengths = [] - for task in primitive_eval_skill_info_list: - primitive_skills_to_use.append( - generate_primitive_skill_list_from_eval_skill_info_list([task]) - ) - task_lengths.append(len(task["primitive_skills"])) - else: - primitive_skills_to_use = [ - generate_primitive_skill_list_from_eval_skill_info_list( - primitive_eval_skill_info_list - ) - ] - if not config.use_only_task_primitive_skills: - # aggregate all primitive skills if they have the same floorplan - floorplan_per_task = [] - for task in eval_skill_info_list: - floorplan_per_task.append( - all_scenes_json[task["primitive_skills"][0]["scene_index"]]["scene_num"] - ) - aggregated_tasks = {} - for task, fp in zip(eval_skill_info_list, floorplan_per_task): - if fp not in aggregated_tasks: - aggregated_tasks[fp] = copy.deepcopy(task) - else: - aggregated_tasks[fp]["primitive_skills"].extend( - task["primitive_skills"] - ) - # now separate by floorplan - all_primitive_skills_per_floorplan = { - fp: generate_primitive_skill_list_from_eval_skill_info_list( - [aggregated_tasks[fp]] - ) - for fp in aggregated_tasks - } - # now add the primitive skills for each task - primitive_skills_to_use = [] - for fp in floorplan_per_task: - primitive_skills_to_use.append(all_primitive_skills_per_floorplan[fp]) - print( - f"Evaluating on {len(sorted_task_names)} tasks. Total {[len(primitive_skills_to_use[i]) for i in range(len(primitive_skills_to_use))]} skills" - ) - processes = setup_mp( + processes, num_eval_tasks = setup_mp( result_queue, task_queue, agent_model, saycan_planner, - sentence_encoder, resnet, config, device, - eval_skill_info_list, - goal_states, ) + config.num_eval_tasks = num_eval_tasks def signal_handler(sig, frame): print("SIGINT received. Exiting...closing all processes first") @@ -491,8 +325,7 @@ def signal_handler(sig, frame): result_queue, config, 0, - rollout_mode="fixed_eval", - eval_skill_info_list=eval_skill_info_list, + True, ) wandb.log( eval_metrics, @@ -519,7 +352,7 @@ def signal_handler(sig, frame): help="which gpus. pass in as comma separated string to use DataParallel on multiple GPUs", ) parser.add_argument( - "--seed", type=int, default=None, help="random seed for initialization" + "--seed", type=int, default=42, help="random seed for initialization" ) parser.add_argument( "--num_rollout_workers", @@ -550,17 +383,11 @@ def signal_handler(sig, frame): help="group to run the experiment in. If None, no group is used", ) parser.add_argument( - "--scene_type", - type=str, - default="valid_seen", - choices=["train", "valid_seen", "valid_unseen"], - help="which type of scenes to sample from/evaluate on", - ) - parser.add_argument( - "--eval_json", + "--env_type", type=str, - default="scene_sampling/bootstrap_valid_seen-40_ann_human.json", - help="path to the json file containing the evaluation scenes and skills", + required=True, + choices=["eval_instruct", "eval_length", "eval_scene"], + help="alfred environment to use", ) parser.add_argument( "--use_amp", @@ -570,31 +397,14 @@ def signal_handler(sig, frame): nargs="?", help="whether to use automatic mixed precision. 
set default to false to disable nans during online training.", ) - parser.add_argument( - "--eval_per_task_in_json", - type=str2bool, - default=True, - const=True, - nargs="?", - help="whether to evaluate each task in the json file separately.", - ) - parser.add_argument( - "--use_only_task_primitive_skills", - type=str2bool, - default=False, - const=True, - nargs="?", - help="whether to use only the given eval primitive skills for chaining", - ) # LLM arguments parser.add_argument( "--llm_model", type=str, - # default="decapoda-research/llama-13b-hf", - default="decapoda-research/llama-7B-hf", + default="huggyllama/llama-7b", help="which model to use for the large language model. ", choices=[ - "None", + "facebook/opt-125m", + "huggyllama/llama-7b", "decapoda-research/llama-13b-hf", "decapoda-research/llama-7B-hf", ],
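To summarize the new rollout-worker protocol introduced in `run_policy_multi_process` and `setup_mp` above: each worker builds its own `ALFREDRLEnv`, reports the number of evaluation tasks on the result queue (so the caller knows how many rollouts to expect), and then serves rollout requests from the task queue until it receives a `None` sentinel. Below is a minimal, self-contained sketch of just that handshake; it is illustrative only, and the stand-in worker echoes fake results instead of calling `run_policy`:

```python
# Illustrative sketch (not part of the patch) of the worker/queue handshake used by
# setup_mp and run_policy_multi_process: each worker first reports how many evaluation
# tasks its environment exposes, then serves rollout requests until it sees None.
import queue
import threading


def rollout_worker(ret_queue, task_queue, num_tasks):
    # The real worker builds an ALFREDRLEnv here; num_tasks stands in for env.num_tasks.
    ret_queue.put(num_tasks)  # tell the main thread how many rollouts to run
    while True:
        task_args = task_queue.get()
        if task_args is None:  # sentinel: shut the worker down
            break
        # The real worker calls run_policy(...); here we just echo a fake result.
        ret_queue.put({"task": task_args, "success": True})


def main():
    ret_queue, task_queue = queue.SimpleQueue(), queue.SimpleQueue()
    worker = threading.Thread(
        target=rollout_worker, args=(ret_queue, task_queue, 3), daemon=True
    )
    worker.start()

    num_eval_tasks = ret_queue.get()  # mirrors setup_mp reading num_tasks first
    for task_id in range(num_eval_tasks):
        task_queue.put(task_id)
    for _ in range(num_eval_tasks):
        print(ret_queue.get())

    task_queue.put(None)
    worker.join()


if __name__ == "__main__":
    main()
```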
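Similarly, the `threading.Lock` added to `sprint/datasets/large_language_model.py` serializes concurrent forward passes when several rollout threads query the shared LLM, keeping the effective batch size (and GPU memory use) bounded. A stripped-down sketch of that pattern, with a toy class standing in for the real Hugging Face model wrapper:

```python
# Stripped-down sketch of the lock-guarded LLM call added in large_language_model.py.
# ToyLLM is a stand-in for the real model wrapper; the point is only that the lock
# ensures at most one batch is being scored at a time across threads.
import threading
import time


class ToyLLM:
    def __init__(self):
        self.lock = threading.Lock()

    def _forward(self, batch):
        time.sleep(0.01)  # pretend this is an expensive GPU forward pass
        return [len(text) for text in batch]

    def score(self, batch):
        # even if multiple threads call at once, only one batch runs at a time
        with self.lock:
            return self._forward(batch)


llm = ToyLLM()
threads = [
    threading.Thread(target=lambda i=i: print(llm.score([f"skill {i}"])))
    for i in range(4)
]
for t in threads:
    t.start()
for t in threads:
    t.join()
```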