diff --git a/.gitignore b/.gitignore
index e43b0f9..73d4ca5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,161 @@
 .DS_Store
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
diff --git a/README.md b/README.md
index 3e8df5d..17a4884 100644
--- a/README.md
+++ b/README.md
@@ -59,10 +59,11 @@ export SPRINT=[SPRINT_DOWNLOAD_LOCATION]
 ```
 
 ## 2. Downloading the data
 
+### 2.1 Model Training Data
 You need to pre-train models to run zero-shot or finetuning experiments. If you don't want to pre-train a model yourself, you can skip to step 3 as you don't need the pre-training dataset file.
 
-We have two options for obtaining the ALFRED dataset---either download the data from here: [Google Drive Link](https://drive.google.com/file/d/1ZgKDgG9Fv491GVb9rxIVNJpViPNKFWMF) or set it up yourself with the instructions in the [README in the `datasets` folder](datasets/README.md).
+Download the ALFRED dataset here: [Google Drive Link](https://drive.google.com/file/d/1ZgKDgG9Fv491GVb9rxIVNJpViPNKFWMF).
 
 You can use [Gdown](https://github.com/wkentaro/gdown) to directly download the dataset to your server/computer at the desired location (18GB download):
 ```
@@ -78,6 +79,15 @@ Once the dataset is downloaded (`px_llama_13b.tar.gz`) simply untar it (36GB after untarring):
 ```
 tar -xvzf px_llama_13b.tar.gz
 ```
+### 2.2 ALFRED Evaluation Data
+To run evals and fine-tuning experiments, you must extract the ALFRED evaluation data we have processed ([Google Drive Link](https://drive.google.com/file/d/1MHDrKSRmyag-DwipyLj-i-BbKU_dxbne/view)):
+
+```
+cd [SPRINT_REPO_LOCATION]
+cd sprint/alfred/data
+gdown 1MHDrKSRmyag-DwipyLj-i-BbKU_dxbne
+tar -xvzf json_2.1.0_merge_goto.tar.gz
+```
 
 ## 3. Setting up WandB
 We log using WandB. First create a wandb account if you don't already have one [here](https://wandb.ai).
@@ -87,7 +97,7 @@ Finally, fill in `WANDB_ENTITY_NAME, WANDB_PROJECT_NAME` in the file `utils/wand
 
 ## 4. Pre-training a Model
 
-You can either pre-train a model yourself or download a pre-trained checkpoint. Pre-trained model checkpoints can be found here: [Google Drive Link](https://drive.google.com/file/d/1PDNX7Z1BBoB3pmeBTfOgNxe2I53kUoS0).
+You can either pre-train a model yourself or download a pre-trained checkpoint. Pre-trained model checkpoints can be found here: [Google Drive Link](https://drive.google.com/file/d/1PDNX7Z1BBoB3pmeBTfOgNxe2I53kUoS0/view).
 
 Otherwise, run the following command from the base SPRINT repo location to train our model, SPRINT:
 
@@ -150,8 +160,10 @@ Checkpoints are saved in `sprint_saved_rl_models/`
 
 To run SayCan zero-shot evals, pre-train the L-BC baseline above and then:
 
 ```
-TODO: coming soon
+python sprint/saycan_eval.py --model_checkpoint_dir [L-BC PATH] --env_type {eval_instruct, eval_length, eval_scene} --run_group [RUN_GROUP] --experiment_name [EXP_NAME] --llm_gpus [GPU]
 ```
+The optional `llm_gpus` flag takes a comma-separated list of GPU IDs on which to place the LLM, since it may be too large to fit on the same GPU as the model being evaluated.
+
 The 13b llama model we used is no longer available on huggingface, so to fully reproduce this you should follow the llama instructions to download LLaMA-13B. Right now this script defaults to LLaMA-7B, but the empirically the performance is very similar.
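For concreteness, the SayCan eval command introduced in the README hunk above might be invoked as follows; the checkpoint directory, run group, and experiment name are illustrative placeholders (not paths shipped with the repo), and `--llm_gpus` is given a single GPU ID here but, per the README text above, also accepts a comma-separated list:

```
python sprint/saycan_eval.py \
    --model_checkpoint_dir sprint_saved_rl_models/l_bc_baseline \
    --env_type eval_instruct \
    --run_group saycan_zero_shot \
    --experiment_name saycan_eval_instruct_llama7b \
    --llm_gpus 1
```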
diff --git a/sprint/datasets/large_language_model.py b/sprint/datasets/large_language_model.py index 7cf21ac..a8c2231 100644 --- a/sprint/datasets/large_language_model.py +++ b/sprint/datasets/large_language_model.py @@ -2,6 +2,7 @@ from tqdm import tqdm from transformers import AutoModelForCausalLM, AutoTokenizer import torch +from threading import Lock from sprint.utils.utils import process_skill_strings @@ -48,8 +49,8 @@ class LargeLanguageModel: def __init__(self, config): assert ( - "opt" in config.llm_model or "gpt" in config.llm_model - ), "No tokenizer support for non-gpt/opt models" + "opt" in config.llm_model or "gpt" in config.llm_model or "llama" in config.llm_model + ), "No tokenizer support for non-gpt/opt/llama models" self.config = config self.llm_gpus = config.llm_gpus self.llm_max_new_tokens = config.llm_max_new_tokens @@ -76,6 +77,7 @@ def __init__(self, config): pad_token_id=self.tokenizer.eos_token_id, device_map="auto", ) + self.lock = Lock() self.next_skill_top_p = 0.9 self.next_skill_temp = 0.8 self.ret_tensor_type = "pt" @@ -320,6 +322,7 @@ def _get_non_generated_logprobs_hf( ): second_skill_start_pos = second_skill_attn_mask.sum(-1) with torch.no_grad(): + # so that even if multiple threads are using it at once, our maximum batch size won't be exceeded with self.lock: logits = ( self.model( diff --git a/sprint/datasets/llm_aggregate_dataset.py b/sprint/datasets/llm_aggregate_dataset.py index 1dc2e9e..9a894dd 100644 --- a/sprint/datasets/llm_aggregate_dataset.py +++ b/sprint/datasets/llm_aggregate_dataset.py @@ -81,7 +81,7 @@ def main(): "--llm_model", type=str, default="decapoda-resaerch/llama-13b-hf", - help="which model to use for the large language model. For optimal performance, use GPT-J-6B or bigger. For speed and decent performance, opt-2.7b is fine.", + help="which model to use for the large language model.", choices=[ "facebook/opt-125m", "facebook/opt-350m", diff --git a/sprint/eval.py b/sprint/eval.py index 43ecc45..ea74c38 100644 --- a/sprint/eval.py +++ b/sprint/eval.py @@ -217,7 +217,6 @@ def main(config): ) model_checkpoint_dir = config.model_checkpoint_dir print(model_checkpoint_dir) - config.use_llm = False # set use llm to false so it doesn't load the LLM list_of_checkpoints, list_of_epochs = get_list_of_checkpoints(model_checkpoint_dir) # load one of the checkpoints' configs diff --git a/sprint/rollouts/rollout.py b/sprint/rollouts/rollout.py index a717062..71805a8 100644 --- a/sprint/rollouts/rollout.py +++ b/sprint/rollouts/rollout.py @@ -11,12 +11,9 @@ ) from sprint.utils.utils import ( get_action_from_agent, - AttrDict, load_object_class, - process_skill_strings, ) -# import random path = "."
import sys @@ -34,15 +31,7 @@ DATA_PATH = ( f"{os.environ['SPRINT']}/sprint/alfred/data/json_2.1.0_merge_goto/preprocess" ) -# VISUAL_MODEL = "resnet18" REWARD_CONFIG_PATH = f"{os.environ['SPRINT']}/sprint/alfred/models/config/rewards.json" -# DEFAULT_NUM_STEPS = 30 -# EVAL_STEP_RATIO = 2 -# TRAIN_STEP_RATIO = 2 - -# global_task_args = AttrDict() -# global_task_args.reward_config = REWARD_CONFIG -# global_task_args.visual_model = VISUAL_MODEL def run_policy( diff --git a/sprint/rollouts/saycan_rollout.py b/sprint/rollouts/saycan_rollout.py index 25fb4e3..3c12154 100644 --- a/sprint/rollouts/saycan_rollout.py +++ b/sprint/rollouts/saycan_rollout.py @@ -1,55 +1,47 @@ import numpy as np from sprint.alfred.env.thor_env import ThorEnv -from sprint.rollouts.rollout import sample_task, setup_scene import torch from sprint.models.saycan_llm import SaycanPlanner +from sprint.rollouts.gym_env import ALFREDRLEnv +from sprint.rollouts.rollout import DATA_PATH, REWARD_CONFIG_PATH, vocab_path, vocab_obj_path import revtok from sprint.utils.data_utils import ( remove_spaces_and_lower, + pad_sequence, numericalize, process_annotation, ) import os import sys -from utils import generate_video -path = "." -sys.path.append(os.path.join(path)) -sys.path.append(os.path.join(path, "gen")) -sys.path.append(os.path.join(path, "models")) -sys.path.append(os.path.join(path, "models", "eval")) from sprint.utils.utils import ( load_object_class, - AttrDict, get_action_from_agent, process_skill_strings, ) - +path = "." +import sys +sys.path.append(os.path.join(path)) +sys.path.append(os.path.join(path, "gen")) +sys.path.append(os.path.join(path, "models")) +sys.path.append(os.path.join(path, "models", "eval")) from PIL import Image from PIL import ImageFont from PIL import ImageDraw -vocab = torch.load(f"{os.environ['SPRINT']}/sprint/models/low_level_actions.vocab") -vocab_obj = torch.load(f"{os.environ['SPRINT']}/sprint/models/obj_cls.vocab") - -DATA_PATH = ( - f"{os.environ['SPRINT']}/sprint/alfred/data/json_2.1.0_merge_goto/preprocess" -) -VISUAL_MODEL = "resnet18" -REWARD_CONFIG = f"{os.environ['SPRINT']}/sprint/alfred/models/config/rewards.json" -DEFAULT_NUM_STEPS = 30 -EVAL_STEP_RATIO = 2 -TRAIN_STEP_RATIO = 2 - -global_task_args = AttrDict() -global_task_args.reward_config = REWARD_CONFIG -global_task_args.visual_model = VISUAL_MODEL +def encode(annotations, vocab_word, convert_to_tensor=False): + if convert_to_tensor: + return pad_sequence( + [process_annotation(a, vocab_word).long() for a in annotations], + batch_first=True, + padding_value=0, + ) + return [process_annotation(a, vocab_word).long() for a in annotations] def get_next_skill_from_saycan( model, saycan_planner: SaycanPlanner, - sentence_embedder, high_level_skill: str, primitive_skill_annotations: list[str], already_completed_skills: list[str], @@ -62,8 +54,8 @@ def get_next_skill_from_saycan( [high_level_skill], ) # get value logprobs - primitive_embeddings = sentence_embedder.encode( - primitive_skill_annotations, convert_to_tensor=True + primitive_embeddings = encode( + primitive_skill_annotations, model.vocab_word, convert_to_tensor=True ) values = [] for primitive_embedding in primitive_embeddings: @@ -78,8 +70,9 @@ def get_next_skill_from_saycan( combined_affordance_probs = llm_probs * values # now take the argmax next_skill_idx = torch.argmax(combined_affordance_probs).item() - feat["language_ann"] = sentence_embedder.encode( + feat["language_ann"] = encode( primitive_skill_annotations[next_skill_idx : next_skill_idx + 1], + 
model.vocab_word, convert_to_tensor=True, ).to( device @@ -88,393 +81,279 @@ def get_next_skill_from_saycan( def run_policy( - env, + env: ALFREDRLEnv, model, - saycan_planner, - sentence_embedder, + saycan_planner: SaycanPlanner, visual_preprocessor, device, - subgoal_pool, max_skill_length, - goal_states, - eval_split, - num_subgoals_in_pool, deterministic, log_video, epsilon, selected_specific_subgoal=None, - eval=True, ): # actually does the rollout. This function is a bit tricky to integrate with ALFRED's required code. model.eval() - if eval_split == "eval_instruct": - json_path_name = "train" - elif eval_split == "eval_length": - json_path_name = "train" - else: - json_path_name = "valid_unseen" - with torch.no_grad(): - eval_idx, traj_data, r_idx, actually_selected_subgoal = sample_task( - json_path_name, - subgoal_pool, - num_subgoals_in_pool, - selected_specific_subgoal, - ) - num_subgoals_to_complete = len( - subgoal_pool[actually_selected_subgoal]["primitive_skills"] - ) - num_primitive_steps_in_task = sum( - [ - len(primitive_skill["api_action"]) - for primitive_skill in subgoal_pool[actually_selected_subgoal][ - "primitive_skills" - ] + ob, info = env.reset(selected_specific_subgoal) + # build the features to give as input to the actual model + feat = {} + # initialize frames and action buffers for the transformer + ob = visual_preprocessor.featurize([Image.fromarray(ob)], batch=1) + feat["frames_buffer"] = ob.unsqueeze(0).to(device) + feat["action_traj"] = torch.zeros(1, 0).long().to(device) + feat["object_traj"] = torch.zeros(1, 0).long().to(device) + + chained_subgoal_instr = info.lang_instruction + actually_selected_subgoal = info.task + # get all primitive skills from this env + subgoal_pool = env.subgoal_pool + # subgoal info + primitive_skills_to_choose_from = process_skill_strings( + [ + subgoal_pool[actually_selected_subgoal]["primitive_skills"][0][ + "annotations" ] - ) - if eval: - MAX_STEPS = num_primitive_steps_in_task * EVAL_STEP_RATIO - else: - MAX_STEPS = num_primitive_steps_in_task * TRAIN_STEP_RATIO - - setup_scene(env, traj_data, r_idx, global_task_args) - - expert_init_actions = [ - a["discrete_action"] - for a in traj_data["plan"]["low_actions"] - if a["high_idx"] < eval_idx ] - all_instructions = process_skill_strings( - [ - subgoal_pool[actually_selected_subgoal]["primitive_skills"][0][ - "annotations" - ] - ] - + [subgoal_pool[actually_selected_subgoal]["annotation"]] - ) - primitive_skills = all_instructions[:-1] - # subgoal info - chained_subgoal_instr = all_instructions[-1] - - ann_l = revtok.tokenize(remove_spaces_and_lower(chained_subgoal_instr)) - ann_l = [w.strip().lower() for w in ann_l] - ann_token = numericalize(model.vocab_word, ann_l, train=False) - ann_token = torch.tensor(ann_token).long() - feat = {} - feat["language_ann"] = ann_token.to(device).unsqueeze(0) - if goal_states is not None: - task_name = subgoal_pool[actually_selected_subgoal]["task"] - subgoal_str = list() - for p in range(eval_idx, (eval_idx + num_subgoals_to_complete)): - subgoal_str.append(str(p)) - trial_name = task_name + "-" + "_".join(subgoal_str) - - task_index = goal_states[json_path_name]["trial_name"].index(trial_name) - trial_goal = goal_states[json_path_name]["goal_state"][task_index] - feat["state_goal"] = trial_goal.reshape([1, 5, 512, 7, 7]).to(device) - - completed_eval_idx = eval_idx + num_subgoals_to_complete - 1 - done = 0 - t = 0 - - obs = [] - acs = [] - obj_acs = [] - dones = [] - env_rewards = [] - str_act = [] - video_frames = [] - completed_skills 
= [] - predicted_skills = [] - value_predict = [] - while not done: - # break if max_steps reached - if t >= MAX_STEPS + len(expert_init_actions): - break - - if (len(expert_init_actions) == 0 and t == 0) or ( - len(expert_init_actions) != 0 and t == len(expert_init_actions) - ): - curr_image = Image.fromarray(np.uint8(env.last_event.frame)) - - curr_frame = ( - visual_preprocessor.featurize([curr_image], batch=1) - .unsqueeze(0) - .to(device) - ) - feat["frames_buffer"] = curr_frame.to(device) - feat["action_traj"] = torch.zeros(1, 0).long().to(device) - feat["object_traj"] = torch.zeros(1, 0).long().to(device) - # get first saycan planner action - saycan_selected_next_skill = get_next_skill_from_saycan( - model, - saycan_planner, - sentence_embedder, - chained_subgoal_instr, - primitive_skills, - completed_skills, - feat, - device, - ) - predicted_skills.append(saycan_selected_next_skill) - - if t < len(expert_init_actions): - # get expert action - - action = expert_init_actions[t] - # print("expert_action", action) - compressed_mask = ( - action["args"]["mask"] if "mask" in action["args"] else None - ) - mask = ( - env.decompress_mask(compressed_mask) - if compressed_mask is not None - else None - ) - - success, _, _, err, _ = env.va_interact( - action["action"], interact_mask=mask, smooth_nav=True, debug=False - ) - if not success: - print("expert initialization failed, re-sampling") - return run_policy( - env, - model, - visual_preprocessor, - device, - subgoal_pool, - max_skill_length, - eval_split, - num_subgoals_in_pool, - deterministic, - log_video, - epsilon, - selected_specific_subgoal, - eval, - ) - _, _ = env.get_transition_reward() - else: - obs.append(curr_frame.cpu().detach().squeeze(1)) - video_frames.append(np.uint8(env.last_event.frame)) - - (action, output_object, _) = get_action_from_agent( - model, - feat, - vocab, - vocab_obj, - env, - deterministic=deterministic, - epsilon=epsilon, - ret_value=False, - ) - value_output = None - if value_output != None: - value_output = value_output.squeeze().cpu().detach().numpy() - value_predict.append(value_output) - try: - _, _ = env.to_thor_api_exec(action, output_object, smooth_nav=True) - except Exception as e: - # if there's an exception from running, then we'll just try again. 
- # print(e) - # record this exception in exceptions.txt - with open("exceptions.txt", "a") as f: - f.write(str(e) + "\n") - - next_frame = np.uint8(env.last_event.frame) - next_frame = Image.fromarray(next_frame) - next_frame = ( - visual_preprocessor.featurize([next_frame], batch=1) - .unsqueeze(0) - .to(device) - ) - curr_frame = next_frame - - feat["frames_buffer"] = torch.cat( - [feat["frames_buffer"], next_frame], dim=1 - ).to(device) - # - 2 because ET dataloader had a -1 for padding reasons on the action, and we did - 1 when processing ALFRED actions to get rid of - # the extraneous END action - tensor_action = torch.tensor( - vocab["action_low"].word2index(action) - 2 - ).to(device) - feat["action_traj"] = torch.cat( - [feat["action_traj"], tensor_action.unsqueeze(0).unsqueeze(0)], - dim=1, - ).to(device) - obj_index = load_object_class(vocab_obj, output_object) - feat["object_traj"] = torch.cat( - [ - feat["object_traj"], - torch.tensor(obj_index).unsqueeze(0).unsqueeze(0).to(device), - ], - dim=1, - ) - feat["frames_buffer"] = feat["frames_buffer"][:, -max_skill_length:] - feat["action_traj"] = feat["action_traj"][:, -max_skill_length + 1 :] - feat["object_traj"] = feat["object_traj"][:, -max_skill_length + 1 :] - - acs.append(tensor_action.cpu()) - str_act.append( - dict( - action=action, - object=( - output_object.split("|")[0] - if output_object is not None - else None - ), - ) - ) - obj_acs.append(obj_index) + ) - t_success = env.last_event.metadata["lastActionSuccess"] - if not t_success: - # env error logging just in case, but we'll just continue - # these aren't critical errors, just sim errors - err = env.last_event.metadata["errorMessage"] - exception_string = f"Failed to execute action {action}. Object: {output_object}. Error: {err}" - with open("exceptions.txt", "a") as f: - f.write(exception_string + "\n") + ann_l = revtok.tokenize(remove_spaces_and_lower(chained_subgoal_instr)) + ann_l = [w.strip().lower() for w in ann_l] + ann_token = numericalize(model.vocab_word, ann_l, train=False) + ann_token = torch.tensor(ann_token).long() + + obs = [] + acs = [] + obj_acs = [] + dones = [] + env_rewards = [] + str_act = [] + video_frames = [] + value_predict = [] + predicted_skills = [] + completed_skills = [] + done, timeout = False, False + # get first saycan planner action + saycan_selected_next_skill = get_next_skill_from_saycan( + model, + saycan_planner, + chained_subgoal_instr, + primitive_skills_to_choose_from, + completed_skills, + feat, + device, + ) + predicted_skills.append(saycan_selected_next_skill) - # MUST call get_transition_reward to update the environment - _, _ = env.get_transition_reward() - curr_subgoal_idx = env.get_subgoal_idx() - partial_success = 0 - if curr_subgoal_idx == completed_eval_idx: - done = 1 - partial_success = 1 - elif curr_subgoal_idx == eval_idx: - eval_idx += 1 - partial_success = 1 - completed_skills.append(saycan_selected_next_skill) - feat["frames_buffer"] = next_frame.unsqueeze(0).to(device) - feat["action_traj"] = torch.zeros(1, 0).long().to(device) - feat["object_traj"] = torch.zeros(1, 0).long().to(device) - saycan_selected_next_skill = get_next_skill_from_saycan( - model, - saycan_planner, - sentence_embedder, - chained_subgoal_instr, - primitive_skills, - completed_skills, - feat, - device, - ) - predicted_skills.append(saycan_selected_next_skill) - env_rewards.append(partial_success) + while not (done or timeout): + obs.append(ob.cpu().detach().squeeze(1)) + video_frames.append(env.obs) - t = t + 1 - 
subgoal_last_frame_video = np.uint8(env.last_event.frame) - video_frames.append(subgoal_last_frame_video) - (*_,) = get_action_from_agent( + (action, output_object, _) = get_action_from_agent( model, feat, - vocab, - vocab_obj, + env.vocab, + env.vocab_obj, env, deterministic=deterministic, epsilon=epsilon, ret_value=False, ) - obs.append(next_frame.cpu().detach().squeeze(1)) # next obs value_output = None if value_output != None: value_output = value_output.squeeze().cpu().detach().numpy() value_predict.append(value_output) - if log_video: - value_font = ImageFont.truetype("FreeMono.ttf", 20) - action_font = ImageFont.truetype("FreeMono.ttf", 14) - gif_logs = [] - for frame_number in range(len(video_frames)): - img = video_frames[frame_number] - img = Image.fromarray(img) - draw = ImageDraw.Draw(img) - if len(value_predict) != 0: - value_log = value_predict[frame_number] - # draw.text( - # (1, 280), - # "Value: %.3f" % (value_log), - # fill=(255, 255, 255), - # font=value_font, - # ) - if frame_number != 0: - reward_log = env_rewards[frame_number - 1] - # draw.text( - # (1, 260), - # "Reward: %.1f" % (reward_log), - # fill=(255, 255, 255), - # font=value_font, - # ) - return_log = sum(env_rewards[0:frame_number]) - # draw.text( - # (150, 260), - # "Return: %.1f" % (return_log), - # fill=(255, 255, 255), - # font=value_font, - # ) - if frame_number != len(video_frames) - 1: - action_log, object_log = ( - str_act[frame_number]["action"], - str_act[frame_number]["object"], - ) - # draw.text( - # (1, 1), - # f"Action: {action_log}\nObject: {str(object_log)}", - # fill=(255, 255, 255), - # font=action_font, - # ) - - log_images = np.array(img) - gif_logs.append(log_images) - - video_frames = np.asarray(gif_logs) - video_frames = np.transpose(video_frames, (0, 3, 1, 2)) - - rewards = torch.tensor(env_rewards, dtype=torch.float) - dones = torch.zeros(len(rewards)) - dones[-1] = done - vid_caption = f"{chained_subgoal_instr[0] if isinstance(chained_subgoal_instr, list) else chained_subgoal_instr}: {'SUCCESS' if done else 'FAIL'}. Return: {rewards.sum()}/{num_subgoals_to_complete}." 
- ground_truth_sequence = " ".join(primitive_skills) - return dict( - completed_skills=" ".join(completed_skills), - predicted_skills=" ".join(predicted_skills), - ground_truth_sequence=ground_truth_sequence, - high_level_skill=chained_subgoal_instr, - rews=rewards, - dones=dones, - video_frames=video_frames if log_video else None, - video_caption=vid_caption, - chained_language_instruction=process_annotation( - chained_subgoal_instr, model.vocab_word, train=False - ).long(), - skill_length=num_subgoals_to_complete, + action_dict = dict(action=action, object=output_object) + next_ob, rew, done, info = env.step(action_dict) + timeout = info.timeout + next_ob = Image.fromarray(next_ob) + next_ob = ( + visual_preprocessor.featurize([next_ob], batch=1) + .unsqueeze(0) + .to(device) + ) + ob = next_ob + feat["frames_buffer"] = torch.cat( + [feat["frames_buffer"], next_ob], dim=1 + ).to(device) + # - 2 because ET dataloader had a -1 for padding reasons on the action, and we did - 1 when processing ALFRED actions to get rid of + # the extraneous END action + tensor_action = torch.tensor( + env.vocab["action_low"].word2index(action) - 2 + ).to(device) + feat["action_traj"] = torch.cat( + [feat["action_traj"], tensor_action.unsqueeze(0).unsqueeze(0)], + dim=1, + ).to(device) + obj_index = load_object_class(env.vocab_obj, output_object) + feat["object_traj"] = torch.cat( + [ + feat["object_traj"], + torch.tensor(obj_index).unsqueeze(0).unsqueeze(0).to(device), + ], + dim=1, + ) + # move onto the next skill predicted by saycan + if rew == 1 and not done: + completed_skills.append(saycan_selected_next_skill) + feat["frames_buffer"] = next_ob.to(device) + feat["action_traj"] = torch.zeros(1, 0).long().to(device) + feat["object_traj"] = torch.zeros(1, 0).long().to(device) + saycan_selected_next_skill = get_next_skill_from_saycan( + model, + saycan_planner, + chained_subgoal_instr, + primitive_skills_to_choose_from, + completed_skills, + feat, + device, + ) + predicted_skills.append(saycan_selected_next_skill) + feat["frames_buffer"] = feat["frames_buffer"][:, -max_skill_length:] + feat["action_traj"] = feat["action_traj"][:, -max_skill_length + 1 :] + feat["object_traj"] = feat["object_traj"][:, -max_skill_length + 1 :] + + env_rewards.append(rew) + + acs.append(tensor_action.cpu()) + str_act.append( + dict( + action=action, + object=( + output_object.split("|")[0] + if output_object is not None + else None + ), + ) ) + obj_acs.append(obj_index) + + subgoal_last_frame_video = env.obs + video_frames.append(subgoal_last_frame_video) + (*_,) = get_action_from_agent( + model, + feat, + env.vocab, + env.vocab_obj, + env, + deterministic=deterministic, + epsilon=epsilon, + ret_value=False, + ) + obs.append(ob.cpu().detach().squeeze(1)) # last next obs + value_output = None + if value_output != None: + value_output = value_output.squeeze().cpu().detach().numpy() + value_predict.append(value_output) + + if log_video: + value_font = ImageFont.truetype("FreeMono.ttf", 20) + action_font = ImageFont.truetype("FreeMono.ttf", 14) + gif_logs = [] + for frame_number in range(len(video_frames)): + img = video_frames[frame_number] + img = Image.fromarray(img) + draw = ImageDraw.Draw(img) + if len(value_predict) != 0: + value_log = value_predict[frame_number] + # draw.text( + # (1, 280), + # "Value: %.3f" % (value_log), + # fill=(255, 255, 255), + # font=value_font, + # ) + if frame_number != 0: + reward_log = env_rewards[frame_number - 1] + # draw.text( + # (1, 260), + # "Reward: %.1f" % (reward_log), + # fill=(255, 255, 
255), + # font=value_font, + # ) + return_log = sum(env_rewards[0:frame_number]) + # draw.text( + # (150, 260), + # "Return: %.1f" % (return_log), + # fill=(255, 255, 255), + # font=value_font, + # ) + if frame_number != len(video_frames) - 1: + action_log, object_log = ( + str_act[frame_number]["action"], + str_act[frame_number]["object"], + ) + # draw.text( + # (1, 1), + # f"Action: {action_log}\nObject: {str(object_log)}", + # fill=(255, 255, 255), + # font=action_font, + # ) + + log_images = np.array(img) + gif_logs.append(log_images) + + video_frames = np.asarray(gif_logs) + video_frames = np.transpose(video_frames, (0, 3, 1, 2)) + + rewards = torch.tensor(env_rewards, dtype=torch.float) + dones = torch.zeros(len(rewards)) + dones[-1] = done + vid_caption = f"{chained_subgoal_instr[0] if isinstance(chained_subgoal_instr, list) else chained_subgoal_instr}: {'SUCCESS' if done else 'FAIL'}. Return: {rewards.sum()}/{env.num_subgoals_to_complete}." + ground_truth_sequence = " ".join(primitive_skills_to_choose_from) + return dict( + completed_skills=" ".join(completed_skills), + predicted_skills=" ".join(predicted_skills), + ground_truth_sequence=ground_truth_sequence, + high_level_skill=chained_subgoal_instr, + rews=rewards, + dones=dones, + video_frames=video_frames if log_video else None, + video_caption=vid_caption, + chained_language_instruction=process_annotation( + chained_subgoal_instr, model.vocab_word, train=False + ).long(), + skill_length=env.num_subgoals_to_complete, + ) def run_policy_multi_process( ret_queue, task_queue, offline_rl_model, + saycan_planner, resnet, device, - subgoal_pool, max_skill_length, - goal_states, + task_suite, + eval_json_path, + specific_task, ): - env = ThorEnv() + env = ALFREDRLEnv( + DATA_PATH, + REWARD_CONFIG_PATH, + task_suite, + eval_json_path, + vocab_path, + vocab_obj_path, + specific_task, + ) + num_eval_tasks = env.num_tasks + # put the number of tasks into the return queue to tell the calling script thing how many rollouts to perform for evaluation + ret_queue.put(num_eval_tasks) while True: task_args = task_queue.get() if task_args is None: break - ret_queue.put( - run_policy( - env, - offline_rl_model, - resnet, - device, - subgoal_pool, - max_skill_length, - goal_states, - *task_args, + with torch.no_grad(): + ret_queue.put( + run_policy( + env, + offline_rl_model, + saycan_planner, + resnet, + device, + max_skill_length, + *task_args, + ) ) - ) env.stop() diff --git a/sprint/saycan_eval.py b/sprint/saycan_eval.py index 4898cc8..b11c6bc 100644 --- a/sprint/saycan_eval.py +++ b/sprint/saycan_eval.py @@ -40,65 +40,11 @@ WANDB_PROJECT_NAME = "p-bootstrap-llm" -class ETLanguageEncoder: - def __init__(self, vocab_word): - self.vocab_word = vocab_word - - def encode(self, annotations, convert_to_tensor=False): - if convert_to_tensor: - return pad_sequence( - [process_annotation(a, self.vocab_word).long() for a in annotations], - batch_first=True, - padding_value=0, - ) - return [process_annotation(a, self.vocab_word).long() for a in annotations] - - -def setup_mp( - result_queue, - task_queue, - saycan_planner, - sentence_embedder, - agent_model, - resnet, - config, - device, - subgoal_pool, - goal_states, -): - num_workers = config.num_rollout_workers - workers = [] - # start workers - worker_target = run_policy_multi_process - for _ in range(num_workers): - worker = threading.Thread( - target=worker_target, - args=( - result_queue, - task_queue, - saycan_planner, - sentence_embedder, - agent_model, - resnet, - device, - subgoal_pool, - 
config.max_skill_length, - goal_states, - ), - ) - worker.daemon = True # kills thread/process when parent thread terminates - worker.start() - time.sleep(0.5) - workers.append(worker) - return workers - - def setup_mp( result_queue, task_queue, agent_model, saycan_planner, - sentence_encoder, resnet, config, device, @@ -116,17 +62,20 @@ def setup_mp( task_queue, agent_model, saycan_planner, - sentence_encoder, resnet, - config, device, + config.max_skill_length, + config.env_type, + config.eval_json, + config.specific_task, ), ) worker.daemon = True # kills thread/process when parent thread terminates worker.start() time.sleep(0.5) workers.append(worker) - return workers + num_tasks = result_queue.get() + return workers, num_tasks def multithread_dataset_aggregation( @@ -145,7 +94,7 @@ def multithread_dataset_aggregation( num_env_samples = 0 num_finished_tasks = 0 num_rollouts = ( - config.num_subgoals_in_pool if eval else config.num_rollouts_per_epoch + config.num_eval_tasks if eval else config.num_rollouts_per_epoch ) with tqdm(total=num_rollouts) as pbar: while num_finished_tasks < num_rollouts: @@ -165,7 +114,7 @@ def multithread_dataset_aggregation( extra_info["ground_truth_sequence"].append( result["ground_truth_sequence"] ) - num_env_samples += result["obs"].shape[0] + num_env_samples += result["rews"].shape[0] num_finished_tasks += 1 pbar.update(1) if eval: @@ -185,32 +134,29 @@ def multiprocess_rollout( result_queue, config, epsilon, - dataset, - eval, make_video, - dataset_agg_func, ): rollout_returns = [] subgoal_successes = [] rollout_gifs = [] video_captions = [] extra_info = defaultdict(list) - num_rollouts = config.num_subgoals_in_pool if eval else config.num_rollouts_per_epoch + + num_rollouts = ( + config.num_eval_tasks if eval else config.num_rollouts_per_epoch + ) + # create tasks for MP Queue # create tasks for thread/process Queue args_func = lambda subgoal: ( - config.env_type, - config.num_subgoals_in_pool, - True if eval else config.deterministic_action, - True if eval else False, # log to video + True, + True, epsilon, - subgoal if eval else None, + subgoal ) - for i in range(num_rollouts): - eval_list = eval_skill_info_list - eval_skill_index = i - task_queue.put(args_func(eval_skill_index, eval_list)) + for subgoal in range(num_rollouts): + task_queue.put(args_func(subgoal)) num_env_samples_list = [] # use list for thread safety multithread_dataset_aggregation( @@ -223,10 +169,17 @@ def multiprocess_rollout( None, config, num_env_samples_list, - eval, + True, ) num_env_samples = num_env_samples_list[0] + # aggregate metrics + rollout_metrics = dict( + average_return=np.mean(rollout_returns), + subgoal_success=np.mean(subgoal_successes), + ) + for key, value in extra_info.items(): + rollout_metrics[key] = np.mean(value) # make a WandB table for the high level skill, ground truth sequence, predicted, completed skills saycan_completed_skill_data = [] keys = [ @@ -273,13 +226,25 @@ def multiprocess_rollout( data=saycan_completed_skill_data, ) rollout_metrics["evaluation_table"] = table - renamed_metrics = {} - for key in rollout_metrics: - renamed_metrics[f"{rollout_mode}_{key}"] = rollout_metrics[key] - return renamed_metrics, num_env_samples + return rollout_metrics, num_env_samples def main(config): + config.eval_json = ( + f"{os.environ['SPRINT']}/sprint/rollouts/{config.env_type}_ann_human.json" + ) + seed = config.seed + torch.manual_seed(seed) + np.random.seed(seed) + random.seed(seed) + run = wandb.init( + resume=config.experiment_name, + 
project=WANDB_PROJECT_NAME, + entity=WANDB_ENTITY_NAME, + notes=config.notes, + config=config, + group=config.run_group, + ) model_checkpoint_dir = config.model_checkpoint_dir print(model_checkpoint_dir) list_of_checkpoints, list_of_epochs = get_list_of_checkpoints(model_checkpoint_dir) @@ -298,25 +263,6 @@ def main(config): ): vars(config)[key] = vars(old_config)[key] - run = wandb.init( - # resume=config.experiment_name, # "allow", - # name=config.experiment_name, - id=config.experiment_name, - name=config.experiment_name, - project=WANDB_PROJECT_NAME, - entity=WANDB_ENTITY_NAME, - notes=config.notes, - config=config, - group=config.run_group, - ) - seed = config.seed - if seed is not None: - torch.manual_seed(seed) - np.random.seed(seed) - torch.backends.cudnn.deterministic = False - random.seed(seed) - torch.backends.cudnn.benchmark = False # uses a lot of gpu memory if True - os.makedirs( os.path.join( config.save_dir, @@ -329,9 +275,12 @@ def main(config): exist_ok=True, ) + if vars(config)["model"] == "sprint": + agent_model = SPRINTETIQLModel(config) + else: + raise ValueError(f"Model {config.model} not supported for SayCan") + device = torch.device(config.gpus[0]) - agent_model = SPRINTETIQLModel(config) - goal_states = None if len(config.gpus) > 1: print(f"-----Using {len(config.gpus)} GPUs-----") agent_model = nn.DataParallel( @@ -342,13 +291,10 @@ def main(config): agent_model.load_from_checkpoint(checkpoint) - # print(agent_model) - resnet_args = AttrDict() resnet_args.visual_model = "resnet18" resnet_args.gpu = config.gpus[0] resnet = Resnet(resnet_args, eval=True, use_conv_feat=True) - sentence_encoder = ETLanguageEncoder(agent_model.vocab_word) saycan_planner = SaycanPlanner(config) @@ -356,128 +302,16 @@ def main(config): task_queue = queue.SimpleQueue() result_queue = queue.SimpleQueue() - with open(config.eval_json, "r") as f: - eval_skill_info_list = json.load(f) - - # sort skill info list by num_primitive_skills, descending, for faster evaluation with multiple threads - # eval_skill_info_list.sort( - # key=lambda x: sum( - # [ - # len(primitive_skill["api_action"]) - # for primitive_skill in x["primitive_skills"] - # ] - # ), - # reverse=True, - # ) - eval_skill_info_list.sort(key=lambda x: len(x["primitive_skills"]), reverse=True) - with open(f"scene_sampling/{config.scene_type}_scene.json", "r") as f: - all_scenes_json = json.load(f) - floorplan_set = set() - all_floorplans = [ - all_scenes_json[x["primitive_skills"][0]["scene_index"]]["scene_num"] - for x in eval_skill_info_list - ] - unique_floorplans = [ - x for x in all_floorplans if not (x in floorplan_set or floorplan_set.add(x)) - ] - - if config.eval_per_task_in_json: - eval_skill_info_list = [ - x for x in eval_skill_info_list if len(x["primitive_skills"]) - ] - - sorted_task_names = [ - dict( - task=task["task"], - starting_subgoal_id=min(task["subgoal_ids"]), - repeat_id=task["repeat_id"], - ) - for task in eval_skill_info_list - ] - else: - if config.specific_task is not None: - eval_skill_info_list = eval_skill_info_list[ - config.specific_task : config.specific_task + 1 - ] - # unique so we can select a specific env - sorted_task_names = [ - count[0] - for count in Counter( - [skill_info["task"] for skill_info in eval_skill_info_list] - ).most_common() - ] - - eval_skill_info_list = [ - skill_info - for skill_info in eval_skill_info_list - if skill_info["task"] in sorted_task_names - ] - - config.num_subgoals_in_pool = len(sorted_task_names) - # step by step evaluation skill info list - 
primitive_eval_skill_info_list = make_primitive_annotation_eval_dataset( - eval_skill_info_list - ) - print( - f"Evaluating on {len(sorted_task_names)} tasks. Total {len(eval_skill_info_list)} skills" - ) - - primitive_skills_to_use = [] - task_lengths = None - if config.eval_per_task_in_json: - task_lengths = [] - for task in primitive_eval_skill_info_list: - primitive_skills_to_use.append( - generate_primitive_skill_list_from_eval_skill_info_list([task]) - ) - task_lengths.append(len(task["primitive_skills"])) - else: - primitive_skills_to_use = [ - generate_primitive_skill_list_from_eval_skill_info_list( - primitive_eval_skill_info_list - ) - ] - if not config.use_only_task_primitive_skills: - # aggregate all primitive skills if they have the same floorplan - floorplan_per_task = [] - for task in eval_skill_info_list: - floorplan_per_task.append( - all_scenes_json[task["primitive_skills"][0]["scene_index"]]["scene_num"] - ) - aggregated_tasks = {} - for task, fp in zip(eval_skill_info_list, floorplan_per_task): - if fp not in aggregated_tasks: - aggregated_tasks[fp] = copy.deepcopy(task) - else: - aggregated_tasks[fp]["primitive_skills"].extend( - task["primitive_skills"] - ) - # now separate by floorplan - all_primitive_skills_per_floorplan = { - fp: generate_primitive_skill_list_from_eval_skill_info_list( - [aggregated_tasks[fp]] - ) - for fp in aggregated_tasks - } - # now add the primitive skills for each task - primitive_skills_to_use = [] - for fp in floorplan_per_task: - primitive_skills_to_use.append(all_primitive_skills_per_floorplan[fp]) - print( - f"Evaluating on {len(sorted_task_names)} tasks. Total {[len(primitive_skills_to_use[i]) for i in range(len(primitive_skills_to_use))]} skills" - ) - processes = setup_mp( + processes, num_eval_tasks = setup_mp( result_queue, task_queue, agent_model, saycan_planner, - sentence_encoder, resnet, config, device, - eval_skill_info_list, - goal_states, ) + config.num_eval_tasks = num_eval_tasks def signal_handler(sig, frame): print("SIGINT received. Exiting...closing all processes first") @@ -491,8 +325,7 @@ def signal_handler(sig, frame): result_queue, config, 0, - rollout_mode="fixed_eval", - eval_skill_info_list=eval_skill_info_list, + True, ) wandb.log( eval_metrics, @@ -519,7 +352,7 @@ def signal_handler(sig, frame): help="which gpus. pass in as comma separated string to use DataParallel on multiple GPUs", ) parser.add_argument( - "--seed", type=int, default=None, help="random seed for initialization" + "--seed", type=int, default=42, help="random seed for initialization" ) parser.add_argument( "--num_rollout_workers", @@ -550,17 +383,11 @@ def signal_handler(sig, frame): help="group to run the experiment in. If None, no group is used", ) parser.add_argument( - "--scene_type", - type=str, - default="valid_seen", - choices=["train", "valid_seen", "valid_unseen"], - help="which type of scenes to sample from/evaluate on", - ) - parser.add_argument( - "--eval_json", + "--env_type", type=str, - default="scene_sampling/bootstrap_valid_seen-40_ann_human.json", - help="path to the json file containing the evaluation scenes and skills", + required=True, + choices=["eval_instruct", "eval_length", "eval_scene"], + help="alfred environment to use", ) parser.add_argument( "--use_amp", @@ -570,31 +397,14 @@ def signal_handler(sig, frame): nargs="?", help="whether to use automatic mixed precision. 
set default to false to disable nans during online training.", ) - parser.add_argument( - "--eval_per_task_in_json", - type=str2bool, - default=True, - const=True, - nargs="?", - help="whether to evaluate each task in the json file separately.", - ) - parser.add_argument( - "--use_only_task_primitive_skills", - type=str2bool, - default=False, - const=True, - nargs="?", - help="whether to use only the given eval primitive skills for chaining", - ) # LLM arguments parser.add_argument( "--llm_model", type=str, - # default="decapoda-research/llama-13b-hf", - default="decapoda-research/llama-7B-hf", + default="huggyllama/llama-7b", help="which model to use for the large language model. ", choices=[ - "None", + "facebook/opt-125m", + "huggyllama/llama-7b", "decapoda-research/llama-13b-hf", "decapoda-research/llama-7B-hf", ],
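To summarize the new rollout-worker protocol introduced in `run_policy_multi_process` and `setup_mp` above: each worker builds its own `ALFREDRLEnv`, reports the number of evaluation tasks on the result queue (so the caller knows how many rollouts to expect), and then serves rollout requests from the task queue until it receives a `None` sentinel. Below is a minimal, self-contained sketch of just that handshake; it is illustrative only, and the stand-in worker echoes fake results instead of calling `run_policy`:

```python
# Illustrative sketch (not part of the patch) of the worker/queue handshake used by
# setup_mp and run_policy_multi_process: each worker first reports how many evaluation
# tasks its environment exposes, then serves rollout requests until it sees None.
import queue
import threading


def rollout_worker(ret_queue, task_queue, num_tasks):
    # The real worker builds an ALFREDRLEnv here; num_tasks stands in for env.num_tasks.
    ret_queue.put(num_tasks)  # tell the main thread how many rollouts to run
    while True:
        task_args = task_queue.get()
        if task_args is None:  # sentinel: shut the worker down
            break
        # The real worker calls run_policy(...); here we just echo a fake result.
        ret_queue.put({"task": task_args, "success": True})


def main():
    ret_queue, task_queue = queue.SimpleQueue(), queue.SimpleQueue()
    worker = threading.Thread(
        target=rollout_worker, args=(ret_queue, task_queue, 3), daemon=True
    )
    worker.start()

    num_eval_tasks = ret_queue.get()  # mirrors setup_mp reading num_tasks first
    for task_id in range(num_eval_tasks):
        task_queue.put(task_id)
    for _ in range(num_eval_tasks):
        print(ret_queue.get())

    task_queue.put(None)
    worker.join()


if __name__ == "__main__":
    main()
```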
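Similarly, the `threading.Lock` added to `sprint/datasets/large_language_model.py` serializes concurrent forward passes when several rollout threads query the shared LLM, keeping the effective batch size (and GPU memory use) bounded. A stripped-down sketch of that pattern, with a toy class standing in for the real Hugging Face model wrapper:

```python
# Stripped-down sketch of the lock-guarded LLM call added in large_language_model.py.
# ToyLLM is a stand-in for the real model wrapper; the point is only that the lock
# ensures at most one batch is being scored at a time across threads.
import threading
import time


class ToyLLM:
    def __init__(self):
        self.lock = threading.Lock()

    def _forward(self, batch):
        time.sleep(0.01)  # pretend this is an expensive GPU forward pass
        return [len(text) for text in batch]

    def score(self, batch):
        # even if multiple threads call at once, only one batch runs at a time
        with self.lock:
            return self._forward(batch)


llm = ToyLLM()
threads = [
    threading.Thread(target=lambda i=i: print(llm.score([f"skill {i}"])))
    for i in range(4)
]
for t in threads:
    t.start()
for t in threads:
    t.join()
```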