diff --git a/examples/hello_world.py b/examples/hello_world.py
index 4ba1c9b8908..b2beb202932 100644
--- a/examples/hello_world.py
+++ b/examples/hello_world.py
@@ -29,7 +29,7 @@
     "pad_token_id": tokenizer.eos_token_id,
     "max_new_tokens": 20,
 }
-response_tensor = ppo_trainer.generate([item for item in query_tensor], return_prompt=False, **generation_kwargs)
+response_tensor = ppo_trainer.generate(list(query_tensor), return_prompt=False, **generation_kwargs)
 response_txt = tokenizer.decode(response_tensor[0])

 # 5. define a reward for response
diff --git a/examples/research_projects/stack_llama/scripts/rl_training.py b/examples/research_projects/stack_llama/scripts/rl_training.py
index 4028a13b73d..c8502c5e89e 100644
--- a/examples/research_projects/stack_llama/scripts/rl_training.py
+++ b/examples/research_projects/stack_llama/scripts/rl_training.py
@@ -162,7 +162,7 @@ def preprocess_function(examples):


 def collator(data):
-    return dict((key, [d[key] for d in data]) for key in data[0])
+    return {key: [d[key] for d in data] for key in data[0]}


 # set seed before initializing value head for deterministic eval
diff --git a/examples/research_projects/tools/triviaqa.py b/examples/research_projects/tools/triviaqa.py
index 5732d142de1..5eb5044c2b1 100644
--- a/examples/research_projects/tools/triviaqa.py
+++ b/examples/research_projects/tools/triviaqa.py
@@ -113,7 +113,7 @@ class ScriptArguments:

 def data_generator():
     for i in range(len(dataset)):
-        yield dataset[i]["question"], [item for item in dataset[i]["answer"]["normalized_aliases"]]
+        yield dataset[i]["question"], list(dataset[i]["answer"]["normalized_aliases"])


 gen = data_generator()
@@ -187,8 +187,6 @@ def print_trainable_parameters(model):
         "answer": [", ".join(item) for item in answers],
     }
     all_rewards = ppo_trainer.accelerator.gather(torch.tensor(rewards, device=ppo_trainer.accelerator.device))
-    ppo_trainer.log_stats(
-        train_stats, texts, [item for item in all_rewards], columns_to_log=["query", "response", "answer"]
-    )
+    ppo_trainer.log_stats(train_stats, texts, list(all_rewards), columns_to_log=["query", "response", "answer"])
     if i % 100 == 0:
         ppo_trainer.save_pretrained(f"models/{args.model_name}_{args.seed}_{i}_triviaqa")
diff --git a/examples/research_projects/toxicity/scripts/gpt-j-6b-toxicity.py b/examples/research_projects/toxicity/scripts/gpt-j-6b-toxicity.py
index d7d826b1904..51f6d284c40 100644
--- a/examples/research_projects/toxicity/scripts/gpt-j-6b-toxicity.py
+++ b/examples/research_projects/toxicity/scripts/gpt-j-6b-toxicity.py
@@ -145,7 +145,7 @@ def tokenize(sample):


 def collator(data):
-    return dict((key, [d[key] for d in data]) for key in data[0])
+    return {key: [d[key] for d in data] for key in data[0]}


 # set seed before initializing value head for deterministic eval
diff --git a/examples/scripts/ppo.py b/examples/scripts/ppo.py
index 7b09fd79010..9282144e667 100644
--- a/examples/scripts/ppo.py
+++ b/examples/scripts/ppo.py
@@ -94,7 +94,7 @@ def tokenize(sample):


 def collator(data):
-    return dict((key, [d[key] for d in data]) for key in data[0])
+    return {key: [d[key] for d in data] for key in data[0]}


 # set seed before initializing value head for deterministic eval
diff --git a/examples/scripts/ppo_multi_adapter.py b/examples/scripts/ppo_multi_adapter.py
index f5614ddf159..782235781be 100644
--- a/examples/scripts/ppo_multi_adapter.py
+++ b/examples/scripts/ppo_multi_adapter.py
@@ -96,7 +96,7 @@ def tokenize(example):


 def collator(data):
-    return dict((key, [d[key] for d in data]) for key in data[0])
+    return {key: [d[key] for d in data] for key in data[0]}


 config = PPOConfig(
diff --git a/pyproject.toml b/pyproject.toml
index 5be5c57e48b..c54bb81ec4b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,9 +2,11 @@
 target-version = "py37"
 ignore = [
     "B028", # warning without explicit stacklevel
+    "C408", # dict() calls (stylistic)
+    "C901", # function complexity
     "E501",
 ]
-extend-select = ["E", "F", "I", "W", "UP", "B", "T"]
+extend-select = ["E", "F", "I", "W", "UP", "B", "T", "C"]
 line-length = 119

 [tool.ruff.per-file-ignores]
diff --git a/scripts/stale.py b/scripts/stale.py
index de7b869c132..0713f7f4196 100644
--- a/scripts/stale.py
+++ b/scripts/stale.py
@@ -35,7 +35,7 @@ def main():
     open_issues = repo.get_issues(state="open")

     for issue in open_issues:
-        comments = sorted([comment for comment in issue.get_comments()], key=lambda i: i.created_at, reverse=True)
+        comments = sorted(issue.get_comments(), key=lambda i: i.created_at, reverse=True)
         last_comment = comments[0] if len(comments) > 0 else None
         if (
             last_comment is not None
diff --git a/tests/test_no_peft.py b/tests/test_no_peft.py
index de5dce44b41..dae2103b121 100644
--- a/tests/test_no_peft.py
+++ b/tests/test_no_peft.py
@@ -140,7 +140,7 @@ def test_ppo_trainer_no_peft(self):
             # (this could be any reward such as human feedback or output from another model)
             reward = [torch.tensor(1.0), torch.tensor(0.0)]
             # train model
-            train_stats = ppo_trainer.step([q for q in query_tensor], [r for r in response_tensor], reward)
+            train_stats = ppo_trainer.step(list(query_tensor), list(response_tensor), reward)
             break

         # check gradients are not None
diff --git a/tests/test_ppo_trainer.py b/tests/test_ppo_trainer.py
index b5b31f512ec..8388102e671 100644
--- a/tests/test_ppo_trainer.py
+++ b/tests/test_ppo_trainer.py
@@ -200,7 +200,7 @@ def test_ppo_step(self):
             # (this could be any reward such as human feedback or output from another model)
             reward = [torch.tensor(1.0), torch.tensor(0.0)]
             # train model
-            train_stats = ppo_trainer.step([q for q in query_tensor], [r for r in response_tensor], reward)
+            train_stats = ppo_trainer.step(list(query_tensor), list(response_tensor), reward)
             break

         for param in ppo_trainer.model.parameters():
@@ -230,9 +230,7 @@ def test_ppo_step_with_masks(self):
             response_mask = [torch.ones_like(r) for r in response_tensor]

             # train model
-            train_stats = ppo_trainer.step(
-                [q for q in query_tensor], [r for r in response_tensor], reward, response_mask
-            )
+            train_stats = ppo_trainer.step(list(query_tensor), list(response_tensor), reward, response_mask)
             break

         for param in ppo_trainer.model.parameters():
@@ -264,7 +262,7 @@ def test_ppo_step_with_no_ref_sgd(self):
             # (this could be any reward such as human feedback or output from another model)
             reward = [torch.tensor(1.0), torch.tensor(0.0)]
             # train model
-            train_stats = ppo_trainer.step([q for q in query_tensor], [r for r in response_tensor], reward)
+            train_stats = ppo_trainer.step(list(query_tensor), list(response_tensor), reward)
             break

         for name, param in ppo_trainer.model.named_parameters():
@@ -304,8 +302,8 @@ def test_ppo_step_with_no_ref_sgd_lr_scheduler(self):
             # (this could be any reward such as human feedback or output from another model)
             reward = [torch.tensor(1.0), torch.tensor(0.0)]
             # train model
-            _ = ppo_trainer.step([q for q in query_tensor], [r for r in response_tensor], reward)
-            train_stats = ppo_trainer.step([q for q in query_tensor], [r for r in response_tensor], reward)
+            _ = ppo_trainer.step(list(query_tensor), list(response_tensor), reward)
+            train_stats = ppo_trainer.step(list(query_tensor), list(response_tensor), reward)
             break

         for name, param in ppo_trainer.model.named_parameters():
@@ -341,7 +339,7 @@ def test_ppo_step_with_no_ref(self):
             # (this could be any reward such as human feedback or output from another model)
             reward = [torch.tensor(1.0), torch.tensor(0.0)]
             # train model
-            train_stats = ppo_trainer.step([q for q in query_tensor], [r for r in response_tensor], reward)
+            train_stats = ppo_trainer.step(list(query_tensor), list(response_tensor), reward)
             break

         for name, param in ppo_trainer.model.named_parameters():
@@ -392,7 +390,7 @@ def test_ppo_step_with_no_ref_custom_layers(self):
             # (this could be any reward such as human feedback or output from another model)
             reward = [torch.tensor(1.0), torch.tensor(0.0)]
             # train model
-            train_stats = ppo_trainer.step([q for q in query_tensor], [r for r in response_tensor], reward)
+            train_stats = ppo_trainer.step(list(query_tensor), list(response_tensor), reward)
             break

         pattern = r".*transformer\.h\.(\d+)\..*"
@@ -405,7 +403,7 @@ def test_ppo_step_with_no_ref_custom_layers(self):
                     self.assertTrue(param.grad is None, f"Parameter {name} has a gradient")
                 else:
                     self.assertTrue(param.grad is not None, f"Parameter {name} has no gradient")
-            elif any([layer in name for layer in final_layers]):
+            elif any(layer in name for layer in final_layers):
                 self.assertTrue(param.grad is not None, f"Parameter {name} has no gradient")

         # ref model should not be trained
@@ -460,11 +458,11 @@ def test_ppo_step_rewards_shape(self):
             reward = [torch.tensor([[1.0]]), torch.tensor([[0.0]])]
             # train model - this should raise an error
             with self.assertRaises(ValueError):
-                _ = ppo_trainer.step([q for q in query_tensor], [r for r in response_tensor], reward)
+                _ = ppo_trainer.step(list(query_tensor), list(response_tensor), reward)

             reward = [torch.tensor([1.0]), torch.tensor([0.0])]
             # train model - this should work
-            _ = ppo_trainer.step([q for q in query_tensor], [r for r in response_tensor], reward)
+            _ = ppo_trainer.step(list(query_tensor), list(response_tensor), reward)
             break

         # check if the gradients are computed for the model
@@ -499,7 +497,7 @@ def test_ppo_step_input_shape(self):
             bs = ppo_trainer.config.batch_size

             queries, responses, _, _ = ppo_trainer._step_safety_checker(
-                bs, [q for q in query_tensor], [r for r in response_tensor], reward
+                bs, list(query_tensor), list(response_tensor), reward
             )

             self.assertTrue(isinstance(queries, list), f"queries should be a list, got {type(queries)}")
@@ -704,7 +702,7 @@ def test_ppo_trainer_max_grad_norm(self):
             # (this could be any reward such as human feedback or output from another model)
             reward = [torch.tensor(1.0), torch.tensor(0.0)]
             # train model
-            _ = ppo_trainer.step([q for q in query_tensor], [r for r in response_tensor], reward)
+            _ = ppo_trainer.step(list(query_tensor), list(response_tensor), reward)
             break

         # check gradients
@@ -894,11 +892,11 @@ def make_inputs_require_grad(module, input, output):
             # (this could be any reward such as human feedback or output from another model)
             reward = [torch.tensor(1.0), torch.tensor(0.0)]
             # train model by running a step twice
-            _ = ppo_trainer.step([q for q in query_tensor], [r for r in response_tensor], reward)
+            _ = ppo_trainer.step(list(query_tensor), list(response_tensor), reward)

             ppo_trainer.model.train()
             ppo_trainer.model.gradient_checkpointing_enable()
-            _ = ppo_trainer.step([q for q in query_tensor], [r for r in response_tensor], reward)
+            _ = ppo_trainer.step(list(query_tensor), list(response_tensor), reward)
             break

         # check gradients
@@ -982,11 +980,11 @@ def make_inputs_require_grad(module, input, output):
             # (this could be any reward such as human feedback or output from another model)
             reward = [torch.tensor(1.0), torch.tensor(0.0)]
             # train model by running a step twice
-            _ = ppo_trainer.step([q for q in query_tensor], [r for r in response_tensor], reward)
+            _ = ppo_trainer.step(list(query_tensor), list(response_tensor), reward)

             ppo_trainer.model.train()
             ppo_trainer.model.gradient_checkpointing_enable()
-            _ = ppo_trainer.step([q for q in query_tensor], [r for r in response_tensor], reward)
+            _ = ppo_trainer.step(list(query_tensor), list(response_tensor), reward)
             break

         new_logits = ppo_trainer.model.compute_reward_score(dummy_inputs)
@@ -1092,11 +1090,11 @@ def make_inputs_require_grad(module, input, output):
             # (this could be any reward such as human feedback or output from another model)
             reward = [torch.tensor(1.0), torch.tensor(0.0)]
             # train model by running a step twice
-            _ = ppo_trainer.step([q for q in query_tensor], [r for r in response_tensor], reward)
+            _ = ppo_trainer.step(list(query_tensor), list(response_tensor), reward)

             ppo_trainer.model.train()
             ppo_trainer.model.gradient_checkpointing_enable()
-            _ = ppo_trainer.step([q for q in query_tensor], [r for r in response_tensor], reward)
+            _ = ppo_trainer.step(list(query_tensor), list(response_tensor), reward)
             break

         # check gradients
@@ -1162,7 +1160,7 @@ def test_grad_accumulation(self):
             # (this could be any reward such as human feedback or output from another model)
             reward = [torch.tensor(1.0), torch.tensor(1.0)]
             # train model by running a step twice
-            _ = ppo_trainer.step([q for q in query_tensor], [r for r in response_tensor], reward)
+            _ = ppo_trainer.step(list(query_tensor), list(response_tensor), reward)
             break

         model_grad = gpt2_model.v_head.summary.weight
@@ -1186,7 +1184,7 @@ def test_grad_accumulation(self):
             # (this could be any reward such as human feedback or output from another model)
             reward = [torch.tensor(1.0), torch.tensor(1.0)]
             # train model by running a step twice
-            _ = ppo_trainer.step([q for q in query_tensor], [r for r in response_tensor], reward)
+            _ = ppo_trainer.step(list(query_tensor), list(response_tensor), reward)
             break

         model_grad_acc = gpt2_model_clone.v_head.summary.weight
@@ -1224,7 +1222,7 @@ def test_push_to_hub_if_best_reward(self):
             # (this could be any reward such as human feedback or output from another model)
             reward = [torch.tensor(1.0), torch.tensor(0.0)]
             # train model
-            _ = ppo_trainer.step([q for q in query_tensor], [r for r in response_tensor], reward)
+            _ = ppo_trainer.step(list(query_tensor), list(response_tensor), reward)
             break

     def test_batch_size_check(self):
diff --git a/tests/test_reward_trainer.py b/tests/test_reward_trainer.py
index df06e7fd30b..ad71fc7e9bd 100644
--- a/tests/test_reward_trainer.py
+++ b/tests/test_reward_trainer.py
@@ -175,7 +175,7 @@ def test_reward_trainer_peft(self):

         # check gradients are not None
         for n, param in trainer.model.named_parameters():
-            if any([t in n for t in trainable_params_name]):
+            if any(t in n for t in trainable_params_name):
                 previous_trainable_params[n] = param.clone()
             else:
                 previous_non_trainable_params[n] = param.clone()
diff --git a/trl/core.py b/trl/core.py
index 39755ce2125..9d92ee18f03 100644
--- a/trl/core.py
+++ b/trl/core.py
@@ -80,7 +80,7 @@ def stack_dicts(stats_dicts: List[Dict]) -> Dict:

 def add_suffix(input_dict: Dict, suffix: str) -> Dict:
     """Add suffix to dict keys."""
-    return dict((k + suffix, v) for k, v in input_dict.items())
+    return {k + suffix: v for k, v in input_dict.items()}


 def pad_to_size(tensor: torch.Tensor, size: int, dim: int = 1, padding: int = 50256) -> torch.Tensor:
diff --git a/trl/environment/base_environment.py b/trl/environment/base_environment.py
index 58b61fd17f2..7037166d762 100644
--- a/trl/environment/base_environment.py
+++ b/trl/environment/base_environment.py
@@ -46,7 +46,7 @@ def __call__(self, input_ids, scores, **kwargs):
         done = []

         for i, decoded_generation in enumerate(decoded_generations):
-            sequence_complete = any([stop_string in decoded_generation for stop_string in self.stop_strings])
+            sequence_complete = any(stop_string in decoded_generation for stop_string in self.stop_strings)
             done.append(sequence_complete)
             if not sequence_complete:
                 self.generated_tokens[i] += 1
@@ -243,7 +243,7 @@ def __init__(
         if isinstance(tools, dict):
             self.tools = tools
         else:
-            self.tools = dict([(tool.__class__.__name__, tool) for tool in tools])
+            self.tools = {tool.__class__.__name__: tool for tool in tools}
         self.reward_fn = reward_fn
         self.max_length = max_length
         self.request_token = "<request>"
@@ -278,7 +278,7 @@ def run(self, queries, **rewards_kwargs):

         histories = [TextHistory(q, qt, system=True) for q, qt in zip(queries, queries_tokens)]

-        while any([not history.completed for history in histories]) and turns < self.max_turns:
+        while any(not history.completed for history in histories) and turns < self.max_turns:
             histories = self.generate(histories)
             histories = self.tasks_end_check(histories)
             # TODO: make this parallel rather than for-loop
diff --git a/trl/models/modeling_base.py b/trl/models/modeling_base.py
index 17e77380aaf..f7894ddedbe 100644
--- a/trl/models/modeling_base.py
+++ b/trl/models/modeling_base.py
@@ -384,7 +384,7 @@ def _get_checkpoint_from_hub(
             # check filename with `v_head` or any known extra module:
             files_to_download = set()
             for k, v in index["weight_map"].items():
-                if any([module in k for module in cls.supported_modules]):
+                if any(module in k for module in cls.supported_modules):
                     files_to_download.add(v)
             is_sharded = True

@@ -487,7 +487,7 @@ def add_and_load_reward_modeling_adapter(
         adapter_state_dict = loading_func(local_filename, **load_kwargs)

         for score_name_candidate in cls.supported_rm_modules:
-            if any([score_name_candidate in name for name in adapter_state_dict.keys()]):
+            if any(score_name_candidate in name for name in adapter_state_dict.keys()):
                 score_name = score_name_candidate
                 # we have found the correct head name and can break
                 break
@@ -500,7 +500,7 @@ def add_and_load_reward_modeling_adapter(
                 score_dict[key_name] = param.to(cls._get_current_device())

         num_labels, hidden_dim = score_dict["weight"].shape
-        has_bias = any(["bias" in name for name in adapter_state_dict.keys()])
+        has_bias = any("bias" in name for name in adapter_state_dict.keys())

         score = nn.Linear(hidden_dim, num_labels, bias=has_bias).to(
             device=cls._get_current_device(),
@@ -638,7 +638,7 @@ def create_reference_model(
     else:
         for pattern_candidate in LAYER_PATTERNS:
             pattern_candidate = pattern_candidate.format(layer=num_shared_layers)
-            if any([pattern_candidate in name for name in parameter_names]):
+            if any(pattern_candidate in name for name in parameter_names):
                 pattern = pattern_candidate
                 break

diff --git a/trl/trainer/ppo_trainer.py b/trl/trainer/ppo_trainer.py
index d2462c581be..df6c3090e92 100644
--- a/trl/trainer/ppo_trainer.py
+++ b/trl/trainer/ppo_trainer.py
@@ -1337,7 +1337,7 @@ def log_stats(
         if self.config.log_with == "wandb":
             import wandb

-            if any([column_to_log not in batch.keys() for column_to_log in columns_to_log]):
+            if any(column_to_log not in batch.keys() for column_to_log in columns_to_log):
                 raise ValueError(f"Columns to log {columns_to_log} are not present in the batch {batch.keys()}.")

             batch_list = [batch[column_to_log] for column_to_log in columns_to_log]
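
Note (not part of the patch): the hunks above all apply the same comprehension-related cleanup that the newly enabled ruff "C" (flake8-comprehensions) ruleset flags, with C408 and C901 explicitly ignored in pyproject.toml. A minimal sketch of the rewrite pattern, using made-up variable names purely for illustration:

    # before: dict() built from a generator, a needless list-comprehension copy,
    # and any() fed a temporary list
    stats = dict((k + "_mean", v) for k, v in metrics.items())
    queries = [q for q in query_tensor]
    done = any([s in text for s in stop_strings])

    # after: dict comprehension, list() on the iterable, and a lazy generator
    # expression passed straight to any()
    stats = {k + "_mean": v for k, v in metrics.items()}
    queries = list(query_tensor)
    done = any(s in text for s in stop_strings)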