From ab94211c5f3f072ded21494859253e68647b7563 Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Wed, 18 Sep 2024 07:00:56 -0400 Subject: [PATCH] [CI/Build] Update Ruff version (#8469) Signed-off-by: Aaron Pham Co-authored-by: Cyrus Leung --- .github/workflows/ruff.yml | 4 ++-- benchmarks/kernels/graph_machete_bench.py | 4 +--- format.sh | 4 ++-- pyproject.toml | 2 ++ requirements-lint.txt | 2 +- tests/conftest.py | 5 +---- tests/lora/conftest.py | 5 +---- tests/multimodal/test_base.py | 2 +- tests/test_cache_block_hashing.py | 5 +---- tests/test_logger.py | 4 ++-- tests/worker/test_encoder_decoder_model_runner.py | 4 +--- tests/worker/test_model_runner.py | 4 +--- vllm/adapter_commons/utils.py | 2 +- vllm/attention/backends/utils.py | 6 ++---- vllm/core/block/prefix_caching_block.py | 4 +--- vllm/core/block_manager_v2.py | 4 +--- vllm/engine/async_llm_engine.py | 6 +++--- vllm/engine/llm_engine.py | 6 +++--- .../guided_decoding/outlines_logits_processors.py | 4 ++-- .../layers/quantization/awq_marlin.py | 6 +++--- .../compressed_tensors/compressed_tensors.py | 14 +++++++------- .../layers/quantization/gptq_marlin.py | 8 ++++---- vllm/model_executor/model_loader/tensorizer.py | 4 +--- vllm/model_executor/models/minicpmv.py | 2 +- vllm/spec_decode/draft_model_runner.py | 5 +---- vllm/spec_decode/metrics.py | 7 ++----- vllm/triton_utils/libentry.py | 4 ++-- 27 files changed, 50 insertions(+), 77 deletions(-) diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index 1a794af572fef..90735d6e2bbf9 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -25,10 +25,10 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2 + pip install -r requirements-lint.txt - name: Analysing the code with ruff run: | - ruff . + ruff check . - name: Spelling check with codespell run: | codespell --toml pyproject.toml diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py index 1d076ed6d5c18..de608fd05af70 100644 --- a/benchmarks/kernels/graph_machete_bench.py +++ b/benchmarks/kernels/graph_machete_bench.py @@ -45,8 +45,7 @@ rows = int(math.ceil(len(results) / 2)) fig, axs = plt.subplots(rows, 2, figsize=(12, 5 * rows)) axs = axs.flatten() - axs_idx = 0 - for shape, data in results.items(): + for axs_idx, (shape, data) in enumerate(results.items()): plt.sca(axs[axs_idx]) df = pd.DataFrame(data) sns.lineplot(data=df, @@ -59,6 +58,5 @@ palette="Dark2") plt.title(f"Shape: {shape}") plt.ylabel("time (median, s)") - axs_idx += 1 plt.tight_layout() plt.savefig("graph_machete_bench.pdf") diff --git a/format.sh b/format.sh index 2204b3ba59498..6563d89b192ea 100755 --- a/format.sh +++ b/format.sh @@ -159,7 +159,7 @@ echo 'vLLM codespell: Done' # Lint specified files lint() { - ruff "$@" + ruff check "$@" } # Lint files that differ from main branch. Ignores dirs that are not slated @@ -175,7 +175,7 @@ lint_changed() { if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - ruff + ruff check fi } diff --git a/pyproject.toml b/pyproject.toml index 6b682f5d4dd4d..14f0934499c46 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,8 @@ ignore = [ "E731", # Loop control variable not used within loop body "B007", + # f-string format + "UP032", ] [tool.mypy] diff --git a/requirements-lint.txt b/requirements-lint.txt index d0b2fef6deaef..07f738873e1a8 100644 --- a/requirements-lint.txt +++ b/requirements-lint.txt @@ -2,7 +2,7 @@ yapf==0.32.0 toml==0.10.2 tomli==2.0.1 -ruff==0.1.5 +ruff==0.6.5 codespell==2.3.0 isort==5.13.2 clang-format==18.1.5 diff --git a/tests/conftest.py b/tests/conftest.py index e4c7b96e82429..e9c7fc7bf9c67 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -158,10 +158,7 @@ def should_do_global_cleanup_after_test(request) -> bool: to initialize torch. """ - if request.node.get_closest_marker("skip_global_cleanup"): - return False - - return True + return not request.node.get_closest_marker("skip_global_cleanup") @pytest.fixture(autouse=True) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 0bcae5b0c96dc..4834a9d35a3ee 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -65,10 +65,7 @@ def should_do_global_cleanup_after_test(request) -> bool: to initialize torch. """ - if request.node.get_closest_marker("skip_global_cleanup"): - return False - - return True + return not request.node.get_closest_marker("skip_global_cleanup") @pytest.fixture(autouse=True) diff --git a/tests/multimodal/test_base.py b/tests/multimodal/test_base.py index e9562d2048f06..68d05de904ba8 100644 --- a/tests/multimodal/test_base.py +++ b/tests/multimodal/test_base.py @@ -5,7 +5,7 @@ def assert_nested_tensors_equal(expected: NestedTensors, actual: NestedTensors): - assert type(expected) == type(actual) + assert type(expected) == type(actual) # noqa: E721 if isinstance(expected, torch.Tensor): assert torch.equal(expected, actual) else: diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index fe413d1228021..3576a4834ebc3 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -66,8 +66,7 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, hashes.append([]) prompts = [prefix + prompt for prompt in sample_prompts] - seq_id = 0 - for prompt in prompts: + for seq_id, prompt in enumerate(prompts): hashes[-1].append([]) prompt_token_ids = tokenizer.encode(prompt) seq = Sequence(seq_id, @@ -83,8 +82,6 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, for idx in range(num_blocks): hashes[-1][-1].append(seq.hash_of_block(idx)) - seq_id += 1 - # Check that hashes made with two prefixes with different first blocks are # different everywhere. for hash0, hash1 in zip(flatten_2d(hashes[0]), flatten_2d(hashes[1])): diff --git a/tests/test_logger.py b/tests/test_logger.py index 8f3d218416870..fadf66f2b61d4 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -111,7 +111,7 @@ def test_an_error_is_raised_when_custom_logging_config_file_does_not_exist(): configuration occurs.""" with pytest.raises(RuntimeError) as ex_info: _configure_vllm_root_logger() - assert ex_info.type == RuntimeError + assert ex_info.type == RuntimeError # noqa: E721 assert "File does not exist" in str(ex_info) @@ -152,7 +152,7 @@ def test_an_error_is_raised_when_custom_logging_config_is_unexpected_json( logging_config_file.name): with pytest.raises(ValueError) as ex_info: _configure_vllm_root_logger() - assert ex_info.type == ValueError + assert ex_info.type == ValueError # noqa: E721 assert "Invalid logging config. Expected Dict, got" in str(ex_info) diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index a00d46ddeb007..c0654712b71b5 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -453,8 +453,7 @@ def test_prepare_decode(batch_size): # each sequence) in the decode phase expected_selected_token_indices = [] - selected_token_start_idx = 0 - for seq_len in seq_lens: + for selected_token_start_idx, seq_len in enumerate(seq_lens): # Compute the index offset of the final token in each # sequence's decoded outputs; since a single token is # decoded per iteration per sequence, then the length @@ -463,7 +462,6 @@ def test_prepare_decode(batch_size): # generated tokens is 0 (i.e. the expected sampling index # for a given sequence is just `selected_token_start_idx`) expected_selected_token_indices.append(selected_token_start_idx) - selected_token_start_idx += 1 sampling_metadata = model_input.sampling_metadata actual = sampling_metadata.selected_token_indices diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index a20aa37bcc1e2..42b2337f46914 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -241,10 +241,8 @@ def test_prepare_decode_cuda_graph(batch_size): # Verify Sampling expected_selected_token_indices = [] - selected_token_start_idx = 0 - for _ in context_lens: + for selected_token_start_idx, _ in enumerate(context_lens): expected_selected_token_indices.append(selected_token_start_idx) - selected_token_start_idx += 1 sampling_metadata = SamplingMetadata.prepare( seq_group_metadata_list, seq_lens, diff --git a/vllm/adapter_commons/utils.py b/vllm/adapter_commons/utils.py index 6c5411f7d3d5c..1e9adca50093b 100644 --- a/vllm/adapter_commons/utils.py +++ b/vllm/adapter_commons/utils.py @@ -42,7 +42,7 @@ def list_adapters(registered_adapters: Dict[int, Any]) -> Dict[int, Any]: def get_adapter(adapter_id: int, registered_adapters: Dict[int, Any]) -> Optional[Any]: - return registered_adapters.get(adapter_id, None) + return registered_adapters.get(adapter_id) ## worker functions diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 089008967a244..49fbb25f4547b 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -33,10 +33,8 @@ def is_block_tables_empty(block_tables: Union[None, Dict]): """ if block_tables is None: return True - if isinstance(block_tables, dict) and all( - value is None for value in block_tables.values()): - return True - return False + return (isinstance(block_tables, dict) + and all(value is None for value in block_tables.values())) def compute_slot_mapping_start_idx(is_prompt: bool, query_len: int, diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index a87e814cfb041..db67c95c32429 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -417,9 +417,7 @@ def get_prefix_cache_hit_rate(self) -> float: def is_block_cached(self, block: Block) -> bool: assert block.content_hash is not None - if block.content_hash in self._cached_blocks: - return True - return False + return block.content_hash in self._cached_blocks def promote_to_immutable_block(self, block: Block) -> BlockId: """Once a mutable block is full, it can be promoted to an immutable diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index b06385b062e83..54818c7e3e9a6 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -399,9 +399,7 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: """ alloc_status = self._can_swap(seq_group, Device.CPU, SequenceStatus.RUNNING) - if alloc_status == AllocStatus.OK: - return True - return False + return alloc_status == AllocStatus.OK def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: """Returns the block id mapping (from GPU to CPU) generated by diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 410e6ffaa2d50..82cdd41ad497e 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -826,7 +826,7 @@ async def generate( request_id: The unique id of the request. lora_request: LoRA request to use for generation, if any. trace_headers: OpenTelemetry trace headers. - prompt_adapter_request: Prompt Adapter request to use + prompt_adapter_request: Prompt Adapter request to use for generation, if any. Yields: @@ -1042,7 +1042,7 @@ def remove_logger(self, logger_name: str) -> None: async def start_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes - if type(self.engine.model_executor) == GPUExecutorAsync: + if type(self.engine.model_executor) == GPUExecutorAsync: # noqa: E721 self.engine.model_executor.start_profile() else: self.engine.model_executor._run_workers("start_profile") @@ -1050,7 +1050,7 @@ async def start_profile(self) -> None: async def stop_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes - if type(self.engine.model_executor) == GPUExecutorAsync: + if type(self.engine.model_executor) == GPUExecutorAsync: # noqa: E721 self.engine.model_executor.stop_profile() else: self.engine.model_executor._run_workers("stop_profile") diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 8b5009b2c6668..bdf1af014342a 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -144,7 +144,7 @@ class LLMEngine: decoding. executor_class: The model executor class for managing distributed execution. - prompt_adapter_config (Optional): The configuration related to serving + prompt_adapter_config (Optional): The configuration related to serving prompt adapters. log_stats: Whether to log statistics. usage_context: Specified entry point, used for usage info collection. @@ -1605,7 +1605,7 @@ def check_health(self) -> None: def start_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes (MultiprocessingGPUExecutor) - if type(self.model_executor) == GPUExecutor: + if type(self.model_executor) == GPUExecutor: # noqa: E721 self.model_executor.start_profile() else: self.model_executor._run_workers("start_profile") @@ -1613,7 +1613,7 @@ def start_profile(self) -> None: def stop_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes (MultiprocessingGPUExecutor) - if type(self.model_executor) == GPUExecutor: + if type(self.model_executor) == GPUExecutor: # noqa: E721 self.model_executor.stop_profile() else: self.model_executor._run_workers("stop_profile") diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index 554dcc0ed43ed..c28bd71c9f682 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -67,9 +67,9 @@ def __call__(self, input_ids: List[int], instruction = self._guide.get_next_instruction( state=self._fsm_state[seq_id]) - if type(instruction) == Generate: + if type(instruction) == Generate: # noqa: E721 allowed_tokens = instruction.tokens - elif type(instruction) == Write: + elif type(instruction) == Write: # noqa: E721 # TODO: support fast forward tokens allowed_tokens = [instruction.tokens[0]] else: diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index eee6a8f7cff49..eed01953fb4af 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -110,9 +110,9 @@ def get_scaled_act_names(self) -> List[str]: def is_awq_marlin_compatible(cls, quant_config: Dict[str, Any]): # Extract data from quant config. quant_method = quant_config.get("quant_method", "").lower() - num_bits = quant_config.get("bits", None) - group_size = quant_config.get("group_size", None) - has_zp = quant_config.get("zero_point", None) + num_bits = quant_config.get("bits") + group_size = quant_config.get("group_size") + has_zp = quant_config.get("zero_point") if quant_method != "awq": return False diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index ab8207f128348..e536fae45c845 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, cast import torch from pydantic import BaseModel @@ -79,8 +79,8 @@ def get_quant_method( @classmethod def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig": target_scheme_map: Dict[str, Any] = dict() - ignore: List[str] = config.get("ignore", None) - quant_format: str = config.get("format", None) + ignore = cast(List[str], config.get("ignore")) + quant_format = cast(str, config.get("format")) # The quant_config has multiple config_groups, each containing # an input_activations key with details about how the activations are @@ -200,7 +200,7 @@ def _is_fp8_w8a16(self, weight_quant: BaseModel, is_per_tensor_or_channel_weight = (weight_quant.strategy in [ QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL ]) - if not (is_symmetric_weight and is_static_weight + if not (is_symmetric_weight and is_static_weight # noqa: SIM103 and is_per_tensor_or_channel_weight): return False @@ -333,7 +333,7 @@ def create_weights(self, layer: torch.nn.Module, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): """ - Use the CompressedTensorsScheme associated with each layer to create + Use the CompressedTensorsScheme associated with each layer to create the necessary parameters for the layer. See LinearMethodBase for param details """ @@ -352,8 +352,8 @@ def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None): """ - Use the output of create_weights and the CompressedTensorsScheme - associated with the layer to apply the forward pass with the + Use the output of create_weights and the CompressedTensorsScheme + associated with the layer to apply the forward pass with the layer input. See LinearMethodBase for param details """ diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index cc699f5b4554f..5a1b2d701ab0d 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -132,10 +132,10 @@ def get_scaled_act_names(self) -> List[str]: def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]): # Extract data from quant config. quant_method = quant_config.get("quant_method", "").lower() - num_bits = quant_config.get("bits", None) - group_size = quant_config.get("group_size", None) - sym = quant_config.get("sym", None) - desc_act = quant_config.get("desc_act", None) + num_bits = quant_config.get("bits") + group_size = quant_config.get("group_size") + sym = quant_config.get("sym") + desc_act = quant_config.get("desc_act") if quant_method != "gptq": return False diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 3aac5cd2b43a5..36f33d6d139ee 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -408,9 +408,7 @@ def is_vllm_tensorized(tensorizer_config: "TensorizerConfig") -> bool: "inferred as vLLM models, so setting vllm_tensorized=True is " "only necessary for models serialized prior to this change.") return True - if (".vllm_tensorized_marker" in deserializer): - return True - return False + return ".vllm_tensorized_marker" in deserializer def serialize_vllm_model( diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index f8be9490ee55d..f0fc950defed7 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -884,7 +884,7 @@ def __new__( version = str(config.version).split(".") version = tuple([int(x) for x in version]) # Dispatch class based on version - instance_class = _SUPPORT_VERSION.get(version, None) + instance_class = _SUPPORT_VERSION.get(version) if instance_class is None: raise ValueError( "Currently, MiniCPMV only supports versions 2.0, 2.5, and 2.6") diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index 1e403637d2388..cf64af72a14a5 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -183,10 +183,7 @@ def supports_gpu_multi_step(self, execute_model_req: ExecuteModelRequest): return False # TODO: Add soft-tuning prompt adapter support - if self.prompt_adapter_config: - return False - - return True + return not self.prompt_adapter_config @torch.inference_mode() def execute_model( diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py index ad4e2dc879d7b..89ccaba70e93c 100644 --- a/vllm/spec_decode/metrics.py +++ b/vllm/spec_decode/metrics.py @@ -104,13 +104,10 @@ def _should_collect_rejsample_metrics(self, now: float) -> bool: if self._rank != 0: return False - if (now - self._last_metrics_collect_time < - self._rejsample_metrics_collect_interval_s): - return False - return True + return now - self._last_metrics_collect_time >= self._rejsample_metrics_collect_interval_s # noqa: E501 def _copy_rejsample_metrics_async(self) -> torch.cuda.Event: - """Copy rejection/typical-acceptance sampling metrics + """Copy rejection/typical-acceptance sampling metrics (number of accepted tokens, etc) to CPU asynchronously. Returns a CUDA event recording when the copy is complete. diff --git a/vllm/triton_utils/libentry.py b/vllm/triton_utils/libentry.py index ae00af44a048a..4335c7adfc13b 100644 --- a/vllm/triton_utils/libentry.py +++ b/vllm/triton_utils/libentry.py @@ -35,8 +35,8 @@ def key(self, spec_args, dns_args, const_args): dns_key = [ arg.dtype if hasattr( arg, "data_ptr") else type(arg) if not isinstance(arg, int) - else "i32" if -(2**31) <= arg and arg <= 2**31 - - 1 else "u64" if 2**63 <= arg and arg <= 2**64 - 1 else "i64" + else "i32" if arg >= -(2**31) and arg <= 2**31 - + 1 else "u64" if arg >= 2**63 and arg <= 2**64 - 1 else "i64" for arg in dns_args ] # const args passed by position