From 9fb95d923368e1cb7645230592e410700f40580d Mon Sep 17 00:00:00 2001
From: Steven Liu
Date: Thu, 12 Sep 2024 11:17:54 -0700
Subject: [PATCH 1/2] more docstrings

---
 src/accelerate/utils/dataclasses.py | 230 ++++++++++++++++++++++++++++
 src/accelerate/utils/imports.py     |   3 +
 2 files changed, 233 insertions(+)

diff --git a/src/accelerate/utils/dataclasses.py b/src/accelerate/utils/dataclasses.py
index 07a37dcbae9..2e87808d43e 100644
--- a/src/accelerate/utils/dataclasses.py
+++ b/src/accelerate/utils/dataclasses.py
@@ -818,6 +818,21 @@ class DataLoaderConfiguration:
 class ProjectConfiguration:
     """
     Configuration for the Accelerator object based on inner-project needs.
+
+    Args:
+        project_dir (`str`, defaults to `None`):
+            A path to a directory for storing data.
+        logging_dir (`str`, defaults to `None`):
+            A path to a directory for storing logs of locally-compatible loggers. If `None`, defaults to
+            `project_dir`.
+        automatic_checkpoint_naming (`bool`, defaults to `False`):
+            Whether saved states should be automatically iteratively named.
+        total_limit (`int`, defaults to `None`):
+            The maximum number of total saved states to keep.
+        iteration (`int`, defaults to `0`):
+            The current save iteration.
+        save_on_each_node (`bool`, defaults to `False`):
+            When doing multi-node distributed training, whether to save models and checkpoints on each node, or only
+            on the main one.
     """
 
     project_dir: str = field(default=None, metadata={"help": "A path to a directory for storing data."})
@@ -915,6 +930,19 @@ class GradientAccumulationPlugin(KwargsHandler):
 class TorchDynamoPlugin(KwargsHandler):
     """
     This plugin is used to compile a model with PyTorch 2.0
+
+    Args:
+        backend (`DynamoBackend`, defaults to `None`):
+        mode (`str`, defaults to `None`):
+            Possible options are 'default', 'reduce-overhead' or 'max-autotune'.
+        fullgraph (`bool`, defaults to `None`):
+            Whether it is OK to break the model into several subgraphs.
+        dynamic (`bool`, defaults to `None`):
+            Whether to use dynamic shape for tracing.
+        options (`Any`, defaults to `None`):
+            A dictionary of options to pass to the backend.
+        disable (`bool`, defaults to `False`):
+            Turn `torch.compile()` into a no-op for testing.
     """
 
     backend: DynamoBackend = field(
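A minimal sketch of how the two dataclasses documented above are typically wired into an `Accelerator` (not part of the patch). The directory paths and the `inductor` backend choice are placeholders, not values suggested by this patch:

```python
from accelerate import Accelerator
from accelerate.utils import ProjectConfiguration, TorchDynamoPlugin

# Keep checkpoints and tracker logs under one project directory, with auto-numbered saves.
project_config = ProjectConfiguration(
    project_dir="runs/demo",            # placeholder path for storing data/checkpoints
    logging_dir="runs/demo/logs",       # defaults to project_dir when omitted
    automatic_checkpoint_naming=True,   # saves become checkpoints/checkpoint_0, _1, ...
    total_limit=3,                      # keep at most 3 saved states
)

# TorchDynamoPlugin mirrors the torch.compile() arguments documented above.
dynamo_plugin = TorchDynamoPlugin(backend="inductor", mode="default", fullgraph=False)
print(dynamo_plugin.to_dict())          # KwargsHandler subclasses expose their settings as a dict

# The simplest wiring: pass the project config, and select the dynamo backend by name.
accelerator = Accelerator(project_config=project_config, dynamo_backend="inductor")
print(accelerator.project_dir)
```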
@@ -951,6 +979,39 @@ def to_dict(self):
 class DeepSpeedPlugin:
     """
     This plugin is used to integrate DeepSpeed.
+
+    Args:
+        hf_ds_config (`Any`, defaults to `None`):
+            Path to DeepSpeed config file or dict or an object of class `accelerate.utils.deepspeed.HfDeepSpeedConfig`.
+        gradient_accumulation_steps (`int`, defaults to `None`):
+            Number of steps to accumulate gradients before updating optimizer states. If not set, will use the value
+            from the `Accelerator` directly.
+        gradient_clipping (`float`, defaults to `None`):
+            Enable gradient clipping with the given value.
+        zero_stage (`int`, defaults to `None`):
+            Possible options are 0, 1, 2, 3. Default will be taken from environment variable.
+        is_train_batch_min (`bool`, defaults to `True`):
+            If both train & eval dataloaders are specified, this will decide the `train_batch_size`.
+        offload_optimizer_device (`str`, defaults to `None`):
+            Possible options are none|cpu|nvme. Only applicable with ZeRO Stages 2 and 3.
+        offload_param_device (`str`, defaults to `None`):
+            Possible options are none|cpu|nvme. Only applicable with ZeRO Stage 3.
+        offload_optimizer_nvme_path (`str`, defaults to `None`):
+            Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3.
+        offload_param_nvme_path (`str`, defaults to `None`):
+            Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3.
+        zero3_init_flag (`bool`, defaults to `None`):
+            Flag to indicate whether to enable `deepspeed.zero.Init` for constructing massive models. Only applicable
+            with ZeRO Stage-3.
+        zero3_save_16bit_model (`bool`, defaults to `None`):
+            Flag to indicate whether to save 16-bit model. Only applicable with ZeRO Stage-3.
+        transformer_moe_cls_names (`str`, defaults to `None`):
+            Comma-separated list of Transformers MoE layer class names (case-sensitive). For example,
+            `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention`, `JetMoEBlock`, etc.
+        enable_msamp (`bool`, defaults to `None`):
+            Flag to indicate whether to enable MS-AMP backend for FP8 training.
+        msamp_opt_level (`Optional[Literal["O1", "O2"]]`, defaults to `None`):
+            Optimization level for MS-AMP (defaults to 'O1'). Only applicable if `enable_msamp` is `True`. Should be
+            one of ['O1' or 'O2'].
     """
 
     hf_ds_config: Any = field(
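To make the DeepSpeed fields above concrete, a hedged sketch of constructing the plugin. The ZeRO stage and offload choices are arbitrary examples, and actually creating the `Accelerator` this way assumes DeepSpeed is installed and the script runs under a distributed launcher such as `accelerate launch`:

```python
from accelerate import Accelerator
from accelerate.utils import DeepSpeedPlugin

# ZeRO stage 2 with optimizer states offloaded to CPU; field names match the docstring above.
deepspeed_plugin = DeepSpeedPlugin(
    zero_stage=2,
    gradient_accumulation_steps=4,
    gradient_clipping=1.0,
    offload_optimizer_device="cpu",
    zero3_init_flag=False,   # only meaningful for ZeRO stage 3
)

# Requires `pip install deepspeed` and a distributed launch to actually train.
accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin, mixed_precision="bf16")
```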
@@ -1317,6 +1378,62 @@ def set_moe_leaf_modules(self, model):
 class FullyShardedDataParallelPlugin:
     """
     This plugin is used to enable fully sharded data parallelism.
+
+    Args:
+        sharding_strategy (`Union[str, torch.distributed.fsdp.ShardingStrategy]`, defaults to `'FULL_SHARD'`):
+            Sharding strategy to use. Should be either a `str` or an instance of
+            `torch.distributed.fsdp.fully_sharded_data_parallel.ShardingStrategy`.
+        backward_prefetch (`Union[str, torch.distributed.fsdp.BackwardPrefetch]`, defaults to `'NO_PREFETCH'`):
+            Backward prefetch strategy to use. Should be either a `str` or an instance of
+            `torch.distributed.fsdp.fully_sharded_data_parallel.BackwardPrefetch`.
+        mixed_precision_policy (`Optional[Union[dict, torch.distributed.fsdp.MixedPrecision]]`, defaults to `None`):
+            A config to enable mixed precision training with FullyShardedDataParallel. If passing in a `dict`, it
+            should have the following keys: `param_dtype`, `reduce_dtype`, and `buffer_dtype`.
+        auto_wrap_policy (`Optional[Union[Callable, Literal["transformer_based_wrap", "size_based_wrap", "no_wrap"]]]`, defaults to `NO_WRAP`):
+            A callable or string specifying a policy to recursively wrap layers with FSDP. If a string, it must be one
+            of `transformer_based_wrap`, `size_based_wrap`, or `no_wrap`. See
+            `torch.distributed.fsdp.wrap.size_based_auto_wrap_policy` for a direction on what it should look like.
+        cpu_offload (`Union[bool, torch.distributed.fsdp.CPUOffload]`, defaults to `False`):
+            Whether to offload parameters to CPU. Should be either a `bool` or an instance of
+            `torch.distributed.fsdp.fully_sharded_data_parallel.CPUOffload`.
+        ignored_modules (`Optional[Iterable[torch.nn.Module]]`, defaults to `None`):
+            A list of modules to ignore when wrapping with FSDP.
+        state_dict_type (`Union[str, torch.distributed.fsdp.StateDictType]`, defaults to `'FULL_STATE_DICT'`):
+            State dict type to use. If a string, it must be one of `full_state_dict`, `local_state_dict`, or
+            `sharded_state_dict`.
+        state_dict_config (`Optional[Union[torch.distributed.fsdp.FullStateDictConfig, torch.distributed.fsdp.ShardedStateDictConfig]]`, defaults to `None`):
+            State dict config to use. Is determined based on the `state_dict_type` if not passed in.
+        optim_state_dict_config (`Optional[Union[torch.distributed.fsdp.FullOptimStateDictConfig, torch.distributed.fsdp.ShardedOptimStateDictConfig]]`, defaults to `None`):
+            Optim state dict config to use. Is determined based on the `state_dict_type` if not passed in.
+        limit_all_gathers (`bool`, defaults to `True`):
+            Whether to have FSDP explicitly synchronize the CPU thread to prevent too many in-flight all-gathers. This
+            bool only affects the sharded strategies that schedule all-gathers. Enabling this can help lower the
+            number of CUDA malloc retries.
+        use_orig_params (`bool`, defaults to `False`):
+            Whether to use the original parameters for the optimizer.
+        param_init_fn (`Optional[Callable[[torch.nn.Module], None]]`, defaults to `None`):
+            A `Callable[torch.nn.Module] -> None` that specifies how modules that are currently on the meta device
+            should be initialized onto an actual device. Only applicable when `sync_module_states` is `True`. By
+            default, it is a `lambda` which calls `to_empty` on the module.
+        sync_module_states (`bool`, defaults to `False`):
+            Whether each individually wrapped FSDP unit should broadcast module parameters from rank 0 to ensure they
+            are the same across all ranks after initialization. Defaults to `False` unless `cpu_ram_efficient_loading`
+            is `True`, in which case it will be forcibly enabled.
+        forward_prefetch (`bool`, defaults to `False`):
+            Whether to have FSDP explicitly prefetch the next upcoming all-gather while executing in the forward pass.
+            Only use with static graphs.
+        activation_checkpointing (`bool`, defaults to `False`):
+            A technique to reduce memory usage by clearing activations of certain layers and recomputing them during a
+            backward pass. Effectively, this trades extra computation time for reduced memory usage.
+        cpu_ram_efficient_loading (`bool`, defaults to `None`):
+            If `True`, only the first process loads the pretrained model checkpoint while all other processes have
+            empty weights. Only applicable for Transformers. When using this, `sync_module_states` needs to be `True`.
+        transformer_cls_names_to_wrap (`Optional[List[str]]`, defaults to `None`):
+            A list of transformer layer class names to wrap. Only applicable when `auto_wrap_policy` is
+            `transformer_based_wrap`.
+        min_num_params (`Optional[int]`, defaults to `None`):
+            The minimum number of parameters a module must have to be wrapped. Only applicable when `auto_wrap_policy`
+            is `size_based_wrap`.
     """
 
     sharding_strategy: Union[str, "torch.distributed.fsdp.ShardingStrategy"] = field(
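Likewise for FSDP, a minimal sketch under stated assumptions: `GPT2Block` is a placeholder transformer layer class name, and FSDP itself only does useful work under a multi-process launch (`accelerate launch` or `torchrun`):

```python
from accelerate import Accelerator
from accelerate.utils import FullyShardedDataParallelPlugin

# Fully shard the model, wrapping at the transformer-block level, and save a full state dict.
fsdp_plugin = FullyShardedDataParallelPlugin(
    sharding_strategy="FULL_SHARD",
    auto_wrap_policy="transformer_based_wrap",
    transformer_cls_names_to_wrap=["GPT2Block"],  # placeholder layer class name
    state_dict_type="FULL_STATE_DICT",
    cpu_ram_efficient_loading=True,               # forces sync_module_states=True
    activation_checkpointing=True,
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin, mixed_precision="bf16")
# model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
```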
@@ -1672,6 +1789,96 @@ class MegatronLMPlugin:
     """
     Plugin for Megatron-LM to enable tensor, pipeline, sequence and data parallelism. Also to enable selective
     activation recomputation and optimized fused kernels.
+
+    Args:
+        tp_degree (`int`, defaults to `None`):
+            Tensor parallelism degree.
+        pp_degree (`int`, defaults to `None`):
+            Pipeline parallelism degree.
+        num_micro_batches (`int`, defaults to `None`):
+            Number of micro-batches.
+        gradient_clipping (`float`, defaults to `None`):
+            Gradient clipping value based on global L2 Norm (0 to disable).
+        sequence_parallelism (`bool`, defaults to `None`):
+            Enable sequence parallelism.
+        recompute_activations (`bool`, defaults to `None`):
+            Enable selective activation recomputation.
+        use_distributed_optimizer (`bool`, defaults to `None`):
+            Enable distributed optimizer.
+        pipeline_model_parallel_split_rank (`int`, defaults to `None`):
+            Rank where encoder and decoder should be split.
+        num_layers_per_virtual_pipeline_stage (`int`, defaults to `None`):
+            Number of layers per virtual pipeline stage.
+        is_train_batch_min (`str`, defaults to `True`):
+            If both train & eval dataloaders are specified, this will decide the `micro_batch_size`.
+        train_iters (`int`, defaults to `None`):
+            Total number of iterations to train over all training runs. Note that either train-iters or train-samples
+            should be provided when using `MegatronLMDummyScheduler`.
+        train_samples (`int`, defaults to `None`):
+            Total number of samples to train over all training runs. Note that either train-iters or train-samples
+            should be provided when using `MegatronLMDummyScheduler`.
+        weight_decay_incr_style (`str`, defaults to `'constant'`):
+            Weight decay increment function. Choices are "constant", "linear", or "cosine".
+        start_weight_decay (`float`, defaults to `None`):
+            Initial weight decay coefficient for L2 regularization.
+        end_weight_decay (`float`, defaults to `None`):
+            End of run weight decay coefficient for L2 regularization.
+        lr_decay_style (`str`, defaults to `'linear'`):
+            Learning rate decay function. Choices are 'constant', 'linear', or 'cosine'.
+        lr_decay_iters (`int`, defaults to `None`):
+            Number of iterations for learning rate decay. If `None`, defaults to `train_iters`.
+        lr_decay_samples (`int`, defaults to `None`):
+            Number of samples for learning rate decay. If `None`, defaults to `train_samples`.
+        lr_warmup_iters (`int`, defaults to `None`):
+            Number of iterations to linearly warmup learning rate over.
+        lr_warmup_samples (`int`, defaults to `None`):
+            Number of samples to linearly warmup learning rate over.
+        lr_warmup_fraction (`float`, defaults to `None`):
+            Fraction of lr-warmup-(iters/samples) to linearly warmup learning rate over.
+        min_lr (`float`, defaults to `0`):
+            Minimum value for learning rate. The scheduler clips values below this threshold.
+        consumed_samples (`List`, defaults to `None`):
+            Number of samples consumed in the same order as the dataloaders passed to the `accelerator.prepare` call.
+        no_wd_decay_cond (`Optional`, defaults to `None`):
+            Condition to disable weight decay.
+        scale_lr_cond (`Optional`, defaults to `None`):
+            Condition to scale learning rate.
+        lr_mult (`float`, defaults to `1.0`):
+            Learning rate multiplier.
+        megatron_dataset_flag (`bool`, defaults to `False`):
+            Whether the format of the dataset follows the Megatron-LM Indexed/Cached/MemoryMapped format.
+        seq_length (`int`, defaults to `None`):
+            Maximum sequence length to process.
+        encoder_seq_length (`int`, defaults to `None`):
+            Maximum sequence length to process for the encoder.
+        decoder_seq_length (`int`, defaults to `None`):
+            Maximum sequence length to process for the decoder.
+        tensorboard_dir (`str`, defaults to `None`):
+            Path to save tensorboard logs.
+        set_all_logging_options (`bool`, defaults to `False`):
+            Whether to set all logging options.
+        eval_iters (`int`, defaults to `100`):
+            Number of iterations to run for evaluation on the validation/test sets.
+        eval_interval (`int`, defaults to `1000`):
+            Interval between running evaluation on the validation set.
+        return_logits (`bool`, defaults to `False`):
+            Whether to return logits from the model.
+        custom_train_step_class (`Optional`, defaults to `None`):
+            Custom train step class.
+        custom_train_step_kwargs (`Optional`, defaults to `None`):
+            Custom train step kwargs.
+        custom_model_provider_function (`Optional`, defaults to `None`):
+            Custom model provider function.
+        custom_prepare_model_function (`Optional`, defaults to `None`):
+            Custom prepare model function.
+        custom_megatron_datasets_provider_function (`Optional`, defaults to `None`):
+            Custom Megatron train_valid_test datasets provider function.
+        custom_get_batch_function (`Optional`, defaults to `None`):
+            Custom get batch function.
+        custom_loss_function (`Optional`, defaults to `None`):
+            Custom loss function.
+        other_megatron_args (`Optional`, defaults to `None`):
+            Other Megatron-LM arguments. Please refer to the Megatron-LM documentation.
     """
 
     tp_degree: int = field(default=None, metadata={"help": "tensor parallelism degree."})
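For the Megatron-LM fields, a sketch of filling in the plugin; the parallelism degrees and sequence length are arbitrary, and using it for training additionally assumes the Megatron-LM integration is installed and the job is started via `accelerate launch`:

```python
from accelerate import Accelerator
from accelerate.utils import MegatronLMPlugin

# 2-way tensor parallel, 2-way pipeline parallel, selective recomputation, sequence parallelism.
megatron_lm_plugin = MegatronLMPlugin(
    tp_degree=2,
    pp_degree=2,
    num_micro_batches=4,
    gradient_clipping=1.0,
    sequence_parallelism=True,
    recompute_activations=True,
    seq_length=2048,
)

# Only works when Megatron-LM is installed and the run is launched in distributed mode.
accelerator = Accelerator(megatron_lm_plugin=megatron_lm_plugin)
```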
@@ -2132,6 +2339,29 @@ def parse_llama_config(megatron_lm_plugin, model, batch_data):
 class BnbQuantizationConfig:
     """
     A plugin to enable BitsAndBytes 4bit and 8bit quantization
+
+    Args:
+        load_in_8bit (`bool`, defaults to `False`):
+            Enable 8bit quantization.
+        llm_int8_threshold (`float`, defaults to `6.0`):
+            Value of the outlier threshold. Only relevant when `load_in_8bit=True`.
+        load_in_4bit (`bool`, defaults to `False`):
+            Enable 4bit quantization.
+        bnb_4bit_quant_type (`str`, defaults to `fp4`):
+            Set the quantization data type in the `bnb.nn.Linear4Bit` layers. Options are {'fp4', 'nf4'}.
+        bnb_4bit_use_double_quant (`bool`, defaults to `False`):
+            Enable nested quantization where the quantization constants from the first quantization are quantized
+            again.
+        bnb_4bit_compute_dtype (`str`, defaults to `fp16`):
+            This sets the computational type which might be different than the input type. For example, inputs might
+            be fp32, but computation can be set to bf16 for speedups. Options are {'fp32', 'fp16', 'bf16'}.
+        torch_dtype (`torch.dtype`, defaults to `None`):
+            This sets the dtype of the remaining non-quantized layers. The `bitsandbytes` library suggests setting the
+            value to `torch.float16` for 8-bit models and using the same dtype as the compute dtype for 4-bit models.
+        skip_modules (`List[str]`, defaults to `None`):
+            An explicit list of the modules that we don't quantize. The dtype of these modules will be `torch_dtype`.
+        keep_in_fp32_modules (`List`, defaults to `None`):
+            An explicit list of the modules that we don't quantize. We keep them in `torch.float32`.
     """
 
     load_in_8bit: bool = field(default=False, metadata={"help": "enable 8bit quantization."})
diff --git a/src/accelerate/utils/imports.py b/src/accelerate/utils/imports.py
index 15f802e5926..ee02aac3098 100644
--- a/src/accelerate/utils/imports.py
+++ b/src/accelerate/utils/imports.py
@@ -313,12 +313,15 @@ def is_mlflow_available():
 
 
 def is_mps_available(min_version="1.12"):
+    "Checks if MPS device is available. The minimum version required is 1.12."
     # With torch 1.12, you can use torch.backends.mps
     # With torch 2.0.0, you can use torch.mps
     return is_torch_version(">=", min_version) and torch.backends.mps.is_available() and torch.backends.mps.is_built()
 
 
 def is_ipex_available():
+    "Checks if ipex is installed."
+
     def get_major_and_minor_from_version(full_version):
         return str(version.parse(full_version).major) + "." + str(version.parse(full_version).minor)
 

From 17bb488a5838d1e5a57dd14aac0773b06e205e90 Mon Sep 17 00:00:00 2001
From: Steven Liu
Date: Thu, 12 Sep 2024 11:50:45 -0700
Subject: [PATCH 2/2] feedback

---
 src/accelerate/utils/dataclasses.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/accelerate/utils/dataclasses.py b/src/accelerate/utils/dataclasses.py
index 2e87808d43e..ef2715978e3 100644
--- a/src/accelerate/utils/dataclasses.py
+++ b/src/accelerate/utils/dataclasses.py
@@ -933,6 +933,7 @@ class TorchDynamoPlugin(KwargsHandler):
 
     Args:
         backend (`DynamoBackend`, defaults to `None`):
+            A valid Dynamo backend. See https://pytorch.org/docs/stable/torch.compiler.html for more details.
         mode (`str`, defaults to `None`):
             Possible options are 'default', 'reduce-overhead' or 'max-autotune'.
         fullgraph (`bool`, defaults to `None`):
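Finally, a sketch tying together `BnbQuantizationConfig` and the two availability helpers touched in `imports.py`. Building the config and calling the helpers is cheap; the commented-out `load_and_quantize_model` step is where `bitsandbytes` and a CUDA device are actually needed, and the tiny `nn.Sequential` model is only a stand-in:

```python
import torch
from torch import nn

from accelerate.utils import BnbQuantizationConfig, is_ipex_available, is_mps_available

# 4-bit NF4 quantization with bf16 compute; remaining non-quantized layers stay in fp16.
bnb_config = BnbQuantizationConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="bf16",
    torch_dtype=torch.float16,
)

model = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 8))  # stand-in model

# With bitsandbytes installed and a CUDA device available:
# from accelerate.utils import load_and_quantize_model
# quantized = load_and_quantize_model(model, bnb_quantization_config=bnb_config, device_map="auto")

# The helpers documented in imports.py are plain capability checks.
print("MPS available:", is_mps_available())
print("IPEX available:", is_ipex_available())
```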