Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[docs] More docstrings #3108

Merged
merged 2 commits into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
231 changes: 231 additions & 0 deletions src/accelerate/utils/dataclasses.py
Original file line number Diff line number Diff line change
Expand Up @@ -818,6 +818,21 @@ class DataLoaderConfiguration:
class ProjectConfiguration:
"""
Configuration for the Accelerator object based on inner-project needs.

Args:
project_dir (`str`, defaults to `None`):
A path to a directory for storing data.
logging_dir (`str`, defaults to `None`):
A path to a directory for storing logs of locally-compatible loggers. If None, defaults to `project_dir`.
automatic_checkpoint_naming (`bool`, defaults to `False`):
Whether saved states should be automatically iteratively named.
total_limit (`int`, defaults to `None`):
The maximum number of total saved states to keep.
iteration (`int`, defaults to `0`):
The current save iteration.
save_on_each_node (`bool`, defaults to `False`):
When doing multi-node distributed training, whether to save models and checkpoints on each node, or only on
the main one.
"""

project_dir: str = field(default=None, metadata={"help": "A path to a directory for storing data."})
Expand Down Expand Up @@ -915,6 +930,20 @@ class GradientAccumulationPlugin(KwargsHandler):
class TorchDynamoPlugin(KwargsHandler):
"""
This plugin is used to compile a model with PyTorch 2.0

Args:
backend (`DynamoBackend`, defaults to `None`):
stevhliu marked this conversation as resolved.
Show resolved Hide resolved
A valid Dynamo backend. See https://pytorch.org/docs/stable/torch.compiler.html for more details.
mode (`str`, defaults to `None`):
Possible options are 'default', 'reduce-overhead' or 'max-autotune'.
fullgraph (`bool`, defaults to `None`):
Whether it is ok to break model into several subgraphs.
dynamic (`bool`, defaults to `None`):
Whether to use dynamic shape for tracing.
options (`Any`, defaults to `None`):
A dictionary of options to pass to the backend.
disable (`bool`, defaults to `False`):
Turn torch.compile() into a no-op for testing
"""

backend: DynamoBackend = field(
Expand Down Expand Up @@ -951,6 +980,39 @@ def to_dict(self):
class DeepSpeedPlugin:
"""
This plugin is used to integrate DeepSpeed.

Args:
hf_ds_config (`Any`, defaults to `None`):
Path to DeepSpeed config file or dict or an object of class `accelerate.utils.deepspeed.HfDeepSpeedConfig`.
gradient_accumulation_steps (`int`, defaults to `None`):
Number of steps to accumulate gradients before updating optimizer states. If not set, will use the value
from the `Accelerator` directly.
gradient_clipping (`float`, defaults to `None`):
Enable gradient clipping with value.
zero_stage (`int`, defaults to `None`):
Possible options are 0, 1, 2, 3. Default will be taken from environment variable.
is_train_batch_min (`bool`, defaults to `True`):
If both train & eval dataloaders are specified, this will decide the `train_batch_size`.
offload_optimizer_device (`str`, defaults to `None`):
Possible options are none|cpu|nvme. Only applicable with ZeRO Stages 2 and 3.
offload_param_device (`str`, defaults to `None`):
Possible options are none|cpu|nvme. Only applicable with ZeRO Stage 3.
offload_optimizer_nvme_path (`str`, defaults to `None`):
Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3.
offload_param_nvme_path (`str`, defaults to `None`):
Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3.
zero3_init_flag (`bool`, defaults to `None`):
Flag to indicate whether to enable `deepspeed.zero.Init` for constructing massive models. Only applicable with ZeRO Stage-3.
zero3_save_16bit_model (`bool`, defaults to `None`):
Flag to indicate whether to save 16-bit model. Only applicable with ZeRO Stage-3.
transformer_moe_cls_names (`str`, defaults to `None`):
Comma-separated list of Transformers MoE layer class names (case-sensitive). For example,
`MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention`, `JetMoEBlock`, etc.
enable_msamp (`bool`, defaults to `None`):
Flag to indicate whether to enable MS-AMP backend for FP8 training.
msamp_opt_level (`Optional[Literal["O1", "O2"]]`, defaults to `None`):
Optimization level for MS-AMP (defaults to 'O1'). Only applicable if `enable_msamp` is True. Should be one
of ['O1' or 'O2'].
"""

hf_ds_config: Any = field(
Expand Down Expand Up @@ -1317,6 +1379,62 @@ def set_moe_leaf_modules(self, model):
class FullyShardedDataParallelPlugin:
"""
This plugin is used to enable fully sharded data parallelism.

Args:
sharding_strategy (`Union[str, torch.distributed.fsdp.ShardingStrategy]`, defaults to `'FULL_SHARD'`):
Sharding strategy to use. Should be either a `str` or an instance of
`torch.distributed.fsdp.fully_sharded_data_parallel.ShardingStrategy`.
backward_prefetch (`Union[str, torch.distributed.fsdp.BackwardPrefetch]`, defaults to `'NO_PREFETCH'`):
Backward prefetch strategy to use. Should be either a `str` or an instance of
`torch.distributed.fsdp.fully_sharded_data_parallel.BackwardPrefetch`.
mixed_precision_policy (`Optional[Union[dict, torch.distributed.fsdp.MixedPrecision]]`, defaults to `None`):
A config to enable mixed precision training with FullyShardedDataParallel. If passing in a `dict`, it
should have the following keys: `param_dtype`, `reduce_dtype`, and `buffer_dtype`.
auto_wrap_policy (`Optional[Union[Callable, Literal["transformer_based_wrap", "size_based_wrap", "no_wrap"]]]`, defaults to `NO_WRAP`):
A callable or string specifying a policy to recursively wrap layers with FSDP. If a string, it must be one
of `transformer_based_wrap`, `size_based_wrap`, or `no_wrap`. See
`torch.distributed.fsdp.wrap.size_based_wrap_policy` for a direction on what it should look like.
cpu_offload (`Union[bool, torch.distributed.fsdp.CPUOffload]`, defaults to `False`):
Whether to offload parameters to CPU. Should be either a `bool` or an instance of
`torch.distributed.fsdp.fully_sharded_data_parallel.CPUOffload`.
ignored_modules (`Optional[Iterable[torch.nn.Module]]`, defaults to `None`):
A list of modules to ignore when wrapping with FSDP.
state_dict_type (`Union[str, torch.distributed.fsdp.StateDictType]`, defaults to `'FULL_STATE_DICT'`):
State dict type to use. If a string, it must be one of `full_state_dict`, `local_state_dict`, or
`sharded_state_dict`.
state_dict_config (`Optional[Union[torch.distributed.fsdp.FullStateDictConfig, torch.distributed.fsdp.ShardedStateDictConfig]]`, defaults to `None`):
State dict config to use. Is determined based on the `state_dict_type` if not passed in.
optim_state_dict_config (`Optional[Union[torch.distributed.fsdp.FullOptimStateDictConfig, torch.distributed.fsdp.ShardedOptimStateDictConfig]]`, defaults to `None`):
Optim state dict config to use. Is determined based on the `state_dict_type` if not passed in.
limit_all_gathers (`bool`, defaults to `True`):
Whether to have FSDP explicitly synchronize the CPU thread to prevent too many in-flight all-gathers. This
bool only affects the sharded strategies that schedule all-gathers. Enabling this can help lower the number
of CUDA malloc retries.
use_orig_params (`bool`, defaults to `False`):
Whether to use the original parameters for the optimizer.
param_init_fn (`Optional[Callable[[torch.nn.Module], None]]`, defaults to `None`):
A `Callable[torch.nn.Module] -> None` that specifies how modules that are currently on the meta device
should be initialized onto an actual device. Only applicable when `sync_module_states` is `True`. By
default is a `lambda` which calls `to_empty` on the module.
sync_module_states (`bool`, defaults to `False`):
Whether each individually wrapped FSDP unit should broadcast module parameters from rank 0 to ensure they
are the same across all ranks after initialization. Defaults to `False` unless `cpu_ram_efficient_loading`
is `True`, then will be forcibly enabled.
forward_prefetch (`bool`, defaults to `False`):
Whether to have FSDP explicitly prefetch the next upcoming all-gather while executing in the forward
pass. Only use with static graphs.
activation_checkpointing (`bool`, defaults to `False`):
A technique to reduce memory usage by clearing activations of certain layers and recomputing them during a
backward pass. Effectively, this trades extra computation time for reduced memory usage.
cpu_ram_efficient_loading (`bool`, defaults to `None`):
If True, only the first process loads the pretrained model checkpoint while all other processes have empty
weights. Only applicable for Transformers. When using this, `sync_module_states` needs to be `True`.
transformer_cls_names_to_wrap (`Optional[List[str]]`, defaults to `None`):
A list of transformer layer class names to wrap. Only applicable when `auto_wrap_policy` is
`transformer_based_wrap`.
min_num_params (`Optional[int]`, defaults to `None`):
The minimum number of parameters a module must have to be wrapped. Only applicable when `auto_wrap_policy`
is `size_based_wrap`.
"""

sharding_strategy: Union[str, "torch.distributed.fsdp.ShardingStrategy"] = field(
Expand Down Expand Up @@ -1672,6 +1790,96 @@ class MegatronLMPlugin:
"""
Plugin for Megatron-LM to enable tensor, pipeline, sequence and data parallelism. Also to enable selective
activation recomputation and optimized fused kernels.

Args:
tp_degree (`int`, defaults to `None`):
Tensor parallelism degree.
pp_degree (`int`, defaults to `None`):
Pipeline parallelism degree.
num_micro_batches (`int`, defaults to `None`):
Number of micro-batches.
gradient_clipping (`float`, defaults to `None`):
Gradient clipping value based on global L2 Norm (0 to disable).
sequence_parallelism (`bool`, defaults to `None`):
Enable sequence parallelism.
recompute_activations (`bool`, defaults to `None`):
Enable selective activation recomputation.
use_distributed_optimizer (`bool`, defaults to `None`):
Enable distributed optimizer.
pipeline_model_parallel_split_rank (`int`, defaults to `None`):
Rank where encoder and decoder should be split.
num_layers_per_virtual_pipeline_stage (`int`, defaults to `None`):
Number of layers per virtual pipeline stage.
is_train_batch_min (`bool`, defaults to `True`):
If both train & eval dataloaders are specified, this will decide the `micro_batch_size`.
train_iters (`int`, defaults to `None`):
Total number of iterations to train over all training runs. Note that either train-iters or train-samples
should be provided when using `MegatronLMDummyScheduler`.
train_samples (`int`, defaults to `None`):
Total number of samples to train over all training runs. Note that either train-iters or train-samples
should be provided when using `MegatronLMDummyScheduler`.
weight_decay_incr_style (`str`, defaults to `'constant'`):
Weight decay increment function. choices=["constant", "linear", "cosine"].
start_weight_decay (`float`, defaults to `None`):
Initial weight decay coefficient for L2 regularization.
end_weight_decay (`float`, defaults to `None`):
End of run weight decay coefficient for L2 regularization.
lr_decay_style (`str`, defaults to `'linear'`):
Learning rate decay function. choices=['constant', 'linear', 'cosine'].
lr_decay_iters (`int`, defaults to `None`):
Number of iterations for learning rate decay. If None defaults to `train_iters`.
lr_decay_samples (`int`, defaults to `None`):
Number of samples for learning rate decay. If None defaults to `train_samples`.
lr_warmup_iters (`int`, defaults to `None`):
Number of iterations to linearly warmup learning rate over.
lr_warmup_samples (`int`, defaults to `None`):
Number of samples to linearly warmup learning rate over.
lr_warmup_fraction (`float`, defaults to `None`):
Fraction of lr-warmup-(iters/samples) to linearly warmup learning rate over.
min_lr (`float`, defaults to `0`):
Minimum value for learning rate. The scheduler clips values below this threshold.
consumed_samples (`List`, defaults to `None`):
Number of samples consumed in the same order as the dataloaders to `accelerator.prepare` call.
no_wd_decay_cond (`Optional`, defaults to `None`):
Condition to disable weight decay.
scale_lr_cond (`Optional`, defaults to `None`):
Condition to scale learning rate.
lr_mult (`float`, defaults to `1.0`):
Learning rate multiplier.
megatron_dataset_flag (`bool`, defaults to `False`):
Whether the format of dataset follows Megatron-LM Indexed/Cached/MemoryMapped format.
seq_length (`int`, defaults to `None`):
Maximum sequence length to process.
encoder_seq_length (`int`, defaults to `None`):
Maximum sequence length to process for the encoder.
decoder_seq_length (`int`, defaults to `None`):
Maximum sequence length to process for the decoder.
tensorboard_dir (`str`, defaults to `None`):
Path to save tensorboard logs.
set_all_logging_options (`bool`, defaults to `False`):
Whether to set all logging options.
eval_iters (`int`, defaults to `100`):
Number of iterations to run when evaluating on the validation/test set.
eval_interval (`int`, defaults to `1000`):
Interval between running evaluation on validation set.
return_logits (`bool`, defaults to `False`):
Whether to return logits from the model.
custom_train_step_class (`Optional`, defaults to `None`):
Custom train step class.
custom_train_step_kwargs (`Optional`, defaults to `None`):
Custom train step kwargs.
custom_model_provider_function (`Optional`, defaults to `None`):
Custom model provider function.
custom_prepare_model_function (`Optional`, defaults to `None`):
Custom prepare model function.
custom_megatron_datasets_provider_function (`Optional`, defaults to `None`):
Custom megatron train_valid_test datasets provider function.
custom_get_batch_function (`Optional`, defaults to `None`):
Custom get batch function.
custom_loss_function (`Optional`, defaults to `None`):
Custom loss function.
other_megatron_args (`Optional`, defaults to `None`):
Other Megatron-LM arguments. Please refer to the Megatron-LM documentation.
"""

tp_degree: int = field(default=None, metadata={"help": "tensor parallelism degree."})
Expand Down Expand Up @@ -2132,6 +2340,29 @@ def parse_llama_config(megatron_lm_plugin, model, batch_data):
class BnbQuantizationConfig:
"""
A plugin to enable BitsAndBytes 4bit and 8bit quantization

Args:
load_in_8bit (`bool`, defaults to `False`):
Enable 8bit quantization.
llm_int8_threshold (`float`, defaults to `6.0`):
Value of the outlier threshold. Only relevant when `load_in_8bit=True`.
load_in_4bit (`bool`, defaults to `False`):
Enable 4bit quantization.
bnb_4bit_quant_type (`str`, defaults to `fp4`):
Set the quantization data type in the `bnb.nn.Linear4Bit` layers. Options are {'fp4','nf4'}.
bnb_4bit_use_double_quant (`bool`, defaults to `False`):
Enable nested quantization where the quantization constants from the first quantization are quantized
again.
bnb_4bit_compute_dtype (`str`, defaults to `fp16`):
This sets the computational type which might be different than the input type. For example, inputs might be
fp32, but computation can be set to bf16 for speedups. Options are {'fp32','fp16','bf16'}.
torch_dtype (`torch.dtype`, defaults to `None`):
This sets the dtype of the remaining non quantized layers. `bitsandbytes` library suggests to set the value
to `torch.float16` for 8 bit model and use the same dtype as the compute dtype for 4 bit model.
skip_modules (`List[str]`, defaults to `None`):
An explicit list of the modules that we don't quantize. The dtype of these modules will be `torch_dtype`.
keep_in_fp32_modules (`List`, defaults to `None`):
An explicit list of the modules that we don't quantize. We keep them in `torch.float32`.
"""

load_in_8bit: bool = field(default=False, metadata={"help": "enable 8bit quantization."})
Expand Down
3 changes: 3 additions & 0 deletions src/accelerate/utils/imports.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,12 +313,15 @@ def is_mlflow_available():


def is_mps_available(min_version="1.12"):
    """
    Checks if the MPS device is available.

    The installed torch version must satisfy `>= min_version` (1.12 by default), and the MPS backend must report
    itself as both available and built.
    """
    # With torch 1.12, you can use torch.backends.mps
    # With torch 2.0.0, you can use torch.mps
    meets_min_version = is_torch_version(">=", min_version)
    if not meets_min_version:
        # Short-circuit exactly like the `and` chain: hand back the falsy version-check result itself.
        return meets_min_version
    mps_backend = torch.backends.mps
    return mps_backend.is_available() and mps_backend.is_built()


def is_ipex_available():
"Checks if ipex is installed."

def get_major_and_minor_from_version(full_version):
return str(version.parse(full_version).major) + "." + str(version.parse(full_version).minor)

Expand Down
Loading