
update configs #1954

Merged
merged 14 commits into pytorch:main on Nov 7, 2024

Conversation

Contributor
@felipemello1 commented Nov 5, 2024

Changelog

  • Fix bad defaults
  • Add missing parameters
  • Add better comments
  • Update unit tests that relied on the changed configs
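
The bulk of the config edits were made with the one-off script below, run against recipes/configs:
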
import os


def modify_yaml_file(file_path):
    updated = {
        "updated_compile": False,
        "updated_packed": False,
        "added_compile": False,
        "added_activation_offloading": False,
        "added_packed": False,
        "added_profiler": False,
        "updated_gradient_accumulation_steps": False,
        "updated_checkpointing_comment": False,
        "updated_gradient_comment": False,
        "updated_compile_comment": False,
        "updated_packed_comment": False,
    }

    with open(file_path, "r") as file:
        lines = file.readlines()
    # Step 2: Remove duplicate 'compile' entries
    compile_indices = [
        i for i, line in enumerate(lines) if line.strip().startswith("compile:")
    ]
    if len(compile_indices) > 1:
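        # NOTE: this removes every 'compile:' line when duplicates exist;
        # a single canonical entry is re-added in Step 4 below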
        for index in sorted(compile_indices, reverse=True):
            del lines[index]
        updated["updated_compile"] = True
    # Step 3: Move 'packed' after '_component_' and align indentation
    for i, line in enumerate(lines):
        if (
            line.strip().startswith("packed:")
            and i + 1 < len(lines)
            and "_component_" in lines[i + 1]
        ):
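            # After pop(i) the '_component_' line shifts up to index i,
            # so inserting at i + 1 places 'packed' directly after it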
            packed_line = lines.pop(i)
            lines.insert(i + 1, packed_line)  # Insert after the _component_ line
            updated["updated_packed"] = True
            break
    # Step 4: Add 'compile' if missing
    if not any(line.strip().startswith("compile:") for line in lines):
        for i, line in enumerate(lines):
            if line.strip().startswith("max_steps_per_epoch:"):
                indentation = len(line) - len(line.lstrip())
                new_line = (
                    " " * indentation
                    + "compile: False # pytorch compile, set to true for better perf/memory\n"
                )
                lines.insert(i + 1, new_line)
                updated["added_compile"] = True
                break
    # Step 5: Add 'enable_activation_offloading' if missing
    if (
        not any(
            line.strip().startswith("enable_activation_offloading:") for line in lines
        )
        and "vision" not in file_path
        and "ppo" not in file_path
        and "dpo" not in file_path
        and "distillation" not in file_path
        and "qat" not in file_path
    ):
        for i, line in enumerate(lines):
            if line.strip().startswith("enable_activation_checkpointing:"):
                indentation = len(line) - len(line.lstrip())
                new_line = (
                    " " * indentation
                    + "enable_activation_offloading: False  # True reduces memory\n"
                )
                lines.insert(i + 1, new_line)
                updated["added_activation_offloading"] = True
                break
    # Step 6: Add 'packed' if missing
    if "dpo" not in file_path and "ppo" not in file_path:
        if (
            not any(line.strip().startswith("packed:") for line in lines)
            and "vision" not in file_path
        ):
            for i, line in enumerate(lines):
                if "_component_" in line and "dataset" in lines[i - 1]:
                    indentation = len(line) - len(line.lstrip())
                    new_line = (
                        " " * indentation + "packed: False # True increases speed\n"
                    )
                    lines.insert(i + 1, new_line)
                    updated["added_packed"] = True
                    break

    # Step 7: Replace/Add 'profiler' section if missing
    if "ppo" not in file_path and "dpo" not in file_path:
        profiler_section = """# Profiler (disabled)
profiler:
    _component_: torchtune.training.setup_torch_profiler
    enabled: False

    #Output directory of trace artifacts
    output_dir: ${output_dir}/profiling_outputs

    #`torch.profiler.ProfilerActivity` types to trace
    cpu: True
    cuda: True

    #trace options passed to `torch.profiler.profile`
    profile_memory: False
    with_stack: False
    record_shapes: True
    with_flops: False

    # `torch.profiler.schedule` options:
    # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
    wait_steps: 5
    warmup_steps: 3
    active_steps: 2
    num_cycles: 1
"""

        # Correct the 'profiler' section if it has incorrect indentation
        start_index = None
        end_index = None
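        # The section is delimited by its header comment and its last key ('num_cycles: 1')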
        for i, line in enumerate(lines):
            if line.strip().startswith("# Profiler (disabled)"):
                start_index = i
            if line.strip().startswith("num_cycles: 1"):
                end_index = i
                break

        if start_index is not None and end_index is not None:
            # Remove the old section and insert the canonical one in its place
            del lines[start_index : end_index + 1]
            lines.insert(start_index, profiler_section)
            updated["added_profiler"] = True
        # Only append a section if none was found anywhere in the file,
        # otherwise the replacement above would be duplicated
        elif not any(line.strip().startswith("profiler:") for line in lines):
            lines.append(profiler_section)
            updated["added_profiler"] = True

    # Step 8: Update 'gradient_accumulation_steps' if greater than 1
    for i, line in enumerate(lines):
        if line.strip().startswith("gradient_accumulation_steps:"):
            parts = line.split(":")
            # Drop any inline comment before parsing the value
            value = parts[1].split("#")[0].strip() if len(parts) > 1 else ""
            if value.isdigit() and int(value) > 1:
                lines[i] = parts[0] + ": 8\n"
                updated["updated_gradient_accumulation_steps"] = True
            break

    # Step 9: Add or replace comment for 'enable_activation_checkpointing'
    for i, line in enumerate(lines):
        if line.strip().startswith("enable_activation_checkpointing:"):
            parts = line.split("#")
            lines[i] = parts[0].rstrip() + "  # True reduces memory\n"
            updated["updated_checkpointing_comment"] = True
            break
    # Step 9.5: Add or replace comment for 'enable_activation_offloading'
    for i, line in enumerate(lines):
        if line.strip().startswith("enable_activation_offloading:"):
            parts = line.split("#")
            lines[i] = parts[0].rstrip() + "  # True reduces memory\n"
            updated["updated_offloading_comment"] = True
            break
    # Step 10: Add or replace comment for 'gradient_accumulation_steps'
    for i, line in enumerate(lines):
        if line.strip().startswith("gradient_accumulation_steps:"):
            parts = line.split("#")
            lines[i] = parts[0].rstrip() + "  # Use to increase virtual batch size\n"
            updated["updated_gradient_comment"] = True
            break
    # Step 11: Add or replace comment for 'compile'
    for i, line in enumerate(lines):
        if line.strip().startswith("compile:"):
            parts = line.split("#")
            lines[i] = (
                parts[0].rstrip()
                + "  # pytorch compile, set to true for better perf/memory\n"
            )
            updated["updated_compile_comment"] = True
            break
    # Step 12: Add or replace comment for 'packed'
    for i, line in enumerate(lines):
        if line.strip().startswith("packed:"):
            parts = line.split("#")
            lines[i] = parts[0].rstrip() + "  # True increases speed\n"
            updated["updated_packed_comment"] = True
            break

    # for files ending with "full.yaml" or "full_single_device.yaml"
    # (the qat/ppo/dpo exclusions apply to both endings)
    if (
        (
            file_path.endswith("full.yaml")
            or file_path.endswith("full_single_device.yaml")
        )
        and "qat" not in file_path
        and "ppo" not in file_path
        and "dpo" not in file_path
    ):
        # Step 13: Add 'optimizer_in_bwd: False' if missing
        if not any(line.strip().startswith("optimizer_in_bwd:") for line in lines):
            for i, line in enumerate(lines):
                if line.strip().startswith("compile:"):
                    indentation = len(line) - len(line.lstrip())
                    new_line = " " * indentation + "optimizer_in_bwd: False\n"
                    lines.insert(i + 1, new_line)
                    updated["added_optimizer_in_bwd"] = True
                    break

    # Step 14: Add/replace comment for 'optimizer_in_bwd'
    for i, line in enumerate(lines):
        if line.strip().startswith("optimizer_in_bwd:"):
            parts = line.split("#")
            lines[i] = (
                parts[0].rstrip()
                + "  # True saves memory. Requires gradient_accumulation_steps=1\n"
            )
            updated["updated_optimizer_in_bwd_comment"] = True
            break

    # Step 14.5: Add/replace comment for 'custom_sharded_layers'
    for i, line in enumerate(lines):
        if line.strip().startswith("custom_sharded_layers:"):
            parts = line.split("#")
            lines[i] = (
                parts[0].rstrip()
                + "  # Layers to shard separately (useful for large vocab size models). Lower Memory, but lower speed.\n"
            )
            updated["updated_custom_sharded_layers_comment"] = True
            break

    # for files with lora in the name
    if "lora" in file_path or "dora" in file_path:
        for i, line in enumerate(lines):
            # Step 15: make 'lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']'
            if line.strip().startswith("lora_attn_modules:"):
                lines[i] = "  lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']\n"
            # Step 16: make 'apply_lora_to_mlp: True'
            elif line.strip().startswith("apply_lora_to_mlp:"):
                lines[i] = "  apply_lora_to_mlp: True\n"
            # Step 17: add comment to 'lora_rank'
            elif line.strip().startswith("lora_rank:"):
                parts = line.split("#")
                lines[i] = (
                    parts[0].rstrip() + "  # higher increases accuracy and memory\n"
                )
            # Step 18: add comment to 'lora_alpha'
            elif line.strip().startswith("lora_alpha:"):
                parts = line.split("#")
                lines[i] = parts[0].rstrip() + "  # usually alpha=2*rank\n"
    with open(file_path, "w") as file:
        file.writelines(lines)
    return updated


def search_yaml_files(directory):
    updated_files = []
    not_updated_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
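            # Skip files without an underscore in the name and any generation/evaluation configs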
            if "_" not in file or "generation" in file or "evaluation" in file:
                print(f"Skipping {file}")
                continue
            file_path = os.path.join(root, file)
            updates = modify_yaml_file(file_path)
            if any(updates.values()):
                updated_files.append({file_path: updates})
            else:
                not_updated_files.append(file_path)
    print("Updated files and changes:")
    for update in updated_files:
        print(update)
    print("\nFiles not updated:")
    for file in not_updated_files:
        print(file)


directory = "recipes/configs"
search_yaml_files(directory)
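
For a typical non-vision full finetune config, the flags touched by the script end up looking roughly like this (comment wording taken verbatim from the script above):

enable_activation_checkpointing: True  # True reduces memory
enable_activation_offloading: False  # True reduces memory
compile: False  # pytorch compile, set to true for better perf/memory
optimizer_in_bwd: False  # True saves memory. Requires gradient_accumulation_steps=1
gradient_accumulation_steps: 8  # Use to increase virtual batch size
packed: False  # True increases speed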


pytorch-bot bot commented Nov 5, 2024

🔗 Helpful Links

🧪 See artifacts and rendered test results at hud.pytorch.org/pr/pytorch/torchtune/1954

Note: Links to docs will display an error until the docs builds have been completed.

✅ No Failures

As of commit 0fd16d1 with merge base 24d3579:
💚 Looks good so far! There are no failures yet. 💚

This comment was automatically generated by Dr. CI and updates every 15 minutes.

@facebook-github-bot added the CLA Signed label Nov 5, 2024
Contributor
@RdoubleA left a comment

Major nit, since this must've been very annoying to put together already, but for flags like activation checkpointing/offloading it would be nice to say # True reduces memory but reduces speed

@@ -55,20 +55,20 @@ shuffle: True
epochs: 1
max_steps_per_epoch: null
batch_size: 2
-gradient_accumulation_steps: 1
+gradient_accumulation_steps: 1 # Use to increase virtual batch size
optimizer:
_component_: bitsandbytes.optim.PagedAdamW
lr: 2e-5
optimizer_in_bwd: True
Contributor
@RdoubleA Nov 6, 2024

do you need a comment that this requires grad accum = 1? a lot of configs don't have this comment

Contributor Author

good catch. I will add the comment.

@@ -100,7 +102,7 @@ log_peak_memory_stats: False
# Environment
device: cuda
dtype: bf16
-enable_activation_checkpointing: False
+enable_activation_checkpointing: False # True reduces memory
Contributor

does KD support optimizer in bwd, activation offloading?

Contributor Author

No :/

Contributor Author
@felipemello1 Nov 6, 2024

I don't know if it could. The answer is probably yes and we just didn't add it.

Contributor

Yeah I believe it should be able to

@@ -7,7 +7,6 @@
# tune download meta-llama/Meta-Llama-3.1-8B-Instruct --output-dir /tmp/Meta-Llama-3.1-8B-Instruct --ignore-patterns "original/consolidated.00.pth"
#
# You get better results using KD if the teacher model has already been fine-tuned on the target dataset:
-packed: False # Set to true for great speed ups
Contributor

wut


# Training env
device: cuda

# Memory management
-enable_activation_checkpointing: True
+enable_activation_checkpointing: True # True reduces memory
custom_sharded_layers: ['decoder.tok_embeddings']
Contributor

a comment explaining this would be nice

@@ -179,3 +180,28 @@ metric_logger:
log_dir: ${output_dir}

log_every_n_steps: 1

Collaborator

We don't currently have a profiler implemented for the PPO recipe; I'll be adding it soon, so I can update the config when I do.

@@ -47,6 +47,7 @@ save_adapter_weights_only: False
# Dataset and Sampler
dataset:
_component_: torchtune.datasets.stack_exchange_paired_dataset
packed: False # True increases speed
Collaborator

This should be removed

@@ -46,6 +46,7 @@ save_adapter_weights_only: False
# Dataset and Sampler
dataset:
_component_: torchtune.datasets.stack_exchange_paired_dataset
packed: False # True increases speed
Collaborator

Suggested change
packed: False # True increases speed

@@ -47,6 +47,7 @@ save_adapter_weights_only: False
# Dataset and Sampler
dataset:
_component_: torchtune.datasets.stack_exchange_paired_dataset
packed: False # True increases speed
Collaborator

Suggested change
packed: False # True increases speed

@@ -33,6 +33,7 @@ tokenizer:
# Dataset
dataset:
_component_: torchtune.datasets.text_completion_dataset
packed: False # True increases speed
Collaborator

Suggested change
packed: False # True increases speed

Comment on lines 185 to 205
profiler:
_component_: torchtune.training.setup_torch_profiler
enabled: False

#Output directory of trace artifacts
output_dir: ${output_dir}/profiling_outputs

#`torch.profiler.ProfilerActivity` types to trace
cpu: True
cuda: True

#trace options passed to `torch.profiler.profile`
profile_memory: False
with_stack: False
record_shapes: True
with_flops: False

# `torch.profiler.schedule` options:
# wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
wait_steps: 1
warmup_steps: 8
Collaborator

Suggested change
profiler:
_component_: torchtune.training.setup_torch_profiler
enabled: False
#Output directory of trace artifacts
output_dir: ${output_dir}/profiling_outputs
#`torch.profiler.ProfilerActivity` types to trace
cpu: True
cuda: True
#trace options passed to `torch.profiler.profile`
profile_memory: False
with_stack: False
record_shapes: True
with_flops: False
# `torch.profiler.schedule` options:
# wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
wait_steps: 1
warmup_steps: 8

Comment on lines 86 to 109
# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
enabled: False

#Output directory of trace artifacts
output_dir: ${output_dir}/profiling_outputs

#`torch.profiler.ProfilerActivity` types to trace
cpu: True
cuda: True

#trace options passed to `torch.profiler.profile`
profile_memory: False
with_stack: False
record_shapes: True
with_flops: False

# `torch.profiler.schedule` options:
# wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
wait_steps: 1
warmup_steps: 8
active_steps: 2
num_cycles: 1
Collaborator

The DPO recipe also doesn't support a profiler; we can raise an issue to implement it.

Suggested change
# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
enabled: False
#Output directory of trace artifacts
output_dir: ${output_dir}/profiling_outputs
#`torch.profiler.ProfilerActivity` types to trace
cpu: True
cuda: True
#trace options passed to `torch.profiler.profile`
profile_memory: False
with_stack: False
record_shapes: True
with_flops: False
# `torch.profiler.schedule` options:
# wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
wait_steps: 1
warmup_steps: 8
active_steps: 2
num_cycles: 1

Comment on lines 89 to 112
# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
enabled: False

#Output directory of trace artifacts
output_dir: ${output_dir}/profiling_outputs

#`torch.profiler.ProfilerActivity` types to trace
cpu: True
cuda: True

#trace options passed to `torch.profiler.profile`
profile_memory: False
with_stack: False
record_shapes: True
with_flops: False

# `torch.profiler.schedule` options:
# wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
wait_steps: 1
warmup_steps: 8
active_steps: 2
num_cycles: 1
Collaborator

Suggested change
# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
enabled: False
#Output directory of trace artifacts
output_dir: ${output_dir}/profiling_outputs
#`torch.profiler.ProfilerActivity` types to trace
cpu: True
cuda: True
#trace options passed to `torch.profiler.profile`
profile_memory: False
with_stack: False
record_shapes: True
with_flops: False
# `torch.profiler.schedule` options:
# wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
wait_steps: 1
warmup_steps: 8
active_steps: 2
num_cycles: 1

@felipemello1
Contributor Author

@ebsmothers, mind taking a look at the test changes?
@SalmanMohammadi, mind doing a last pass on PPO/DPO?

Felipe Mello added 4 commits November 6, 2024 10:20
@@ -59,7 +59,7 @@ shuffle: True
epochs: 1
max_steps_per_epoch: null
batch_size: 2
-gradient_accumulation_steps: 16
+gradient_accumulation_steps: 8 # Use to increase virtual batch size
Contributor

Sorry to be a pain but I feel like "effective batch size" is the more common term?

Contributor Author

:D

lora_attn_modules: ['q_proj', 'v_proj']
apply_lora_to_mlp: False
lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
apply_lora_to_mlp: True
apply_lora_to_output: False
Contributor

I know it's not from your PR but I thought Qwen 2.5 has tied word embeddings for smaller model sizes. In that case we should not even be exposing apply_lora_to_output here?

@@ -73,6 +73,8 @@ def test_training_state_on_resume(
tune run lora_dpo_single_device \
--config llama2/7B_lora_dpo_single_device \
output_dir={tmpdir} \
model.lora_attn_modules=['q_proj','v_proj'] \
Contributor

I think the better way to do this would be to just modify the source of truth test model definition here:

"llama2_lora": lora_llama2_test_config(
lora_attn_modules=["q_proj", "k_proj", "v_proj", "output_proj"],
apply_lora_to_mlp=False,
apply_lora_to_output=False,
lora_rank=8,
lora_alpha=16,
),
"llama2_dora": lora_llama2_test_config(
lora_attn_modules=["q_proj", "k_proj", "v_proj", "output_proj"],
apply_lora_to_mlp=False,
apply_lora_to_output=False,
lora_rank=8,
lora_alpha=16,
use_dora=True,
),
"llama2_qlora": lora_llama2_test_config(
lora_attn_modules=["q_proj", "k_proj", "v_proj", "output_proj"],
apply_lora_to_mlp=True,
apply_lora_to_output=False,
lora_rank=8,
lora_alpha=16,
quantize_base=True,
),

recipes/configs/llama3/8B_full.yaml (outdated review comments resolved)
recipes/configs/llama3_1/70B_full.yaml (outdated review comments resolved)
recipes/configs/llama3_1/8B_full.yaml (outdated review comments resolved)
recipes/configs/llama3_2_vision/11B_full.yaml (outdated review comments resolved)
recipes/configs/llama3_2_vision/90B_full.yaml (outdated review comments resolved)
Co-authored-by: ebsmothers <ebs@meta.com>
@felipemello1 merged commit e137afe into pytorch:main Nov 7, 2024
14 checks passed
@felipemello1 deleted the update_configs branch November 7, 2024 18:16
joecummings pushed a commit that referenced this pull request Nov 11, 2024
Co-authored-by: Felipe Mello <felipemello@fb.com>
Co-authored-by: ebsmothers <ebs@meta.com>