Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrate new cache system for training #472

Merged
merged 20 commits into from
Feb 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/workflows/test_inf1_export.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Optimum neuron / Test INF1 export
name: Optimum neuron / Test INF1 partial export

on:
push:
Expand All @@ -18,7 +18,7 @@ concurrency:

jobs:
do-the-job:
name: Run INF1 tests
name: Run INF1 export tests
runs-on: [self-hosted, 4-aws-inf1, 24-cpu, ci]
env:
AWS_REGION: us-east-1
Expand Down Expand Up @@ -46,4 +46,5 @@ jobs:
- name: Run export tests
run: |
source aws_neuron_venv_pytorch/bin/activate
export MAX_EXPORT_TEST_COMBINATIONS=1
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/exporters
47 changes: 47 additions & 0 deletions .github/workflows/test_inf1_full_export.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# CI workflow: run the full (unrestricted) model-export test suite on an
# AWS Inferentia-1 (INF1) self-hosted runner.
name: Optimum neuron / Test INF1 full export

# Trigger on pushes to main and on PRs targeting main that touch the
# neuron exporter sources.
on:
  push:
    branches: [ main ]
    paths:
      - "optimum/exporters/neuron/*.py"
  pull_request:
    branches: [ main ]
    paths:
      - "optimum/exporters/neuron/*.py"

# Cancel any in-flight run of this workflow for the same branch / PR.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  do-the-job:
    name: Run INF1 full export tests
    # Self-hosted runner labels: 4 Inferentia-1 chips, 24 CPUs, CI pool.
    runs-on: [self-hosted, 4-aws-inf1, 24-cpu, ci]
    env:
      AWS_REGION: us-east-1
    steps:
      - name: Check AMI
        # Sanity check: the Neuron system packages must be present on the image.
        run: dpkg -l | grep neuron
      - name: Checkout
        # NOTE(review): actions/checkout@v2 is deprecated (Node 12 runtime);
        # consider upgrading to v4 once the runner supports it.
        uses: actions/checkout@v2
      - name: Install system packages
        run: |
          sudo apt install python3.8-venv -y
      - name: Install python packages
        run: |
          python3 -m venv aws_neuron_venv_pytorch
          source aws_neuron_venv_pytorch/bin/activate
          python -m pip install -U pip
          # Neuron SDK wheels are served from AWS's pip repository.
          python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
          python -m pip install .[neuron,tests]
          # Reinstall optimum from PyPI so the released version is used
          # instead of the one pulled in as a local dependency.
          python -m pip uninstall optimum -y
          python -m pip install optimum
      - name: Run CLI tests
        run: |
          source aws_neuron_venv_pytorch/bin/activate
          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/cli
      - name: Run export tests
        run: |
          source aws_neuron_venv_pytorch/bin/activate
          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/exporters
6 changes: 1 addition & 5 deletions .github/workflows/test_inf1_inference.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Optimum neuron / Test INF1 inference & pipelines
name: Optimum neuron / Test INF1 inference

on:
push:
Expand Down Expand Up @@ -43,7 +43,3 @@ jobs:
run: |
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/inference
- name: Run pipelines tests
run: |
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/pipelines
43 changes: 43 additions & 0 deletions .github/workflows/test_inf1_pipelines.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# CI workflow: run the pipelines test suite on an AWS Inferentia-1 (INF1)
# self-hosted runner (split out of the inference workflow in this PR).
name: Optimum neuron / Test INF1 pipelines

# Trigger on pushes to main and on PRs targeting main that touch the
# neuron pipelines sources.
on:
  push:
    branches: [ main ]
    paths:
      - "optimum/neuron/pipelines/**.py"
  pull_request:
    branches: [ main ]
    paths:
      - "optimum/neuron/pipelines/**.py"

# Cancel any in-flight run of this workflow for the same branch / PR.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  do-the-job:
    name: Run INF1 tests
    # Self-hosted runner labels: 4 Inferentia-1 chips, 24 CPUs, CI pool.
    runs-on: [self-hosted, 4-aws-inf1, 24-cpu, ci]
    env:
      AWS_REGION: us-east-1
    steps:
      - name: Check AMI
        # Sanity check: the Neuron system packages must be present on the image.
        run: dpkg -l | grep neuron
      - name: Checkout
        # NOTE(review): actions/checkout@v2 is deprecated (Node 12 runtime);
        # consider upgrading to v4 once the runner supports it.
        uses: actions/checkout@v2
      - name: Install system packages
        run: |
          sudo apt install python3.8-venv -y
      - name: Install python packages
        run: |
          python3 -m venv aws_neuron_venv_pytorch
          source aws_neuron_venv_pytorch/bin/activate
          python -m pip install -U pip
          # Neuron SDK wheels are served from AWS's pip repository.
          python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
          python -m pip install .[neuron,tests]
          # Reinstall optimum from PyPI so the released version is used
          # instead of the one pulled in as a local dependency.
          python -m pip uninstall optimum -y
          python -m pip install optimum
      - name: Run pipelines tests
        run: |
          source aws_neuron_venv_pytorch/bin/activate
          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/pipelines
5 changes: 3 additions & 2 deletions .github/workflows/test_inf2_export.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Optimum neuron / Test INF2 export
name: Optimum neuron / Test INF2 partial export

on:
push:
Expand All @@ -18,7 +18,7 @@ concurrency:

jobs:
do-the-job:
name: Run INF2 tests
name: Run INF2 export tests
runs-on: [self-hosted, 1-aws-inf2, 32-cpu, ci] # run the job on the newly created runner
env:
AWS_REGION: us-east-1
Expand All @@ -38,4 +38,5 @@ jobs:
- name: Run exporters tests
run: |
source aws_neuron_venv_pytorch/bin/activate
export MAX_EXPORT_TEST_COMBINATIONS=1
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/exporters
39 changes: 39 additions & 0 deletions .github/workflows/test_inf2_full_export.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# CI workflow: run the full (unrestricted) model-export test suite on an
# AWS Inferentia-2 (INF2) self-hosted runner.
name: Optimum neuron / Test INF2 full export

# Trigger on pushes to main and on PRs targeting main that touch the
# neuron exporter sources.
on:
  push:
    branches: [ main ]
    paths:
      - "optimum/exporters/neuron/*.py"
  pull_request:
    branches: [ main ]
    paths:
      - "optimum/exporters/neuron/*.py"

# Cancel any in-flight run of this workflow for the same branch / PR.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  do-the-job:
    name: Run INF2 full export tests
    runs-on: [self-hosted, 1-aws-inf2, 32-cpu, ci] # run the job on the newly created runner
    env:
      AWS_REGION: us-east-1
    steps:
      - name: Check AMI
        # Sanity check: the Neuron system packages must be present on the image.
        run: dpkg -l | grep neuron
      - name: Checkout
        # NOTE(review): actions/checkout@v2 is deprecated (Node 12 runtime);
        # consider upgrading to v4 once the runner supports it.
        uses: actions/checkout@v2
      - name: Install python dependencies
        run: |
          sudo apt install python3.8-venv -y
          python3 -m venv aws_neuron_venv_pytorch
          source aws_neuron_venv_pytorch/bin/activate
          python -m pip install -U pip
          # Neuron SDK wheels are served from AWS's pip repository.
          python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
          # INF2 uses the neuronx extra (Neuron X compiler stack).
          python -m pip install .[neuronx,tests]
      - name: Run exporters tests
        run: |
          source aws_neuron_venv_pytorch/bin/activate
          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/exporters
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@

include README.md
include LICENSE
include optimum/neuron/utils/neuron_cc_wrapper
77 changes: 19 additions & 58 deletions optimum/commands/neuron/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@
CACHE_REPO_NAME,
HF_HOME_CACHE_REPO_FILE,
create_custom_cache_repo,
list_in_registry,
load_custom_cache_repo_name_from_hf_home,
set_custom_cache_repo_name_in_hf_home,
)
from ...neuron.utils.runner import ExampleRunner
Expand Down Expand Up @@ -163,52 +161,6 @@ def run(self):
)


class ListRepoCommand(BaseOptimumCLICommand):
@staticmethod
def parse_args(parser: "ArgumentParser"):
parser.add_argument(
"name",
type=str,
nargs="?",
default=None,
help="The name of the repo to list. Will use the locally saved cache repo if left unspecified.",
)
parser.add_argument(
"-m",
"--model",
type=str,
default=None,
help="The model name or path of the model to consider. If left unspecified, will list all available models.",
)
parser.add_argument(
"-v",
"--version",
type=str,
default=None,
help=(
"The version of the Neuron X Compiler to consider. Will list all available versions if left "
"unspecified."
),
)

def run(self):
if self.args.name is None:
custom_cache_repo_name = load_custom_cache_repo_name_from_hf_home()
if custom_cache_repo_name is None:
raise ValueError("No custom cache repo was set locally so you need to specify a cache repo name.")
self.args.name = custom_cache_repo_name

entries = list_in_registry(
self.args.name, model_name_or_path_or_hash=self.args.model, neuron_compiler_version=self.args.version
)
if not entries:
entries = ["Nothing was found."]
line = "\n" + "=" * 50 + "\n"
result = line.join(entries)

print(f"\n*** Repo id: {self.args.name} ***\n\n{result}")


class SynchronizeRepoCommand(BaseOptimumCLICommand):
@staticmethod
def parse_args(parser: "ArgumentParser"):
Expand All @@ -226,18 +178,32 @@ def parse_args(parser: "ArgumentParser"):
type=str,
help="The model_id to lookup cached versions for.",
)
parser.add_argument(
"--mode",
type=str,
choices=["training", "inference", "all"],
default="all",
help='The mode you wish to lookup compilation files for. Can be either "training", "inference" or "all"',
)
parser.add_argument("--repo_id", type=str, default=None, help="The name of the repo to use as remote cache.")

def run(self):
entries = get_hub_cached_entries(self.args.model_id, cache_repo_id=self.args.repo_id)
def _list_entries(self, mode: str):
entries = get_hub_cached_entries(self.args.model_id, mode, cache_repo_id=self.args.repo_id)
n_entries = len(entries)
output = f"\n*** {n_entries} entrie(s) found in cache for {self.args.model_id} ***\n\n"
output = f"\n*** {n_entries} entrie(s) found in cache for {self.args.model_id} for {mode}.***\n\n"
for entry in entries:
for key, value in entry.items():
output += f"\n{key}: {value}"
output += "\n"
print(output)

def run(self):
if self.args.mode == "all":
self._list_entries("training")
self._list_entries("inference")
else:
self._list_entries(self.args.mode)


class CustomCacheRepoCommand(BaseOptimumCLICommand):
SUBCOMMANDS = (
Expand All @@ -256,19 +222,14 @@ class CustomCacheRepoCommand(BaseOptimumCLICommand):
help="Add a model to the cache of your choice (trainium only).",
subcommand_class=AddToCacheRepoCommand,
),
CommandInfo(
name="list",
help="List models in a cache repo (trainium only).",
subcommand_class=ListRepoCommand,
),
CommandInfo(
name="synchronize",
help="Synchronize the neuronx compiler cache with a hub cache repo (inferentia only).",
help="Synchronize the neuronx compiler cache with a hub cache repo.",
subcommand_class=SynchronizeRepoCommand,
),
CommandInfo(
name="lookup",
help="Lookup the neuronx compiler hub cache for the specified model id (inferentia only).",
help="Lookup the neuronx compiler hub cache for the specified model id.",
subcommand_class=LookupRepoCommand,
),
)
2 changes: 1 addition & 1 deletion optimum/neuron/modeling_decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def __init__(
cache_entry = None if checkpoint_id is None else ModelCacheEntry(checkpoint_id, config)

# Export the model using the Optimum Neuron Cache
with hub_neuronx_cache(entry=cache_entry):
with hub_neuronx_cache("inference", entry=cache_entry):
available_cores = get_available_cores()
if num_cores > available_cores:
raise ValueError(
Expand Down
Loading
Loading