forked from NVIDIA/Megatron-LM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
.gitlab-ci.yml
68 lines (63 loc) · 2.22 KB
/
.gitlab-ci.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
stages:
- test
- jet
- cleanup
variables: &VARS
SELENE_ADLR_CI_PATH: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron"
DATA_DIR: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/data"
PYTORCH_IMAGE: /lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/nvcr_pytorch_23.04.sqsh # This is the image that is run by all nodes on selene for tests
PYTHON_VIRTUAL_ENV: /lustre/fsw/adlr/adlr-nlp/adlr_ci/cienv/bin/activate
TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED: "MR_TESTS JET" # Can specify levels
TESTS_TO_RUN_AFTER_MERGING: "MR_TESTS NIGHTLY_TESTS" # Can specify levels
TESTS_TO_RUN_ON_THIS_COMMIT: unit_tests
TEST_REGEX_ON_THIS_COMMIT: NONE #https://github.com/google/re2/wiki/Syntax (Can define regex as in this spec) e.g /.*gpt3.*/
JET_CUSTOM_FILTER: ""
DISPLAY_OUTPUT: "True" # Set to true for new tests to copy the logs for creating golden truth file
TIME_LIMIT: "10:00" # Default time limit for all jobs
MOE_GROUPED_GEMM: 0 # Set to 1 to enable grouped gemm for MoE
JET_CLUSTER_BRANCH:
value: "mcore/draco-oci"
options:
- "mcore/draco-oci"
- "mcore/eos"
description: '"mcore/draco-oci" for OCI-IAD, "mcore/eos" for EOS'
include:
- jet-tests.yml
unit_tests:
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1
tags:
- 8xL40S
stage: test
script:
- torchrun --nproc_per_node=8 -m pytest --cov-report=term --cov-report=html --cov=megatron/core tests/unit_tests
coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/'
artifacts:
paths:
- coverage
expire_in: 30 days
rules:
- when: always
docs_build_test:
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/python-format:0.0.1
stage: test
tags:
- os/linux
script:
- cd ..
- rm -rf documentation && git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab-master.nvidia.com/nemo-megatron-core-tme/documentation.git
- mv megatron-lm/ documentation/
- cd documentation/
- ./repo docs
allow_failure: true
except:
- main
formatting:
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/python-format:0.0.1
tags:
- os/linux
stage: test
script:
- black megatron/core --check --verbose --diff
- isort megatron/core --check
rules:
- when: always