diff --git a/.actions/assistant.py b/.actions/assistant.py index 8486051f983f4..d0ace9cf75653 100644 --- a/.actions/assistant.py +++ b/.actions/assistant.py @@ -23,9 +23,9 @@ from itertools import chain from os.path import dirname, isfile from pathlib import Path -from typing import Dict, List, Optional, Sequence, Tuple +from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Tuple, Union -from pkg_resources import parse_requirements +from pkg_resources import parse_requirements, Requirement, yield_lines REQUIREMENT_FILES = { "pytorch": ( @@ -49,86 +49,106 @@ _PROJECT_ROOT = os.path.dirname(os.path.dirname(__file__)) -def _augment_requirement(ln: str, comment_char: str = "#", unfreeze: str = "all") -> str: - """Adjust the upper version contrains. - - Args: - ln: raw line from requirement - comment_char: charter marking comment - unfreeze: Enum or "all"|"major"|"" - - Returns: - adjusted requirement - - >>> _augment_requirement("arrow<=1.2.2,>=1.2.0 # anything", unfreeze="none") - 'arrow<=1.2.2,>=1.2.0' - >>> _augment_requirement("arrow<=1.2.2,>=1.2.0 # strict", unfreeze="none") - 'arrow<=1.2.2,>=1.2.0 # strict' - >>> _augment_requirement("arrow<=1.2.2,>=1.2.0 # my name", unfreeze="all") - 'arrow>=1.2.0' - >>> _augment_requirement("arrow>=1.2.0, <=1.2.2 # strict", unfreeze="all") - 'arrow>=1.2.0, <=1.2.2 # strict' - >>> _augment_requirement("arrow", unfreeze="all") - 'arrow' - >>> _augment_requirement("arrow>=1.2.0, <=1.2.2 # cool", unfreeze="major") - 'arrow>=1.2.0, <2.0 # strict' - >>> _augment_requirement("arrow>=1.2.0, <=1.2.2 # strict", unfreeze="major") - 'arrow>=1.2.0, <=1.2.2 # strict' - >>> _augment_requirement("arrow>=1.2.0", unfreeze="major") - 'arrow>=1.2.0, <2.0 # strict' - >>> _augment_requirement("arrow", unfreeze="major") - 'arrow' +class _RequirementWithComment(Requirement): + strict_string = "# strict" + + def __init__(self, *args: Any, comment: str = "", pip_argument: Optional[str] = None, **kwargs: Any) -> None: + 
super().__init__(*args, **kwargs) + self.comment = comment + assert pip_argument is None or pip_argument # sanity check that it's not an empty str + self.pip_argument = pip_argument + self.strict = self.strict_string in comment.lower() + + def adjust(self, unfreeze: str) -> str: + """Remove version restrictions unless they are strict. + + >>> _RequirementWithComment("arrow<=1.2.2,>=1.2.0", comment="# anything").adjust("none") + 'arrow<=1.2.2,>=1.2.0' + >>> _RequirementWithComment("arrow<=1.2.2,>=1.2.0", comment="# strict").adjust("none") + 'arrow<=1.2.2,>=1.2.0 # strict' + >>> _RequirementWithComment("arrow<=1.2.2,>=1.2.0", comment="# my name").adjust("all") + 'arrow>=1.2.0' + >>> _RequirementWithComment("arrow>=1.2.0, <=1.2.2", comment="# strict").adjust("all") + 'arrow<=1.2.2,>=1.2.0 # strict' + >>> _RequirementWithComment("arrow").adjust("all") + 'arrow' + >>> _RequirementWithComment("arrow>=1.2.0, <=1.2.2", comment="# cool").adjust("major") + 'arrow<2.0,>=1.2.0' + >>> _RequirementWithComment("arrow>=1.2.0, <=1.2.2", comment="# strict").adjust("major") + 'arrow<=1.2.2,>=1.2.0 # strict' + >>> _RequirementWithComment("arrow>=1.2.0").adjust("major") + 'arrow>=1.2.0' + >>> _RequirementWithComment("arrow").adjust("major") + 'arrow' + """ + out = str(self) + if self.strict: + return f"{out} {self.strict_string}" + if unfreeze == "major": + for operator, version in self.specs: + if operator in ("<", "<="): + major = LooseVersion(version).version[0] + # replace upper bound with major version increased by one + return out.replace(f"{operator}{version}", f"<{major + 1}.0") + elif unfreeze == "all": + for operator, version in self.specs: + if operator in ("<", "<="): + # drop upper bound + return out.replace(f"{operator}{version},", "") + elif unfreeze != "none": + raise ValueError(f"Unexpected unfreeze: {unfreeze!r} value.") + return out + + +def _parse_requirements(strs: Union[str, Iterable[str]]) -> Iterator[_RequirementWithComment]: + """Adapted from 
`pkg_resources.parse_requirements` to include comments. + + >>> txt = ['# ignored', '', 'this # is an', '--piparg', 'example', 'foo # strict', 'thing', '-r different/file.txt'] + >>> [r.adjust('none') for r in _parse_requirements(txt)] + ['this', 'example', 'foo # strict', 'thing'] + >>> txt = '\\n'.join(txt) + >>> [r.adjust('none') for r in _parse_requirements(txt)] + ['this', 'example', 'foo # strict', 'thing'] """ - assert unfreeze in {"none", "major", "all"} - # filer all comments - if comment_char in ln: - comment = ln[ln.index(comment_char) :] - ln = ln[: ln.index(comment_char)] - is_strict = "strict" in comment - else: - is_strict = False - req = ln.strip() - # skip directly installed dependencies - if not req or any(c in req for c in ["http:", "https:", "@"]): - return "" - # extract the major version from all listed versions - if unfreeze == "major": - req_ = list(parse_requirements([req]))[0] - vers = [LooseVersion(v) for s, v in req_.specs if s not in ("==", "~=")] - ver_major = sorted(vers)[-1].version[0] if vers else None - else: - ver_major = None - - # remove version restrictions unless they are strict - if unfreeze != "none" and "<" in req and not is_strict: - req = re.sub(r",? *<=? *[\d\.\*]+,? *", "", req).strip() - if ver_major is not None and not is_strict: - # add , only if there are already some versions - req += f"{',' if any(c in req for c in '<=>') else ''} <{int(ver_major) + 1}.0" - - # adding strict back to the comment - if is_strict or ver_major is not None: - req += " # strict" - - return req - - -def load_requirements( - path_dir: str, file_name: str = "base.txt", comment_char: str = "#", unfreeze: str = "all" -) -> List[str]: + lines = yield_lines(strs) + pip_argument = None + for line in lines: + # Drop comments -- a hash without a space may be in a URL. 
+ if " #" in line: + comment_pos = line.find(" #") + line, comment = line[:comment_pos], line[comment_pos:] + else: + comment = "" + # If there is a line continuation, drop it, and append the next line. + if line.endswith("\\"): + line = line[:-2].strip() + try: + line += next(lines) + except StopIteration: + return + # If there's a pip argument, save it + if line.startswith("--"): + pip_argument = line + continue + if line.startswith("-r "): + # linked requirement files are unsupported + continue + yield _RequirementWithComment(line, comment=comment, pip_argument=pip_argument) + pip_argument = None + + +def load_requirements(path_dir: str, file_name: str = "base.txt", unfreeze: str = "all") -> List[str]: """Loading requirements from a file. >>> path_req = os.path.join(_PROJECT_ROOT, "requirements") >>> load_requirements(path_req, "docs.txt", unfreeze="major") # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE - ['sphinx>=4.0, <6.0 # strict', ...] + ['sphinx<6.0,>=4.0', ...] """ assert unfreeze in {"none", "major", "all"} - with open(os.path.join(path_dir, file_name)) as file: - lines = [ln.strip() for ln in file.readlines()] - reqs = [_augment_requirement(ln, comment_char=comment_char, unfreeze=unfreeze) for ln in lines] - # filter empty lines and containing @ which means redirect to some git/http - reqs = [str(req) for req in reqs if req and not any(c in req for c in ["@", "http:", "https:"])] - return reqs + path = Path(path_dir) / file_name + assert path.exists(), (path_dir, file_name, path) + text = path.read_text() + return [req.adjust(unfreeze) for req in _parse_requirements(text)] def load_readme_description(path_dir: str, homepage: str, version: str) -> str: @@ -213,14 +233,13 @@ def _load_aggregate_requirements(req_dir: str = "requirements", freeze_requireme >>> _load_aggregate_requirements(os.path.join(_PROJECT_ROOT, "requirements")) """ requires = [ - # TODO: consider passing unfreeze as string instead - load_requirements(d, file_name="base.txt", 
unfreeze="none" if freeze_requirements else "major") + load_requirements(d, unfreeze="none" if freeze_requirements else "major") for d in glob.glob(os.path.join(req_dir, "*")) # skip empty folder as git artefacts, and resolving Will's special issue if os.path.isdir(d) and len(glob.glob(os.path.join(d, "*"))) > 0 and "__pycache__" not in d ] if not requires: - return None + return # TODO: add some smarter version aggregation per each package requires = sorted(set(chain(*requires))) with open(os.path.join(req_dir, "base.txt"), "w") as fp: diff --git a/.azure/app-cloud-e2e.yml b/.azure/app-cloud-e2e.yml index 76365350622b2..b59a8612023b4 100644 --- a/.azure/app-cloud-e2e.yml +++ b/.azure/app-cloud-e2e.yml @@ -31,7 +31,7 @@ pr: - ".azure/app-cloud-e2e.yml" - "src/lightning_app/**" - "requirements/app/**" - - "tests/integrations_app_examples/**" + - "tests/integrations_app/**" - "setup.py" exclude: - "requirements/*/docs.txt" @@ -48,7 +48,7 @@ variables: value: ./videos jobs: - - job: App_e2e_cloud + - job: test_e2e pool: "azure-cpus" container: image: mcr.microsoft.com/playwright/python:v1.28.0-focal @@ -58,41 +58,57 @@ jobs: 'App: v0_app': name: "v0_app" dir: "public" + queue_type: "redis" 'App: boring_app': name: "boring_app" dir: "public" + queue_type: "redis" + 'App: boring_app / HTTP': + name: "boring_app" + dir: "public" + queue_type: "http" 'App: template_streamlit_ui': name: "template_streamlit_ui" dir: "public" + queue_type: "redis" 'App: template_react_ui': name: "template_react_ui" dir: "public" + queue_type: "redis" # 'App: template_jupyterlab': # TODO: clarify where these files lives # name: "template_jupyterlab" 'App: installation_commands_app': name: "installation_commands_app" dir: "public" + queue_type: "redis" 'App: drive': name: "drive" dir: "public" + queue_type: "redis" 'App: payload': name: "payload" dir: "public" + queue_type: "redis" 'App: commands_and_api': name: "commands_and_api" dir: "public" + queue_type: "redis" 'App: quick_start': name: 
"quick_start" dir: "public" + queue_type: "redis" 'App: idle_timeout': name: "idle_timeout" dir: "local" + queue_type: "redis" 'App: collect_failures': name: "collect_failures" dir: "local" + queue_type: "redis" 'App: custom_work_dependencies': name: "custom_work_dependencies" dir: "local" + queue_type: "redis" timeoutInMinutes: "15" cancelTimeoutInMinutes: "1" # values: https://docs.microsoft.com/en-us/azure/devops/pipelines/process/phases?view=azure-devops&tabs=yaml#workspace @@ -109,6 +125,7 @@ jobs: HAR_LOCATION: './artifacts/hars' SLOW_MO: '50' LIGHTNING_DEBUG: '1' + LIGHTNING_CLOUD_QUEUE_TYPE: $(queue_type) steps: - script: echo '##vso[task.setvariable variable=local_id]$(System.PullRequest.PullRequestNumber)' @@ -117,7 +134,7 @@ jobs: - bash: | whoami - mkdir -p $(video_artifact_dir) + mkdir -p $(VIDEO_LOCATION) printf "local id: $(local_id)\n" python --version pip --version @@ -161,17 +178,16 @@ jobs: python .actions/assistant.py copy_replace_imports --source_dir="./tests" --source_import="lightning_app" --target_import="lightning.app" displayName: 'Adjust examples' - - bash: | - pip --version - pip list + - bash: pip --version && pip list displayName: 'List pip dependency' - bash: | - ls -l examples/${TEST_APP_NAME} - python -m pytest ${TEST_FILE}::test_${TEST_APP_NAME}_example_cloud \ + ls -l examples/$(TEST_APP_NAME) + echo ${TEST_FILE} + python -m pytest ${TEST_FILE}::test_$(TEST_APP_NAME)_example_cloud \ --timeout=540 --capture=no -v --color=yes env: - TEST_FILE: tests/integrations_app_examples/$(TEST_APP_FOLDER)/test_$(TEST_APP_NAME).py + TEST_FILE: tests/integrations_app/$(TEST_APP_FOLDER)/test_$(TEST_APP_NAME).py #LAI_USER: $(LAI_USER) # for STAGING #LAI_PASS: $(LAI_PASS) # for STAGING LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD) @@ -191,6 +207,7 @@ jobs: - bash: | time python -c "from lightning.app import testing; testing.delete_cloud_lightning_apps()" condition: always() + timeoutInMinutes: "3" env: #LAI_USER: $(LAI_USER) # for STAGING #LAI_PASS: 
$(LAI_PASS) # for STAGING @@ -198,5 +215,4 @@ jobs: LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD) LIGHTNING_USERNAME: $(LIGHTNING_USERNAME_PROD) LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD) - timeoutInMinutes: "3" displayName: 'Clean Previous Apps' diff --git a/.azure/app-flagships.yml b/.azure/app-flagships.yml new file mode 100644 index 0000000000000..d931e203c7e71 --- /dev/null +++ b/.azure/app-flagships.yml @@ -0,0 +1,129 @@ +# Python package +# Create and test a Python package on multiple Python versions. +# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more: +# https://docs.microsoft.com/azure/devops/pipelines/languages/python + +trigger: + tags: + include: + - '*' + branches: + include: + - "release/*" + - "refs/tags/*" + # TODO: just for debugging this PR + - "ci/flagship-*" + +schedules: + - cron: "0 0 * * *" # At the end of every day + displayName: Daily midnight testing + branches: + include: + - "release/*" + +pr: none + +# variables are automatically exported as environment variables so this will override pip's default cache dir +variables: + - name: pip_cache_dir + value: $(Pipeline.Workspace)/.pip + - name: local_id + value: $(Build.BuildId) + - name: video_artifact_dir + value: ./videos + +jobs: + - job: test_flagships + pool: azure-cpus + container: + image: mcr.microsoft.com/playwright/python:v1.28.0-focal + options: "--shm-size=4gb" + + #- Training Studio + #- Flashy + #- Muse + #- Echo + #- StreamLit / Gradio + #- Jupyter Notebook Component + #- All homepage & docs apps + + strategy: + matrix: + 'App: Flashy': + name: "flashy" + repo: "https://github.com/Lightning-AI/LAI-Flashy-App.git" + timeoutInMinutes: "25" + cancelTimeoutInMinutes: "1" + # values: https://docs.microsoft.com/en-us/azure/devops/pipelines/process/phases?view=azure-devops&tabs=yaml#workspace + workspace: + clean: all + variables: + HEADLESS: '1' + PACKAGE_LIGHTNING: '1' + CLOUD: '1' + VIDEO_LOCATION: 
$(video_artifact_dir) + HAR_LOCATION: './artifacts/hars' + SLOW_MO: '50' + LIGHTNING_DEBUG: '1' + steps: + + - bash: | + whoami + mkdir -p tests/_flagships + mkdir -p $(video_artifact_dir) + printf "local id: $(local_id)\n" + python --version + pip --version + displayName: 'Info' + + - script: pip install -e .[cloud,test] -f https://download.pytorch.org/whl/cpu/torch_stable.html + displayName: 'Install Lightning & dependencies' + + - script: | + pip install playwright + python -m playwright install # --with-deps + displayName: 'Install Playwright system dependencies' + + - script: git clone $(repo) tests/_flagships/$(name) + displayName: 'Clone the Repo/App' + + - script: | + cd tests/_flagships/$(name) + ls -l . + pip install -r requirements-dev.txt + pip install -e . + condition: eq(variables['name'], 'flashy') + displayName: 'adjust env for Flashy' + + - bash: pip --version && pip list + displayName: 'List pip dependency' + + - script: | + ls -l tests/_flagships + python -m pytest tests/integrations_app/flagship/test_$(name).py \ + --timeout=540 --capture=no -v --color=yes + env: + LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD) + LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD) + LIGHTNING_USERNAME: $(LIGHTNING_USERNAME_PROD) + LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD) + displayName: 'Run the tests' + + - task: PublishPipelineArtifact@1 + condition: failed() + inputs: + path: "$(video_artifact_dir)/$(name)" + artifactName: $(name) + publishLocation: 'pipeline' + displayName: 'Publish videos' + + - script: | + time python -c "from lightning.app import testing; testing.delete_cloud_lightning_apps()" + condition: always() + timeoutInMinutes: "3" + env: + LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD) + LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD) + LIGHTNING_USERNAME: $(LIGHTNING_USERNAME_PROD) + LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD) + displayName: 'Clean Previous Apps' diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 
cee7e2f247b96..7d8f9df3b3f0a 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -45,7 +45,7 @@ # Lightning App /src/lightning_app @tchaton @lantiga @awaelchli @hhsecond @ethanwharris /tests/tests_app @tchaton @lantiga @awaelchli @hhsecond @ethanwharris -/tests/integrations_app_examples @tchaton @lantiga @awaelchli @hhsecond @ethanwharris +/tests/integrations_app @tchaton @lantiga @awaelchli @hhsecond @ethanwharris /examples/app_* @tchaton @lantiga @awaelchli @hhsecond @ethanwharris /.github/CODEOWNERS @williamfalcon diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index a1ce8f73dfd8a..3aee714e64f93 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -245,7 +245,7 @@ subprojects: - ".actions/**" - ".github/workflows/ci-examples-app.yml" - "src/lightning_app/**" - - "tests/integrations_app_examples/**" + - "tests/integrations_app/**" - "examples/app_*/**" - "requirements/app/**" - "setup.py" @@ -269,7 +269,7 @@ subprojects: - ".azure/app-cloud-e2e.yml" - "src/lightning_app/**" - "requirements/app/**" - - "tests/integrations_app_examples/**" + - "tests/integrations_app/**" - "setup.py" - "!requirements/*/docs.txt" - "!*.md" diff --git a/.github/labeler.yml b/.github/labeler.yml index 1771245a3f003..75c748978e3ae 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -5,7 +5,7 @@ app: all: ['!src/pytorch_lightning/**/*', '!tests/tests_pytorch/**/*', '!docs/source-pytorch/**/*', '!requirements/pytorch/*'] - any: ['tests/tests_app/**/*'] all: ['!src/pytorch_lightning/**/*', '!tests/tests_pytorch/**/*', '!docs/source-pytorch/**/*', '!requirements/pytorch/*'] -- any: ['tests/integrations_app_examples/**/*'] +- any: ['tests/integrations_app/**/*'] all: ['!src/pytorch_lightning/**/*', '!tests/tests_pytorch/**/*', '!docs/source-pytorch/**/*', '!requirements/pytorch/*'] - any: ['examples/app_*/**/*'] all: ['!src/pytorch_lightning/**/*', '!tests/tests_pytorch/**/*', '!docs/source-pytorch/**/*', '!requirements/pytorch/*'] @@ -16,12 
+16,12 @@ app: pl: - any: ['src/pytorch_lightning/**/*'] - all: ['!src/lightning/**/*', '!src/lightning_app/**/*', '!tests/tests_app/**/*', '!tests/integrations_app_examples/**/*', '!docs/source-app/**/*', '!requirements/app/*'] + all: ['!src/lightning/**/*', '!src/lightning_app/**/*', '!tests/tests_app/**/*', '!tests/integrations_app/**/*', '!docs/source-app/**/*', '!requirements/app/*'] - any: ['examples/*pl_*/**/*'] - all: ['!src/lightning/**/*', '!src/lightning_app/**/*', '!tests/tests_app/**/*', '!tests/integrations_app_examples/**/*', '!docs/source-app/**/*', '!requirements/app/*'] + all: ['!src/lightning/**/*', '!src/lightning_app/**/*', '!tests/tests_app/**/*', '!tests/integrations_app/**/*', '!docs/source-app/**/*', '!requirements/app/*'] - any: ['tests/tests_pytorch/**/*'] - all: ['!src/lightning/**/*', '!src/lightning_app/**/*', '!tests/tests_app/**/*', '!tests/integrations_app_examples/**/*', '!docs/source-app/**/*', '!requirements/app/*'] + all: ['!src/lightning/**/*', '!src/lightning_app/**/*', '!tests/tests_app/**/*', '!tests/integrations_app/**/*', '!docs/source-app/**/*', '!requirements/app/*'] - any: ['docs/source-pytorch/**/*'] - all: ['!src/lightning/**/*', '!src/lightning_app/**/*', '!tests/tests_app/**/*', '!tests/integrations_app_examples/**/*', '!docs/source-app/**/*', '!requirements/app/*'] + all: ['!src/lightning/**/*', '!src/lightning_app/**/*', '!tests/tests_app/**/*', '!tests/integrations_app/**/*', '!docs/source-app/**/*', '!requirements/app/*'] - any: ['requirements/pytorch/*'] - all: ['!src/lightning/**/*', '!src/lightning_app/**/*', '!tests/tests_app/**/*', '!tests/integrations_app_examples/**/*', '!docs/source-app/**/*', '!requirements/app/*'] + all: ['!src/lightning/**/*', '!src/lightning_app/**/*', '!tests/tests_app/**/*', '!tests/integrations_app/**/*', '!docs/source-app/**/*', '!requirements/app/*'] diff --git a/.github/workflows/ci-examples-app.yml b/.github/workflows/ci-examples-app.yml index 3ae51c6bbe6fd..d0adb8795a99f 
100644 --- a/.github/workflows/ci-examples-app.yml +++ b/.github/workflows/ci-examples-app.yml @@ -11,7 +11,7 @@ on: - ".actions/**" - ".github/workflows/ci-examples-app.yml" - "src/lightning_app/**" - - "tests/integrations_app_examples/**" + - "tests/integrations_app/**" - "examples/app_*/**" - "requirements/app/**" - "setup.py" @@ -114,7 +114,7 @@ jobs: PYTEST_ARTIFACT: results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml run: | python -m coverage run --source ${COVERAGE_SCOPE} \ - -m pytest -m "not cloud" integrations_app_examples \ + -m pytest -m "not cloud" integrations_app \ --timeout=300 --durations=0 -vvvv \ --junitxml=$PYTEST_ARTIFACT diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml index 71f6d72f4c663..b43faed2d186f 100644 --- a/.github/workflows/release-pypi.yml +++ b/.github/workflows/release-pypi.yml @@ -121,6 +121,9 @@ jobs: runs-on: ubuntu-20.04 needs: build-packages if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' + strategy: + matrix: + name: ["APP", "FABRIC", "PYTORCH", "LIGHTNING"] steps: - uses: actions/checkout@v3 # needed for local action bellow - uses: actions/download-artifact@v3 @@ -132,26 +135,17 @@ jobs: tree -L 2 -h dist/ - uses: ./.github/actions/pkg-publish with: - pkg-folder: dist/app - pypi-test-token: ${{ secrets.PYPI_TEST_TOKEN_APP }} - - uses: ./.github/actions/pkg-publish - with: - pkg-folder: dist/fabric - pypi-test-token: ${{ secrets.PYPI_TEST_TOKEN_FABRIC }} - - uses: ./.github/actions/pkg-publish - with: - pkg-folder: dist/pytorch - pypi-test-token: ${{ secrets.PYPI_TEST_TOKEN_PYTORCH }} - - uses: ./.github/actions/pkg-publish - with: - pkg-folder: dist/lightning - pypi-test-token: ${{ secrets.PYPI_TEST_TOKEN_LAI }} + pkg-folder: dist/$(echo 'console.log("${{ matrix.name }}".toLowerCase())') + pypi-test-token: ${{ secrets[format('PYPI_TEST_TOKEN_{0}', matrix.name)] }} publish-packages: runs-on: ubuntu-20.04 needs: [build-packages, 
waiting] if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' + strategy: + matrix: + name: ["APP", "FABRIC", "PYTORCH", "LIGHTNING"] steps: - uses: actions/checkout@v3 # needed for local action bellow - uses: actions/download-artifact@v3 @@ -163,20 +157,8 @@ jobs: tree -L 2 -h dist/ - uses: ./.github/actions/pkg-publish with: - pkg-folder: dist/app - pypi-token: ${{ secrets.PYPI_TOKEN_APP }} - - uses: ./.github/actions/pkg-publish - with: - pkg-folder: dist/fabric - pypi-token: ${{ secrets.PYPI_TOKEN_FABRIC }} - - uses: ./.github/actions/pkg-publish - with: - pkg-folder: dist/pytorch - pypi-token: ${{ secrets.PYPI_TOKEN_PYTORCH }} - - uses: ./.github/actions/pkg-publish - with: - pkg-folder: dist/lightning - pypi-token: ${{ secrets.PYPI_TOKEN_LAI }} + pkg-folder: dist/$(echo 'console.log("${{ matrix.name }}".toLowerCase())') + pypi-token: ${{ secrets[format('PYPI_TOKEN_{0}', matrix.name)] }} legacy-checkpoints: diff --git a/docs/source-app/glossary/index.rst b/docs/source-app/glossary/index.rst index a46da20ddd118..0f92d4b51af6e 100644 --- a/docs/source-app/glossary/index.rst +++ b/docs/source-app/glossary/index.rst @@ -18,6 +18,7 @@ storage/storage restful_api/restful_api add web ui <../workflows/add_web_ui/glossary_ui> + use_local_lightning ######## Glossary @@ -153,3 +154,10 @@ Glossary :col_css: col-md-12 :button_link: ../workflows/add_web_ui/glossary_ui.html :height: 100 + +.. 
displayitem:: + :header: Using a development branch of Lightning on the Cloud + :description: Learn how to contribute to the Lightning framework in the cloud + :col_css: col-md-12 + :button_link: use_local_lightning.html + :height: 100 diff --git a/docs/source-app/glossary/use_local_lightning.rst b/docs/source-app/glossary/use_local_lightning.rst new file mode 100644 index 0000000000000..0987dd370765a --- /dev/null +++ b/docs/source-app/glossary/use_local_lightning.rst @@ -0,0 +1,15 @@ +################################################################ +How to run an app on the cloud with a local version of lightning +################################################################ + +The lightning cloud uses the latest release by default. However, you might want to run your app with some local changes you've made to the lightning framework. To use your local version of lightning on the cloud, set the following environment variable: + +```bash +git clone https://github.com/Lightning-AI/lightning.git +cd lightning +pip install -e . +export PACKAGE_LIGHTNING=1 # <- this is the magic to use your version (not mainstream PyPI)! +lightning run app app.py --cloud +``` + +By setting `PACKAGE_LIGHTNING=1`, lightning packages the lightning source code in your local directory in addition to your app source code and uploads them to the cloud. diff --git a/docs/source-app/workflows/byoc/create_cluster.rst b/docs/source-app/workflows/byoc/create_cluster.rst index 4fb58ea019edc..4e7994b0737b5 100644 --- a/docs/source-app/workflows/byoc/create_cluster.rst +++ b/docs/source-app/workflows/byoc/create_cluster.rst @@ -11,7 +11,7 @@ Create AWS cluster **Prereqs:** basic familiarity with cloud provider infrastructure management. -.. note:: This feature is currently available for early access! To create your own clutser `contact us `_. +.. note:: This feature is currently available for early access! To create your own cluster `contact us `_. 
---- @@ -87,6 +87,7 @@ Here's an example: lightning create cluster my-byoc-cluster --role-arn arn:aws:iam::1234567890:role/lai-byoc --external-id dummy .. note:: Cluster creation is going to take an hour or more after you run this command. +.. note:: Only us-east-1, us-east-2, us-west-1 and us-west-2 are supported today. Parameters diff --git a/docs/source-pytorch/fabric/guide/distributed.rst b/docs/source-pytorch/fabric/advanced/collectives.rst similarity index 91% rename from docs/source-pytorch/fabric/guide/distributed.rst rename to docs/source-pytorch/fabric/advanced/collectives.rst index 182fc77e38aed..1905873902248 100644 --- a/docs/source-pytorch/fabric/guide/distributed.rst +++ b/docs/source-pytorch/fabric/advanced/collectives.rst @@ -1,12 +1,12 @@ :orphan: -################################## -Working with distributed processes -################################## +########################################### +Communication between distributed processes +########################################### Page is under construction. ----------- +---- You can also easily use distributed collectives if required. diff --git a/docs/source-pytorch/fabric/advanced/gradient_accumulation.rst b/docs/source-pytorch/fabric/advanced/gradient_accumulation.rst new file mode 100644 index 0000000000000..fc150d6a0f15b --- /dev/null +++ b/docs/source-pytorch/fabric/advanced/gradient_accumulation.rst @@ -0,0 +1,58 @@ +:orphan: + +############################### +Efficient Gradient Accumulation +############################### + +Gradient accumulation works the same way with Fabric as in PyTorch. +You are in control of which model accumulates and at what frequency: + +.. code-block:: python + + for iteration, batch in enumerate(dataloader): + + # Accumulate gradient 8 batches at a time + is_accumulating = iteration % 8 != 0 + + output = model(input) + loss = ... + + # .backward() accumulates when .zero_grad() wasn't called + fabric.backward(loss) + ... 
+ + if not is_accumulating: + # Step the optimizer after accumulation phase is over + optimizer.step() + optimizer.zero_grad() + + +However, in a distributed setting, for example when training across multiple GPUs or machines, doing it this way can slow down your training loop significantly. +In order to optimize this code, we should skip the synchronization in ``.backward()`` during the accumulation phase. +We only need to synchronize the gradients when the accumulation phase is over! +This can be achieved by adding the :meth:`~lightning_fabric.fabric.Fabric.no_backward_sync` context manager over the :meth:`~lightning_fabric.fabric.Fabric.backward` call: + +.. code-block:: diff + + for iteration, batch in enumerate(dataloader): + + # Accumulate gradient 8 batches at a time + is_accumulating = iteration % 8 != 0 + + + with fabric.no_backward_sync(model, enabled=is_accumulating): + output = model(input) + loss = ... + + # .backward() accumulates when .zero_grad() wasn't called + fabric.backward(loss) + + ... + + if not is_accumulating: + # Step the optimizer after accumulation phase is over + optimizer.step() + optimizer.zero_grad() + + +For those strategies that don't support it, a warning is emitted. For single-device strategies, it is a no-op. +Both the model's ``.forward()`` and the ``fabric.backward()`` call need to run under this context. diff --git a/docs/source-pytorch/fabric/api/api_reference.rst b/docs/source-pytorch/fabric/api/api_reference.rst index c341616635e0b..680b15df65f48 100644 --- a/docs/source-pytorch/fabric/api/api_reference.rst +++ b/docs/source-pytorch/fabric/api/api_reference.rst @@ -37,6 +37,21 @@ Accelerators TPUAccelerator +Loggers +^^^^^^^ + +.. currentmodule:: lightning_fabric.loggers + +.. 
autosummary:: + :toctree: ../../api + :nosignatures: + :template: classtemplate.rst + + Logger + CSVLogger + TensorBoardLogger + + Plugins ^^^^^^^ diff --git a/docs/source-pytorch/fabric/api/utilities.rst b/docs/source-pytorch/fabric/api/utilities.rst index 7d26de1d498fb..9491908912a14 100644 --- a/docs/source-pytorch/fabric/api/utilities.rst +++ b/docs/source-pytorch/fabric/api/utilities.rst @@ -8,4 +8,64 @@ Fabric Utilities seed_everything =============== +This function sets the random seed in important libraries. +In a single line of code, you can seed torch, numpy and Python: + +.. code-block:: diff + + + from lightning.fabric import seed_everything + + seed = 42 + - random.seed(seed) + - numpy.random.seed(seed) + - torch.manual_seed(seed) + - torch.cuda.manual_seed(seed) + + + seed_everything(seed) + +The same is also available as a method on the Fabric object if you don't want to import it separately: + .. code-block:: python + + from lightning.fabric import Fabric + + fabric.Fabric() + fabric.seed_everything(42) + + +In distributed settings, you may need to set a different seed per process depending on the application. +For example, when generating noise or data augmentations. This is very straightforward: + +.. code-block:: python + + fabric = Fabric(...) + fabric.seed_everything(seed + fabric.global_rank) + + +By default, ``seed_everything`` also handles the initialization of the seed in :class:`~torch.utils.data.DataLoader` worker processes: + +.. code-block:: python + + fabric = Fabric(...) + + # By default, we handle DataLoader workers too: + fabric.seed_everything(..., workers=True) + + # Can be turned off: + fabric.seed_everything(..., workers=False) + + +---- + + +print +===== + +Avoid duplicated print statements in the logs in distributed training by using Fabric's :meth:`~lightning_fabric.fabric.Fabric.print` method: + +.. code-block:: python + + print("This message gets printed in every process. That's a bit messy!") + + fabric = Fabric(...) 
+ fabric.print("This message gets printed only in the main process. Much cleaner!") diff --git a/docs/source-pytorch/fabric/fabric.rst b/docs/source-pytorch/fabric/fabric.rst index aeb680e495ac7..d7d7184ba2d29 100644 --- a/docs/source-pytorch/fabric/fabric.rst +++ b/docs/source-pytorch/fabric/fabric.rst @@ -24,28 +24,30 @@ With only a few changes to your code, Fabric allows you to: + from lightning.fabric import Fabric - class MyModel(nn.Module): + class PyTorchModel(nn.Module): ... - class MyDataset(Dataset): + class PyTorchDataset(Dataset): ... + fabric = Fabric(accelerator="cuda", devices=8, strategy="ddp") + fabric.launch() - device = "cuda" if torch.cuda.is_available() else "cpu - model = MyModel(...) + model = PyTorchModel(...) optimizer = torch.optim.SGD(model.parameters()) + model, optimizer = fabric.setup(model, optimizer) - dataloader = DataLoader(MyDataset(...), ...) + dataloader = DataLoader(PyTorchDataset(...), ...) + dataloader = fabric.setup_dataloaders(dataloader) model.train() for epoch in range(num_epochs): for batch in dataloader: - - batch.to(device) + input, target = batch + - input, target = input.to(device), target.to(device) optimizer.zero_grad() - loss = model(batch) + output = model(input) + loss = loss_fn(output, target) - loss.backward() + fabric.backward(loss) optimizer.step() @@ -54,7 +56,7 @@ With only a few changes to your code, Fabric allows you to: .. note:: Fabric is currently in Beta. Its API is subject to change based on feedback. ----------- +---- ************ @@ -106,13 +108,21 @@ Fundamentals :height: 150 :tag: basic +.. displayitem:: + :header: Mixed Precision Training + :description: Save memory and speed up training using mixed precision + :button_link: fundamentals/precision.html + :col_css: col-md-4 + :height: 150 + :tag: intermediate + .. raw:: html ----------- +---- ********************** @@ -162,18 +172,41 @@ Build Your Own Trainer ----------- +---- *************** Advanced Topics *************** -Comnig soon. +.. 
raw:: html + +
+
+ +.. displayitem:: + :header: Efficient Gradient Accumulation + :description: Learn how to perform efficient gradient accumulation in distributed settings + :button_link: advanced/gradient_accumulation.html + :col_css: col-md-4 + :height: 160 + :tag: advanced + +.. displayitem:: + :header: Collectives + :description: Learn all about communication primitives for distributed operation. Gather, reduce, broadcast, etc. + :button_link: advanced/collectives.html + :col_css: col-md-4 + :height: 160 + :tag: advanced +.. raw:: html + +
+
----------- +---- .. _Fabric Examples: @@ -204,24 +237,27 @@ Examples :tag: intermediate .. displayitem:: - :header: Reinforcement Learning - :description: Coming soon + :header: Meta-Learning + :description: Distributed training with the MAML algorithm on the Omniglot and MiniImagenet datasets + :button_link: https://github.com/Lightning-AI/lightning/blob/master/examples/fabric/meta_learning/README.md :col_css: col-md-4 :height: 150 + :tag: intermediate .. displayitem:: - :header: Active Learning + :header: Reinforcement Learning :description: Coming soon :col_css: col-md-4 :height: 150 .. displayitem:: - :header: Meta Learning + :header: Active Learning :description: Coming soon :col_css: col-md-4 :height: 150 + .. raw:: html @@ -229,7 +265,7 @@ Examples ----------- +---- *** diff --git a/docs/source-pytorch/fabric/fundamentals/accelerators.rst b/docs/source-pytorch/fabric/fundamentals/accelerators.rst index bd60b5ef46b20..e5b13f0543610 100644 --- a/docs/source-pytorch/fabric/fundamentals/accelerators.rst +++ b/docs/source-pytorch/fabric/fundamentals/accelerators.rst @@ -52,7 +52,7 @@ You can also specifically set which accelerator to use: For running on multiple devices in parallel, also known as "distributed", read our guide for :doc:`Launching Multiple Processes <./launch>`. ----------- +---- ***************** diff --git a/docs/source-pytorch/fabric/fundamentals/code_structure.rst b/docs/source-pytorch/fabric/fundamentals/code_structure.rst index 537681e443f3e..9ba280a81e5a1 100644 --- a/docs/source-pytorch/fabric/fundamentals/code_structure.rst +++ b/docs/source-pytorch/fabric/fundamentals/code_structure.rst @@ -9,7 +9,7 @@ Despite the ultimate freedom, this page is meant to give beginners a template fo We also have several :ref:`examples ` that you can take inspiration from. 
----------- +---- ***************** @@ -33,7 +33,7 @@ At the highest level, every Python script should contain the following boilerpla This ensures that any kind of multiprocessing will work properly (for example ``DataLoader(num_workers=...)`` etc.) ----------- +---- ************** @@ -79,7 +79,7 @@ Here is a skeleton for training a model in a function ``train()``: main() ----------- +---- ***************************** @@ -135,7 +135,7 @@ Here is how the code would be structured if we did that periodically during trai ----------- +---- ************ diff --git a/docs/source-pytorch/fabric/fundamentals/convert.rst b/docs/source-pytorch/fabric/fundamentals/convert.rst index 1e401ba717814..d823bf2ec9e8e 100644 --- a/docs/source-pytorch/fabric/fundamentals/convert.rst +++ b/docs/source-pytorch/fabric/fundamentals/convert.rst @@ -39,7 +39,7 @@ Here are five easy steps to let :class:`~lightning_fabric.fabric.Fabric` scale y .. code-block:: bash - lightning run model path/to/train.py`` + lightning run model path/to/train.py or use the :meth:`~lightning_fabric.fabric.Fabric.launch` method in a notebook. Learn more about :doc:`launching distributed training `. @@ -56,28 +56,30 @@ All steps combined, this is how your code will change: + from lightning.fabric import Fabric - class MyModel(nn.Module): + class PyTorchModel(nn.Module): ... - class MyDataset(Dataset): + class PyTorchDataset(Dataset): ... + fabric = Fabric(accelerator="cuda", devices=8, strategy="ddp") + fabric.launch() - device = "cuda" if torch.cuda.is_available() else "cpu - model = MyModel(...) + model = PyTorchModel(...) optimizer = torch.optim.SGD(model.parameters()) + model, optimizer = fabric.setup(model, optimizer) - dataloader = DataLoader(MyDataset(...), ...) + dataloader = DataLoader(PyTorchDataset(...), ...) 
+ dataloader = fabric.setup_dataloaders(dataloader) model.train() for epoch in range(num_epochs): for batch in dataloader: - - batch.to(device) + input, target = batch + - input, target = input.to(device), target.to(device) optimizer.zero_grad() - loss = model(batch) + output = model(input) + loss = loss_fn(output, target) - loss.backward() + fabric.backward(loss) optimizer.step() @@ -85,3 +87,41 @@ All steps combined, this is how your code will change: That's it! You can now train on any device at any scale with a switch of a flag. Check out our before-and-after example for `image classification `_ and many more :ref:`examples ` that use Fabric. + +********** +Next steps +********** + +.. raw:: html + +
+
+ +.. displayitem:: + :header: Examples + :description: See examples across computer vision, NLP, RL, etc. + :col_css: col-md-4 + :button_link: ../fabric.html#examples + :height: 150 + :tag: basic + +.. displayitem:: + :header: Accelerators + :description: Take advantage of your hardware with a switch of a flag + :button_link: accelerators.html + :col_css: col-md-4 + :height: 150 + :tag: intermediate + +.. displayitem:: + :header: Build your own Trainer + :description: Learn how to build a trainer tailored for you + :col_css: col-md-4 + :button_link: ../fabric.html#build-your-own-trainer + :height: 150 + :tag: intermediate + +.. raw:: html + +
+
diff --git a/docs/source-pytorch/fabric/fundamentals/launch.rst b/docs/source-pytorch/fabric/fundamentals/launch.rst index 2e9fa0e7b17e7..043f2c7f6571a 100644 --- a/docs/source-pytorch/fabric/fundamentals/launch.rst +++ b/docs/source-pytorch/fabric/fundamentals/launch.rst @@ -10,7 +10,7 @@ To run your code distributed across many devices and/or across many machines, yo 2. Launch your code in multiple processes ----------- +---- ******************* @@ -105,7 +105,7 @@ Or `DeepSpeed Zero3 `_ w --precision=16 ----------- +---- ******************* @@ -134,7 +134,7 @@ In the command line, you run this like any other Python script: python train.py ----------- +---- ************************ diff --git a/docs/source-pytorch/fabric/fundamentals/precision.rst b/docs/source-pytorch/fabric/fundamentals/precision.rst new file mode 100644 index 0000000000000..65827760733d8 --- /dev/null +++ b/docs/source-pytorch/fabric/fundamentals/precision.rst @@ -0,0 +1,140 @@ +:orphan: + +################################ +Save memory with mixed precision +################################ + + +************************ +What is Mixed Precision? +************************ + +PyTorch, like most deep learning frameworks, trains on 32-bit floating-point (FP32) arithmetic by default. +However, many deep learning models do not require this to reach complete accuracy. +By conducting operations in half-precision format while keeping minimum information in single-precision to maintain as much information as possible in crucial areas of the network, mixed precision training delivers significant computational speedup. +Switching to mixed precision has resulted in considerable training speedups since the introduction of Tensor Cores in the Volta and Turing architectures. +It combines FP32 and lower-bit floating-points (such as FP16) to reduce memory footprint and increase performance during model training and evaluation. 
+It accomplishes this by recognizing the steps that require complete accuracy and employing a 32-bit floating-point for those steps only, while using a 16-bit floating-point for the rest. +When compared to complete precision training, mixed precision training delivers all of these benefits while ensuring that no task-specific accuracy is lost [`1 `_]. + +This is how you select the precision in Fabric: + +.. code-block:: + + from lightning.fabric import Fabric + + # This is the default + fabric = Fabric(precision=32) + + # FP16 mixed precision + fabric = Fabric(precision=16) + + # Precision values can also be set as a string + fabric = Fabric(precision="16") + + # BFloat16 precision (Volta GPUs and later) + fabric = Fabric(precision="bf16") + + # Double precision + fabric = Fabric(precision=64) + + +The same values can also be set through the :doc:`command line interface `: + +.. code-block:: bash + + lightning run model train.py --precision=bf16 + + +.. note:: + + In some cases, it is essential to remain in FP32 for numerical stability, so keep this in mind when using mixed precision. + For example, when running scatter operations during the forward (such as torchpoint3d), computation must remain in FP32. + + +---- + + +******************** +FP16 Mixed Precision +******************** + +In most cases, mixed precision uses FP16. +Supported `PyTorch operations `_ automatically run in FP16, saving memory and improving throughput on the supported accelerators. +Since computation happens in FP16, there is a chance of numerical instability during training. +This is handled internally by a dynamic grad scaler which skips invalid steps and adjusts the scaler to ensure subsequent steps fall within a finite range. +For more information `see the autocast docs `_. + +This is how you enable FP16 in Fabric: + +.. code-block:: + + # Select FP16 mixed precision + fabric = Fabric(precision=16) + + # Or as a string + fabric = Fabric(precision="16") + +.. 
note:: + + When using TPUs, setting ``precision=16`` will enable bfloat16, the only supported half-precision type on TPUs. + + +---- + + +************************ +BFloat16 Mixed Precision +************************ + +BFloat16 Mixed precision is similar to FP16 mixed precision, however, it maintains more of the "dynamic range" that FP32 offers. +This means it is able to provide better numerical stability than FP16 mixed precision. +For more information, see `this TPU performance blogpost `_. + +.. code-block:: + + # Select BF16 precision + fabric = Fabric(precision="bf16") + + +Under the hood, we use `torch.autocast `__ with the dtype set to ``bfloat16``, with no gradient scaling. +It is also possible to use BFloat16 mixed precision on the CPU, relying on MKLDNN. + +.. note:: + + BFloat16 is also experimental and may not provide significant speedups or memory improvements, offering better numerical stability. + Do note for GPUs, the most significant benefits require `Ampere `_ based GPUs, such as A100s or 3090s. + + +---- + + +************************************ +Control where precision gets applied +************************************ + +Fabric automatically casts the data type and operations in the ``forward`` of your model: + +.. code-block:: + + fabric = Fabric(precision="bf16") + + model = ... + optimizer = ... + + # Here, Fabric sets up the `model.forward` for precision auto-casting + model, optimizer = fabric.setup(model, optimizer) + + # Precision casting gets handled in your forward, no code changes required + output = model.forward(input) + + # Precision does NOT get applied here (only in forward) + loss = loss_function(output, target) + +If you want to enable operations in lower bit-precision **outside** your model's ``forward()``, you can use the :meth:`~lightning_fabric.fabric.Fabric.autocast` context manager: + ..
code-block:: + + # Precision now gets handled also in this part of the code: + with fabric.autocast(): + loss = loss_function(output, target) diff --git a/docs/source-pytorch/fabric/guide/callbacks.rst b/docs/source-pytorch/fabric/guide/callbacks.rst index 006828daa026f..e964c240ba849 100644 --- a/docs/source-pytorch/fabric/guide/callbacks.rst +++ b/docs/source-pytorch/fabric/guide/callbacks.rst @@ -4,4 +4,111 @@ Callbacks ######### -Coming soon. +Callbacks enable you, or the users of your code, to add new behavior to the training loop without the need to modify the source code in-place. + + +---- + + +************************************* +Add a callback interface to your loop +************************************* + +Suppose we want to enable anyone to run some arbitrary code at the end of a training iteration. +Here is how that gets done in Fabric: + +.. code-block:: python + :caption: my_callbacks.py + + class MyCallback: + def on_train_batch_end(self, loss, output): + # Here, put any code you want to run at the end of a training step + ... + + +.. code-block:: python + :caption: train.py + :emphasize-lines: 4,7,18 + + from lightning.fabric import Fabric + + # The code of a callback can live anywhere, away from the training loop + from my_callbacks import MyCallback + + # Add one or several callbacks: + fabric = Fabric(callbacks=[MyCallback()]) + + ... + + for iteration, batch in enumerate(train_dataloader): + ... + fabric.backward(loss) + optimizer.step() + + # Let a callback add some arbitrary processing at the appropriate place + # Give the callback access to some variables + fabric.call("on_train_batch_end", loss=loss, output=...) + + +As you can see, the code inside the callback method is completely decoupled from the trainer code. +This enables flexibility in extending the loop in arbitrary ways. + +**Exercise**: Implement a callback that computes and prints the time to complete an iteration.
+ + +---- + + +****************** +Multiple callbacks +****************** + +The callback system is designed to easily run multiple callbacks at the same time. +You can simply pass a list to Fabric: + +.. code-block:: python + + # Add multiple callback implementations in a list + callback1 = LearningRateMonitor() + callback2 = Profiler() + fabric = Fabric(callbacks=[callback1, callback2]) + + # Let Fabric call the implementations (if they exist) + fabric.call("any_callback_method", arg1=..., arg2=...) + + # fabric.call is the same as doing this + callback1.any_callback_method(arg1=..., arg2=...) + callback2.any_callback_method(arg1=..., arg2=...) + + +The :meth:`~lightning_fabric.fabric.Fabric.call` simply takes care of calling the callback objects in the order they are given to Fabric. +Not all objects registered via ``Fabric(callbacks=...)`` must implement a method with the given name. +The ones that have a matching method name will get called. + + +---- + + +********** +Next steps +********** + +Callbacks are a powerful tool to build a Trainer. Learn how in our comprehensive guide. + +.. raw:: html + +
+
+ +.. displayitem:: + :header: Template Trainer + :description: Coming soon + :button_link: guide/trainer_template.html + :col_css: col-md-4 + :height: 150 + :tag: intermediate + +.. raw:: html + +
+
diff --git a/docs/source-pytorch/fabric/guide/lightning_module.rst b/docs/source-pytorch/fabric/guide/lightning_module.rst index 5c7479c1a5860..dc0f4e956c4a7 100644 --- a/docs/source-pytorch/fabric/guide/lightning_module.rst +++ b/docs/source-pytorch/fabric/guide/lightning_module.rst @@ -12,7 +12,7 @@ This is what the :doc:`LightningModule <../../common/lightning_module>` was made Here is how you can neatly separate the research code (model, loss, optimization, etc.) from the "trainer" code (training loop, checkpointing, logging, etc.). ----------- +---- ************************************************* @@ -65,7 +65,7 @@ Take these main incredients and put them in a LightningModule: This is a minimal LightningModule, but there are :doc:`many other useful hooks <../../common/lightning_module>` you can use. ----------- +---- **************************************** diff --git a/docs/source-pytorch/fabric/guide/logging.rst b/docs/source-pytorch/fabric/guide/logging.rst index 45dfe48d13181..b1b7b8df3a42a 100644 --- a/docs/source-pytorch/fabric/guide/logging.rst +++ b/docs/source-pytorch/fabric/guide/logging.rst @@ -1,7 +1,126 @@ :orphan: -####### -Logging -####### +############################### +Track and Visualize Experiments +############################### -Coming soon. +******************************* +Why do I need to track metrics? +******************************* + +In model development, we track values of interest such as the *validation_loss* to visualize the learning process for our models. +Model development is like driving a car without windows, charts and logs provide the *windows* to know where to drive the car. + +With Lightning, you can visualize virtually anything you can think of: numbers, text, images, audio. + +---- + +************* +Track metrics +************* + +Metric visualization is the most basic but powerful way of understanding how your model is doing throughout the model development process. 
+To track a metric, add the following: + +**Step 1:** Pick a logger. + +.. code-block:: python + + from lightning.fabric import Fabric + from lightning.fabric.loggers import TensorBoardLogger + + # Pick a logger and add it to Fabric + logger = TensorBoardLogger(root_dir="logs") + fabric = Fabric(loggers=logger) + + +Built-in loggers you can choose from: + +- :class:`~lightning_fabric.loggers.TensorBoardLogger` +- :class:`~lightning_fabric.loggers.CSVLogger` + +| + +**Step 2:** Add :meth:`~lightning_fabric.fabric.Fabric.log` calls in your code. + +.. code-block:: python + + value = ... # Python scalar or tensor scalar + fabric.log("some_value", value) + + +To log multiple metrics at once, use :meth:`~lightning_fabric.fabric.Fabric.log_dict`: + +.. code-block:: python + + values = {"loss": loss, "acc": acc, "other": other} + fabric.log_dict(values) + + +---- + + +******************* +View logs dashboard +******************* + +How you can view the metrics depends on the individual logger you choose. +Most of them have a dashboard that lets you browse everything you log in real time. + +For the :class:`~lightning_fabric.loggers.tensorboard.TensorBoardLogger` shown above, you can open it by running + +.. code-block:: bash + + tensorboard --logdir=./logs + +If you're using a notebook environment such as *Google Colab* or *Kaggle* or *Jupyter*, launch TensorBoard with this command + +.. code-block:: bash + + %reload_ext tensorboard + %tensorboard --logdir=./logs + + +---- + + +************************* +Control logging frequency +************************* + +Logging a metric in every iteration can slow down training. +Reduce the added overhead by logging less frequently: + +.. code-block:: python + :emphasize-lines: 3 + + for iteration in range(num_iterations): + + if iteration % log_every_n_steps == 0: + value = ... 
+ fabric.log("some_value", value) + + +---- + + +******************** +Use multiple loggers +******************** + +You can add as many loggers as you want without changing the logging code in your loop. + +.. code-block:: python + :emphasize-lines: 8 + + from lightning.fabric import Fabric + from lightning.fabric.loggers import CSVLogger, TensorBoardLogger + + tb_logger = TensorBoardLogger(root_dir="logs/tensorboard") + csv_logger = CSVLogger(root_dir="logs/csv") + + # Add multiple loggers into a list + fabric = Fabric(loggers=[tb_logger, csv_logger]) + + # Calling .log() or .log_dict() always logs to all loggers simultaneously + fabric.log("some_value", value) diff --git a/examples/fabric/dcgan/README.md b/examples/fabric/dcgan/README.md index beefe02785494..cc488faa6214f 100644 --- a/examples/fabric/dcgan/README.md +++ b/examples/fabric/dcgan/README.md @@ -8,7 +8,7 @@ The second one is using [Lightning Fabric](https://pytorch-lightning.readthedocs Tip: You can easily inspect the difference between the two files with: ```bash - sdiff train_torch.py train_fabric.py +sdiff train_torch.py train_fabric.py ``` | Real | Generated | @@ -19,13 +19,13 @@ Tip: You can easily inspect the difference between the two files with: **Raw PyTorch:** -```commandline +```bash python train_torch.py ``` **Accelerated using Lightning Fabric:** -```commandline +```bash python train_fabric.py ``` diff --git a/examples/fabric/image_classifier/README.md b/examples/fabric/image_classifier/README.md index d002b2c24bb31..7debc4c3f9a1a 100644 --- a/examples/fabric/image_classifier/README.md +++ b/examples/fabric/image_classifier/README.md @@ -7,7 +7,7 @@ The second one is using [Lightning Fabric](https://pytorch-lightning.readthedocs Tip: You can easily inspect the difference between the two files with: ```bash - sdiff train_torch.py train_fabric.py +sdiff train_torch.py train_fabric.py ``` #### 1. 
Image Classifier with Vanilla PyTorch diff --git a/examples/fabric/meta_learning/README.md b/examples/fabric/meta_learning/README.md new file mode 100644 index 0000000000000..fae766a0de993 --- /dev/null +++ b/examples/fabric/meta_learning/README.md @@ -0,0 +1,43 @@ +## Meta-Learning - MAML + +This is an example of a meta-learning algorithm called [MAML](https://arxiv.org/abs/1703.03400), trained on the +[Omniglot dataset](https://paperswithcode.com/dataset/omniglot-1) of handwritten characters from different alphabets. + +The goal of meta-learning in this context is to learn a 'meta'-model trained on many different tasks, such that it can quickly adapt to a new task when trained with very few samples (few-shot learning). +If you are new to meta-learning, have a look at this short [introduction video](https://www.youtube.com/watch?v=ItPEBdD6VMk). + +We show two code versions: +The first one is implemented in raw PyTorch, but it contains quite a bit of boilerplate code for distributed training. +The second one is using [Lightning Fabric](https://pytorch-lightning.readthedocs.io/en/latest/fabric/fabric.html) to accelerate and scale the model. 
+ +Tip: You can easily inspect the difference between the two files with: + +```bash +sdiff train_torch.py train_fabric.py +``` + +### Requirements + +```bash +pip install lightning learn2learn cherry-rl 'gym<=0.22' +``` + +### Run + +**Raw PyTorch:** + +```bash +torchrun --nproc_per_node=2 --standalone train_torch.py +``` + +**Accelerated using Lightning Fabric:** + +```bash +lightning run model train_fabric.py --devices 2 --strategy ddp --accelerator cpu +``` + +### References + +- [MAML explained in 7 minutes](https://www.youtube.com/watch?v=ItPEBdD6VMk) +- [Learn2Learn Resources](http://learn2learn.net/examples/vision/#maml) +- [MAML Paper](https://arxiv.org/abs/1703.03400) diff --git a/examples/fabric/meta_learning/train_fabric.py b/examples/fabric/meta_learning/train_fabric.py new file mode 100644 index 0000000000000..24e747f56d047 --- /dev/null +++ b/examples/fabric/meta_learning/train_fabric.py @@ -0,0 +1,165 @@ +""" +MAML - Accelerated with Lightning Fabric + +Adapted from https://github.com/learnables/learn2learn/blob/master/examples/vision/distributed_maml.py +Original code author: Séb Arnold - learnables.net +Based on the paper: https://arxiv.org/abs/1703.03400 + +Requirements: +- lightning>=1.9.0 +- learn2learn +- cherry-rl +- gym<=0.22 + +Run it with: + lightning run model train_fabric.py --accelerator=cuda --devices=2 --strategy=ddp +""" +import cherry +import learn2learn as l2l +import numpy as np +import torch + +from lightning.fabric import Fabric, seed_everything + + +def accuracy(predictions, targets): + predictions = predictions.argmax(dim=1).view(targets.shape) + return (predictions == targets).sum().float() / targets.size(0) + + +def fast_adapt(batch, learner, loss, adaptation_steps, shots, ways): + data, labels = batch + + # Separate data into adaptation/evaluation sets + adaptation_indices = np.zeros(data.size(0), dtype=bool) + adaptation_indices[np.arange(shots * ways) * 2] = True + evaluation_indices = torch.from_numpy(~adaptation_indices)
+ adaptation_indices = torch.from_numpy(adaptation_indices) + adaptation_data, adaptation_labels = data[adaptation_indices], labels[adaptation_indices] + evaluation_data, evaluation_labels = data[evaluation_indices], labels[evaluation_indices] + + # Adapt the model + for step in range(adaptation_steps): + train_error = loss(learner(adaptation_data), adaptation_labels) + learner.adapt(train_error) + + # Evaluate the adapted model + predictions = learner(evaluation_data) + valid_error = loss(predictions, evaluation_labels) + valid_accuracy = accuracy(predictions, evaluation_labels) + return valid_error, valid_accuracy + + +def main( + ways=5, + shots=5, + meta_lr=0.003, + fast_lr=0.5, + meta_batch_size=32, + adaptation_steps=1, + num_iterations=60000, + seed=42, +): + # Create the Fabric object + # Arguments get parsed from the command line, see `lightning run model --help` + fabric = Fabric() + + meta_batch_size = meta_batch_size // fabric.world_size + seed_everything(seed + fabric.global_rank) + + # Create Tasksets using the benchmark interface + tasksets = l2l.vision.benchmarks.get_tasksets( + # 'mini-imagenet' works too, but you need to download it manually due to license restrictions of ImageNet + "omniglot", + train_ways=ways, + train_samples=2 * shots, + test_ways=ways, + test_samples=2 * shots, + num_tasks=20000, + root="data", + ) + + # Create model + # model = l2l.vision.models.MiniImagenetCNN(ways) + model = l2l.vision.models.OmniglotFC(28**2, ways) + model = fabric.to_device(model) + maml = l2l.algorithms.MAML(model, lr=fast_lr, first_order=False) + optimizer = torch.optim.Adam(maml.parameters(), meta_lr) + optimizer = cherry.optim.Distributed(maml.parameters(), opt=optimizer, sync=1) + + # model, optimizer = fabric.setup(model, optimizer) + + optimizer.sync_parameters() + loss = torch.nn.CrossEntropyLoss(reduction="mean") + + for iteration in range(num_iterations): + optimizer.zero_grad() + meta_train_error = 0.0 + meta_train_accuracy = 0.0 + 
meta_valid_error = 0.0 + meta_valid_accuracy = 0.0 + for task in range(meta_batch_size): + # Compute meta-training loss + learner = maml.clone() + batch = fabric.to_device(tasksets.train.sample()) + evaluation_error, evaluation_accuracy = fast_adapt( + batch, + learner, + loss, + adaptation_steps, + shots, + ways, + ) + fabric.backward(evaluation_error) + meta_train_error += evaluation_error.item() + meta_train_accuracy += evaluation_accuracy.item() + + # Compute meta-validation loss + learner = maml.clone() + batch = fabric.to_device(tasksets.validation.sample()) + evaluation_error, evaluation_accuracy = fast_adapt( + batch, + learner, + loss, + adaptation_steps, + shots, + ways, + ) + meta_valid_error += evaluation_error.item() + meta_valid_accuracy += evaluation_accuracy.item() + + # Print some metrics + fabric.print("\n") + fabric.print("Iteration", iteration) + fabric.print("Meta Train Error", meta_train_error / meta_batch_size) + fabric.print("Meta Train Accuracy", meta_train_accuracy / meta_batch_size) + fabric.print("Meta Valid Error", meta_valid_error / meta_batch_size) + fabric.print("Meta Valid Accuracy", meta_valid_accuracy / meta_batch_size) + + # Average the accumulated gradients and optimize + for p in maml.parameters(): + p.grad.data.mul_(1.0 / meta_batch_size) + optimizer.step() # averages gradients across all workers + + meta_test_error = 0.0 + meta_test_accuracy = 0.0 + for task in range(meta_batch_size): + # Compute meta-testing loss + learner = maml.clone() + batch = fabric.to_device(tasksets.test.sample()) + evaluation_error, evaluation_accuracy = fast_adapt( + batch, + learner, + loss, + adaptation_steps, + shots, + ways, + ) + meta_test_error += evaluation_error.item() + meta_test_accuracy += evaluation_accuracy.item() + fabric.print("Meta Test Error", meta_test_error / meta_batch_size) + fabric.print("Meta Test Accuracy", meta_test_accuracy / meta_batch_size) + + +if __name__ == "__main__": + main() diff --git 
a/examples/fabric/meta_learning/train_torch.py b/examples/fabric/meta_learning/train_torch.py new file mode 100644 index 0000000000000..eadfe68d9887e --- /dev/null +++ b/examples/fabric/meta_learning/train_torch.py @@ -0,0 +1,183 @@ +""" +MAML - Raw PyTorch implementation using the Learn2Learn library + +Adapted from https://github.com/learnables/learn2learn/blob/master/examples/vision/distributed_maml.py +Original code author: Séb Arnold - learnables.net +Based on the paper: https://arxiv.org/abs/1703.03400 + +Requirements: +- learn2learn +- cherry-rl +- gym<=0.22 + +This code is written for distributed training. + +Run it with: + torchrun --nproc_per_node=2 --standalone train_torch.py +""" +import os +import random + +import cherry +import learn2learn as l2l +import numpy as np +import torch +import torch.distributed as dist + + +def accuracy(predictions, targets): + predictions = predictions.argmax(dim=1).view(targets.shape) + return (predictions == targets).sum().float() / targets.size(0) + + +def fast_adapt(batch, learner, loss, adaptation_steps, shots, ways, device): + data, labels = batch + data, labels = data.to(device), labels.to(device) + + # Separate data into adaptation/evaluation sets + adaptation_indices = np.zeros(data.size(0), dtype=bool) + adaptation_indices[np.arange(shots * ways) * 2] = True + evaluation_indices = torch.from_numpy(~adaptation_indices) + adaptation_indices = torch.from_numpy(adaptation_indices) + adaptation_data, adaptation_labels = data[adaptation_indices], labels[adaptation_indices] + evaluation_data, evaluation_labels = data[evaluation_indices], labels[evaluation_indices] + + # Adapt the model + for step in range(adaptation_steps): + train_error = loss(learner(adaptation_data), adaptation_labels) + learner.adapt(train_error) + + # Evaluate the adapted model + predictions = learner(evaluation_data) + valid_error = loss(predictions, evaluation_labels) + valid_accuracy = accuracy(predictions, evaluation_labels) + return
valid_error, valid_accuracy + + +def main( + ways=5, + shots=5, + meta_lr=0.003, + fast_lr=0.5, + meta_batch_size=32, + adaptation_steps=1, + num_iterations=60000, + cuda=True, + seed=42, +): + local_rank = int(os.environ["LOCAL_RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = "12345" + dist.init_process_group("gloo", rank=local_rank, world_size=world_size) + rank = dist.get_rank() + + meta_batch_size = meta_batch_size // world_size + seed = seed + rank + + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + device = torch.device("cpu") + if cuda and torch.cuda.device_count(): + torch.cuda.manual_seed(seed) + device_id = rank % torch.cuda.device_count() + device = torch.device("cuda:" + str(device_id)) + + # Create Tasksets using the benchmark interface + tasksets = l2l.vision.benchmarks.get_tasksets( + # 'mini-imagenet' works too, but you need to download it manually due to license restrictions of ImageNet + "omniglot", + train_ways=ways, + train_samples=2 * shots, + test_ways=ways, + test_samples=2 * shots, + num_tasks=20000, + root="data", + ) + + # Create model + # model = l2l.vision.models.MiniImagenetCNN(ways) + model = l2l.vision.models.OmniglotFC(28**2, ways) + model.to(device) + maml = l2l.algorithms.MAML(model, lr=fast_lr, first_order=False) + optimizer = torch.optim.Adam(maml.parameters(), meta_lr) + optimizer = cherry.optim.Distributed(maml.parameters(), opt=optimizer, sync=1) + optimizer.sync_parameters() + loss = torch.nn.CrossEntropyLoss(reduction="mean") + + for iteration in range(num_iterations): + optimizer.zero_grad() + meta_train_error = 0.0 + meta_train_accuracy = 0.0 + meta_valid_error = 0.0 + meta_valid_accuracy = 0.0 + for task in range(meta_batch_size): + # Compute meta-training loss + learner = maml.clone() + batch = tasksets.train.sample() + evaluation_error, evaluation_accuracy = fast_adapt( + batch, + learner, + loss, + adaptation_steps, + 
shots, + ways, + device, + ) + evaluation_error.backward() + meta_train_error += evaluation_error.item() + meta_train_accuracy += evaluation_accuracy.item() + + # Compute meta-validation loss + learner = maml.clone() + batch = tasksets.validation.sample() + evaluation_error, evaluation_accuracy = fast_adapt( + batch, + learner, + loss, + adaptation_steps, + shots, + ways, + device, + ) + meta_valid_error += evaluation_error.item() + meta_valid_accuracy += evaluation_accuracy.item() + + # Print some metrics + if rank == 0: + print("\n") + print("Iteration", iteration) + print("Meta Train Error", meta_train_error / meta_batch_size) + print("Meta Train Accuracy", meta_train_accuracy / meta_batch_size) + print("Meta Valid Error", meta_valid_error / meta_batch_size) + print("Meta Valid Accuracy", meta_valid_accuracy / meta_batch_size) + + # Average the accumulated gradients and optimize + for p in maml.parameters(): + p.grad.data.mul_(1.0 / meta_batch_size) + optimizer.step() # averages gradients across all workers + + meta_test_error = 0.0 + meta_test_accuracy = 0.0 + for task in range(meta_batch_size): + # Compute meta-testing loss + learner = maml.clone() + batch = tasksets.test.sample() + evaluation_error, evaluation_accuracy = fast_adapt( + batch, + learner, + loss, + adaptation_steps, + shots, + ways, + device, + ) + meta_test_error += evaluation_error.item() + meta_test_accuracy += evaluation_accuracy.item() + print("Meta Test Error", meta_test_error / meta_batch_size) + print("Meta Test Accuracy", meta_test_accuracy / meta_batch_size) + + +if __name__ == "__main__": + main() diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index ddc6f6121fe7d..dfb6627f87593 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
-## [unreleased] - 202Y-MM-DD +## [1.9.0] - 2023-01-12 ### Added @@ -13,26 +13,20 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed - - The LoadBalancer now uses internal ip + port instead of URL exposed ([#16119](https://github.com/Lightning-AI/lightning/pull/16119)) - - Added support for logging in different trainer stages with `DeviceStatsMonitor` ([#16002](https://github.com/Lightning-AI/lightning/pull/16002)) - - -### Deprecated - -- - - -### Removed - -- +- Made cluster creation/deletion async by default ([#16185](https://github.com/Lightning-AI/lightning/pull/16185)) ### Fixed - Fixed not being able to run multiple lightning apps locally due to port collision ([#15819](https://github.com/Lightning-AI/lightning/pull/15819)) +- Avoid `relpath` bug on Windows ([#16164](https://github.com/Lightning-AI/lightning/pull/16164)) +- Avoid using the deprecated `LooseVersion` ([#16162](https://github.com/Lightning-AI/lightning/pull/16162)) +- Porting fixes to autoscaler component ([#16249](https://github.com/Lightning-AI/lightning/pull/16249)) + +- Fixed a bug where `lightning login` with env variables would not correctly save the credentials ([#16339](https://github.com/Lightning-AI/lightning/pull/16339)) ## [1.8.6] - 2022-12-21 diff --git a/src/lightning_app/cli/cmd_clusters.py b/src/lightning_app/cli/cmd_clusters.py index b7e62f52a1846..35aefa57604fa 100644 --- a/src/lightning_app/cli/cmd_clusters.py +++ b/src/lightning_app/cli/cmd_clusters.py @@ -109,7 +109,7 @@ def create( region: str = "us-east-1", external_id: str = None, edit_before_creation: bool = False, - do_async: bool = False, + do_async: bool = True, ) -> None: """request Lightning AI BYOC compute cluster creation. 
@@ -192,7 +192,7 @@ def list(self) -> None: console = Console() console.print(clusters.as_table()) - def delete(self, cluster_id: str, force: bool = False, do_async: bool = False) -> None: + def delete(self, cluster_id: str, force: bool = False, do_async: bool = True) -> None: if force: click.echo( """ diff --git a/src/lightning_app/cli/lightning_cli_create.py b/src/lightning_app/cli/lightning_cli_create.py index 5182b1e69d5e2..45783918c9743 100644 --- a/src/lightning_app/cli/lightning_cli_create.py +++ b/src/lightning_app/cli/lightning_cli_create.py @@ -50,13 +50,13 @@ def create() -> None: help="Edit the cluster specs before submitting them to the API server.", ) @click.option( - "--async", - "do_async", + "--sync", + "do_sync", type=bool, required=False, default=False, is_flag=True, - help="This flag makes the CLI return immediately and lets the cluster creation happen in the background.", + help="This flag makes the CLI wait until cluster creation completes.", ) def create_cluster( cluster_id: str, @@ -66,7 +66,7 @@ def create_cluster( provider: str, edit_before_creation: bool, enable_performance: bool, - do_async: bool, + do_sync: bool, **kwargs: Any, ) -> None: """Create a Lightning AI BYOC compute cluster with your cloud provider credentials.""" @@ -81,7 +81,7 @@ def create_cluster( external_id=external_id, edit_before_creation=edit_before_creation, cost_savings=not enable_performance, - do_async=do_async, + do_async=not do_sync, ) diff --git a/src/lightning_app/cli/lightning_cli_delete.py b/src/lightning_app/cli/lightning_cli_delete.py index 1664022e51c15..91eeb95e90535 100644 --- a/src/lightning_app/cli/lightning_cli_delete.py +++ b/src/lightning_app/cli/lightning_cli_delete.py @@ -20,15 +20,15 @@ def delete() -> None: @delete.command("cluster") @click.argument("cluster", type=str) @click.option( - "--async", - "do_async", + "--sync", + "do_sync", type=bool, required=False, default=False, is_flag=True, - help="This flag makes the CLI return immediately and 
lets the cluster deletion happen in the background", + help="This flag makes the CLI wait until cluster deletion completes.", ) -def delete_cluster(cluster: str, force: bool = False, do_async: bool = False) -> None: +def delete_cluster(cluster: str, force: bool = False, do_sync: bool = False) -> None: """Delete a Lightning AI BYOC cluster and all associated cloud provider resources. Deleting a cluster also deletes all apps that were started on the cluster. @@ -44,7 +44,7 @@ def delete_cluster(cluster: str, force: bool = False, do_async: bool = False) -> VPC components, etc. are irreversibly deleted and cannot be recovered! """ cluster_manager = AWSClusterManager() - cluster_manager.delete(cluster_id=cluster, force=force, do_async=do_async) + cluster_manager.delete(cluster_id=cluster, force=force, do_async=not do_sync) def _find_cluster_for_user(app_name: str, cluster_id: Optional[str]) -> str: diff --git a/src/lightning_app/testing/helpers.py b/src/lightning_app/testing/helpers.py index cb21d314145ff..0d323789ab323 100644 --- a/src/lightning_app/testing/helpers.py +++ b/src/lightning_app/testing/helpers.py @@ -3,6 +3,7 @@ from queue import Empty from typing import List, Optional, Tuple +import pytest from packaging.version import Version from lightning_app import LightningFlow, LightningWork @@ -63,8 +64,6 @@ def __new__( cloud: bool = False, **kwargs, ): - import pytest - """ Args: *args: Any :class:`pytest.mark.skipif` arguments. 
diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index 408a9530c4679..d223a8ce07d57 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -261,9 +261,12 @@ def run_app_in_cloud( if url.endswith("/"): url = url[:-1] payload = {"apiKey": _Config.api_key, "username": _Config.username} - res = requests.post(url + "/v1/auth/login", data=json.dumps(payload)) + url_login = url + "/v1/auth/login" + res = requests.post(url_login, data=json.dumps(payload)) if "token" not in res.json(): - raise Exception("You haven't properly setup your environment variables.") + raise RuntimeError( + f"You haven't properly setup your environment variables with {url_login} and data: \n{payload}" + ) token = res.json()["token"] @@ -409,7 +412,7 @@ def wait_openapi(page: playwright.sync_api.Page, app_url: str) -> None: print("App is running, continuing with testing...") wait_openapi(view_page, app.status.url) break - elif app.status.phase != V1LightningappInstanceState.PENDING: + elif app.status.phase not in (V1LightningappInstanceState.PENDING, V1LightningappInstanceState.NOT_STARTED): # there's a race condition if the app goes from pending to running to something else before we evaluate # the condition above. 
avoid it by checking stopped explicitly print(f"App finished with phase {app.status.phase}, finished testing...") @@ -473,13 +476,13 @@ def wait_for(page, callback: Callable, *args, **kwargs) -> Any: res = callback(*args, **kwargs) if res: return res - except (playwright._impl._api_types.Error, playwright._impl._api_types.TimeoutError) as e: - print(e) + except (playwright._impl._api_types.Error, playwright._impl._api_types.TimeoutError) as err: + print(err) try: sleep(7) page.reload() - except (playwright._impl._api_types.Error, playwright._impl._api_types.TimeoutError) as e: - print(e) + except (playwright._impl._api_types.Error, playwright._impl._api_types.TimeoutError) as err: + print(err) pass sleep(3) diff --git a/src/lightning_app/utilities/login.py b/src/lightning_app/utilities/login.py index ff04e3b865d41..31087be9ee0a5 100644 --- a/src/lightning_app/utilities/login.py +++ b/src/lightning_app/utilities/login.py @@ -40,20 +40,6 @@ class Auth: secrets_file = pathlib.Path(LIGHTNING_CREDENTIAL_PATH) - def __post_init__(self): - for key in Keys: - setattr(self, key.suffix, os.environ.get(key.value, None)) - - self._with_env_var = bool(self.user_id and self.api_key) # used by authenticate method - if self._with_env_var: - self.save("", self.user_id, self.api_key, self.user_id) - logger.info("Credentials loaded from environment variables") - elif self.api_key or self.user_id: - raise ValueError( - "To use env vars for authentication both " - f"{Keys.USER_ID.value} and {Keys.API_KEY.value} should be set." - ) - def load(self) -> bool: """Load credentials from disk and update properties with credentials. 
@@ -88,13 +74,12 @@ def save(self, token: str = "", user_id: str = "", api_key: str = "", username: self.api_key = api_key logger.debug("credentials saved successfully") - @classmethod - def clear(cls) -> None: - """remove credentials from disk and env variables.""" - if cls.secrets_file.exists(): - cls.secrets_file.unlink() + def clear(self) -> None: + """Remove credentials from disk.""" + if self.secrets_file.exists(): + self.secrets_file.unlink() for key in Keys: - os.environ.pop(key.value, None) + setattr(self, key.suffix, None) logger.debug("credentials removed successfully") @property @@ -119,11 +104,21 @@ def authenticate(self) -> Optional[str]: ---------- authorization header to use when authentication completes. """ - if self._with_env_var: - logger.debug("successfully loaded credentials from env") - return self.auth_header - if not self.load(): + # First try to authenticate from env + for key in Keys: + setattr(self, key.suffix, os.environ.get(key.value, None)) + + if self.user_id and self.api_key: + self.save("", self.user_id, self.api_key, self.user_id) + logger.info("Credentials loaded from environment variables") + return self.auth_header + elif self.api_key or self.user_id: + raise ValueError( + "To use env vars for authentication both " + f"{Keys.USER_ID.value} and {Keys.API_KEY.value} should be set." + ) + logger.debug("failed to load credentials, opening browser to get new.") self._run_server() return self.auth_header diff --git a/src/lightning_fabric/CHANGELOG.md b/src/lightning_fabric/CHANGELOG.md index c720d2b3a9fe1..4abc53d0909f7 100644 --- a/src/lightning_fabric/CHANGELOG.md +++ b/src/lightning_fabric/CHANGELOG.md @@ -5,31 +5,17 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [1.9.0] - 202Y-MM-DD +## [1.9.0] - 2023-01-12 ### Added - - Added `Fabric.launch()` to programmatically launch processes (e.g. 
in Jupyter notebook) ([#14992](https://github.com/Lightning-AI/lightning/issues/14992)) - - - Added the option to launch Fabric scripts from the CLI, without the need to wrap the code into the `run` method ([#14992](https://github.com/Lightning-AI/lightning/issues/14992)) - - - Added `Fabric.setup_module()` and `Fabric.setup_optimizers()` to support strategies that need to set up the model before an optimizer can be created ([#15185](https://github.com/Lightning-AI/lightning/pull/15185)) - - - Added support for Fully Sharded Data Parallel (FSDP) training in Lightning Lite ([#14967](https://github.com/Lightning-AI/lightning/issues/14967)) - - - Added `lightning_fabric.accelerators.find_usable_cuda_devices` utility function ([#16147](https://github.com/PyTorchLightning/pytorch-lightning/pull/16147)) - - - Added basic support for LightningModules ([#16048](https://github.com/Lightning-AI/lightning/issues/16048)) - - - Added support for managing callbacks via `Fabric(callbacks=...)` and emitting events through `Fabric.call()` ([#16074](https://github.com/Lightning-AI/lightning/issues/16074)) - - Added Logger support ([#16121](https://github.com/Lightning-AI/lightning/issues/16121)) * Added `Fabric(loggers=...)` to support different Logger frameworks in Fabric * Added `Fabric.log` for logging scalars using multiple loggers @@ -37,32 +23,19 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
* Added `Fabric.loggers` and `Fabric.logger` attributes to access the individual logger instances * Added support for calling `self.log` and `self.log_dict` in a LightningModule when using Fabric * Added access to `self.logger` and `self.loggers` in a LightningModule when using Fabric - +- Added `lightning_fabric.loggers.TensorBoardLogger` ([#16121](https://github.com/Lightning-AI/lightning/issues/16121)) +- Added `lightning_fabric.loggers.CSVLogger` ([#16346](https://github.com/Lightning-AI/lightning/issues/16346)) - Added support for a consistent `.zero_grad(set_to_none=...)` on the wrapped optimizer regardless of which strategy is used ([#16275](https://github.com/Lightning-AI/lightning/issues/16275)) ### Changed - Renamed the class `LightningLite` to `Fabric` ([#15932](https://github.com/Lightning-AI/lightning/issues/15932), [#15938](https://github.com/Lightning-AI/lightning/issues/15938)) - - - The `Fabric.run()` method is no longer abstract ([#14992](https://github.com/Lightning-AI/lightning/issues/14992)) - - - The `XLAStrategy` now inherits from `ParallelStrategy` instead of `DDPSpawnStrategy` ([#15838](https://github.com/Lightning-AI/lightning/issues/15838)) - - - Merged the implementation of `DDPSpawnStrategy` into `DDPStrategy` and removed `DDPSpawnStrategy` ([#14952](https://github.com/Lightning-AI/lightning/issues/14952)) - - - The dataloader wrapper returned from `.setup_dataloaders()` now calls `.set_epoch()` on the distributed sampler if one is used ([#16101](https://github.com/Lightning-AI/lightning/issues/16101)) - -### Deprecated - -- - - ### Removed - Removed support for FairScale's sharded training (`strategy='ddp_sharded'|'ddp_sharded_spawn'`). Use Fully-Sharded Data Parallel instead (`strategy='fsdp'`) ([#16329](https://github.com/Lightning-AI/lightning/pull/16329)) @@ -70,6 +43,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed - Restored sampling parity between PyTorch and Fabric dataloaders when using the `DistributedSampler` ([#16101](https://github.com/Lightning-AI/lightning/issues/16101)) +- Fixes an issue where the error message wouldn't tell the user the real value that was passed through the CLI ([#16334](https://github.com/Lightning-AI/lightning/issues/16334)) ## [1.8.6] - 2022-12-21 diff --git a/src/lightning_fabric/connector.py b/src/lightning_fabric/connector.py index 208fb9f00dfd6..e5bda1faa168b 100644 --- a/src/lightning_fabric/connector.py +++ b/src/lightning_fabric/connector.py @@ -539,7 +539,7 @@ def _argument_from_env(name: str, current: Any, default: Any) -> Any: if env_value is not None and env_value != str(current) and str(current) != str(default): raise ValueError( f"Your code has `Fabric({name}={current!r}, ...)` but it conflicts with the value " - f"`--{name}={current}` set through the CLI. " + f"`--{name}={env_value}` set through the CLI. " " Remove it either from the CLI or from the Lightning Fabric object." ) if env_value is None: diff --git a/src/lightning_fabric/fabric.py b/src/lightning_fabric/fabric.py index a19f1aa6c28b1..07abeb7967bc6 100644 --- a/src/lightning_fabric/fabric.py +++ b/src/lightning_fabric/fabric.py @@ -70,8 +70,8 @@ class Fabric: or bfloat16 precision (``"bf16"``). plugins: One or several custom plugins callbacks: A single callback or a list of callbacks. A callback can contain any arbitrary methods that - can be invoked through :meth:`lightning_fabric.fabric.Fabric.call` by the user. - loggers: A single logger or a list of loggers. See :meth:`lightning_fabric.fabric.Fabric.log` for more + can be invoked through :meth:`~lightning_fabric.fabric.Fabric.call` by the user. + loggers: A single logger or a list of loggers. See :meth:`~lightning_fabric.fabric.Fabric.log` for more information. 
""" diff --git a/src/lightning_fabric/loggers/__init__.py b/src/lightning_fabric/loggers/__init__.py index 03c21d71f8304..9b23c5a1bcd56 100644 --- a/src/lightning_fabric/loggers/__init__.py +++ b/src/lightning_fabric/loggers/__init__.py @@ -10,5 +10,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from lightning_fabric.loggers.csv_logs import CSVLogger # noqa: F401 from lightning_fabric.loggers.logger import Logger # noqa: F401 from lightning_fabric.loggers.tensorboard import TensorBoardLogger # noqa: F401 diff --git a/src/lightning_fabric/loggers/csv_logs.py b/src/lightning_fabric/loggers/csv_logs.py new file mode 100644 index 0000000000000..aa32b1668fba3 --- /dev/null +++ b/src/lightning_fabric/loggers/csv_logs.py @@ -0,0 +1,219 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import csv +import logging +import os +from argparse import Namespace +from typing import Any, Dict, List, Optional, Union + +from torch import Tensor + +from lightning_fabric.loggers.logger import Logger, rank_zero_experiment +from lightning_fabric.utilities.logger import _add_prefix +from lightning_fabric.utilities.rank_zero import rank_zero_only, rank_zero_warn +from lightning_fabric.utilities.types import _PATH + +log = logging.getLogger(__name__) + + +class CSVLogger(Logger): + r""" + Log to the local file system in CSV format. + + Logs are saved to ``os.path.join(root_dir, name, version)``. + + Args: + root_dir: The root directory in which all your experiments with different names and versions will be stored. + name: Experiment name. Defaults to ``'lightning_logs'``. + version: Experiment version. If version is not specified the logger inspects the save + directory for existing versions, then automatically assigns the next available version. + prefix: A string to put at the beginning of metric keys. + flush_logs_every_n_steps: How often to flush logs to disk (defaults to every 100 steps). + + Example:: + + from lightning.fabric.loggers import CSVLogger + + logger = CSVLogger("path/to/logs/root", name="my_model") + logger.log_metrics({"loss": 0.235, "acc": 0.75}) + logger.finalize("success") + """ + + LOGGER_JOIN_CHAR = "-" + + def __init__( + self, + root_dir: _PATH, + name: str = "lightning_logs", + version: Optional[Union[int, str]] = None, + prefix: str = "", + flush_logs_every_n_steps: int = 100, + ): + super().__init__() + self._root_dir = os.fspath(root_dir) + self._name = name or "" + self._version = version + self._prefix = prefix + self._experiment: Optional[_ExperimentWriter] = None + self._flush_logs_every_n_steps = flush_logs_every_n_steps + + @property + def name(self) -> str: + """Gets the name of the experiment. + + Returns: + The name of the experiment. 
+ """ + return self._name + + @property + def version(self) -> Union[int, str]: + """Gets the version of the experiment. + + Returns: + The version of the experiment if it is specified, else the next version. + """ + if self._version is None: + self._version = self._get_next_version() + return self._version + + @property + def root_dir(self) -> str: + """Gets the save directory where the versioned CSV experiments are saved.""" + return self._root_dir + + @property + def log_dir(self) -> str: + """The log directory for this run. + + By default, it is named ``'version_${self.version}'`` but it can be overridden by passing a string value for the + constructor's version parameter instead of ``None`` or an int. + """ + # create a pseudo standard path + version = self.version if isinstance(self.version, str) else f"version_{self.version}" + log_dir = os.path.join(self.root_dir, self.name, version) + return log_dir + + @property + @rank_zero_experiment + def experiment(self) -> "_ExperimentWriter": + """Actual ExperimentWriter object. To use ExperimentWriter features anywhere in your code, do the + following. 
+ + Example:: + + self.logger.experiment.some_experiment_writer_function() + """ + if self._experiment is not None: + return self._experiment + + os.makedirs(self.root_dir, exist_ok=True) + self._experiment = _ExperimentWriter(log_dir=self.log_dir) + return self._experiment + + @rank_zero_only + def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: + raise NotImplementedError("The `CSVLogger` does not yet support logging hyperparameters.") + + @rank_zero_only + def log_metrics(self, metrics: Dict[str, Union[Tensor, float]], step: Optional[int] = None) -> None: + metrics = _add_prefix(metrics, self._prefix, self.LOGGER_JOIN_CHAR) + self.experiment.log_metrics(metrics, step) + if step is not None and (step + 1) % self._flush_logs_every_n_steps == 0: + self.save() + + @rank_zero_only + def save(self) -> None: + super().save() + self.experiment.save() + + @rank_zero_only + def finalize(self, status: str) -> None: + if self._experiment is None: + # When using multiprocessing, finalize() should be a no-op on the main process, as no experiment has been + # initialized there + return + self.save() + + def _get_next_version(self) -> int: + root_dir = self.root_dir + + if not os.path.isdir(root_dir): + log.warning("Missing logger folder: %s", root_dir) + return 0 + + existing_versions = [] + for d in os.listdir(root_dir): + if os.path.isdir(os.path.join(root_dir, d)) and d.startswith("version_"): + existing_versions.append(int(d.split("_")[1])) + + if len(existing_versions) == 0: + return 0 + + return max(existing_versions) + 1 + + +class _ExperimentWriter: + r""" + Experiment writer for CSVLogger. 
+ + Args: + log_dir: Directory for the experiment logs + """ + + NAME_METRICS_FILE = "metrics.csv" + + def __init__(self, log_dir: str) -> None: + self.metrics: List[Dict[str, float]] = [] + + self.log_dir = log_dir + if os.path.exists(self.log_dir) and os.listdir(self.log_dir): + rank_zero_warn( + f"Experiment logs directory {self.log_dir} exists and is not empty." + " Previous log files in this directory will be deleted when the new ones are saved!" + ) + os.makedirs(self.log_dir, exist_ok=True) + + self.metrics_file_path = os.path.join(self.log_dir, self.NAME_METRICS_FILE) + + def log_metrics(self, metrics_dict: Dict[str, float], step: Optional[int] = None) -> None: + """Record metrics.""" + + def _handle_value(value: Union[Tensor, Any]) -> Any: + if isinstance(value, Tensor): + return value.item() + return value + + if step is None: + step = len(self.metrics) + + metrics = {k: _handle_value(v) for k, v in metrics_dict.items()} + metrics["step"] = step + self.metrics.append(metrics) + + def save(self) -> None: + """Save recorded metrics into files.""" + if not self.metrics: + return + + last_m = {} + for m in self.metrics: + last_m.update(m) + metrics_keys = list(last_m.keys()) + + with open(self.metrics_file_path, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=metrics_keys) + writer.writeheader() + writer.writerows(self.metrics) diff --git a/src/lightning_fabric/loggers/tensorboard.py b/src/lightning_fabric/loggers/tensorboard.py index ca694d9ea30c5..8cd7cfe93045c 100644 --- a/src/lightning_fabric/loggers/tensorboard.py +++ b/src/lightning_fabric/loggers/tensorboard.py @@ -72,7 +72,7 @@ class TensorBoardLogger(Logger): from lightning.fabric.loggers import TensorBoardLogger - logger = TensorBoardLogger("path/to/logs/rot", name="my_model") + logger = TensorBoardLogger("path/to/logs/root", name="my_model") logger.log_hyperparams({"epochs": 5, "optimizer": "Adam"}) logger.log_metrics({"acc": 0.75}) logger.finalize("success") diff --git 
a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 2ad3f1faa1c88..9fcead41eaf1e 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -5,91 +5,44 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [unreleased] - 202Y-MM-DD +## [1.9.0] - 2023-01-12 ### Added - +- Added support for native logging of `MetricCollection` with enabled compute groups ([#15580](https://github.com/Lightning-AI/lightning/pull/15580)) - Added support for custom artifact names in `pl.loggers.WandbLogger` ([#16173](https://github.com/Lightning-AI/lightning/pull/16173)) - - Added support for DDP with `LRFinder` ([#15304](https://github.com/Lightning-AI/lightning/pull/15304)) - - - Added utilities to migrate checkpoints from one Lightning version to another ([#15237](https://github.com/Lightning-AI/lightning/pull/15237)) - - - Added support to upgrade all checkpoints in a folder using the `pl.utilities.upgrade_checkpoint` script ([#15333](https://github.com/Lightning-AI/lightning/pull/15333)) - - - Add an axes argument `ax` to the `.lr_find().plot()` to enable writing to a user-defined axes in a matplotlib figure ([#15652](https://github.com/Lightning-AI/lightning/pull/15652)) - - - Added `log_model` parameter to `MLFlowLogger` ([#9187](https://github.com/PyTorchLightning/pytorch-lightning/pull/9187)) - - - Added a check to validate that wrapped FSDP models are used while initializing optimizers ([#15301](https://github.com/Lightning-AI/lightning/pull/15301)) - - - Added a warning when `self.log(..., logger=True)` is called without a configured logger ([#15814](https://github.com/Lightning-AI/lightning/pull/15814)) - - - Added support for colossalai 0.1.11 ([#15888](https://github.com/Lightning-AI/lightning/pull/15888)) - - - Added `LightningCLI` support for optimizer and learning schedulers via callable type dependency 
injection ([#15869](https://github.com/Lightning-AI/lightning/pull/15869)) - - - Added support for activation checkpointing for the `DDPFullyShardedNativeStrategy` strategy ([#15826](https://github.com/Lightning-AI/lightning/pull/15826)) - - - Added the option to set `DDPFullyShardedNativeStrategy(cpu_offload=True|False)` via bool instead of needing to pass a configufation object ([#15832](https://github.com/Lightning-AI/lightning/pull/15832)) - - - Added info message for Ampere CUDA GPU users to enable tf32 matmul precision ([#16037](https://github.com/Lightning-AI/lightning/pull/16037)) - - - Added support for returning optimizer-like classes in `LightningModule.configure_optimizers` ([#16189](https://github.com/Lightning-AI/lightning/pull/16189)) ### Changed - Drop PyTorch 1.9 support ([#15347](https://github.com/Lightning-AI/lightning/pull/15347)) - - - Switch from `tensorboard` to `tensorboardx` in `TensorBoardLogger` ([#15728](https://github.com/Lightning-AI/lightning/pull/15728)) - - - From now on, Lightning Trainer and `LightningModule.load_from_checkpoint` automatically upgrade the loaded checkpoint if it was produced in an old version of Lightning ([#15237](https://github.com/Lightning-AI/lightning/pull/15237)) - - - `Trainer.{validate,test,predict}(ckpt_path=...)` no longer restores the `Trainer.global_step` and `trainer.current_epoch` value from the checkpoints - From now on, only `Trainer.fit` will restore this value ([#15532](https://github.com/Lightning-AI/lightning/pull/15532)) - - - The `ModelCheckpoint.save_on_train_epoch_end` attribute is now computed dynamically every epoch, accounting for changes to the validation dataloaders ([#15300](https://github.com/Lightning-AI/lightning/pull/15300)) - - - The Trainer now raises an error if it is given multiple stateful callbacks of the same time with colliding state keys ([#15634](https://github.com/Lightning-AI/lightning/pull/15634)) - - - `MLFlowLogger` now logs hyperparameters and metrics in batched 
API calls ([#15915](https://github.com/Lightning-AI/lightning/pull/15915)) - - - Overriding the `on_train_batch_{start,end}` hooks in conjunction with taking a `dataloader_iter` in the `training_step` no longer errors out and instead shows a warning ([#16062](https://github.com/Lightning-AI/lightning/pull/16062)) ### Deprecated - Deprecated `description`, `env_prefix` and `env_parse` parameters in `LightningCLI.__init__` in favour of giving them through `parser_kwargs` ([#15651](https://github.com/Lightning-AI/lightning/pull/15651)) - - - Deprecated `pytorch_lightning.profiler` in favor of `pytorch_lightning.profilers` ([#16059](https://github.com/PyTorchLightning/pytorch-lightning/pull/16059)) - - - Deprecated `Trainer(auto_select_gpus=...)` in favor of `pytorch_lightning.accelerators.find_usable_cuda_devices` ([#16147](https://github.com/PyTorchLightning/pytorch-lightning/pull/16147)) - - - Deprecated `pytorch_lightning.tuner.auto_gpu_select.{pick_single_gpu,pick_multiple_gpus}` in favor of `pytorch_lightning.accelerators.find_usable_cuda_devices` ([#16147](https://github.com/PyTorchLightning/pytorch-lightning/pull/16147)) - - - `nvidia/apex` deprecation ([#16039](https://github.com/PyTorchLightning/pytorch-lightning/pull/16039)) * Deprecated `pytorch_lightning.plugins.NativeMixedPrecisionPlugin` in favor of `pytorch_lightning.plugins.MixedPrecisionPlugin` * Deprecated the `LightningModule.optimizer_step(using_native_amp=...)` argument @@ -99,74 +52,39 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
* Deprecated the `pytorch_lightning.plugins.ApexMixedPrecisionPlugin` class * Deprecates the `pytorch_lightning.utilities.enum.sAMPType` enum * Deprecates the `DeepSpeedPrecisionPlugin(amp_type=..., amp_level=...)` arguments - - `horovod` deprecation ([#16141](https://github.com/PyTorchLightning/pytorch-lightning/pull/16141)) * Deprecated `Trainer(strategy="horovod")` * Deprecated the `HorovodStrategy` class - - - Deprecated `pytorch_lightning.lite.LightningLite` in favor of `lightning.fabric.Fabric` ([#16314](https://github.com/Lightning-AI/lightning/pull/16314)) ### Removed - Removed deprecated `pytorch_lightning.utilities.memory.get_gpu_memory_map` in favor of `pytorch_lightning.accelerators.cuda.get_nvidia_gpu_stats` ([#15617](https://github.com/Lightning-AI/lightning/pull/15617)) - - - Temporarily removed support for Hydra multi-run ([#15737](https://github.com/Lightning-AI/lightning/pull/15737)) - - - Removed deprecated `pytorch_lightning.profiler.base.AbstractProfiler` in favor of `pytorch_lightning.profilers.profiler.Profiler` ([#15637](https://github.com/Lightning-AI/lightning/pull/15637)) - - - Removed deprecated `pytorch_lightning.profiler.base.BaseProfiler` in favor of `pytorch_lightning.profilers.profiler.Profiler` ([#15637](https://github.com/Lightning-AI/lightning/pull/15637)) - - - Removed deprecated code in `pytorch_lightning.utilities.meta` ([#16038](https://github.com/Lightning-AI/lightning/pull/16038)) - - - Removed the deprecated `LightningDeepSpeedModule` ([#16041](https://github.com/Lightning-AI/lightning/pull/16041)) - - - Removed the deprecated `pytorch_lightning.accelerators.GPUAccelerator` in favor of `pytorch_lightning.accelerators.CUDAAccelerator` ([#16050](https://github.com/Lightning-AI/lightning/pull/16050)) - - - Removed the deprecated `pytorch_lightning.profiler.*` classes in favor of `pytorch_lightning.profilers` ([#16059](https://github.com/PyTorchLightning/pytorch-lightning/pull/16059)) - - - Removed the deprecated 
`pytorch_lightning.utilities.cli` module in favor of `pytorch_lightning.cli` ([#16116](https://github.com/PyTorchLightning/pytorch-lightning/pull/16116)) - - - Removed the deprecated `pytorch_lightning.loggers.base` module in favor of `pytorch_lightning.loggers.logger` ([#16120](https://github.com/PyTorchLightning/pytorch-lightning/pull/16120)) - - - Removed the deprecated `pytorch_lightning.loops.base` module in favor of `pytorch_lightning.loops.loop` ([#16142](https://github.com/PyTorchLightning/pytorch-lightning/pull/16142)) - - - Removed the deprecated `pytorch_lightning.core.lightning` module in favor of `pytorch_lightning.core.module` ([#16318](https://github.com/PyTorchLightning/pytorch-lightning/pull/16318)) - - - Removed the deprecated `pytorch_lightning.callbacks.base` module in favor of `pytorch_lightning.callbacks.callback` ([#16319](https://github.com/PyTorchLightning/pytorch-lightning/pull/16319)) - - - Removed the deprecated `Trainer.reset_train_val_dataloaders()` in favor of `Trainer.reset_{train,val}_dataloader` ([#16131](https://github.com/Lightning-AI/lightning/pull/16131)) - - - Removed support for `LightningCLI(seed_everything_default=None)` ([#16131](https://github.com/Lightning-AI/lightning/pull/16131)) - - - Removed support in LightningLite for FairScale's sharded training (`strategy='ddp_sharded'|'ddp_sharded_spawn'`). 
Use Fully-Sharded Data Parallel instead (`strategy='fsdp'`) ([#16329](https://github.com/Lightning-AI/lightning/pull/16329)) ### Fixed - Enhanced `reduce_boolean_decision` to accommodate `any`-analogous semantics expected by the `EarlyStopping` callback ([#15253](https://github.com/Lightning-AI/lightning/pull/15253)) - - - Fixed the incorrect optimizer step synchronization when running across multiple TPU devices ([#16020](https://github.com/Lightning-AI/lightning/pull/16020)) - - - Fixed a type error when dividing the chunk size in the ColossalAI strategy ([#16212](https://github.com/Lightning-AI/lightning/pull/16212)) +- Fixed bug where the ``interval`` key of the scheduler would be ignored during manual optimization, making the LearningRateMonitor callback fail to log the learning rate ([#16308](https://github.com/Lightning-AI/lightning/pull/16308)) +- Fixed an issue with `MLFlowLogger` not finalizing correctly when status code 'finished' was passed ([#16340](https://github.com/Lightning-AI/lightning/pull/16340)) ## [1.8.6] - 2022-12-21 diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index 9ca92e6d4b15b..2055d8ae8757c 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -28,7 +28,7 @@ from torch import ScriptModule, Tensor from torch.nn import Module from torch.optim.optimizer import Optimizer -from torchmetrics import Metric +from torchmetrics import Metric, MetricCollection from typing_extensions import Literal import lightning_fabric as lf @@ -50,7 +50,7 @@ from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import _FxValidator from pytorch_lightning.utilities import GradClipAlgorithmType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_13 +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_13, _TORCHMETRICS_GREATER_EQUAL_0_9_1 from 
pytorch_lightning.utilities.rank_zero import rank_zero_debug, rank_zero_warn, WarningCache from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature from pytorch_lightning.utilities.types import ( @@ -534,7 +534,8 @@ def log_dict( Args: dictionary: key value pairs. - The values can be a ``float``, ``Tensor``, ``Metric``, or a dictionary of the former. + The values can be a ``float``, ``Tensor``, ``Metric``, a dictionary of the former + or a ``MetricCollection``. prog_bar: if ``True`` logs to the progress base. logger: if ``True`` logs to the logger. on_step: if ``True`` logs at this step. @@ -560,7 +561,15 @@ def log_dict( """ if self._fabric is not None: return self._log_dict_through_fabric(dictionary=dictionary, logger=logger) - for k, v in dictionary.items(): + + kwargs: Dict[str, bool] = {} + + if isinstance(dictionary, MetricCollection): + kwargs["keep_base"] = False + if _TORCHMETRICS_GREATER_EQUAL_0_9_1 and dictionary._enable_compute_groups: + kwargs["copy_state"] = False + + for k, v in dictionary.items(**kwargs): self.log( name=k, value=v, diff --git a/src/pytorch_lightning/core/optimizer.py b/src/pytorch_lightning/core/optimizer.py index 732ddd8b7cc7f..430c92ee62de8 100644 --- a/src/pytorch_lightning/core/optimizer.py +++ b/src/pytorch_lightning/core/optimizer.py @@ -322,7 +322,9 @@ def _configure_schedulers_manual_opt(schedulers: list) -> List[LRSchedulerConfig lr_scheduler_configs = [] for scheduler in schedulers: if isinstance(scheduler, dict): - invalid_keys = {"interval", "frequency", "reduce_on_plateau", "monitor", "strict"} + # interval is not in this list even though the user needs to manually call the scheduler because + # the `LearningRateMonitor` callback needs to check its value to know when to log the learning rate + invalid_keys = {"frequency", "reduce_on_plateau", "monitor", "strict"} keys_to_warn = [k for k in scheduler.keys() if k in invalid_keys] if keys_to_warn: diff --git 
a/src/pytorch_lightning/loggers/csv_logs.py b/src/pytorch_lightning/loggers/csv_logs.py index a22c0d6ad81b9..d35ab12cc7f5a 100644 --- a/src/pytorch_lightning/loggers/csv_logs.py +++ b/src/pytorch_lightning/loggers/csv_logs.py @@ -18,24 +18,24 @@ CSV logger for basic experiment logging that does not require opening ports """ -import csv import logging import os from argparse import Namespace -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, Optional, Union -from torch import Tensor - -from lightning_fabric.utilities.logger import _add_prefix, _convert_params +from lightning_fabric.loggers.csv_logs import _ExperimentWriter as _FabricExperimentWriter +from lightning_fabric.loggers.csv_logs import CSVLogger as FabricCSVLogger +from lightning_fabric.loggers.logger import rank_zero_experiment +from lightning_fabric.utilities.logger import _convert_params from lightning_fabric.utilities.types import _PATH from pytorch_lightning.core.saving import save_hparams_to_yaml -from pytorch_lightning.loggers.logger import Logger, rank_zero_experiment -from pytorch_lightning.utilities.rank_zero import rank_zero_only, rank_zero_warn +from pytorch_lightning.loggers.logger import Logger +from pytorch_lightning.utilities.rank_zero import rank_zero_only log = logging.getLogger(__name__) -class ExperimentWriter: +class ExperimentWriter(_FabricExperimentWriter): r""" Experiment writer for CSVLogger. @@ -47,61 +47,23 @@ class ExperimentWriter: """ NAME_HPARAMS_FILE = "hparams.yaml" - NAME_METRICS_FILE = "metrics.csv" def __init__(self, log_dir: str) -> None: + super().__init__(log_dir=log_dir) self.hparams: Dict[str, Any] = {} - self.metrics: List[Dict[str, float]] = [] - - self.log_dir = log_dir - if os.path.exists(self.log_dir) and os.listdir(self.log_dir): - rank_zero_warn( - f"Experiment logs directory {self.log_dir} exists and is not empty." - " Previous log files in this directory will be deleted when the new ones are saved!" 
- ) - os.makedirs(self.log_dir, exist_ok=True) - - self.metrics_file_path = os.path.join(self.log_dir, self.NAME_METRICS_FILE) def log_hparams(self, params: Dict[str, Any]) -> None: """Record hparams.""" self.hparams.update(params) - def log_metrics(self, metrics_dict: Dict[str, float], step: Optional[int] = None) -> None: - """Record metrics.""" - - def _handle_value(value: Union[Tensor, Any]) -> Any: - if isinstance(value, Tensor): - return value.item() - return value - - if step is None: - step = len(self.metrics) - - metrics = {k: _handle_value(v) for k, v in metrics_dict.items()} - metrics["step"] = step - self.metrics.append(metrics) - def save(self) -> None: """Save recorded hparams and metrics into files.""" hparams_file = os.path.join(self.log_dir, self.NAME_HPARAMS_FILE) save_hparams_to_yaml(hparams_file, self.hparams) + return super().save() - if not self.metrics: - return - - last_m = {} - for m in self.metrics: - last_m.update(m) - metrics_keys = list(last_m.keys()) - - with open(self.metrics_file_path, "w", newline="") as f: - writer = csv.DictWriter(f, fieldnames=metrics_keys) - writer.writeheader() - writer.writerows(self.metrics) - -class CSVLogger(Logger): +class CSVLogger(Logger, FabricCSVLogger): r""" Log to local file system in yaml and CSV format. @@ -115,7 +77,7 @@ class CSVLogger(Logger): Args: save_dir: Save directory - name: Experiment name. Defaults to ``'default'``. + name: Experiment name. Defaults to ``'lightning_logs'``. version: Experiment version. If version is not specified the logger inspects the save directory for existing versions, then automatically assigns the next available version. prefix: A string to put at the beginning of metric keys. 
@@ -132,13 +94,14 @@ def __init__( prefix: str = "", flush_logs_every_n_steps: int = 100, ): - super().__init__() + super().__init__( + root_dir=save_dir, + name=name, + version=version, + prefix=prefix, + flush_logs_every_n_steps=flush_logs_every_n_steps, + ) self._save_dir = os.fspath(save_dir) - self._name = name or "" - self._version = version - self._prefix = prefix - self._experiment: Optional[ExperimentWriter] = None - self._flush_logs_every_n_steps = flush_logs_every_n_steps @property def root_dir(self) -> str: @@ -170,12 +133,17 @@ def save_dir(self) -> str: """ return self._save_dir + @rank_zero_only + def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: + params = _convert_params(params) + self.experiment.log_hparams(params) + @property @rank_zero_experiment - def experiment(self) -> ExperimentWriter: + def experiment(self) -> _FabricExperimentWriter: r""" - Actual ExperimentWriter object. To use ExperimentWriter features in your + Actual _ExperimentWriter object. To use _ExperimentWriter features in your :class:`~pytorch_lightning.core.module.LightningModule` do the following. 
Example:: @@ -189,65 +157,3 @@ def experiment(self) -> ExperimentWriter: os.makedirs(self.root_dir, exist_ok=True) self._experiment = ExperimentWriter(log_dir=self.log_dir) return self._experiment - - @rank_zero_only - def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: - params = _convert_params(params) - self.experiment.log_hparams(params) - - @rank_zero_only - def log_metrics(self, metrics: Dict[str, Union[Tensor, float]], step: Optional[int] = None) -> None: - metrics = _add_prefix(metrics, self._prefix, self.LOGGER_JOIN_CHAR) - self.experiment.log_metrics(metrics, step) - if step is not None and (step + 1) % self._flush_logs_every_n_steps == 0: - self.save() - - @rank_zero_only - def save(self) -> None: - super().save() - self.experiment.save() - - @rank_zero_only - def finalize(self, status: str) -> None: - if self._experiment is None: - # When using multiprocessing, finalize() should be a no-op on the main process, as no experiment has been - # initialized there - return - self.save() - - @property - def name(self) -> str: - """Gets the name of the experiment. - - Returns: - The name of the experiment. - """ - return self._name - - @property - def version(self) -> Union[int, str]: - """Gets the version of the experiment. - - Returns: - The version of the experiment if it is specified, else the next version. 
- """ - if self._version is None: - self._version = self._get_next_version() - return self._version - - def _get_next_version(self) -> int: - root_dir = self.root_dir - - if not os.path.isdir(root_dir): - log.warning("Missing logger folder: %s", root_dir) - return 0 - - existing_versions = [] - for d in os.listdir(root_dir): - if os.path.isdir(os.path.join(root_dir, d)) and d.startswith("version_"): - existing_versions.append(int(d.split("_")[1])) - - if len(existing_versions) == 0: - return 0 - - return max(existing_versions) + 1 diff --git a/src/pytorch_lightning/loggers/mlflow.py b/src/pytorch_lightning/loggers/mlflow.py index 87f0d707e111b..bbed562283326 100644 --- a/src/pytorch_lightning/loggers/mlflow.py +++ b/src/pytorch_lightning/loggers/mlflow.py @@ -284,6 +284,8 @@ def finalize(self, status: str = "success") -> None: status = "FINISHED" elif status == "failed": status = "FAILED" + elif status == "finished": + status = "FINISHED" # log checkpoints as artifacts if self._checkpoint_callback: diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index f43f9fb96fce7..2e74ff45fe105 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -69,6 +69,7 @@ HorovodStrategy, HPUParallelStrategy, IPUStrategy, + ParallelStrategy, SingleDeviceStrategy, SingleHPUStrategy, SingleTPUStrategy, @@ -284,6 +285,20 @@ def _check_config_and_set_final_flags( f" Available names are: {', '.join(self._accelerator_types)}." ) + # MPS accelerator is incompatible with DDP family of strategies. It supports single-device operation only. 
+ is_ddp_str = isinstance(strategy, str) and "ddp" in strategy + is_dp_str = isinstance(strategy, str) and "dp" in strategy + is_deepspeed_str = isinstance(strategy, str) and "deepspeed" in strategy + is_parallel_strategy = isinstance(strategy, ParallelStrategy) or is_ddp_str or is_dp_str or is_deepspeed_str + is_mps_accelerator = MPSAccelerator.is_available() and ( + accelerator in ("mps", "auto", "gpu", None) or isinstance(accelerator, MPSAccelerator) + ) + if is_mps_accelerator and is_parallel_strategy: + raise ValueError( + f"You set `strategy={strategy}` but strategies from the DDP family are not supported on the" + f" MPS accelerator. Either explicitly set `accelerator='cpu'` or change the strategy." + ) + self._accelerator_flag = accelerator supported_precision = get_args(_PRECISION_INPUT_STR) + get_args(_PRECISION_INPUT_INT) diff --git a/src/pytorch_lightning/utilities/imports.py b/src/pytorch_lightning/utilities/imports.py index 183251175bad0..95def583f7dfd 100644 --- a/src/pytorch_lightning/utilities/imports.py +++ b/src/pytorch_lightning/utilities/imports.py @@ -23,6 +23,7 @@ _TORCH_LESSER_EQUAL_1_10_2 = compare_version("torch", operator.le, "1.10.2") # duplicated from fabric because HPU is patching it below _TORCH_GREATER_EQUAL_1_13 = compare_version("torch", operator.ge, "1.13.0") +_TORCHMETRICS_GREATER_EQUAL_0_9_1 = RequirementCache("torchmetrics>=0.9.1") _HABANA_FRAMEWORK_AVAILABLE = package_available("habana_frameworks") _HIVEMIND_AVAILABLE = package_available("hivemind") diff --git a/tests/integrations_app/__init__.py b/tests/integrations_app/__init__.py new file mode 100644 index 0000000000000..a3c9eb29e7220 --- /dev/null +++ b/tests/integrations_app/__init__.py @@ -0,0 +1,3 @@ +from os.path import dirname + +_PATH_TESTS_DIR = dirname(dirname(__file__)) diff --git a/tests/integrations_app_examples/apps/collect_failures/__init__.py b/tests/integrations_app/apps/collect_failures/__init__.py similarity index 100% rename from 
tests/integrations_app_examples/apps/collect_failures/__init__.py rename to tests/integrations_app/apps/collect_failures/__init__.py diff --git a/tests/integrations_app_examples/apps/collect_failures/app.py b/tests/integrations_app/apps/collect_failures/app.py similarity index 100% rename from tests/integrations_app_examples/apps/collect_failures/app.py rename to tests/integrations_app/apps/collect_failures/app.py diff --git a/tests/integrations_app_examples/apps/collect_failures/requirements.txt b/tests/integrations_app/apps/collect_failures/requirements.txt similarity index 100% rename from tests/integrations_app_examples/apps/collect_failures/requirements.txt rename to tests/integrations_app/apps/collect_failures/requirements.txt diff --git a/tests/integrations_app_examples/apps/core_features_app/__init__.py b/tests/integrations_app/apps/core_features_app/__init__.py similarity index 100% rename from tests/integrations_app_examples/apps/core_features_app/__init__.py rename to tests/integrations_app/apps/core_features_app/__init__.py diff --git a/tests/integrations_app_examples/apps/core_features_app/app.py b/tests/integrations_app/apps/core_features_app/app.py similarity index 76% rename from tests/integrations_app_examples/apps/core_features_app/app.py rename to tests/integrations_app/apps/core_features_app/app.py index b23a9d74600ac..9214e852c45db 100644 --- a/tests/integrations_app_examples/apps/core_features_app/app.py +++ b/tests/integrations_app/apps/core_features_app/app.py @@ -8,7 +8,7 @@ def __init__(self): super().__init__() def run(self): - # these env vars are set here: tests/integrations_app_examples/test_core_features_app.py:15 + # these env vars are set here: tests/integrations_app/test_core_features_app.py:15 assert os.getenv("FOO", "") == "bar" assert os.getenv("BLA", "") == "bloz" self._exit() diff --git a/tests/integrations_app_examples/apps/custom_work_dependencies/__init__.py b/tests/integrations_app/apps/custom_work_dependencies/__init__.py 
similarity index 100% rename from tests/integrations_app_examples/apps/custom_work_dependencies/__init__.py rename to tests/integrations_app/apps/custom_work_dependencies/__init__.py diff --git a/tests/integrations_app_examples/apps/custom_work_dependencies/app.py b/tests/integrations_app/apps/custom_work_dependencies/app.py similarity index 100% rename from tests/integrations_app_examples/apps/custom_work_dependencies/app.py rename to tests/integrations_app/apps/custom_work_dependencies/app.py diff --git a/tests/integrations_app_examples/apps/idle_timeout/__init__.py b/tests/integrations_app/apps/idle_timeout/__init__.py similarity index 100% rename from tests/integrations_app_examples/apps/idle_timeout/__init__.py rename to tests/integrations_app/apps/idle_timeout/__init__.py diff --git a/tests/integrations_app_examples/apps/idle_timeout/app.py b/tests/integrations_app/apps/idle_timeout/app.py similarity index 100% rename from tests/integrations_app_examples/apps/idle_timeout/app.py rename to tests/integrations_app/apps/idle_timeout/app.py diff --git a/tests/integrations_app_examples/conftest.py b/tests/integrations_app/conftest.py similarity index 98% rename from tests/integrations_app_examples/conftest.py rename to tests/integrations_app/conftest.py index f4a2be0147fbb..fa90fcdaa26f1 100644 --- a/tests/integrations_app_examples/conftest.py +++ b/tests/integrations_app/conftest.py @@ -5,7 +5,7 @@ import psutil import pytest -from integrations_app_examples.public import _PATH_EXAMPLES +from integrations_app.public import _PATH_EXAMPLES from lightning_app.storage.path import _storage_root_dir from lightning_app.utilities.component import _set_context diff --git a/tests/integrations_app/flagship/__init__.py b/tests/integrations_app/flagship/__init__.py new file mode 100644 index 0000000000000..03b5efcea4e1d --- /dev/null +++ b/tests/integrations_app/flagship/__init__.py @@ -0,0 +1,5 @@ +import os.path + +from integrations_app import _PATH_TESTS_DIR + 
+_PATH_INTEGRATIONS_DIR = os.path.join(_PATH_TESTS_DIR, "_flagships") diff --git a/tests/integrations_app/flagship/test_flashy.py b/tests/integrations_app/flagship/test_flashy.py new file mode 100644 index 0000000000000..2217794b1f442 --- /dev/null +++ b/tests/integrations_app/flagship/test_flashy.py @@ -0,0 +1,79 @@ +import os +from time import sleep + +import pytest +from integrations_app.flagship import _PATH_INTEGRATIONS_DIR + +from lightning_app.testing.testing import run_app_in_cloud +from lightning_app.utilities.imports import _is_playwright_available + +if _is_playwright_available(): + import playwright + from playwright.sync_api import expect, Page + + +# TODO: when this function is moved to the app itself we can just import it, so to keep better aligned +def validate_app_functionalities(app_page: "Page") -> None: + """Validate the page after app starts. + + this is direct copy-paste of validation living in the app repository: + https://github.com/Lightning-AI/LAI-Flashy-App/blob/main/tests/test_app_gallery.py#L205 + + app_page: The UI page of the app to be validated. 
+ """ + + while True: + try: + app_page.reload() + sleep(5) + app_label = app_page.frame_locator("iframe").locator("text=Choose your AI task") + app_label.wait_for(timeout=30 * 1000) + break + except ( + playwright._impl._api_types.Error, + playwright._impl._api_types.TimeoutError, + ): + pass + + input_field = app_page.frame_locator("iframe").locator('input:below(:text("Data URL"))').first + input_field.wait_for(timeout=1000) + input_field.type("https://pl-flash-data.s3.amazonaws.com/hymenoptera_data.zip") + sleep(1) + upload_btn = app_page.frame_locator("iframe").locator('button:has-text("Upload")') + upload_btn.wait_for(timeout=1000) + upload_btn.click() + + sleep(10) + + train_folder_dropdown = app_page.frame_locator("iframe").locator("#mui-2") + train_folder_dropdown.click() + + train_folder = app_page.frame_locator("iframe").locator('text="hymenoptera_data/train"') + train_folder.scroll_into_view_if_needed() + train_folder.click() + + val_folder_dropdown = app_page.frame_locator("iframe").locator("#mui-3") + val_folder_dropdown.click() + + val_folder = app_page.frame_locator("iframe").locator('text="hymenoptera_data/val"') + val_folder.scroll_into_view_if_needed() + val_folder.click() + + train_btn = app_page.frame_locator("iframe").locator('button:has-text("Start training!")') + train_btn.click() + + # Sometimes the results don't show until we refresh the page + sleep(10) + + app_page.reload() + + app_page.frame_locator("iframe").locator('button:has-text("RESULTS")').click() + runs = app_page.frame_locator("iframe").locator("table tbody tr") + expect(runs).to_have_count(1, timeout=120000) + + +@pytest.mark.cloud +def test_app_cloud() -> None: + with run_app_in_cloud(os.path.join(_PATH_INTEGRATIONS_DIR, "flashy")) as (admin_page, view_page, fetch_logs, _): + + validate_app_functionalities(view_page) diff --git a/tests/integrations_app_examples/local/__init__.py b/tests/integrations_app/local/__init__.py similarity index 100% rename from 
tests/integrations_app_examples/local/__init__.py rename to tests/integrations_app/local/__init__.py diff --git a/tests/integrations_app_examples/local/test_collect_failures.py b/tests/integrations_app/local/test_collect_failures.py similarity index 95% rename from tests/integrations_app_examples/local/test_collect_failures.py rename to tests/integrations_app/local/test_collect_failures.py index 765af4ab0db31..152e36d740d23 100644 --- a/tests/integrations_app_examples/local/test_collect_failures.py +++ b/tests/integrations_app/local/test_collect_failures.py @@ -2,7 +2,7 @@ from time import sleep import pytest -from integrations_app_examples.local import _PATH_APPS +from integrations_app.local import _PATH_APPS from lightning_app.testing.testing import run_app_in_cloud diff --git a/tests/integrations_app_examples/local/test_core_features_app.py b/tests/integrations_app/local/test_core_features_app.py similarity index 91% rename from tests/integrations_app_examples/local/test_core_features_app.py rename to tests/integrations_app/local/test_core_features_app.py index b6f61aa75d686..0c5be71ad671f 100644 --- a/tests/integrations_app_examples/local/test_core_features_app.py +++ b/tests/integrations_app/local/test_core_features_app.py @@ -1,7 +1,7 @@ import os from click.testing import CliRunner -from integrations_app_examples.local import _PATH_APPS +from integrations_app.local import _PATH_APPS from lightning_app.cli.lightning_cli import run_app diff --git a/tests/integrations_app_examples/local/test_custom_work_dependencies.py b/tests/integrations_app/local/test_custom_work_dependencies.py similarity index 91% rename from tests/integrations_app_examples/local/test_custom_work_dependencies.py rename to tests/integrations_app/local/test_custom_work_dependencies.py index 2d32b6cce5f76..63e4b33128ce2 100644 --- a/tests/integrations_app_examples/local/test_custom_work_dependencies.py +++ b/tests/integrations_app/local/test_custom_work_dependencies.py @@ -2,7 +2,7 @@ from 
time import sleep import pytest -from integrations_app_examples.local import _PATH_APPS +from integrations_app.local import _PATH_APPS from lightning_app.testing.testing import run_app_in_cloud diff --git a/tests/integrations_app_examples/local/test_idle_timeout.py b/tests/integrations_app/local/test_idle_timeout.py similarity index 90% rename from tests/integrations_app_examples/local/test_idle_timeout.py rename to tests/integrations_app/local/test_idle_timeout.py index 894dce8706b66..7b5882694a9ec 100644 --- a/tests/integrations_app_examples/local/test_idle_timeout.py +++ b/tests/integrations_app/local/test_idle_timeout.py @@ -2,7 +2,7 @@ from time import sleep import pytest -from integrations_app_examples.local import _PATH_APPS +from integrations_app.local import _PATH_APPS from lightning_app.testing.testing import run_app_in_cloud diff --git a/tests/integrations_app_examples/public/__init__.py b/tests/integrations_app/public/__init__.py similarity index 100% rename from tests/integrations_app_examples/public/__init__.py rename to tests/integrations_app/public/__init__.py diff --git a/tests/integrations_app_examples/public/test_app_dag.py b/tests/integrations_app/public/test_app_dag.py similarity index 91% rename from tests/integrations_app_examples/public/test_app_dag.py rename to tests/integrations_app/public/test_app_dag.py index f9e87b8062ca5..59558cd23e891 100644 --- a/tests/integrations_app_examples/public/test_app_dag.py +++ b/tests/integrations_app/public/test_app_dag.py @@ -2,7 +2,7 @@ from time import sleep import pytest -from integrations_app_examples.public import _PATH_EXAMPLES +from integrations_app.public import _PATH_EXAMPLES from lightning_app.testing.testing import run_app_in_cloud diff --git a/tests/integrations_app_examples/public/test_argparse.py b/tests/integrations_app/public/test_argparse.py similarity index 96% rename from tests/integrations_app_examples/public/test_argparse.py rename to tests/integrations_app/public/test_argparse.py 
index c844c93899d9e..80c50c3b4a940 100644 --- a/tests/integrations_app_examples/public/test_argparse.py +++ b/tests/integrations_app/public/test_argparse.py @@ -1,7 +1,7 @@ import os import sys -from integrations_app_examples.public import _PATH_EXAMPLES +from integrations_app.public import _PATH_EXAMPLES from lightning_app.testing.testing import application_testing from lightning_app.utilities.load_app import _patch_sys_argv diff --git a/tests/integrations_app_examples/public/test_boring_app.py b/tests/integrations_app/public/test_boring_app.py similarity index 94% rename from tests/integrations_app_examples/public/test_boring_app.py rename to tests/integrations_app/public/test_boring_app.py index ac5ca577bdb23..553d6e716cfc8 100644 --- a/tests/integrations_app_examples/public/test_boring_app.py +++ b/tests/integrations_app/public/test_boring_app.py @@ -2,7 +2,7 @@ import pytest from click.testing import CliRunner -from integrations_app_examples.public import _PATH_EXAMPLES +from integrations_app.public import _PATH_EXAMPLES from lightning_app.cli.lightning_cli import show from lightning_app.testing.testing import run_app_in_cloud, wait_for diff --git a/tests/integrations_app_examples/public/test_commands_and_api.py b/tests/integrations_app/public/test_commands_and_api.py similarity index 97% rename from tests/integrations_app_examples/public/test_commands_and_api.py rename to tests/integrations_app/public/test_commands_and_api.py index 0e13a0b1fcf19..a9554ed449b86 100644 --- a/tests/integrations_app_examples/public/test_commands_and_api.py +++ b/tests/integrations_app/public/test_commands_and_api.py @@ -4,7 +4,7 @@ import pytest import requests -from integrations_app_examples.public import _PATH_EXAMPLES +from integrations_app.public import _PATH_EXAMPLES from lightning_app.testing.testing import run_app_in_cloud from lightning_app.utilities.cloud import _get_project diff --git a/tests/integrations_app_examples/public/test_drive.py 
b/tests/integrations_app/public/test_drive.py similarity index 89% rename from tests/integrations_app_examples/public/test_drive.py rename to tests/integrations_app/public/test_drive.py index 20cec272dbf1f..038b9a1fd5ed0 100644 --- a/tests/integrations_app_examples/public/test_drive.py +++ b/tests/integrations_app/public/test_drive.py @@ -2,7 +2,7 @@ from time import sleep import pytest -from integrations_app_examples.public import _PATH_EXAMPLES +from integrations_app.public import _PATH_EXAMPLES from lightning_app.testing.testing import run_app_in_cloud diff --git a/tests/integrations_app_examples/public/test_gradio.py b/tests/integrations_app/public/test_gradio.py similarity index 100% rename from tests/integrations_app_examples/public/test_gradio.py rename to tests/integrations_app/public/test_gradio.py diff --git a/tests/integrations_app_examples/public/test_installation_commands_app.py b/tests/integrations_app/public/test_installation_commands_app.py similarity index 91% rename from tests/integrations_app_examples/public/test_installation_commands_app.py rename to tests/integrations_app/public/test_installation_commands_app.py index 48e17c850dd29..4aea5500a1258 100644 --- a/tests/integrations_app_examples/public/test_installation_commands_app.py +++ b/tests/integrations_app/public/test_installation_commands_app.py @@ -1,7 +1,7 @@ import os import pytest -from integrations_app_examples.public import _PATH_EXAMPLES +from integrations_app.public import _PATH_EXAMPLES from lightning_app.testing.testing import run_app_in_cloud diff --git a/tests/integrations_app_examples/public/test_layout.py b/tests/integrations_app/public/test_layout.py similarity index 89% rename from tests/integrations_app_examples/public/test_layout.py rename to tests/integrations_app/public/test_layout.py index 7ab99c2fa9e1c..735c27fc8565c 100644 --- a/tests/integrations_app_examples/public/test_layout.py +++ b/tests/integrations_app/public/test_layout.py @@ -1,7 +1,7 @@ import os from 
click.testing import CliRunner -from integrations_app_examples.public import _PATH_EXAMPLES +from integrations_app.public import _PATH_EXAMPLES from lightning_app.cli.lightning_cli import run_app diff --git a/tests/integrations_app_examples/public/test_multi_node.py b/tests/integrations_app/public/test_multi_node.py similarity index 96% rename from tests/integrations_app_examples/public/test_multi_node.py rename to tests/integrations_app/public/test_multi_node.py index 6e1380c3a9b3b..0512c67921d29 100644 --- a/tests/integrations_app_examples/public/test_multi_node.py +++ b/tests/integrations_app/public/test_multi_node.py @@ -2,7 +2,7 @@ from unittest import mock import pytest -from integrations_app_examples.public import _PATH_EXAMPLES +from integrations_app.public import _PATH_EXAMPLES from lightning_utilities.core.imports import package_available from lightning_app.testing.helpers import _RunIf diff --git a/tests/integrations_app_examples/public/test_payload.py b/tests/integrations_app/public/test_payload.py similarity index 88% rename from tests/integrations_app_examples/public/test_payload.py rename to tests/integrations_app/public/test_payload.py index a3c5c0a4d788c..dbe24875c990a 100644 --- a/tests/integrations_app_examples/public/test_payload.py +++ b/tests/integrations_app/public/test_payload.py @@ -2,7 +2,7 @@ from time import sleep import pytest -from integrations_app_examples.public import _PATH_EXAMPLES +from integrations_app.public import _PATH_EXAMPLES from lightning_app.testing.testing import run_app_in_cloud diff --git a/tests/integrations_app_examples/public/test_pickle_or_not.py b/tests/integrations_app/public/test_pickle_or_not.py similarity index 91% rename from tests/integrations_app_examples/public/test_pickle_or_not.py rename to tests/integrations_app/public/test_pickle_or_not.py index 7fc77aa601725..1ecd856c00f99 100644 --- a/tests/integrations_app_examples/public/test_pickle_or_not.py +++ 
b/tests/integrations_app/public/test_pickle_or_not.py @@ -2,7 +2,7 @@ import pytest from click.testing import CliRunner -from integrations_app_examples.public import _PATH_EXAMPLES +from integrations_app.public import _PATH_EXAMPLES from lightning_app.cli.lightning_cli import run_app diff --git a/tests/integrations_app_examples/public/test_quick_start.py b/tests/integrations_app/public/test_quick_start.py similarity index 97% rename from tests/integrations_app_examples/public/test_quick_start.py rename to tests/integrations_app/public/test_quick_start.py index 32ce5e02f2fcb..a8319bc4cd53d 100644 --- a/tests/integrations_app_examples/public/test_quick_start.py +++ b/tests/integrations_app/public/test_quick_start.py @@ -4,7 +4,7 @@ import pytest from click.testing import CliRunner -from integrations_app_examples.public import _PATH_EXAMPLES +from integrations_app.public import _PATH_EXAMPLES from lightning_app import LightningApp from lightning_app.cli.lightning_cli import run_app diff --git a/tests/integrations_app_examples/public/test_scripts.py b/tests/integrations_app/public/test_scripts.py similarity index 94% rename from tests/integrations_app_examples/public/test_scripts.py rename to tests/integrations_app/public/test_scripts.py index ca18622c50eb4..93ad5b1548ae8 100644 --- a/tests/integrations_app_examples/public/test_scripts.py +++ b/tests/integrations_app/public/test_scripts.py @@ -2,7 +2,7 @@ import pytest from click.testing import CliRunner -from integrations_app_examples.public import _PATH_EXAMPLES +from integrations_app.public import _PATH_EXAMPLES from lightning_app.cli.lightning_cli import run_app from lightning_app.testing.helpers import _run_script, _RunIf diff --git a/tests/integrations_app_examples/public/test_template_react_ui.py b/tests/integrations_app/public/test_template_react_ui.py similarity index 94% rename from tests/integrations_app_examples/public/test_template_react_ui.py rename to 
tests/integrations_app/public/test_template_react_ui.py index 74d244d5c2c49..6ca0be497cb70 100644 --- a/tests/integrations_app_examples/public/test_template_react_ui.py +++ b/tests/integrations_app/public/test_template_react_ui.py @@ -2,7 +2,7 @@ from time import sleep import pytest -from integrations_app_examples.public import _PATH_EXAMPLES +from integrations_app.public import _PATH_EXAMPLES from lightning_app.testing.testing import run_app_in_cloud, wait_for diff --git a/tests/integrations_app_examples/public/test_template_streamlit_ui.py b/tests/integrations_app/public/test_template_streamlit_ui.py similarity index 94% rename from tests/integrations_app_examples/public/test_template_streamlit_ui.py rename to tests/integrations_app/public/test_template_streamlit_ui.py index fd53caec8f66c..fbccae894e521 100644 --- a/tests/integrations_app_examples/public/test_template_streamlit_ui.py +++ b/tests/integrations_app/public/test_template_streamlit_ui.py @@ -2,7 +2,7 @@ from time import sleep import pytest -from integrations_app_examples.public import _PATH_EXAMPLES +from integrations_app.public import _PATH_EXAMPLES from lightning_app.testing.testing import run_app_in_cloud, wait_for diff --git a/tests/integrations_app_examples/public/test_v0_app.py b/tests/integrations_app/public/test_v0_app.py similarity index 98% rename from tests/integrations_app_examples/public/test_v0_app.py rename to tests/integrations_app/public/test_v0_app.py index 6b5f365412f63..865e3514bf394 100644 --- a/tests/integrations_app_examples/public/test_v0_app.py +++ b/tests/integrations_app/public/test_v0_app.py @@ -5,7 +5,7 @@ from unittest.mock import MagicMock import pytest -from integrations_app_examples.public import _PATH_EXAMPLES +from integrations_app.public import _PATH_EXAMPLES from lightning_app import LightningApp from lightning_app.runners import CloudRuntime diff --git a/tests/integrations_app_examples/__init__.py b/tests/integrations_app_examples/__init__.py deleted file mode 
100644 index e69de29bb2d1d..0000000000000 diff --git a/tests/tests_app/cli/test_cli.py b/tests/tests_app/cli/test_cli.py index b33db38532785..ce01260200203 100644 --- a/tests/tests_app/cli/test_cli.py +++ b/tests/tests_app/cli/test_cli.py @@ -88,6 +88,7 @@ def test_create_cluster(create_command: mock.MagicMock): "dummy", "--role-arn", "arn:aws:iam::1234567890:role/lai-byoc", + "--sync", ], ) @@ -124,7 +125,7 @@ def test_list_clusters(list_command: mock.MagicMock): @mock.patch("lightning_app.cli.cmd_clusters.AWSClusterManager.delete") def test_delete_cluster(delete: mock.MagicMock): runner = CliRunner() - runner.invoke(delete_cluster, ["test-7"]) + runner.invoke(delete_cluster, ["test-7", "--sync"]) delete.assert_called_once_with(cluster_id="test-7", force=False, do_async=False) diff --git a/tests/tests_app/utilities/test_login.py b/tests/tests_app/utilities/test_login.py index e0ad4b110c868..08e65454cc09e 100644 --- a/tests/tests_app/utilities/test_login.py +++ b/tests/tests_app/utilities/test_login.py @@ -11,7 +11,9 @@ @pytest.fixture(autouse=True) def before_each(): - login.Auth.clear() + for key in login.Keys: + os.environ.pop(key.value, None) + login.Auth().clear() class TestAuthentication: @@ -25,7 +27,6 @@ def test_can_store_credentials(self): def test_e2e(self): auth = login.Auth() - assert auth._with_env_var is False auth.save(username="superman", user_id="kr-1234") assert auth.secrets_file.exists() @@ -46,6 +47,9 @@ def test_auth_header(self): os.environ.setdefault("LIGHTNING_USER_ID", "7c8455e3-7c5f-4697-8a6d-105971d6b9bd") os.environ.setdefault("LIGHTNING_API_KEY", "e63fae57-2b50-498b-bc46-d6204cbf330e") auth = login.Auth() + auth.clear() + auth.authenticate() + assert "Basic" in auth.auth_header assert ( auth.auth_header @@ -57,7 +61,9 @@ def test_authentication_with_invalid_environment_vars(): # if api key is passed without user id os.environ.setdefault("LIGHTNING_API_KEY", "123") with pytest.raises(ValueError): - login.Auth() + auth = login.Auth() + 
auth.clear() + auth.authenticate() @mock.patch("lightning_app.utilities.login.AuthServer.login_with_browser") @@ -66,13 +72,19 @@ def test_authentication_with_environment_vars(browser_login: mock.MagicMock): os.environ.setdefault("LIGHTNING_API_KEY", "abc") auth = login.Auth() + auth.clear() + auth.authenticate() + assert auth.user_id == "abc" assert auth.auth_header == "Basic YWJjOmFiYw==" - assert auth._with_env_var is True assert auth.authenticate() == auth.auth_header # should not run login flow when env vars are passed browser_login.assert_not_called() + # Check credentials file + assert auth.secrets_file.exists() + assert auth.load() is True + def test_get_auth_url(): auth_url = login.AuthServer().get_auth_url(1234) @@ -103,13 +115,16 @@ def test_login_with_browser( def test_authenticate(click_launch: mock.MagicMock, head: mock.MagicMock, run: mock.MagicMock, port: mock.MagicMock): port.return_value = 1234 auth = login.Auth() - auth.user_id = "user_id" - auth.api_key = "api_key" + auth.clear() + + click_launch.side_effect = lambda _: auth.save("", "user_id", "api_key", "user_id") + auth.authenticate() url = f"{LIGHTNING_CLOUD_URL}/sign-in?redirectTo=http%3A%2F%2Flocalhost%3A1234%2Flogin-complete" # E501 head.assert_called_with(url) click_launch.assert_called_with(url) run.assert_called() + assert auth.auth_header == "Basic dXNlcl9pZDphcGlfa2V5" auth.authenticate() diff --git a/tests/tests_fabric/loggers/test_csv.py b/tests/tests_fabric/loggers/test_csv.py new file mode 100644 index 0000000000000..004b7bb3f4313 --- /dev/null +++ b/tests/tests_fabric/loggers/test_csv.py @@ -0,0 +1,96 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from unittest.mock import MagicMock + +import pytest +import torch + +from lightning_fabric.loggers import CSVLogger +from lightning_fabric.loggers.csv_logs import _ExperimentWriter + + +def test_file_logger_automatic_versioning(tmpdir): + """Verify that automatic versioning works.""" + root_dir = tmpdir.mkdir("exp") + root_dir.mkdir("version_0") + root_dir.mkdir("version_1") + logger = CSVLogger(root_dir=root_dir, name="exp") + assert logger.version == 2 + + +def test_file_logger_manual_versioning(tmpdir): + """Verify that manual versioning works.""" + root_dir = tmpdir.mkdir("exp") + root_dir.mkdir("version_0") + root_dir.mkdir("version_1") + root_dir.mkdir("version_2") + logger = CSVLogger(root_dir=root_dir, name="exp", version=1) + assert logger.version == 1 + + +def test_file_logger_named_version(tmpdir): + """Verify that manual versioning works for string versions, e.g. 
'2020-02-05-162402'.""" + + exp_name = "exp" + tmpdir.mkdir(exp_name) + expected_version = "2020-02-05-162402" + + logger = CSVLogger(root_dir=tmpdir, name=exp_name, version=expected_version) + logger.log_metrics({"a": 1, "b": 2}) + logger.save() + assert logger.version == expected_version + assert os.listdir(tmpdir / exp_name) == [expected_version] + assert os.listdir(tmpdir / exp_name / expected_version) + + +@pytest.mark.parametrize("name", ["", None]) +def test_file_logger_no_name(tmpdir, name): + """Verify that None or empty name works.""" + logger = CSVLogger(root_dir=tmpdir, name=name) + logger.log_metrics({"a": 1}) + logger.save() + assert os.path.normpath(logger.root_dir) == tmpdir # use os.path.normpath to handle trailing / + assert os.listdir(tmpdir / "version_0") + + +@pytest.mark.parametrize("step_idx", [10, None]) +def test_file_logger_log_metrics(tmpdir, step_idx): + logger = CSVLogger(tmpdir) + metrics = {"float": 0.3, "int": 1, "FloatTensor": torch.tensor(0.1), "IntTensor": torch.tensor(1)} + logger.log_metrics(metrics, step_idx) + logger.save() + + path_csv = os.path.join(logger.log_dir, _ExperimentWriter.NAME_METRICS_FILE) + with open(path_csv) as fp: + lines = fp.readlines() + assert len(lines) == 2 + assert all(n in lines[0] for n in metrics) + + +def test_file_logger_log_hyperparams(tmpdir): + logger = CSVLogger(tmpdir) + with pytest.raises(NotImplementedError): + logger.log_hyperparams({}) + + +def test_flush_n_steps(tmpdir): + logger = CSVLogger(tmpdir, flush_logs_every_n_steps=2) + metrics = {"float": 0.3, "int": 1, "FloatTensor": torch.tensor(0.1), "IntTensor": torch.tensor(1)} + logger.save = MagicMock() + logger.log_metrics(metrics, step=0) + + logger.save.assert_not_called() + logger.log_metrics(metrics, step=1) + logger.save.assert_called_once() diff --git a/tests/tests_fabric/test_connector.py b/tests/tests_fabric/test_connector.py index 2f5164854eed0..0ea1295e229f8 100644 --- a/tests/tests_fabric/test_connector.py +++ 
b/tests/tests_fabric/test_connector.py @@ -13,7 +13,6 @@ # limitations under the License import os -from re import escape from typing import Any, Dict from unittest import mock @@ -808,27 +807,23 @@ def test_devices_from_environment(*_): def test_arguments_from_environment_collision(): """Test that the connector raises an error when the CLI settings conflict with settings in the code.""" with mock.patch.dict(os.environ, {"LT_ACCELERATOR": "cpu"}): - with pytest.raises( - ValueError, match=escape("Your code has `Fabric(accelerator='cuda', ...)` but it conflicts") - ): + with pytest.raises(ValueError, match="`Fabric\\(accelerator='cuda', ...\\)` but .* `--accelerator=cpu`"): _Connector(accelerator="cuda") with mock.patch.dict(os.environ, {"LT_STRATEGY": "ddp"}): - with pytest.raises( - ValueError, match=escape("Your code has `Fabric(strategy='ddp_spawn', ...)` but it conflicts") - ): + with pytest.raises(ValueError, match="`Fabric\\(strategy='ddp_spawn', ...\\)` but .* `--strategy=ddp`"): _Connector(strategy="ddp_spawn") with mock.patch.dict(os.environ, {"LT_DEVICES": "2"}): - with pytest.raises(ValueError, match=escape("Your code has `Fabric(devices=3, ...)` but it conflicts")): + with pytest.raises(ValueError, match="`Fabric\\(devices=3, ...\\)` but .* `--devices=2`"): _Connector(devices=3) with mock.patch.dict(os.environ, {"LT_NUM_NODES": "3"}): - with pytest.raises(ValueError, match=escape("Your code has `Fabric(num_nodes=2, ...)` but it conflicts")): + with pytest.raises(ValueError, match="`Fabric\\(num_nodes=2, ...\\)` but .* `--num_nodes=3`"): _Connector(num_nodes=2) with mock.patch.dict(os.environ, {"LT_PRECISION": "16"}): - with pytest.raises(ValueError, match=escape("Your code has `Fabric(precision=64, ...)` but it conflicts")): + with pytest.raises(ValueError, match="`Fabric\\(precision=64, ...\\)` but .* `--precision=16`"): _Connector(precision=64) diff --git a/tests/tests_pytorch/loggers/test_mlflow.py b/tests/tests_pytorch/loggers/test_mlflow.py index 
17bd5389f4f96..d6828901a9961 100644 --- a/tests/tests_pytorch/loggers/test_mlflow.py +++ b/tests/tests_pytorch/loggers/test_mlflow.py @@ -268,6 +268,27 @@ def test_mlflow_logger_experiment_calls(client, _, time, param, metric, tmpdir): ) +@pytest.mark.parametrize( + "status,expected", + [ + ("success", "FINISHED"), + ("failed", "FAILED"), + ("finished", "FINISHED"), + ], +) +@mock.patch("pytorch_lightning.loggers.mlflow._MLFLOW_AVAILABLE", return_value=True) +@mock.patch("pytorch_lightning.loggers.mlflow.MlflowClient") +def test_mlflow_logger_finalize(_, __, status, expected): + logger = MLFlowLogger("test") + + # Pretend we are in a worker process and finalizing + _ = logger.experiment + assert logger._initialized + + logger.finalize(status) + logger.experiment.set_terminated.assert_called_once_with(logger.run_id, expected) + + @mock.patch("pytorch_lightning.loggers.mlflow._MLFLOW_AVAILABLE", return_value=True) @mock.patch("pytorch_lightning.loggers.mlflow.MlflowClient") def test_mlflow_logger_finalize_when_exception(*_): diff --git a/tests/tests_pytorch/plugins/test_cluster_integration.py b/tests/tests_pytorch/plugins/test_cluster_integration.py index c6542f0797743..e8beecf15020a 100644 --- a/tests/tests_pytorch/plugins/test_cluster_integration.py +++ b/tests/tests_pytorch/plugins/test_cluster_integration.py @@ -56,12 +56,13 @@ def environment_combinations(): yield environment, variables, expected +@RunIf(mps=False) @pytest.mark.parametrize( "strategy_cls", [DDPStrategy, DDPShardedStrategy, pytest.param(DeepSpeedStrategy, marks=RunIf(deepspeed=True))], ) @mock.patch("pytorch_lightning.accelerators.cuda.CUDAAccelerator.is_available", return_value=True) -def test_ranks_available_manual_strategy_selection(mock_gpu_acc_available, strategy_cls): +def test_ranks_available_manual_strategy_selection(_, strategy_cls): """Test that the rank information is readily available after Trainer initialization.""" num_nodes = 2 for cluster, variables, expected in 
environment_combinations(): @@ -77,6 +78,7 @@ def test_ranks_available_manual_strategy_selection(mock_gpu_acc_available, strat assert trainer.world_size == expected["world_size"] +@RunIf(mps=False) @pytest.mark.parametrize( "trainer_kwargs", [ @@ -86,7 +88,7 @@ def test_ranks_available_manual_strategy_selection(mock_gpu_acc_available, strat dict(strategy="ddp_spawn", accelerator="gpu", devices=[1, 2]), ], ) -def test_ranks_available_automatic_strategy_selection(mps_count_4, cuda_count_4, trainer_kwargs): +def test_ranks_available_automatic_strategy_selection(cuda_count_4, trainer_kwargs): """Test that the rank information is readily available after Trainer initialization.""" num_nodes = 2 trainer_kwargs.update(num_nodes=num_nodes) diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py index b35ee92ff1852..c0da8086a8b84 100644 --- a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py @@ -126,6 +126,7 @@ def creates_processes_externally(self) -> bool: assert isinstance(trainer.strategy.cluster_environment, CustomCluster) +@RunIf(mps=False) @mock.patch.dict( os.environ, { @@ -231,7 +232,8 @@ def test_fallback_from_ddp_spawn_to_ddp_on_cluster(_, __, env_vars, expected_env assert isinstance(trainer.strategy.cluster_environment, expected_environment) -def test_interactive_incompatible_backend_error(mps_count_2, cuda_count_2, monkeypatch): +@RunIf(mps=False) +def test_interactive_incompatible_backend_error(cuda_count_2, monkeypatch): monkeypatch.setattr(pytorch_lightning.trainer.connectors.accelerator_connector, "_IS_INTERACTIVE", True) with pytest.raises(MisconfigurationException, match=r"strategy='ddp'\)`.*is not compatible"): Trainer(strategy="ddp", accelerator="gpu", devices=2) @@ -247,7 +249,7 @@ def test_interactive_incompatible_backend_error(mps_count_2, cuda_count_2, monke 
Trainer(strategy="dp") -def test_interactive_compatible_dp_strategy_gpu(cuda_count_2, monkeypatch): +def test_interactive_compatible_dp_strategy_gpu(mps_count_0, cuda_count_2, monkeypatch): monkeypatch.setattr(pytorch_lightning.trainer.connectors.accelerator_connector, "_IS_INTERACTIVE", True) trainer = Trainer(strategy="dp", accelerator="gpu") assert trainer.strategy.launcher is None @@ -358,7 +360,7 @@ def test_set_devices_if_none_cpu(): def test_unsupported_strategy_types_on_cpu_and_fallback(): with pytest.warns(UserWarning, match="is not supported on CPUs, hence setting `strategy='ddp"): - trainer = Trainer(strategy="dp", num_processes=2) + trainer = Trainer(accelerator="cpu", strategy="dp", num_processes=2) assert isinstance(trainer.strategy, DDPStrategy) @@ -369,6 +371,28 @@ def test_exception_invalid_strategy(): Trainer(strategy="tpu_spawn") +@pytest.mark.parametrize( + ["strategy", "strategy_class"], + ( + ("ddp_spawn", DDPSpawnStrategy), + ("ddp_spawn_find_unused_parameters_false", DDPSpawnStrategy), + ("ddp", DDPStrategy), + ("ddp_find_unused_parameters_false", DDPStrategy), + ("dp", DataParallelStrategy), + ("ddp_sharded", DDPShardedStrategy), + ("ddp_sharded_spawn", DDPSpawnShardedStrategy), + pytest.param("deepspeed", DeepSpeedStrategy, marks=RunIf(deepspeed=True)), + ), +) +@pytest.mark.parametrize("accelerator", ["mps", "auto", "gpu", None, MPSAccelerator()]) +def test_invalid_ddp_strategy_with_mps(accelerator, strategy, strategy_class, mps_count_1, cuda_count_0): + with pytest.raises(ValueError, match="strategies from the DDP family are not supported"): + Trainer(accelerator=accelerator, strategy=strategy) + + with pytest.raises(ValueError, match="strategies from the DDP family are not supported"): + Trainer(accelerator="mps", strategy=strategy_class()) + + @pytest.mark.parametrize( ["strategy", "strategy_class"], [ @@ -475,14 +499,6 @@ def test_strategy_choice_ddp_cuda(strategy, expected_cls, mps_count_0, cuda_coun assert 
isinstance(trainer.strategy.cluster_environment, LightningEnvironment) -@pytest.mark.parametrize("strategy,expected_cls", [("ddp", DDPStrategy), ("ddp_spawn", DDPSpawnStrategy)]) -def test_strategy_choice_ddp_mps(strategy, expected_cls, mps_count_1, cuda_count_0): - trainer = Trainer(fast_dev_run=True, strategy=strategy, accelerator="gpu", devices=1) - assert isinstance(trainer.accelerator, MPSAccelerator) - assert isinstance(trainer.strategy, expected_cls) - assert isinstance(trainer.strategy.cluster_environment, LightningEnvironment) - - @pytest.mark.parametrize("job_name,expected_env", [("some_name", SLURMEnvironment), ("bash", LightningEnvironment)]) @pytest.mark.parametrize("strategy", ["ddp", DDPStrategy]) def test_strategy_choice_ddp_slurm(cuda_count_2, strategy, job_name, expected_env): @@ -704,9 +720,9 @@ def test_deterministic_init(deterministic): (False, [Mock(spec=LayerSync)], LayerSync), ], ) -def test_sync_batchnorm_set(tmpdir, sync_batchnorm, plugins, expected): +def test_sync_batchnorm_set(sync_batchnorm, plugins, expected): """Test valid combinations of the sync_batchnorm Trainer flag and the plugins list of layer-sync plugins.""" - trainer = Trainer(sync_batchnorm=sync_batchnorm, plugins=plugins, strategy="ddp") + trainer = Trainer(accelerator="cpu", sync_batchnorm=sync_batchnorm, plugins=plugins, strategy="ddp") assert isinstance(trainer._accelerator_connector._layer_sync, expected) assert isinstance(trainer.strategy._layer_sync, expected) @@ -733,7 +749,7 @@ def __init__(self, **kwargs): strategy = CustomParallelStrategy() assert strategy._layer_sync is None - Trainer(strategy=strategy, sync_batchnorm=True) + Trainer(accelerator="cpu", strategy=strategy, sync_batchnorm=True) assert isinstance(strategy._layer_sync, NativeSyncBatchNorm) @@ -809,12 +825,12 @@ def test_accelerator_specific_checkpoint_io(*_): ) def test_ddp_fork_on_unsupported_platform(_, strategy): with pytest.raises(ValueError, match="process forking is not supported on this 
platform"): - Trainer(strategy=strategy) + Trainer(accelerator="cpu", strategy=strategy) @pytest.mark.parametrize( ["strategy", "strategy_cls"], [("DDP", DDPStrategy), ("DDP_FIND_UNUSED_PARAMETERS_FALSE", DDPStrategy)] ) def test_strategy_str_passed_being_case_insensitive(strategy, strategy_cls): - trainer = Trainer(strategy=strategy) + trainer = Trainer(accelerator="cpu", strategy=strategy) assert isinstance(trainer.strategy, strategy_cls) diff --git a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py index 1ffe7ffe9defb..e2308ea18389c 100644 --- a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py +++ b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py @@ -11,13 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import operator from functools import partial from unittest import mock +from unittest.mock import Mock import pytest import torch +from lightning_utilities.core.imports import compare_version from torch.utils.data import DataLoader -from torchmetrics import Accuracy, AveragePrecision, MeanAbsoluteError, MeanSquaredError +from torchmetrics import Accuracy, AveragePrecision, MeanAbsoluteError, MeanSquaredError, MetricCollection from pytorch_lightning import LightningModule from pytorch_lightning.callbacks.callback import Callback @@ -27,6 +30,7 @@ from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import _FxValidator from pytorch_lightning.trainer.connectors.logger_connector.result import _ResultCollection from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _TORCHMETRICS_GREATER_EQUAL_0_9_1 from tests_pytorch.helpers.runif import RunIf from tests_pytorch.models.test_hooks import get_members @@ -507,6 +511,77 @@ def 
_assert_called(model, fn, stage): _assert_called(model, "test", "test") +@pytest.mark.skipif( + compare_version("torchmetrics", operator.lt, "0.8.0"), reason="torchmetrics>=0.8.0 required for compute groups" +) +@pytest.mark.parametrize("compute_groups", [True, False]) +def test_metriccollection_compute_groups(tmpdir, compute_groups): + def assertion_calls(keep_base: bool, copy_state: bool): + if _TORCHMETRICS_GREATER_EQUAL_0_9_1: + assert copy_state != compute_groups + + assert not keep_base + + class CustomMetricsCollection(MetricCollection): + wrapped_assertion_calls = Mock(wraps=assertion_calls) + + def items(self, keep_base: bool = False, copy_state: bool = True): + if getattr(self, "_is_currently_logging", False): + self.wrapped_assertion_calls(keep_base, copy_state) + + return super().items(keep_base=keep_base, copy_state=copy_state) + + class DummyModule(LightningModule): + def __init__(self): + super().__init__() + if compare_version("torchmetrics", operator.ge, "0.10.0"): + from torchmetrics.classification import MulticlassAccuracy, MulticlassPrecision + + metrics = [ + MulticlassAccuracy(num_classes=10, average="micro"), + MulticlassPrecision(num_classes=10, average="micro"), + ] + else: + from torchmetrics import Accuracy, Precision + + metrics = [Accuracy(num_classes=10, average="micro"), Precision(num_classes=10, average="micro")] + + self.metrics = CustomMetricsCollection( + metrics, + compute_groups=compute_groups, + ) + self.layer = torch.nn.Linear(32, 10) + + def training_step(self, batch): + + self.metrics(torch.rand(10, 10).softmax(-1), torch.randint(0, 10, (10,))) + self.metrics._is_currently_logging = True + self.log_dict(self.metrics, on_step=True, on_epoch=True) + self.metrics._is_currently_logging = False + return self.layer(batch).sum() + + def train_dataloader(self): + return DataLoader(RandomDataset(32, 64)) + + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.parameters(), lr=0.1) + return optimizer + + def 
on_train_epoch_end(self) -> None: + self.metrics.wrapped_assertion_calls.call_count == 2 + self.metrics.wrapped_assertion_calls.reset_mock() + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=2, + limit_val_batches=0, + max_epochs=1, + enable_progress_bar=False, + enable_checkpointing=False, + ) + trainer.fit(DummyModule()) + + def test_result_collection_on_tensor_with_mean_reduction(): result_collection = _ResultCollection(True) product = [(True, True), (False, True), (True, False), (False, False)] diff --git a/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py b/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py index 9c3f85c5ab647..c8e20bb4427a7 100644 --- a/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py +++ b/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py @@ -933,6 +933,7 @@ def configure_optimizers(self): scheduler = { "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer), "monitor": "train_loss", + "interval": "step", # not warned } else: scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer) @@ -946,8 +947,9 @@ def configure_optimizers(self): ) if scheduler_as_dict: - with pytest.warns(RuntimeWarning, match="but the keys will be ignored"): + with pytest.warns(RuntimeWarning, match=r"\['monitor'\], but the keys will be ignored"): trainer.fit(model) + assert trainer.lr_scheduler_configs[0].interval == "step" else: trainer.fit(model) diff --git a/tests/tests_pytorch/trainer/optimization/test_optimizers.py b/tests/tests_pytorch/trainer/optimization/test_optimizers.py index 155cc175bcf80..d35ec9e8b72d7 100644 --- a/tests/tests_pytorch/trainer/optimization/test_optimizers.py +++ b/tests/tests_pytorch/trainer/optimization/test_optimizers.py @@ -557,25 +557,6 @@ def configure_optimizers(self): trainer.fit(model) -def test_warn_invalid_scheduler_key_in_manual_optimization(tmpdir): - """Test warning when invalid scheduler keys are provided in 
manual optimization.""" - - class TestModel(BoringModel): - def __init__(self): - super().__init__() - self.automatic_optimization = False - - def configure_optimizers(self): - opt = optim.SGD(self.layer.parameters(), lr=0.1) - sch = optim.lr_scheduler.StepLR(opt, step_size=1) - return [opt], [{"scheduler": sch, "interval": "epoch"}] - - model = TestModel() - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) - with pytest.warns(RuntimeWarning, match="the keys will be ignored"): - trainer.fit(model) - - @RunIf(min_cuda_gpus=2, standalone=True) def test_optimizer_state_on_device(tmpdir): """Test that optimizers that create state initially at instantiation still end up with the state on the GPU.""" diff --git a/tests/tests_pytorch/trainer/test_supporters.py b/tests/tests_pytorch/trainer/test_supporters.py index 15958500c2dec..8533ad5fdb467 100644 --- a/tests/tests_pytorch/trainer/test_supporters.py +++ b/tests/tests_pytorch/trainer/test_supporters.py @@ -316,7 +316,7 @@ def test_nested_calc_num_data(input_data, compute_func, expected_length): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) @pytest.mark.parametrize("use_fault_tolerant", [False, True]) @pytest.mark.parametrize("replace_sampler_ddp", [False, True]) -def test_combined_data_loader_validation_test(mps_count_2, cuda_count_2, use_fault_tolerant, replace_sampler_ddp): +def test_combined_data_loader_validation_test(mps_count_0, cuda_count_2, use_fault_tolerant, replace_sampler_ddp): """This test makes sure distributed sampler has been properly injected in dataloaders when using CombinedLoader.""" diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py index 3b4af83b13cfd..edace5429a531 100644 --- a/tests/tests_pytorch/trainer/test_trainer.py +++ b/tests/tests_pytorch/trainer/test_trainer.py @@ -1921,7 +1921,7 @@ def on_exception(self, *_): self.exceptions += 1 -@pytest.mark.parametrize("strategy", [None, pytest.param("ddp_spawn", 
marks=RunIf(skip_windows=True))]) +@pytest.mark.parametrize("strategy", [None, pytest.param("ddp_spawn", marks=RunIf(skip_windows=True, mps=False))]) def test_error_handling_all_stages(tmpdir, strategy): model = TrainerStagesErrorsModel() counter = ExceptionCounter() @@ -2017,9 +2017,11 @@ def training_step(self, batch, batch_idx): ["trainer_kwargs", "strategy_cls", "strategy_name", "accelerator_cls", "devices"], [ ({"strategy": None}, SingleDeviceStrategy, "single_device", CPUAccelerator, 1), - ({"strategy": "dp"}, DDPStrategy, "ddp", CPUAccelerator, 1), - ({"strategy": "ddp"}, DDPStrategy, "ddp", CPUAccelerator, 1), - ({"strategy": "ddp", "num_nodes": 2}, DDPStrategy, "ddp", CPUAccelerator, 1), + pytest.param({"strategy": "dp"}, DDPStrategy, "ddp", CPUAccelerator, 1, marks=RunIf(mps=False)), + pytest.param({"strategy": "ddp"}, DDPStrategy, "ddp", CPUAccelerator, 1, marks=RunIf(mps=False)), + pytest.param( + {"strategy": "ddp", "num_nodes": 2}, DDPStrategy, "ddp", CPUAccelerator, 1, marks=RunIf(mps=False) + ), ( {"strategy": None, "accelerator": "cuda", "devices": 1}, SingleDeviceStrategy, @@ -2075,7 +2077,7 @@ def training_step(self, batch, batch_idx): CUDAAccelerator, 2, ), - ({"strategy": DDPStrategy()}, DDPStrategy, "ddp", CPUAccelerator, 1), + pytest.param({"strategy": DDPStrategy()}, DDPStrategy, "ddp", CPUAccelerator, 1, marks=RunIf(mps=False)), ({"strategy": DDPStrategy(), "accelerator": "cuda", "devices": 2}, DDPStrategy, "ddp", CUDAAccelerator, 2), ( {"strategy": DataParallelStrategy(), "accelerator": "cuda", "devices": 2},