From ecd5b1f9eb96d358a64603d9b7140b7235cbf240 Mon Sep 17 00:00:00 2001 From: Zhongsheng Ji <9573586@qq.com> Date: Fri, 24 Nov 2023 20:08:48 +0800 Subject: [PATCH] Change md5 to hash and hash_algorithm, fix incompatibility (#1367) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Frédéric Collonval --- .github/workflows/downstream.yml | 18 ++ docs/source/developers/contents.rst | 92 ++++--- jupyter_server/services/api/api.yaml | 14 +- .../services/contents/filecheckpoints.py | 4 +- jupyter_server/services/contents/fileio.py | 259 ++++++++++++------ .../services/contents/filemanager.py | 87 +++--- jupyter_server/services/contents/handlers.py | 74 +++-- jupyter_server/services/contents/manager.py | 24 +- tests/services/contents/test_api.py | 26 +- tests/services/contents/test_fileio.py | 51 +++- tests/services/contents/test_manager.py | 37 ++- .../services/contents/test_manager_no_hash.py | 44 +++ 12 files changed, 509 insertions(+), 221 deletions(-) create mode 100644 tests/services/contents/test_manager_no_hash.py diff --git a/.github/workflows/downstream.yml b/.github/workflows/downstream.yml index 635c404e32..8763635490 100644 --- a/.github/workflows/downstream.yml +++ b/.github/workflows/downstream.yml @@ -107,6 +107,23 @@ jobs: test_command: pip install pytest-jupyter[server] && pytest -vv -raXxs -W default --durations 10 --color=yes package_name: jupyter_server_terminals + jupytext: + runs-on: ubuntu-latest + timeout-minutes: 10 + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Base Setup + uses: jupyterlab/maintainer-tools/.github/actions/base-setup@v1 + + - name: Test jupytext + uses: jupyterlab/maintainer-tools/.github/actions/downstream-test@v1 + with: + package_name: jupytext + test_command: pip install pytest-jupyter[server] gitpython pre-commit && python -m ipykernel install --name jupytext-dev --user && pytest -vv -raXxs -W default --durations 10 --color=yes --ignore=tests/test_doc_files_are_notebooks.py --ignore=tests/test_changelog.py + downstream_check: # This job does nothing and is only used for the branch protection if: always() needs: @@ -115,6 +132,7 @@ jobs: - jupyterlab_server - notebook - nbclassic + - jupytext runs-on: ubuntu-latest steps: - name: Decide whether the needed jobs succeeded or failed diff --git a/docs/source/developers/contents.rst b/docs/source/developers/contents.rst index ca88025c88..6910535f30 100644 --- a/docs/source/developers/contents.rst +++ b/docs/source/developers/contents.rst @@ -33,40 +33,48 @@ which we refer to as **models**. Models may contain the following entries: -+--------------------+-----------+------------------------------+ -| Key | Type |Info | -+====================+===========+==============================+ -|**name** |unicode |Basename of the entity. | -+--------------------+-----------+------------------------------+ -|**path** |unicode |Full | -| | |(:ref:`API-style`) | -| | |path to the entity. | -+--------------------+-----------+------------------------------+ -|**type** |unicode |The entity type. One of | -| | |``"notebook"``, ``"file"`` or | -| | |``"directory"``. | -+--------------------+-----------+------------------------------+ -|**created** |datetime |Creation date of the entity. | -+--------------------+-----------+------------------------------+ -|**last_modified** |datetime |Last modified date of the | -| | |entity. | -+--------------------+-----------+------------------------------+ -|**content** |variable |The "content" of the entity. 
| -| | |(:ref:`See | -| | |Below`) | -+--------------------+-----------+------------------------------+ -|**mimetype** |unicode or |The mimetype of ``content``, | -| |``None`` |if any. (:ref:`See | -| | |Below`) | -+--------------------+-----------+------------------------------+ -|**format** |unicode or |The format of ``content``, | -| |``None`` |if any. (:ref:`See | -| | |Below`) | -+--------------------+-----------+------------------------------+ -|**md5** |unicode or |The md5 of the contents. | -| |``None`` | | -| | | | -+--------------------+-----------+------------------------------+ ++--------------------+------------+-------------------------------+ +| Key | Type | Info | ++====================+============+===============================+ +| **name** | unicode | Basename of the entity. | ++--------------------+------------+-------------------------------+ +| **path** | unicode | Full | +| | | (:ref:`API-style`) | +| | | path to the entity. | ++--------------------+------------+-------------------------------+ +| **type** | unicode | The entity type. One of | +| | | ``"notebook"``, ``"file"`` or | +| | | ``"directory"``. | ++--------------------+------------+-------------------------------+ +| **created** | datetime | Creation date of the entity. | ++--------------------+------------+-------------------------------+ +| **last_modified** | datetime | Last modified date of the | +| | | entity. | ++--------------------+------------+-------------------------------+ +| **content** | variable | The "content" of the entity. | +| | | (:ref:`See | +| | | Below`) | ++--------------------+------------+-------------------------------+ +| **mimetype** | unicode or | The mimetype of ``content``, | +| | ``None`` | if any. (:ref:`See | +| | | Below`) | ++--------------------+------------+-------------------------------+ +| **format** | unicode or | The format of ``content``, | +| | ``None`` | if any. (:ref:`See | +| | | Below`) | ++--------------------+------------+-------------------------------+ +| [optional] | | | +| **hash** | unicode or | The hash of the contents. | +| | ``None`` | It cannot be null if | +| | | ``hash_algorithm`` is | +| | | defined. | ++--------------------+------------+-------------------------------+ +| [optional] | | | +| **hash_algorithm** | unicode or | The algorithm used to compute | +| | ``None`` | hash value. | +| | | It cannot be null | +| | | if ``hash`` is defined. | ++--------------------+------------+-------------------------------+ .. _modelcontent: @@ -80,8 +88,9 @@ model. There are three model types: **notebook**, **file**, and **directory**. :class:`nbformat.notebooknode.NotebookNode` representing the .ipynb file represented by the model. See the `NBFormat`_ documentation for a full description. - - The ``md5`` field a hexdigest string of the md5 value of the notebook - file. + - The ``hash`` field a hexdigest string of the hash value of the file. + If ``ContentManager.get`` not support hash, it should always be ``None``. + - ``hash_algorithm`` is the algorithm used to compute the hash value. - ``file`` models - The ``format`` field is either ``"text"`` or ``"base64"``. @@ -91,14 +100,16 @@ model. There are three model types: **notebook**, **file**, and **directory**. file models, ``content`` simply contains the file's bytes after decoding as UTF-8. Non-text (``base64``) files are read as bytes, base64 encoded, and then decoded as UTF-8. - - The ``md5`` field a hexdigest string of the md5 value of the file. 
+ - The ``hash`` field a hexdigest string of the hash value of the file. + If ``ContentManager.get`` not support hash, it should always be ``None``. + - ``hash_algorithm`` is the algorithm used to compute the hash value. - ``directory`` models - The ``format`` field is always ``"json"``. - The ``mimetype`` field is always ``None``. - The ``content`` field contains a list of :ref:`content-free` models representing the entities in the directory. - - The ``md5`` field is always ``None``. + - The ``hash`` field is always ``None``. .. note:: @@ -115,7 +126,7 @@ model. There are three model types: **notebook**, **file**, and **directory**. .. code-block:: python - # Notebook Model with Content + # Notebook Model with Content and Hash { "content": { "metadata": {}, @@ -137,7 +148,8 @@ model. There are three model types: **notebook**, **file**, and **directory**. "path": "foo/a.ipynb", "type": "notebook", "writable": True, - "md5": "7e47382b370c05a1b14706a2a8aff91a", + "hash": "f5e43a0b1c2e7836ab3b4d6b1c35c19e2558688de15a6a14e137a59e4715d34b", + "hash_algorithm": "sha256", } # Notebook Model without Content diff --git a/jupyter_server/services/api/api.yaml b/jupyter_server/services/api/api.yaml index 0394093ba9..5ee5c416bd 100644 --- a/jupyter_server/services/api/api.yaml +++ b/jupyter_server/services/api/api.yaml @@ -106,9 +106,9 @@ paths: in: query description: "Return content (0 for no content, 1 for return content)" type: integer - - name: md5 + - name: hash in: query - description: "Return md5 hexdigest string of content (0 for no md5, 1 for return md5)" + description: "May return hash hexdigest string of content and the hash algorithm (0 for no hash - default, 1 for return hash). It may be ignored by the content manager." type: integer responses: 404: @@ -889,7 +889,7 @@ definitions: kernel: $ref: "#/definitions/Kernel" Contents: - description: "A contents object. The content and format keys may be null if content is not contained. The md5 maybe null if md5 is not contained. If type is 'file', then the mimetype will be null." + description: "A contents object. The content and format keys may be null if content is not contained. The hash maybe null if hash is not required. If type is 'file', then the mimetype will be null." type: object required: - type @@ -901,7 +901,6 @@ definitions: - mimetype - format - content - - md5 properties: name: type: string @@ -939,9 +938,12 @@ definitions: format: type: string description: Format of content (one of null, 'text', 'base64', 'json') - md5: + hash: type: string - description: "The md5 hexdigest string of content, if requested (otherwise null)." + description: "[optional] The hexdigest hash string of content, if requested (otherwise null). It cannot be null if hash_algorithm is defined." + hash_algorithm: + type: string + description: "[optional] The algorithm used to produce the hash, if requested (otherwise null). It cannot be null if hash is defined." Checkpoints: description: A checkpoint object. 
type: object diff --git a/jupyter_server/services/contents/filecheckpoints.py b/jupyter_server/services/contents/filecheckpoints.py index f6d1ef44e7..522b3bbd01 100644 --- a/jupyter_server/services/contents/filecheckpoints.py +++ b/jupyter_server/services/contents/filecheckpoints.py @@ -252,7 +252,7 @@ def get_file_checkpoint(self, checkpoint_id, path): if not os.path.isfile(os_checkpoint_path): self.no_such_checkpoint(path, checkpoint_id) - content, format = self._read_file(os_checkpoint_path, format=None) + content, format = self._read_file(os_checkpoint_path, format=None) # type: ignore[misc] return { "type": "file", "content": content, @@ -318,7 +318,7 @@ async def get_file_checkpoint(self, checkpoint_id, path): if not os.path.isfile(os_checkpoint_path): self.no_such_checkpoint(path, checkpoint_id) - content, format = await self._read_file(os_checkpoint_path, format=None) + content, format = await self._read_file(os_checkpoint_path, format=None) # type: ignore[misc] return { "type": "file", "content": content, diff --git a/jupyter_server/services/contents/fileio.py b/jupyter_server/services/contents/fileio.py index 5f0aa4a8bf..45607944ce 100644 --- a/jupyter_server/services/contents/fileio.py +++ b/jupyter_server/services/contents/fileio.py @@ -3,6 +3,9 @@ """ # Copyright (c) Jupyter Development Team. # Distributed under the terms of the Modified BSD License. + +from __future__ import annotations + import errno import hashlib import os @@ -14,7 +17,7 @@ import nbformat from anyio.to_thread import run_sync from tornado.web import HTTPError -from traitlets import Bool +from traitlets import Bool, Enum from traitlets.config import Configurable from traitlets.config.configurable import LoggingConfigurable @@ -193,6 +196,13 @@ class FileManagerMixin(LoggingConfigurable, Configurable): If set to False, the new notebook is written directly on the old one which could fail (eg: full filesystem or quota )""", ) + hash_algorithm = Enum( # type: ignore[call-overload] + hashlib.algorithms_available, + default_value="sha256", + config=True, + help="Hash algorithm to use for file content, support by hashlib", + ) + @contextmanager def open(self, os_path, *args, **kwargs): """wrapper around io.open that turns permission errors into 403""" @@ -264,37 +274,42 @@ def _get_os_path(self, path): raise HTTPError(404, "%s is outside root contents directory" % path) return os_path - def _read_notebook(self, os_path, as_version=4, capture_validation_error=None): + def _read_notebook( + self, os_path, as_version=4, capture_validation_error=None, raw: bool = False + ): """Read a notebook from an os path.""" - with self.open(os_path, "r", encoding="utf-8") as f: - try: - return nbformat.read( - f, - as_version=as_version, - capture_validation_error=capture_validation_error, - ) - except Exception as e: - e_orig = e - - # If use_atomic_writing is enabled, we'll guess that it was also - # enabled when this notebook was written and look for a valid - # atomic intermediate. - tmp_path = path_to_intermediate(os_path) - - if not self.use_atomic_writing or not os.path.exists(tmp_path): - raise HTTPError( - 400, - f"Unreadable Notebook: {os_path} {e_orig!r}", - ) + answer = self._read_file(os_path, "text", raw=raw) - # Move the bad file aside, restore the intermediate, and try again. 
- invalid_file = path_to_invalid(os_path) - replace_file(os_path, invalid_file) - replace_file(tmp_path, os_path) - return self._read_notebook( - os_path, as_version, capture_validation_error=capture_validation_error + try: + nb = nbformat.reads( + answer[0], + as_version=as_version, + capture_validation_error=capture_validation_error, ) + return (nb, answer[2]) if raw else nb # type:ignore[misc] + except Exception as e: + e_orig = e + + # If use_atomic_writing is enabled, we'll guess that it was also + # enabled when this notebook was written and look for a valid + # atomic intermediate. + tmp_path = path_to_intermediate(os_path) + + if not self.use_atomic_writing or not os.path.exists(tmp_path): + raise HTTPError( + 400, + f"Unreadable Notebook: {os_path} {e_orig!r}", + ) + + # Move the bad file aside, restore the intermediate, and try again. + invalid_file = path_to_invalid(os_path) + replace_file(os_path, invalid_file) + replace_file(tmp_path, os_path) + return self._read_notebook( + os_path, as_version, capture_validation_error=capture_validation_error, raw=raw + ) + def _save_notebook(self, os_path, nb, capture_validation_error=None): """Save a notebook to an os_path.""" with self.atomic_writing(os_path, encoding="utf-8") as f: @@ -305,30 +320,69 @@ def _save_notebook(self, os_path, nb, capture_validation_error=None): capture_validation_error=capture_validation_error, ) - def _read_file(self, os_path, format): + def _get_hash(self, byte_content: bytes) -> dict[str, str]: + """Compute the hash hexdigest for the provided bytes. + + The hash algorithm is provided by the `hash_algorithm` attribute. + + Parameters + ---------- + byte_content : bytes + The bytes to hash + + Returns + ------- + A dictionary to be appended to a model {"hash": str, "hash_algorithm": str}. + """ + algorithm = self.hash_algorithm + h = hashlib.new(algorithm) + h.update(byte_content) + return {"hash": h.hexdigest(), "hash_algorithm": algorithm} + + def _read_file( + self, os_path: str, format: str | None, raw: bool = False + ) -> tuple[str | bytes, str] | tuple[str | bytes, str, bytes]: """Read a non-notebook file. - os_path: The path to be read. - format: - If 'text', the contents will be decoded as UTF-8. - If 'base64', the raw bytes contents will be encoded as base64. - If 'byte', the raw bytes contents will be returned. - If not specified, try to decode as UTF-8, and fall back to base64 + Parameters + ---------- + os_path: str + The path to be read. + format: str + If 'text', the contents will be decoded as UTF-8. + If 'base64', the raw bytes contents will be encoded as base64. + If 'byte', the raw bytes contents will be returned. + If not specified, try to decode as UTF-8, and fall back to base64 + raw: bool + [Optional] If True, will return as third argument the raw bytes content + + Returns + ------- + (content, format, byte_content) It returns the content in the given format + as well as the raw byte content. """ if not os.path.isfile(os_path): raise HTTPError(400, "Cannot read non-file %s" % os_path) with self.open(os_path, "rb") as f: bcontent = f.read() + if format == "byte": # Not for http response but internal use - return bcontent, "byte" + return (bcontent, "byte", bcontent) if raw else (bcontent, "byte") if format is None or format == "text": # Try to interpret as unicode if format is unknown or if unicode # was explicitly requested. 
try: - return bcontent.decode("utf8"), "text" + return ( + (bcontent.decode("utf8"), "text", bcontent) + if raw + else ( + bcontent.decode("utf8"), + "text", + ) + ) except UnicodeError as e: if format == "text": raise HTTPError( @@ -336,7 +390,14 @@ def _read_file(self, os_path, format): "%s is not UTF-8 encoded" % os_path, reason="bad format", ) from e - return encodebytes(bcontent).decode("ascii"), "base64" + return ( + (encodebytes(bcontent).decode("ascii"), "base64", bcontent) + if raw + else ( + encodebytes(bcontent).decode("ascii"), + "base64", + ) + ) def _save_file(self, os_path, content, format): """Save content of a generic file.""" @@ -357,12 +418,6 @@ def _save_file(self, os_path, content, format): with self.atomic_writing(os_path, text=False) as f: f.write(bcontent) - def _get_md5(self, os_path): - c, _ = self._read_file(os_path, "byte") - md5 = hashlib.md5() - md5.update(c) - return md5.hexdigest() - class AsyncFileManagerMixin(FileManagerMixin): """ @@ -376,40 +431,46 @@ async def _copy(self, src, dest): """ await async_copy2_safe(src, dest, log=self.log) - async def _read_notebook(self, os_path, as_version=4, capture_validation_error=None): + async def _read_notebook( + self, os_path, as_version=4, capture_validation_error=None, raw: bool = False + ): """Read a notebook from an os path.""" - with self.open(os_path, encoding="utf-8") as f: - try: - return await run_sync( - partial( - nbformat.read, - as_version=as_version, - capture_validation_error=capture_validation_error, - ), - f, - ) - except Exception as e: - e_orig = e - - # If use_atomic_writing is enabled, we'll guess that it was also - # enabled when this notebook was written and look for a valid - # atomic intermediate. - tmp_path = path_to_intermediate(os_path) - - if not self.use_atomic_writing or not os.path.exists(tmp_path): - raise HTTPError( - 400, - f"Unreadable Notebook: {os_path} {e_orig!r}", - ) + answer = await self._read_file(os_path, "text", raw) + + try: + nb = await run_sync( + partial( + nbformat.reads, + as_version=as_version, + capture_validation_error=capture_validation_error, + ), + answer[0], + ) + return (nb, answer[2]) if raw else nb # type:ignore[misc] + except Exception as e: + e_orig = e - # Move the bad file aside, restore the intermediate, and try again. - invalid_file = path_to_invalid(os_path) - await async_replace_file(os_path, invalid_file) - await async_replace_file(tmp_path, os_path) - return await self._read_notebook( - os_path, as_version, capture_validation_error=capture_validation_error + # If use_atomic_writing is enabled, we'll guess that it was also + # enabled when this notebook was written and look for a valid + # atomic intermediate. + tmp_path = path_to_intermediate(os_path) + + if not self.use_atomic_writing or not os.path.exists(tmp_path): + raise HTTPError( + 400, + f"Unreadable Notebook: {os_path} {e_orig!r}", ) + # Move the bad file aside, restore the intermediate, and try again. 
+ invalid_file = path_to_invalid(os_path) + await async_replace_file(os_path, invalid_file) + await async_replace_file(tmp_path, os_path) + answer = await self._read_notebook( + os_path, as_version, capture_validation_error=capture_validation_error, raw=raw + ) + + return answer + async def _save_notebook(self, os_path, nb, capture_validation_error=None): """Save a notebook to an os_path.""" with self.atomic_writing(os_path, encoding="utf-8") as f: @@ -423,30 +484,50 @@ async def _save_notebook(self, os_path, nb, capture_validation_error=None): f, ) - async def _read_file(self, os_path, format): + async def _read_file( # type: ignore[override] + self, os_path: str, format: str | None, raw: bool = False + ) -> tuple[str | bytes, str] | tuple[str | bytes, str, bytes]: """Read a non-notebook file. - os_path: The path to be read. - format: - If 'text', the contents will be decoded as UTF-8. - If 'base64', the raw bytes contents will be encoded as base64. - If 'byte', the raw bytes contents will be returned. - If not specified, try to decode as UTF-8, and fall back to base64 + Parameters + ---------- + os_path: str + The path to be read. + format: str + If 'text', the contents will be decoded as UTF-8. + If 'base64', the raw bytes contents will be encoded as base64. + If 'byte', the raw bytes contents will be returned. + If not specified, try to decode as UTF-8, and fall back to base64 + raw: bool + [Optional] If True, will return as third argument the raw bytes content + + Returns + ------- + (content, format, byte_content) It returns the content in the given format + as well as the raw byte content. """ if not os.path.isfile(os_path): raise HTTPError(400, "Cannot read non-file %s" % os_path) with self.open(os_path, "rb") as f: bcontent = await run_sync(f.read) + if format == "byte": # Not for http response but internal use - return bcontent, "byte" + return (bcontent, "byte", bcontent) if raw else (bcontent, "byte") if format is None or format == "text": # Try to interpret as unicode if format is unknown or if unicode # was explicitly requested. 
try: - return bcontent.decode("utf8"), "text" + return ( + (bcontent.decode("utf8"), "text", bcontent) + if raw + else ( + bcontent.decode("utf8"), + "text", + ) + ) except UnicodeError as e: if format == "text": raise HTTPError( @@ -454,7 +535,11 @@ async def _read_file(self, os_path, format): "%s is not UTF-8 encoded" % os_path, reason="bad format", ) from e - return encodebytes(bcontent).decode("ascii"), "base64" + return ( + (encodebytes(bcontent).decode("ascii"), "base64", bcontent) + if raw + else (encodebytes(bcontent).decode("ascii"), "base64") + ) async def _save_file(self, os_path, content, format): """Save content of a generic file.""" @@ -474,9 +559,3 @@ async def _save_file(self, os_path, content, format): with self.atomic_writing(os_path, text=False) as f: await run_sync(f.write, bcontent) - - async def _get_md5(self, os_path): - c, _ = await self._read_file(os_path, "byte") - md5 = hashlib.md5() - await run_sync(md5.update, c) - return md5.hexdigest() diff --git a/jupyter_server/services/contents/filemanager.py b/jupyter_server/services/contents/filemanager.py index fe027a5c49..c56a1acc70 100644 --- a/jupyter_server/services/contents/filemanager.py +++ b/jupyter_server/services/contents/filemanager.py @@ -268,7 +268,8 @@ def _base_model(self, path): model["mimetype"] = None model["size"] = size model["writable"] = self.is_writable(path) - model["md5"] = None + model["hash"] = None + model["hash_algorithm"] = None return model @@ -336,7 +337,7 @@ def _dir_model(self, path, content=True): return model - def _file_model(self, path, content=True, format=None, md5=False): + def _file_model(self, path, content=True, format=None, require_hash=False): """Build a model for a file if content is requested, include the file contents. @@ -345,6 +346,8 @@ def _file_model(self, path, content=True, format=None, md5=False): If 'text', the contents will be decoded as UTF-8. If 'base64', the raw bytes contents will be encoded as base64. 
If not specified, try to decode as UTF-8, and fall back to base64 + + if require_hash is true, the model will include 'hash' """ model = self._base_model(path) model["type"] = "file" @@ -352,8 +355,9 @@ def _file_model(self, path, content=True, format=None, md5=False): os_path = self._get_os_path(path) model["mimetype"] = mimetypes.guess_type(os_path)[0] + bytes_content = None if content: - content, format = self._read_file(os_path, format) + content, format, bytes_content = self._read_file(os_path, format, raw=True) # type: ignore[misc] if model["mimetype"] is None: default_mime = { "text": "text/plain", @@ -365,37 +369,45 @@ def _file_model(self, path, content=True, format=None, md5=False): content=content, format=format, ) - if md5: - md5 = self._get_md5(os_path) - model.update(md5=md5) + + if require_hash: + if bytes_content is None: + bytes_content, _ = self._read_file(os_path, "byte") # type: ignore[assignment,misc] + model.update(**self._get_hash(bytes_content)) # type: ignore[arg-type] return model - def _notebook_model(self, path, content=True, md5=False): + def _notebook_model(self, path, content=True, require_hash=False): """Build a notebook model if content is requested, the notebook content will be populated as a JSON structure (not double-serialized) + + if require_hash is true, the model will include 'hash' """ model = self._base_model(path) model["type"] = "notebook" os_path = self._get_os_path(path) + bytes_content = None if content: validation_error: dict[str, t.Any] = {} - nb = self._read_notebook( - os_path, as_version=4, capture_validation_error=validation_error + nb, bytes_content = self._read_notebook( + os_path, as_version=4, capture_validation_error=validation_error, raw=True ) self.mark_trusted_cells(nb, path) model["content"] = nb model["format"] = "json" self.validate_notebook_model(model, validation_error) - if md5: - model["md5"] = self._get_md5(os_path) + + if require_hash: + if bytes_content is None: + bytes_content, _ = self._read_file(os_path, "byte") # type: ignore[misc] + model.update(**self._get_hash(bytes_content)) # type: ignore[arg-type] return model - def get(self, path, content=True, type=None, format=None, md5=None): + def get(self, path, content=True, type=None, format=None, require_hash=False): """Takes a path for an entity and returns its model Parameters @@ -410,8 +422,8 @@ def get(self, path, content=True, type=None, format=None, md5=None): format : str, optional The requested format for file contents. 'text' or 'base64'. Ignored if this returns a notebook or directory model. - md5: bool, optional - Whether to include the md5 of the file contents. + require_hash: bool, optional + Whether to include the hash of the file contents. 
Returns ------- @@ -439,11 +451,13 @@ def get(self, path, content=True, type=None, format=None, md5=None): ) model = self._dir_model(path, content=content) elif type == "notebook" or (type is None and path.endswith(".ipynb")): - model = self._notebook_model(path, content=content, md5=md5) + model = self._notebook_model(path, content=content, require_hash=require_hash) else: if type == "directory": raise web.HTTPError(400, "%s is not a directory" % path, reason="bad type") - model = self._file_model(path, content=content, format=format, md5=md5) + model = self._file_model( + path, content=content, format=format, require_hash=require_hash + ) self.emit(data={"action": "get", "path": path}) return model @@ -794,7 +808,7 @@ async def _dir_model(self, path, content=True): return model - async def _file_model(self, path, content=True, format=None, md5=False): + async def _file_model(self, path, content=True, format=None, require_hash=False): """Build a model for a file if content is requested, include the file contents. @@ -803,6 +817,8 @@ async def _file_model(self, path, content=True, format=None, md5=False): If 'text', the contents will be decoded as UTF-8. If 'base64', the raw bytes contents will be encoded as base64. If not specified, try to decode as UTF-8, and fall back to base64 + + if require_hash is true, the model will include 'hash' """ model = self._base_model(path) model["type"] = "file" @@ -810,8 +826,9 @@ async def _file_model(self, path, content=True, format=None, md5=False): os_path = self._get_os_path(path) model["mimetype"] = mimetypes.guess_type(os_path)[0] + bytes_content = None if content: - content, format = await self._read_file(os_path, format) + content, format, bytes_content = await self._read_file(os_path, format, raw=True) # type: ignore[misc] if model["mimetype"] is None: default_mime = { "text": "text/plain", @@ -823,13 +840,15 @@ async def _file_model(self, path, content=True, format=None, md5=False): content=content, format=format, ) - if md5: - md5 = await self._get_md5(os_path) - model.update(md5=md5) + + if require_hash: + if bytes_content is None: + bytes_content, _ = await self._read_file(os_path, "byte") # type: ignore[assignment,misc] + model.update(**self._get_hash(bytes_content)) # type: ignore[arg-type] return model - async def _notebook_model(self, path, content=True, md5=False): + async def _notebook_model(self, path, content=True, require_hash=False): """Build a notebook model if content is requested, the notebook content will be populated @@ -839,21 +858,25 @@ async def _notebook_model(self, path, content=True, md5=False): model["type"] = "notebook" os_path = self._get_os_path(path) + bytes_content = None if content: validation_error: dict[str, t.Any] = {} - nb = await self._read_notebook( - os_path, as_version=4, capture_validation_error=validation_error + nb, bytes_content = await self._read_notebook( + os_path, as_version=4, capture_validation_error=validation_error, raw=True ) self.mark_trusted_cells(nb, path) model["content"] = nb model["format"] = "json" self.validate_notebook_model(model, validation_error) - if md5: - model["md5"] = await self._get_md5(os_path) + + if require_hash: + if bytes_content is None: + bytes_content, _ = await self._read_file(os_path, "byte") # type: ignore[misc] + model.update(**(self._get_hash(bytes_content))) # type: ignore[arg-type] return model - async def get(self, path, content=True, type=None, format=None, md5=False): + async def get(self, path, content=True, type=None, format=None, require_hash=False): 
"""Takes a path for an entity and returns its model Parameters @@ -868,8 +891,8 @@ async def get(self, path, content=True, type=None, format=None, md5=False): format : str, optional The requested format for file contents. 'text' or 'base64'. Ignored if this returns a notebook or directory model. - md5: bool, optional - Whether to include the md5 of the file contents. + require_hash: bool, optional + Whether to include the hash of the file contents. Returns ------- @@ -892,11 +915,13 @@ async def get(self, path, content=True, type=None, format=None, md5=False): ) model = await self._dir_model(path, content=content) elif type == "notebook" or (type is None and path.endswith(".ipynb")): - model = await self._notebook_model(path, content=content, md5=md5) + model = await self._notebook_model(path, content=content, require_hash=require_hash) else: if type == "directory": raise web.HTTPError(400, "%s is not a directory" % path, reason="bad type") - model = await self._file_model(path, content=content, format=format, md5=md5) + model = await self._file_model( + path, content=content, format=format, require_hash=require_hash + ) self.emit(data={"action": "get", "path": path}) return model diff --git a/jupyter_server/services/contents/handlers.py b/jupyter_server/services/contents/handlers.py index cc5ac5b8ca..a7c7ffff17 100644 --- a/jupyter_server/services/contents/handlers.py +++ b/jupyter_server/services/contents/handlers.py @@ -23,20 +23,20 @@ AUTH_RESOURCE = "contents" -def _validate_in_or_not(expect_in_model: bool, model: Dict[str, Any], maybe_none_keys: List[str]): +def _validate_keys(expect_defined: bool, model: Dict[str, Any], keys: List[str]): """ - Validate that the keys in maybe_none_keys are None or not None + Validate that the keys are defined (i.e. not None) or not (i.e. None) """ - if expect_in_model: - errors = [key for key in maybe_none_keys if model[key] is None] + if expect_defined: + errors = [key for key in keys if model[key] is None] if errors: raise web.HTTPError( 500, f"Keys unexpectedly None: {errors}", ) else: - errors = {key: model[key] for key in maybe_none_keys if model[key] is not None} # type: ignore[assignment] + errors = {key: model[key] for key in keys if model[key] is not None} # type: ignore[assignment] if errors: raise web.HTTPError( 500, @@ -44,14 +44,14 @@ def _validate_in_or_not(expect_in_model: bool, model: Dict[str, Any], maybe_none ) -def validate_model(model, expect_content, expect_md5): +def validate_model(model, expect_content=False, expect_hash=False): """ Validate a model returned by a ContentsManager method. If expect_content is True, then we expect non-null entries for 'content' and 'format'. - If expect_md5 is True, then we expect non-null entries for 'md5'. + If expect_hash is True, then we expect non-null entries for 'hash' and 'hash_algorithm'. 
""" required_keys = { "name", @@ -63,8 +63,9 @@ def validate_model(model, expect_content, expect_md5): "mimetype", "content", "format", - "md5", } + if expect_hash: + required_keys.update(["hash", "hash_algorithm"]) missing = required_keys - set(model.keys()) if missing: raise web.HTTPError( @@ -73,9 +74,9 @@ def validate_model(model, expect_content, expect_md5): ) content_keys = ["content", "format"] - md5_keys = ["md5"] - _validate_in_or_not(expect_content, model, content_keys) - _validate_in_or_not(expect_md5, model, md5_keys) + _validate_keys(expect_content, model, content_keys) + if expect_hash: + _validate_keys(expect_hash, model, ["hash", "hash_algorithm"]) class ContentsAPIHandler(APIHandler): @@ -136,26 +137,41 @@ async def get(self, path=""): raise web.HTTPError(400, "Content %r is invalid" % content_str) content = int(content_str or "") - md5_str = self.get_query_argument("md5", default="0") - if md5_str not in {"0", "1"}: - raise web.HTTPError(400, "Content %r is invalid" % md5_str) - md5 = int(md5_str or "") + hash_str = self.get_query_argument("hash", default="0") + if hash_str not in {"0", "1"}: + raise web.HTTPError(400, f"Content {hash_str!r} is invalid") + require_hash = int(hash_str) if not cm.allow_hidden and await ensure_async(cm.is_hidden(path)): await self._finish_error( HTTPStatus.NOT_FOUND, f"file or directory {path!r} does not exist" ) + try: - model = await ensure_async( - self.contents_manager.get( - path=path, - type=type, - format=format, - content=content, - md5=md5, + expect_hash = require_hash + try: + model = await ensure_async( + self.contents_manager.get( + path=path, + type=type, + format=format, + content=content, + require_hash=require_hash, + ) ) - ) - validate_model(model, expect_content=content, expect_md5=md5) + except TypeError: + # Fallback for ContentsManager not handling the require_hash argument + # introduced in 2.11 + expect_hash = False + model = await ensure_async( + self.contents_manager.get( + path=path, + type=type, + format=format, + content=content, + ) + ) + validate_model(model, expect_content=content, expect_hash=expect_hash) self._finish_model(model, location=False) except web.HTTPError as exc: # 404 is okay in this context, catch exception and return 404 code to prevent stack trace on client @@ -185,7 +201,7 @@ async def patch(self, path=""): raise web.HTTPError(400, f"Cannot rename file or directory {path!r}") model = await ensure_async(cm.update(model, path)) - validate_model(model, expect_content=False, expect_md5=False) + validate_model(model) self._finish_model(model) async def _copy(self, copy_from, copy_to=None): @@ -198,7 +214,7 @@ async def _copy(self, copy_from, copy_to=None): ) model = await ensure_async(self.contents_manager.copy(copy_from, copy_to)) self.set_status(201) - validate_model(model, expect_content=False, expect_md5=False) + validate_model(model) self._finish_model(model) async def _upload(self, model, path): @@ -206,7 +222,7 @@ async def _upload(self, model, path): self.log.info("Uploading file to %s", path) model = await ensure_async(self.contents_manager.new(model, path)) self.set_status(201) - validate_model(model, expect_content=False, expect_md5=False) + validate_model(model) self._finish_model(model) async def _new_untitled(self, path, type="", ext=""): @@ -216,7 +232,7 @@ async def _new_untitled(self, path, type="", ext=""): self.contents_manager.new_untitled(path=path, type=type, ext=ext) ) self.set_status(201) - validate_model(model, expect_content=False, expect_md5=False) + validate_model(model) 
self._finish_model(model) async def _save(self, model, path): @@ -225,7 +241,7 @@ async def _save(self, model, path): if not chunk or chunk == -1: # Avoid tedious log information self.log.info("Saving file at %s", path) model = await ensure_async(self.contents_manager.save(model, path)) - validate_model(model, expect_content=False, expect_md5=False) + validate_model(model) self._finish_model(model) @web.authenticated diff --git a/jupyter_server/services/contents/manager.py b/jupyter_server/services/contents/manager.py index 94684bb022..b12a2055ec 100644 --- a/jupyter_server/services/contents/manager.py +++ b/jupyter_server/services/contents/manager.py @@ -447,8 +447,16 @@ def exists(self, path): """ return self.file_exists(path) or self.dir_exists(path) - def get(self, path, content=True, type=None, format=None, md5=False): - """Get a file or directory model.""" + def get(self, path, content=True, type=None, format=None, require_hash=False): + """Get a file or directory model. + + Parameters + ---------- + require_hash : bool + Whether the file hash must be returned or not. + + *Changed in version 2.11*: The *require_hash* parameter was added. + """ raise NotImplementedError def save(self, model, path): @@ -849,8 +857,16 @@ async def exists(self, path): self.dir_exists(path) ) - async def get(self, path, content=True, type=None, format=None): - """Get a file or directory model.""" + async def get(self, path, content=True, type=None, format=None, require_hash=False): + """Get a file or directory model. + + Parameters + ---------- + require_hash : bool + Whether the file hash must be returned or not. + + *Changed in version 2.11*: The *require_hash* parameter was added. + """ raise NotImplementedError async def save(self, model, path): diff --git a/tests/services/contents/test_api.py b/tests/services/contents/test_api.py index eb93fd7526..b74ee8f62a 100644 --- a/tests/services/contents/test_api.py +++ b/tests/services/contents/test_api.py @@ -97,21 +97,24 @@ async def test_get_nb_contents(jp_fetch, contents, path, name): assert model["path"] == nbpath assert model["type"] == "notebook" assert "content" in model + assert model["hash"] is None + assert model["hash_algorithm"] is None assert model["format"] == "json" assert "metadata" in model["content"] assert isinstance(model["content"]["metadata"], dict) @pytest.mark.parametrize("path,name", dirs) -async def test_get_nb_md5(jp_fetch, contents, path, name): +async def test_get_nb_hash(jp_fetch, contents, path, name): nbname = name + ".ipynb" nbpath = (path + "/" + nbname).lstrip("/") - r = await jp_fetch("api", "contents", nbpath, method="GET", params=dict(md5="1")) + r = await jp_fetch("api", "contents", nbpath, method="GET", params=dict(hash="1")) model = json.loads(r.body.decode()) assert model["name"] == nbname assert model["path"] == nbpath assert model["type"] == "notebook" - assert "md5" in model + assert model["hash"] + assert model["hash_algorithm"] assert "metadata" in model["content"] assert isinstance(model["content"]["metadata"], dict) @@ -125,6 +128,9 @@ async def test_get_nb_no_contents(jp_fetch, contents, path, name): assert model["name"] == nbname assert model["path"] == nbpath assert model["type"] == "notebook" + assert "hash" in model + assert model["hash"] == None + assert "hash_algorithm" in model assert "content" in model assert model["content"] is None @@ -175,6 +181,9 @@ async def test_get_text_file_contents(jp_fetch, contents, path, name): model = json.loads(r.body.decode()) assert model["name"] == txtname assert 
model["path"] == txtpath + assert "hash" in model + assert model["hash"] == None + assert "hash_algorithm" in model assert "content" in model assert model["format"] == "text" assert model["type"] == "file" @@ -201,14 +210,16 @@ async def test_get_text_file_contents(jp_fetch, contents, path, name): @pytest.mark.parametrize("path,name", dirs) -async def test_get_text_file_md5(jp_fetch, contents, path, name): +async def test_get_text_file_hash(jp_fetch, contents, path, name): txtname = name + ".txt" txtpath = (path + "/" + txtname).lstrip("/") - r = await jp_fetch("api", "contents", txtpath, method="GET", params=dict(md5="1")) + r = await jp_fetch("api", "contents", txtpath, method="GET", params=dict(hash="1")) model = json.loads(r.body.decode()) assert model["name"] == txtname assert model["path"] == txtpath - assert "md5" in model + assert "hash" in model + assert model["hash"] + assert model["hash_algorithm"] assert model["format"] == "text" assert model["type"] == "file" @@ -253,6 +264,9 @@ async def test_get_binary_file_contents(jp_fetch, contents, path, name): assert model["name"] == blobname assert model["path"] == blobpath assert "content" in model + assert "hash" in model + assert model["hash"] == None + assert "hash_algorithm" in model assert model["format"] == "base64" assert model["type"] == "file" data_out = decodebytes(model["content"].encode("ascii")) diff --git a/tests/services/contents/test_fileio.py b/tests/services/contents/test_fileio.py index 19060db94a..12752ee810 100644 --- a/tests/services/contents/test_fileio.py +++ b/tests/services/contents/test_fileio.py @@ -137,13 +137,16 @@ def test_path_to_invalid(tmpdir): @pytest.mark.skipif(os.name == "nt", reason="test fails on Windows") -def test_file_manager_mixin(tmpdir): +def test_file_manager_mixin(tmp_path): mixin = FileManagerMixin() mixin.log = logging.getLogger() - bad_content = tmpdir / "bad_content.ipynb" + bad_content = tmp_path / "bad_content.ipynb" bad_content.write_text("{}", "utf8") - # Same as `echo -n {} | md5sum` - assert mixin._get_md5(bad_content) == "99914b932bd37a50b983c5e7c90ae93b" + # Same as `echo -n {} | sha256sum` + assert mixin._get_hash(bad_content.read_bytes()) == { + "hash": "44136fa355b3678a1146ad16f7e8649e94fb4fc21fe77e8310c060f61caaff8a", + "hash_algorithm": "sha256", + } with pytest.raises(HTTPError): mixin._read_notebook(bad_content) other = path_to_intermediate(bad_content) @@ -154,10 +157,10 @@ def test_file_manager_mixin(tmpdir): validate(nb) with pytest.raises(HTTPError): - mixin._read_file(tmpdir, "text") + mixin._read_file(tmp_path, "text") with pytest.raises(HTTPError): - mixin._save_file(tmpdir / "foo", "foo", "bar") + mixin._save_file(tmp_path / "foo", "foo", "bar") @pytest.mark.skipif(os.name == "nt", reason="test fails on Windows") @@ -166,15 +169,18 @@ async def test_async_file_manager_mixin(tmpdir): mixin.log = logging.getLogger() bad_content = tmpdir / "bad_content.ipynb" bad_content.write_text("{}", "utf8") - # Same as `echo -n {} | md5sum` - assert await mixin._get_md5(bad_content) == "99914b932bd37a50b983c5e7c90ae93b" with pytest.raises(HTTPError): await mixin._read_notebook(bad_content) other = path_to_intermediate(bad_content) with open(other, "w") as fid: json.dump(new_notebook(), fid) mixin.use_atomic_writing = True - nb = await mixin._read_notebook(bad_content) + nb, bcontent = await mixin._read_notebook(bad_content, raw=True) + # Same as `echo -n {} | sha256sum` + assert mixin._get_hash(bcontent) == { + "hash": 
"4747f9680816e352a697d0fb69d82334457cdd1e46f053e800859833d3e6003e", + "hash_algorithm": "sha256", + } validate(nb) with pytest.raises(HTTPError): @@ -182,3 +188,30 @@ async def test_async_file_manager_mixin(tmpdir): with pytest.raises(HTTPError): await mixin._save_file(tmpdir / "foo", "foo", "bar") + + +async def test_AsyncFileManagerMixin_read_notebook_no_raw(tmpdir): + mixin = AsyncFileManagerMixin() + mixin.log = logging.getLogger() + bad_content = tmpdir / "bad_content.ipynb" + bad_content.write_text("{}", "utf8") + + other = path_to_intermediate(bad_content) + with open(other, "w") as fid: + json.dump(new_notebook(), fid) + mixin.use_atomic_writing = True + answer = await mixin._read_notebook(bad_content) + + assert not isinstance(answer, tuple) + + +async def test_AsyncFileManagerMixin_read_file_no_raw(tmpdir): + mixin = AsyncFileManagerMixin() + mixin.log = logging.getLogger() + file_path = tmpdir / "bad_content.text" + file_path.write_text("blablabla", "utf8") + + mixin.use_atomic_writing = True + answer = await mixin._read_file(file_path, "text") + + assert len(answer) == 2 diff --git a/tests/services/contents/test_manager.py b/tests/services/contents/test_manager.py index 3d11a43ad0..e718036b0b 100644 --- a/tests/services/contents/test_manager.py +++ b/tests/services/contents/test_manager.py @@ -571,8 +571,16 @@ async def test_get(jp_contents_manager): nb_as_bin_file = await ensure_async(cm.get(path, content=True, type="file", format="base64")) assert nb_as_bin_file["format"] == "base64" - nb_with_md5 = await ensure_async(cm.get(path, md5=True)) - assert nb_with_md5["md5"] + nb_with_hash = await ensure_async(cm.get(path, require_hash=True)) + assert nb_with_hash["hash"] + assert nb_with_hash["hash_algorithm"] + + # Get the hash without the content + nb_with_hash = await ensure_async(cm.get(path, content=False, require_hash=True)) + assert nb_with_hash["content"] is None + assert nb_with_hash["format"] is None + assert nb_with_hash["hash"] + assert nb_with_hash["hash_algorithm"] # Test in sub-directory sub_dir = "/foo/" @@ -588,7 +596,7 @@ async def test_get(jp_contents_manager): # Test with a regular file. file_model_path = (await ensure_async(cm.new_untitled(path=sub_dir, ext=".txt")))["path"] - file_model = await ensure_async(cm.get(file_model_path, md5=True)) + file_model = await ensure_async(cm.get(file_model_path, require_hash=True)) expected_model = { "content": "", "format": "text", @@ -597,13 +605,34 @@ async def test_get(jp_contents_manager): "path": "foo/untitled.txt", "type": "file", "writable": True, + "hash_algorithm": cm.hash_algorithm, } # Assert expected model is in file_model for key, value in expected_model.items(): assert file_model[key] == value assert "created" in file_model assert "last_modified" in file_model - assert "md5" in file_model + assert file_model["hash"] + + # Get hash without content + file_model = await ensure_async(cm.get(file_model_path, content=False, require_hash=True)) + expected_model = { + "content": None, + "format": None, + "mimetype": "text/plain", + "name": "untitled.txt", + "path": "foo/untitled.txt", + "type": "file", + "writable": True, + "hash_algorithm": cm.hash_algorithm, + } + + # Assert expected model is in file_model + for key, value in expected_model.items(): + assert file_model[key] == value + assert "created" in file_model + assert "last_modified" in file_model + assert file_model["hash"] # Create a sub-sub directory to test getting directory contents with a # subdir. 
diff --git a/tests/services/contents/test_manager_no_hash.py b/tests/services/contents/test_manager_no_hash.py
new file mode 100644
index 0000000000..511a8d319b
--- /dev/null
+++ b/tests/services/contents/test_manager_no_hash.py
@@ -0,0 +1,44 @@
+import json
+
+import pytest
+
+from jupyter_server.services.contents.filemanager import (
+    AsyncFileContentsManager,
+)
+
+
+class NoHashFileManager(AsyncFileContentsManager):
+    """FileManager as it was prior to 2.11, which introduced the ability to request the file hash."""
+
+    def _base_model(self, path):
+        """Drop the new attributes from the model."""
+        model = super()._base_model(path)
+
+        del model["hash"]
+        del model["hash_algorithm"]
+
+        return model
+
+    async def get(self, path, content=True, type=None, format=None):
+        """Get without the new `require_hash` argument."""
+        model = await super().get(path, content=content, type=type, format=format)
+        return model
+
+
+@pytest.fixture
+def jp_server_config(jp_server_config):
+    jp_server_config["ServerApp"]["contents_manager_class"] = NoHashFileManager
+    return jp_server_config
+
+
+async def test_manager_no_hash_support(tmp_path, jp_root_dir, jp_fetch):
+    # Create some content
+    path = "dummy.txt"
+    (jp_root_dir / path).write_text("blablabla", encoding="utf-8")
+
+    response = await jp_fetch("api", "contents", path, method="GET", params=dict(hash="1"))
+
+    model = json.loads(response.body)
+
+    assert "hash" not in model
+    assert "hash_algorithm" not in model
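
For reviewers trying the change out, the sketch below mirrors what this patch describes: the contents service hashes the raw file bytes with ``hashlib.new(hash_algorithm)`` (default ``sha256``) and returns ``hash``/``hash_algorithm`` when the ``hash=1`` query argument (or ``require_hash=True`` on ``ContentsManager.get``) is used. This is an illustrative sketch, not part of the patch; the server URL, token, and notebook path are placeholders you would adapt to your own setup.

```python
# Minimal sketch (not part of the patch): recompute the server-side hash locally
# and compare it with what GET /api/contents/<path>?hash=1 returns.
import hashlib
import json
from urllib.request import Request, urlopen

HASH_ALGORITHM = "sha256"  # default value of FileManagerMixin.hash_algorithm in this patch


def file_hash(path: str, algorithm: str = HASH_ALGORITHM) -> dict:
    """Hexdigest of the raw file bytes, shaped like the new model keys."""
    h = hashlib.new(algorithm)
    with open(path, "rb") as f:
        h.update(f.read())
    return {"hash": h.hexdigest(), "hash_algorithm": algorithm}


# Placeholder connection details -- adjust to your running server.
BASE_URL = "http://localhost:8888"
TOKEN = "<your-token>"
NB_PATH = "foo/a.ipynb"

req = Request(
    f"{BASE_URL}/api/contents/{NB_PATH}?hash=1",
    headers={"Authorization": f"token {TOKEN}"},
)
with urlopen(req) as resp:
    model = json.loads(resp.read())

# Contents managers that do not support hashing may omit or null these keys
# (the handler also falls back for managers without the require_hash argument),
# so treat them as optional.
if model.get("hash") is not None:
    # Comparison assumes the server root is the current working directory.
    assert model["hash"] == file_hash(NB_PATH)["hash"]
    assert model["hash_algorithm"] == HASH_ALGORITHM
```

The same check can be done through ``ContentsManager.get(path, require_hash=True)`` directly, which is what the new tests in this patch exercise.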