diff --git a/api/specs/storage/openapi.yaml b/api/specs/storage/openapi.yaml index cda9cd6e2c8..960093c9077 100644 --- a/api/specs/storage/openapi.yaml +++ b/api/specs/storage/openapi.yaml @@ -243,9 +243,11 @@ paths: $ref: "#/components/schemas/FileMetaEnvelope" default: $ref: "#/components/responses/DefaultErrorResponse" - patch: - summary: Update file metadata - operationId: update_file_meta_data + + /locations/{location_id}/files/{file_id}:abort: + post: + summary: Asks the server to abort the upload and revert to the last valid version if any + operationId: abort_upload_file parameters: - name: file_id in: path @@ -263,12 +265,8 @@ paths: schema: type: string responses: - "200": - description: "Returns file metadata" - content: - application/json: - schema: - $ref: "#/components/schemas/FileMetaEnvelope" + "204": + description: Abort OK default: $ref: "#/components/responses/DefaultErrorResponse" @@ -346,6 +344,12 @@ paths: application/json: schema: $ref: "#/components/schemas/PresignedLinkEnveloped" + links: + AbortUpload: + operationId: abort_upload_file + parameters: + path.location_id: "$request.path.location_id" + path.file_id: "$request.path.file_id" default: $ref: "#/components/responses/DefaultErrorResponse" delete: diff --git a/packages/models-library/src/models_library/api_schemas_storage.py b/packages/models-library/src/models_library/api_schemas_storage.py index efbe3202f5d..f51a703ea73 100644 --- a/packages/models-library/src/models_library/api_schemas_storage.py +++ b/packages/models-library/src/models_library/api_schemas_storage.py @@ -8,11 +8,26 @@ import re from datetime import datetime -from typing import List, Optional, Pattern, Union +from enum import Enum +from typing import Any, Optional, Pattern, Union from uuid import UUID -from models_library.projects_nodes_io import LocationID, LocationName, StorageFileID -from pydantic import BaseModel, ByteSize, ConstrainedStr, Extra, Field, validator +from models_library.projects_nodes_io import ( + LocationID, + LocationName, + NodeID, + SimcoreS3FileID, + StorageFileID, +) +from pydantic import ( + BaseModel, + ByteSize, + ConstrainedStr, + Extra, + Field, + root_validator, + validator, +) from pydantic.networks import AnyUrl from .basic_regex import DATCORE_DATASET_NAME_RE, S3_BUCKET_NAME_RE @@ -127,8 +142,8 @@ class FileMetaDataGet(BaseModel): @validator("location_id", pre=True) @classmethod - def convert_from_str(cls, v): - if isinstance(v, str): + def ensure_location_is_integer(cls, v): + if v is not None: return int(v) return v @@ -198,14 +213,46 @@ class Config: class FileMetaDataArray(BaseModel): - __root__: List[FileMetaDataGet] = [] + __root__: list[FileMetaDataGet] = [] # /locations/{location_id}/files/{file_id} +class LinkType(str, Enum): + PRESIGNED = "PRESIGNED" + S3 = "S3" + + class PresignedLink(BaseModel): link: AnyUrl # /simcore-s3/ + + +class FoldersBody(BaseModel): + source: dict[str, Any] = Field(default_factory=dict) + destination: dict[str, Any] = Field(default_factory=dict) + nodes_map: dict[NodeID, NodeID] = Field(default_factory=dict) + + @root_validator() + @classmethod + def ensure_consistent_entries(cls, values): + source_node_keys = ( + NodeID(n) for n in values["source"].get("workbench", {}).keys() + ) + if set(source_node_keys) != set(values["nodes_map"].keys()): + raise ValueError("source project nodes do not fit with nodes_map entries") + destination_node_keys = ( + NodeID(n) for n in values["destination"].get("workbench", {}).keys() + ) + if set(destination_node_keys) != 
set(values["nodes_map"].values()): + raise ValueError( + "destination project nodes do not fit with nodes_map values" + ) + return values + + +class SoftCopyBody(BaseModel): + link_id: SimcoreS3FileID diff --git a/packages/models-library/src/models_library/basic_types.py b/packages/models-library/src/models_library/basic_types.py index f818d190382..89e8fd68c91 100644 --- a/packages/models-library/src/models_library/basic_types.py +++ b/packages/models-library/src/models_library/basic_types.py @@ -2,7 +2,7 @@ from pydantic import conint, constr -from .basic_regex import UUID_RE_BASE, VERSION_RE +from .basic_regex import UUID_RE, VERSION_RE # port number range PortInt = conint(gt=0, lt=65535) @@ -21,7 +21,7 @@ EnvVarKey = constr(regex=r"[a-zA-Z][a-azA-Z0-9_]*") # e.g. '5c833a78-1af3-43a7-9ed7-6a63b188f4d8' -UUIDStr = constr(regex=UUID_RE_BASE) +UUIDStr = constr(regex=UUID_RE) class LogLevel(str, Enum): diff --git a/packages/models-library/src/models_library/projects.py b/packages/models-library/src/models_library/projects.py index 7fe93ee08cb..581f6e30a41 100644 --- a/packages/models-library/src/models_library/projects.py +++ b/packages/models-library/src/models_library/projects.py @@ -4,7 +4,7 @@ from copy import deepcopy from datetime import datetime from enum import Enum -from typing import Any, Dict, List, Optional +from typing import Any, Optional from uuid import UUID from pydantic import BaseModel, EmailStr, Extra, Field, HttpUrl, constr, validator @@ -22,7 +22,7 @@ ClassifierID = str # TODO: for some reason class Workbench(BaseModel): __root__= does not work as I thought ... investigate! -Workbench = Dict[NodeIDStr, Node] +Workbench = dict[NodeIDStr, Node] # NOTE: careful this is in sync with packages/postgres-database/src/simcore_postgres_database/models/projects.py!!! 
@@ -97,6 +97,7 @@ def convert_sql_alchemy_enum(cls, v): class Config: orm_mode = True use_enum_values = True + allow_population_by_field_name = True class Project(BaseProjectModel): @@ -121,15 +122,15 @@ class Project(BaseProjectModel): examples=["2018-07-01T11:13:43Z"], alias="lastChangeDate", ) - access_rights: Dict[GroupIDStr, AccessRights] = Field( + access_rights: dict[GroupIDStr, AccessRights] = Field( ..., description="object containing the GroupID as key and read/write/execution permissions as value", alias="accessRights", ) # Classification - tags: Optional[List[int]] = [] - classifiers: Optional[List[ClassifierID]] = Field( + tags: Optional[list[int]] = [] + classifiers: Optional[list[ClassifierID]] = Field( default_factory=list, description="Contains the reference to the project classifiers", examples=["some:id:to:a:classifier"], @@ -142,12 +143,12 @@ class Project(BaseProjectModel): ui: Optional[StudyUI] = None # Quality - quality: Dict[str, Any] = Field( + quality: dict[str, Any] = Field( {}, description="stores the study quality assessment" ) # Dev only - dev: Optional[Dict] = Field(description="object used for development purposes only") + dev: Optional[dict] = Field(description="object used for development purposes only") class Config: description = "Document that stores metadata, pipeline and UI setup of a study" @@ -155,7 +156,7 @@ class Config: extra = Extra.forbid @staticmethod - def schema_extra(schema: Dict, _model: "Project"): + def schema_extra(schema: dict, _model: "Project"): # pylint: disable=unsubscriptable-object # Patch to allow jsonschema nullable diff --git a/packages/models-library/src/models_library/projects_nodes_io.py b/packages/models-library/src/models_library/projects_nodes_io.py index f763ec98184..ade9cae45a7 100644 --- a/packages/models-library/src/models_library/projects_nodes_io.py +++ b/packages/models-library/src/models_library/projects_nodes_io.py @@ -7,9 +7,8 @@ """ import re -from enum import IntEnum from pathlib import Path -from typing import Literal, Optional, Pattern, Union +from typing import Optional, Pattern, Union from uuid import UUID from pydantic import AnyUrl, BaseModel, ConstrainedStr, Extra, Field, validator @@ -24,14 +23,8 @@ class NodeIDStr(ConstrainedStr): regex: Optional[Pattern[str]] = re.compile(UUID_RE) -# NOTE: this trick is used to keep backward compatility simcore.s3 is not a valid python variable name -Location = IntEnum( - value="Location", - names=[("simcore.s3", 0), ("SIMCORE_S3", 0), ("datcore", 1), ("DATCORE", 1)], -) - -LocationID = Union[Literal[0], Literal[1]] -LocationName = Union[Literal["simcore.s3"], Literal["datcore"]] +LocationID = int +LocationName = str class SimcoreS3FileID(ConstrainedStr): diff --git a/packages/models-library/tests/test_basic_regex.py b/packages/models-library/tests/test_basic_regex.py index 1244f483288..323108602ae 100644 --- a/packages/models-library/tests/test_basic_regex.py +++ b/packages/models-library/tests/test_basic_regex.py @@ -12,7 +12,7 @@ from models_library.basic_regex import ( DATE_RE, PUBLIC_VARIABLE_NAME_RE, - UUID_RE_BASE, + UUID_RE, VERSION_RE, ) from packaging.version import Version @@ -30,6 +30,7 @@ def assert_match_and_get_capture(regex_str, test_str, expected) -> Optional[Sequ assert match is not None print(regex_str, "captured:", match.group(), "->", match.groups()) else: + assert match captured = match.groups() assert captured == expected return captured @@ -56,7 +57,7 @@ def test_VERSION_RE(version_str, expected): ], ) def test_UUID_RE(uuid_str, expected): - 
assert_match_and_get_capture(UUID_RE_BASE, uuid_str, expected) + assert_match_and_get_capture(UUID_RE, uuid_str, expected) class webserver_timedate_utils: diff --git a/packages/postgres-database/src/simcore_postgres_database/migration/versions/2cc556e5c52d_file_meta_data_remove_unused_columns_.py b/packages/postgres-database/src/simcore_postgres_database/migration/versions/2cc556e5c52d_file_meta_data_remove_unused_columns_.py new file mode 100644 index 00000000000..c1a528136e5 --- /dev/null +++ b/packages/postgres-database/src/simcore_postgres_database/migration/versions/2cc556e5c52d_file_meta_data_remove_unused_columns_.py @@ -0,0 +1,87 @@ +"""file_meta_data: remove unused columns, add expiration of upload + +Revision ID: 2cc556e5c52d +Revises: cf3bac482ce0 +Create Date: 2022-06-26 19:12:13.478593+00:00 + +""" +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "2cc556e5c52d" +down_revision = "cf3bac482ce0" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column( + "file_meta_data", sa.Column("upload_expires_at", sa.DateTime(), nullable=True) + ) + op.alter_column( + "file_meta_data", "file_id", existing_type=sa.VARCHAR(), nullable=False + ) + op.drop_column("file_meta_data", "display_file_path") + op.drop_column("file_meta_data", "node_name") + op.drop_column("file_meta_data", "raw_file_path") + op.drop_column("file_meta_data", "project_name") + op.drop_column("file_meta_data", "file_name") + op.drop_column("file_meta_data", "file_uuid") + op.drop_column("file_meta_data", "user_name") + # ### end Alembic commands ### + op.create_primary_key("pk_file_meta_data", "file_meta_data", ["file_id"]) + + +def downgrade(): + op.drop_constraint("pk_file_meta_data", "file_meta_data", "primary") + # ### commands auto generated by Alembic - please adjust! 
### + op.add_column( + "file_meta_data", + sa.Column("user_name", sa.VARCHAR(), autoincrement=False, nullable=True), + ) + op.add_column( + "file_meta_data", + sa.Column("file_uuid", sa.VARCHAR(), autoincrement=False, nullable=True), + ) + op.add_column( + "file_meta_data", + sa.Column("file_name", sa.VARCHAR(), autoincrement=False, nullable=True), + ) + op.add_column( + "file_meta_data", + sa.Column("project_name", sa.VARCHAR(), autoincrement=False, nullable=True), + ) + op.add_column( + "file_meta_data", + sa.Column("raw_file_path", sa.VARCHAR(), autoincrement=False, nullable=True), + ) + op.add_column( + "file_meta_data", + sa.Column("node_name", sa.VARCHAR(), autoincrement=False, nullable=True), + ) + op.add_column( + "file_meta_data", + sa.Column( + "display_file_path", sa.VARCHAR(), autoincrement=False, nullable=True + ), + ) + op.alter_column( + "file_meta_data", "file_id", existing_type=sa.VARCHAR(), nullable=True + ) + op.drop_column("file_meta_data", "upload_expires_at") + # ### end Alembic commands ### + conn = op.get_bind() + for row in conn.execute(sa.DDL("SELECT file_id FROM file_meta_data")): + file_id = row["file_id"] + conn.execute( + sa.DDL( + f""" +UPDATE file_meta_data + SET file_uuid = '{file_id}' + WHERE file_id = '{file_id}' + """ + ) + ) + op.create_primary_key("pk_file_meta_data", "file_meta_data", ["file_uuid"]) diff --git a/packages/postgres-database/src/simcore_postgres_database/models/file_meta_data.py b/packages/postgres-database/src/simcore_postgres_database/models/file_meta_data.py index 3a3e6cf21a1..471c25b7d94 100644 --- a/packages/postgres-database/src/simcore_postgres_database/models/file_meta_data.py +++ b/packages/postgres-database/src/simcore_postgres_database/models/file_meta_data.py @@ -5,21 +5,14 @@ file_meta_data = sa.Table( "file_meta_data", metadata, - sa.Column("file_uuid", sa.String(), primary_key=True), sa.Column("location_id", sa.String()), sa.Column("location", sa.String()), sa.Column("bucket_name", sa.String()), sa.Column("object_name", sa.String()), sa.Column("project_id", sa.String()), - sa.Column("project_name", sa.String()), sa.Column("node_id", sa.String()), - sa.Column("node_name", sa.String()), - sa.Column("file_name", sa.String()), sa.Column("user_id", sa.String()), - sa.Column("user_name", sa.String()), - sa.Column("file_id", sa.String()), - sa.Column("raw_file_path", sa.String()), - sa.Column("display_file_path", sa.String()), + sa.Column("file_id", sa.String(), primary_key=True), sa.Column("created_at", sa.String()), sa.Column("last_modified", sa.String()), sa.Column("file_size", sa.BigInteger()), @@ -38,4 +31,7 @@ doc="If true, this file is a soft link." "i.e. 
is another entry with the same object_name", ), + sa.Column( + "upload_expires_at", sa.DateTime(), nullable=True, doc="Timestamp of expiration" + ), ) diff --git a/packages/pytest-simcore/src/pytest_simcore/aioresponses_mocker.py b/packages/pytest-simcore/src/pytest_simcore/aioresponses_mocker.py index 3f21155fdac..54fa945004f 100644 --- a/packages/pytest-simcore/src/pytest_simcore/aioresponses_mocker.py +++ b/packages/pytest-simcore/src/pytest_simcore/aioresponses_mocker.py @@ -1,7 +1,13 @@ import pytest from aioresponses import aioresponses as AioResponsesMock -PASSTHROUGH_REQUESTS_PREFIXES = ["http://127.0.0.1", "ws://"] +from .helpers.utils_docker import get_localhost_ip + +PASSTHROUGH_REQUESTS_PREFIXES = [ + "http://127.0.0.1", + "ws://", + f"http://{get_localhost_ip()}", +] @pytest.fixture diff --git a/packages/pytest-simcore/src/pytest_simcore/helpers/utils_parametrizations.py b/packages/pytest-simcore/src/pytest_simcore/helpers/utils_parametrizations.py new file mode 100644 index 00000000000..8b0b539eb58 --- /dev/null +++ b/packages/pytest-simcore/src/pytest_simcore/helpers/utils_parametrizations.py @@ -0,0 +1,9 @@ +from typing import Optional + +from pydantic import ByteSize + + +def byte_size_ids(val) -> Optional[str]: + if isinstance(val, ByteSize): + return val.human_readable() + return None diff --git a/packages/pytest-simcore/src/pytest_simcore/services_api_mocks_for_aiohttp_clients.py b/packages/pytest-simcore/src/pytest_simcore/services_api_mocks_for_aiohttp_clients.py index 309e50c5912..3c588d1a921 100644 --- a/packages/pytest-simcore/src/pytest_simcore/services_api_mocks_for_aiohttp_clients.py +++ b/packages/pytest-simcore/src/pytest_simcore/services_api_mocks_for_aiohttp_clients.py @@ -359,6 +359,7 @@ def get_download_link_cb(url: URL, **kwargs) -> CallbackResult: get_file_metadata_pattern, status=web.HTTPOk.status_code, payload={"data": FileMetaDataGet.Config.schema_extra["examples"][0]}, + repeat=True, ) aioresponses_mocker.get( get_download_link_pattern, callback=get_download_link_cb, repeat=True @@ -370,6 +371,7 @@ def get_download_link_cb(url: URL, **kwargs) -> CallbackResult: get_locations_link_pattern, status=web.HTTPOk.status_code, payload={"data": [{"name": "simcore.s3", "id": 0}]}, + repeat=True, ) return aioresponses_mocker diff --git a/packages/pytest-simcore/src/pytest_simcore/simcore_storage_service.py b/packages/pytest-simcore/src/pytest_simcore/simcore_storage_service.py index 9e594d15eb5..755455db40b 100644 --- a/packages/pytest-simcore/src/pytest_simcore/simcore_storage_service.py +++ b/packages/pytest-simcore/src/pytest_simcore/simcore_storage_service.py @@ -61,7 +61,7 @@ async def wait_till_storage_responsive(storage_endpoint: URL): @pytest.fixture -def create_file_uuid() -> Callable[[ProjectID, NodeID, str], SimcoreS3FileID]: +def create_simcore_file_id() -> Callable[[ProjectID, NodeID, str], SimcoreS3FileID]: def _creator( project_id: ProjectID, node_id: NodeID, file_name: str ) -> SimcoreS3FileID: diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/dbmanager.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/dbmanager.py index 46e66a87d00..64a48031f2a 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/dbmanager.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/dbmanager.py @@ -41,7 +41,7 @@ async def _get_node_from_db( ) if result.rowcount > 1: log.error("the node id %s is not unique", node_uuid) - node: RowProxy = await result.fetchone() + node: Optional[RowProxy] = await result.first() if 
not node: log.error("the node id %s was not found", node_uuid) raise NodeNotFound(node_uuid) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/filemanager.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/filemanager.py index 0fabc84f68e..669997a2283 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/filemanager.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/filemanager.py @@ -3,7 +3,7 @@ # pylint: disable=too-many-arguments import logging from pathlib import Path -from typing import Optional, Tuple +from typing import Optional import aiofiles from aiohttp import ClientError, ClientPayloadError, ClientSession @@ -16,7 +16,6 @@ from yarl import URL from ..node_ports_common.client_session_manager import ClientSessionContextManager -from ..node_ports_common.storage_client import update_file_meta_data from . import exceptions, storage_client from .constants import SIMCORE_LOCATION from .r_clone import RCloneFailedError, is_r_clone_available, sync_local_to_s3 @@ -132,12 +131,12 @@ async def _upload_file_to_link( if resp.status > 299: response_text = await resp.text() raise exceptions.S3TransferError( - "Could not upload file {}:{}".format(file_path, response_text) + f"Could not upload file {file_path}:{response_text}" ) if resp.status != 200: response_text = await resp.text() raise exceptions.S3TransferError( - "Issue when uploading file {}:{}".format(file_path, response_text) + f"Issue when uploading file {file_path}:{response_text}" ) # get the S3 etag from the headers @@ -185,7 +184,7 @@ async def get_upload_link_from_s3( s3_object: StorageFileID, link_type: storage_client.LinkType, client_session: Optional[ClientSession] = None, -) -> Tuple[LocationID, URL]: +) -> tuple[LocationID, URL]: if store_name is None and store_id is None: raise exceptions.NodeportsException(msg="both store name and store id are None") @@ -277,7 +276,7 @@ async def upload_file( local_file_path: Path, client_session: Optional[ClientSession] = None, r_clone_settings: Optional[RCloneSettings] = None, -) -> Tuple[LocationID, ETag]: +) -> tuple[LocationID, ETag]: """Uploads a file to S3 :param session: add app[APP_CLIENT_SESSION_KEY] session here otherwise default is opened/closed every call @@ -324,7 +323,9 @@ async def upload_file( ) else: try: - await _upload_file_to_link(session, upload_link, local_file_path) + e_tag = await _upload_file_to_link( + session, upload_link, local_file_path + ) except exceptions.S3TransferError as err: await delete_file( user_id=user_id, @@ -334,8 +335,9 @@ async def upload_file( ) raise err - e_tag = await update_file_meta_data( - session=session, s3_object=s3_object, user_id=user_id + # NOTE: this is not strictly necessary, only for RClone that does not retrieve the ETag + store_id, e_tag = await get_file_metadata( + user_id, store_id, s3_object, session ) except (RCloneFailedError, exceptions.S3TransferError) as exc: log.error("The upload failed with an unexpected error:", exc_info=True) @@ -377,7 +379,7 @@ async def get_file_metadata( store_id: LocationID, s3_object: StorageFileID, client_session: Optional[ClientSession] = None, -) -> Tuple[LocationID, ETag]: +) -> tuple[LocationID, ETag]: """ :raises S3InvalidPathError """ diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/storage_client.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/storage_client.py index 7a5c6e7a0f5..1be000c361f 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/storage_client.py +++ 
b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/storage_client.py @@ -7,7 +7,6 @@ from aiohttp import ClientSession, web from aiohttp.client_exceptions import ClientConnectionError, ClientResponseError from models_library.api_schemas_storage import ( - ETag, FileLocationArray, FileMetaDataGet, LocationID, @@ -144,17 +143,22 @@ async def get_file_metadata( raise exceptions.StorageInvalidCall( f"invalid call: user_id '{user_id}', location_id '{location_id}', file_id '{file_id}' are not allowed to be empty", ) - async with session.get( - f"{_base_url()}/locations/{location_id}/files/{quote(file_id, safe='')}/metadata", - params={"user_id": f"{user_id}"}, - ) as response: - response.raise_for_status() - file_metadata_enveloped = Envelope[FileMetaDataGet].parse_obj( - await response.json() - ) - if file_metadata_enveloped.data is None: - raise exceptions.S3InvalidPathError(file_id) - return file_metadata_enveloped.data + try: + async with session.get( + f"{_base_url()}/locations/{location_id}/files/{quote(file_id, safe='')}/metadata", + params={"user_id": f"{user_id}"}, + ) as response: + response.raise_for_status() + file_metadata_enveloped = Envelope[FileMetaDataGet].parse_obj( + await response.json() + ) + if file_metadata_enveloped.data is None: + raise exceptions.S3InvalidPathError(file_id) + return file_metadata_enveloped.data + except ClientResponseError as err: + if err.status == web.HTTPNotFound.status_code: + raise exceptions.S3InvalidPathError(file_id) from err + raise @handle_client_exception @@ -173,18 +177,3 @@ async def delete_file( params={"user_id": f"{user_id}"}, ) as response: response.raise_for_status() - - -@handle_client_exception -async def update_file_meta_data( - session: ClientSession, s3_object: StorageFileID, user_id: UserID -) -> ETag: - url = f"{_base_url()}/locations/0/files/{quote(s3_object, safe='')}/metadata" - result = await session.patch(url, params=dict(user_id=user_id)) - if result.status != web.HTTPOk.status_code: - raise exceptions.StorageInvalidCall( - f"Could not fetch metadata: status={result.status} {await result.text()}" - ) - - response = await result.json() - return response["data"]["entity_tag"] diff --git a/packages/simcore-sdk/tests/integration/test_node_ports_common_filemanager.py b/packages/simcore-sdk/tests/integration/test_node_ports_common_filemanager.py index b674eb04d98..50097168b14 100644 --- a/packages/simcore-sdk/tests/integration/test_node_ports_common_filemanager.py +++ b/packages/simcore-sdk/tests/integration/test_node_ports_common_filemanager.py @@ -235,7 +235,7 @@ async def test_invalid_file_path( ) download_folder = Path(tmpdir) / "downloads" - with pytest.raises(exceptions.InvalidDownloadLinkError): + with pytest.raises(exceptions.S3InvalidPathError): await filemanager.download_file_from_s3( user_id=user_id, store_id=store, diff --git a/packages/simcore-sdk/tests/unit/test_storage_client.py b/packages/simcore-sdk/tests/unit/test_storage_client.py index db0d80fc891..be0ca21f325 100644 --- a/packages/simcore-sdk/tests/unit/test_storage_client.py +++ b/packages/simcore-sdk/tests/unit/test_storage_client.py @@ -3,11 +3,13 @@ # pylint:disable=redefined-outer-name # pylint:disable=too-many-arguments +import re from typing import Any, Awaitable, Callable from uuid import uuid4 import aiohttp import pytest +from aiohttp import web from aioresponses import aioresponses as AioResponsesMock from models_library.api_schemas_storage import ( FileLocationArray, @@ -117,6 +119,44 @@ async def test_get_file_metada( ) 
+@pytest.fixture(params=["old_not_found_returns_empty_payload", "new_returns_404"])
+def storage_v0_service_mock_get_file_meta_data_not_found(
+    request,
+    aioresponses_mocker: AioResponsesMock,
+) -> AioResponsesMock:
+    get_file_metadata_pattern = re.compile(
+        r"^http://[a-z\-_]*storage:[0-9]+/v0/locations/[0-9]+/files/.+/metadata.+$"
+    )
+    if request.param == "old_not_found_returns_empty_payload":
+        # NOTE: the old storage service did not consider using a 404 for when file is not found
+        aioresponses_mocker.get(
+            get_file_metadata_pattern,
+            status=web.HTTPOk.status_code,
+            payload={"error": "No result found", "data": {}},
+            repeat=True,
+        )
+    else:
+        # NOTE: the new storage service shall do it right one day and we shall be prepared
+        aioresponses_mocker.get(
+            get_file_metadata_pattern,
+            status=web.HTTPNotFound.status_code,
+            repeat=True,
+        )
+    return aioresponses_mocker
+
+
+async def test_get_file_metada_invalid_s3_path(
+    mock_environment: None,
+    storage_v0_service_mock_get_file_meta_data_not_found: AioResponsesMock,
+    user_id: UserID,
+    file_id: SimcoreS3FileID,
+    location_id: LocationID,
+):
+    async with aiohttp.ClientSession() as session:
+        with pytest.raises(exceptions.S3InvalidPathError):
+            await get_file_metadata(session, file_id, location_id, user_id)
+
+
 @pytest.mark.parametrize(
     "fct_call, additional_kwargs",
     [
diff --git a/scripts/maintenance/migrate_project/src/db.py b/scripts/maintenance/migrate_project/src/db.py
index cb5483de2bf..00bbd39339c 100644
--- a/scripts/maintenance/migrate_project/src/db.py
+++ b/scripts/maintenance/migrate_project/src/db.py
@@ -1,6 +1,6 @@
 from collections import deque
 from contextlib import contextmanager
-from typing import Any, Deque, Dict, Iterator, Optional, Tuple
+from typing import Any, Deque, Iterator, Optional
 from uuid import UUID
 
 import typer
@@ -31,9 +31,9 @@ def _project_uuid_exists_in_destination(
     return exists
 
 
-def _meta_data_exists_in_destination(connection: Connection, file_uuid: str) -> bool:
-    query = select([file_meta_data.c.file_uuid]).where(
-        file_meta_data.c.file_uuid == f"{file_uuid}"
+def _meta_data_exists_in_destination(connection: Connection, file_id: str) -> bool:
+    query = select([file_meta_data.c.file_id]).where(
+        file_meta_data.c.file_id == f"{file_id}"
     )
     exists = len(list(connection.execute(query))) > 0
     return exists
@@ -80,11 +80,11 @@ def _green_message(message: str) -> None:
     _format_message(message, typer.colors.GREEN)
 
 
-def _project_summary(project: Dict) -> str:
+def _project_summary(project: dict) -> str:
     return f"PROJECT: {project['uuid']} {project['name']}"
 
 
-def _file_summary(file_meta_data: Dict) -> str:
+def _file_summary(file_meta_data: dict) -> str:
     return f"FILE: {file_meta_data['file_uuid']}"
 
 
@@ -93,7 +93,7 @@ def get_project_and_files_to_migrate(
     hidden_projects_for_user: Optional[int],
     src_conn: Connection,
     dst_conn: Connection,
-) -> Tuple[Deque, Deque]:
+) -> tuple[Deque, Deque]:
     skipped_projects = deque()
     skipped_files_meta_data = deque()
 
@@ -170,9 +170,9 @@ def get_project_and_files_to_migrate(
     return projects_to_migrate, files_meta_data_to_migrate
 
 
-def insert_file_meta_data(connection: Connection, data: Dict[str, Any]) -> None:
+def insert_file_meta_data(connection: Connection, data: dict[str, Any]) -> None:
     connection.execute(insert(file_meta_data).values(**data))
 
 
-def insert_projects(connection: Connection, data: Dict[str, Any]) -> None:
+def insert_projects(connection: Connection, data: dict[str, Any]) -> None:
     connection.execute(insert(projects).values(**data))
diff --git 
a/services/api-server/src/simcore_service_api_server/api/routes/files.py b/services/api-server/src/simcore_service_api_server/api/routes/files.py index 607a480277a..34baa1dcb24 100644 --- a/services/api-server/src/simcore_service_api_server/api/routes/files.py +++ b/services/api-server/src/simcore_service_api_server/api/routes/files.py @@ -51,7 +51,6 @@ async def list_files( files_meta = deque() for stored_file_meta in stored_files: try: - assert stored_file_meta.user_id == user_id # nosec assert stored_file_meta.file_id # nosec file_meta: File = to_file_api_model(stored_file_meta) @@ -151,7 +150,6 @@ async def get_file( raise ValueError("Not found in storage") stored_file_meta = stored_files[0] - assert stored_file_meta.user_id == user_id # nosec assert stored_file_meta.file_id # nosec # Adapts storage API model to API model diff --git a/services/director-v2/tests/unit/with_dbs/test_utils_dask.py b/services/director-v2/tests/unit/with_dbs/test_utils_dask.py index 6d053eb5dad..0e6788919c0 100644 --- a/services/director-v2/tests/unit/with_dbs/test_utils_dask.py +++ b/services/director-v2/tests/unit/with_dbs/test_utils_dask.py @@ -8,7 +8,7 @@ from random import choice -from typing import Any, Callable, Dict +from typing import Any, Callable from unittest import mock import aiopg @@ -52,7 +52,7 @@ async def mocked_node_ports_filemanager_fcts( mocker: MockerFixture, faker: Faker, tasks_file_link_scheme: tuple, -) -> Dict[str, mock.MagicMock]: +) -> dict[str, mock.MagicMock]: return { "entry_exists": mocker.patch( "simcore_service_director_v2.utils.dask.port_utils.filemanager.entry_exists", @@ -127,7 +127,7 @@ def test_dask_job_id_serialization( @pytest.fixture() -def fake_io_config(faker: Faker) -> Dict[str, str]: +def fake_io_config(faker: Faker) -> dict[str, str]: return { f"pytest_io_key_{faker.pystr()}": choice( ["integer", "data:*/*", "boolean", "number", "string"] @@ -138,8 +138,8 @@ def fake_io_config(faker: Faker) -> Dict[str, str]: @pytest.fixture(params=[True, False]) def fake_io_schema( - fake_io_config: Dict[str, str], faker: Faker, request -) -> Dict[str, Dict[str, str]]: + fake_io_config: dict[str, str], faker: Faker, request +) -> dict[str, dict[str, str]]: fake_io_schema = {} for key, value_type in fake_io_config.items(): fake_io_schema[key] = { @@ -155,14 +155,16 @@ def fake_io_schema( @pytest.fixture() def fake_io_data( - fake_io_config: Dict[str, str], - create_file_uuid: Callable[[ProjectID, NodeID, str], SimcoreS3FileID], + fake_io_config: dict[str, str], + create_simcore_file_id: Callable[[ProjectID, NodeID, str], SimcoreS3FileID], faker: Faker, -) -> Dict[str, Any]: - def generate_simcore_file_link() -> Dict[str, Any]: +) -> dict[str, Any]: + def generate_simcore_file_link() -> dict[str, Any]: return SimCoreFileLink( store=0, - path=create_file_uuid(faker.uuid4(), faker.uuid4(), faker.file_name()), + path=create_simcore_file_id( + faker.uuid4(), faker.uuid4(), faker.file_name() + ), ).dict(by_alias=True, exclude_unset=True) TYPE_TO_FAKE_CALLABLE_MAP = { @@ -180,8 +182,8 @@ def generate_simcore_file_link() -> Dict[str, Any]: @pytest.fixture() def fake_task_output_data( - fake_io_schema: Dict[str, Dict[str, str]], - fake_io_data: Dict[str, Any], + fake_io_schema: dict[str, dict[str, str]], + fake_io_data: dict[str, Any], faker: Faker, ) -> TaskOutputData: converted_data = { @@ -204,7 +206,7 @@ async def test_parse_output_data( aiopg_engine: aiopg.sa.engine.Engine, # type: ignore published_project: PublishedProject, user_id: UserID, - fake_io_schema: Dict[str, Dict[str, 
str]], + fake_io_schema: dict[str, dict[str, str]], fake_task_output_data: TaskOutputData, mocker: MockerFixture, ): @@ -243,7 +245,7 @@ async def test_parse_output_data( def app_with_db( mock_env: None, monkeypatch: MonkeyPatch, - postgres_host_config: Dict[str, str], + postgres_host_config: dict[str, str], ): monkeypatch.setenv("DIRECTOR_V2_POSTGRES_ENABLED", "1") monkeypatch.setenv("R_CLONE_PROVIDER", "MINIO") @@ -260,9 +262,9 @@ async def test_compute_input_data( async_client: httpx.AsyncClient, user_id: UserID, published_project: PublishedProject, - fake_io_schema: Dict[str, Dict[str, str]], - fake_io_data: Dict[str, Any], - create_file_uuid: Callable[[ProjectID, NodeID, str], SimcoreS3FileID], + fake_io_schema: dict[str, dict[str, str]], + fake_io_data: dict[str, Any], + create_simcore_file_id: Callable[[ProjectID, NodeID, str], SimcoreS3FileID], faker: Faker, mocker: MockerFixture, tasks_file_link_type: FileLinkType, @@ -273,7 +275,7 @@ async def test_compute_input_data( fake_inputs = { key: SimCoreFileLink( store=0, - path=create_file_uuid( + path=create_simcore_file_id( published_project.project.uuid, sleeper_task.node_id, faker.file_name() ), ).dict(by_alias=True, exclude_unset=True) @@ -328,10 +330,10 @@ async def test_compute_output_data_schema( async_client: httpx.AsyncClient, user_id: UserID, published_project: PublishedProject, - fake_io_schema: Dict[str, Dict[str, str]], + fake_io_schema: dict[str, dict[str, str]], tasks_file_link_type: FileLinkType, tasks_file_link_scheme: tuple, - mocked_node_ports_filemanager_fcts: Dict[str, mock.MagicMock], + mocked_node_ports_filemanager_fcts: dict[str, mock.MagicMock], ): sleeper_task: CompTaskAtDB = published_project.tasks[1] # simulate pre-created file links @@ -369,10 +371,10 @@ async def test_clean_task_output_and_log_files_if_invalid( aiopg_engine: aiopg.sa.engine.Engine, # type: ignore user_id: UserID, published_project: PublishedProject, - mocked_node_ports_filemanager_fcts: Dict[str, mock.MagicMock], - create_file_uuid: Callable[[ProjectID, NodeID, str], SimcoreS3FileID], + mocked_node_ports_filemanager_fcts: dict[str, mock.MagicMock], + create_simcore_file_id: Callable[[ProjectID, NodeID, str], SimcoreS3FileID], entry_exists_returns: bool, - fake_io_schema: Dict[str, Dict[str, str]], + fake_io_schema: dict[str, dict[str, str]], faker: Faker, ): # since the presigned links for outputs and logs file are created @@ -389,7 +391,7 @@ async def test_clean_task_output_and_log_files_if_invalid( fake_outputs = { key: SimCoreFileLink( store=0, - path=create_file_uuid( + path=create_simcore_file_id( published_project.project.uuid, sleeper_task.node_id, faker.file_name() ), ).dict(by_alias=True, exclude_unset=True) @@ -433,7 +435,7 @@ async def test_clean_task_output_and_log_files_if_invalid( "req_example", NodeRequirements.Config.schema_extra["examples"] ) def test_node_requirements_correctly_convert_to_dask_resources( - req_example: Dict[str, Any] + req_example: dict[str, Any] ): node_reqs = NodeRequirements(**req_example) assert node_reqs diff --git a/services/storage/requirements/_base.in b/services/storage/requirements/_base.in index a05fe6a9d1a..74955dd1eae 100644 --- a/services/storage/requirements/_base.in +++ b/services/storage/requirements/_base.in @@ -13,19 +13,12 @@ --requirement ../../../packages/service-library/requirements/_base.in --requirement ../../../packages/service-library/requirements/_aiohttp.in -# server -aiohttp -aiohttp-swagger[performance] - -# s3 storage -aiobotocore -minio - -# i/o + db -aiofiles -aiopg[sa] - 
-# misc +aioboto3 # s3 storage +aiofiles # i/o +aiohttp # server +aiohttp-swagger[performance] # server +aiopg[sa] # db semantic_version tenacity typer +types-aiobotocore[s3]# s3 storage diff --git a/services/storage/requirements/_base.txt b/services/storage/requirements/_base.txt index 1cd5b4a0cbd..d070f33a8d8 100644 --- a/services/storage/requirements/_base.txt +++ b/services/storage/requirements/_base.txt @@ -4,8 +4,12 @@ # # pip-compile --output-file=requirements/_base.txt --strip-extras requirements/_base.in # -aiobotocore==2.3.3 +aioboto3==9.6.0 # via -r requirements/_base.in +aiobotocore==2.3.0 + # via + # aioboto3 + # types-aiobotocore aiodebug==2.3.0 # via # -c requirements/../../../packages/service-library/requirements/./_base.in @@ -54,10 +58,15 @@ attrs==20.3.0 # aiohttp # jsonschema # openapi-core -botocore==1.24.21 +boto3==1.21.21 # via aiobotocore -certifi==2022.5.18.1 - # via minio +botocore==1.24.21 + # via + # aiobotocore + # boto3 + # s3transfer +botocore-stubs==1.27.17 + # via types-aiobotocore charset-normalizer==2.0.12 # via aiohttp click==8.1.3 @@ -88,7 +97,9 @@ jinja2==3.1.2 # -c requirements/../../../requirements/constraints.txt # aiohttp-swagger jmespath==1.0.0 - # via botocore + # via + # boto3 + # botocore jsonschema==3.2.0 # via # -c requirements/../../../packages/service-library/requirements/././constraints.txt @@ -107,15 +118,6 @@ markupsafe==2.1.1 # via # jinja2 # mako -minio==7.0.4 - # via - # -c requirements/../../../packages/models-library/requirements/../../../requirements/constraints.txt - # -c requirements/../../../packages/postgres-database/requirements/../../../requirements/constraints.txt - # -c requirements/../../../packages/service-library/requirements/../../../requirements/constraints.txt - # -c requirements/../../../packages/service-library/requirements/./../../../requirements/constraints.txt - # -c requirements/../../../packages/settings-library/requirements/../../../requirements/constraints.txt - # -c requirements/../../../requirements/constraints.txt - # -r requirements/_base.in multidict==6.0.2 # via # aiohttp @@ -166,6 +168,8 @@ pyyaml==5.4.1 # -r requirements/../../../packages/service-library/requirements/_base.in # aiohttp-swagger # openapi-spec-validator +s3transfer==0.5.2 + # via boto3 semantic-version==2.9.0 # via -r requirements/_base.in six==1.16.0 @@ -196,11 +200,18 @@ typer==0.4.1 # via # -r requirements/../../../packages/settings-library/requirements/_base.in # -r requirements/_base.in +types-aiobotocore==2.3.3 + # via -r requirements/_base.in +types-aiobotocore-s3==2.3.3 + # via types-aiobotocore typing-extensions==4.2.0 # via # aiodebug # aioitertools + # botocore-stubs # pydantic + # types-aiobotocore + # types-aiobotocore-s3 ujson==5.3.0 # via aiohttp-swagger urllib3==1.26.9 @@ -212,7 +223,6 @@ urllib3==1.26.9 # -c requirements/../../../packages/settings-library/requirements/../../../requirements/constraints.txt # -c requirements/../../../requirements/constraints.txt # botocore - # minio werkzeug==2.0.3 # via -r requirements/../../../packages/service-library/requirements/_aiohttp.in wrapt==1.14.1 diff --git a/services/storage/requirements/_test.in b/services/storage/requirements/_test.in index fc903b60883..f73ae14d140 100644 --- a/services/storage/requirements/_test.in +++ b/services/storage/requirements/_test.in @@ -5,11 +5,14 @@ --constraint ../../../requirements/constraints.txt --constraint _base.txt + +aioresponses codecov coverage coveralls docker faker +moto[server] pandas pylint pytest diff --git 
a/services/storage/requirements/_test.txt b/services/storage/requirements/_test.txt index 985f315b1fa..69d2fee8f28 100644 --- a/services/storage/requirements/_test.txt +++ b/services/storage/requirements/_test.txt @@ -8,7 +8,10 @@ aiohttp==3.8.1 # via # -c requirements/../../../requirements/constraints.txt # -c requirements/_base.txt + # aioresponses # pytest-aiohttp +aioresponses==0.7.3 + # via -r requirements/_test.in aiosignal==1.2.0 # via # -c requirements/_base.txt @@ -23,16 +26,41 @@ attrs==20.3.0 # via # -c requirements/_base.txt # aiohttp + # jschema-to-python + # jsonschema # pytest -certifi==2022.5.18.1 + # sarif-om +aws-sam-translator==1.46.0 + # via cfn-lint +aws-xray-sdk==2.9.0 + # via moto +boto3==1.21.21 # via # -c requirements/_base.txt - # requests + # aws-sam-translator + # moto +botocore==1.24.21 + # via + # -c requirements/_base.txt + # aws-xray-sdk + # boto3 + # moto + # s3transfer +certifi==2022.5.18.1 + # via requests +cffi==1.15.0 + # via cryptography +cfn-lint==0.61.1 + # via moto charset-normalizer==2.0.12 # via # -c requirements/_base.txt # aiohttp # requests +click==8.1.3 + # via + # -c requirements/_base.txt + # flask codecov==2.1.12 # via -r requirements/_test.in coverage==6.3.2 @@ -43,49 +71,128 @@ coverage==6.3.2 # pytest-cov coveralls==3.3.1 # via -r requirements/_test.in +cryptography==37.0.2 + # via + # -c requirements/../../../requirements/constraints.txt + # moto + # python-jose + # sshpubkeys dill==0.3.4 # via pylint docker==5.0.3 - # via -r requirements/_test.in + # via + # -r requirements/_test.in + # moto docopt==0.6.2 # via coveralls +ecdsa==0.17.0 + # via + # moto + # python-jose + # sshpubkeys faker==13.7.0 # via -r requirements/_test.in +flask==2.1.2 + # via + # flask-cors + # moto +flask-cors==3.0.10 + # via moto frozenlist==1.3.0 # via # -c requirements/_base.txt # aiohttp # aiosignal +future==0.18.2 + # via aws-xray-sdk +graphql-core==3.2.1 + # via moto icdiff==2.0.5 # via pytest-icdiff idna==3.3 # via # -c requirements/_base.txt + # moto # requests # yarl +importlib-metadata==4.12.0 + # via flask iniconfig==1.1.1 # via pytest isort==5.10.1 # via pylint +itsdangerous==2.1.2 + # via flask +jinja2==3.1.2 + # via + # -c requirements/../../../requirements/constraints.txt + # -c requirements/_base.txt + # flask + # moto +jmespath==1.0.0 + # via + # -c requirements/_base.txt + # boto3 + # botocore +jschema-to-python==1.2.3 + # via cfn-lint +jsondiff==2.0.0 + # via moto +jsonpatch==1.32 + # via cfn-lint +jsonpickle==2.2.0 + # via jschema-to-python +jsonpointer==2.3 + # via jsonpatch +jsonschema==3.2.0 + # via + # -c requirements/_base.txt + # aws-sam-translator + # cfn-lint + # openapi-schema-validator + # openapi-spec-validator +junit-xml==1.9 + # via cfn-lint lazy-object-proxy==1.4.3 # via # -c requirements/_base.txt # astroid +markupsafe==2.1.1 + # via + # -c requirements/_base.txt + # jinja2 + # moto mccabe==0.7.0 # via pylint +moto==3.1.15 + # via -r requirements/_test.in multidict==6.0.2 # via # -c requirements/_base.txt # aiohttp # yarl +networkx==2.8.4 + # via cfn-lint numpy==1.22.3 # via pandas +openapi-schema-validator==0.2.3 + # via + # -c requirements/_base.txt + # openapi-spec-validator +openapi-spec-validator==0.4.0 + # via + # -c requirements/_base.txt + # moto packaging==21.3 # via # pytest # pytest-sugar pandas==1.4.2 # via -r requirements/_test.in +pbr==5.9.0 + # via + # jschema-to-python + # sarif-om platformdirs==2.5.2 # via pylint pluggy==1.0.0 @@ -94,10 +201,22 @@ pprintpp==0.4.0 # via pytest-icdiff py==1.11.0 # via pytest 
+pyasn1==0.4.8 + # via + # python-jose + # rsa +pycparser==2.21 + # via cffi pylint==2.13.8 # via -r requirements/_test.in pyparsing==3.0.9 - # via packaging + # via + # moto + # packaging +pyrsistent==0.18.1 + # via + # -c requirements/_base.txt + # jsonschema pytest==7.1.2 # via # -r requirements/_test.in @@ -127,21 +246,54 @@ pytest-sugar==0.9.4 python-dateutil==2.8.2 # via # -c requirements/_base.txt + # botocore # faker + # moto # pandas python-dotenv==0.20.0 # via -r requirements/_test.in +python-jose==3.3.0 + # via moto pytz==2022.1 - # via pandas + # via + # moto + # pandas +pyyaml==5.4.1 + # via + # -c requirements/../../../requirements/constraints.txt + # -c requirements/_base.txt + # cfn-lint + # moto + # openapi-spec-validator requests==2.27.1 # via # codecov # coveralls # docker + # moto + # responses +responses==0.21.0 + # via moto +rsa==4.8 + # via + # -c requirements/../../../requirements/constraints.txt + # python-jose +s3transfer==0.5.2 + # via + # -c requirements/_base.txt + # boto3 +sarif-om==1.0.4 + # via cfn-lint six==1.16.0 # via # -c requirements/_base.txt + # ecdsa + # flask-cors + # jsonschema + # junit-xml # python-dateutil +sshpubkeys==3.3.1 + # via moto termcolor==1.1.0 # via pytest-sugar tomli==2.0.1 @@ -158,17 +310,29 @@ urllib3==1.26.9 # via # -c requirements/../../../requirements/constraints.txt # -c requirements/_base.txt + # botocore # requests + # responses websocket-client==1.3.2 # via docker +werkzeug==2.0.3 + # via + # -c requirements/_base.txt + # flask + # moto wrapt==1.14.1 # via # -c requirements/_base.txt # astroid + # aws-xray-sdk +xmltodict==0.13.0 + # via moto yarl==1.7.2 # via # -c requirements/_base.txt # aiohttp +zipp==3.8.0 + # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: # setuptools diff --git a/services/storage/requirements/_tools.txt b/services/storage/requirements/_tools.txt index 5c6f2986b45..1344efcc5a8 100644 --- a/services/storage/requirements/_tools.txt +++ b/services/storage/requirements/_tools.txt @@ -13,6 +13,7 @@ cfgv==3.3.1 click==8.1.3 # via # -c requirements/_base.txt + # -c requirements/_test.txt # black # pip-tools distlib==0.3.4 @@ -46,6 +47,7 @@ pyyaml==5.4.1 # via # -c requirements/../../../requirements/constraints.txt # -c requirements/_base.txt + # -c requirements/_test.txt # pre-commit # watchdog six==1.16.0 diff --git a/services/storage/src/simcore_service_storage/api/v0/openapi.yaml b/services/storage/src/simcore_service_storage/api/v0/openapi.yaml index 30887ea0198..d023b2bfcb1 100644 --- a/services/storage/src/simcore_service_storage/api/v0/openapi.yaml +++ b/services/storage/src/simcore_service_storage/api/v0/openapi.yaml @@ -237,9 +237,10 @@ paths: $ref: '#/components/schemas/FileMetaEnvelope' default: $ref: '#/components/responses/DefaultErrorResponse' - patch: - summary: Update file metadata - operationId: update_file_meta_data + '/locations/{location_id}/files/{file_id}:abort': + post: + summary: Asks the server to abort the upload and revert to the last valid version if any + operationId: abort_upload_file parameters: - name: file_id in: path @@ -257,12 +258,8 @@ paths: schema: type: string responses: - '200': - description: Returns file metadata - content: - application/json: - schema: - $ref: '#/components/schemas/FileMetaEnvelope' + '204': + description: Abort OK default: $ref: '#/components/responses/DefaultErrorResponse' '/locations/{location_id}/files/{file_id}': @@ -338,6 +335,12 @@ paths: application/json: schema: $ref: 
'#/components/schemas/PresignedLinkEnveloped' + links: + AbortUpload: + operationId: abort_upload_file + parameters: + path.location_id: $request.path.location_id + path.file_id: $request.path.file_id default: $ref: '#/components/responses/DefaultErrorResponse' delete: diff --git a/services/storage/src/simcore_service_storage/application.py b/services/storage/src/simcore_service_storage/application.py index 896c78810ea..155db47c9fa 100644 --- a/services/storage/src/simcore_service_storage/application.py +++ b/services/storage/src/simcore_service_storage/application.py @@ -14,9 +14,11 @@ from ._meta import WELCOME_MSG, app_name, version from .db import setup_db from .dsm import setup_dsm +from .dsm_cleaner import setup_dsm_cleaner from .rest import setup_rest from .s3 import setup_s3 from .settings import Settings +from .utils_handlers import dsm_exception_handler log = logging.getLogger(__name__) @@ -41,10 +43,19 @@ def create(settings: Settings) -> web.Application: skip_routes=None, ) - setup_db(app) # -> postgres service - setup_s3(app) # -> minio service - setup_dsm(app) # core subsystem. Needs s3 and db setups done - setup_rest(app) # lastly, we expose API to the world + if settings.STORAGE_POSTGRES: + setup_db(app) # -> postgres service + if settings.STORAGE_S3: + setup_s3(app) # -> minio service + + setup_rest(app) + + if settings.STORAGE_POSTGRES and settings.STORAGE_S3: + setup_dsm(app) # core subsystem. Needs s3 and db setups done + if settings.STORAGE_CLEANER_INTERVAL_S: + setup_dsm_cleaner(app) + + app.middlewares.append(dsm_exception_handler) if settings.LOG_LEVEL == "DEBUG": setup_dev_error_logger(app) diff --git a/services/storage/src/simcore_service_storage/constants.py b/services/storage/src/simcore_service_storage/constants.py index 7cdcaa4463b..0350f6edb61 100644 --- a/services/storage/src/simcore_service_storage/constants.py +++ b/services/storage/src/simcore_service_storage/constants.py @@ -1,33 +1,26 @@ from servicelib.aiohttp import application_keys -from . 
import _meta - RETRY_WAIT_SECS = 2 -RETRY_COUNT = 20 -CONNECT_TIMEOUT_SECS = 30 - -## VERSION----------------------------- -service_version = _meta.version - -## CONFIGURATION FILES------------------ -DEFAULT_CONFIG = "docker-prod-config.yaml" - +MAX_CHUNK_SIZE = 1024 +MINUTE = 60 APP_CONFIG_KEY = application_keys.APP_CONFIG_KEY # app-storage-key for config object -RSC_CONFIG_DIR_KEY = "data" # resource folder -# DSM specific constants +# DSM locations SIMCORE_S3_ID = 0 SIMCORE_S3_STR = "simcore.s3" DATCORE_ID = 1 DATCORE_STR = "datcore" +# NOTE: SAFE S3 characters are found here [https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html] +# the % character was added since we need to url encode some of them +_SAFE_S3_FILE_NAME_RE = r"[\w!\-_\.\*\'\(\)\%]" +S3_FILE_ID_RE = rf"^({_SAFE_S3_FILE_NAME_RE}+?)\/({_SAFE_S3_FILE_NAME_RE}+?)\/({_SAFE_S3_FILE_NAME_RE}+?)$" -# REST API ---------------------------- -API_MAJOR_VERSION = service_version.major # NOTE: syncs with service key -API_VERSION_TAG = "v{:.0f}".format(API_MAJOR_VERSION) +S3_UNDEFINED_OR_EXTERNAL_MULTIPART_ID = "UNDEFINED/EXTERNALID" +# REST API ---------------------------- APP_OPENAPI_SPECS_KEY = ( application_keys.APP_OPENAPI_SPECS_KEY ) # app-storage-key for openapi specs object @@ -38,6 +31,5 @@ # DATA STORAGE MANAGER ---------------------------------- -APP_DSM_THREADPOOL = f"{__name__}.dsm_threadpool" APP_DSM_KEY = f"{__name__}.DSM" APP_S3_KEY = f"{__name__}.S3_CLIENT" diff --git a/services/storage/src/simcore_service_storage/datcore_adapter/datcore_adapter.py b/services/storage/src/simcore_service_storage/datcore_adapter/datcore_adapter.py index ee1c8e05b98..5eaac2d40c0 100644 --- a/services/storage/src/simcore_service_storage/datcore_adapter/datcore_adapter.py +++ b/services/storage/src/simcore_service_storage/datcore_adapter/datcore_adapter.py @@ -1,16 +1,18 @@ import asyncio import logging from math import ceil -from typing import Any, Callable, Optional, Type, Union, cast +from typing import Any, Callable, Optional, Union, cast import aiohttp from aiohttp import web +from models_library.api_schemas_storage import DatCoreDatasetName +from models_library.users import UserID +from pydantic import AnyUrl, parse_obj_as from servicelib.aiohttp.application_keys import APP_CONFIG_KEY from servicelib.aiohttp.client_session import ClientSession, get_client_session -from yarl import URL from ..constants import DATCORE_ID, DATCORE_STR -from ..models import DatasetMetaData, FileMetaData, FileMetaDataEx +from ..models import DatasetMetaData, FileMetaData from .datcore_adapter_exceptions import ( DatcoreAdapterClientError, DatcoreAdapterException, @@ -74,7 +76,7 @@ async def _retrieve_all_pages( api_secret: str, method: str, path: str, - return_type: Type, + return_type: type, return_type_creator: Callable, ): page = 1 @@ -126,23 +128,29 @@ async def check_user_can_connect( async def list_all_datasets_files_metadatas( - app: web.Application, api_key: str, api_secret: str -) -> list[FileMetaDataEx]: + app: web.Application, user_id: UserID, api_key: str, api_secret: str +) -> list[FileMetaData]: all_datasets: list[DatasetMetaData] = await list_datasets(app, api_key, api_secret) get_dataset_files_tasks = [ - list_all_files_metadatas_in_dataset(app, api_key, api_secret, d.dataset_id) + list_all_files_metadatas_in_dataset( + app, user_id, api_key, api_secret, cast(DatCoreDatasetName, d.dataset_id) + ) for d in all_datasets ] results = await asyncio.gather(*get_dataset_files_tasks) - all_files_of_all_datasets: 
list[FileMetaDataEx] = [] + all_files_of_all_datasets: list[FileMetaData] = [] for data in results: all_files_of_all_datasets += data return all_files_of_all_datasets async def list_all_files_metadatas_in_dataset( - app: web.Application, api_key: str, api_secret: str, dataset_id: str -) -> list[FileMetaDataEx]: + app: web.Application, + user_id: UserID, + api_key: str, + api_secret: str, + dataset_id: DatCoreDatasetName, +) -> list[FileMetaData]: all_files: list[dict[str, Any]] = cast( list[dict[str, Any]], await _request( @@ -154,20 +162,21 @@ async def list_all_files_metadatas_in_dataset( ), ) return [ - FileMetaDataEx( - fmd=FileMetaData( - file_uuid=d["path"], - location_id=DATCORE_ID, - location=DATCORE_STR, - bucket_name=d["dataset_id"], - object_name=d["path"], - file_name=d["name"], - file_id=d["package_id"], - file_size=d["size"], - created_at=d["created_at"], - last_modified=d["last_modified_at"], - display_file_path=d["name"], - ), # type: ignore + FileMetaData( + file_uuid=d["path"], + location_id=DATCORE_ID, + location=DATCORE_STR, + bucket_name=d["dataset_id"], + object_name=d["path"], + file_name=d["name"], + file_id=d["package_id"], + file_size=d["size"], + created_at=d["created_at"], + last_modified=d["last_modified_at"], + project_id=None, + node_id=None, + user_id=user_id, + is_soft_link=False, ) for d in all_files ] @@ -183,7 +192,7 @@ async def list_datasets( "GET", "/datasets", DatasetMetaData, - lambda d: DatasetMetaData(d["id"], d["display_name"]), + lambda d: DatasetMetaData(dataset_id=d["id"], display_name=d["display_name"]), ) return all_datasets @@ -191,15 +200,15 @@ async def list_datasets( async def get_file_download_presigned_link( app: web.Application, api_key: str, api_secret: str, file_id: str -) -> URL: +) -> AnyUrl: file_download_data = cast( dict[str, Any], await _request(app, api_key, api_secret, "GET", f"/files/{file_id}"), ) - return file_download_data["link"] + return parse_obj_as(AnyUrl, file_download_data["link"]) async def delete_file( app: web.Application, api_key: str, api_secret: str, file_id: str -): +) -> None: await _request(app, api_key, api_secret, "DELETE", f"/files/{file_id}") diff --git a/services/storage/src/simcore_service_storage/datcore_adapter/datcore_adapter_exceptions.py b/services/storage/src/simcore_service_storage/datcore_adapter/datcore_adapter_exceptions.py index 5551b2632f2..b997abc0b17 100644 --- a/services/storage/src/simcore_service_storage/datcore_adapter/datcore_adapter_exceptions.py +++ b/services/storage/src/simcore_service_storage/datcore_adapter/datcore_adapter_exceptions.py @@ -1,7 +1,10 @@ +from typing import Optional + + class DatcoreAdapterException(Exception): """basic exception for errors raised in datcore-adapter""" - def __init__(self, msg: str = None) -> None: + def __init__(self, msg: Optional[str] = None) -> None: super().__init__( msg or "Unexpected error occured in datcore-adapter subpackage" ) diff --git a/services/storage/src/simcore_service_storage/datcore_dsm.py b/services/storage/src/simcore_service_storage/datcore_dsm.py new file mode 100644 index 00000000000..3215cc8dcd6 --- /dev/null +++ b/services/storage/src/simcore_service_storage/datcore_dsm.py @@ -0,0 +1,84 @@ +from dataclasses import dataclass + +from aiohttp import web +from models_library.api_schemas_storage import LinkType +from models_library.projects_nodes_io import LocationID, LocationName, StorageFileID +from models_library.users import UserID +from pydantic import AnyUrl + +from .constants import DATCORE_ID, DATCORE_STR +from 
.datcore_adapter import datcore_adapter +from .db_tokens import get_api_token_and_secret +from .dsm_factory import BaseDataManager +from .models import DatasetMetaData, FileMetaData + + +@dataclass +class DatCoreDataManager(BaseDataManager): + app: web.Application + + async def _get_datcore_tokens(self, user_id: UserID): + return await get_api_token_and_secret(self.app, user_id) + + @classmethod + def get_location_id(cls) -> LocationID: + return DATCORE_ID + + @classmethod + def get_location_name(cls) -> LocationName: + return DATCORE_STR + + async def authorized(self, user_id: UserID) -> bool: + api_token, api_secret = await self._get_datcore_tokens(user_id) + if api_token and api_secret: + return await datcore_adapter.check_user_can_connect( + self.app, api_token, api_secret + ) + return False + + async def list_datasets(self, user_id: UserID) -> list[DatasetMetaData]: + api_token, api_secret = await self._get_datcore_tokens(user_id) + return await datcore_adapter.list_datasets(self.app, api_token, api_secret) + + async def list_files_in_dataset( + self, user_id: UserID, dataset_id: str + ) -> list[FileMetaData]: + api_token, api_secret = await self._get_datcore_tokens(user_id) + return await datcore_adapter.list_all_files_metadatas_in_dataset( + self.app, user_id, api_token, api_secret, dataset_id + ) + + async def list_files( + self, user_id: UserID, uuid_filter: str = "" + ) -> list[FileMetaData]: + api_token, api_secret = await self._get_datcore_tokens(user_id) + return await datcore_adapter.list_all_datasets_files_metadatas( + self.app, user_id, api_token, api_secret + ) + + async def get_file(self, user_id: UserID, file_id: StorageFileID) -> FileMetaData: + raise NotImplementedError + + async def create_file_upload_link( + self, user_id: UserID, file_id: StorageFileID, link_type: LinkType + ) -> AnyUrl: + raise NotImplementedError + + async def abort_file_upload(self, user_id: UserID, file_id: StorageFileID) -> None: + raise NotImplementedError + + async def create_file_download_link( + self, user_id: UserID, file_id: StorageFileID, link_type: LinkType + ) -> AnyUrl: + api_token, api_secret = await self._get_datcore_tokens(user_id) + return await datcore_adapter.get_file_download_presigned_link( + self.app, api_token, api_secret, file_id + ) + + async def delete_file(self, user_id: UserID, file_id: StorageFileID) -> None: + api_token, api_secret = await self._get_datcore_tokens(user_id) + await datcore_adapter.delete_file(self.app, api_token, api_secret, file_id) + + +def create_datcore_data_manager(app: web.Application) -> DatCoreDataManager: + return DatCoreDataManager(app) diff --git a/services/storage/src/simcore_service_storage/db.py b/services/storage/src/simcore_service_storage/db.py index f6bc1732754..e6d8d15601e 100644 --- a/services/storage/src/simcore_service_storage/db.py +++ b/services/storage/src/simcore_service_storage/db.py @@ -3,11 +3,7 @@ from aiohttp import web from aiopg.sa import Engine -from servicelib.aiohttp.aiopg_utils import ( - DataSourceName, - init_pg_tables, - is_pg_responsive, -) +from servicelib.aiohttp.aiopg_utils import DataSourceName, is_pg_responsive from servicelib.common_aiopg_utils import create_pg_engine from servicelib.retry_policies import PostgresRetryPolicyUponInitialization from simcore_postgres_database.utils_aiopg import ( @@ -18,7 +14,6 @@ from tenacity import retry from .constants import APP_CONFIG_KEY, APP_DB_ENGINE_KEY -from .models import metadata from .settings import PostgresSettings log = logging.getLogger(__name__) @@ -58,10 
+53,6 @@ async def postgres_cleanup_ctx(app: web.Application): dsn, min_size=pg_cfg.POSTGRES_MINSIZE, max_size=pg_cfg.POSTGRES_MAXSIZE ) - if app[APP_CONFIG_KEY].STORAGE_TESTING: - log.info("Initializing tables for %s", dsn) - init_pg_tables(dsn, schema=metadata) - assert engine # nosec app[APP_DB_ENGINE_KEY] = engine @@ -95,11 +86,6 @@ def get_engine_state(app: web.Application) -> dict[str, Any]: def setup_db(app: web.Application): - if "postgres" in app[APP_CONFIG_KEY].STORAGE_DISABLE_SERVICES: - app[APP_DB_ENGINE_KEY] = None - log.warning("Service '%s' explicitly disabled in config", "postgres") - return - app[APP_DB_ENGINE_KEY] = None # app is created at this point but not yet started diff --git a/services/storage/src/simcore_service_storage/access_layer.py b/services/storage/src/simcore_service_storage/db_access_layer.py similarity index 88% rename from services/storage/src/simcore_service_storage/access_layer.py rename to services/storage/src/simcore_service_storage/db_access_layer.py index cd2dabe2296..261914edb48 100644 --- a/services/storage/src/simcore_service_storage/access_layer.py +++ b/services/storage/src/simcore_service_storage/db_access_layer.py @@ -40,20 +40,19 @@ import logging from dataclasses import dataclass from typing import Optional -from uuid import UUID import sqlalchemy as sa from aiopg.sa.connection import SAConnection from aiopg.sa.result import ResultProxy, RowProxy +from models_library.projects import ProjectID +from models_library.projects_nodes_io import StorageFileID +from models_library.users import GroupID, UserID from simcore_postgres_database.storage_models import file_meta_data, user_to_groups from sqlalchemy.sql import text logger = logging.getLogger(__name__) -ProjectID = str - - @dataclass class AccessRights: read: bool @@ -86,10 +85,10 @@ def __init__(self, identifier, reason=None, details=None): super().__init__(self.reason, self.details) def __str__(self): - return "Error in {}: {} [{}]".format(self.identifier, self.reason, self.details) + return f"Error in {self.identifier}: {self.reason} [{self.details}]" -async def _get_user_groups_ids(conn: SAConnection, user_id: int) -> list[int]: +async def _get_user_groups_ids(conn: SAConnection, user_id: UserID) -> list[GroupID]: stmt = sa.select([user_to_groups.c.gid]).where(user_to_groups.c.uid == user_id) rows = await (await conn.execute(stmt)).fetchall() user_group_ids = [g.gid for g in rows] @@ -97,7 +96,7 @@ async def _get_user_groups_ids(conn: SAConnection, user_id: int) -> list[int]: def _aggregate_access_rights( - access_rights: dict[str, dict], group_ids: list[int] + access_rights: dict[str, dict], group_ids: list[GroupID] ) -> AccessRights: try: prj_access = {"read": False, "write": False, "delete": False} @@ -117,7 +116,7 @@ def _aggregate_access_rights( async def list_projects_access_rights( - conn: SAConnection, user_id: int + conn: SAConnection, user_id: UserID ) -> dict[ProjectID, AccessRights]: """ Returns access-rights of user (user_id) over all OWNED or SHARED projects @@ -141,8 +140,8 @@ async def list_projects_access_rights( projects_access_rights = {} async for row in conn.execute(smt): - assert isinstance(row.access_rights, dict) - assert isinstance(row.uuid, ProjectID) + assert isinstance(row.access_rights, dict) # nosec + assert isinstance(row.uuid, str) # nosec if row.access_rights: # TODO: access_rights should be direclty filtered from result in stm instead calling again user_group_ids @@ -159,7 +158,7 @@ async def list_projects_access_rights( async def 
get_project_access_rights( - conn: SAConnection, user_id: int, project_id: ProjectID + conn: SAConnection, user_id: UserID, project_id: ProjectID ) -> AccessRights: """ Returns access-rights of user (user_id) over a project resource (project_id) @@ -189,8 +188,8 @@ async def get_project_access_rights( # Either project does not exists OR user_id has NO access return AccessRights.none() - assert row.prj_owner is None or isinstance(row.prj_owner, int) - assert isinstance(row.access_rights, dict) + assert row.prj_owner is None or isinstance(row.prj_owner, int) # nosec + assert isinstance(row.access_rights, dict) # nosec if row.prj_owner == user_id: return AccessRights.all() @@ -201,10 +200,10 @@ async def get_project_access_rights( async def get_file_access_rights( - conn: SAConnection, user_id: int, file_uuid: str + conn: SAConnection, user_id: UserID, file_id: StorageFileID ) -> AccessRights: """ - Returns access-rights of user (user_id) over data file resource (file_uuid) + Returns access-rights of user (user_id) over data file resource (file_id) raises InvalidFileIdentifier """ @@ -213,7 +212,7 @@ async def get_file_access_rights( # 1. file registered in file_meta_data table # stmt = sa.select([file_meta_data.c.project_id, file_meta_data.c.user_id]).where( - file_meta_data.c.file_uuid == file_uuid + file_meta_data.c.file_id == f"{file_id}" ) result: ResultProxy = await conn.execute(stmt) row: Optional[RowProxy] = await result.first() @@ -235,7 +234,7 @@ async def get_file_access_rights( logger.warning( "File %s references a project %s that does not exists in db." "TIP: Audit sync between files_meta_data and projects tables", - file_uuid, + file_id, row.project_id, ) return AccessRights.none() @@ -243,13 +242,13 @@ async def get_file_access_rights( else: # # 2. file is NOT registered in meta-data table e.g. 
it is about to be uploaded or it was deleted - # We rely on the assumption that file_uuid is formatted either as + # We rely on the assumption that file_id is formatted either as # # - project's data: {project_id}/{node_id}/{filename} # - API data: api/{file_id}/{filename} # try: - parent, _, _ = file_uuid.split("/") + parent, _, _ = file_id.split("/") if parent == "api": # FIXME: this is wrong, all api data must be registered and OWNED @@ -257,21 +256,20 @@ async def get_file_access_rights( return AccessRights.all() # otherwise assert 'parent' string corresponds to a valid UUID - UUID(parent) # raises ValueError access_rights = await get_project_access_rights( - conn, user_id, project_id=parent + conn, user_id, project_id=ProjectID(parent) ) if not access_rights: logger.warning( "File %s references a project %s that does not exists in db", - file_uuid, + file_id, row.project_id, ) return AccessRights.none() except (ValueError, AttributeError) as err: raise InvalidFileIdentifier( - identifier=file_uuid, + identifier=file_id, details=str(err), ) from err @@ -281,7 +279,9 @@ async def get_file_access_rights( # HELPERS ----------------------------------------------- -async def get_readable_project_ids(conn: SAConnection, user_id: int) -> list[ProjectID]: +async def get_readable_project_ids( + conn: SAConnection, user_id: UserID +) -> list[ProjectID]: """Returns a list of projects where user has granted read-access""" - projects_access_rights = await list_projects_access_rights(conn, int(user_id)) + projects_access_rights = await list_projects_access_rights(conn, user_id) return [pid for pid, access in projects_access_rights.items() if access.read] diff --git a/services/storage/src/simcore_service_storage/db_file_meta_data.py b/services/storage/src/simcore_service_storage/db_file_meta_data.py new file mode 100644 index 00000000000..3353ff6d9dd --- /dev/null +++ b/services/storage/src/simcore_service_storage/db_file_meta_data.py @@ -0,0 +1,150 @@ +import datetime +from typing import AsyncGenerator, Optional + +import sqlalchemy as sa +from aiopg.sa.connection import SAConnection +from models_library.projects import ProjectID +from models_library.projects_nodes import NodeID +from models_library.projects_nodes_io import SimcoreS3FileID +from models_library.users import UserID +from models_library.utils.fastapi_encoders import jsonable_encoder +from simcore_postgres_database.storage_models import file_meta_data +from sqlalchemy import and_, literal_column +from sqlalchemy.dialects.postgresql import insert as pg_insert + +from .exceptions import FileMetaDataNotFoundError +from .models import FileMetaData, FileMetaDataAtDB + + +async def exists(conn: SAConnection, file_id: SimcoreS3FileID) -> bool: + return ( + await conn.scalar( + sa.select([sa.func.count()]) + .select_from(file_meta_data) + .where(file_meta_data.c.file_id == file_id) + ) + == 1 + ) + + +async def upsert(conn: SAConnection, fmd: FileMetaData) -> FileMetaDataAtDB: + # NOTE: upsert file_meta_data, if the file already exists, we update the whole row + # so we get the correct time stamps + fmd_db = FileMetaDataAtDB.from_orm(fmd) + insert_statement = pg_insert(file_meta_data).values(**jsonable_encoder(fmd_db)) + on_update_statement = insert_statement.on_conflict_do_update( + index_elements=[file_meta_data.c.file_id], set_=jsonable_encoder(fmd_db) + ).returning(literal_column("*")) + result = await conn.execute(on_update_statement) + row = await result.first() + assert row # nosec + return FileMetaDataAtDB.from_orm(row) + + +async def 
insert(conn: SAConnection, fmd: FileMetaData) -> FileMetaDataAtDB: + fmd_db = FileMetaDataAtDB.from_orm(fmd) + result = await conn.execute( + file_meta_data.insert() + .values(jsonable_encoder(fmd_db)) + .returning(literal_column("*")) + ) + row = await result.first() + assert row # nosec + return FileMetaDataAtDB.from_orm(row) + + +async def get(conn: SAConnection, file_id: SimcoreS3FileID) -> FileMetaDataAtDB: + result = await conn.execute( + query=sa.select([file_meta_data]).where(file_meta_data.c.file_id == file_id) + ) + if row := await result.first(): + return FileMetaDataAtDB.from_orm(row) + raise FileMetaDataNotFoundError(file_id=file_id) + + +async def list_filter_with_partial_file_id( + conn: SAConnection, + *, + user_id: UserID, + project_ids: list[ProjectID], + file_id_prefix: Optional[str], + partial_file_id: Optional[str], +) -> list[FileMetaDataAtDB]: + stmt = sa.select([file_meta_data]).where( + ( + (file_meta_data.c.user_id == f"{user_id}") + | file_meta_data.c.project_id.in_(f"{pid}" for pid in project_ids) + ) + & ( + file_meta_data.c.file_id.startswith(file_id_prefix) + if file_id_prefix + else True + ) + & ( + file_meta_data.c.file_id.ilike(f"%{partial_file_id}%") + if partial_file_id + else True + ) + ) + return [FileMetaDataAtDB.from_orm(row) async for row in await conn.execute(stmt)] + + +async def list_fmds( + conn: SAConnection, + *, + user_id: Optional[UserID] = None, + project_ids: Optional[list[ProjectID]] = None, + file_ids: Optional[list[SimcoreS3FileID]] = None, + expired_after: Optional[datetime.datetime] = None, +) -> list[FileMetaDataAtDB]: + + stmt = sa.select([file_meta_data]).where( + and_( + (file_meta_data.c.user_id == f"{user_id}") if user_id else True, + (file_meta_data.c.project_id.in_([f"{p}" for p in project_ids])) + if project_ids + else True, + (file_meta_data.c.file_id.in_(file_ids)) if file_ids else True, + (file_meta_data.c.upload_expires_at < expired_after) + if expired_after + else True, + ) + ) + + return [FileMetaDataAtDB.from_orm(row) async for row in await conn.execute(stmt)] + + +async def total(conn: SAConnection) -> int: + """returns the number of uploaded file entries""" + return ( + await conn.scalar(sa.select([sa.func.count()]).select_from(file_meta_data)) or 0 + ) + + +async def list_valid_uploads( + conn: SAConnection, +) -> AsyncGenerator[FileMetaDataAtDB, None]: + """returns all the theoretically valid fmds (e.g. 
upload_expires_at column is null)""" + async for row in conn.execute( + sa.select([file_meta_data]).where(file_meta_data.c.upload_expires_at == None) + ): + fmd_at_db = FileMetaDataAtDB.from_orm(row) + yield fmd_at_db + + +async def delete(conn: SAConnection, file_ids: list[SimcoreS3FileID]) -> None: + await conn.execute( + file_meta_data.delete().where(file_meta_data.c.file_id.in_(file_ids)) + ) + + +async def delete_all_from_project(conn: SAConnection, project_id: ProjectID) -> None: + await conn.execute( + file_meta_data.delete().where(file_meta_data.c.project_id == f"{project_id}") + ) + + +async def delete_all_from_node(conn: SAConnection, node_id: NodeID) -> None: + await conn.execute( + file_meta_data.delete().where(file_meta_data.c.node_id == f"{node_id}") + ) diff --git a/services/storage/src/simcore_service_storage/db_projects.py b/services/storage/src/simcore_service_storage/db_projects.py new file mode 100644 index 00000000000..bb02061b42b --- /dev/null +++ b/services/storage/src/simcore_service_storage/db_projects.py @@ -0,0 +1,28 @@ +from typing import AsyncGenerator + +import sqlalchemy as sa +from aiopg.sa.connection import SAConnection +from models_library.projects import ProjectAtDB, ProjectID +from simcore_postgres_database.storage_models import projects + + +async def list_projects( + conn: SAConnection, project_uuids: list[ProjectID] +) -> AsyncGenerator[ProjectAtDB, None]: + async for row in conn.execute( + sa.select([projects]).where( + projects.c.uuid.in_(f"{pid}" for pid in project_uuids) + ) + ): + yield ProjectAtDB.from_orm(row) + + +async def project_exists(conn: SAConnection, project_uuid: ProjectID) -> bool: + return ( + await conn.scalar( + sa.select([sa.func.count()]) + .select_from(projects) + .where(projects.c.uuid == f"{project_uuid}") + ) + == 1 + ) diff --git a/services/storage/src/simcore_service_storage/db_tokens.py b/services/storage/src/simcore_service_storage/db_tokens.py index 3c003da8976..09daa80ca49 100644 --- a/services/storage/src/simcore_service_storage/db_tokens.py +++ b/services/storage/src/simcore_service_storage/db_tokens.py @@ -1,26 +1,25 @@ import logging +from typing import Any import sqlalchemy as sa from aiohttp import web -from psycopg2 import Error as DbApiError -from servicelib.aiohttp.aiopg_utils import PostgresRetryPolicyUponOperation -from tenacity import retry +from aiopg.sa.engine import Engine +from models_library.users import UserID +from simcore_postgres_database.storage_models import tokens from .constants import APP_CONFIG_KEY, APP_DB_ENGINE_KEY -from .models import tokens log = logging.getLogger(__name__) -@retry(**PostgresRetryPolicyUponOperation(log).kwargs) -async def _get_tokens_from_db(engine: sa.engine.Engine, userid: int): +async def _get_tokens_from_db(engine: Engine, user_id: UserID) -> dict[str, Any]: async with engine.acquire() as conn: result = await conn.execute( sa.select( [ tokens, ] - ).where(tokens.c.user_id == userid) + ).where(tokens.c.user_id == user_id) ) row = await result.first() data = dict(row) if row else {} @@ -28,30 +27,20 @@ async def _get_tokens_from_db(engine: sa.engine.Engine, userid: int): async def get_api_token_and_secret( - app: web.Application, userid: int + app: web.Application, user_id: UserID ) -> tuple[str, str]: # FIXME: this is a temporary solution. This information should be sent in some form # from the client side together with the userid? 
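
(Aside on the new db_projects helpers introduced just above: a minimal guard a caller might place before touching a project's files, combining project_exists with the ProjectNotFoundError added elsewhere in this PR. The ensure_project_exists wrapper itself is illustrative, not part of the change.)

# Illustrative wrapper, not part of the diff; assumes an aiopg Engine is at hand.
from aiopg.sa import Engine
from models_library.projects import ProjectID

from simcore_service_storage import db_projects
from simcore_service_storage.exceptions import ProjectNotFoundError


async def ensure_project_exists(engine: Engine, project_uuid: ProjectID) -> None:
    async with engine.acquire() as conn:
        if not await db_projects.project_exists(conn, project_uuid):
            # msg_template "Project {project_id} was not found" is formatted with this kwarg
            raise ProjectNotFoundError(project_id=project_uuid)
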
- engine = app.get(APP_DB_ENGINE_KEY, None) + engine = app[APP_DB_ENGINE_KEY] # defaults from config if any, othewise None api_token = app[APP_CONFIG_KEY].BF_API_KEY api_secret = app[APP_CONFIG_KEY].BF_API_SECRET - if engine: - try: - data = await _get_tokens_from_db(engine, userid) - except DbApiError: - # NOTE this shall not log as error since is a possible outcome with an alternative - log.warning( - "Cannot retrieve tokens for user %s in pgdb %s", - userid, - engine, - exc_info=True, - ) - else: - data = data.get("token_data", {}) - api_token = data.get("token_key", api_token) - api_secret = data.get("token_secret", api_secret) + data = await _get_tokens_from_db(engine, user_id) + + data = data.get("token_data", {}) + api_token = data.get("token_key", api_token) + api_secret = data.get("token_secret", api_secret) return api_token, api_secret diff --git a/services/storage/src/simcore_service_storage/dsm.py b/services/storage/src/simcore_service_storage/dsm.py index 14594e66f26..4d69f32bd99 100644 --- a/services/storage/src/simcore_service_storage/dsm.py +++ b/services/storage/src/simcore_service_storage/dsm.py @@ -1,1183 +1,39 @@ -# pylint: disable=no-value-for-parameter -# FIXME: E1120:No value for argument 'dml' in method call -# pylint: disable=protected-access -# FIXME: Access to a protected member _result_proxy of a client class - -import asyncio import logging -import os -import re -import tempfile -import urllib.parse -from collections import deque -from concurrent.futures import ThreadPoolExecutor -from dataclasses import dataclass, field -from pathlib import Path -from typing import Any, Final, Optional, TypedDict -import attr -import botocore -import botocore.exceptions -import sqlalchemy as sa -from aiobotocore.client import AioBaseClient -from aiobotocore.session import AioSession, ClientCreatorContext, get_session from aiohttp import web -from aiopg.sa import Engine -from aiopg.sa.result import ResultProxy, RowProxy -from models_library.projects_nodes_io import Location, LocationID, LocationName -from pydantic import AnyUrl, parse_obj_as -from servicelib.aiohttp.aiopg_utils import DBAPIError, PostgresRetryPolicyUponOperation -from servicelib.aiohttp.application_keys import APP_FIRE_AND_FORGET_TASKS_KEY -from servicelib.aiohttp.client_session import get_client_session -from servicelib.utils import fire_and_forget_task -from sqlalchemy.dialects.postgresql import insert as pg_insert -from sqlalchemy.sql.expression import literal_column -from tenacity import retry -from tenacity._asyncio import AsyncRetrying -from tenacity.before_sleep import before_sleep_log -from tenacity.retry import retry_if_exception_type -from tenacity.stop import stop_after_delay -from tenacity.wait import wait_exponential -from yarl import URL - -from .access_layer import ( - AccessRights, - get_file_access_rights, - get_project_access_rights, - get_readable_project_ids, -) -from .constants import ( - APP_CONFIG_KEY, - APP_DB_ENGINE_KEY, - APP_DSM_KEY, - APP_S3_KEY, - DATCORE_ID, - DATCORE_STR, - SIMCORE_S3_ID, - SIMCORE_S3_STR, -) -from .datcore_adapter import datcore_adapter -from .models import ( - DatasetMetaData, - FileMetaData, - FileMetaDataEx, - file_meta_data, - projects, -) -from .s3wrapper.s3_client import MinioClientWrapper -from .settings import Settings -from .utils import download_to_file_or_raise, is_file_entry_valid, to_meta_data_extended - -_MINUTE: Final[int] = 60 -_HOUR: Final[int] = 60 * _MINUTE +from .constants import APP_DSM_KEY +from .datcore_dsm import DatCoreDataManager, 
create_datcore_data_manager +from .dsm_factory import DataManagerProvider +from .simcore_s3_dsm import SimcoreS3DataManager, create_simcore_s3_data_manager logger = logging.getLogger(__name__) -postgres_service_retry_policy_kwargs = PostgresRetryPolicyUponOperation(logger).kwargs - def setup_dsm(app: web.Application): async def _cleanup_context(app: web.Application): - cfg: Settings = app[APP_CONFIG_KEY] - - with ThreadPoolExecutor(max_workers=cfg.STORAGE_MAX_WORKERS) as executor: - dsm = DataStorageManager( - s3_client=app.get(APP_S3_KEY), - engine=app.get(APP_DB_ENGINE_KEY), - loop=asyncio.get_event_loop(), - pool=executor, - simcore_bucket_name=cfg.STORAGE_S3.S3_BUCKET_NAME, - has_project_db=not cfg.STORAGE_TESTING, - app=app, - ) # type: ignore - - app[APP_DSM_KEY] = dsm - - yield - - assert app[APP_DSM_KEY].pool is executor # nosec - - logger.info("Shuting down %s", dsm.pool) - - # ------ - - app.cleanup_ctx.append(_cleanup_context) - - -@dataclass -class DatCoreApiToken: - api_token: Optional[str] = None - api_secret: Optional[str] = None - - def to_tuple(self): - return (self.api_token, self.api_secret) - - -class LocationDict(TypedDict): - name: LocationName - id: LocationID - - -@dataclass -class DataStorageManager: # pylint: disable=too-many-public-methods - """Data storage manager - - The dsm has access to the database for all meta data and to the actual backend. For now this - is simcore's S3 [minio] and the datcore storage facilities. - - For all data that is in-house (simcore.s3, ...) we keep a synchronized database with meta information - for the physical files. - - For physical changes on S3, that might be time-consuming, the db keeps a state (delete and upload mostly) - - The dsm provides the following additional functionalities: - - - listing of folders for a given users, optionally filtered using a regular expression and optionally - sorted by one of the meta data keys - - - upload/download of files - - client -> S3 : presigned upload link - S3 -> client : presigned download link - datcore -> client: presigned download link - S3 -> datcore: local copy and then upload via their api - - minio/S3 and postgres can talk nicely with each other via Notifications using rabbigMQ which we already have. - See: - - https://blog.minio.io/part-5-5-publish-minio-events-via-postgresql-50f6cc7a7346 - https://docs.minio.io/docs/minio-bucket-notification-guide.html - """ - - # TODO: perhaps can be used a cache? add a lifetime? 
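
(Aside: the monolithic DataStorageManager removed in this hunk is replaced by per-location managers registered on a DataManagerProvider, as the new imports above suggest. Below is a sketch of how a request handler can resolve the right manager once setup_dsm has run; the handler, its route, and the response envelope are hypothetical, only get_dsm_provider and the provider/BaseDataManager API come from this PR.)

# Hypothetical handler; only get_dsm_provider() and the manager API are from this PR.
from aiohttp import web
from models_library.utils.fastapi_encoders import jsonable_encoder

from simcore_service_storage.dsm import get_dsm_provider


async def list_datasets_handler(request: web.Request) -> web.Response:
    location_id = int(request.match_info["location_id"])
    user_id = int(request.query["user_id"])
    # picks SimcoreS3DataManager or DatCoreDataManager, whichever registered this location id
    dsm = get_dsm_provider(request.app).get(location_id)
    datasets = await dsm.list_datasets(user_id)
    return web.json_response({"data": jsonable_encoder(datasets), "error": None})
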
- - s3_client: MinioClientWrapper - engine: Engine - loop: object - pool: ThreadPoolExecutor - simcore_bucket_name: str - has_project_db: bool - session: AioSession = field(default_factory=get_session) - datcore_tokens: dict[str, DatCoreApiToken] = field(default_factory=dict) - app: Optional[web.Application] = None - - def _create_aiobotocore_client_context(self) -> ClientCreatorContext: - assert hasattr(self.session, "create_client") # nosec - # pylint: disable=no-member - - # SEE API in https://botocore.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html - # SEE https://aiobotocore.readthedocs.io/en/latest/index.html - return self.session.create_client( - "s3", - endpoint_url=self.s3_client.endpoint_url, - aws_access_key_id=self.s3_client.access_key, - aws_secret_access_key=self.s3_client.secret_key, - ) - - def _get_datcore_tokens(self, user_id: str) -> tuple[Optional[str], Optional[str]]: - # pylint: disable=no-member - token = self.datcore_tokens.get(user_id, DatCoreApiToken()) - return token.to_tuple() - - async def locations(self, user_id: str) -> list[LocationDict]: - locs = [] - simcore_s3 = {"name": SIMCORE_S3_STR, "id": SIMCORE_S3_ID} - locs.append(simcore_s3) - - api_token, api_secret = self._get_datcore_tokens(user_id) - - if api_token and api_secret and self.app: - if await datcore_adapter.check_user_can_connect( - self.app, api_token, api_secret - ): - datcore = {"name": DATCORE_STR, "id": DATCORE_ID} - locs.append(datcore) - - return locs - - @classmethod - def location_from_id(cls, location_id: int) -> LocationName: - location = parse_obj_as(Location, location_id) - return location.name - - # LIST/GET --------------------------- - - # pylint: disable=too-many-arguments - # pylint: disable=too-many-branches - # pylint: disable=too-many-statements - async def list_files( - self, - user_id: str, - location: LocationName, - uuid_filter: str = "", - regex: str = "", - ) -> list[FileMetaDataEx]: - """Returns a list of file paths - - - Works for simcore.s3 and datcore - - Can filter on uuid: useful to filter on project_id/node_id - - Can filter upon regular expression (for now only on key: value pairs of the FileMetaData) - """ - data = deque() - if location == SIMCORE_S3_STR: - accesible_projects_ids = [] - async with self.engine.acquire() as conn, conn.begin(): - accesible_projects_ids = await get_readable_project_ids( - conn, int(user_id) - ) - where_statement = ( - file_meta_data.c.user_id == user_id - ) | file_meta_data.c.project_id.in_(accesible_projects_ids) - if uuid_filter: - where_statement &= file_meta_data.c.file_uuid.ilike( - f"%{uuid_filter}%" - ) - query = sa.select([file_meta_data]).where(where_statement) - - async for row in conn.execute(query): - dex = to_meta_data_extended(row) - if not is_file_entry_valid(dex.fmd): - # NOTE: the file is not updated with the information from S3 backend. - # 1. Either the file exists, but was never updated in the database - # 2. 
Or the file does not exist or was never completed, and the file_meta_data entry is old and faulty - # we need to update from S3 here since the database is not up-to-date - dex = await self.try_update_database_from_storage( - dex.fmd.file_uuid, - dex.fmd.bucket_name, - dex.fmd.object_name, - reraise_exceptions=False, - ) - if dex: - data.append(dex) - - if self.has_project_db: - uuid_name_dict = {} - # now parse the project to search for node/project names - try: - async with self.engine.acquire() as conn, conn.begin(): - query = sa.select([projects]).where( - projects.c.uuid.in_(accesible_projects_ids) - ) - - async for row in conn.execute(query): - proj_data = dict(row.items()) - - uuid_name_dict[proj_data["uuid"]] = proj_data["name"] - wb = proj_data["workbench"] - for node in wb.keys(): - uuid_name_dict[node] = wb[node]["label"] - except DBAPIError as _err: - logger.exception("Error querying database for project names") - - if not uuid_name_dict: - # there seems to be no project whatsoever for user_id - return [] - - # only keep files from non-deleted project - clean_data = deque() - for dx in data: - d = dx.fmd - if d.project_id not in uuid_name_dict: - continue - # - # FIXME: artifically fills ['project_name', 'node_name', 'file_id', 'raw_file_path', 'display_file_path'] - # with information from the projects table! - - d.project_name = uuid_name_dict[d.project_id] - if d.node_id in uuid_name_dict: - d.node_name = uuid_name_dict[d.node_id] - - d.raw_file_path = str( - Path(d.project_id) / Path(d.node_id) / Path(d.file_name) - ) - d.display_file_path = d.raw_file_path - d.file_id = d.file_uuid - if d.node_name and d.project_name: - d.display_file_path = str( - Path(d.project_name) / Path(d.node_name) / Path(d.file_name) - ) - # once the data was sync to postgres metadata table at this point - clean_data.append(dx) - - data = clean_data - - elif location == DATCORE_STR: - api_token, api_secret = self._get_datcore_tokens(user_id) - assert self.app # nosec - assert api_secret # nosec - assert api_token # nosec - return await datcore_adapter.list_all_datasets_files_metadatas( - self.app, api_token, api_secret - ) - - if uuid_filter: - # TODO: incorporate this in db query! 
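
(Aside: the TODO above about moving this filter into the query is effectively what the new db_file_meta_data.list_filter_with_partial_file_id helper does, using an ilike clause instead of a post-hoc re.compile() pass over every fetched row. A minimal sketch of how a caller could combine it with the access layer; the list_user_files wrapper and its call site are illustrative, not part of this diff.)

# Illustrative only: the wrapper is not part of the PR; it assumes an aiopg Engine
# and reuses the helpers introduced in this change set.
from aiopg.sa import Engine
from models_library.users import UserID

from simcore_service_storage import db_access_layer, db_file_meta_data


async def list_user_files(engine: Engine, user_id: UserID, uuid_filter: str = ""):
    async with engine.acquire() as conn:
        readable_projects = await db_access_layer.get_readable_project_ids(conn, user_id)
        # filtering now happens inside the SELECT (ilike), no client-side regex pass
        return await db_file_meta_data.list_filter_with_partial_file_id(
            conn,
            user_id=user_id,
            project_ids=readable_projects,
            file_id_prefix=None,
            partial_file_id=uuid_filter or None,
        )
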
- _query = re.compile(uuid_filter, re.IGNORECASE) - filtered_data = deque() - for dx in data: - d = dx.fmd - if _query.search(d.file_uuid): - filtered_data.append(dx) - - return list(filtered_data) - - if regex: - _query = re.compile(regex, re.IGNORECASE) - filtered_data = deque() - for dx in data: - d = dx.fmd - _vars = vars(d) - for v in _vars.keys(): - if _query.search(v) or _query.search(str(_vars[v])): - filtered_data.append(dx) - break - return list(filtered_data) - - return list(data) - - async def list_files_dataset( - self, user_id: str, location: LocationName, dataset_id: str - ) -> list[FileMetaDataEx]: - # this is a cheap shot, needs fixing once storage/db is in sync - data = [] - if location == SIMCORE_S3_STR: - data: list[FileMetaDataEx] = await self.list_files( - user_id, location, uuid_filter=dataset_id + "/" - ) - - elif location == DATCORE_STR: - api_token, api_secret = self._get_datcore_tokens(user_id) - # lists all the files inside the dataset - assert self.app # nosec - assert api_secret # nosec - assert api_token # nosec - return await datcore_adapter.list_all_files_metadatas_in_dataset( - self.app, api_token, api_secret, dataset_id - ) - - return data - - async def list_datasets( - self, user_id: str, location: LocationName - ) -> list[DatasetMetaData]: - """Returns a list of top level datasets - - Works for simcore.s3 and datcore - - """ - data = [] - - if location == SIMCORE_S3_STR: - if self.has_project_db: - try: - async with self.engine.acquire() as conn, conn.begin(): - readable_projects_ids = await get_readable_project_ids( - conn, int(user_id) - ) - has_read_access = projects.c.uuid.in_(readable_projects_ids) - - # FIXME: this DOES NOT read from file-metadata table!!! - query = sa.select([projects.c.uuid, projects.c.name]).where( - has_read_access - ) - async for row in conn.execute(query): - dmd = DatasetMetaData( - dataset_id=row.uuid, - display_name=row.name, - ) - data.append(dmd) - except DBAPIError as _err: - logger.exception("Error querying database for project names") - - elif location == DATCORE_STR: - api_token, api_secret = self._get_datcore_tokens(user_id) - assert self.app # nosec - assert api_secret # nosec - assert api_token # nosec - return await datcore_adapter.list_datasets(self.app, api_token, api_secret) - - return data - - async def list_file( - self, user_id: str, location: LocationName, file_uuid: str - ) -> Optional[FileMetaDataEx]: - - if location == SIMCORE_S3_STR: - - async with self.engine.acquire() as conn, conn.begin(): - can: Optional[AccessRights] = await get_file_access_rights( - conn, int(user_id), file_uuid - ) - if can.read: - query = sa.select([file_meta_data]).where( - file_meta_data.c.file_uuid == file_uuid - ) - result = await conn.execute(query) - row = await result.first() - if not row: - return None - file_metadata = to_meta_data_extended(row) - if is_file_entry_valid(file_metadata.fmd): - return file_metadata - # we need to update from S3 here since the database is not up-to-date - file_metadata = await self.try_update_database_from_storage( - file_metadata.fmd.file_uuid, - file_metadata.fmd.bucket_name, - file_metadata.fmd.object_name, - reraise_exceptions=False, - ) - return file_metadata - # FIXME: returns None in both cases: file does not exist or use has no access - logger.debug("User %s cannot read file %s", user_id, file_uuid) - return None - - elif location == DATCORE_STR: - # FIXME: review return inconsistencies - # api_token, api_secret = self._get_datcore_tokens(user_id) - import warnings - - 
warnings.warn("NOT IMPLEMENTED!!!") - return None - - # UPLOAD/DOWNLOAD LINKS --------------------------- - - async def upload_file_to_datcore( - self, _user_id: str, _local_file_path: str, _destination_id: str - ): - import warnings - - warnings.warn(f"NOT IMPLEMENTED!!! in {self.__class__}") - # uploads a locally available file to dat core given the storage path, optionally attached some meta data - # api_token, api_secret = self._get_datcore_tokens(user_id) - # await dcw.upload_file_to_id(destination_id, local_file_path) - - async def try_update_database_from_storage( - self, - file_uuid: str, - bucket_name: str, - object_name: str, - *, - reraise_exceptions: bool, - ) -> Optional[FileMetaDataEx]: - try: - async with self._create_aiobotocore_client_context() as aioboto_client: - result = await aioboto_client.head_object( - Bucket=bucket_name, Key=object_name - ) # type: ignore - - file_size = result["ContentLength"] # type: ignore - last_modified = result["LastModified"] # type: ignore - entity_tag = result["ETag"].strip('"') # type: ignore - - async with self.engine.acquire() as conn: - result: ResultProxy = await conn.execute( - file_meta_data.update() - .where(file_meta_data.c.file_uuid == file_uuid) - .values( - file_size=file_size, - last_modified=last_modified, - entity_tag=entity_tag, - ) - .returning(literal_column("*")) - ) - if not result: - return None - row: Optional[RowProxy] = await result.first() - if not row: - return None - - return to_meta_data_extended(row) - except botocore.exceptions.ClientError: - logger.warning("Error happened while trying to access %s", file_uuid) - if reraise_exceptions: - raise - - async def auto_update_database_from_storage_task( - self, file_uuid: str, bucket_name: str, object_name: str - ) -> Optional[FileMetaDataEx]: - async for attempt in AsyncRetrying( - stop=stop_after_delay(1 * _HOUR), - wait=wait_exponential(multiplier=0.1, exp_base=1.2, max=30), - retry=(retry_if_exception_type()), - before_sleep=before_sleep_log(logger, logging.INFO), - ): - with attempt: - return await self.try_update_database_from_storage( - file_uuid, bucket_name, object_name, reraise_exceptions=True - ) - - async def update_metadata( - self, file_uuid: str, user_id: int - ) -> Optional[FileMetaDataEx]: - async with self.engine.acquire() as conn: - can: Optional[AccessRights] = await get_file_access_rights( - conn, int(user_id), file_uuid - ) - if not can.write: - raise web.HTTPForbidden( - reason=f"User {user_id} was not allowed to upload file {file_uuid}" - ) - - bucket_name = self.simcore_bucket_name - object_name = file_uuid - return await self.auto_update_database_from_storage_task( - file_uuid=file_uuid, - bucket_name=bucket_name, - object_name=object_name, - ) - - async def _generate_metadata_for_link(self, user_id: str, file_uuid: str): - """ - Updates metadata table when link is used and upload is successfuly completed - - SEE _metadata_file_updater - """ - - async with self.engine.acquire() as conn: - can: Optional[AccessRights] = await get_file_access_rights( - conn, int(user_id), file_uuid - ) - if not can.write: - raise web.HTTPForbidden( - reason=f"User {user_id} does not have enough access rights to upload file {file_uuid}" - ) - - @retry(**postgres_service_retry_policy_kwargs) - async def _init_metadata() -> tuple[int, str]: - async with self.engine.acquire() as conn: - fmd = FileMetaData() - fmd.simcore_from_uuid(file_uuid, self.simcore_bucket_name) - fmd.user_id = user_id # NOTE: takes ownership of uploaded data - - # if file already exists, we 
might want to update a time-stamp - - # upsert file_meta_data - insert_stmt = pg_insert(file_meta_data).values(**vars(fmd)) - do_nothing_stmt = insert_stmt.on_conflict_do_nothing( - index_elements=["file_uuid"] - ) - await conn.execute(do_nothing_stmt) - - return fmd.file_size, fmd.last_modified - - await _init_metadata() - - async def upload_link( - self, user_id: str, file_uuid: str, as_presigned_link: bool - ) -> AnyUrl: - """returns: a presigned upload link - - NOTE: updates metadata once the upload is concluded""" - await self._generate_metadata_for_link(user_id=user_id, file_uuid=file_uuid) - - bucket_name = self.simcore_bucket_name - object_name = file_uuid - - # a parallel task is tarted which will update the metadata of the updated file - # once the update has finished. - fire_and_forget_task( - self.auto_update_database_from_storage_task( - file_uuid=file_uuid, - bucket_name=bucket_name, - object_name=object_name, - ), - task_suffix_name=f"auto_update_{file_uuid=}_{user_id=}", - fire_and_forget_tasks_collection=self.app[APP_FIRE_AND_FORGET_TASKS_KEY], - ) - - link = f"s3://{bucket_name}/{urllib.parse.quote( object_name)}" - if as_presigned_link: - link = self.s3_client.create_presigned_put_url(bucket_name, object_name) - return parse_obj_as(AnyUrl, f"{link}") - - async def download_link_s3( - self, file_uuid: str, user_id: int, as_presigned_link: bool - ) -> str: - - # access layer - async with self.engine.acquire() as conn: - can: Optional[AccessRights] = await get_file_access_rights( - conn, int(user_id), file_uuid - ) - if not can.read: - # NOTE: this is tricky. A user with read access can download and data! - # If write permission would be required, then shared projects as views cannot - # recover data in nodes (e.g. jupyter cannot pull work data) - # - raise web.HTTPForbidden( - reason=f"User {user_id} does not have enough rights to download file {file_uuid}" - ) - - bucket_name = self.simcore_bucket_name - async with self.engine.acquire() as conn: - stmt = sa.select([file_meta_data.c.object_name]).where( - file_meta_data.c.file_uuid == file_uuid - ) - object_name: Optional[str] = await conn.scalar(stmt) - - if object_name is None: - raise web.HTTPNotFound( - reason=f"File '{file_uuid}' does not exists in storage." 
- ) - link = parse_obj_as( - AnyUrl, f"s3://{bucket_name}/{urllib.parse.quote( object_name)}" - ) - if as_presigned_link: - link = self.s3_client.create_presigned_get_url(bucket_name, object_name) - return f"{link}" - - async def download_link_datcore(self, user_id: str, file_id: str) -> URL: - api_token, api_secret = self._get_datcore_tokens(user_id) - assert self.app # nosec - assert api_secret # nosec - assert api_token # nosec - return await datcore_adapter.get_file_download_presigned_link( - self.app, api_token, api_secret, file_id + dsm_provider = DataManagerProvider(app) + dsm_provider.register_builder( + SimcoreS3DataManager.get_location_id(), + create_simcore_s3_data_manager, + SimcoreS3DataManager, ) - - # COPY ----------------------------- - - async def copy_file_s3_s3( - self, user_id: str, dest_uuid: str, source_uuid: str - ) -> None: - # FIXME: operation MUST be atomic - - # source is s3, location is s3 - to_bucket_name = self.simcore_bucket_name - to_object_name = dest_uuid - from_bucket = self.simcore_bucket_name - from_object_name = source_uuid - await asyncio.get_event_loop().run_in_executor( - None, - self.s3_client.copy_object, - to_bucket_name, - to_object_name, - from_bucket, - from_object_name, + dsm_provider.register_builder( + DatCoreDataManager.get_location_id(), + create_datcore_data_manager, + DatCoreDataManager, ) + app[APP_DSM_KEY] = dsm_provider - # update db - async with self.engine.acquire() as conn: - fmd = FileMetaData() - fmd.simcore_from_uuid(dest_uuid, self.simcore_bucket_name) - fmd.user_id = user_id - ins = file_meta_data.insert().values(**vars(fmd)) - await conn.execute(ins) - - async def copy_file_s3_datcore( - self, user_id: str, dest_uuid: str, source_uuid: str - ): - assert self.app # nosec - session = get_client_session(self.app) - - # source is s3, get link and copy to datcore - bucket_name = self.simcore_bucket_name - object_name = source_uuid - filename = source_uuid.split("/")[-1] - - s3_dowload_link = self.s3_client.create_presigned_get_url( - bucket_name, object_name - ) - - with tempfile.TemporaryDirectory() as tmpdir: - # FIXME: connect download and upload streams - local_file_path = os.path.join(tmpdir, filename) - - # Downloads S3 -> local - await download_to_file_or_raise(session, s3_dowload_link, local_file_path) - - # Uploads local -> DATCore - await self.upload_file_to_datcore( - _user_id=user_id, - _local_file_path=local_file_path, - _destination_id=dest_uuid, - ) - - async def copy_file_datcore_s3( - self, - user_id: str, - dest_uuid: str, - source_uuid: str, - filename_missing: bool = False, - ): - assert self.app # nosec - session = get_client_session(self.app) - - # 2 steps: Get download link for local copy, the upload link to s3 - # TODO: This should be a redirect stream! - dc_link, filename = await self.download_link_datcore( - user_id=user_id, file_id=source_uuid - ) - if filename_missing: - dest_uuid = str(Path(dest_uuid) / filename) - - s3_upload_link = await self.upload_link( - user_id, dest_uuid, as_presigned_link=True - ) - - with tempfile.TemporaryDirectory() as tmpdir: - # FIXME: connect download and upload streams - - local_file_path = os.path.join(tmpdir, filename) - - # Downloads DATCore -> local - await download_to_file_or_raise(session, dc_link, local_file_path) - - # Uploads local -> S3 - s3_upload_link = URL(s3_upload_link) - async with session.put( - s3_upload_link, - data=Path(local_file_path).open("rb"), - raise_for_status=True, - ) as resp: - logger.debug( - "Uploaded local -> SIMCore %s . 
Status %s", - s3_upload_link, - resp.status, - ) - - return dest_uuid - - async def copy_file( - self, - user_id: str, - dest_location: LocationName, - dest_uuid: str, - source_location: LocationName, - source_uuid: str, - ) -> None: - if source_location == SIMCORE_S3_STR: - if dest_location == DATCORE_STR: - await self.copy_file_s3_datcore(user_id, dest_uuid, source_uuid) - elif dest_location == SIMCORE_S3_STR: - await self.copy_file_s3_s3(user_id, dest_uuid, source_uuid) - elif source_location == DATCORE_STR: - if dest_location == DATCORE_STR: - raise NotImplementedError("copy files from datcore 2 datcore not impl") - if dest_location == SIMCORE_S3_STR: - await self.copy_file_datcore_s3(user_id, dest_uuid, source_uuid) - - async def deep_copy_project_simcore_s3( - self, - user_id: str, - source_project: dict[str, Any], - destination_project: dict[str, Any], - node_mapping: dict[str, str], - ): - """Parses a given source project and copies all related files to the destination project - - Since all files are organized as - - project_id/node_id/filename or links to datcore - - this function creates a new folder structure - - project_id/node_id/filename - - and copies all files to the corresponding places. - - Additionally, all external files from datcore are being copied and the paths in the destination - project are adapted accordingly - - Lastly, the meta data db is kept in sync - """ - source_folder = source_project["uuid"] - dest_folder = destination_project["uuid"] - - # access layer - async with self.engine.acquire() as conn, conn.begin(): - source_access_rights = await get_project_access_rights( - conn, int(user_id), project_id=source_folder - ) - dest_access_rights = await get_project_access_rights( - conn, int(user_id), project_id=dest_folder - ) - if not source_access_rights.read: - raise web.HTTPForbidden( - reason=f"User {user_id} does not have enough access rights to read from project '{source_folder}'" - ) - - if not dest_access_rights.write: - raise web.HTTPForbidden( - reason=f"User {user_id} does not have enough access rights to write to project '{dest_folder}'" - ) - - # build up naming map based on labels - uuid_name_dict = {} - uuid_name_dict[dest_folder] = destination_project["name"] - for src_node_id, src_node in source_project["workbench"].items(): - new_node_id = node_mapping.get(src_node_id) - if new_node_id is not None: - uuid_name_dict[new_node_id] = src_node["label"] - - async with self._create_aiobotocore_client_context() as aioboto_client: - - logger.debug( - "Listing all items under %s:%s/", - self.simcore_bucket_name, - source_folder, - ) - - # Step 1: list all objects for this project replace them with the destination object name - # and do a copy at the same time collect some names - # Note: the / at the end of the Prefix is VERY important, makes the listing several order of magnitudes faster - response = await aioboto_client.list_objects_v2( - Bucket=self.simcore_bucket_name, Prefix=f"{source_folder}/" - ) - - contents: list = response.get("Contents", []) - logger.debug( - "Listed %s items under %s:%s/", - len(contents), - self.simcore_bucket_name, - source_folder, - ) - - for item in contents: - source_object_name = item["Key"] - source_object_parts = Path(source_object_name).parts - - if len(source_object_parts) != 3: - # This may happen once we have shared/home folders - # FIXME: this might cause problems - logger.info( - "Skipping copy of '%s'. 
Expected three parts path!", - source_object_name, - ) - continue - - old_node_id = source_object_parts[1] - new_node_id = node_mapping.get(old_node_id) - if new_node_id is not None: - old_filename = source_object_parts[2] - dest_object_name = str( - Path(dest_folder) / new_node_id / old_filename - ) - - copy_kwargs = dict( - CopySource={ - "Bucket": self.simcore_bucket_name, - "Key": source_object_name, - }, - Bucket=self.simcore_bucket_name, - Key=dest_object_name, - ) - logger.debug("Copying %s ...", copy_kwargs) - - # FIXME: if 5GB, it must use multipart upload Upload Part - Copy API - # SEE https://botocore.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.copy_object - await aioboto_client.copy_object(**copy_kwargs) - - # Step 2: list all references in outputs that point to datcore and copy over - for node_id, node in destination_project["workbench"].items(): - outputs: dict = node.get("outputs", {}) - for _, output in outputs.items(): - source = output["path"] - - if output.get("store") == DATCORE_ID: - destination_folder = str(Path(dest_folder) / node_id) - logger.info("Copying %s to %s", source, destination_folder) - - destination = await self.copy_file_datcore_s3( - user_id=user_id, - dest_uuid=destination_folder, - source_uuid=source, - filename_missing=True, - ) - assert destination.startswith(destination_folder) # nosec - - output["store"] = SIMCORE_S3_ID - output["path"] = destination - - elif output.get("store") == SIMCORE_S3_ID: - destination = str(Path(dest_folder) / node_id / Path(source).name) - output["store"] = SIMCORE_S3_ID - output["path"] = destination + yield - fmds = [] - async with self._create_aiobotocore_client_context() as aioboto_client: + logger.info("Shuting down %s", f"{dsm_provider=}") - # step 3: list files first to create fmds - # Note: the / at the end of the Prefix is VERY important, makes the listing several order of magnitudes faster - response = await aioboto_client.list_objects_v2( - Bucket=self.simcore_bucket_name, Prefix=f"{dest_folder}/" - ) - - if "Contents" in response: - for item in response["Contents"]: - fmd = FileMetaData() - fmd.simcore_from_uuid(item["Key"], self.simcore_bucket_name) - fmd.project_name = uuid_name_dict.get(dest_folder, "Untitled") - fmd.node_name = uuid_name_dict.get(fmd.node_id, "Untitled") - fmd.raw_file_path = fmd.file_uuid - fmd.display_file_path = str( - Path(fmd.project_name) / fmd.node_name / fmd.file_name - ) - fmd.user_id = user_id - fmd.file_size = item["Size"] - fmd.last_modified = str(item["LastModified"]) - fmds.append(fmd) - - # step 4 sync db - async with self.engine.acquire() as conn, conn.begin(): - # TODO: upsert in one statment of ALL - for fmd in fmds: - query = sa.select([file_meta_data]).where( - file_meta_data.c.file_uuid == fmd.file_uuid - ) - # if file already exists, we might w - rows = await conn.execute(query) - exists = await rows.scalar() - if exists: - delete_me = file_meta_data.delete().where( - file_meta_data.c.file_uuid == fmd.file_uuid - ) - await conn.execute(delete_me) - ins = file_meta_data.insert().values(**vars(fmd)) - await conn.execute(ins) - - # DELETE ------------------------------------- - - async def delete_file(self, user_id: str, location: LocationName, file_uuid: str): - """Deletes a file given its fmd and location - - Additionally requires a user_id for 3rd party auth - - For internal storage, the db state should be updated upon completion via - Notification mechanism - - For simcore.s3 we can use the file_name - For datcore we need the full 
path - """ - if location == SIMCORE_S3_STR: - # FIXME: operation MUST be atomic, transaction?? - - to_delete = [] - async with self.engine.acquire() as conn, conn.begin(): - can: Optional[AccessRights] = await get_file_access_rights( - conn, int(user_id), file_uuid - ) - if not can.delete: - raise web.HTTPForbidden( - reason=f"User {user_id} does not have enough access rights to delete file {file_uuid}" - ) - - query = sa.select( - [file_meta_data.c.bucket_name, file_meta_data.c.object_name] - ).where(file_meta_data.c.file_uuid == file_uuid) - - async for row in conn.execute(query): - if self.s3_client.remove_objects( - row.bucket_name, [row.object_name] - ): - to_delete.append(file_uuid) - - await conn.execute( - file_meta_data.delete().where( - file_meta_data.c.file_uuid.in_(to_delete) - ) - ) - - elif location == DATCORE_STR: - # FIXME: review return inconsistencies - api_token, api_secret = self._get_datcore_tokens(user_id) - assert self.app # nosec - assert api_secret # nosec - assert api_token # nosec - await datcore_adapter.delete_file( - self.app, api_token, api_secret, file_uuid - ) - - async def delete_project_simcore_s3( - self, user_id: str, project_id: str, node_id: Optional[str] = None - ) -> Optional[web.Response]: - - """Deletes all files from a given node in a project in simcore.s3 and updated db accordingly. - If node_id is not given, then all the project files db entries are deleted. - """ - - # FIXME: operation MUST be atomic. Mark for deletion and remove from db when deletion fully confirmed - - async with self.engine.acquire() as conn, conn.begin(): - # access layer - can: Optional[AccessRights] = await get_project_access_rights( - conn, int(user_id), project_id - ) - if not can.delete: - raise web.HTTPForbidden( - reason=f"User {user_id} does not have delete access for {project_id}" - ) - - delete_me = file_meta_data.delete().where( - file_meta_data.c.project_id == project_id, - ) - if node_id: - delete_me = delete_me.where(file_meta_data.c.node_id == node_id) - await conn.execute(delete_me) - - async with self._create_aiobotocore_client_context() as aioboto_client: - # Note: the / at the end of the Prefix is VERY important, makes the listing several order of magnitudes faster - response = await aioboto_client.list_objects_v2( - Bucket=self.simcore_bucket_name, - Prefix=f"{project_id}/{node_id}/" if node_id else f"{project_id}/", - ) - - objects_to_delete = [] - for f in response.get("Contents", []): - objects_to_delete.append({"Key": f["Key"]}) - - if objects_to_delete: - response = await aioboto_client.delete_objects( - Bucket=self.simcore_bucket_name, - Delete={"Objects": objects_to_delete}, - ) - return response - - # SEARCH ------------------------------------- - - async def search_files_starting_with( - self, user_id: int, prefix: str - ) -> list[FileMetaDataEx]: - # Avoids using list_files since it accounts for projects/nodes - # Storage should know NOTHING about those concepts - files_meta = deque() - - async with self.engine.acquire() as conn, conn.begin(): - # access layer - can_read_projects_ids = await get_readable_project_ids(conn, int(user_id)) - has_read_access = ( - file_meta_data.c.user_id == str(user_id) - ) | file_meta_data.c.project_id.in_(can_read_projects_ids) - - stmt = sa.select([file_meta_data]).where( - file_meta_data.c.file_uuid.startswith(prefix) & has_read_access - ) - - async for row in conn.execute(stmt): - meta_extended = to_meta_data_extended(row) - files_meta.append(meta_extended) - - return list(files_meta) - - async def 
create_soft_link( - self, user_id: int, target_uuid: str, link_uuid: str - ) -> FileMetaDataEx: - - # validate link_uuid - async with self.engine.acquire() as conn: - # TODO: select exists(select 1 from file_metadat where file_uuid=12) - found = await conn.scalar( - sa.select([file_meta_data.c.file_uuid]).where( - file_meta_data.c.file_uuid == link_uuid - ) - ) - if found: - raise ValueError(f"Invalid link {link_uuid}. Link already exists") - - # validate target_uuid - target = await self.list_file(str(user_id), SIMCORE_S3_STR, target_uuid) - if not target: - raise ValueError( - f"Invalid target '{target_uuid}'. File does not exists for this user" - ) - - # duplicate target and change the following columns: - target.fmd.file_uuid = link_uuid - target.fmd.file_id = link_uuid # NOTE: api-server relies on this id - target.fmd.is_soft_link = True - - async with self.engine.acquire() as conn: - stmt = ( - file_meta_data.insert() - .values(**attr.asdict(target.fmd)) - .returning(literal_column("*")) - ) - - result = await conn.execute(stmt) - link = to_meta_data_extended(await result.first()) - return link - - async def synchronise_meta_data_table( - self, location: LocationName, dry_run: bool - ) -> dict[str, Any]: - - PRUNE_CHUNK_SIZE = 20 - - removed: list[str] = [] - to_remove: list[str] = [] - - async def _prune_db_table(conn): - if not dry_run: - await conn.execute( - file_meta_data.delete().where( - file_meta_data.c.object_name.in_(to_remove) - ) - ) - logger.info( - "%s %s orphan items", - "Would have deleted" if dry_run else "Deleted", - len(to_remove), - ) - removed.extend(to_remove) - to_remove.clear() - - # ---------- - - assert ( # nosec - location == SIMCORE_S3_STR - ), "Only with s3, no other sync implemented" # nosec - - if location == SIMCORE_S3_STR: - - # NOTE: only valid for simcore, since datcore data is not in the database table - # let's get all the files in the table - logger.warning( - "synchronisation of database/s3 storage started, this will take some time..." 
- ) - - async with self.engine.acquire() as conn, self._create_aiobotocore_client_context() as aioboto_client: - - number_of_rows_in_db = ( - await conn.scalar( - sa.select([sa.func.count()]).select_from(file_meta_data) - ) - or 0 - ) - logger.warning( - "Total number of entries to check %d", - number_of_rows_in_db, - ) - - assert isinstance(aioboto_client, AioBaseClient) # nosec - - async for row in conn.execute( - sa.select([file_meta_data.c.object_name]) - ): - s3_key = row.object_name # type: ignore - - # now check if the file exists in S3 - # SEE https://www.peterbe.com/plog/fastest-way-to-find-out-if-a-file-exists-in-s3 - response = await aioboto_client.list_objects_v2( - Bucket=self.simcore_bucket_name, Prefix=s3_key - ) - if response.get("KeyCount", 0) == 0: - # this file does not exist in S3 - to_remove.append(s3_key) - - if len(to_remove) >= PRUNE_CHUNK_SIZE: - await _prune_db_table(conn) - - if to_remove: - await _prune_db_table(conn) + # ------ - assert len(to_remove) == 0 # nosec - assert len(removed) <= number_of_rows_in_db # nosec + app.cleanup_ctx.append(_cleanup_context) - logger.info( - "%s %d entries ", - "Would delete" if dry_run else "Deleting", - len(removed), - ) - return {"removed": removed} +def get_dsm_provider(app: web.Application) -> DataManagerProvider: + dsm_provider: DataManagerProvider = app[APP_DSM_KEY] + return dsm_provider diff --git a/services/storage/src/simcore_service_storage/dsm_cleaner.py b/services/storage/src/simcore_service_storage/dsm_cleaner.py new file mode 100644 index 00000000000..1d16aadb008 --- /dev/null +++ b/services/storage/src/simcore_service_storage/dsm_cleaner.py @@ -0,0 +1,73 @@ +""" backround task that cleans the DSM pending/expired uploads + +# Rationale: + - for each upload an entry is created in the file_meta_data table in the database + - then an upload link (S3/HTTP URL) is created through S3 backend and sent back to the client + - the client shall upload the file and then notify DSM of completion + - upon completion the corresponding entry in file_meta_data is updated: + - the file_size of the uploaded file is set + - the upload_expiration_date is set to null + +# DSM cleaner: + - runs at an interval + - list the entries that are expired in the database by checking "upload_expires_at" column + - tries to update from S3 the database first, if that fails: + - removes the entries in the database that are expired: + - removes the entry +""" + +import asyncio +import logging +import os +import socket +from contextlib import suppress +from typing import cast + +from aiohttp import web +from simcore_service_storage.simcore_s3_dsm import SimcoreS3DataManager + +from .constants import APP_CONFIG_KEY, APP_DSM_KEY +from .dsm_factory import DataManagerProvider +from .settings import Settings + +logger = logging.getLogger(__name__) + + +async def dsm_cleaner_task(app: web.Application) -> None: + logger.info("starting dsm cleaner task...") + cfg: Settings = app[APP_CONFIG_KEY] + dsm: DataManagerProvider = app[APP_DSM_KEY] + simcore_s3_dsm = cast( + SimcoreS3DataManager, dsm.get(SimcoreS3DataManager.get_location_id()) + ) + assert cfg.STORAGE_CLEANER_INTERVAL_S # nosec + while await asyncio.sleep(cfg.STORAGE_CLEANER_INTERVAL_S, result=True): + try: + await simcore_s3_dsm.clean_expired_uploads() + + except asyncio.CancelledError: + logger.info("cancelled dsm cleaner task") + raise + except Exception: # pylint: disable=broad-except + logger.exception( + "Unhandled error in dsm cleaner task, restarting task...", exc_info=True + ) + + +def 
setup_dsm_cleaner(app: web.Application): + async def _setup(app: web.Application): + task = asyncio.create_task( + dsm_cleaner_task(app), + name=f"dsm_cleaner_task_{socket.gethostname()}_{os.getpid()}", + ) + logger.info("%s created", f"{task=}") + + yield + + logger.debug("stopping %s...", f"{task=}") + task.cancel() + with suppress(asyncio.CancelledError): + await task + logger.info("%s stopped.", f"{task=}") + + app.cleanup_ctx.append(_setup) diff --git a/services/storage/src/simcore_service_storage/dsm_factory.py b/services/storage/src/simcore_service_storage/dsm_factory.py new file mode 100644 index 00000000000..0ea39e8ede9 --- /dev/null +++ b/services/storage/src/simcore_service_storage/dsm_factory.py @@ -0,0 +1,119 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from typing import Callable + +from aiohttp import web +from models_library.api_schemas_storage import LinkType +from models_library.projects_nodes_io import LocationID, LocationName, StorageFileID +from models_library.users import UserID +from pydantic import AnyUrl + +from .models import DatasetMetaData, FileMetaData + + +class BaseDataManager(ABC): + @property + def location_id(self) -> LocationID: + """returns the location Identifier (must be unique)""" + return self.get_location_id() + + @classmethod + @abstractmethod + def get_location_id(cls) -> LocationID: + """returns the location Identifier (must be unique)""" + + @property + def location_name(self) -> LocationName: + """returns the location human readable name (must be unique)""" + return self.get_location_name() + + @classmethod + @abstractmethod + def get_location_name(cls) -> LocationName: + """returns the location human readable name (must be unique)""" + + @abstractmethod + async def authorized(self, user_id: UserID) -> bool: + """returns True if user with user_id is authorized to access the storage""" + + @abstractmethod + async def list_datasets(self, user_id: UserID) -> list[DatasetMetaData]: + """returns all the top level datasets a user has access to""" + + @abstractmethod + async def list_files_in_dataset( + self, user_id: UserID, dataset_id: str + ) -> list[FileMetaData]: + """returns all the file meta data inside dataset with dataset_id""" + + @abstractmethod + async def list_files( + self, user_id: UserID, uuid_filter: str = "" + ) -> list[FileMetaData]: + """returns all the file meta data a user has access to (uuid_filter may be used)""" + + @abstractmethod + async def get_file(self, user_id: UserID, file_id: StorageFileID) -> FileMetaData: + """returns the file meta data of file_id if user_id has the rights to""" + + @abstractmethod + async def create_file_upload_link( + self, + user_id: UserID, + file_id: StorageFileID, + link_type: LinkType, + ) -> AnyUrl: + """creates an upload file link if user has the rights to""" + + @abstractmethod + async def abort_file_upload(self, user_id: UserID, file_id: StorageFileID) -> None: + """aborts an upload if user has the rights to, and reverts + to the latest version if available, else will delete the file""" + + @abstractmethod + async def create_file_download_link( + self, user_id: UserID, file_id: StorageFileID, link_type: LinkType + ) -> AnyUrl: + """creates a download file link if user has the rights to""" + + @abstractmethod + async def delete_file(self, user_id: UserID, file_id: StorageFileID) -> None: + """deletes file if user has the rights to""" + + +@dataclass +class DataManagerProvider: + app: web.Application + _builders: dict[ + LocationID, + 
tuple[Callable[[web.Application], BaseDataManager], type[BaseDataManager]], + ] = field(default_factory=dict) + _services: list[BaseDataManager] = field(default_factory=list) + + def register_builder( + self, + location_id: LocationID, + builder: Callable[[web.Application], BaseDataManager], + dsm_type: type[BaseDataManager], + ): + self._builders[location_id] = (builder, dsm_type) + + def _create(self, location_id: LocationID, **kwargs) -> BaseDataManager: + + builder_and_type = self._builders.get(location_id) + if not builder_and_type: + raise ValueError(location_id) + builder, _dsm_type = builder_and_type + new_dsm = builder(self.app, **kwargs) + self._services.append(new_dsm) + return new_dsm + + def get(self, location_id: LocationID) -> BaseDataManager: + for dsm in self._services: + if dsm.location_id == location_id: + return dsm + # try to create it + return self._create(location_id) + + def locations(self) -> list[LocationID]: + return list(self._builders.keys()) diff --git a/services/storage/src/simcore_service_storage/exceptions.py b/services/storage/src/simcore_service_storage/exceptions.py new file mode 100644 index 00000000000..e65684ce238 --- /dev/null +++ b/services/storage/src/simcore_service_storage/exceptions.py @@ -0,0 +1,50 @@ +from pydantic.errors import PydanticErrorMixin + + +class StorageRuntimeError(PydanticErrorMixin, RuntimeError): + ... + + +class DatabaseAccessError(StorageRuntimeError): + code = "database.access_error" + msg_template: str = "Unexpected error while accessing database backend" + + +class FileMetaDataNotFoundError(DatabaseAccessError): + code = "filemetadata.not_found_error" + msg_template: str = "The file meta data for {file_id} was not found" + + +class FileAccessRightError(DatabaseAccessError): + code = "file.access_right_error" + msg_template: str = "Insufficient access rights to {access_right} {file_id}" + + +class ProjectAccessRightError(DatabaseAccessError): + code = "file.access_right_error" + msg_template: str = "Insufficient access rights to {access_right} {project_id}" + + +class ProjectNotFoundError(DatabaseAccessError): + code = "project.not_found_error" + msg_template: str = "Project {project_id} was not found" + + +class LinkAlreadyExistsError(DatabaseAccessError): + code = "link.already_exists_error" + msg_template: str = "The link {file_id} already exists" + + +class S3AccessError(StorageRuntimeError): + code = "s3_access.error" + msg_template: str = "Unexpected error while accessing S3 backend" + + +class S3BucketInvalidError(S3AccessError): + code = "s3_bucket.invalid_error" + msg_template: str = "The {bucket} is invalid" + + +class S3KeyNotFoundError(S3AccessError): + code = "s3_key.not_found_error" + msg_template: str = "The file {key} in {bucket} was not found" diff --git a/services/storage/src/simcore_service_storage/handlers.py b/services/storage/src/simcore_service_storage/handlers.py deleted file mode 100644 index 154399f1456..00000000000 --- a/services/storage/src/simcore_service_storage/handlers.py +++ /dev/null @@ -1,487 +0,0 @@ -import asyncio -import json -import logging -import urllib.parse -from contextlib import contextmanager -from typing import Any, Optional - -from aiohttp import web -from aiohttp.web import RouteTableDef -from models_library.users import UserID -from models_library.utils.fastapi_encoders import jsonable_encoder -from servicelib.aiohttp.application_keys import APP_CONFIG_KEY -from servicelib.aiohttp.rest_utils import extract_and_validate -from settings_library.s3 import S3Settings - -# 
Exclusive for simcore-s3 storage ----------------------- -from . import sts -from ._meta import api_vtag -from .access_layer import InvalidFileIdentifier -from .constants import APP_DSM_KEY, DATCORE_STR, SIMCORE_S3_ID, SIMCORE_S3_STR -from .db_tokens import get_api_token_and_secret -from .dsm import DataStorageManager, DatCoreApiToken -from .models import DatasetMetaData, FileMetaDataEx -from .settings import Settings -from .temporary_handlers_utils import convert_to_api_dataset, convert_to_api_fmd - -log = logging.getLogger(__name__) - -routes = RouteTableDef() - - -async def _prepare_storage_manager( - params: dict, - query: dict, - request: web.Request, - force_check_datcore_tokens: bool = False, -) -> DataStorageManager: - # FIXME: scope properly, either request or app level!! - # Notice that every request is changing tokens! - # I would rather store tokens in request instead of in dsm - # or creating an different instance of dsm per request - - INIT_STR = "init" - dsm: DataStorageManager = request.app[APP_DSM_KEY] - user_id = query.get("user_id") - location = dsm.location_from_id(params.get("location_id", 0)) - - if user_id and (location in (INIT_STR, DATCORE_STR) or force_check_datcore_tokens): - # TODO: notify from db instead when tokens changed, then invalidate resource which enforces - # re-query when needed. - - # updates from db - token_info = await get_api_token_and_secret(request.app, int(user_id)) - if all(token_info): - dsm.datcore_tokens[user_id] = DatCoreApiToken(*token_info) - else: - dsm.datcore_tokens.pop(user_id, None) - return dsm - - -@contextmanager -def handle_storage_errors(): - """Basic policies to translate low-level errors into HTTP errors""" - # TODO: include _prepare_storage_manager? - # TODO: middleware? decorator? - try: - - yield - - except InvalidFileIdentifier as err: - raise web.HTTPUnprocessableEntity( - reason=f"{err} is an invalid file identifier" - ) from err - - -# HANDLERS --------------------------------------------------- - - -@routes.get(f"/{api_vtag}/locations", name="get_storage_locations") # type: ignore -async def get_storage_locations(request: web.Request): - log.debug("CHECK LOCATION PATH %s %s", request.path, request.url) - - params, query, body = await extract_and_validate(request) - - assert not params, "params %s" % params # nosec - assert query, "query %s" % query # nosec - assert not body, "body %s" % body # nosec - - assert query["user_id"] # nosec - - with handle_storage_errors(): - user_id = query["user_id"] - # NOTE: temporary, will be refactored - dsm = await _prepare_storage_manager( - params, query, request, force_check_datcore_tokens=True - ) - locs = await dsm.locations(user_id) - - return {"error": None, "data": locs} - - -@routes.get(f"/{api_vtag}/locations/{{location_id}}/datasets") # type: ignore -async def get_datasets_metadata(request: web.Request): - log.debug("GET METADATA DATASETS %s %s", request.path, request.url) - - params, query, body = await extract_and_validate(request) - - assert params, "params %s" % params # nosec - assert query, "query %s" % query # nosec - assert not body, "body %s" % body # nosec - - assert params["location_id"] # nosec - assert query["user_id"] # nosec - - with handle_storage_errors(): - - location_id = params["location_id"] - user_id = query["user_id"] - - dsm = await _prepare_storage_manager(params, query, request) - - location = dsm.location_from_id(location_id) - # To implement - data: list[DatasetMetaData] = await dsm.list_datasets(user_id, location) - py_data = 
[jsonable_encoder(convert_to_api_dataset(d)) for d in data] - return {"error": None, "data": py_data} - - -@routes.get(f"/{api_vtag}/locations/{{location_id}}/files/metadata") # type: ignore -async def get_files_metadata(request: web.Request): - log.debug("GET FILES METADATA %s %s", request.path, request.url) - - params, query, body = await extract_and_validate(request) - - assert params, "params %s" % params # nosec - assert query, "query %s" % query # nosec - assert not body, "body %s" % body # nosec - - assert params["location_id"] # nosec - assert query["user_id"] # nosec - - with handle_storage_errors(): - location_id = params["location_id"] - user_id = query["user_id"] - uuid_filter = query.get("uuid_filter", "") - - dsm = await _prepare_storage_manager(params, query, request) - location = dsm.location_from_id(location_id) - - log.debug("list files %s %s %s", user_id, location, uuid_filter) - - data: list[FileMetaDataEx] = await dsm.list_files( - user_id=user_id, location=location, uuid_filter=uuid_filter - ) - py_data = [jsonable_encoder(convert_to_api_fmd(d)) for d in data] - return {"error": None, "data": py_data} - - -@routes.get(f"/{api_vtag}/locations/{{location_id}}/datasets/{{dataset_id}}/metadata") # type: ignore -async def get_files_metadata_dataset(request: web.Request): - log.debug("GET FILES METADATA DATASET %s %s", request.path, request.url) - - params, query, body = await extract_and_validate(request) - - assert params, "params %s" % params # nosec - assert query, "query %s" % query # nosec - assert not body, "body %s" % body # nosec - - assert params["location_id"] # nosec - assert params["dataset_id"] # nosec - assert query["user_id"] # nosec - - with handle_storage_errors(): - location_id = params["location_id"] - user_id = query["user_id"] - dataset_id = params["dataset_id"] - - dsm = await _prepare_storage_manager(params, query, request) - - location = dsm.location_from_id(location_id) - - log.debug("list files %s %s %s", user_id, location, dataset_id) - - data: list[FileMetaDataEx] = await dsm.list_files_dataset( - user_id=user_id, location=location, dataset_id=dataset_id - ) - - py_data = [jsonable_encoder(convert_to_api_fmd(d)) for d in data] - return {"error": None, "data": py_data} - - -@routes.get(f"/{api_vtag}/locations/{{location_id}}/files/{{file_id}}/metadata") # type: ignore -async def get_file_metadata(request: web.Request): - params, query, body = await extract_and_validate(request) - - assert params, "params %s" % params # nosec - assert query, "query %s" % query # nosec - assert not body, "body %s" % body # nosec - - assert params["location_id"] # nosec - assert params["file_id"] # nosec - assert query["user_id"] # nosec - - with handle_storage_errors(): - location_id = params["location_id"] - user_id = query["user_id"] - file_uuid = params["file_id"] - - dsm = await _prepare_storage_manager(params, query, request) - location = dsm.location_from_id(location_id) - - data: Optional[FileMetaDataEx] = await dsm.list_file( - user_id=user_id, location=location, file_uuid=file_uuid - ) - # when no metadata is found - if data is None: - # NOTE: This is what happens Larry... 
data must be an empty {} or else some old - # dynamic services will FAIL (sic) - return {"error": "No result found", "data": {}} - - return { - "error": None, - "data": jsonable_encoder(convert_to_api_fmd(data)), - } - - -@routes.post(f"/{api_vtag}/locations/{{location_id}}:sync") # type: ignore -async def synchronise_meta_data_table(request: web.Request): - params, query, *_ = await extract_and_validate(request) - assert query["dry_run"] is not None # nosec - assert params["location_id"] # nosec - - with handle_storage_errors(): - location_id: int = params["location_id"] - fire_and_forget: bool = query["fire_and_forget"] - dry_run: bool = query["dry_run"] - - dsm = await _prepare_storage_manager(params, query, request) - location = dsm.location_from_id(location_id) - - sync_results: dict[str, Any] = { - "removed": [], - } - sync_coro = dsm.synchronise_meta_data_table(location, dry_run) - - if fire_and_forget: - settings: Settings = request.app[APP_CONFIG_KEY] - - async def _go(): - timeout = settings.STORAGE_SYNC_METADATA_TIMEOUT - try: - result = await asyncio.wait_for(sync_coro, timeout=timeout) - log.info( - "Sync metadata table completed: %d entries removed", - len(result.get("removed", [])), - ) - except asyncio.TimeoutError: - log.error("Sync metadata table timed out (%s seconds)", timeout) - - asyncio.create_task(_go(), name="fire&forget sync_task") - else: - sync_results = await sync_coro - - sync_results["fire_and_forget"] = fire_and_forget - sync_results["dry_run"] = dry_run - - return {"error": None, "data": sync_results} - - -@routes.patch(f"/{api_vtag}/locations/{{location_id}}/files/{{file_id}}/metadata") # type: ignore -async def update_file_meta_data(request: web.Request): - params, query, body = await extract_and_validate(request) - - assert params, "params %s" % params # nosec - assert query, "query %s" % query # nosec - assert not body, "body %s" % body # nosec - - with handle_storage_errors(): - file_uuid = urllib.parse.unquote_plus(params["file_id"]) - user_id = query["user_id"] - - dsm = await _prepare_storage_manager(params, query, request) - - data: Optional[FileMetaDataEx] = await dsm.update_metadata( - file_uuid=file_uuid, user_id=user_id - ) - if data is None: - raise web.HTTPNotFound(reason=f"Could not update metadata for {file_uuid}") - - return { - "error": None, - "data": jsonable_encoder(convert_to_api_fmd(data)), - } - - -@routes.get(f"/{api_vtag}/locations/{{location_id}}/files/{{file_id}}") # type: ignore -async def download_file(request: web.Request): - params, query, body = await extract_and_validate(request) - - assert params, "params %s" % params # nosec - assert query, "query %s" % query # nosec - assert not body, "body %s" % body # nosec - - assert params["location_id"] # nosec - assert params["file_id"] # nosec - assert query["user_id"] # nosec - link_type = query.get("link_type", "presigned") - - with handle_storage_errors(): - location_id = params["location_id"] - user_id = query["user_id"] - file_uuid = params["file_id"] - - if int(location_id) != SIMCORE_S3_ID and link_type == "s3": - raise web.HTTPPreconditionFailed( - reason=f"Only allowed to fetch s3 link for '{SIMCORE_S3_STR}'" - ) - - dsm = await _prepare_storage_manager(params, query, request) - location = dsm.location_from_id(location_id) - if location == SIMCORE_S3_STR: - link = await dsm.download_link_s3( - file_uuid, user_id, as_presigned_link=bool(link_type == "presigned") - ) - else: - link = await dsm.download_link_datcore(user_id, file_uuid) - - return {"error": None, "data": 
{"link": link}} - - -@routes.put(f"/{api_vtag}/locations/{{location_id}}/files/{{file_id}}") # type: ignore -async def upload_file(request: web.Request): - params, query, body = await extract_and_validate(request) - log.debug("received call to upload_file with %s", f"{params=}, {query=}, {body=}") - assert params, "params %s" % params # nosec - assert query, "query %s" % query # nosec - assert not body, "body %s" % body # nosec - link_type = query.get("link_type", "presigned") - - with handle_storage_errors(): - user_id = query["user_id"] - file_uuid = params["file_id"] - - dsm = await _prepare_storage_manager(params, query, request) - - link = await dsm.upload_link( - user_id=user_id, - file_uuid=file_uuid, - as_presigned_link=bool(link_type == "presigned"), - ) - - return {"error": None, "data": {"link": link}} - - -@routes.delete(f"/{api_vtag}/locations/{{location_id}}/files/{{file_id}}") # type: ignore -async def delete_file(request: web.Request): - params, query, body = await extract_and_validate(request) - - assert params, "params %s" % params # nosec - assert query, "query %s" % query # nosec - assert not body, "body %s" % body # nosec - - assert params["location_id"] # nosec - assert params["file_id"] # nosec - assert query["user_id"] # nosec - - with handle_storage_errors(): - location_id = params["location_id"] - user_id = query["user_id"] - file_uuid = params["file_id"] - - dsm = await _prepare_storage_manager(params, query, request) - location = dsm.location_from_id(location_id) - await dsm.delete_file(user_id=user_id, location=location, file_uuid=file_uuid) - - return {"error": None, "data": None} - - -@routes.post(f"/{api_vtag}/simcore-s3:access") -async def get_or_create_temporary_s3_access(request: web.Request): - user_id = UserID(request.query["user_id"]) - with handle_storage_errors(): - s3_settings: S3Settings = await sts.get_or_create_temporary_token_for_user( - request.app, user_id - ) - return {"data": s3_settings.dict()} - - -@routes.post(f"/{api_vtag}/simcore-s3/folders", name="copy_folders_from_project") # type: ignore -async def create_folders_from_project(request: web.Request): - # FIXME: Update openapi-core. Fails with additionalProperties https://github.com/p1c2u/openapi-core/issues/124. Fails with project - # params, query, body = await extract_and_validate(request) - user_id = request.query["user_id"] - - body = await request.json() - source_project = body.get("source", {}) - destination_project = body.get("destination", {}) - nodes_map = body.get("nodes_map", {}) - - assert set(nodes_map.keys()) == set(source_project["workbench"].keys()) # nosec - assert set(nodes_map.values()) == set( # nosec - destination_project["workbench"].keys() # nosec - ) # nosec - - # TODO: validate project with jsonschema instead?? 
- with handle_storage_errors(): - dsm = await _prepare_storage_manager( - params={"location_id": SIMCORE_S3_ID}, - query={"user_id": user_id}, - request=request, - ) - await dsm.deep_copy_project_simcore_s3( - user_id, source_project, destination_project, nodes_map - ) - - raise web.HTTPCreated( - text=json.dumps(destination_project), content_type="application/json" - ) - - -@routes.delete(f"/{api_vtag}/simcore-s3/folders/{{folder_id}}") # type: ignore -async def delete_folders_of_project(request: web.Request): - folder_id = request.match_info["folder_id"] - user_id = request.query["user_id"] - node_id = request.query.get("node_id", None) - - with handle_storage_errors(): - dsm = await _prepare_storage_manager( - params={"location_id": SIMCORE_S3_ID}, - query={"user_id": user_id}, - request=request, - ) - await dsm.delete_project_simcore_s3(user_id, folder_id, node_id) - - raise web.HTTPNoContent(content_type="application/json") - - -@routes.post(f"/{api_vtag}/simcore-s3/files/metadata:search") # type: ignore -async def search_files_starting_with(request: web.Request): - params, query, body = await extract_and_validate(request) - assert not params, "params %s" % params # nosec - assert query, "query %s" % query # nosec - assert not body, "body %s" % body # nosec - - assert query["user_id"] # nosec - assert query["startswith"] # nosec - - with handle_storage_errors(): - - user_id = int(query["user_id"]) - startswith = query["startswith"] - - dsm = await _prepare_storage_manager( - {"location_id": SIMCORE_S3_ID}, {"user_id": user_id}, request - ) - - data: list[FileMetaDataEx] = await dsm.search_files_starting_with( - int(user_id), prefix=startswith - ) - log.debug("Found %d files starting with '%s'", len(data), startswith) - py_data = [jsonable_encoder(convert_to_api_fmd(d)) for d in data] - return py_data - - -@routes.post(f"/{api_vtag}/files/{{file_id}}:soft-copy", name="copy_as_soft_link") # type: ignore -async def copy_as_soft_link(request: web.Request): - # TODO: error handling - params, query, body = await extract_and_validate(request) - - assert params, "params %s" % params # nosec - assert query, "query %s" % query # nosec - assert body, "body %s" % body # nosec - - with handle_storage_errors(): - target_uuid = params["file_id"] - user_id = int(query["user_id"]) - link_uuid = body.link_id - - dsm = await _prepare_storage_manager( - {"location_id": SIMCORE_S3_ID}, {"user_id": user_id}, request - ) - - file_link: FileMetaDataEx = await dsm.create_soft_link( - user_id, target_uuid, link_uuid - ) - - return jsonable_encoder(convert_to_api_fmd(file_link)) diff --git a/services/storage/src/simcore_service_storage/handlers_datasets.py b/services/storage/src/simcore_service_storage/handlers_datasets.py new file mode 100644 index 00000000000..3f79c93096d --- /dev/null +++ b/services/storage/src/simcore_service_storage/handlers_datasets.py @@ -0,0 +1,57 @@ +import logging + +from aiohttp import web +from aiohttp.web import RouteTableDef +from models_library.api_schemas_storage import FileMetaDataGet +from models_library.utils.fastapi_encoders import jsonable_encoder +from servicelib.aiohttp.requests_validation import ( + parse_request_path_parameters_as, + parse_request_query_parameters_as, +) +from simcore_service_storage.dsm import get_dsm_provider + +# Exclusive for simcore-s3 storage ----------------------- +from ._meta import api_vtag +from .models import ( + FileMetaData, + FilesMetadataDatasetPathParams, + LocationPathParams, + StorageQueryParamsBase, +) + +log = 
logging.getLogger(__name__) + +routes = RouteTableDef() + +UPLOAD_TASKS_KEY = f"{__name__}.upload_tasks" + + +@routes.get(f"/{api_vtag}/locations/{{location_id}}/datasets", name="get_datasets_metadata") # type: ignore +async def get_datasets_metadata(request: web.Request): + query_params = parse_request_query_parameters_as(StorageQueryParamsBase, request) + path_params = parse_request_path_parameters_as(LocationPathParams, request) + log.debug( + "received call to get_datasets_metadata with %s", + f"{path_params=}, {query_params=}", + ) + + dsm = get_dsm_provider(request.app).get(path_params.location_id) + return await dsm.list_datasets(query_params.user_id) + + +@routes.get(f"/{api_vtag}/locations/{{location_id}}/datasets/{{dataset_id}}/metadata", name="get_files_metadata_dataset") # type: ignore +async def get_files_metadata_dataset(request: web.Request): + query_params = parse_request_query_parameters_as(StorageQueryParamsBase, request) + path_params = parse_request_path_parameters_as( + FilesMetadataDatasetPathParams, request + ) + log.debug( + "received call to get_files_metadata_dataset with %s", + f"{path_params=}, {query_params=}", + ) + dsm = get_dsm_provider(request.app).get(path_params.location_id) + data: list[FileMetaData] = await dsm.list_files_in_dataset( + user_id=query_params.user_id, + dataset_id=path_params.dataset_id, + ) + return [jsonable_encoder(FileMetaDataGet.from_orm(d)) for d in data] diff --git a/services/storage/src/simcore_service_storage/handlers_files.py b/services/storage/src/simcore_service_storage/handlers_files.py new file mode 100644 index 00000000000..a3459459444 --- /dev/null +++ b/services/storage/src/simcore_service_storage/handlers_files.py @@ -0,0 +1,163 @@ +import logging +from typing import cast + +from aiohttp import web +from aiohttp.web import RouteTableDef +from models_library.api_schemas_storage import FileMetaDataGet, SoftCopyBody +from models_library.utils.fastapi_encoders import jsonable_encoder +from pydantic import AnyUrl +from servicelib.aiohttp.requests_validation import ( + parse_request_body_as, + parse_request_path_parameters_as, + parse_request_query_parameters_as, +) +from servicelib.mimetype_constants import MIMETYPE_APPLICATION_JSON +from simcore_service_storage.simcore_s3_dsm import SimcoreS3DataManager + +# Exclusive for simcore-s3 storage ----------------------- +from ._meta import api_vtag +from .dsm import get_dsm_provider +from .exceptions import FileMetaDataNotFoundError +from .models import ( + CopyAsSoftLinkParams, + FileDownloadQueryParams, + FileMetaData, + FilePathParams, + FilesMetadataQueryParams, + FileUploadQueryParams, + LocationPathParams, + StorageQueryParamsBase, +) + +log = logging.getLogger(__name__) + +routes = RouteTableDef() + +UPLOAD_TASKS_KEY = f"{__name__}.upload_tasks" + + +@routes.get(f"/{api_vtag}/locations/{{location_id}}/files/metadata", name="get_files_metadata") # type: ignore +async def get_files_metadata(request: web.Request): + query_params = parse_request_query_parameters_as(FilesMetadataQueryParams, request) + path_params = parse_request_path_parameters_as(LocationPathParams, request) + log.debug( + "received call to get_files_metadata with %s", + f"{path_params=}, {query_params=}", + ) + dsm = get_dsm_provider(request.app).get(path_params.location_id) + data: list[FileMetaData] = await dsm.list_files( + user_id=query_params.user_id, + uuid_filter=query_params.uuid_filter, + ) + return [jsonable_encoder(FileMetaDataGet.from_orm(d)) for d in data] + + +@routes.get( + 
f"/{api_vtag}/locations/{{location_id}}/files/{{file_id}}/metadata", + name="get_file_metadata", +) # type: ignore +async def get_file_metadata(request: web.Request): + query_params = parse_request_query_parameters_as(StorageQueryParamsBase, request) + path_params = parse_request_path_parameters_as(FilePathParams, request) + log.debug( + "received call to get_files_metadata_dataset with %s", + f"{path_params=}, {query_params=}", + ) + + dsm = get_dsm_provider(request.app).get(path_params.location_id) + try: + data = await dsm.get_file( + user_id=query_params.user_id, + file_id=path_params.file_id, + ) + except FileMetaDataNotFoundError: + # NOTE: This is what happens Larry... data must be an empty {} or else some old + # dynamic services will FAIL (sic) + # TODO: once all legacy services are gone, remove the try except, it will default to 404 + return {"error": "No result found", "data": {}} + + return jsonable_encoder(FileMetaDataGet.from_orm(data)) + + +@routes.get(f"/{api_vtag}/locations/{{location_id}}/files/{{file_id}}", name="download_file") # type: ignore +async def download_file(request: web.Request): + query_params = parse_request_query_parameters_as(FileDownloadQueryParams, request) + path_params = parse_request_path_parameters_as(FilePathParams, request) + log.debug( + "received call to download_file with %s", + f"{path_params=}, {query_params=}", + ) + dsm = get_dsm_provider(request.app).get(path_params.location_id) + link = await dsm.create_file_download_link( + query_params.user_id, path_params.file_id, query_params.link_type + ) + return {"link": link} + + +@routes.put(f"/{api_vtag}/locations/{{location_id}}/files/{{file_id}}", name="upload_file") # type: ignore +async def upload_file(request: web.Request): + query_params = parse_request_query_parameters_as(FileUploadQueryParams, request) + path_params = parse_request_path_parameters_as(FilePathParams, request) + + log.debug( + "received call to upload_file with %s", + f"{path_params=}, {query_params=}", + ) + + dsm = get_dsm_provider(request.app).get(path_params.location_id) + link: AnyUrl = await dsm.create_file_upload_link( + user_id=query_params.user_id, + file_id=path_params.file_id, + link_type=query_params.link_type, + ) + + return {"link": jsonable_encoder(link, by_alias=True)} + + +@routes.post(f"/{api_vtag}/locations/{{location_id}}/files/{{file_id}}:abort", name="abort_upload_file") # type: ignore +async def abort_upload_file(request: web.Request): + query_params = parse_request_query_parameters_as(StorageQueryParamsBase, request) + path_params = parse_request_path_parameters_as(FilePathParams, request) + log.debug( + "received call to abort_upload_file with %s", + f"{path_params=}, {query_params=}", + ) + + dsm = get_dsm_provider(request.app).get(path_params.location_id) + await dsm.abort_file_upload(query_params.user_id, path_params.file_id) + return web.HTTPNoContent(content_type=MIMETYPE_APPLICATION_JSON) + + +@routes.delete(f"/{api_vtag}/locations/{{location_id}}/files/{{file_id}}", name="delete_file") # type: ignore +async def delete_file(request: web.Request): + query_params = parse_request_query_parameters_as(StorageQueryParamsBase, request) + path_params = parse_request_path_parameters_as(FilePathParams, request) + log.debug( + "received call to delete_file with %s", + f"{path_params=}, {query_params=}", + ) + + dsm = get_dsm_provider(request.app).get(path_params.location_id) + await dsm.delete_file(query_params.user_id, path_params.file_id) + return 
web.HTTPNoContent(content_type=MIMETYPE_APPLICATION_JSON) + + +@routes.post(f"/{api_vtag}/files/{{file_id}}:soft-copy", name="copy_as_soft_link") # type: ignore +async def copy_as_soft_link(request: web.Request): + query_params = parse_request_query_parameters_as(StorageQueryParamsBase, request) + path_params = parse_request_path_parameters_as(CopyAsSoftLinkParams, request) + body = await parse_request_body_as(SoftCopyBody, request) + log.debug( + "received call to copy_as_soft_link with %s", + f"{path_params=}, {query_params=}, {body=}", + ) + + dsm = cast( + SimcoreS3DataManager, + get_dsm_provider(request.app).get(SimcoreS3DataManager.get_location_id()), + ) + file_link: FileMetaData = await dsm.create_soft_link( + query_params.user_id, path_params.file_id, body.link_id + ) + + return jsonable_encoder(FileMetaDataGet.from_orm(file_link)) diff --git a/services/storage/src/simcore_service_storage/app_handlers.py b/services/storage/src/simcore_service_storage/handlers_health.py similarity index 56% rename from services/storage/src/simcore_service_storage/app_handlers.py rename to services/storage/src/simcore_service_storage/handlers_health.py index 07fcfe6c503..5704f8c1104 100644 --- a/services/storage/src/simcore_service_storage/app_handlers.py +++ b/services/storage/src/simcore_service_storage/handlers_health.py @@ -6,13 +6,17 @@ import logging from aiohttp.web import Request, RouteTableDef -from models_library.api_schemas_storage import HealthCheck +from models_library.api_schemas_storage import HealthCheck, S3BucketName from models_library.app_diagnostics import AppStatusCheck from servicelib.aiohttp.rest_utils import extract_and_validate +from simcore_service_storage.constants import APP_CONFIG_KEY from ._meta import api_version, api_version_prefix, app_name from .db import get_engine_state from .db import is_service_responsive as is_pg_responsive +from .exceptions import S3AccessError, S3BucketInvalidError +from .s3 import get_s3_client +from .settings import Settings log = logging.getLogger(__name__) @@ -32,36 +36,28 @@ async def get_health(request: Request): ).dict(exclude_unset=True) -@routes.post(f"/{api_version_prefix}/check/{{action}}", name="check_action") # type: ignore -async def check_action(request: Request): - """ - Test checkpoint to ask server to fail or echo back the transmitted data - TODO: deprecate - """ - params, query, body = await extract_and_validate(request) - - assert params, "params %s" % params # nosec - assert query, "query %s" % query # nosec - assert body, "body %s" % body # nosec - - if params["action"] == "fail": - raise ValueError("some randome failure") - - # echo's input FIXME: convert to dic - # FIXME: output = fake_schema.dump(body) - return { - "path_value": params.get("action"), - "query_value": query.get("data"), - "body_value": { - "key1": 1, # body.body_value.key1, - "key2": 0, # body.body_value.key2, - }, - } - - @routes.get(f"/{api_version_prefix}/status", name="get_status") # type: ignore -async def get_app_status(request: Request): +async def get_status(request: Request): # NOTE: all calls here must NOT raise + assert request.app # nosec + app_settings: Settings = request.app[APP_CONFIG_KEY] + s3_state = "disabled" + if app_settings.STORAGE_S3: + try: + await get_s3_client(request.app).check_bucket_connection( + S3BucketName(app_settings.STORAGE_S3.S3_BUCKET_NAME) + ) + s3_state = "connected" + except S3BucketInvalidError: + s3_state = "no access to S3 bucket" + except S3AccessError: + s3_state = "failed" + + postgres_state = "disabled" + 
if app_settings.STORAGE_POSTGRES: + postgres_state = ( + "connected" if await is_pg_responsive(request.app) else "failed" + ) status = AppStatusCheck.parse_obj( { @@ -69,10 +65,10 @@ async def get_app_status(request: Request): "version": api_version, "services": { "postgres": { - "healthy": await is_pg_responsive(request.app), + "healthy": postgres_state, "pool": get_engine_state(request.app), }, - # TODO: s3-minio + "s3": {"healthy": s3_state}, }, } ) diff --git a/services/storage/src/simcore_service_storage/handlers_locations.py b/services/storage/src/simcore_service_storage/handlers_locations.py new file mode 100644 index 00000000000..b3f9ed4fc2d --- /dev/null +++ b/services/storage/src/simcore_service_storage/handlers_locations.py @@ -0,0 +1,96 @@ +import asyncio +import logging +from typing import cast + +from aiohttp import web +from aiohttp.web import RouteTableDef +from models_library.api_schemas_storage import FileLocation +from models_library.projects_nodes_io import StorageFileID +from models_library.utils.fastapi_encoders import jsonable_encoder +from servicelib.aiohttp.application_keys import ( + APP_CONFIG_KEY, + APP_FIRE_AND_FORGET_TASKS_KEY, +) +from servicelib.aiohttp.requests_validation import ( + parse_request_path_parameters_as, + parse_request_query_parameters_as, +) +from servicelib.utils import fire_and_forget_task + +# Exclusive for simcore-s3 storage ----------------------- +from ._meta import api_vtag +from .dsm import get_dsm_provider +from .models import LocationPathParams, StorageQueryParamsBase, SyncMetadataQueryParams +from .settings import Settings +from .simcore_s3_dsm import SimcoreS3DataManager + +log = logging.getLogger(__name__) + +routes = RouteTableDef() + + +# HANDLERS --------------------------------------------------- +@routes.get(f"/{api_vtag}/locations", name="get_storage_locations") # type: ignore +async def get_storage_locations(request: web.Request): + query_params = parse_request_query_parameters_as(StorageQueryParamsBase, request) + log.debug( + "received call to get_storage_locations with %s", + f"{query_params=}", + ) + dsm_provider = get_dsm_provider(request.app) + location_ids = dsm_provider.locations() + locs: list[FileLocation] = [] + for loc_id in location_ids: + dsm = dsm_provider.get(loc_id) + if await dsm.authorized(query_params.user_id): + locs.append(FileLocation(name=dsm.location_name, id=dsm.location_id)) + + return {"error": None, "data": jsonable_encoder(locs)} + + +@routes.post(f"/{api_vtag}/locations/{{location_id}}:sync", name="synchronise_meta_data_table") # type: ignore +async def synchronise_meta_data_table(request: web.Request): + query_params = parse_request_query_parameters_as(SyncMetadataQueryParams, request) + path_params = parse_request_path_parameters_as(LocationPathParams, request) + log.debug( + "received call to synchronise_meta_data_table with %s", + f"{path_params=}, {query_params=}", + ) + + dsm = cast( + SimcoreS3DataManager, + get_dsm_provider(request.app).get(SimcoreS3DataManager.get_location_id()), + ) + sync_results: list[StorageFileID] = [] + sync_coro = dsm.synchronise_meta_data_table(query_params.dry_run) + + if query_params.fire_and_forget: + settings: Settings = request.app[APP_CONFIG_KEY] + + async def _go(): + timeout = settings.STORAGE_SYNC_METADATA_TIMEOUT + try: + result = await asyncio.wait_for(sync_coro, timeout=timeout) + log.info( + "Sync metadata table completed: %d entries removed", + len(result), + ) + except asyncio.TimeoutError: + log.error("Sync metadata table timed out (%s 
seconds)", timeout) + + fire_and_forget_task( + _go(), + task_suffix_name="synchronise_meta_data_table", + fire_and_forget_tasks_collection=request.app[APP_FIRE_AND_FORGET_TASKS_KEY], + ) + else: + sync_results = await sync_coro + + return { + "error": None, + "data": { + "removed": sync_results, + "fire_and_forget": query_params.fire_and_forget, + "dry_run": query_params.dry_run, + }, + } diff --git a/services/storage/src/simcore_service_storage/handlers_simcore_s3.py b/services/storage/src/simcore_service_storage/handlers_simcore_s3.py new file mode 100644 index 00000000000..d8b4ee03b5b --- /dev/null +++ b/services/storage/src/simcore_service_storage/handlers_simcore_s3.py @@ -0,0 +1,111 @@ +import json +import logging +from typing import cast + +from aiohttp import web +from aiohttp.web import RouteTableDef +from models_library.api_schemas_storage import FileMetaDataGet, FoldersBody +from models_library.projects import ProjectID +from models_library.utils.fastapi_encoders import jsonable_encoder +from servicelib.aiohttp.requests_validation import ( + parse_request_body_as, + parse_request_path_parameters_as, + parse_request_query_parameters_as, +) +from servicelib.mimetype_constants import MIMETYPE_APPLICATION_JSON +from settings_library.s3 import S3Settings +from simcore_service_storage.dsm import get_dsm_provider +from simcore_service_storage.simcore_s3_dsm import SimcoreS3DataManager + +# Exclusive for simcore-s3 storage ----------------------- +from . import sts +from ._meta import api_vtag +from .models import ( + DeleteFolderQueryParams, + FileMetaData, + SearchFilesQueryParams, + SimcoreS3FoldersParams, + StorageQueryParamsBase, +) + +log = logging.getLogger(__name__) + +routes = RouteTableDef() + + +@routes.post(f"/{api_vtag}/simcore-s3:access", name="get_or_create_temporary_s3_access") # type: ignore +async def get_or_create_temporary_s3_access(request: web.Request): + query_params = parse_request_query_parameters_as(StorageQueryParamsBase, request) + log.debug( + "received call to get_or_create_temporary_s3_access with %s", + f"{query_params=}", + ) + + s3_settings: S3Settings = await sts.get_or_create_temporary_token_for_user( + request.app, query_params.user_id + ) + return {"data": s3_settings.dict()} + + +@routes.post(f"/{api_vtag}/simcore-s3/folders", name="copy_folders_from_project") # type: ignore +async def copy_folders_from_project(request: web.Request): + query_params = parse_request_query_parameters_as(StorageQueryParamsBase, request) + body = await parse_request_body_as(FoldersBody, request) + log.debug( + "received call to create_folders_from_project with %s", + f"{body=}, {query_params=}", + ) + + dsm = cast( + SimcoreS3DataManager, + get_dsm_provider(request.app).get(SimcoreS3DataManager.get_location_id()), + ) + await dsm.deep_copy_project_simcore_s3( + query_params.user_id, body.source, body.destination, body.nodes_map + ) + + raise web.HTTPCreated( + text=json.dumps(body.destination), content_type=MIMETYPE_APPLICATION_JSON + ) + + +@routes.delete(f"/{api_vtag}/simcore-s3/folders/{{folder_id}}", name="delete_folders_of_project") # type: ignore +async def delete_folders_of_project(request: web.Request): + query_params = parse_request_query_parameters_as(DeleteFolderQueryParams, request) + path_params = parse_request_path_parameters_as(SimcoreS3FoldersParams, request) + log.debug( + "received call to delete_folders_of_project with %s", + f"{path_params=}, {query_params=}", + ) + + dsm = cast( + SimcoreS3DataManager, + 
get_dsm_provider(request.app).get(SimcoreS3DataManager.get_location_id()), + ) + await dsm.delete_project_simcore_s3( + query_params.user_id, + ProjectID(path_params.folder_id), + query_params.node_id, + ) + + raise web.HTTPNoContent(content_type=MIMETYPE_APPLICATION_JSON) + + +@routes.post(f"/{api_vtag}/simcore-s3/files/metadata:search", name="search_files_starting_with") # type: ignore +async def search_files_starting_with(request: web.Request): + query_params = parse_request_query_parameters_as(SearchFilesQueryParams, request) + log.debug( + "received call to search_files_starting_with with %s", + f"{query_params=}", + ) + + dsm = cast( + SimcoreS3DataManager, + get_dsm_provider(request.app).get(SimcoreS3DataManager.get_location_id()), + ) + data: list[FileMetaData] = await dsm.search_files_starting_with( + query_params.user_id, prefix=query_params.startswith + ) + log.debug("Found %d files starting with '%s'", len(data), query_params.startswith) + + return [jsonable_encoder(FileMetaDataGet.from_orm(d)) for d in data] diff --git a/services/storage/src/simcore_service_storage/models.py b/services/storage/src/simcore_service_storage/models.py index 6f85388b4d9..a2cd30c8ec2 100644 --- a/services/storage/src/simcore_service_storage/models.py +++ b/services/storage/src/simcore_service_storage/models.py @@ -1,35 +1,36 @@ -""" Database models - -""" import datetime -from dataclasses import dataclass -from pathlib import Path +import urllib.parse +from typing import Optional from uuid import UUID -import attr -from simcore_postgres_database.storage_models import ( - file_meta_data, - groups, - metadata, - projects, - tokens, - user_to_groups, - users, +from models_library.api_schemas_storage import ( + DatasetMetaDataGet, + ETag, + FileMetaDataGet, + LinkType, + S3BucketName, +) +from models_library.projects import ProjectID +from models_library.projects_nodes import NodeID +from models_library.projects_nodes_io import ( + LocationID, + LocationName, + SimcoreS3FileID, + StorageFileID, +) +from models_library.users import UserID +from pydantic import ( + BaseModel, + ByteSize, + Extra, + parse_obj_as, + validate_arguments, + validator, ) - -from .constants import SIMCORE_S3_ID, SIMCORE_S3_STR - -# FIXME: W0611:Unused UUID imported from sqlalchemy.dialects.postgresql -# from sqlalchemy.dialects.postgresql import UUID - -# FIXME: R0902: Too many instance attributes (11/7) (too-many-instance-attributes) -# pylint: disable=R0902 -@dataclass -class DatasetMetaData: - dataset_id: str = "" - display_name: str = "" +class DatasetMetaData(DatasetMetaDataGet): + ... def is_uuid(value: str) -> bool: @@ -41,119 +42,170 @@ def is_uuid(value: str) -> bool: return True -class FileMetaData: - """This is a proposal, probably no everything is needed. - It is actually an overkill - - file_name : display name for a file - location_id : storage location - location_name : storage location display name - project_id : project_id - projec_name : project display name - node_id : node id - node_name : display_name - bucket_name : name of the bucket - object_name : s3 object name = folder/folder/filename.ending - user_id : user_id - user_name : user_name - - file_uuid : unique identifier for a file: - - bucket_name/project_id/node_id/file_name = /bucket_name/object_name - - file_id : unique uuid for the file - - simcore.s3: uuid created upon insertion - datcore: datcore uuid - - raw_file_path : raw path to file - - simcore.s3: proj_id/node_id/filename.ending - emailaddress/... 
- datcore: dataset/collection/filename.ending - - display_file_path: human readlable path to file - - simcore.s3: proj_name/node_name/filename.ending - my_documents/... - datcore: dataset/collection/filename.ending - - created_at : time stamp - last_modified : time stamp - file_size : size in bytes - - TODO: - state: on of OK, UPLOADING, DELETED - - """ - - # pylint: disable=attribute-defined-outside-init - def simcore_from_uuid(self, file_uuid: str, bucket_name: str): - parts = file_uuid.split("/") - if len(parts) == 3: - self.location = SIMCORE_S3_STR - self.location_id = SIMCORE_S3_ID - self.bucket_name = bucket_name - self.object_name = "/".join(parts[:]) - self.file_name = parts[2] - self.project_id = parts[0] if is_uuid(parts[0]) else None - self.node_id = parts[1] if is_uuid(parts[1]) else None - self.file_uuid = file_uuid - self.file_id = file_uuid - self.raw_file_path = self.file_uuid - self.display_file_path = str( - Path("not") / Path("yet") / Path("implemented") - ) - self.created_at = str(datetime.datetime.now()) - self.last_modified = self.created_at - self.file_size = -1 - self.entity_tag = None - self.is_soft_link = False - - def __str__(self): - d = attr.asdict(self) - _str = "" - for _d in d: - _str += " {0: <25}: {1}\n".format(_d, str(d[_d])) - return _str - - -def get_default(column): - # NOTE: this is temporary. it translates bool text-clauses into python - # The only defaults in file_meta_data are actually of these type - if column.server_default: - return {"false": False, "true": True}.get(str(column.server_default.arg)) - return None - - -attr.s( - these={c.name: attr.ib(default=get_default(c)) for c in file_meta_data.c}, - init=True, - kw_only=True, -)(FileMetaData) - - -@dataclass -class FileMetaDataEx: - """Extend the base type by some additional attributes that shall not end up in the db""" - - fmd: FileMetaData - parent_id: str = "" - - def __str__(self): - _str = str(self.fmd) - _str += " {0: <25}: {1}\n".format("parent_id", str(self.parent_id)) - return _str - - -__all__ = [ - "file_meta_data", - "tokens", - "metadata", +class FileMetaDataAtDB(BaseModel): + location_id: LocationID + location: LocationName + bucket_name: S3BucketName + object_name: SimcoreS3FileID + project_id: Optional[ProjectID] = None + node_id: Optional[NodeID] = None + user_id: UserID + created_at: datetime.datetime + file_id: SimcoreS3FileID + file_size: ByteSize + last_modified: datetime.datetime + entity_tag: Optional[ETag] = None + is_soft_link: bool + upload_expires_at: Optional[datetime.datetime] = None + + class Config: + orm_mode = True + extra = Extra.forbid + + +class FileMetaData(FileMetaDataGet): + upload_expires_at: Optional[datetime.datetime] = None + + location: LocationName + bucket_name: str + object_name: str + project_id: Optional[ProjectID] + node_id: Optional[NodeID] + user_id: Optional[UserID] + + @classmethod + @validate_arguments + def from_simcore_node( + cls, + user_id: UserID, + file_id: SimcoreS3FileID, + bucket: S3BucketName, + location_id: LocationID, + location_name: LocationName, + **file_meta_data_kwargs, + ): + + parts = file_id.split("/") + now = datetime.datetime.utcnow() + fmd_kwargs = { + "file_uuid": file_id, + "location_id": location_id, + "location": location_name, + "bucket_name": bucket, + "object_name": file_id, + "file_name": parts[2], + "user_id": user_id, + "project_id": parse_obj_as(ProjectID, parts[0]) + if is_uuid(parts[0]) + else None, + "node_id": parse_obj_as(NodeID, parts[1]) if is_uuid(parts[1]) else None, + "file_id": file_id, + 
"created_at": now, + "last_modified": now, + "file_size": ByteSize(-1), + "entity_tag": None, + "is_soft_link": False, + "upload_expires_at": None, + } + fmd_kwargs.update(**file_meta_data_kwargs) + return cls.parse_obj(fmd_kwargs) + + +class StorageQueryParamsBase(BaseModel): + user_id: UserID + + class Config: + allow_population_by_field_name = True + extra = Extra.forbid + + +class FilesMetadataQueryParams(StorageQueryParamsBase): + uuid_filter: str = "" + + +class SyncMetadataQueryParams(BaseModel): + dry_run: bool = False + fire_and_forget: bool = False + + +class FileDownloadQueryParams(StorageQueryParamsBase): + link_type: LinkType = LinkType.PRESIGNED + + @validator("link_type", pre=True) + @classmethod + def convert_from_lower_case(cls, v): + if v is not None: + return f"{v}".upper() + return v + + +class FileUploadQueryParams(StorageQueryParamsBase): + link_type: LinkType = LinkType.PRESIGNED + file_size: ByteSize = ByteSize(0) + + @validator("link_type", pre=True) + @classmethod + def convert_from_lower_case(cls, v): + if v is not None: + return f"{v}".upper() + return v + + +class DeleteFolderQueryParams(StorageQueryParamsBase): + node_id: Optional[NodeID] = None + + +class SearchFilesQueryParams(StorageQueryParamsBase): + startswith: str = "" + + +class LocationPathParams(BaseModel): + location_id: LocationID + + class Config: + allow_population_by_field_name = True + extra = Extra.forbid + + +class FilesMetadataDatasetPathParams(LocationPathParams): + dataset_id: str + + +class FilePathParams(LocationPathParams): + file_id: StorageFileID + + @validator("file_id", pre=True) + @classmethod + def unquote(cls, v): + if v is not None: + return urllib.parse.unquote(f"{v}") + return v + + +class FilePathIsUploadCompletedParams(FilePathParams): + future_id: str + + +class SimcoreS3FoldersParams(BaseModel): + folder_id: str + + +class CopyAsSoftLinkParams(BaseModel): + file_id: StorageFileID + + @validator("file_id", pre=True) + @classmethod + def unquote(cls, v): + if v is not None: + return urllib.parse.unquote(f"{v}") + return v + + +__all__ = ( + "ETag", "FileMetaData", - "FileMetaDataEx", - "projects", - "users", - "groups", - "user_to_groups", -] + "FileMetaDataAtDB", + "S3BucketName", + "SimcoreS3FileID", + "StorageFileID", +) diff --git a/services/storage/src/simcore_service_storage/rest.py b/services/storage/src/simcore_service_storage/rest.py index f05ad11c1bf..a13a19d100c 100644 --- a/services/storage/src/simcore_service_storage/rest.py +++ b/services/storage/src/simcore_service_storage/rest.py @@ -11,7 +11,13 @@ from servicelib.aiohttp.openapi import get_base_path from servicelib.aiohttp.rest_middlewares import append_rest_middlewares -from . import app_handlers, handlers +from . 
import ( + handlers_datasets, + handlers_files, + handlers_health, + handlers_locations, + handlers_simcore_s3, +) from .constants import APP_OPENAPI_SPECS_KEY from .resources import resources @@ -46,9 +52,16 @@ def setup_rest(app: web.Application): app[APP_OPENAPI_SPECS_KEY] = api_specs # Connects handlers - set_default_names(handlers.routes) - app.router.add_routes(handlers.routes) - app.router.add_routes(app_handlers.routes) + + for routes in [ + handlers_health.routes, + handlers_locations.routes, + handlers_datasets.routes, + handlers_files.routes, + handlers_simcore_s3.routes, + ]: + set_default_names(routes) + app.router.add_routes(routes) log.debug( "routes:\n %s", diff --git a/services/storage/src/simcore_service_storage/s3.py b/services/storage/src/simcore_service_storage/s3.py index b2b417a20e0..37447e73e59 100644 --- a/services/storage/src/simcore_service_storage/s3.py +++ b/services/storage/src/simcore_service_storage/s3.py @@ -1,91 +1,66 @@ """ Module to access s3 service """ +import json import logging +from contextlib import AsyncExitStack from aiohttp import web -from pydantic import AnyUrl, parse_obj_as -from tenacity import before_sleep_log, retry, stop_after_attempt, wait_fixed +from tenacity._asyncio import AsyncRetrying +from tenacity.before_sleep import before_sleep_log +from tenacity.wait import wait_fixed -from .constants import APP_CONFIG_KEY, APP_S3_KEY -from .s3wrapper.s3_client import MinioClientWrapper -from .settings import Settings -from .utils import RETRY_COUNT, RETRY_WAIT_SECS +from .constants import APP_CONFIG_KEY, APP_S3_KEY, RETRY_WAIT_SECS +from .s3_client import StorageS3Client log = logging.getLogger(__name__) -async def _setup_s3_bucket(app): +async def setup_s3_client(app): log.debug("setup %s.setup.cleanup_ctx", __name__) - # setup - s3_client = app[APP_S3_KEY] - cfg: Settings = app[APP_CONFIG_KEY] - - @retry( - wait=wait_fixed(RETRY_WAIT_SECS), - stop=stop_after_attempt(RETRY_COUNT), - before_sleep=before_sleep_log(log, logging.WARNING), - reraise=True, - ) - async def do_create_bucket(): - log.debug("Creating bucket: %s", cfg.STORAGE_S3.json(indent=2)) - s3_client.create_bucket(cfg.STORAGE_S3.S3_BUCKET_NAME) - - try: - await do_create_bucket() - except Exception: # pylint: disable=broad-except - log.exception("Impossible to create s3 bucket. 
Stoping") - - # ok, failures_count = False, 0 - # while not ok: - # try: - # s3_client.create_bucket(s3_bucket) - # ok = True - # except Exception: # pylint: disable=W0703 - # failures_count +=1 - # if failures_count>RETRY_COUNT: - # log.exception("") - # raise - # await asyncio.sleep(RETRY_WAIT_SECS) + storage_s3_settings = app[APP_CONFIG_KEY].STORAGE_S3 + + async with AsyncExitStack() as exit_stack: + client = None + async for attempt in AsyncRetrying( + wait=wait_fixed(RETRY_WAIT_SECS), + before_sleep=before_sleep_log(log, logging.WARNING), + reraise=True, + ): + with attempt: + client = await StorageS3Client.create(exit_stack, storage_s3_settings) + log.info( + "S3 client %s successfully created [%s]", + f"{client=}", + json.dumps(attempt.retry_state.retry_object.statistics), + ) + assert client # nosec + app[APP_S3_KEY] = client + + yield + # tear-down + log.debug("closing %s", f"{client=}") + log.info("closed s3 client %s", f"{client=}") + + +async def setup_s3_bucket(app: web.Application): + storage_s3_settings = app[APP_CONFIG_KEY].STORAGE_S3 + client = get_s3_client(app) + await client.create_bucket(storage_s3_settings.S3_BUCKET_NAME) yield - # tear-down - log.debug("tear-down %s.setup.cleanup_ctx", __name__) - - -def _minio_client_endpint(s3_endpoint: str) -> str: - # Minio client adds http and https based on the secure paramenter - # provided at construction time, already including the schema - # will cause issues, encoding url to HOST:PORT or just HOST - # if port is missing - url = parse_obj_as(AnyUrl, s3_endpoint) - return f"{url.host}" if url.port is None else f"{url.host}:{url.port}" - def setup_s3(app: web.Application): """minio/s3 service setup""" log.debug("Setting up %s ...", __name__) - STORAGE_DISABLE_SERVICES = app[APP_CONFIG_KEY].STORAGE_DISABLE_SERVICES - - if "s3" in STORAGE_DISABLE_SERVICES: - log.warning("Service '%s' explicitly disabled in config", "s3") - return - - cfg = app[APP_CONFIG_KEY] - - s3_client = MinioClientWrapper( - _minio_client_endpint(cfg.STORAGE_S3.S3_ENDPOINT), - cfg.STORAGE_S3.S3_ACCESS_KEY, - cfg.STORAGE_S3.S3_SECRET_KEY, - secure=cfg.STORAGE_S3.S3_SECURE, - ) - app[APP_S3_KEY] = s3_client - app.cleanup_ctx.append(_setup_s3_bucket) + app.cleanup_ctx.append(setup_s3_client) + app.cleanup_ctx.append(setup_s3_bucket) -def get_config_s3(app: web.Application) -> dict: - cfg = app[APP_CONFIG_KEY].STORAGE_S3 - return cfg +def get_s3_client(app: web.Application) -> StorageS3Client: + assert app[APP_S3_KEY] # nosec + assert isinstance(app[APP_S3_KEY], StorageS3Client) + return app[APP_S3_KEY] diff --git a/services/storage/src/simcore_service_storage/s3_client.py b/services/storage/src/simcore_service_storage/s3_client.py new file mode 100644 index 00000000000..9d7cacb3551 --- /dev/null +++ b/services/storage/src/simcore_service_storage/s3_client.py @@ -0,0 +1,192 @@ +import datetime +import json +import logging +import urllib.parse +from contextlib import AsyncExitStack +from dataclasses import dataclass +from pathlib import Path +from typing import Optional, cast + +import aioboto3 +from aiobotocore.session import ClientCreatorContext +from botocore.client import Config +from models_library.projects import ProjectID +from models_library.projects_nodes_io import NodeID, SimcoreS3FileID +from pydantic import AnyUrl, parse_obj_as +from settings_library.s3 import S3Settings +from types_aiobotocore_s3 import S3Client + +from .models import ETag, S3BucketName +from .s3_utils import s3_exception_handler + +log = logging.getLogger(__name__) + + 
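+# Example usage (illustrative sketch only; ``some-bucket`` and the free variables
+# ``s3_settings``/``file_id`` are placeholders, not values defined in this module):
+#
+#     async with AsyncExitStack() as exit_stack:
+#         client = await StorageS3Client.create(exit_stack, s3_settings)
+#         await client.create_bucket(S3BucketName("some-bucket"))
+#         upload_link = await client.create_single_presigned_upload_link(
+#             S3BucketName("some-bucket"), file_id, expiration_secs=3600
+#         )
+#
+# The AsyncExitStack owns the underlying aiobotocore client and closes it on teardown.
+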
+@dataclass(frozen=True) +class S3MetaData: + file_id: SimcoreS3FileID + last_modified: datetime.datetime + e_tag: ETag + size: int + + +@dataclass +class StorageS3Client: + session: aioboto3.Session + client: S3Client + + @classmethod + async def create( + cls, exit_stack: AsyncExitStack, settings: S3Settings + ) -> "StorageS3Client": + # upon creation the client does not try to connect, one need to make an operation + session = aioboto3.Session() + # NOTE: session.client returns an aiobotocore client enhanced with aioboto3 fcts (e.g. download_file, upload_file, copy_file...) + session_client = session.client( + "s3", + endpoint_url=settings.S3_ENDPOINT, + aws_access_key_id=settings.S3_ACCESS_KEY, + aws_secret_access_key=settings.S3_SECRET_KEY, + aws_session_token=settings.S3_ACCESS_TOKEN, + region_name=settings.S3_REGION, + config=Config(signature_version="s3v4"), + ) + assert isinstance(session_client, ClientCreatorContext) # nosec + client = cast(S3Client, await exit_stack.enter_async_context(session_client)) + # NOTE: this triggers a botocore.exception.ClientError in case the connection is not made to the S3 backend + await client.list_buckets() + + return cls(session, client) + + @s3_exception_handler(log) + async def create_bucket(self, bucket: S3BucketName) -> None: + log.debug("Creating bucket: %s", bucket) + try: + await self.client.create_bucket(Bucket=bucket) + log.info("Bucket %s successfully created", bucket) + except self.client.exceptions.BucketAlreadyOwnedByYou: + log.info( + "Bucket %s already exists and is owned by us", + bucket, + ) + + @s3_exception_handler(log) + async def check_bucket_connection(self, bucket: S3BucketName) -> None: + """ + :raises: S3BucketInvalidError if not existing, not enough rights + :raises: S3AccessError for any other error + """ + log.debug("Head bucket: %s", bucket) + await self.client.head_bucket(Bucket=bucket) + + @s3_exception_handler(log) + async def create_single_presigned_download_link( + self, bucket: S3BucketName, file_id: SimcoreS3FileID, expiration_secs: int + ) -> AnyUrl: + # NOTE: ensure the bucket/object exists, this will raise if not + await self.client.head_bucket(Bucket=bucket) + await self.get_file_metadata(bucket, file_id) + generated_link = await self.client.generate_presigned_url( + "get_object", + Params={"Bucket": bucket, "Key": file_id}, + ExpiresIn=expiration_secs, + ) + return parse_obj_as(AnyUrl, generated_link) + + @s3_exception_handler(log) + async def create_single_presigned_upload_link( + self, bucket: S3BucketName, file_id: SimcoreS3FileID, expiration_secs: int + ) -> AnyUrl: + # NOTE: ensure the bucket/object exists, this will raise if not + await self.client.head_bucket(Bucket=bucket) + generated_link = await self.client.generate_presigned_url( + "put_object", + Params={"Bucket": bucket, "Key": file_id}, + ExpiresIn=expiration_secs, + ) + return parse_obj_as(AnyUrl, generated_link) + + @s3_exception_handler(log) + async def delete_file(self, bucket: S3BucketName, file_id: SimcoreS3FileID) -> None: + await self.client.delete_object(Bucket=bucket, Key=file_id) + + @s3_exception_handler(log) + async def delete_files_in_project_node( + self, + bucket: S3BucketName, + project_id: ProjectID, + node_id: Optional[NodeID] = None, + ) -> None: + # NOTE: the / at the end of the Prefix is VERY important, + # makes the listing several order of magnitudes faster + response = await self.client.list_objects_v2( + Bucket=bucket, + Prefix=f"{project_id}/{node_id}/" if node_id else f"{project_id}/", + ) + + if objects_to_delete 
+    @s3_exception_handler(log)
+    async def create_bucket(self, bucket: S3BucketName) -> None:
+        log.debug("Creating bucket: %s", bucket)
+        try:
+            await self.client.create_bucket(Bucket=bucket)
+            log.info("Bucket %s successfully created", bucket)
+        except self.client.exceptions.BucketAlreadyOwnedByYou:
+            log.info(
+                "Bucket %s already exists and is owned by us",
+                bucket,
+            )
+
+    @s3_exception_handler(log)
+    async def check_bucket_connection(self, bucket: S3BucketName) -> None:
+        """
+        :raises: S3BucketInvalidError if the bucket does not exist or the access rights are insufficient
+        :raises: S3AccessError for any other error
+        """
+        log.debug("Head bucket: %s", bucket)
+        await self.client.head_bucket(Bucket=bucket)
+
+    @s3_exception_handler(log)
+    async def create_single_presigned_download_link(
+        self, bucket: S3BucketName, file_id: SimcoreS3FileID, expiration_secs: int
+    ) -> AnyUrl:
+        # NOTE: ensure the bucket/object exists, this will raise if not
+        await self.client.head_bucket(Bucket=bucket)
+        await self.get_file_metadata(bucket, file_id)
+        generated_link = await self.client.generate_presigned_url(
+            "get_object",
+            Params={"Bucket": bucket, "Key": file_id},
+            ExpiresIn=expiration_secs,
+        )
+        return parse_obj_as(AnyUrl, generated_link)
+
+    @s3_exception_handler(log)
+    async def create_single_presigned_upload_link(
+        self, bucket: S3BucketName, file_id: SimcoreS3FileID, expiration_secs: int
+    ) -> AnyUrl:
+        # NOTE: ensure the bucket/object exists, this will raise if not
+        await self.client.head_bucket(Bucket=bucket)
+        generated_link = await self.client.generate_presigned_url(
+            "put_object",
+            Params={"Bucket": bucket, "Key": file_id},
+            ExpiresIn=expiration_secs,
+        )
+        return parse_obj_as(AnyUrl, generated_link)
+
+    @s3_exception_handler(log)
+    async def delete_file(self, bucket: S3BucketName, file_id: SimcoreS3FileID) -> None:
+        await self.client.delete_object(Bucket=bucket, Key=file_id)
+
+    @s3_exception_handler(log)
+    async def delete_files_in_project_node(
+        self,
+        bucket: S3BucketName,
+        project_id: ProjectID,
+        node_id: Optional[NodeID] = None,
+    ) -> None:
+        # NOTE: the / at the end of the Prefix is VERY important,
+        # it makes the listing several orders of magnitude faster
+        response = await self.client.list_objects_v2(
+            Bucket=bucket,
+            Prefix=f"{project_id}/{node_id}/" if node_id else f"{project_id}/",
+        )
+
+        if objects_to_delete := [
+            f["Key"] for f in response.get("Contents", []) if "Key" in f
+        ]:
+            await self.client.delete_objects(
+                Bucket=bucket,
+                Delete={"Objects": [{"Key": key} for key in objects_to_delete]},
+            )
+
+    @s3_exception_handler(log)
+    async def get_file_metadata(
+        self, bucket: S3BucketName, file_id: SimcoreS3FileID
+    ) -> S3MetaData:
+        response = await self.client.head_object(Bucket=bucket, Key=file_id)
+        return S3MetaData(
+            file_id=file_id,
+            last_modified=response["LastModified"],
+            e_tag=json.loads(response["ETag"]),
+            size=response["ContentLength"],
+        )
+
+    @s3_exception_handler(log)
+    async def copy_file(
+        self, bucket: S3BucketName, src_file: SimcoreS3FileID, dst_file: SimcoreS3FileID
+    ) -> None:
+        """copy a file in S3 using aioboto3 transfer manager (e.g. works >5GB and creates multiple threads)
+
+        :type bucket: S3BucketName
+        :type src_file: SimcoreS3FileID
+        :type dst_file: SimcoreS3FileID
+        """
+        await self.client.copy(
+            CopySource={"Bucket": bucket, "Key": src_file}, Bucket=bucket, Key=dst_file
+        )
+
+    @s3_exception_handler(log)
+    async def list_files(
+        self, bucket: S3BucketName, *, prefix: str
+    ) -> list[S3MetaData]:
+        # NOTE: adding a / at the end of a folder improves speed by several orders of magnitude
+        response = await self.client.list_objects_v2(Bucket=bucket, Prefix=prefix)
+        return [
+            S3MetaData(
+                file_id=entry["Key"],  # type: ignore
+                last_modified=entry["LastModified"],  # type: ignore
+                e_tag=json.loads(entry["ETag"]),  # type: ignore
+                size=entry["Size"],  # type: ignore
+            )
+            for entry in response.get("Contents", [])
+            if all(k in entry for k in ("Key", "LastModified", "ETag", "Size"))
+        ]
+
+    @s3_exception_handler(log)
+    async def upload_file(
+        self, bucket: S3BucketName, file: Path, file_id: SimcoreS3FileID
+    ) -> None:
+        """upload a file using aioboto3 transfer manager (e.g.
works >5Gb and create multiple threads) + + :type bucket: S3BucketName + :type file: Path + :type file_id: SimcoreS3FileID + """ + await self.client.upload_file(f"{file}", Bucket=bucket, Key=file_id) + + @staticmethod + def compute_s3_url(bucket: S3BucketName, file_id: SimcoreS3FileID) -> AnyUrl: + return parse_obj_as(AnyUrl, f"s3://{bucket}/{urllib.parse.quote(file_id)}") diff --git a/services/storage/src/simcore_service_storage/s3_utils.py b/services/storage/src/simcore_service_storage/s3_utils.py new file mode 100644 index 00000000000..ecd9a10ec7e --- /dev/null +++ b/services/storage/src/simcore_service_storage/s3_utils.py @@ -0,0 +1,48 @@ +import functools +import logging + +from botocore import exceptions as botocore_exc +from pydantic import ByteSize + +from .exceptions import S3AccessError, S3BucketInvalidError, S3KeyNotFoundError + + +def compute_num_file_chunks(file_size: ByteSize) -> tuple[int, ByteSize]: + return 1, file_size + + +def s3_exception_handler(log: logging.Logger): + """converts typical aiobotocore/boto exceptions to storage exceptions + NOTE: this is a work in progress as more exceptions might arise in different + use-cases + """ + + def decorator(func): + @functools.wraps(func) + async def wrapper(self, *args, **kwargs): + try: + response = await func(self, *args, **kwargs) + except self.client.exceptions.NoSuchBucket as exc: + raise S3BucketInvalidError( + bucket=exc.response.get("Error", {}).get("BucketName", "undefined") + ) from exc + except botocore_exc.ClientError as exc: + if exc.response.get("Error", {}).get("Code") == "404": + if exc.operation_name == "HeadObject": + raise S3KeyNotFoundError(bucket=args[0], key=args[1]) from exc + if exc.operation_name == "HeadBucket": + raise S3BucketInvalidError(bucket=args[0]) from exc + if exc.response.get("Error", {}).get("Code") == "403": + if exc.operation_name == "HeadBucket": + raise S3BucketInvalidError(bucket=args[0]) from exc + raise S3AccessError from exc + + except Exception: + log.exception("Unexpected error in s3 client: ") + raise + + return response + + return wrapper + + return decorator diff --git a/services/storage/src/simcore_service_storage/s3wrapper/__init__.py b/services/storage/src/simcore_service_storage/s3wrapper/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/services/storage/src/simcore_service_storage/s3wrapper/s3_client.py b/services/storage/src/simcore_service_storage/s3wrapper/s3_client.py deleted file mode 100644 index 5af7f42afbc..00000000000 --- a/services/storage/src/simcore_service_storage/s3wrapper/s3_client.py +++ /dev/null @@ -1,208 +0,0 @@ -# -# SEE https://docs.min.io/docs/python-client-api-reference.html -# -import logging -from datetime import timedelta -from typing import Iterator, Optional - -from minio import Minio -from minio.commonconfig import CopySource -from minio.datatypes import Object -from minio.deleteobjects import DeleteError, DeleteObject -from minio.error import MinioException -from minio.helpers import ObjectWriteResult - -log = logging.getLogger(__name__) - - -class MinioClientWrapper: - """Wrapper around minio""" - - def __init__( - self, - endpoint: str, - access_key: str = None, - secret_key: str = None, - secure: bool = False, - ): - self.__metadata_prefix = "x-amz-meta-" - self.endpoint = endpoint - self.access_key = access_key - self.secret_key = secret_key - self.secure = secure - self.endpoint_url = ("https://" if secure else "http://") + endpoint - try: - self._minio = Minio( - endpoint, access_key=access_key, 
secret_key=secret_key, secure=secure - ) - except MinioException: - logging.exception("Could not create minio client") - raise - - def __remove_objects_recursively(self, bucket_name): - to_del = [ - obj.object_name for obj in self.list_objects(bucket_name, recursive=True) - ] - self.remove_objects(bucket_name, to_del) - - def create_bucket(self, bucket_name, delete_contents_if_exists=False): - try: - if not self.exists_bucket(bucket_name): - self._minio.make_bucket(bucket_name) - elif delete_contents_if_exists: - return self.__remove_objects_recursively(bucket_name) - - except MinioException: - logging.exception("Could not create bucket") - return False - # it probably already exists and is - return True - - def remove_bucket(self, bucket_name, delete_contents=False): - try: - if self.exists_bucket(bucket_name): - if delete_contents: - self.__remove_objects_recursively(bucket_name) - self._minio.remove_bucket(bucket_name) - except MinioException: - logging.exception("Could not remove bucket") - return False - return True - - def exists_bucket(self, bucket_name): - try: - return self._minio.bucket_exists(bucket_name) - except MinioException: - logging.exception("Could not check bucket for existence") - - return False - - def upload_file(self, bucket_name, object_name, filepath, metadata=None): - """Note - - metadata are special, you need to use the - 'X-Amz-Meta' standard, i.e: - - key and value must be strings - - and the keys are case insensitive: - - key1 -- > Key1 - key_one --> Key_one - key-one --> Key-One - - """ - try: - _metadata = {} - if metadata is not None: - for key in metadata.keys(): - _metadata[self.__metadata_prefix + key] = metadata[key] - self._minio.fput_object( - bucket_name, object_name, filepath, metadata=_metadata - ) - except MinioException: - logging.exception("Could not upload file") - return False - return True - - def download_file(self, bucket_name, object_name, filepath): - try: - self._minio.fget_object(bucket_name, object_name, filepath) - except MinioException: - logging.exception("Could not download file") - return False - return True - - def get_metadata(self, bucket_name, object_name): - try: - obj = self._minio.stat_object(bucket_name, object_name) - assert obj.metadata # nosec - return dict(obj.metadata) - - except MinioException: - logging.exception("Could not get metadata") - - return {} - - def list_objects( - self, bucket_name: str, prefix: Optional[str] = None, recursive: bool = False - ) -> Iterator[Object]: - try: - return self._minio.list_objects( - bucket_name=bucket_name, prefix=prefix, recursive=recursive - ) - except MinioException: - logging.exception("Could not list objects") - - return [] - - def remove_objects(self, bucket_name: str, objects: list[str]): - try: - delete = [DeleteObject(name, version_id=None) for name in objects] - iter_errors: Iterator[DeleteError] = self._minio.remove_objects( - bucket_name, delete - ) - for err in iter_errors: - log.error( - "Failed to delete '%s' [version=%s]: %s (code: %s)", - err.name, - err.version_id, - err.message, - err.code, - ) - - except MinioException: - logging.exception("Could remove objects") - return False - return True - - def exists_object(self, bucket_name, object_name, recursive=False): - """This seems to be pretty heavy, should be used with care""" - try: - for obj in self.list_objects(bucket_name, recursive=recursive): - if obj.object_name == object_name: - return True - except MinioException: - logging.exception("Could check object for existence") - return False - return False - 
- def create_presigned_put_url(self, bucket_name, object_name, dt=timedelta(days=3)): - try: - return self._minio.presigned_put_object( - bucket_name, object_name, expires=dt - ) - - except MinioException: - logging.exception("Could create presigned put url") - - return "" - - def create_presigned_get_url(self, bucket_name, object_name, dt=timedelta(days=3)): - try: - return self._minio.presigned_get_object( - bucket_name, object_name, expires=dt - ) - - except MinioException: - logging.exception("Could create presigned get url") - - return "" - - def copy_object( - self, - to_bucket_name: str, - to_object_name: str, - from_bucket: str, - from_object: str, - ): - try: - # ValueError for arguments - result: ObjectWriteResult = self._minio.copy_object( - bucket_name=to_bucket_name, - object_name=to_object_name, - source=CopySource(from_bucket, from_object), - ) - return result.bucket_name == to_bucket_name - except MinioException: - logging.exception("Could not copy") - - return False diff --git a/services/storage/src/simcore_service_storage/settings.py b/services/storage/src/simcore_service_storage/settings.py index e6722bd1086..a04d4775c40 100644 --- a/services/storage/src/simcore_service_storage/settings.py +++ b/services/storage/src/simcore_service_storage/settings.py @@ -27,11 +27,6 @@ class Settings(BaseCustomSettings, MixinLoggingSettings): STORAGE_MONITORING_ENABLED: bool = False - STORAGE_DISABLE_SERVICES: list[str] = [] - - STORAGE_TESTING: bool = Field( - False, description="Flag to enable some fakes for testing purposes" - ) BF_API_KEY: Optional[str] = Field( None, description="Pennsieve API key ONLY for testing purposes" ) @@ -39,9 +34,9 @@ class Settings(BaseCustomSettings, MixinLoggingSettings): None, description="Pennsieve API secret ONLY for testing purposes" ) - STORAGE_POSTGRES: PostgresSettings = Field(auto_default_from_env=True) + STORAGE_POSTGRES: Optional[PostgresSettings] = Field(auto_default_from_env=True) - STORAGE_S3: S3Settings = Field(auto_default_from_env=True) + STORAGE_S3: Optional[S3Settings] = Field(auto_default_from_env=True) STORAGE_TRACING: Optional[TracingSettings] = Field(auto_default_from_env=True) @@ -51,6 +46,15 @@ class Settings(BaseCustomSettings, MixinLoggingSettings): 180, description="Timeout (seconds) for metadata sync task" ) + STORAGE_DEFAULT_PRESIGNED_LINK_EXPIRATION_SECONDS: int = Field( + 3600, description="Default expiration time in seconds for presigned links" + ) + + STORAGE_CLEANER_INTERVAL_S: Optional[int] = Field( + 30, + description="Interval in seconds when task cleaning pending uploads runs. 
setting to NULL disables the cleaner.", + ) + @validator("LOG_LEVEL") @classmethod def _validate_loglevel(cls, value) -> str: diff --git a/services/storage/src/simcore_service_storage/simcore_s3_dsm.py b/services/storage/src/simcore_service_storage/simcore_s3_dsm.py new file mode 100644 index 00000000000..e4bed9d4774 --- /dev/null +++ b/services/storage/src/simcore_service_storage/simcore_s3_dsm.py @@ -0,0 +1,612 @@ +import datetime +import logging +import tempfile +import urllib.parse +from collections import deque +from contextlib import suppress +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Awaitable, Optional, Union + +from aiohttp import web +from aiopg.sa import Engine +from aiopg.sa.connection import SAConnection +from models_library.api_schemas_storage import LinkType, S3BucketName +from models_library.projects import ProjectID +from models_library.projects_nodes_io import ( + LocationID, + NodeID, + SimcoreS3FileID, + StorageFileID, +) +from models_library.users import UserID +from pydantic import AnyUrl, ByteSize, parse_obj_as +from servicelib.aiohttp.client_session import get_client_session +from servicelib.utils import logged_gather +from simcore_service_storage import db_tokens +from simcore_service_storage.s3 import get_s3_client + +from . import db_file_meta_data, db_projects +from .constants import ( + APP_CONFIG_KEY, + APP_DB_ENGINE_KEY, + DATCORE_ID, + SIMCORE_S3_ID, + SIMCORE_S3_STR, +) +from .datcore_adapter import datcore_adapter +from .db_access_layer import ( + AccessRights, + get_file_access_rights, + get_project_access_rights, + get_readable_project_ids, +) +from .dsm_factory import BaseDataManager +from .exceptions import ( + FileAccessRightError, + FileMetaDataNotFoundError, + LinkAlreadyExistsError, + ProjectAccessRightError, + ProjectNotFoundError, + S3KeyNotFoundError, +) +from .models import DatasetMetaData, FileMetaData, FileMetaDataAtDB +from .settings import Settings +from .utils import convert_db_to_model, download_to_file_or_raise, is_file_entry_valid + +logger = logging.getLogger(__name__) + + +@dataclass +class SimcoreS3DataManager(BaseDataManager): + engine: Engine + simcore_bucket_name: S3BucketName + app: web.Application + settings: Settings + + @classmethod + def get_location_id(cls) -> LocationID: + return SIMCORE_S3_ID + + @classmethod + def get_location_name(cls) -> str: + return SIMCORE_S3_STR + + async def authorized(self, _user_id: UserID) -> bool: + return True # always true for now + + async def list_datasets(self, user_id: UserID) -> list[DatasetMetaData]: + async with self.engine.acquire() as conn: + readable_projects_ids = await get_readable_project_ids(conn, user_id) + return [ + DatasetMetaData( + dataset_id=prj_data.uuid, + display_name=prj_data.name, + ) + async for prj_data in db_projects.list_projects( + conn, readable_projects_ids + ) + ] + + async def list_files_in_dataset( + self, user_id: UserID, dataset_id: str + ) -> list[FileMetaData]: + data: list[FileMetaData] = await self.list_files( + user_id, uuid_filter=dataset_id + "/" + ) + return data + + async def list_files( + self, user_id: UserID, uuid_filter: str = "" + ) -> list[FileMetaData]: + data: deque[FileMetaData] = deque() + accesible_projects_ids = [] + async with self.engine.acquire() as conn, conn.begin(): + accesible_projects_ids = await get_readable_project_ids(conn, user_id) + file_metadatas: list[ + FileMetaDataAtDB + ] = await db_file_meta_data.list_filter_with_partial_file_id( + conn, + user_id=user_id, + 
project_ids=accesible_projects_ids, + file_id_prefix=None, + partial_file_id=uuid_filter, + ) + + for fmd in file_metadatas: + if is_file_entry_valid(fmd): + data.append(convert_db_to_model(fmd)) + continue + with suppress(S3KeyNotFoundError): + # 1. this was uploaded using the legacy file upload that relied on + # a background task checking the S3 backend unreliably, the file eventually + # will be uploaded and this will lazily update the database + # 2. this is still in upload and the file is missing and it will raise + updated_fmd = await self._update_database_from_storage(conn, fmd) + data.append(convert_db_to_model(updated_fmd)) + + # now parse the project to search for node/project names + prj_names_mapping: dict[Union[ProjectID, NodeID], str] = {} + async for proj_data in db_projects.list_projects( + conn, accesible_projects_ids + ): + prj_names_mapping = {proj_data.uuid: proj_data.name} | { + NodeID(node_id): node_data.label + for node_id, node_data in proj_data.workbench.items() + } + + # FIXME: artifically fills ['project_name', 'node_name', 'file_id', 'raw_file_path', 'display_file_path'] + # with information from the projects table! + # also all this stuff with projects should be done in the client code not here + # NOTE: sorry for all the FIXMEs here, but this will need further refactoring + clean_data = deque() + for d in data: + if d.project_id not in prj_names_mapping: + continue + d.project_name = prj_names_mapping[d.project_id] + if d.node_id in prj_names_mapping: + d.node_name = prj_names_mapping[d.node_id] + if d.node_name and d.project_name: + clean_data.append(d) + + data = clean_data + return list(data) + + async def get_file(self, user_id: UserID, file_id: StorageFileID) -> FileMetaData: + async with self.engine.acquire() as conn, conn.begin(): + can: Optional[AccessRights] = await get_file_access_rights( + conn, int(user_id), file_id + ) + if can.read: + fmd: FileMetaDataAtDB = await db_file_meta_data.get( + conn, parse_obj_as(SimcoreS3FileID, file_id) + ) + if is_file_entry_valid(fmd): + return convert_db_to_model(fmd) + fmd = await self._update_database_from_storage(conn, fmd) + return convert_db_to_model(fmd) + + logger.debug("User %s cannot read file %s", user_id, file_id) + raise FileAccessRightError(access_right="read", file_id=file_id) + + async def create_file_upload_link( + self, + user_id: UserID, + file_id: StorageFileID, + link_type: LinkType, + ) -> AnyUrl: + async with self.engine.acquire() as conn, conn.begin(): + can: Optional[AccessRights] = await get_file_access_rights( + conn, user_id, file_id + ) + if not can.write: + raise web.HTTPForbidden( + reason=f"User {user_id} does not have enough access rights to upload file {file_id}" + ) + + # initiate the file meta data table + fmd = self._create_fmd_for_upload(user_id, file_id) + fmd = await db_file_meta_data.upsert(conn, fmd) + + # return the appropriate links + if link_type == LinkType.PRESIGNED: + single_presigned_link = await get_s3_client( + self.app + ).create_single_presigned_upload_link( + self.simcore_bucket_name, + fmd.file_id, + expiration_secs=self.settings.STORAGE_DEFAULT_PRESIGNED_LINK_EXPIRATION_SECONDS, + ) + return parse_obj_as(AnyUrl, f"{single_presigned_link}") + + # user wants just the s3 link + s3_link = get_s3_client(self.app).compute_s3_url( + self.simcore_bucket_name, parse_obj_as(SimcoreS3FileID, file_id) + ) + return s3_link + + async def abort_file_upload( + self, + user_id: UserID, + file_id: StorageFileID, + ) -> None: + async with self.engine.acquire() as conn, 
conn.begin(): + can: Optional[AccessRights] = await get_file_access_rights( + conn, int(user_id), file_id + ) + if not can.delete or not can.write: + raise web.HTTPForbidden( + reason=f"User {user_id} does not have enough access rights to delete file {file_id}" + ) + fmd: FileMetaDataAtDB = await db_file_meta_data.get( + conn, parse_obj_as(SimcoreS3FileID, file_id) + ) + + try: + # try to revert to what we had in storage if any + await self._update_database_from_storage(conn, fmd) + except S3KeyNotFoundError: + # the file does not exist, so we delete the entry in the db + async with self.engine.acquire() as conn: + await db_file_meta_data.delete(conn, [fmd.file_id]) + + async def create_file_download_link( + self, user_id: UserID, file_id: StorageFileID, link_type: LinkType + ) -> str: + async with self.engine.acquire() as conn: + can: Optional[AccessRights] = await get_file_access_rights( + conn, user_id, file_id + ) + if not can.read: + # NOTE: this is tricky. A user with read access can download and data! + # If write permission would be required, then shared projects as views cannot + # recover data in nodes (e.g. jupyter cannot pull work data) + # + raise FileAccessRightError(access_right="read", file_id=file_id) + + fmd = await db_file_meta_data.get( + conn, parse_obj_as(SimcoreS3FileID, file_id) + ) + + link = parse_obj_as( + AnyUrl, + f"s3://{self.simcore_bucket_name}/{urllib.parse.quote(fmd.object_name)}", + ) + if link_type == LinkType.PRESIGNED: + link = await get_s3_client(self.app).create_single_presigned_download_link( + self.simcore_bucket_name, + fmd.object_name, + self.settings.STORAGE_DEFAULT_PRESIGNED_LINK_EXPIRATION_SECONDS, + ) + + return f"{link}" + + async def delete_file(self, user_id: UserID, file_id: StorageFileID): + async with self.engine.acquire() as conn, conn.begin(): + can: Optional[AccessRights] = await get_file_access_rights( + conn, user_id, file_id + ) + if not can.delete: + raise FileAccessRightError(access_right="delete", file_id=file_id) + + with suppress(FileMetaDataNotFoundError): + file: FileMetaDataAtDB = await db_file_meta_data.get( + conn, parse_obj_as(SimcoreS3FileID, file_id) + ) + await get_s3_client(self.app).delete_file( + file.bucket_name, file.file_id + ) + await db_file_meta_data.delete(conn, [file.file_id]) + + async def delete_project_simcore_s3( + self, user_id: UserID, project_id: ProjectID, node_id: Optional[NodeID] = None + ) -> None: + async with self.engine.acquire() as conn, conn.begin(): + can: Optional[AccessRights] = await get_project_access_rights( + conn, user_id, project_id + ) + if not can.delete: + raise ProjectAccessRightError( + access_right="delete", project_id=project_id + ) + + # we can do it this way, since we are in a transaction, it will rollback in case of error + if not node_id: + await db_file_meta_data.delete_all_from_project(conn, project_id) + else: + await db_file_meta_data.delete_all_from_node(conn, node_id) + await get_s3_client(self.app).delete_files_in_project_node( + self.simcore_bucket_name, project_id, node_id + ) + + async def deep_copy_project_simcore_s3( + self, + user_id: UserID, + src_project: dict[str, Any], + dst_project: dict[str, Any], + node_mapping: dict[NodeID, NodeID], + ) -> None: + src_project_uuid: ProjectID = ProjectID(src_project["uuid"]) + dst_project_uuid: ProjectID = ProjectID(dst_project["uuid"]) + # Step 1: check access rights (read of src and write of dst) + async with self.engine.acquire() as conn: + for prj_uuid in [src_project_uuid, dst_project_uuid]: + if not await 
db_projects.project_exists(conn, prj_uuid): + raise ProjectNotFoundError(project_id=prj_uuid) + source_access_rights = await get_project_access_rights( + conn, user_id, project_id=src_project_uuid + ) + dest_access_rights = await get_project_access_rights( + conn, user_id, project_id=dst_project_uuid + ) + if not source_access_rights.read: + raise ProjectAccessRightError( + access_right="read", project_id=src_project_uuid + ) + if not dest_access_rights.write: + raise ProjectAccessRightError( + access_right="write", project_id=dst_project_uuid + ) + + # Step 2: start copying by listing what to copy + logger.debug( + "Copying all items from %s to %s", + f"{self.simcore_bucket_name=}:{src_project_uuid=}", + f"{self.simcore_bucket_name=}:{dst_project_uuid=}", + ) + async with self.engine.acquire() as conn: + src_project_files: list[ + FileMetaDataAtDB + ] = await db_file_meta_data.list_fmds(conn, project_ids=[src_project_uuid]) + + # Step 3.1: copy: files referenced from file_metadata + copy_tasks: deque[Awaitable] = deque() + for src_fmd in src_project_files: + if not src_fmd.node_id or (src_fmd.location_id != self.location_id): + raise NotImplementedError( + "This is not foreseen, stem from old decisions" + f", and needs to be implemented if needed. Faulty metadata: {src_fmd=}" + ) + + if new_node_id := node_mapping.get(src_fmd.node_id): + copy_tasks.append( + self._copy_file_s3_s3( + user_id, + src_fmd, + SimcoreS3FileID( + f"{dst_project_uuid}/{new_node_id}/{src_fmd.object_name.split('/')[-1]}" + ), + ) + ) + # Step 3.2: copy files referenced from file-picker from DAT-CORE + for node_id, node in dst_project.get("workbench", {}).items(): + copy_tasks.extend( + [ + self._copy_file_datcore_s3( + user_id=user_id, + source_uuid=output["path"], + dest_project_id=dst_project_uuid, + dest_node_id=NodeID(node_id), + file_storage_link=output, + ) + for output in node.get("outputs", {}).values() + if int(output.get("store", self.location_id)) == DATCORE_ID + ] + ) + for task in copy_tasks: + await task + # NOTE: running this in parallel tends to block while testing. not sure why? 
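+        # (if this is revisited, bounding the concurrency, e.g. logged_gather(*copy_tasks, max_concurrency=2)
+        #  as used for the expired-uploads cleanup below, may be a safer middle ground than a plain
+        #  asyncio.gather; the sequential awaiting above is the conservative choice for now)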
+ # await asyncio.gather(*copy_tasks) + + async def search_files_starting_with( + self, user_id: UserID, prefix: str + ) -> list[FileMetaData]: + async with self.engine.acquire() as conn: + can_read_projects_ids = await get_readable_project_ids(conn, user_id) + file_metadatas: list[ + FileMetaDataAtDB + ] = await db_file_meta_data.list_filter_with_partial_file_id( + conn, + user_id=user_id, + project_ids=can_read_projects_ids, + file_id_prefix=prefix, + partial_file_id=None, + ) + resolved_fmds = [] + for fmd in file_metadatas: + if is_file_entry_valid(fmd): + resolved_fmds.append(convert_db_to_model(fmd)) + continue + with suppress(S3KeyNotFoundError): + updated_fmd = await self._update_database_from_storage(conn, fmd) + resolved_fmds.append(convert_db_to_model(updated_fmd)) + return resolved_fmds + + async def create_soft_link( + self, user_id: int, target_file_id: StorageFileID, link_file_id: StorageFileID + ) -> FileMetaData: + async with self.engine.acquire() as conn: + if await db_file_meta_data.exists( + conn, parse_obj_as(SimcoreS3FileID, link_file_id) + ): + raise LinkAlreadyExistsError(file_id=link_file_id) + # validate target_uuid + target = await self.get_file(user_id, target_file_id) + # duplicate target and change the following columns: + target.file_uuid = link_file_id + target.file_id = link_file_id # NOTE: api-server relies on this id + target.is_soft_link = True + + async with self.engine.acquire() as conn: + return convert_db_to_model(await db_file_meta_data.insert(conn, target)) + + async def synchronise_meta_data_table(self, dry_run: bool) -> list[StorageFileID]: + file_ids_to_remove = [] + async with self.engine.acquire() as conn: + logger.warning( + "Total number of entries to check %d", + await db_file_meta_data.total(conn), + ) + # iterate over all entries to check if there is a file in the S3 backend + async for fmd in db_file_meta_data.list_valid_uploads(conn): + # SEE https://www.peterbe.com/plog/fastest-way-to-find-out-if-a-file-exists-in-s3 + if not await get_s3_client(self.app).list_files( + self.simcore_bucket_name, prefix=fmd.object_name + ): + # this file does not exist in S3 + file_ids_to_remove.append(fmd.file_id) + + if not dry_run: + await db_file_meta_data.delete(conn, file_ids_to_remove) + + logger.info( + "%s %d entries ", + "Would delete" if dry_run else "Deleted", + len(file_ids_to_remove), + ) + + return file_ids_to_remove + + async def _clean_expired_uploads(self): + """this method will check for all incomplete updates by checking + the upload_expires_at entry in file_meta_data table. + 1. will try to update the entry from S3 backend if exists + 2. will delete the entry if nothing exists in S3 backend. 
+ """ + now = datetime.datetime.utcnow() + async with self.engine.acquire() as conn: + list_of_expired_uploads = await db_file_meta_data.list_fmds( + conn, expired_after=now + ) + logger.debug( + "found following pending uploads: [%s]", + [fmd.file_id for fmd in list_of_expired_uploads], + ) + if not list_of_expired_uploads: + return + + # try first to upload these from S3 (conservative) + updated_fmds = await logged_gather( + *( + self._update_database_from_storage_no_connection(fmd) + for fmd in list_of_expired_uploads + ), + reraise=False, + log=logger, + max_concurrency=2, + ) + list_of_fmds_to_delete = [ + expired_fmd + for expired_fmd, updated_fmd in zip(list_of_expired_uploads, updated_fmds) + if not isinstance(updated_fmd, FileMetaDataAtDB) + ] + if list_of_fmds_to_delete: + # delete the remaining ones + logger.debug( + "following unfinished/incomplete uploads will now be deleted : [%s]", + [fmd.file_id for fmd in list_of_fmds_to_delete], + ) + await logged_gather( + *( + self.delete_file(fmd.user_id, fmd.file_id) + for fmd in list_of_fmds_to_delete + if fmd.user_id is not None + ), + log=logger, + max_concurrency=2, + ) + logger.warning( + "pending/incomplete uploads of [%s] removed", + [fmd.file_id for fmd in list_of_fmds_to_delete], + ) + + async def clean_expired_uploads(self) -> None: + await self._clean_expired_uploads() + + async def _update_database_from_storage( + self, conn: SAConnection, fmd: FileMetaDataAtDB + ) -> FileMetaDataAtDB: + s3_metadata = await get_s3_client(self.app).get_file_metadata( + fmd.bucket_name, fmd.object_name + ) + fmd = await db_file_meta_data.get(conn, fmd.file_id) + fmd.file_size = parse_obj_as(ByteSize, s3_metadata.size) + fmd.last_modified = s3_metadata.last_modified + fmd.entity_tag = s3_metadata.e_tag + fmd.upload_expires_at = None + updated_fmd = await db_file_meta_data.upsert(conn, convert_db_to_model(fmd)) + return updated_fmd + + async def _update_database_from_storage_no_connection( + self, fmd: FileMetaDataAtDB + ) -> FileMetaDataAtDB: + async with self.engine.acquire() as conn: + updated_fmd = await self._update_database_from_storage(conn, fmd) + return updated_fmd + + async def _copy_file_datcore_s3( + self, + user_id: UserID, + source_uuid: str, + dest_project_id: ProjectID, + dest_node_id: NodeID, + file_storage_link: dict[str, Any], + ) -> FileMetaData: + session = get_client_session(self.app) + # 2 steps: Get download link for local copy, then upload to S3 + # TODO: This should be a redirect stream! 
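+        # (current flow, for orientation: get a presigned download link from the datcore-adapter,
+        #  download it to a temporary local file, then upload that file to the simcore S3 bucket;
+        #  the TODO above would remove the local round-trip by streaming directly)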
+ api_token, api_secret = await db_tokens.get_api_token_and_secret( + self.app, user_id + ) + dc_link = await datcore_adapter.get_file_download_presigned_link( + self.app, api_token, api_secret, source_uuid + ) + assert dc_link.path # nosec + filename = Path(dc_link.path).name + dst_file_id = SimcoreS3FileID(f"{dest_project_id}/{dest_node_id}/{filename}") + logger.debug("copying %s to %s", f"{source_uuid=}", f"{dst_file_id=}") + + with tempfile.TemporaryDirectory() as tmpdir: + local_file_path = Path(tmpdir) / filename + # Downloads DATCore -> local + await download_to_file_or_raise(session, dc_link, local_file_path) + + # copying will happen using aioboto3, therefore multipart might happen + new_fmd = self._create_fmd_for_upload(user_id, dst_file_id) + async with self.engine.acquire() as conn, conn.begin(): + new_fmd = await db_file_meta_data.upsert(conn, new_fmd) + # Uploads local -> S3 + await get_s3_client(self.app).upload_file( + self.simcore_bucket_name, local_file_path, dst_file_id + ) + updated_fmd = await self._update_database_from_storage(conn, new_fmd) + file_storage_link["store"] = self.location_id + file_storage_link["path"] = new_fmd.file_id + + logger.info("copied %s to %s", f"{source_uuid=}", f"{updated_fmd=}") + + return convert_db_to_model(updated_fmd) + + async def _copy_file_s3_s3( + self, user_id: UserID, src_fmd: FileMetaDataAtDB, dst_file_id: SimcoreS3FileID + ) -> FileMetaData: + logger.debug("copying %s to %s", f"{src_fmd=}", f"{dst_file_id=}") + # copying will happen using aioboto3, therefore multipart might happen + new_fmd = self._create_fmd_for_upload(user_id, dst_file_id) + async with self.engine.acquire() as conn, conn.begin(): + new_fmd = await db_file_meta_data.upsert(conn, new_fmd) + await get_s3_client(self.app).copy_file( + self.simcore_bucket_name, + src_fmd.object_name, + new_fmd.object_name, + ) + updated_fmd = await self._update_database_from_storage(conn, new_fmd) + logger.info("copied %s to %s", f"{src_fmd=}", f"{updated_fmd=}") + return convert_db_to_model(updated_fmd) + + def _create_fmd_for_upload( + self, user_id: UserID, file_id: StorageFileID + ) -> FileMetaData: + now = datetime.datetime.utcnow() + upload_expiration_date = now + datetime.timedelta( + seconds=self.settings.STORAGE_DEFAULT_PRESIGNED_LINK_EXPIRATION_SECONDS + ) + return FileMetaData.from_simcore_node( + user_id=user_id, + file_id=parse_obj_as(SimcoreS3FileID, file_id), + bucket=self.simcore_bucket_name, + location_id=self.location_id, + location_name=self.location_name, + upload_expires_at=upload_expiration_date, + ) + + +def create_simcore_s3_data_manager(app: web.Application) -> SimcoreS3DataManager: + cfg: Settings = app[APP_CONFIG_KEY] + assert cfg.STORAGE_S3 # nosec + return SimcoreS3DataManager( + engine=app[APP_DB_ENGINE_KEY], + simcore_bucket_name=parse_obj_as(S3BucketName, cfg.STORAGE_S3.S3_BUCKET_NAME), + app=app, + settings=cfg, + ) diff --git a/services/storage/src/simcore_service_storage/temporary_handlers_utils.py b/services/storage/src/simcore_service_storage/temporary_handlers_utils.py deleted file mode 100644 index b6dd84a43db..00000000000 --- a/services/storage/src/simcore_service_storage/temporary_handlers_utils.py +++ /dev/null @@ -1,13 +0,0 @@ -from models_library.api_schemas_storage import DatasetMetaDataGet, FileMetaDataGet - -# NOTE: TEMPORARY UTILS (will be removed in the next PRs for refactoring storage) -from pydantic import parse_obj_as -from simcore_service_storage.models import DatasetMetaData, FileMetaDataEx - - -def convert_to_api_dataset(x: 
DatasetMetaData) -> DatasetMetaDataGet: - return parse_obj_as(DatasetMetaDataGet, x) - - -def convert_to_api_fmd(x: FileMetaDataEx) -> FileMetaDataGet: - return parse_obj_as(FileMetaDataGet, x.fmd) diff --git a/services/storage/src/simcore_service_storage/utils.py b/services/storage/src/simcore_service_storage/utils.py index 02dec2f20c6..cd136c4687a 100644 --- a/services/storage/src/simcore_service_storage/utils.py +++ b/services/storage/src/simcore_service_storage/utils.py @@ -1,61 +1,34 @@ import logging -import uuid -from functools import lru_cache +import urllib.parse from pathlib import Path from typing import Union -from uuid import UUID import aiofiles -import tenacity from aiohttp import ClientSession from aiohttp.typedefs import StrOrURL -from aiopg.sa.result import ResultProxy, RowProxy -from yarl import URL +from models_library.projects_nodes_io import StorageFileID +from models_library.users import UserID -from .models import FileMetaData, FileMetaDataEx +from .constants import MAX_CHUNK_SIZE +from .models import FileMetaData, FileMetaDataAtDB logger = logging.getLogger(__name__) -MAX_CHUNK_SIZE = 1024 -RETRY_WAIT_SECS = 2 -RETRY_COUNT = 20 -CONNECT_TIMEOUT_SECS = 30 - - -@tenacity.retry( - wait=tenacity.wait_fixed(RETRY_WAIT_SECS), - stop=tenacity.stop_after_attempt(RETRY_COUNT), - before_sleep=tenacity.before_sleep_log(logger, logging.INFO), -) -async def assert_enpoint_is_ok( - session: ClientSession, url: URL, expected_response: int = 200 -): - """Tenace check to GET given url endpoint - - Typically used to check connectivity to a given service - - In sync code use as - loop.run_until_complete( check_endpoint(url) ) - - :param url: endpoint service URL - :type url: URL - :param expected_response: expected http status, defaults to 200 (OK) - :param expected_response: int, optional - """ - async with session.get(url) as resp: - if resp.status != expected_response: - raise AssertionError(f"{resp.status} != {expected_response}") - - -def is_url(location): - return bool(URL(str(location)).host) +def convert_db_to_model(x: FileMetaDataAtDB) -> FileMetaData: + return FileMetaData.parse_obj( + x.dict() + | { + "file_uuid": x.file_id, + "file_name": x.file_id.split("/")[-1], + } + ) async def download_to_file_or_raise( session: ClientSession, url: StrOrURL, - destination_path: Union[str, Path], + destination_path: Path, *, chunk_size=MAX_CHUNK_SIZE, ) -> int: @@ -83,35 +56,13 @@ async def download_to_file_or_raise( return total_size -def create_reverse_dns(*resource_name_parts) -> str: - """ - Returns a name for the resource following the reverse domain name notation - """ - # See https://en.wikipedia.org/wiki/Reverse_domain_name_notation - return "io.simcore.storage" + ".".join(map(str, resource_name_parts)) - - -@lru_cache() -def create_resource_uuid(*resource_name_parts) -> UUID: - revers_dns = create_reverse_dns(*resource_name_parts) - return uuid.uuid5(uuid.NAMESPACE_DNS, revers_dns) - - -def to_meta_data_extended(row: Union[ResultProxy, RowProxy]) -> FileMetaDataEx: - assert row # nosec - meta = FileMetaData(**dict(row)) # type: ignore - # NOTE: I know this is sad but this is fixed in a later PR where the class is replaced by a pydantic class - meta.location_id = int(meta.location_id) - meta_extended = FileMetaDataEx( - fmd=meta, - parent_id=str(Path(meta.object_name).parent), - ) # type: ignore - return meta_extended - - -def is_file_entry_valid(file_metadata: FileMetaData) -> bool: +def is_file_entry_valid(file_metadata: Union[FileMetaData, FileMetaDataAtDB]) -> bool: return 
( file_metadata.entity_tag is not None - and file_metadata.file_size is not None and file_metadata.file_size > 0 + and file_metadata.upload_expires_at is None ) + + +def create_upload_completion_task_name(user_id: UserID, file_id: StorageFileID) -> str: + return f"upload_complete_task_{user_id}_{urllib.parse.quote(file_id, safe='')}" diff --git a/services/storage/src/simcore_service_storage/utils_handlers.py b/services/storage/src/simcore_service_storage/utils_handlers.py new file mode 100644 index 00000000000..c5211a2d12d --- /dev/null +++ b/services/storage/src/simcore_service_storage/utils_handlers.py @@ -0,0 +1,39 @@ +from aiohttp import web +from aiohttp.typedefs import Handler +from aiohttp.web_request import Request +from pydantic import ValidationError +from servicelib.aiohttp.aiopg_utils import DBAPIError + +from .db_access_layer import InvalidFileIdentifier +from .exceptions import ( + FileAccessRightError, + FileMetaDataNotFoundError, + LinkAlreadyExistsError, + ProjectAccessRightError, + ProjectNotFoundError, + S3KeyNotFoundError, +) + + +@web.middleware +async def dsm_exception_handler( + request: Request, handler: Handler +) -> web.StreamResponse: + try: + return await handler(request) + except InvalidFileIdentifier as err: + raise web.HTTPUnprocessableEntity( + reason=f"{err} is an invalid file identifier" + ) from err + except (FileMetaDataNotFoundError, S3KeyNotFoundError, ProjectNotFoundError) as err: + raise web.HTTPNotFound(reason=f"{err}") from err + except (FileAccessRightError, ProjectAccessRightError) as err: + raise web.HTTPForbidden(reason=f"{err}") from err + except LinkAlreadyExistsError as err: + raise web.HTTPUnprocessableEntity(reason=f"{err}") from err + except ValidationError as err: + raise web.HTTPUnprocessableEntity(reason=f"{err}") from err + except DBAPIError as err: + raise web.HTTPServiceUnavailable( + reason="Unexpected error while accessing the database" + ) from err diff --git a/services/storage/tests/conftest.py b/services/storage/tests/conftest.py index a46fa993aa5..9088c0d185f 100644 --- a/services/storage/tests/conftest.py +++ b/services/storage/tests/conftest.py @@ -6,40 +6,55 @@ # pylint: disable=unused-variable -import datetime -import os +import asyncio import sys +import urllib.parse import uuid -from concurrent.futures import ThreadPoolExecutor from pathlib import Path -from random import randrange -from typing import Any, Callable, Iterable, Iterator +from typing import AsyncIterator, Awaitable, Callable, Iterator, Optional, cast import dotenv import pytest import simcore_service_storage +from aiobotocore.session import get_session from aiohttp import web +from aiohttp.test_utils import TestClient, unused_port from aiopg.sa import Engine -from servicelib.aiohttp.application import create_safe_application -from simcore_service_storage.constants import SIMCORE_S3_STR -from simcore_service_storage.dsm import DataStorageManager, DatCoreApiToken -from simcore_service_storage.models import FileMetaData, file_meta_data, projects, users -from simcore_service_storage.s3wrapper.s3_client import MinioClientWrapper -from tests.utils import BUCKET_NAME, DATA_DIR, USER_ID - -import tests +from aioresponses import aioresponses as AioResponsesMock +from faker import Faker +from models_library.api_schemas_storage import ETag, FileMetaDataGet, PresignedLink +from models_library.projects import ProjectID +from models_library.projects_nodes import NodeID +from models_library.projects_nodes_io import LocationID, SimcoreS3FileID +from models_library.users 
import UserID +from moto.server import ThreadedMotoServer +from pydantic import AnyUrl, ByteSize, parse_obj_as +from pytest_simcore.helpers.utils_assert import assert_status +from pytest_simcore.helpers.utils_docker import get_localhost_ip +from simcore_postgres_database.storage_models import file_meta_data, projects, users +from simcore_service_storage.application import create +from simcore_service_storage.dsm import get_dsm_provider +from simcore_service_storage.models import S3BucketName +from simcore_service_storage.s3 import get_s3_client +from simcore_service_storage.s3_client import StorageS3Client +from simcore_service_storage.settings import Settings +from simcore_service_storage.simcore_s3_dsm import SimcoreS3DataManager +from tests.helpers.file_utils import upload_file_to_presigned_link +from tests.helpers.utils_file_meta_data import assert_file_meta_data_in_db pytest_plugins = [ "pytest_simcore.cli_runner", "pytest_simcore.repository_paths", "tests.fixtures.data_models", + "tests.fixtures.datcore_adapter", "pytest_simcore.pytest_global_environs", "pytest_simcore.postgres_service", "pytest_simcore.docker_swarm", "pytest_simcore.docker_compose", "pytest_simcore.tmp_path_extra", "pytest_simcore.monkeypatch_extra", - "pytest_simcore.minio_service", + "pytest_simcore.file_extra", + "pytest_simcore.aioresponses_mocker", ] CURRENT_DIR = Path(sys.argv[0] if __name__ == "__main__" else __file__).resolve().parent @@ -99,17 +114,6 @@ def project_env_devel_environment(project_env_devel_dict, monkeypatch) -> None: monkeypatch.setenv(key, value) -@pytest.fixture(scope="module") -def s3_client(minio_config: dict[str, Any]) -> MinioClientWrapper: - - s3_client = MinioClientWrapper( - endpoint=minio_config["client"]["endpoint"], - access_key=minio_config["client"]["access_key"], - secret_key=minio_config["client"]["secret_key"], - ) - return s3_client - - ## FAKE DATA FIXTURES ---------------------------------------------- @@ -138,226 +142,294 @@ async def cleanup_user_projects_file_metadata(aiopg_engine: Engine): @pytest.fixture -def dsm_mockup_complete_db( - postgres_dsn, s3_client, cleanup_user_projects_file_metadata -) -> Iterator[tuple[dict[str, str], dict[str, str]]]: - dsn = "postgresql://{user}:{password}@{host}:{port}/{database}".format( - **postgres_dsn +def simcore_s3_dsm(client) -> SimcoreS3DataManager: + return cast( + SimcoreS3DataManager, + get_dsm_provider(client.app).get(SimcoreS3DataManager.get_location_id()), ) - tests.utils.fill_tables_from_csv_files(url=dsn) - - bucket_name = BUCKET_NAME - s3_client.create_bucket(bucket_name, delete_contents_if_exists=True) - file_1 = { - "project_id": "161b8782-b13e-5840-9ae2-e2250c231001", - "node_id": "ad9bda7f-1dc5-5480-ab22-5fef4fc53eac", - "filename": "outputController.dat", - } - f = DATA_DIR / "outputController.dat" - object_name = "{project_id}/{node_id}/{filename}".format(**file_1) - s3_client.upload_file(bucket_name, object_name, f) - - file_2 = { - "project_id": "161b8782-b13e-5840-9ae2-e2250c231001", - "node_id": "a3941ea0-37c4-5c1d-a7b3-01b5fd8a80c8", - "filename": "notebooks.zip", - } - f = DATA_DIR / "notebooks.zip" - object_name = "{project_id}/{node_id}/{filename}".format(**file_2) - s3_client.upload_file(bucket_name, object_name, f) - yield (file_1, file_2) - # cleanup - s3_client.remove_bucket(bucket_name, delete_contents=True) + +@pytest.fixture(scope="module") +def mocked_s3_server() -> Iterator[ThreadedMotoServer]: + """creates a moto-server that emulates AWS services in place + NOTE: Never use a bucket with 
underscores it fails!! + """ + server = ThreadedMotoServer(ip_address=get_localhost_ip(), port=unused_port()) + # pylint: disable=protected-access + print(f"--> started mock S3 server on {server._ip_address}:{server._port}") + print( + f"--> Dashboard available on [http://{server._ip_address}:{server._port}/moto-api/]" + ) + server.start() + yield server + server.stop() + print(f"<-- stopped mock S3 server on {server._ip_address}:{server._port}") @pytest.fixture -def dsm_mockup_db( - postgres_dsn_url, - s3_client: MinioClientWrapper, - mock_files_factory: Callable[[int], list[Path]], - cleanup_user_projects_file_metadata, -) -> Iterator[dict[str, FileMetaData]]: - - # s3 client - bucket_name = BUCKET_NAME - s3_client.create_bucket(bucket_name, delete_contents_if_exists=True) - - # TODO: use pip install Faker - users = ["alice", "bob", "chuck", "dennis"] - - projects = [ - "astronomy", - "biology", - "chemistry", - "dermatology", - "economics", - "futurology", - "geology", +async def mocked_s3_server_envs( + mocked_s3_server: ThreadedMotoServer, + monkeypatch: pytest.MonkeyPatch, +) -> AsyncIterator[None]: + monkeypatch.setenv("S3_SECURE", "false") + monkeypatch.setenv( + "S3_ENDPOINT", + f"{mocked_s3_server._ip_address}:{mocked_s3_server._port}", # pylint: disable=protected-access + ) + monkeypatch.setenv("S3_ACCESS_KEY", "xxx") + monkeypatch.setenv("S3_SECRET_KEY", "xxx") + monkeypatch.setenv("S3_BUCKET_NAME", "pytestbucket") + + yield + + # cleanup the buckets + session = get_session() + async with session.create_client( + "s3", + endpoint_url=f"http://{mocked_s3_server._ip_address}:{mocked_s3_server._port}", # pylint: disable=protected-access + aws_secret_access_key="xxx", + aws_access_key_id="xxx", + ) as client: + await _remove_all_buckets(client) + + +async def _clean_bucket_content(aiobotore_s3_client, bucket: S3BucketName): + response = await aiobotore_s3_client.list_objects_v2(Bucket=bucket) + while response["KeyCount"] > 0: + await aiobotore_s3_client.delete_objects( + Bucket=bucket, + Delete={ + "Objects": [ + {"Key": obj["Key"]} for obj in response["Contents"] if "Key" in obj + ] + }, + ) + response = await aiobotore_s3_client.list_objects_v2(Bucket=bucket) + + +async def _remove_all_buckets(aiobotore_s3_client): + response = await aiobotore_s3_client.list_buckets() + bucket_names = [ + bucket["Name"] for bucket in response["Buckets"] if "Name" in bucket ] - location = SIMCORE_S3_STR - - nodes = ["alpha", "beta", "gamma", "delta"] - - N = 100 - files = mock_files_factory(N) - counter = 0 - data = {} - - for _file in files: - idx = randrange(len(users)) - user_name = users[idx] - user_id = idx + 10 - idx = randrange(len(projects)) - project_name = projects[idx] - project_id = uuid.uuid4() - idx = randrange(len(nodes)) - node = nodes[idx] - node_id = uuid.uuid4() - file_name = str(counter) - object_name = Path(str(project_id), str(node_id), str(counter)).as_posix() - file_uuid = Path(object_name).as_posix() - raw_file_path = file_uuid - display_file_path = str(Path(project_name) / Path(node) / Path(file_name)) - created_at = str(datetime.datetime.utcnow()) - file_size = _file.stat().st_size - - assert s3_client.upload_file(bucket_name, object_name, _file) - s3_meta_data = s3_client.get_metadata(bucket_name, object_name) - assert "ETag" in s3_meta_data - entity_tag = s3_meta_data["ETag"].strip('"') - - d = { - "file_uuid": file_uuid, - "location_id": 0, - "location": location, - "bucket_name": bucket_name, - "object_name": object_name, - "project_id": str(project_id), - 
"project_name": project_name, - "node_id": str(node_id), - "node_name": node, - "file_name": file_name, - "user_id": str(user_id), - "user_name": user_name, - "file_id": object_name, - "raw_file_path": file_uuid, - "display_file_path": display_file_path, - "created_at": created_at, - "last_modified": created_at, - "file_size": file_size, - "entity_tag": entity_tag, - } - - counter = counter + 1 - - data[object_name] = FileMetaData(**d) - - # pylint: disable=no-member - - tests.utils.insert_metadata(postgres_dsn_url, data[object_name]) - - total_count = 0 - for _obj in s3_client.list_objects(bucket_name, recursive=True): - total_count = total_count + 1 - - assert total_count == N - - yield data - - # s3 client - s3_client.remove_bucket(bucket_name, delete_contents=True) + await asyncio.gather( + *(_clean_bucket_content(aiobotore_s3_client, bucket) for bucket in bucket_names) + ) + await asyncio.gather( + *(aiobotore_s3_client.delete_bucket(Bucket=bucket) for bucket in bucket_names) + ) -@pytest.fixture(scope="function") -def moduleless_app(event_loop, aiohttp_server) -> web.Application: - app: web.Application = create_safe_application() - # creates a dummy server - server = event_loop.run_until_complete(aiohttp_server(app)) - # server is destroyed on exit https://docs.aiohttp.org/en/stable/testing.html#pytest_aiohttp.aiohttp_server - return app +@pytest.fixture +async def storage_s3_client( + client: TestClient, +) -> StorageS3Client: + assert client.app + return get_s3_client(client.app) -@pytest.fixture(scope="function") -def dsm_fixture( - s3_client, aiopg_engine, event_loop, moduleless_app -) -> Iterable[DataStorageManager]: - - with ThreadPoolExecutor(3) as pool: - dsm_fixture = DataStorageManager( - s3_client=s3_client, - engine=aiopg_engine, - loop=event_loop, - pool=pool, - simcore_bucket_name=BUCKET_NAME, - has_project_db=False, - app=moduleless_app, +@pytest.fixture +async def storage_s3_bucket(app_settings: Settings) -> str: + assert app_settings.STORAGE_S3 + return app_settings.STORAGE_S3.S3_BUCKET_NAME + + +@pytest.fixture +def mock_config( + aiopg_engine: Engine, + postgres_host_config: dict[str, str], + mocked_s3_server_envs, + datcore_adapter_service_mock: AioResponsesMock, +): + # NOTE: this can be overriden in tests that do not need all dependencies up + ... 
+ + +@pytest.fixture +def app_settings(mock_config) -> Settings: + test_app_settings = Settings.create_from_envs() + print(f"{test_app_settings.json(indent=2)=}") + return test_app_settings + + +@pytest.fixture +def client( + event_loop: asyncio.AbstractEventLoop, + aiohttp_client: Callable, + unused_tcp_port_factory: Callable[..., int], + app_settings: Settings, +) -> TestClient: + app = create(app_settings) + return event_loop.run_until_complete( + aiohttp_client(app, server_kwargs={"port": unused_tcp_port_factory()}) + ) + + +@pytest.fixture +async def node_id( + project_id: ProjectID, create_project_node: Callable[[ProjectID], Awaitable[NodeID]] +) -> NodeID: + return await create_project_node(project_id) + + +@pytest.fixture +def simcore_file_id( + project_id: ProjectID, + node_id: NodeID, + create_simcore_file_id: Callable[[ProjectID, NodeID, str], SimcoreS3FileID], + faker: Faker, +) -> SimcoreS3FileID: + return create_simcore_file_id( + project_id, node_id, f"öä$äö2-34 name in to add complexity {faker.file_name()}" + ) + + +# NOTE: this will be enabled at a later timepoint +@pytest.fixture( + params=[ + SimcoreS3DataManager.get_location_id(), + # DatCoreDataManager.get_location_id(), + ], + ids=[ + SimcoreS3DataManager.get_location_name(), + # DatCoreDataManager.get_location_name(), + ], +) +def location_id(request: pytest.FixtureRequest) -> LocationID: + return request.param # type: ignore + + +@pytest.fixture +async def get_file_meta_data( + client: TestClient, user_id: UserID, location_id: LocationID +) -> Callable[..., Awaitable[FileMetaDataGet]]: + async def _getter(file_id: SimcoreS3FileID) -> FileMetaDataGet: + assert client.app + url = ( + client.app.router["get_file_metadata"] + .url_for( + location_id=f"{location_id}", + file_id=urllib.parse.quote(file_id, safe=""), + ) + .with_query(user_id=user_id) ) + response = await client.get(f"{url}") + data, error = await assert_status(response, web.HTTPOk) + assert not error + assert data + received_fmd = parse_obj_as(FileMetaDataGet, data) + assert received_fmd + print(f"<-- {received_fmd.json(indent=2)=}") + return received_fmd - api_token = os.environ.get("BF_API_KEY", "none") - api_secret = os.environ.get("BF_API_SECRET", "none") - dsm_fixture.datcore_tokens[USER_ID] = DatCoreApiToken(api_token, api_secret) + return _getter - yield dsm_fixture +@pytest.fixture +async def create_upload_file_link( + client: TestClient, user_id: UserID, location_id: LocationID +) -> AsyncIterator[Callable[..., Awaitable[AnyUrl]]]: + + file_params: list[tuple[UserID, int, SimcoreS3FileID]] = [] + + async def _link_creator(file_id: SimcoreS3FileID, **query_kwargs) -> AnyUrl: + assert client.app + url = ( + client.app.router["upload_file"] + .url_for( + location_id=f"{location_id}", + file_id=urllib.parse.quote(file_id, safe=""), + ) + .with_query(**query_kwargs, user_id=user_id) + ) + response = await client.put(f"{url}") + data, error = await assert_status(response, web.HTTPOk) + assert not error + assert data + received_file_upload = parse_obj_as(PresignedLink, data) + assert received_file_upload + print(f"--> created link for {file_id=}") + file_params.append((user_id, location_id, file_id)) + return received_file_upload.link + + yield _link_creator -@pytest.fixture(scope="function") -async def datcore_structured_testbucket( - mock_files_factory: Callable[[int], list[Path]], - moduleless_app, -): - api_token = os.environ.get("BF_API_KEY") - api_secret = os.environ.get("BF_API_SECRET") - - if api_token is None or api_secret is None: - yield 
"no_bucket" - return - import warnings - - warnings.warn("DISABLED!!!") - raise Exception - # TODO: there are some missing commands in datcore-adapter before this can run - # this shall be used when the time comes and this code should be enabled again - - # dataset: DatasetMetaData = await datcore_adapter.create_dataset( - # moduleless_app, api_token, api_secret, BUCKET_NAME - # ) - # dataset_id = dataset.dataset_id - # assert dataset_id, f"Could not create dataset {BUCKET_NAME}" - - # tmp_files = mock_files_factory(3) - - # # first file to the root - # filename1 = os.path.normpath(tmp_files[0]) - # await datcore_adapter.upload_file(moduleless_app, api_token, api_secret, filename1) - # file_id1 = await dcw.upload_file_to_id(dataset_id, filename1) - # assert file_id1, f"Could not upload {filename1} to the root of {BUCKET_NAME}" - - # # create first level folder - # collection_id1 = await dcw.create_collection(dataset_id, "level1") - - # # upload second file - # filename2 = os.path.normpath(tmp_files[1]) - # file_id2 = await dcw.upload_file_to_id(collection_id1, filename2) - # assert file_id2, f"Could not upload {filename2} to the {BUCKET_NAME}/level1" - - # # create 3rd level folder - # filename3 = os.path.normpath(tmp_files[2]) - # collection_id2 = await dcw.create_collection(collection_id1, "level2") - # file_id3 = await dcw.upload_file_to_id(collection_id2, filename3) - # assert file_id3, f"Could not upload {filename3} to the {BUCKET_NAME}/level1/level2" - - # yield { - # "dataset_id": dataset_id, - # "coll1_id": collection_id1, - # "coll2_id": collection_id2, - # "file_id1": file_id1, - # "filename1": tmp_files[0], - # "file_id2": file_id2, - # "filename2": tmp_files[1], - # "file_id3": file_id3, - # "filename3": tmp_files[2], - # "dcw": dcw, - # } - - # await dcw.delete_test_dataset(BUCKET_NAME) + # cleanup + assert client.app + clean_tasks = [] + for u_id, loc_id, file_id in file_params: + url = ( + client.app.router["delete_file"] + .url_for( + location_id=f"{loc_id}", + file_id=urllib.parse.quote(file_id, safe=""), + ) + .with_query(user_id=u_id) + ) + clean_tasks.append(client.delete(f"{url}")) + await asyncio.gather(*clean_tasks) + + +@pytest.fixture +def upload_file( + aiopg_engine: Engine, + storage_s3_client: StorageS3Client, + storage_s3_bucket: S3BucketName, + client: TestClient, + project_id: ProjectID, + node_id: NodeID, + create_upload_file_link: Callable[..., Awaitable[AnyUrl]], + create_file_of_size: Callable[[ByteSize, Optional[str]], Path], + create_simcore_file_id: Callable[[ProjectID, NodeID, str], SimcoreS3FileID], + get_file_meta_data: Callable[..., Awaitable[FileMetaDataGet]], +) -> Callable[ + [ByteSize, str, Optional[SimcoreS3FileID]], Awaitable[tuple[Path, SimcoreS3FileID]] +]: + async def _uploader( + file_size: ByteSize, file_name: str, file_id: Optional[SimcoreS3FileID] = None + ) -> tuple[Path, SimcoreS3FileID]: + assert client.app + # create a file + file = create_file_of_size(file_size, file_name) + if not file_id: + file_id = create_simcore_file_id(project_id, node_id, file_name) + # get an upload link + file_upload_link = await create_upload_file_link( + file_id, link_type="presigned", file_size=file_size + ) + + # upload the file + e_tag: ETag = await upload_file_to_presigned_link(file, file_upload_link) + + # trigger a lazy upload of the tables by getting the file + received_fmd = await get_file_meta_data(file_id) + assert received_fmd.entity_tag == e_tag + + # check the entry in db now has the correct file size, and the upload id is gone + await 
assert_file_meta_data_in_db( + aiopg_engine, + file_id=file_id, + expected_entry_exists=True, + expected_file_size=file_size, + expected_upload_expiration_date=False, + ) + # check the file is in S3 for real + s3_metadata = await storage_s3_client.get_file_metadata( + storage_s3_bucket, file_id + ) + assert s3_metadata.size == file_size + assert s3_metadata.last_modified + assert s3_metadata.e_tag == e_tag + return file, file_id + + return _uploader + + +@pytest.fixture +def create_simcore_file_id() -> Callable[[ProjectID, NodeID, str], SimcoreS3FileID]: + def _creator( + project_id: ProjectID, node_id: NodeID, file_name: str + ) -> SimcoreS3FileID: + return parse_obj_as(SimcoreS3FileID, f"{project_id}/{node_id}/{file_name}") + + return _creator diff --git a/services/storage/tests/data/file_meta_data.csv b/services/storage/tests/data/file_meta_data.csv index fa473ea55a4..24bf7dc5ef8 100644 --- a/services/storage/tests/data/file_meta_data.csv +++ b/services/storage/tests/data/file_meta_data.csv @@ -1,3 +1,3 @@ -file_uuid,location_id,location,bucket_name,object_name,project_id,project_name,node_id,node_name,file_name,user_id,user_name -161b8782-b13e-5840-9ae2-e2250c231001/ad9bda7f-1dc5-5480-ab22-5fef4fc53eac/outputController.dat,0,simcore.s3,simcore-testing-bucket,161b8782-b13e-5840-9ae2-e2250c231001/ad9bda7f-1dc5-5480-ab22-5fef4fc53eac/outputController.dat,161b8782-b13e-5840-9ae2-e2250c231001,"",ad9bda7f-1dc5-5480-ab22-5fef4fc53eac,"",outputController.dat,21,"" -161b8782-b13e-5840-9ae2-e2250c231001/a3941ea0-37c4-5c1d-a7b3-01b5fd8a80c8/notebooks.zip,0,simcore.s3,simcore-testing-bucket,161b8782-b13e-5840-9ae2-e2250c231001/a3941ea0-37c4-5c1d-a7b3-01b5fd8a80c8/notebooks.zip,161b8782-b13e-5840-9ae2-e2250c231001,"",a3941ea0-37c4-5c1d-a7b3-01b5fd8a80c8,"",notebooks.zip,21,"" +file_id,location_id,location,bucket_name,object_name,project_id,node_id,user_id +161b8782-b13e-5840-9ae2-e2250c231001/ad9bda7f-1dc5-5480-ab22-5fef4fc53eac/outputController.dat,0,simcore.s3,pytestbucket,161b8782-b13e-5840-9ae2-e2250c231001/ad9bda7f-1dc5-5480-ab22-5fef4fc53eac/outputController.dat,161b8782-b13e-5840-9ae2-e2250c231001,ad9bda7f-1dc5-5480-ab22-5fef4fc53eac,21 +161b8782-b13e-5840-9ae2-e2250c231001/a3941ea0-37c4-5c1d-a7b3-01b5fd8a80c8/notebooks.zip,0,simcore.s3,pytestbucket,161b8782-b13e-5840-9ae2-e2250c231001/a3941ea0-37c4-5c1d-a7b3-01b5fd8a80c8/notebooks.zip,161b8782-b13e-5840-9ae2-e2250c231001,a3941ea0-37c4-5c1d-a7b3-01b5fd8a80c8,21 diff --git a/services/storage/tests/data/projects_with_data.json b/services/storage/tests/data/projects_with_data.json index ec343135b5a..302a5d33481 100644 --- a/services/storage/tests/data/projects_with_data.json +++ b/services/storage/tests/data/projects_with_data.json @@ -4,9 +4,9 @@ "name": "ISAN2019: 3D Paraview", "description": "3D Paraview viewer with two inputs", "thumbnail": "https://user-images.githubusercontent.com/33152403/60168939-073a5580-9806-11e9-8dad-8a7caa3eb5ab.png", - "prjOwner": "", - "creationDate": "2019-06-06 14:33:43.065", - "lastChangeDate": "2019-06-06 14:33:44.747", + "prjOwner": "pytest@itis.swiss", + "creationDate": "2019-06-06T14:33:43.065Z", + "lastChangeDate": "2019-06-06T14:33:44.747Z", "accessRights": {}, "workbench": { "de2578c5-431e-5753-af37-e6aec8120bf2": { @@ -45,7 +45,7 @@ "y": 250 } }, - "de2578c5-431e-9b0f-67677a20996c": { + "de2578c5-431e-9b0f-a456-67677a20996c": { "key": "simcore/services/dynamic/3d-viewer", "version": "2.10.0", "label": "3D ParaViewer", @@ -77,9 +77,9 @@ "name": "ISAN: UCDavis use case: 0D", "description": "Colleen Clancy 
Single Cell solver with a file picker and PostPro viewer", "thumbnail": "https://user-images.githubusercontent.com/33152403/60168940-073a5580-9806-11e9-9a44-ae5266eeb020.png", - "prjOwner": "", - "creationDate": "2019-06-06 14:33:51.94", - "lastChangeDate": "2019-06-06 14:33:54.329", + "prjOwner": "pytest@itis.swiss", + "creationDate": "2019-06-06T14:33:51.940Z", + "lastChangeDate": "2019-06-06T14:33:54.329Z", "accessRights": {}, "workbench": { "de2578c5-431e-59d6-b1a5-6e7b2773636b": { diff --git a/services/storage/tests/fixtures/data_models.py b/services/storage/tests/fixtures/data_models.py index 0a9075c9006..dba1f846c06 100644 --- a/services/storage/tests/fixtures/data_models.py +++ b/services/storage/tests/fixtures/data_models.py @@ -3,12 +3,14 @@ # pylint:disable=redefined-outer-name -from typing import AsyncIterator -from uuid import UUID +from typing import Any, AsyncIterator, Awaitable, Callable import pytest +import sqlalchemy as sa from aiopg.sa.engine import Engine +from faker import Faker from models_library.projects import ProjectID +from models_library.projects_nodes_io import NodeID from models_library.users import UserID from pytest_simcore.helpers.rawdata_fakers import random_project, random_user from simcore_postgres_database.storage_models import projects, users @@ -28,7 +30,7 @@ async def user_id(aiopg_engine: Engine) -> AsyncIterator[UserID]: async with aiopg_engine.acquire() as conn: result = await conn.execute(stmt) row = await result.fetchone() - + assert row assert isinstance(row.id, int) yield row.id @@ -37,21 +39,71 @@ async def user_id(aiopg_engine: Engine) -> AsyncIterator[UserID]: @pytest.fixture -async def project_id(user_id: UserID, aiopg_engine: Engine) -> AsyncIterator[ProjectID]: - # inject a random project for user in db. 
This will give user_id, the full project's ownership +async def create_project( + user_id: UserID, aiopg_engine: Engine +) -> AsyncIterator[Callable[[], Awaitable[dict[str, Any]]]]: + created_project_uuids = [] - # pylint: disable=no-value-for-parameter - stmt = ( - projects.insert() - .values(**random_project(prj_owner=user_id)) - .returning(projects.c.uuid) - ) - print(str(stmt)) + async def _creator(**kwargs) -> dict[str, Any]: + prj_config = {"prj_owner": user_id} + prj_config.update(kwargs) + async with aiopg_engine.acquire() as conn: + result = await conn.execute( + projects.insert() + .values(**random_project(**prj_config)) + .returning(sa.literal_column("*")) + ) + row = await result.fetchone() + assert row + created_project_uuids.append(row[projects.c.uuid]) + return dict(row) + + yield _creator + # cleanup async with aiopg_engine.acquire() as conn: - result = await conn.execute(stmt) - [prj_uuid] = (await result.fetchone()).as_tuple() + await conn.execute( + projects.delete().where(projects.c.uuid.in_(created_project_uuids)) + ) - yield UUID(prj_uuid) - async with aiopg_engine.acquire() as conn: - await conn.execute(projects.delete().where(projects.c.uuid == prj_uuid)) +@pytest.fixture +async def project_id( + create_project: Callable[[], Awaitable[dict[str, Any]]] +) -> ProjectID: + + project = await create_project() + return ProjectID(project["uuid"]) + + +@pytest.fixture +async def create_project_node( + user_id: UserID, aiopg_engine: Engine, faker: Faker +) -> AsyncIterator[Callable[..., Awaitable[NodeID]]]: + async def _creator(project_id: ProjectID) -> NodeID: + async with aiopg_engine.acquire() as conn: + result = await conn.execute( + sa.select([projects.c.workbench]).where( + projects.c.uuid == f"{project_id}" + ) + ) + row = await result.fetchone() + assert row + project_workbench: dict[str, Any] = row[projects.c.workbench] + new_node_id = NodeID(faker.uuid4()) + project_workbench.update( + { + f"{new_node_id}": { + "key": "simcore/services/frontend/file-picker", + "version": "1.0.0", + "label": "pytest_fake_node", + } + } + ) + await conn.execute( + projects.update() + .where(projects.c.uuid == f"{project_id}") + .values(workbench=project_workbench) + ) + return new_node_id + + yield _creator diff --git a/services/storage/tests/fixtures/datcore_adapter.py b/services/storage/tests/fixtures/datcore_adapter.py new file mode 100644 index 00000000000..4a161d0181f --- /dev/null +++ b/services/storage/tests/fixtures/datcore_adapter.py @@ -0,0 +1,28 @@ +import re + +import pytest +from aiohttp import web +from aioresponses import aioresponses as AioResponsesMock +from simcore_service_storage.datcore_adapter.datcore_adapter_settings import ( + DatcoreAdapterSettings, +) + + +@pytest.fixture +def datcore_adapter_service_mock( + aioresponses_mocker: AioResponsesMock, +) -> AioResponsesMock: + dat_core_settings = DatcoreAdapterSettings.create_from_envs() + datcore_adapter_base_url = dat_core_settings.endpoint + # mock base endpoint + aioresponses_mocker.get( + datcore_adapter_base_url, status=web.HTTPOk.status_code, repeat=True + ) + list_datasets_re = re.compile(rf"^{datcore_adapter_base_url}/datasets") + aioresponses_mocker.get( + list_datasets_re, status=web.HTTPOk.status_code, repeat=True + ) + aioresponses_mocker.get( + datcore_adapter_base_url, status=web.HTTPOk.status_code, repeat=True, payload={} + ) + return aioresponses_mocker diff --git a/services/storage/tests/helpers/file_utils.py b/services/storage/tests/helpers/file_utils.py new file mode 100644 index 
00000000000..9ea042b776b --- /dev/null +++ b/services/storage/tests/helpers/file_utils.py @@ -0,0 +1,92 @@ +import json +from pathlib import Path +from time import perf_counter +from typing import Final + +import aiofiles +import pytest +from aiohttp import ClientSession, web +from pydantic import AnyUrl, ByteSize, parse_obj_as +from simcore_service_storage.s3_client import ETag + +_SENDER_CHUNK_SIZE: Final[int] = parse_obj_as(ByteSize, "16Mib") + + +async def _file_sender( + file: Path, *, offset: int, bytes_to_send: int, raise_while_uploading: bool +): + chunk_size = _SENDER_CHUNK_SIZE + if raise_while_uploading: + # to ensure we can raise before it is done + chunk_size = min(_SENDER_CHUNK_SIZE, int(file.stat().st_size / 3)) + async with aiofiles.open(file, "rb") as f: + await f.seek(offset) + num_read_bytes = 0 + while chunk := await f.read(min(chunk_size, bytes_to_send - num_read_bytes)): + num_read_bytes += len(chunk) + yield chunk + if raise_while_uploading: + raise RuntimeError("we were asked to raise here!") + + +async def upload_file_part( + session: ClientSession, + file: Path, + part_index: int, + file_offset: int, + this_file_chunk_size: int, + num_parts: int, + upload_url: AnyUrl, + raise_while_uploading: bool = False, +) -> ETag: + print( + f"--> uploading {this_file_chunk_size=} of {file=}, [{part_index+1}/{num_parts}]..." + ) + response = await session.put( + upload_url, + data=_file_sender( + file, + offset=file_offset, + bytes_to_send=this_file_chunk_size, + raise_while_uploading=raise_while_uploading, + ), + headers={ + "Content-Length": f"{this_file_chunk_size}", + }, + ) + response.raise_for_status() + # NOTE: the response from minio does not contain a json body + assert response.status == web.HTTPOk.status_code + assert response.headers + assert "Etag" in response.headers + received_e_tag = json.loads(response.headers["Etag"]) + print( + f"--> completed upload {this_file_chunk_size=} of {file=}, [{part_index+1}/{num_parts}], {received_e_tag=}" + ) + return received_e_tag + + +async def upload_file_to_presigned_link(file: Path, file_upload_link: AnyUrl) -> ETag: + + file_size = file.stat().st_size + + start = perf_counter() + print(f"--> uploading {file=}") + async with ClientSession() as session: + e_tag = await upload_file_part( + session, + file, + 0, + 0, + file.stat().st_size, + 1, + file_upload_link, + ) + print( + f"--> upload of {file=} of {file_size=} completed in {perf_counter() - start}" + ) + return e_tag + + +def parametrized_file_size(size_str: str): + return pytest.param(parse_obj_as(ByteSize, size_str), id=size_str) diff --git a/services/storage/tests/helpers/utils.py b/services/storage/tests/helpers/utils.py new file mode 100644 index 00000000000..b54d04e210b --- /dev/null +++ b/services/storage/tests/helpers/utils.py @@ -0,0 +1,14 @@ +import logging +import os + +log = logging.getLogger(__name__) + + +def has_datcore_tokens() -> bool: + # TODO: activate tests against BF services in the CI. 
+ # + # CI shall add BF_API_KEY, BF_API_SECRET environs as secrets + # + if not os.environ.get("BF_API_KEY") or not os.environ.get("BF_API_SECRET"): + return False + return True diff --git a/services/storage/tests/helpers/utils_file_meta_data.py b/services/storage/tests/helpers/utils_file_meta_data.py new file mode 100644 index 00000000000..3c40e315043 --- /dev/null +++ b/services/storage/tests/helpers/utils_file_meta_data.py @@ -0,0 +1,39 @@ +from typing import Optional + +from aiopg.sa.engine import Engine +from models_library.projects_nodes_io import StorageFileID +from simcore_postgres_database.storage_models import file_meta_data + + +async def assert_file_meta_data_in_db( + aiopg_engine: Engine, + *, + file_id: StorageFileID, + expected_entry_exists: bool, + expected_file_size: Optional[int], + expected_upload_expiration_date: Optional[bool], +) -> None: + assert not ( + expected_entry_exists and expected_file_size is None + ), "Invalid usage of assertion, expected_file_size cannot be None" + + async with aiopg_engine.acquire() as conn: + result = await conn.execute( + file_meta_data.select().where(file_meta_data.c.file_id == f"{file_id}") + ) + db_data = await result.fetchall() + assert db_data is not None + assert len(db_data) == (1 if expected_entry_exists else 0) + if expected_entry_exists: + row = db_data[0] + assert ( + row[file_meta_data.c.file_size] == expected_file_size + ), f"entry in file_meta_data was not initialized correctly, size should be set to {expected_file_size}" + + if expected_upload_expiration_date: + assert row[ + file_meta_data.c.upload_expires_at + ], "no upload expiration date!" + else: + assert ( + row[file_meta_data.c.upload_expires_at] is None + ), "expiration date should be NULL" diff --git a/services/storage/tests/helpers/utils_project.py b/services/storage/tests/helpers/utils_project.py index 307e9d975e6..afe6c38fcce 100644 --- a/services/storage/tests/helpers/utils_project.py +++ b/services/storage/tests/helpers/utils_project.py @@ -1,17 +1,23 @@ import uuid as uuidlib from copy import deepcopy +from typing import Any +from models_library.projects_nodes_io import NodeIDStr -def clone_project_data(project: dict) -> tuple[dict, dict]: + +def clone_project_data( + project: dict, +) -> tuple[dict[str, Any], dict[NodeIDStr, NodeIDStr]]: project_copy = deepcopy(project) # Update project id # NOTE: this can be re-assigned by dbapi if not unique project_copy_uuid = uuidlib.uuid4() # random project id project_copy["uuid"] = str(project_copy_uuid) + project_copy.pop("id", None) # Workbench nodes shall be unique within the project context - def _create_new_node_uuid(old_uuid): + def _create_new_node_uuid(old_uuid: NodeIDStr) -> NodeIDStr: return str(uuidlib.uuid5(project_copy_uuid, str(old_uuid))) nodes_map = {} diff --git a/services/storage/tests/unit/s3wrapper/__init__.py b/services/storage/tests/unit/s3wrapper/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/services/storage/tests/unit/s3wrapper/test_s3_client.py b/services/storage/tests/unit/s3wrapper/test_s3_client.py deleted file mode 100644 index 85a1eedf2ca..00000000000 --- a/services/storage/tests/unit/s3wrapper/test_s3_client.py +++ /dev/null @@ -1,212 +0,0 @@ -# pylint: disable=redefined-outer-name -# pylint: disable=unused-argument -# pylint: disable=unused-variable - -import filecmp -import os -import time -import urllib -import urllib.error -import urllib.request -import uuid -from datetime import timedelta -from typing import Callable - -import pytest - 
-pytest_simcore_core_services_selection = ["postgres"] -pytest_simcore_ops_services_selection = ["minio"] - - -@pytest.fixture() -def bucket(s3_client, request): - bucket_name = "simcore-test" - s3_client.create_bucket(bucket_name, delete_contents_if_exists=True) - - def fin(): - s3_client.remove_bucket(bucket_name, delete_contents=True) - - request.addfinalizer(fin) - return bucket_name - - -@pytest.fixture(scope="function") -def text_files_factory(tmpdir_factory) -> Callable: - def _create_files(N): - filepaths = [] - for _i in range(N): - name = str(uuid.uuid4()) - filepath = os.path.normpath( - str(tmpdir_factory.mktemp("data").join(name + ".txt")) - ) - with open(filepath, "w") as fout: - fout.write("Hello world\n") - filepaths.append(filepath) - - return filepaths - - return _create_files - - -def test_create_remove_bucket(s3_client): - bucket_name = "simcore-test" - assert s3_client.create_bucket(bucket_name) - assert s3_client.exists_bucket(bucket_name) - s3_client.remove_bucket(bucket_name, delete_contents=True) - assert not s3_client.exists_bucket(bucket_name) - - -def test_create_remove_bucket_with_contents(s3_client, text_files_factory): - bucket_name = "simcore-test" - assert s3_client.create_bucket(bucket_name) - assert s3_client.exists_bucket(bucket_name) - object_name = "dummy" - filepath = text_files_factory(1)[0] - assert s3_client.upload_file(bucket_name, object_name, filepath) - assert s3_client.remove_bucket(bucket_name, delete_contents=False) - assert s3_client.exists_bucket(bucket_name) - s3_client.remove_bucket(bucket_name, delete_contents=True) - assert not s3_client.exists_bucket(bucket_name) - - -def test_file_upload_download(s3_client, bucket, text_files_factory): - filepath = text_files_factory(1)[0] - object_name = "1" - assert s3_client.upload_file(bucket, object_name, filepath) - filepath2 = filepath + ".rec" - assert s3_client.download_file(bucket, object_name, filepath2) - assert filecmp.cmp(filepath2, filepath) - - -def test_file_upload_meta_data(s3_client, bucket, text_files_factory): - filepath = text_files_factory(1)[0] - object_name = "1" - _id = uuid.uuid4() - metadata = {"user": "guidon", "node_id": str(_id), "boom-boom": str(42.0)} - - assert s3_client.upload_file(bucket, object_name, filepath, metadata=metadata) - - metadata2 = s3_client.get_metadata(bucket, object_name) - - assert metadata2["X-Amz-Meta-User"] == "guidon" - assert metadata2["X-Amz-Meta-Node_id"] == str(_id) - assert metadata2["X-Amz-Meta-Boom-Boom"] == str(42.0) - - -def test_sub_folders(s3_client, bucket, text_files_factory): - bucket_sub_folder = str(uuid.uuid4()) - filepaths = text_files_factory(3) - counter = 1 - for f in filepaths: - object_name = bucket_sub_folder + "/" + str(counter) - assert s3_client.upload_file(bucket, object_name, f) - counter += 1 - - -def test_presigned_put(s3_client, bucket, text_files_factory): - filepath = text_files_factory(1)[0] - object_name = "my_file" - url = s3_client.create_presigned_put_url(bucket, object_name) - with open(filepath, "rb") as fp: - d = fp.read() - req = urllib.request.Request(url, data=d, method="PUT") - with urllib.request.urlopen(req) as _f: - pass - - filepath2 = filepath + ".rec" - assert s3_client.download_file(bucket, object_name, filepath2) - assert filecmp.cmp(filepath2, filepath) - - -def test_presigned_put_expired(s3_client, bucket, text_files_factory): - filepath = text_files_factory(1)[0] - object_name = "my_file" - url = s3_client.create_presigned_put_url(bucket, object_name, timedelta(seconds=1)) - 
time.sleep(2) - failed = False - with open(filepath, "rb") as fp: - d = fp.read() - req = urllib.request.Request(url, data=d, method="PUT") - try: - # pylint: disable=consider-using-with - urllib.request.urlopen(req) - except urllib.error.HTTPError as _ex: - failed = True - assert failed - - -def test_presigned_get(s3_client, bucket, text_files_factory): - filepath = text_files_factory(1)[0] - filepath2 = filepath + "." - object_name = "bla" - assert s3_client.upload_file(bucket, object_name, filepath) - url = s3_client.create_presigned_get_url(bucket, object_name) - urllib.request.urlretrieve(url, filepath2) - - assert filecmp.cmp(filepath2, filepath) - - -def test_presigned_get_expired(s3_client, bucket, text_files_factory): - filepath = text_files_factory(1)[0] - filepath2 = filepath + "." - object_name = "bla" - assert s3_client.upload_file(bucket, object_name, filepath) - url = s3_client.create_presigned_get_url(bucket, object_name, timedelta(seconds=1)) - time.sleep(2) - failed = False - try: - urllib.request.urlretrieve(url, filepath2) - except urllib.error.HTTPError as _ex: - failed = True - - assert failed - - -def test_object_exists(s3_client, bucket, text_files_factory): - files = text_files_factory(2) - file1 = files[0] - file2 = files[1] - object_name = "level1" - assert s3_client.upload_file(bucket, object_name, file1) - assert s3_client.exists_object(bucket, object_name, False) - object_name = "leve1/level2" - assert s3_client.upload_file(bucket, object_name, file2) - assert not s3_client.exists_object(bucket, object_name, False) - assert s3_client.exists_object(bucket, object_name, True) - - -def test_copy_object(s3_client, bucket, text_files_factory): - files = text_files_factory(1) - file = files[0] - object_name = "original" - assert s3_client.upload_file(bucket, object_name, file) - assert s3_client.exists_object(bucket, object_name, False) - copied_object = "copy" - assert s3_client.copy_object(bucket, copied_object, bucket, object_name) - assert s3_client.exists_object(bucket, copied_object, False) - - -def test_list_objects(s3_client, bucket, text_files_factory): - files = text_files_factory(2) - file1 = files[0] - file2 = files[1] - object_name = "level1/level2/1" - assert s3_client.upload_file(bucket, object_name, file1) - object_name = "level2/level2/2" - assert s3_client.upload_file(bucket, object_name, file2) - - listed_objects = s3_client.list_objects(bucket) - for s3_obj in listed_objects: - assert s3_obj.object_name in ("level1/", "level2/") - - listed_objects = s3_client.list_objects(bucket, prefix="level1") - for s3_obj in listed_objects: - assert s3_obj.object_name == "level1/" - - listed_objects = s3_client.list_objects(bucket, prefix="level1", recursive=True) - for s3_obj in listed_objects: - assert s3_obj.object_name == "level1/level2/1" - - listed_objects = s3_client.list_objects(bucket, recursive=True) - for s3_obj in listed_objects: - assert s3_obj.object_name in ("level1/level2/1", "level2/level2/2") diff --git a/services/storage/tests/unit/test_access_layer.py b/services/storage/tests/unit/test_access_layer.py index 29c06c34334..639769c2bce 100644 --- a/services/storage/tests/unit/test_access_layer.py +++ b/services/storage/tests/unit/test_access_layer.py @@ -4,11 +4,12 @@ from typing import Iterable -from uuid import UUID import pytest from aiopg.sa.engine import Engine -from simcore_service_storage.access_layer import ( +from models_library.projects import ProjectID +from models_library.users import UserID +from 
simcore_service_storage.db_access_layer import ( AccessRights, get_file_access_rights, get_project_access_rights, @@ -19,18 +20,18 @@ @pytest.fixture async def filemeta_id( - user_id: int, project_id: str, aiopg_engine: Engine + user_id: UserID, project_id: ProjectID, aiopg_engine: Engine ) -> Iterable[str]: raise NotImplementedError() async def test_access_rights_on_owned_project( - user_id: int, project_id: UUID, aiopg_engine: Engine + user_id: UserID, project_id: ProjectID, aiopg_engine: Engine ): async with aiopg_engine.acquire() as conn: - access = await get_project_access_rights(conn, user_id, str(project_id)) + access = await get_project_access_rights(conn, user_id, project_id) assert access == AccessRights.all() # still NOT registered in file_meta_data BUT with prefix {project_id} owned by user diff --git a/services/storage/tests/unit/test_aiobotocore.py b/services/storage/tests/unit/test_aiobotocore.py new file mode 100644 index 00000000000..8325b69d081 --- /dev/null +++ b/services/storage/tests/unit/test_aiobotocore.py @@ -0,0 +1,68 @@ +# pylint: disable=redefined-outer-name +# pylint: disable=unused-argument +# pylint: disable=unused-variable + + +from contextlib import AsyncExitStack + +import pytest +from aiobotocore.session import get_session +from botocore import exceptions as boto_exceptions +from moto.server import ThreadedMotoServer + + +async def test_s3_client_fails_if_no_s3(): + """this test shows that initializing the client actually checks if the S3 server is connected""" + session = get_session() + with pytest.raises(boto_exceptions.ClientError): + async with session.create_client( + "s3", + aws_secret_access_key="xxx", + aws_access_key_id="xxx", + ) as client: + assert client + await client.list_buckets() + with pytest.raises(boto_exceptions.ClientError): + async with AsyncExitStack() as exit_stack: + client = await exit_stack.enter_async_context( + session.create_client( + "s3", + aws_secret_access_key="xxx", + aws_access_key_id="xxx", + ) + ) + assert client + await client.list_buckets() + + +async def test_s3_client_reconnects_if_s3_server_restarts( + mocked_s3_server: ThreadedMotoServer, +): + """this test shows that we do not need to restart the client if the S3 server restarts""" + session = get_session() + # pylint: disable=protected-access + async with session.create_client( + "s3", + endpoint_url=f"http://{mocked_s3_server._ip_address}:{mocked_s3_server._port}", + aws_secret_access_key="xxx", + aws_access_key_id="xxx", + ) as client: + assert client + response = await client.list_buckets() + assert response + assert "Buckets" in response + assert isinstance(response["Buckets"], list) + assert not response["Buckets"] + + # stop the server, the client shall be unhappy + mocked_s3_server.stop() + with pytest.raises(boto_exceptions.EndpointConnectionError): + response = await client.list_buckets() + + # restart the server and check that the aiobotocore client is connected again + mocked_s3_server.start() + response = await client.list_buckets() + assert response + assert "Buckets" in response + assert isinstance(response["Buckets"], list) + assert not response["Buckets"] diff --git a/services/storage/tests/unit/test_dsm.py b/services/storage/tests/unit/test_dsm.py index 9eb2f8cfa3c..3ba3d4f2bdf 100644 --- a/services/storage/tests/unit/test_dsm.py +++ b/services/storage/tests/unit/test_dsm.py @@ -1,834 +1,72 @@ # pylint: disable=unused-variable # pylint: disable=unused-argument # pylint: disable=redefined-outer-name -# pylint: disable=too-many-arguments -# 
pylint: disable=no-name-in-module -# pylint: disable=no-member -# pylint: disable=too-many-branches -import copy -import datetime -import filecmp -import os -import urllib.request -import uuid +import asyncio from pathlib import Path -from shutil import copyfile -from typing import Any, Callable, Iterator, Optional +from typing import Awaitable, Callable, Optional import pytest -import tests.utils -from simcore_service_storage.access_layer import InvalidFileIdentifier -from simcore_service_storage.constants import DATCORE_STR, SIMCORE_S3_ID, SIMCORE_S3_STR -from simcore_service_storage.dsm import DataStorageManager -from simcore_service_storage.models import FileMetaData, FileMetaDataEx -from simcore_service_storage.s3wrapper.s3_client import MinioClientWrapper -from tests.utils import BUCKET_NAME, USER_ID, has_datcore_tokens +from faker import Faker +from models_library.projects_nodes_io import SimcoreS3FileID +from models_library.users import UserID +from pydantic import ByteSize, parse_obj_as +from simcore_service_storage.models import FileMetaData, S3BucketName +from simcore_service_storage.s3_client import StorageS3Client +from simcore_service_storage.simcore_s3_dsm import SimcoreS3DataManager pytest_simcore_core_services_selection = ["postgres"] -pytest_simcore_ops_services_selection = ["minio", "adminer"] - - -async def test_dsm_s3( - dsm_mockup_db: dict[str, FileMetaData], dsm_fixture: DataStorageManager -): - id_name_map = {} - id_file_count = {} - for d in dsm_mockup_db.keys(): - md = dsm_mockup_db[d] - if not md.user_id in id_name_map: - id_name_map[md.user_id] = md.user_name - id_file_count[md.user_id] = 1 - else: - id_file_count[md.user_id] = id_file_count[md.user_id] + 1 - - dsm = dsm_fixture - - # list files for every user - for _id in id_file_count: - data = await dsm.list_files(user_id=_id, location=SIMCORE_S3_STR) - assert len(data) == id_file_count[_id] - - # Get files from bob from the project biology - bob_id = 0 - for _id, _name in id_name_map.items(): - if _name == "bob": - bob_id = _id - break - assert not bob_id == 0 - - data = await dsm.list_files( - user_id=bob_id, location=SIMCORE_S3_STR, regex="biology" - ) - data1 = await dsm.list_files( - user_id=bob_id, location=SIMCORE_S3_STR, regex="astronomy" - ) - data = data + data1 - bobs_biostromy_files = [] - for d in dsm_mockup_db.keys(): - md = dsm_mockup_db[d] - if md.user_id == bob_id and (md.project_name in ("biology", "astronomy")): - bobs_biostromy_files.append(md) - - assert len(data) == len(bobs_biostromy_files) - - # among bobs bio files, filter by project/node, take first one - - uuid_filter = os.path.join( - bobs_biostromy_files[0].project_id, bobs_biostromy_files[0].node_id - ) - filtered_data = await dsm.list_files( - user_id=bob_id, location=SIMCORE_S3_STR, uuid_filter=str(uuid_filter) - ) - assert filtered_data[0].fmd == bobs_biostromy_files[0] - - for dx in data: - d = dx.fmd - await dsm.delete_file( - user_id=d.user_id, location=SIMCORE_S3_STR, file_uuid=d.file_uuid - ) - - # now we should have less items - new_size = 0 - for _id in id_file_count: - data = await dsm.list_files(user_id=_id, location=SIMCORE_S3_STR) - new_size = new_size + len(data) - - assert len(dsm_mockup_db) == new_size + len(bobs_biostromy_files) - assert len(dsm_mockup_db) == new_size + len(bobs_biostromy_files) +pytest_simcore_ops_services_selection = ["adminer"] @pytest.fixture -def create_file_meta_for_s3( - s3_client: MinioClientWrapper, +async def dsm_mockup_complete_db( + simcore_s3_dsm: SimcoreS3DataManager, + user_id: 
UserID, + upload_file: Callable[ + [ByteSize, str, Optional[SimcoreS3FileID]], + Awaitable[tuple[Path, SimcoreS3FileID]], + ], cleanup_user_projects_file_metadata: None, -) -> Iterator[Callable[..., FileMetaData]]: - def _creator(tmp_file: Path) -> FileMetaData: - bucket_name = BUCKET_NAME - s3_client.create_bucket(bucket_name, delete_contents_if_exists=True) - - # create file and upload - filename = tmp_file.name - project_id = "api" # "357879cc-f65d-48b2-ad6c-074e2b9aa1c7" - project_name = "battlestar" - node_name = "galactica" - node_id = "b423b654-686d-4157-b74b-08fa9d90b36e" - file_name = filename - file_uuid = os.path.join(str(project_id), str(node_id), str(file_name)) - display_name = os.path.join(str(project_name), str(node_name), str(file_name)) - created_at = str(datetime.datetime.now()) - file_size = tmp_file.stat().st_size - - d = { - "object_name": os.path.join(str(project_id), str(node_id), str(file_name)), - "bucket_name": bucket_name, - "file_name": filename, - "user_id": USER_ID, - "user_name": "starbucks", - "location": SIMCORE_S3_STR, - "location_id": SIMCORE_S3_ID, - "project_id": project_id, - "project_name": project_name, - "node_id": node_id, - "node_name": node_name, - "file_uuid": file_uuid, - "file_id": file_uuid, - "raw_file_path": file_uuid, - "display_file_path": display_name, - "created_at": created_at, - "last_modified": created_at, - "file_size": file_size, - } - - fmd = FileMetaData(**d) - - return fmd - - yield _creator - - # cleanup - s3_client.remove_bucket(BUCKET_NAME, delete_contents=True) - - -async def _upload_file( - dsm: DataStorageManager, file_metadata: FileMetaData, file_path: Path -) -> FileMetaData: - up_url = await dsm.upload_link( - file_metadata.user_id, file_metadata.file_uuid, as_presigned_link=True - ) - assert file_path.exists() - with file_path.open("rb") as fp: - d = fp.read() - req = urllib.request.Request(up_url, data=d, method="PUT") - with urllib.request.urlopen(req) as _f: - entity_tag = _f.headers.get("ETag") - assert entity_tag is not None - file_metadata.entity_tag = entity_tag.strip('"') - return file_metadata - - -async def test_update_metadata_from_storage( - postgres_dsn_url: str, - s3_client: MinioClientWrapper, - mock_files_factory: Callable[[int], list[Path]], - dsm_fixture: DataStorageManager, - create_file_meta_for_s3: Callable, -): - tmp_file = mock_files_factory(1)[0] - fmd: FileMetaData = create_file_meta_for_s3(tmp_file) - fmd = await _upload_file(dsm_fixture, fmd, Path(tmp_file)) - - assert ( - await dsm_fixture.try_update_database_from_storage( # pylint: disable=protected-access - "some_fake_uuid", fmd.bucket_name, fmd.object_name, reraise_exceptions=False - ) - is None - ) - - assert ( - await dsm_fixture.try_update_database_from_storage( # pylint: disable=protected-access - fmd.file_uuid, "some_fake_bucket", fmd.object_name, reraise_exceptions=False - ) - is None - ) - - assert ( - await dsm_fixture.try_update_database_from_storage( # pylint: disable=protected-access - fmd.file_uuid, fmd.bucket_name, "some_fake_object", reraise_exceptions=False - ) - is None - ) - - file_metadata: Optional[ - FileMetaDataEx - ] = await dsm_fixture.try_update_database_from_storage( # pylint: disable=protected-access - fmd.file_uuid, fmd.bucket_name, fmd.object_name, reraise_exceptions=False - ) - assert file_metadata is not None - assert file_metadata.fmd.file_size == Path(tmp_file).stat().st_size - assert file_metadata.fmd.entity_tag == fmd.entity_tag - - -async def test_links_s3( - postgres_dsn_url: str, - s3_client: 
MinioClientWrapper, - mock_files_factory: Callable[[int], list[Path]], - dsm_fixture: DataStorageManager, - create_file_meta_for_s3: Callable, -): - - tmp_file = mock_files_factory(1)[0] - fmd: FileMetaData = create_file_meta_for_s3(tmp_file) - - dsm = dsm_fixture - - fmd = await _upload_file(dsm_fixture, fmd, Path(tmp_file)) - - # test wrong user - assert await dsm.list_file("654654654", fmd.location, fmd.file_uuid) is None - - # test wrong location - assert await dsm.list_file(fmd.user_id, "whatever_location", fmd.file_uuid) is None - - # test wrong file uuid - with pytest.raises(InvalidFileIdentifier): - await dsm.list_file(fmd.user_id, fmd.location, "some_fake_uuid") - # use correctly - file_metadata: Optional[FileMetaDataEx] = await dsm.list_file( - fmd.user_id, fmd.location, fmd.file_uuid - ) - assert file_metadata is not None - excluded_fields = [ - "project_id", - "project_name", - "node_name", - "user_name", - "display_file_path", - "created_at", - "last_modified", - ] - for field in FileMetaData.__attrs_attrs__: - if field.name not in excluded_fields: - if field.name == "location_id": - assert int( - file_metadata.fmd.__getattribute__(field.name) - ) == fmd.__getattribute__( - field.name - ), f"{field.name}: expected {fmd.__getattribute__(field.name)} vs {file_metadata.fmd.__getattribute__(field.name)}" - else: - assert file_metadata.fmd.__getattribute__( - field.name - ) == fmd.__getattribute__( - field.name - ), f"{field.name}: expected {fmd.__getattribute__(field.name)} vs {file_metadata.fmd.__getattribute__(field.name)}" - - tmp_file2 = f"{tmp_file}.rec" - user_id = 0 - down_url = await dsm.download_link_s3( - fmd.file_uuid, user_id, as_presigned_link=True - ) - - urllib.request.urlretrieve(down_url, tmp_file2) - - assert filecmp.cmp(tmp_file2, tmp_file) - - -async def test_copy_s3_s3( - postgres_dsn_url: str, - s3_client: MinioClientWrapper, - mock_files_factory: Callable[[int], list[Path]], - dsm_fixture: DataStorageManager, - create_file_meta_for_s3: Callable, -): - - tmp_file = mock_files_factory(1)[0] - fmd = create_file_meta_for_s3(tmp_file) - - dsm = dsm_fixture - data = await dsm.list_files(user_id=fmd.user_id, location=SIMCORE_S3_STR) - assert len(data) == 0 - - # upload the file - up_url = await dsm.upload_link(fmd.user_id, fmd.file_uuid, as_presigned_link=True) - with tmp_file.open("rb") as fp: - d = fp.read() - req = urllib.request.Request(up_url, data=d, method="PUT") - with urllib.request.urlopen(req) as _f: - pass - - data = await dsm.list_files(user_id=fmd.user_id, location=SIMCORE_S3_STR) - assert len(data) == 1 - - from_uuid = fmd.file_uuid - new_project = "zoology" - to_uuid = os.path.join(new_project, fmd.node_id, fmd.file_name) - await dsm.copy_file( - user_id=fmd.user_id, - dest_location=SIMCORE_S3_STR, - dest_uuid=to_uuid, - source_location=SIMCORE_S3_STR, - source_uuid=from_uuid, - ) - - data = await dsm.list_files(user_id=fmd.user_id, location=SIMCORE_S3_STR) - - assert len(data) == 2 - - -# NOTE: Below tests directly access the datcore platform, use with care! 
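The removed tests above push bytes to presigned PUT links synchronously with `urllib.request`, while the new helpers in `services/storage/tests/helpers/file_utils.py` stream the body with `aiohttp` and read the entity tag from the `ETag` response header. A minimal sketch of that streaming pattern, assuming only a generic presigned PUT URL and a local file (the names `put_file`, `presigned_url` and `local_file` are illustrative and not part of this diff):

```python
# Sketch only: stream a local file to a presigned PUT URL and return the object's ETag.
# Assumes aiohttp and aiofiles are available, as in the test helpers added by this PR.
from pathlib import Path

import aiofiles
from aiohttp import ClientSession


async def put_file(presigned_url: str, local_file: Path) -> str:
    async def _chunks():
        # stream the file in 1 MiB chunks instead of reading it fully into memory
        async with aiofiles.open(local_file, "rb") as f:
            while chunk := await f.read(1024 * 1024):
                yield chunk

    async with ClientSession() as session:
        response = await session.put(
            presigned_url,
            data=_chunks(),
            headers={"Content-Length": f"{local_file.stat().st_size}"},
        )
        response.raise_for_status()
        # S3-compatible backends return the entity tag of the new object in the ETag header
        return response.headers["ETag"]


# usage (inside an event loop): etag = await put_file(presigned_url, Path("data.bin"))
```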
-@pytest.mark.skipif(not has_datcore_tokens(), reason="no datcore tokens") -def test_datcore_fixture(datcore_structured_testbucket): - print(datcore_structured_testbucket) - - -@pytest.mark.skipif(not has_datcore_tokens(), reason="no datcore tokens") -async def test_dsm_datcore( - postgres_dsn_url, dsm_fixture, datcore_structured_testbucket -): - dsm = dsm_fixture - user_id = "0" - data = await dsm.list_files( - user_id=user_id, location=DATCORE_STR, uuid_filter=BUCKET_NAME - ) - # the fixture creates 3 files - assert len(data) == 3 - - # delete the first one - fmd_to_delete = data[0].fmd - print("Deleting", fmd_to_delete.bucket_name, fmd_to_delete.object_name) - is_deleted = await dsm.delete_file(user_id, DATCORE_STR, fmd_to_delete.file_id) - assert is_deleted - - import time - - time.sleep(1) # FIXME: takes some time to delete!! - - data = await dsm.list_files( - user_id=user_id, location=DATCORE_STR, uuid_filter=BUCKET_NAME - ) - assert len(data) == 2 - - -@pytest.mark.skipif(not has_datcore_tokens(), reason="no datcore tokens") -async def test_dsm_s3_to_datcore( - postgres_dsn_url: str, - s3_client: MinioClientWrapper, - mock_files_factory: Callable[[int], list[Path]], - dsm_fixture: DataStorageManager, - datcore_structured_testbucket: str, - create_file_meta_for_s3: Callable, -): - tmp_file = mock_files_factory(1)[0] - - fmd = create_file_meta_for_s3(tmp_file) - - dsm = dsm_fixture - - up_url = await dsm.upload_link(fmd.user_id, fmd.file_uuid, as_presigned_link=True) - with tmp_file.open("rb") as fp: - d = fp.read() - req = urllib.request.Request(up_url, data=d, method="PUT") - with urllib.request.urlopen(req) as _f: - pass - - # given the fmd, upload to datcore - tmp_file2 = f"{tmp_file}.fordatcore" - user_id = USER_ID - down_url = await dsm.download_link_s3( - fmd.file_uuid, user_id, as_presigned_link=True - ) - urllib.request.urlretrieve(down_url, tmp_file2) - assert filecmp.cmp(tmp_file2, tmp_file) - # now we have the file locally, upload the file - await dsm.upload_file_to_datcore( - user_id, - tmp_file2, - datcore_structured_testbucket["dataset_id"], - ) - # and into a deeper strucutre - await dsm.upload_file_to_datcore( - user_id, - tmp_file2, - datcore_structured_testbucket["coll2_id"], - ) - - # FIXME: upload takes some time - import time - - time.sleep(1) - - data = await dsm.list_files( - user_id=user_id, location=DATCORE_STR, uuid_filter=BUCKET_NAME - ) - # there should now be 5 files - assert len(data) == 5 - - -@pytest.mark.skipif(not has_datcore_tokens(), reason="no datcore tokens") -async def test_dsm_datcore_to_local( - postgres_dsn_url, - dsm_fixture: DataStorageManager, - mock_files_factory: Callable[[int], list[Path]], - datcore_structured_testbucket, -): - - dsm = dsm_fixture - user_id = USER_ID - data = await dsm.list_files( - user_id=user_id, location=DATCORE_STR, uuid_filter=BUCKET_NAME - ) - assert len(data) - - url, filename = await dsm.download_link_datcore( - user_id, datcore_structured_testbucket["file_id1"] - ) - - tmp_file = mock_files_factory(1)[0] - tmp_file2 = f"{tmp_file}.fromdatcore" - - urllib.request.urlretrieve(url, tmp_file2) - - assert filecmp.cmp(tmp_file2, tmp_file) - - -@pytest.mark.skipif(not has_datcore_tokens(), reason="no datcore tokens") -async def test_dsm_datcore_to_S3( - postgres_dsn_url: str, - s3_client: MinioClientWrapper, - dsm_fixture: DataStorageManager, - mock_files_factory: Callable[[int], list[Path]], - datcore_structured_testbucket: str, - create_file_meta_for_s3: Callable, -): - # create temporary file - tmp_file = 
mock_files_factory(1)[0] - dest_fmd = create_file_meta_for_s3(tmp_file) - user_id = dest_fmd.user_id - dest_uuid = dest_fmd.file_uuid - - dsm = dsm_fixture - - s3_data = await dsm.list_files(user_id=user_id, location=SIMCORE_S3_STR) - assert len(s3_data) == 0 - - dc_data = await dsm.list_files( - user_id=user_id, location=DATCORE_STR, uuid_filter=BUCKET_NAME - ) - assert len(dc_data) == 3 - src_fmd = dc_data[0] - - await dsm.copy_file( - user_id=user_id, - dest_location=SIMCORE_S3_STR, - dest_uuid=dest_uuid, - source_location=DATCORE_STR, - source_uuid=datcore_structured_testbucket["file_id1"], - ) - - s3_data = await dsm.list_files(user_id=user_id, location=SIMCORE_S3_STR) - assert len(s3_data) == 1 - - # now download the original file - tmp_file1 = f"{tmp_file}.fromdatcore" - down_url_dc, filename = await dsm.download_link_datcore( - user_id, datcore_structured_testbucket["file_id1"] - ) - urllib.request.urlretrieve(down_url_dc, tmp_file1) - - # and the one on s3 - tmp_file2 = f"{tmp_file}.fromS3" - down_url_s3 = await dsm.download_link_s3(dest_uuid, user_id, as_presigned_link=True) - urllib.request.urlretrieve(down_url_s3, tmp_file2) - - assert filecmp.cmp(tmp_file1, tmp_file2) - - -@pytest.mark.skipif(not has_datcore_tokens(), reason="no datcore tokens") -async def test_copy_datcore( - postgres_dsn_url: str, - s3_client: MinioClientWrapper, - dsm_fixture: DataStorageManager, - mock_files_factory: Callable[[int], list[Path]], - datcore_structured_testbucket: str, - create_file_meta_for_s3: Callable, -): - # the fixture should provide 3 files - dsm = dsm_fixture - user_id = USER_ID - data = await dsm.list_files( - user_id=user_id, location=DATCORE_STR, uuid_filter=BUCKET_NAME - ) - assert len(data) == 3 - - # create temporary file and upload to s3 - tmp_file = mock_files_factory(1)[0] - fmd = create_file_meta_for_s3(tmp_file) - - up_url = await dsm.upload_link(fmd.user_id, fmd.file_uuid, as_presigned_link=True) - with tmp_file.open("rb") as fp: - d = fp.read() - req = urllib.request.Request(up_url, data=d, method="PUT") - with urllib.request.urlopen(req) as _f: - pass - - # now copy to datcore - dat_core_uuid = os.path.join(BUCKET_NAME, fmd.file_name) - - await dsm.copy_file( - user_id=user_id, - dest_location=DATCORE_STR, - dest_uuid=datcore_structured_testbucket["coll2_id"], - source_location=SIMCORE_S3_STR, - source_uuid=fmd.file_uuid, - ) - - data = await dsm.list_files( - user_id=user_id, location=DATCORE_STR, uuid_filter=BUCKET_NAME - ) - - # there should now be 4 files - assert len(data) == 4 - - -def test_fmd_build(): - file_uuid = str(Path("api") / Path("abcd") / Path("xx.dat")) - fmd = FileMetaData() - fmd.simcore_from_uuid(file_uuid, "test-bucket") - - assert not fmd.node_id - assert not fmd.project_id - assert fmd.file_name == "xx.dat" - assert fmd.object_name == "api/abcd/xx.dat" - assert fmd.file_uuid == file_uuid - assert fmd.location == SIMCORE_S3_STR - assert fmd.location_id == SIMCORE_S3_ID - assert fmd.bucket_name == "test-bucket" - - file_uuid = f"{uuid.uuid4()}/{uuid.uuid4()}/xx.dat" - fmd.simcore_from_uuid(file_uuid, "test-bucket") - - assert fmd.node_id == file_uuid.split("/")[1] - assert fmd.project_id == file_uuid.split("/")[0] - assert fmd.file_name == "xx.dat" - assert fmd.object_name == file_uuid - assert fmd.file_uuid == file_uuid - assert fmd.location == SIMCORE_S3_STR - assert fmd.location_id == SIMCORE_S3_ID - assert fmd.bucket_name == "test-bucket" - - -async def test_dsm_complete_db( - dsm_fixture: DataStorageManager, - dsm_mockup_complete_db: 
tuple[dict[str, str], dict[str, str]], -): - dsm = dsm_fixture - _id = "21" - dsm.has_project_db = True - data = await dsm.list_files(user_id=_id, location=SIMCORE_S3_STR) - - assert len(data) == 2 - for dx in data: - d = dx.fmd - assert d.display_file_path - assert d.node_name - assert d.project_name - assert d.raw_file_path - - -async def test_delete_data_folders( - dsm_fixture: DataStorageManager, - dsm_mockup_complete_db: tuple[dict[str, str], dict[str, str]], -): - file_1, file_2 = dsm_mockup_complete_db - _id = "21" - data = await dsm_fixture.list_files(user_id=_id, location=SIMCORE_S3_STR) - response = await dsm_fixture.delete_project_simcore_s3( - user_id=_id, project_id=file_1["project_id"], node_id=file_1["node_id"] - ) - data = await dsm_fixture.list_files(user_id=_id, location=SIMCORE_S3_STR) - assert len(data) == 1 - assert data[0].fmd.file_name == file_2["filename"] - response = await dsm_fixture.delete_project_simcore_s3( - user_id=_id, project_id=file_1["project_id"], node_id=None - ) - data = await dsm_fixture.list_files(user_id=_id, location=SIMCORE_S3_STR) - assert not data - - -@pytest.mark.skipif(not has_datcore_tokens(), reason="no datcore tokens") -async def test_deep_copy_project_simcore_s3( - dsm_fixture, s3_client, postgres_dsn_url, datcore_structured_testbucket -): - dsm = dsm_fixture - - tests.utils.fill_tables_from_csv_files(url=postgres_dsn_url) - - path_in_datcore = datcore_structured_testbucket["file_id3"] - file_name_in_datcore = Path(datcore_structured_testbucket["filename3"]).name - user_id = USER_ID - - source_project = { - "uuid": "de2578c5-431e-4d5e-b80e-401c8066782f", - "name": "ISAN: 2D Plot", - "description": "2D RawGraphs viewer with one input", - "thumbnail": "", - "prjOwner": "my.email@osparc.io", - "creationDate": "2019-05-24T10:36:57.813Z", - "lastChangeDate": "2019-05-24T11:36:12.015Z", - "workbench": { - "de2578c5-431e-48eb-a9d2-aaad6b72400a": { - "key": "simcore/services/frontend/file-picker", - "version": "1.0.0", - "label": "File Picker", - "inputs": {}, - "inputNodes": [], - "outputs": { - "outFile": { - "store": 1, - "path": "N:package:ab8c214d-a596-401f-a90c-9c50e3c048b0", - } - }, - "progress": 100, - "thumbnail": "", - "position": {"x": 100, "y": 100}, - }, - "de2578c5-431e-4c63-a705-03a2c339646c": { - "key": "simcore/services/dynamic/raw-graphs", - "version": "2.8.0", - "label": "2D plot", - "inputs": { - "input_1": { - "nodeUuid": "de2578c5-431e-48eb-a9d2-aaad6b72400a", - "output": "outFile", - } - }, - "inputNodes": ["de2578c5-431e-48eb-a9d2-aaad6b72400a"], - "outputs": {}, - "progress": 0, - "thumbnail": "", - "position": {"x": 400, "y": 100}, - }, - }, - } - - bucket_name = BUCKET_NAME - s3_client.create_bucket(bucket_name, delete_contents_if_exists=True) - - source_project["workbench"]["de2578c5-431e-48eb-a9d2-aaad6b72400a"]["outputs"][ - "outFile" - ]["path"] = path_in_datcore - - destination_project = copy.deepcopy(source_project) - source_project_id = source_project["uuid"] - destination_project["uuid"] = source_project_id.replace("template", "deep-copy") - destination_project["workbench"] = {} - - node_mapping = {} - - for node_id, node in source_project["workbench"].items(): - object_name = str( - Path(source_project_id) / Path(node_id) / Path(node_id + ".dat") - ) - f = tests.utils.data_dir() / Path("notebooks.zip") - s3_client.upload_file(bucket_name, object_name, f) - key = node_id.replace("template", "deep-copy") - destination_project["workbench"][key] = node - node_mapping[node_id] = key - - status = await 
dsm.deep_copy_project_simcore_s3( - user_id, source_project, destination_project, node_mapping + faker: Faker, +) -> tuple[FileMetaData, FileMetaData]: + file_size = parse_obj_as(ByteSize, "10Mib") + uploaded_files = await asyncio.gather( + *(upload_file(file_size, faker.file_name(), None) for _ in range(2)) ) - - new_path = destination_project["workbench"][ - "deep-copy-uuid-48eb-a9d2-aaad6b72400a" - ]["outputs"]["outFile"]["path"] - assert new_path != path_in_datcore - assert Path(new_path).name == file_name_in_datcore - files = await dsm.list_files(user_id=user_id, location=SIMCORE_S3_STR) - assert len(files) == 3 - # one of the files in s3 should be the dowloaded one from datcore - assert any( - f.fmd.file_name == Path(datcore_structured_testbucket["filename3"]).name - for f in files + fmds = await asyncio.gather( + *(simcore_s3_dsm.get_file(user_id, file_id) for _, file_id in uploaded_files) ) + assert len(fmds) == 2 - response = await dsm.delete_project_simcore_s3(user_id, destination_project["uuid"]) - - files = await dsm.list_files(user_id=user_id, location=SIMCORE_S3_STR) - assert len(files) == 0 - - -async def test_dsm_list_datasets_s3(dsm_fixture, dsm_mockup_complete_db): - dsm_fixture.has_project_db = True - - datasets = await dsm_fixture.list_datasets(user_id="21", location=SIMCORE_S3_STR) - - assert len(datasets) == 1 - assert any("Kember" in d.display_name for d in datasets) + return (fmds[0], fmds[1]) async def test_sync_table_meta_data( - dsm_fixture: DataStorageManager, - dsm_mockup_complete_db: tuple[dict[str, str], dict[str, str]], - s3_client: MinioClientWrapper, + simcore_s3_dsm: SimcoreS3DataManager, + dsm_mockup_complete_db: tuple[FileMetaData, FileMetaData], + storage_s3_client: StorageS3Client, + storage_s3_bucket: S3BucketName, ): - dsm_fixture.has_project_db = True - expected_removed_files = [] # the list should be empty on start - list_changes: dict[str, Any] = await dsm_fixture.synchronise_meta_data_table( - location=SIMCORE_S3_STR, dry_run=True - ) - assert "removed" in list_changes - assert list_changes["removed"] == expected_removed_files + list_changes = await simcore_s3_dsm.synchronise_meta_data_table(dry_run=True) + assert list_changes == expected_removed_files # now remove the files for file_entry in dsm_mockup_complete_db: - s3_key = f"{file_entry['project_id']}/{file_entry['node_id']}/{file_entry['filename']}" - s3_client.remove_objects(BUCKET_NAME, [s3_key]) + s3_key = f"{file_entry.project_id}/{file_entry.node_id}/{file_entry.file_name}" + await storage_s3_client.client.delete_object( + Bucket=storage_s3_bucket, Key=s3_key + ) expected_removed_files.append(s3_key) # the list should now contain the removed entries - list_changes: dict[str, Any] = await dsm_fixture.synchronise_meta_data_table( - location=SIMCORE_S3_STR, dry_run=True - ) - assert "removed" in list_changes - assert list_changes["removed"] == expected_removed_files + list_changes = await simcore_s3_dsm.synchronise_meta_data_table(dry_run=True) + assert set(list_changes) == set(expected_removed_files) # now effectively call the function should really remove the files - list_changes: dict[str, Any] = await dsm_fixture.synchronise_meta_data_table( - location=SIMCORE_S3_STR, dry_run=False - ) + list_changes = await simcore_s3_dsm.synchronise_meta_data_table(dry_run=False) # listing again will show an empty list again - list_changes: dict[str, Any] = await dsm_fixture.synchronise_meta_data_table( - location=SIMCORE_S3_STR, dry_run=True - ) - assert "removed" in list_changes - assert 
list_changes["removed"] == [] - - -@pytest.mark.skipif(not has_datcore_tokens(), reason="no datcore tokens") -async def test_dsm_list_datasets_datcore( - dsm_fixture: DataStorageManager, datcore_structured_testbucket: str -): - datasets = await dsm_fixture.list_datasets(user_id=USER_ID, location=DATCORE_STR) - - assert len(datasets) - assert any(BUCKET_NAME in d.display_name for d in datasets) - - -async def test_dsm_list_dataset_files_s3( - dsm_fixture: DataStorageManager, - dsm_mockup_complete_db: tuple[dict[str, str], dict[str, str]], -): - dsm_fixture.has_project_db = True - - datasets = await dsm_fixture.list_datasets(user_id="21", location=SIMCORE_S3_STR) - assert len(datasets) == 1 - assert any("Kember" in d.display_name for d in datasets) - for d in datasets: - files = await dsm_fixture.list_files_dataset( - user_id="21", location=SIMCORE_S3_STR, dataset_id=d.dataset_id - ) - if "Kember" in d.display_name: - assert len(files) == 2 - else: - assert len(files) == 0 - - if files: - found = await dsm_fixture.search_files_starting_with( - user_id="21", prefix=files[0].fmd.file_uuid - ) - assert found - assert len(found) == 1 - assert found[0].fmd.file_uuid == files[0].fmd.file_uuid - assert found[0].parent_id == files[0].parent_id - assert found[0].fmd.node_id == files[0].fmd.node_id - # NOTE: found and files differ in these attributes - # ['project_name', 'node_name', 'file_id', 'raw_file_path', 'display_file_path'] - # because these are added artificially in list_files - - -@pytest.mark.skipif(not has_datcore_tokens(), reason="no datcore tokens") -async def test_dsm_list_dataset_files_datcore( - dsm_fixture: DataStorageManager, datcore_structured_testbucket: str -): - datasets = await dsm_fixture.list_datasets(user_id=USER_ID, location=DATCORE_STR) - - assert len(datasets) - assert any(BUCKET_NAME in d.display_name for d in datasets) - - for d in datasets: - files = await dsm_fixture.list_files_dataset( - user_id=USER_ID, location=DATCORE_STR, dataset_id=d.dataset_id - ) - if BUCKET_NAME in d.display_name: - assert len(files) == 3 - - -@pytest.mark.skip(reason="develop only") -@pytest.mark.skipif(not has_datcore_tokens(), reason="no datcore tokens") -async def test_download_links( - datcore_structured_testbucket: str, - s3_client: MinioClientWrapper, - mock_files_factory: Callable[[int], list[Path]], -): - s3_client.create_bucket(BUCKET_NAME, delete_contents_if_exists=True) - _file = mock_files_factory(1)[0] - - s3_client.upload_file(BUCKET_NAME, "test.txt", f"{_file}") - link = s3_client.create_presigned_get_url(BUCKET_NAME, "test.txt") - print(link) - - dcw = datcore_structured_testbucket["dcw"] - - endings = ["txt", "json", "zip", "dat", "mat"] - counter = 1 - for e in endings: - file_name = "test{}.{}".format(counter, e) - copyfile(_file, file_name) - dataset_id = datcore_structured_testbucket["dataset_id"] - file_id = await dcw.upload_file_to_id(dataset_id, file_name) - link, _file_name = await dcw.download_link_by_id(file_id) - print(_file_name, link) - os.remove(file_name) + list_changes = await simcore_s3_dsm.synchronise_meta_data_table(dry_run=True) + assert list_changes == [] diff --git a/services/storage/tests/unit/test_dsm_cleaner.py b/services/storage/tests/unit/test_dsm_cleaner.py new file mode 100644 index 00000000000..477d859ded5 --- /dev/null +++ b/services/storage/tests/unit/test_dsm_cleaner.py @@ -0,0 +1,52 @@ +# pylint: disable=redefined-outer-name +# pylint: disable=unused-argument +# pylint: disable=unused-variable + +import asyncio +from unittest import mock + 
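The new `test_dsm_cleaner.py` module that begins here checks that a `dsm_cleaner_task` background task is registered at startup, can be disabled through `STORAGE_CLEANER_INTERVAL_S`, and keeps running after one of its iterations raises. For orientation, a generic sketch of such a self-healing periodic task, assuming nothing beyond asyncio (this is not the service's actual implementation):

```python
# Sketch only: a periodic background task that survives exceptions raised by a single
# iteration, which is the behaviour test_dsm_cleaner_task_restarts_if_error relies on.
import asyncio
import logging
from typing import Awaitable, Callable

log = logging.getLogger(__name__)


async def _periodic(interval_s: float, job: Callable[[], Awaitable[None]]) -> None:
    while True:
        try:
            await job()
        except asyncio.CancelledError:
            raise  # allow clean shutdown
        except Exception:  # pylint: disable=broad-except
            log.exception("cleaner iteration failed, retrying on next tick")
        await asyncio.sleep(interval_s)


def start_cleaner(interval_s: float, job: Callable[[], Awaitable[None]]) -> asyncio.Task:
    # naming the task lets tests discover it via asyncio.all_tasks()
    return asyncio.create_task(_periodic(interval_s, job), name="dsm_cleaner_task")
```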
+import pytest +from aiohttp.test_utils import TestClient +from pytest_mock import MockerFixture + +pytest_simcore_core_services_selection = ["postgres"] +pytest_simcore_ops_services_selection = ["adminer"] + + +@pytest.fixture +def disable_dsm_cleaner(monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv("STORAGE_CLEANER_INTERVAL_S", "null") + + +@pytest.fixture +def mocked_dsm_clean(mocker: MockerFixture) -> mock.Mock: + return mocker.patch( + "simcore_service_storage.dsm_cleaner.SimcoreS3DataManager.clean_expired_uploads", + autospec=True, + side_effect=RuntimeError, + ) + + +@pytest.fixture +def short_dsm_cleaner_interval(monkeypatch: pytest.MonkeyPatch) -> int: + monkeypatch.setenv("STORAGE_CLEANER_INTERVAL_S", "1") + return 1 + + +async def test_setup_dsm_cleaner(client: TestClient): + all_tasks = asyncio.all_tasks() + assert any(t.get_name().startswith("dsm_cleaner_task") for t in all_tasks) + + +async def test_disable_dsm_cleaner(disable_dsm_cleaner, client: TestClient): + all_tasks = asyncio.all_tasks() + assert not any(t.get_name().startswith("dsm_cleaner_task") for t in all_tasks) + + +async def test_dsm_cleaner_task_restarts_if_error( + mocked_dsm_clean: mock.Mock, short_dsm_cleaner_interval: int, client: TestClient +): + num_calls = mocked_dsm_clean.call_count + await asyncio.sleep(short_dsm_cleaner_interval + 1) + mocked_dsm_clean.assert_called() + assert mocked_dsm_clean.call_count > num_calls diff --git a/services/storage/tests/unit/test_dsm_dsmcleaner.py b/services/storage/tests/unit/test_dsm_dsmcleaner.py new file mode 100644 index 00000000000..114f6409498 --- /dev/null +++ b/services/storage/tests/unit/test_dsm_dsmcleaner.py @@ -0,0 +1,148 @@ +# pylint: disable=unused-variable +# pylint: disable=unused-argument +# pylint: disable=redefined-outer-name +# pylint: disable=too-many-arguments +# pylint: disable=no-name-in-module +# pylint: disable=no-member +# pylint: disable=too-many-branches + +import asyncio +import datetime +from pathlib import Path +from typing import Awaitable, Callable, Optional + +import pytest +from aiopg.sa.engine import Engine +from faker import Faker +from models_library.api_schemas_storage import LinkType +from models_library.projects_nodes_io import SimcoreS3FileID +from models_library.users import UserID +from pydantic import ByteSize, parse_obj_as +from pytest_simcore.helpers.utils_parametrizations import byte_size_ids +from simcore_postgres_database.storage_models import file_meta_data +from simcore_service_storage import db_file_meta_data +from simcore_service_storage.exceptions import FileMetaDataNotFoundError +from simcore_service_storage.models import S3BucketName +from simcore_service_storage.s3_client import StorageS3Client +from simcore_service_storage.simcore_s3_dsm import SimcoreS3DataManager + +pytest_simcore_core_services_selection = ["postgres"] +pytest_simcore_ops_services_selection = ["adminer"] + + +@pytest.fixture +def disabled_dsm_cleaner_task(monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv("STORAGE_CLEANER_INTERVAL_S", "0") + + +@pytest.mark.parametrize( + "file_size", + [ByteSize(0), parse_obj_as(ByteSize, "10Mib"), parse_obj_as(ByteSize, "100Mib")], + ids=byte_size_ids, +) +@pytest.mark.parametrize("link_type", [LinkType.S3, LinkType.PRESIGNED]) +async def test_clean_expired_uploads_deletes_expired_pending_uploads( + disabled_dsm_cleaner_task, + aiopg_engine: Engine, + simcore_s3_dsm: SimcoreS3DataManager, + simcore_file_id: SimcoreS3FileID, + user_id: UserID, + link_type: LinkType, + file_size: ByteSize, + 
storage_s3_client: StorageS3Client, + storage_s3_bucket: S3BucketName, +): + """In this test we create valid upload links and check that once + expired they get properly deleted""" + await simcore_s3_dsm.create_file_upload_link(user_id, simcore_file_id, link_type) + # ensure the database is correctly set up + async with aiopg_engine.acquire() as conn: + fmd = await db_file_meta_data.get(conn, simcore_file_id) + assert fmd + assert fmd.upload_expires_at + + # now run the cleaner, nothing should happen since the expiration was set to the default of 3600 + await simcore_s3_dsm.clean_expired_uploads() + # check the entries are still the same + async with aiopg_engine.acquire() as conn: + fmd_after_clean = await db_file_meta_data.get(conn, simcore_file_id) + assert fmd_after_clean == fmd + + # now change the upload_expires_at entry to simulate an expired entry + async with aiopg_engine.acquire() as conn: + await conn.execute( + file_meta_data.update() + .where(file_meta_data.c.file_id == simcore_file_id) + .values(upload_expires_at=datetime.datetime.utcnow()) + ) + await asyncio.sleep(1) + await simcore_s3_dsm.clean_expired_uploads() + + # check the entries were removed + async with aiopg_engine.acquire() as conn: + with pytest.raises(FileMetaDataNotFoundError): + await db_file_meta_data.get(conn, simcore_file_id) + + +@pytest.mark.parametrize( + "file_size", + [parse_obj_as(ByteSize, "10Mib"), parse_obj_as(ByteSize, "100Mib")], + ids=byte_size_ids, +) +@pytest.mark.parametrize("link_type", [LinkType.S3, LinkType.PRESIGNED]) +async def test_clean_expired_uploads_reverts_to_last_known_version_expired_pending_uploads( + disabled_dsm_cleaner_task, + upload_file: Callable[ + [ByteSize, str, Optional[SimcoreS3FileID]], + Awaitable[tuple[Path, SimcoreS3FileID]], + ], + aiopg_engine: Engine, + simcore_s3_dsm: SimcoreS3DataManager, + user_id: UserID, + link_type: LinkType, + file_size: ByteSize, + storage_s3_client: StorageS3Client, + storage_s3_bucket: S3BucketName, + faker: Faker, +): + """In this test we first upload a file to have a valid entry, then we trigger + a new upload of the VERY SAME FILE, expire it, and make sure the cleaner reverts + to the last known version of the file""" + file, file_id = await upload_file(file_size, faker.file_name(), None) + async with aiopg_engine.acquire() as conn: + original_fmd = await db_file_meta_data.get(conn, file_id) + + # now create a new link to the VERY SAME FILE UUID + await simcore_s3_dsm.create_file_upload_link(user_id, file_id, link_type) + # ensure the database is correctly set up + async with aiopg_engine.acquire() as conn: + fmd = await db_file_meta_data.get(conn, file_id) + assert fmd + assert fmd.upload_expires_at + + # now run the cleaner, nothing should happen since the expiration was set to the default of 3600 + await simcore_s3_dsm.clean_expired_uploads() + # check the entries are still the same + async with aiopg_engine.acquire() as conn: + fmd_after_clean = await db_file_meta_data.get(conn, file_id) + assert fmd_after_clean == fmd + + # now change the upload_expires_at entry to simulate an expired entry + async with aiopg_engine.acquire() as conn: + await conn.execute( + file_meta_data.update() + .where(file_meta_data.c.file_id == file_id) + .values(upload_expires_at=datetime.datetime.utcnow()) + ) + await asyncio.sleep(1) + await simcore_s3_dsm.clean_expired_uploads() + + # check the entries were reverted + async with aiopg_engine.acquire() as conn: + reverted_fmd = await db_file_meta_data.get(conn, file_id) + assert 
original_fmd.dict(exclude={"created_at"}) == reverted_fmd.dict( + exclude={"created_at"} + ) + # check the S3 content is the old file + s3_meta_data = await storage_s3_client.get_file_metadata(storage_s3_bucket, file_id) + assert s3_meta_data.size == file_size diff --git a/services/storage/tests/unit/test_dsm_soft_links.py b/services/storage/tests/unit/test_dsm_soft_links.py index fa274c30dd2..d878d7ca8a4 100644 --- a/services/storage/tests/unit/test_dsm_soft_links.py +++ b/services/storage/tests/unit/test_dsm_soft_links.py @@ -2,79 +2,79 @@ # pylint: disable=unused-argument # pylint: disable=unused-variable +import uuid +from functools import lru_cache from typing import AsyncIterator -import attr import pytest from aiopg.sa.engine import Engine -from simcore_service_storage.constants import SIMCORE_S3_STR -from simcore_service_storage.dsm import DataStorageManager -from simcore_service_storage.models import FileMetaData, FileMetaDataEx, file_meta_data -from simcore_service_storage.utils import create_resource_uuid +from models_library.api_schemas_storage import S3BucketName +from models_library.projects_nodes_io import SimcoreS3FileID +from models_library.users import UserID +from models_library.utils.fastapi_encoders import jsonable_encoder +from pydantic import ByteSize +from simcore_postgres_database.storage_models import file_meta_data +from simcore_service_storage.models import FileMetaData, FileMetaDataAtDB +from simcore_service_storage.simcore_s3_dsm import SimcoreS3DataManager from sqlalchemy.sql.expression import literal_column pytest_simcore_core_services_selection = ["postgres"] -pytest_simcore_ops_services_selection = ["minio"] - - -@pytest.fixture -def storage(aiopg_engine: Engine) -> DataStorageManager: - - return DataStorageManager( - s3_client=None, - engine=aiopg_engine, - loop=None, - pool=None, - simcore_bucket_name="master-simcore", - has_project_db=True, - app=None, - ) # type: ignore +pytest_simcore_ops_services_selection = ["adminer"] @pytest.fixture() async def output_file( - user_id: int, project_id: str, aiopg_engine: Engine + user_id: UserID, project_id: str, aiopg_engine: Engine ) -> AsyncIterator[FileMetaData]: node_id = "fd6f9737-1988-341b-b4ac-0614b646fa82" # pylint: disable=no-value-for-parameter - file = FileMetaData() - file.simcore_from_uuid( - f"{project_id}/{node_id}/filename.txt", bucket_name="master-simcore" + file = FileMetaData.from_simcore_node( + user_id=user_id, + file_id=SimcoreS3FileID(f"{project_id}/{node_id}/filename.txt"), + bucket=S3BucketName("master-simcore"), + location_id=SimcoreS3DataManager.get_location_id(), + location_name=SimcoreS3DataManager.get_location_name(), ) file.entity_tag = "df9d868b94e53d18009066ca5cd90e9f" - file.file_size = 12 - file.user_name = "test" - file.user_id = str(user_id) + file.file_size = ByteSize(12) + file.user_id = user_id async with aiopg_engine.acquire() as conn: stmt = ( file_meta_data.insert() - .values( - **attr.asdict(file), - ) + .values(jsonable_encoder(FileMetaDataAtDB.from_orm(file))) .returning(literal_column("*")) ) result = await conn.execute(stmt) row = await result.fetchone() - - # hacks defect - file.user_id = str(user_id) - file.location_id = str(file.location_id) - # -- - assert file == FileMetaData(**dict(row)) # type: ignore + assert row yield file result = await conn.execute( - file_meta_data.delete().where(file_meta_data.c.file_uuid == row.file_uuid) + file_meta_data.delete().where(file_meta_data.c.file_id == row.file_id) ) +def create_reverse_dns(*resource_name_parts) -> str: + 
""" + Returns a name for the resource following the reverse domain name notation + """ + # See https://en.wikipedia.org/wiki/Reverse_domain_name_notation + return "io.simcore.storage" + ".".join(map(str, resource_name_parts)) + + +@lru_cache +def create_resource_uuid(*resource_name_parts) -> uuid.UUID: + revers_dns = create_reverse_dns(*resource_name_parts) + return uuid.uuid5(uuid.NAMESPACE_DNS, revers_dns) + + async def test_create_soft_link( - storage: DataStorageManager, user_id: int, output_file: FileMetaData + simcore_s3_dsm: SimcoreS3DataManager, user_id: int, output_file: FileMetaData ): api_file_id = create_resource_uuid( @@ -82,11 +82,12 @@ async def test_create_soft_link( ) file_name = output_file.file_name - link_file: FileMetaDataEx = await storage.create_soft_link( - user_id, output_file.file_uuid, f"api/{api_file_id}/{file_name}" + link_file: FileMetaData = await simcore_s3_dsm.create_soft_link( + user_id, + output_file.file_id, + SimcoreS3FileID(f"api/{api_file_id}/{file_name}"), ) - assert isinstance(link_file, FileMetaDataEx) - assert isinstance(link_file.fmd, FileMetaData) + assert isinstance(link_file, FileMetaData) # copy: # - you have two different versions of the file. @@ -112,26 +113,26 @@ async def test_create_soft_link( # 6686594 -rw-rw-r-- 2 crespo crespo 6 Mar 9 2020 VERSION-hard # 6686197 lrwxrwxrwx 1 crespo crespo 7 Apr 14 14:48 VERSION-link -> VERSION - assert link_file.fmd.file_uuid == f"api/{api_file_id}/{file_name}" - assert link_file.fmd.file_id == link_file.fmd.file_uuid - assert link_file.fmd.object_name == output_file.object_name - assert link_file.fmd.entity_tag == output_file.entity_tag - assert link_file.fmd.is_soft_link + assert link_file.file_uuid == f"api/{api_file_id}/{file_name}" + assert link_file.file_id == link_file.file_uuid + assert link_file.object_name == output_file.object_name + assert link_file.entity_tag == output_file.entity_tag + assert link_file.is_soft_link # TODO: in principle we keep this ... 
# assert output_file.created_at < link_file.fmd.created_at # assert output_file.last_modified < link_file.fmd.last_modified # can find - files_list = await storage.search_files_starting_with( + files_list = await simcore_s3_dsm.search_files_starting_with( user_id, f"api/{api_file_id}/{file_name}" ) assert len(files_list) == 1 assert files_list[0] == link_file # can get - got_file = await storage.list_file( - str(user_id), SIMCORE_S3_STR, f"api/{api_file_id}/{file_name}" + got_file = await simcore_s3_dsm.get_file( + user_id, SimcoreS3FileID(f"api/{api_file_id}/{file_name}") ) assert got_file == link_file diff --git a/services/storage/tests/unit/test_handlers.py b/services/storage/tests/unit/test_handlers.py index ab0640af274..dd9b6804280 100644 --- a/services/storage/tests/unit/test_handlers.py +++ b/services/storage/tests/unit/test_handlers.py @@ -2,18 +2,18 @@ # pylint: disable=unused-argument # pylint: disable=unused-variable +from importlib import import_module +from inspect import getmembers from pathlib import Path from typing import Any import openapi_core import pytest +import simcore_service_storage import yaml -from simcore_service_storage import handlers +from aiohttp.web import RouteTableDef from simcore_service_storage._meta import api_vtag from simcore_service_storage.resources import resources -from simcore_service_storage.rest import set_default_names - -set_default_names(handlers.routes) @pytest.fixture(scope="module") @@ -24,14 +24,36 @@ def openapi_specs(): return api_specs +def _iter_handler_cls(): + all_routes = RouteTableDef() + for filepath in ( + Path(simcore_service_storage.__file__).resolve().parent.glob("handlers*.py") + ): + mod = import_module( + name=f".{filepath.stem}", package=simcore_service_storage.__name__ + ) + + def _is_route(value): + return isinstance(value, RouteTableDef) + + member_named_routes = getmembers(mod, _is_route) + assert ( + len(member_named_routes) == 1 + ), f"missing definition of routes in {filepath.name}" + _, routes = member_named_routes[0] + all_routes._items.extend(routes._items) # pylint: disable=protected-access + + return all_routes + + @pytest.mark.parametrize( - "route", handlers.routes, ids=lambda r: f"{r.method.upper()} {r.path}" + "route", _iter_handler_cls(), ids=lambda r: f"{r.method.upper()} {r.path}" ) def test_route_against_openapi_specification(route, openapi_specs): assert route.path.startswith(f"/{api_vtag}") path = route.path.replace(f"/{api_vtag}", "") - + assert "name" in route.kwargs, f"missing name for {route=}" assert ( openapi_specs.paths[path].operations[route.method.lower()].operation_id == route.kwargs["name"] diff --git a/services/storage/tests/unit/test_handlers_datasets.py b/services/storage/tests/unit/test_handlers_datasets.py new file mode 100644 index 00000000000..3eb5e063b9b --- /dev/null +++ b/services/storage/tests/unit/test_handlers_datasets.py @@ -0,0 +1,96 @@ +from pathlib import Path +from typing import Awaitable, Callable + +import pytest +from aiohttp import web +from aiohttp.test_utils import TestClient +from faker import Faker +from models_library.api_schemas_storage import DatasetMetaDataGet, FileMetaDataGet +from models_library.projects import ProjectID +from models_library.projects_nodes_io import SimcoreS3FileID +from models_library.users import UserID +from pydantic import ByteSize, parse_obj_as +from pytest_simcore.helpers.utils_assert import assert_status +from pytest_simcore.helpers.utils_parametrizations import byte_size_ids +from tests.helpers.file_utils import 
parametrized_file_size + +pytest_simcore_core_services_selection = ["postgres"] +pytest_simcore_ops_services_selection = ["adminer"] + + +async def test_get_files_metadata_dataset_with_no_files_returns_empty_array( + client: TestClient, + user_id: UserID, + project_id: ProjectID, + location_id: int, +): + assert client.app + url = ( + client.app.router["get_files_metadata_dataset"] + .url_for(location_id=f"{location_id}", dataset_id=f"{project_id}") + .with_query(user_id=user_id) + ) + response = await client.get(f"{url}") + data, error = await assert_status(response, web.HTTPOk) + assert data == [] + assert not error + + +@pytest.mark.parametrize( + "file_size", + [parametrized_file_size("100Mib")], + ids=byte_size_ids, +) +async def test_get_files_metadata_dataset( + upload_file: Callable[[ByteSize, str], Awaitable[tuple[Path, SimcoreS3FileID]]], + client: TestClient, + user_id: UserID, + project_id: ProjectID, + location_id: int, + file_size: ByteSize, + faker: Faker, +): + assert client.app + NUM_FILES = 3 + for n in range(NUM_FILES): + file, file_id = await upload_file(file_size, faker.file_name()) + url = ( + client.app.router["get_files_metadata_dataset"] + .url_for(location_id=f"{location_id}", dataset_id=f"{project_id}") + .with_query(user_id=user_id) + ) + response = await client.get(f"{url}") + data, error = await assert_status(response, web.HTTPOk) + assert data + assert not error + list_fmds = parse_obj_as(list[FileMetaDataGet], data) + assert len(list_fmds) == (n + 1) + fmd = list_fmds[n] + assert fmd.file_name == file.name + assert fmd.file_id == file_id + assert fmd.file_uuid == file_id + assert fmd.file_size == file.stat().st_size + + +async def test_get_datasets_metadata( + client: TestClient, + user_id: UserID, + location_id: int, + project_id: ProjectID, +): + assert client.app + + url = ( + client.app.router["get_datasets_metadata"] + .url_for(location_id=f"{location_id}") + .with_query(user_id=f"{user_id}") + ) + + response = await client.get(f"{url}") + data, error = await assert_status(response, web.HTTPOk) + assert data + assert not error + list_datasets = parse_obj_as(list[DatasetMetaDataGet], data) + assert len(list_datasets) == 1 + dataset = list_datasets[0] + assert dataset.dataset_id == project_id diff --git a/services/storage/tests/unit/test_handlers_files.py b/services/storage/tests/unit/test_handlers_files.py new file mode 100644 index 00000000000..bd3b5e7eff4 --- /dev/null +++ b/services/storage/tests/unit/test_handlers_files.py @@ -0,0 +1,511 @@ +# pylint: disable=redefined-outer-name +# pylint: disable=unused-argument +# pylint: disable=unused-variable +# pylint: disable=too-many-arguments + +import asyncio +import filecmp +import json +import urllib.parse +from pathlib import Path +from typing import Awaitable, Callable, Optional +from uuid import uuid4 + +import pytest +from aiohttp import ClientSession, web +from aiohttp.test_utils import TestClient +from aiopg.sa import Engine +from faker import Faker +from models_library.api_schemas_storage import FileMetaDataGet, LinkType, SoftCopyBody +from models_library.projects import ProjectID +from models_library.projects_nodes_io import LocationID, NodeID, SimcoreS3FileID +from models_library.users import UserID +from models_library.utils.fastapi_encoders import jsonable_encoder +from pydantic import AnyUrl, ByteSize, parse_obj_as +from pytest_simcore.helpers.utils_assert import assert_status +from pytest_simcore.helpers.utils_parametrizations import byte_size_ids +from simcore_service_storage.exceptions 
import S3KeyNotFoundError +from simcore_service_storage.models import S3BucketName +from simcore_service_storage.s3_client import StorageS3Client +from tests.helpers.file_utils import upload_file_part +from tests.helpers.utils_file_meta_data import assert_file_meta_data_in_db + +pytest_simcore_core_services_selection = ["postgres"] +pytest_simcore_ops_services_selection = ["adminer"] + + +_HTTP_PRESIGNED_LINK_QUERY_KEYS = [ + "X-Amz-Algorithm", + "X-Amz-Credential", + "X-Amz-Date", + "X-Amz-Expires", + "X-Amz-Signature", + "X-Amz-SignedHeaders", +] + + +@pytest.mark.parametrize( + "url_query, expected_link_scheme, expected_link_query_keys, expected_chunk_size", + [ + pytest.param( + {}, + "http", + _HTTP_PRESIGNED_LINK_QUERY_KEYS, + int(parse_obj_as(ByteSize, "5GiB").to("b")), + id="default_returns_single_presigned", + ), + pytest.param( + {"link_type": "presigned"}, + "http", + _HTTP_PRESIGNED_LINK_QUERY_KEYS, + int(parse_obj_as(ByteSize, "5GiB").to("b")), + id="presigned_returns_single_presigned", + ), + pytest.param( + {"link_type": "s3"}, + "s3", + [], + int(parse_obj_as(ByteSize, "5TiB").to("b")), + id="s3_returns_single_s3_link", + ), + ], +) +async def test_create_upload_file_default_returns_single_link( + storage_s3_client, + storage_s3_bucket: S3BucketName, + simcore_file_id: SimcoreS3FileID, + url_query: dict[str, str], + expected_link_scheme: str, + expected_link_query_keys: list[str], + expected_chunk_size: int, + aiopg_engine: Engine, + create_upload_file_link: Callable[..., Awaitable[AnyUrl]], + cleanup_user_projects_file_metadata: None, +): + # create upload file link + received_file_upload = await create_upload_file_link(simcore_file_id, **url_query) + # check links, there should be only 1 + assert received_file_upload + assert received_file_upload.scheme == expected_link_scheme + assert received_file_upload.path + assert received_file_upload.path.endswith( + f"{urllib.parse.quote(simcore_file_id, safe='/')}" + ) + + # now check the entry in the database is correct, there should be only one + await assert_file_meta_data_in_db( + aiopg_engine, + file_id=simcore_file_id, + expected_entry_exists=True, + expected_file_size=-1, + expected_upload_expiration_date=True, + ) + + +@pytest.mark.parametrize( + "link_type, file_size", + [ + (LinkType.PRESIGNED, parse_obj_as(ByteSize, "1000Mib")), + (LinkType.S3, parse_obj_as(ByteSize, "1000Mib")), + ], +) +async def test_delete_unuploaded_file_correctly_cleans_up_db_and_s3( + aiopg_engine: Engine, + client: TestClient, + storage_s3_client: StorageS3Client, + storage_s3_bucket: S3BucketName, + simcore_file_id: SimcoreS3FileID, + link_type: LinkType, + file_size: ByteSize, + create_upload_file_link: Callable[..., Awaitable[AnyUrl]], + user_id: UserID, + location_id: LocationID, +): + assert client.app + # create upload file link + upload_link = await create_upload_file_link( + simcore_file_id, link_type=link_type.value.lower(), file_size=file_size + ) + + # we shall have an entry in the db, waiting for upload + await assert_file_meta_data_in_db( + aiopg_engine, + file_id=simcore_file_id, + expected_entry_exists=True, + expected_file_size=-1, + expected_upload_expiration_date=True, + ) + + # abort file upload + abort_url = ( + client.app.router["abort_upload_file"] + .url_for( + location_id=f"{location_id}", + file_id=urllib.parse.quote(simcore_file_id, safe=""), + ) + .with_query(user_id=user_id) + ) + response = await client.post(f"{abort_url}") + await assert_status(response, web.HTTPNoContent) + + # the DB shall be cleaned up + await 
assert_file_meta_data_in_db( + aiopg_engine, + file_id=simcore_file_id, + expected_entry_exists=False, + expected_file_size=None, + expected_upload_expiration_date=None, + ) + + +@pytest.mark.parametrize( + "link_type, file_size", + [ + (LinkType.PRESIGNED, parse_obj_as(ByteSize, "10Mib")), + (LinkType.PRESIGNED, parse_obj_as(ByteSize, "1000Mib")), + (LinkType.S3, parse_obj_as(ByteSize, "10Mib")), + (LinkType.S3, parse_obj_as(ByteSize, "1000Mib")), + ], + ids=byte_size_ids, +) +async def test_upload_same_file_uuid_aborts_previous_upload( + aiopg_engine: Engine, + client: TestClient, + storage_s3_client: StorageS3Client, + storage_s3_bucket: S3BucketName, + simcore_file_id: SimcoreS3FileID, + link_type: LinkType, + file_size: ByteSize, + create_upload_file_link: Callable[..., Awaitable[AnyUrl]], +): + assert client.app + # create upload file link + file_upload_link = await create_upload_file_link( + simcore_file_id, link_type=link_type.value.lower(), file_size=file_size + ) + # we shall have an entry in the db, waiting for upload + await assert_file_meta_data_in_db( + aiopg_engine, + file_id=simcore_file_id, + expected_entry_exists=True, + expected_file_size=-1, + expected_upload_expiration_date=True, + ) + + await asyncio.sleep(1) + # now we create a new upload + # we should abort the previous upload to prevent unwanted costs + new_file_upload_link = await create_upload_file_link( + simcore_file_id, link_type=link_type.value.lower(), file_size=file_size + ) + + if link_type == LinkType.PRESIGNED: + assert file_upload_link != new_file_upload_link + else: + assert file_upload_link == new_file_upload_link + # we shall have an entry in the db, waiting for upload + await assert_file_meta_data_in_db( + aiopg_engine, + file_id=simcore_file_id, + expected_entry_exists=True, + expected_file_size=-1, + expected_upload_expiration_date=True, + ) + + +@pytest.mark.parametrize( + "file_name", + [ + "some file name with spaces and extension.txt", + "some name with special characters -_ü!öäàé++3245", + ], +) +@pytest.mark.parametrize( + "file_size", + [ + (parse_obj_as(ByteSize, "1Mib")), + (parse_obj_as(ByteSize, "500Mib")), + # (parse_obj_as(ByteSize, "5Gib")), + # (parse_obj_as(ByteSize, "7Gib")), + ], + ids=byte_size_ids, +) +async def test_upload_real_file( + file_name: str, + file_size: ByteSize, + upload_file: Callable[[ByteSize, str], Awaitable[Path]], +): + await upload_file(file_size, file_name) + + +async def test_upload_real_file_with_s3_client( + aiopg_engine: Engine, + storage_s3_client: StorageS3Client, + storage_s3_bucket: S3BucketName, + client: TestClient, + create_upload_file_link: Callable[..., Awaitable[AnyUrl]], + create_file_of_size: Callable[[ByteSize, Optional[str]], Path], + create_simcore_file_id: Callable[[ProjectID, NodeID, str], SimcoreS3FileID], + project_id: ProjectID, + node_id: NodeID, + faker: Faker, + get_file_meta_data: Callable[..., Awaitable[FileMetaDataGet]], +): + assert client.app + file_size = parse_obj_as(ByteSize, "500Mib") + file_name = faker.file_name() + # create a file + file = create_file_of_size(file_size, file_name) + simcore_file_id = create_simcore_file_id(project_id, node_id, file_name) + # get an S3 upload link + file_upload_link = await create_upload_file_link( + simcore_file_id, link_type="s3", file_size=file_size + ) + # let's use the storage s3 internal client to upload + with file.open("rb") as fp: + response = await storage_s3_client.client.put_object( + Bucket=storage_s3_bucket, Key=simcore_file_id, Body=fp + ) + assert "ETag" in response + 
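+    # S3 returns the ETag as a quoted string (e.g. '"abc123..."'); json.loads is
+    # just a convenient way to strip the surrounding quotes
+    # (equivalently one could write: upload_e_tag = response["ETag"].strip('"'))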
upload_e_tag = json.loads(response["ETag"]) + # check the file is now on S3 + s3_metadata = await storage_s3_client.get_file_metadata( + storage_s3_bucket, simcore_file_id + ) + assert s3_metadata.size == file_size + assert s3_metadata.last_modified + assert s3_metadata.e_tag == upload_e_tag + + # check getting the file actually lazily updates the table and returns the expected values + received_fmd: FileMetaDataGet = await get_file_meta_data(simcore_file_id) + assert received_fmd.entity_tag == upload_e_tag + + # check the entry in db now has the correct file size, and the upload id is gone + await assert_file_meta_data_in_db( + aiopg_engine, + file_id=simcore_file_id, + expected_entry_exists=True, + expected_file_size=file_size, + expected_upload_expiration_date=False, + ) + # check the file is in S3 for real + s3_metadata = await storage_s3_client.get_file_metadata( + storage_s3_bucket, simcore_file_id + ) + assert s3_metadata.size == file_size + assert s3_metadata.last_modified + assert s3_metadata.e_tag == upload_e_tag + + +@pytest.mark.parametrize( + "file_size", + [parse_obj_as(ByteSize, "160Mib"), parse_obj_as(ByteSize, "1Mib")], + ids=lambda obj: obj.human_readable(), +) +async def test_upload_twice_and_fail_second_time_shall_keep_first_version( + aiopg_engine: Engine, + client: TestClient, + storage_s3_client: StorageS3Client, + storage_s3_bucket: S3BucketName, + file_size: ByteSize, + upload_file: Callable[[ByteSize, str], Awaitable[tuple[Path, SimcoreS3FileID]]], + faker: Faker, + create_file_of_size: Callable[[ByteSize, Optional[str]], Path], + create_upload_file_link: Callable[..., Awaitable[AnyUrl]], + user_id: UserID, + location_id: LocationID, +): + assert client.app + # 1. upload a valid file + file_name = faker.file_name() + _, uploaded_file_id = await upload_file(file_size, file_name) + + # 2. create an upload link for the second file + upload_link = await create_upload_file_link( + uploaded_file_id, link_type="presigned", file_size=file_size + ) + # we shall have an entry in the db, waiting for upload + await assert_file_meta_data_in_db( + aiopg_engine, + file_id=uploaded_file_id, + expected_entry_exists=True, + expected_file_size=-1, + expected_upload_expiration_date=True, + ) + + # 3. upload part of the file to simulate a network issue in the upload + new_file = create_file_of_size(file_size, file_name) + with pytest.raises(RuntimeError): + async with ClientSession() as session: + await upload_file_part( + session, + new_file, + part_index=1, + file_offset=0, + this_file_chunk_size=file_size, + num_parts=1, + upload_url=upload_link, + raise_while_uploading=True, + ) + + # 4. abort file upload + abort_url = ( + client.app.router["abort_upload_file"] + .url_for( + location_id=f"{location_id}", + file_id=urllib.parse.quote(uploaded_file_id, safe=""), + ) + .with_query(user_id=user_id) + ) + response = await client.post(f"{abort_url}") + await assert_status(response, web.HTTPNoContent) + + # we should have the original file still in now... 
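+    # i.e. aborting the failed re-upload is expected to restore the previous
+    # file_meta_data row (original size, no pending upload expiration) and to leave
+    # the original S3 object untouched, which is what the two checks below verify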
+ await assert_file_meta_data_in_db( + aiopg_engine, + file_id=uploaded_file_id, + expected_entry_exists=True, + expected_file_size=file_size, + expected_upload_expiration_date=False, + ) + # check the file is in S3 for real + s3_metadata = await storage_s3_client.get_file_metadata( + storage_s3_bucket, uploaded_file_id + ) + assert s3_metadata.size == file_size + + +@pytest.mark.parametrize( + "file_size", + [ + pytest.param(parse_obj_as(ByteSize, "1Mib")), + ], + ids=byte_size_ids, +) +async def test_download_file( + client: TestClient, + file_size: ByteSize, + upload_file: Callable[[ByteSize, str], Awaitable[tuple[Path, SimcoreS3FileID]]], + location_id: int, + user_id: UserID, + tmp_path: Path, + faker: Faker, +): + assert client.app + uploaded_file, uploaded_file_uuid = await upload_file(file_size, faker.file_name()) + + download_url = ( + client.app.router["download_file"] + .url_for( + location_id=f"{location_id}", + file_id=urllib.parse.quote(uploaded_file_uuid, safe=""), + ) + .with_query(user_id=user_id) + ) + response = await client.get(f"{download_url}") + data, error = await assert_status(response, web.HTTPOk) + assert not error + assert data + assert "link" in data + # now download the link from S3 + dest_file = tmp_path / faker.file_name() + async with ClientSession() as session: + response = await session.get(data["link"]) + response.raise_for_status() + with dest_file.open("wb") as fp: + fp.write(await response.read()) + assert dest_file.exists() + # compare files + assert filecmp.cmp(uploaded_file, dest_file) + + +@pytest.mark.parametrize( + "file_size", + [ + pytest.param(parse_obj_as(ByteSize, "1Mib")), + ], + ids=byte_size_ids, +) +async def test_delete_file( + aiopg_engine: Engine, + storage_s3_client: StorageS3Client, + storage_s3_bucket: S3BucketName, + client: TestClient, + file_size: ByteSize, + upload_file: Callable[[ByteSize, str], Awaitable[tuple[Path, SimcoreS3FileID]]], + location_id: int, + user_id: UserID, + faker: Faker, +): + assert client.app + _, uploaded_file_uuid = await upload_file(file_size, faker.file_name()) + + delete_url = ( + client.app.router["delete_file"] + .url_for( + location_id=f"{location_id}", + file_id=urllib.parse.quote(uploaded_file_uuid, safe=""), + ) + .with_query(user_id=user_id) + ) + response = await client.delete(f"{delete_url}") + await assert_status(response, web.HTTPNoContent) + + # check the entry in db is removed + await assert_file_meta_data_in_db( + aiopg_engine, + file_id=uploaded_file_uuid, + expected_entry_exists=False, + expected_file_size=None, + expected_upload_expiration_date=None, + ) + # check the file is gone from S3 + with pytest.raises(S3KeyNotFoundError): + await storage_s3_client.get_file_metadata(storage_s3_bucket, uploaded_file_uuid) + + +async def test_copy_as_soft_link( + client: TestClient, + user_id: UserID, + project_id: ProjectID, + node_id: NodeID, + upload_file: Callable[[ByteSize, str], Awaitable[tuple[Path, SimcoreS3FileID]]], + create_simcore_file_id: Callable[[ProjectID, NodeID, str], SimcoreS3FileID], + faker: Faker, +): + assert client.app + + # missing simcore_file_id returns 404 + missing_file_uuid = create_simcore_file_id(project_id, node_id, faker.file_name()) + invalid_link_id = create_simcore_file_id(uuid4(), uuid4(), faker.file_name()) + url = ( + client.app.router["copy_as_soft_link"] + .url_for( + file_id=urllib.parse.quote(missing_file_uuid, safe=""), + ) + .with_query(user_id=user_id) + ) + response = await client.post( + f"{url}", 
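+        # the request body carries the file id the soft copy should be created
+        # under; since the source file_id in the path does not exist yet, a 404
+        # is expected below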
json=jsonable_encoder(SoftCopyBody(link_id=invalid_link_id)) + ) + await assert_status(response, web.HTTPNotFound) + + # now let's try with whatever link id + file, original_file_uuid = await upload_file( + parse_obj_as(ByteSize, "10Mib"), faker.file_name() + ) + url = ( + client.app.router["copy_as_soft_link"] + .url_for( + file_id=urllib.parse.quote(original_file_uuid, safe=""), + ) + .with_query(user_id=user_id) + ) + link_id = SimcoreS3FileID(f"api/{node_id}/{faker.file_name()}") + response = await client.post( + f"{url}", json=jsonable_encoder(SoftCopyBody(link_id=link_id)) + ) + data, error = await assert_status(response, web.HTTPOk) + assert not error + fmd = parse_obj_as(FileMetaDataGet, data) + assert fmd.file_id == link_id diff --git a/services/storage/tests/unit/test_handlers_files_metadata.py b/services/storage/tests/unit/test_handlers_files_metadata.py new file mode 100644 index 00000000000..51093d4278b --- /dev/null +++ b/services/storage/tests/unit/test_handlers_files_metadata.py @@ -0,0 +1,154 @@ +# pylint: disable=protected-access +# pylint: disable=redefined-outer-name +# pylint: disable=unused-argument + + +import urllib.parse +from pathlib import Path +from random import choice +from typing import Awaitable, Callable + +import pytest +from aiohttp import web +from aiohttp.test_utils import TestClient +from faker import Faker +from models_library.api_schemas_storage import FileMetaDataGet, SimcoreS3FileID +from models_library.projects import ProjectID +from models_library.users import UserID +from pydantic import ByteSize, parse_obj_as +from pytest_simcore.helpers.utils_assert import assert_status + +pytest_simcore_core_services_selection = ["postgres"] +pytest_simcore_ops_services_selection = ["adminer"] + + +async def test_get_files_metadata( + upload_file: Callable[[ByteSize, str], Awaitable[tuple[Path, SimcoreS3FileID]]], + client: TestClient, + user_id: UserID, + location_id: int, + project_id: ProjectID, + faker: Faker, +): + assert client.app + + url = ( + client.app.router["get_files_metadata"] + .url_for(location_id=f"{location_id}") + .with_query(user_id=f"{user_id}") + ) + + # this should return an empty list + response = await client.get(f"{url}") + data, error = await assert_status(response, web.HTTPOk) + assert not error + list_fmds = parse_obj_as(list[FileMetaDataGet], data) + assert not list_fmds + + # now add some stuff there + NUM_FILES = 10 + file_size = parse_obj_as(ByteSize, "15Mib") + files_owned_by_us = [] + for _ in range(NUM_FILES): + files_owned_by_us.append(await upload_file(file_size, faker.file_name())) + # we should find these files now + response = await client.get(f"{url}") + data, error = await assert_status(response, web.HTTPOk) + assert not error + list_fmds = parse_obj_as(list[FileMetaDataGet], data) + assert len(list_fmds) == NUM_FILES + # create some more files but with a base common name + NUM_FILES = 10 + file_size = parse_obj_as(ByteSize, "15Mib") + files_with_common_name = [] + for _ in range(NUM_FILES): + files_with_common_name.append( + await upload_file(file_size, f"common_name-{faker.file_name()}") + ) + # we should find these files now + response = await client.get(f"{url}") + data, error = await assert_status(response, web.HTTPOk) + assert not error + list_fmds = parse_obj_as(list[FileMetaDataGet], data) + assert len(list_fmds) == (2 * NUM_FILES) + # we can filter them now + response = await client.get(f"{url.update_query(uuid_filter='common_name')}") + data, error = await assert_status(response, web.HTTPOk) + assert not 
error + list_fmds = parse_obj_as(list[FileMetaDataGet], data) + assert len(list_fmds) == (NUM_FILES) + + +@pytest.mark.xfail( + reason="storage get_file_metadata must return a 200 with no payload as long as legacy services are around!!" +) +async def test_get_file_metadata_is_legacy_services_compatible( + client: TestClient, + user_id: UserID, + location_id: int, + simcore_file_id: SimcoreS3FileID, +): + assert client.app + + url = ( + client.app.router["get_file_metadata"] + .url_for( + location_id=f"{location_id}", + file_id=f"{urllib.parse.quote(simcore_file_id, safe='')}", + ) + .with_query(user_id=f"{user_id}") + ) + # this should return an empty list + response = await client.get(f"{url}") + await assert_status(response, web.HTTPNotFound) + + +async def test_get_file_metadata( + upload_file: Callable[[ByteSize, str], Awaitable[tuple[Path, SimcoreS3FileID]]], + client: TestClient, + user_id: UserID, + location_id: int, + project_id: ProjectID, + simcore_file_id: SimcoreS3FileID, + faker: Faker, +): + assert client.app + + url = ( + client.app.router["get_file_metadata"] + .url_for( + location_id=f"{location_id}", + file_id=f"{urllib.parse.quote(simcore_file_id, safe='')}", + ) + .with_query(user_id=f"{user_id}") + ) + # this should return an empty list + response = await client.get(f"{url}") + # await assert_status(response, web.HTTPNotFound) + + # NOTE: This needs to be a Ok response with empty data until ALL legacy services are gone, then it should be changed to 404! see test above + assert response.status == web.HTTPOk.status_code + assert await response.json() == {"data": {}, "error": "No result found"} + + # now add some stuff there + NUM_FILES = 10 + file_size = parse_obj_as(ByteSize, "15Mib") + files_owned_by_us = [] + for _ in range(NUM_FILES): + files_owned_by_us.append(await upload_file(file_size, faker.file_name())) + selected_file, selected_file_uuid = choice(files_owned_by_us) + url = ( + client.app.router["get_file_metadata"] + .url_for( + location_id=f"{location_id}", + file_id=f"{urllib.parse.quote(selected_file_uuid, safe='')}", + ) + .with_query(user_id=f"{user_id}") + ) + response = await client.get(f"{url}") + data, error = await assert_status(response, web.HTTPOk) + assert not error + assert data + fmd = parse_obj_as(FileMetaDataGet, data) + assert fmd.file_id == selected_file_uuid + assert fmd.file_size == selected_file.stat().st_size diff --git a/services/storage/tests/unit/test_handlers_health.py b/services/storage/tests/unit/test_handlers_health.py new file mode 100644 index 00000000000..65aa3f4f449 --- /dev/null +++ b/services/storage/tests/unit/test_handlers_health.py @@ -0,0 +1,47 @@ +# pylint: disable=protected-access +# pylint: disable=redefined-outer-name +# pylint: disable=unused-argument + + +import simcore_service_storage._meta +from aiohttp import web +from aiohttp.test_utils import TestClient +from models_library.app_diagnostics import AppStatusCheck +from pytest_simcore.helpers.utils_assert import assert_status +from simcore_service_storage.handlers_health import HealthCheck + +pytest_simcore_core_services_selection = ["postgres"] +pytest_simcore_ops_services_selection = ["adminer"] + + +async def test_health_check(client: TestClient): + assert client.app + url = client.app.router["health_check"].url_for() + response = await client.get(f"{url}") + data, error = await assert_status(response, web.HTTPOk) + assert data + assert not error + + app_health = HealthCheck.parse_obj(data) + assert app_health.name == simcore_service_storage._meta.app_name + 
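+    # the expected values come straight from simcore_service_storage._meta, so this
+    # only checks that the handler reports the package's own name and version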
assert app_health.version == simcore_service_storage._meta.api_version + + +async def test_health_status(client: TestClient): + assert client.app + url = client.app.router["get_status"].url_for() + response = await client.get(f"{url}") + data, error = await assert_status(response, web.HTTPOk) + assert data + assert not error + + app_status_check = AppStatusCheck.parse_obj(data) + assert app_status_check.app_name == simcore_service_storage._meta.app_name + assert app_status_check.version == simcore_service_storage._meta.api_version + assert len(app_status_check.services) == 2 + assert "postgres" in app_status_check.services + assert "healthy" in app_status_check.services["postgres"] + assert app_status_check.services["postgres"]["healthy"] == "connected" + assert "s3" in app_status_check.services + assert "healthy" in app_status_check.services["s3"] + assert app_status_check.services["s3"]["healthy"] == "connected" diff --git a/services/storage/tests/unit/test_handlers_locations.py b/services/storage/tests/unit/test_handlers_locations.py new file mode 100644 index 00000000000..503803b57c2 --- /dev/null +++ b/services/storage/tests/unit/test_handlers_locations.py @@ -0,0 +1,70 @@ +# pylint: disable=protected-access +# pylint: disable=redefined-outer-name +# pylint: disable=unused-argument + +from typing import Any, Optional + +import pytest +from aiohttp import web +from aiohttp.test_utils import TestClient +from models_library.users import UserID +from pytest_simcore.helpers.utils_assert import assert_status +from tests.helpers.utils import has_datcore_tokens + +pytest_simcore_core_services_selection = ["postgres"] +pytest_simcore_ops_services_selection = ["adminer"] + + +async def test_locations(client: TestClient, user_id: UserID): + resp = await client.get(f"/v0/locations?user_id={user_id}") + + payload = await resp.json() + assert resp.status == 200, str(payload) + + data, error = tuple(payload.get(k) for k in ("data", "error")) + + _locs = 2 if has_datcore_tokens() else 1 + assert len(data) == _locs + assert not error + + +@pytest.mark.parametrize( + "dry_run, fire_and_forget, expected_removed", + [ + (None, None, []), + (True, False, []), + (True, True, []), + (False, True, []), + (False, False, []), + ], +) +async def test_synchronise_meta_data_table( + client: TestClient, + location_id: int, + user_id: UserID, + dry_run: Optional[bool], + fire_and_forget: Optional[bool], + expected_removed: list, +): + assert client.app + query_params: dict[str, Any] = {"user_id": user_id} + if dry_run: + query_params["dry_run"] = f"{dry_run}" + if fire_and_forget: + query_params["fire_and_forget"] = f"{fire_and_forget}" + url = ( + client.app.router["synchronise_meta_data_table"] + .url_for(location_id=f"{location_id}") + .with_query(**query_params) + ) + resp = await client.post( + f"{url}", + ) + data, error = await assert_status(resp, web.HTTPOk) + assert not error + assert data + assert data["dry_run"] == (False if dry_run is None else dry_run) + assert data["fire_and_forget"] == ( + False if fire_and_forget is None else fire_and_forget + ) + assert data["removed"] == expected_removed diff --git a/services/storage/tests/unit/test_handlers_simcore_s3.py b/services/storage/tests/unit/test_handlers_simcore_s3.py new file mode 100644 index 00000000000..df9864e0ac5 --- /dev/null +++ b/services/storage/tests/unit/test_handlers_simcore_s3.py @@ -0,0 +1,460 @@ +# pylint: disable=redefined-outer-name +# pylint: disable=unused-argument +# pylint: disable=unused-variable +# pylint: 
disable=too-many-nested-blocks + +import asyncio +import sys +from collections import deque +from copy import deepcopy +from pathlib import Path +from random import randint +from secrets import choice +from typing import Any, Awaitable, Callable, Optional + +import pytest +import sqlalchemy as sa +from aiohttp import web +from aiohttp.test_utils import TestClient +from aiopg.sa.engine import Engine +from faker import Faker +from models_library.api_schemas_storage import FileMetaDataGet, FoldersBody +from models_library.projects import Project, ProjectID +from models_library.projects_nodes_io import NodeID, NodeIDStr, SimcoreS3FileID +from models_library.users import UserID +from models_library.utils.change_case import camel_to_snake +from models_library.utils.fastapi_encoders import jsonable_encoder +from pydantic import ByteSize, parse_file_as, parse_obj_as +from pytest_mock import MockerFixture +from pytest_simcore.helpers.utils_assert import assert_status +from settings_library.s3 import S3Settings +from simcore_postgres_database.storage_models import file_meta_data, projects +from simcore_service_storage.s3_client import StorageS3Client +from simcore_service_storage.simcore_s3_dsm import SimcoreS3DataManager +from tests.helpers.utils_file_meta_data import assert_file_meta_data_in_db +from tests.helpers.utils_project import clone_project_data +from yarl import URL + +pytest_simcore_core_services_selection = ["postgres"] +pytest_simcore_ops_services_selection = ["adminer"] + + +@pytest.fixture +def mock_datcore_download(mocker, client): + # Use to mock downloading from DATCore + async def _fake_download_to_file_or_raise(session, url, dest_path): + print(f"Faking download: {url} -> {dest_path}") + Path(dest_path).write_text("FAKE: test_create_and_delete_folders_from_project") + + mocker.patch( + "simcore_service_storage.simcore_s3_dsm.download_to_file_or_raise", + side_effect=_fake_download_to_file_or_raise, + autospec=True, + ) + + mocker.patch( + "simcore_service_storage.simcore_s3_dsm.datcore_adapter.get_file_download_presigned_link", + autospec=True, + return_value=URL("https://httpbin.org/image"), + ) + + +async def test_simcore_s3_access_returns_default(client: TestClient): + assert client.app + url = ( + client.app.router["get_or_create_temporary_s3_access"] + .url_for() + .with_query(user_id=1) + ) + response = await client.post(f"{url}") + data, error = await assert_status(response, web.HTTPOk) + assert not error + assert data + received_settings = S3Settings.parse_obj(data) + assert received_settings + + +async def test_copy_folders_from_non_existing_project( + client: TestClient, + user_id: UserID, + create_project: Callable[[], Awaitable[dict[str, Any]]], + faker: Faker, +): + assert client.app + url = ( + client.app.router["copy_folders_from_project"] + .url_for() + .with_query(user_id=user_id) + ) + src_project = await create_project() + incorrect_src_project = deepcopy(src_project) + incorrect_src_project["uuid"] = faker.uuid4() + dst_project = await create_project() + incorrect_dst_project = deepcopy(dst_project) + incorrect_dst_project["uuid"] = faker.uuid4() + + response = await client.post( + f"{url}", + json=jsonable_encoder( + FoldersBody( + source=incorrect_src_project, destination=dst_project, nodes_map={} + ) + ), + ) + data, error = await assert_status(response, web.HTTPNotFound) + assert error + assert not data + + response = await client.post( + f"{url}", + json=jsonable_encoder( + FoldersBody( + source=src_project, destination=incorrect_dst_project, 
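+                # nodes_map stays empty here: the projects created by the
+                # create_project fixture are presumably created without workbench
+                # nodes, so there is nothing to remap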
nodes_map={} + ) + ), + ) + data, error = await assert_status(response, web.HTTPNotFound) + assert error + assert not data + + +async def test_copy_folders_from_empty_project( + client: TestClient, + user_id: UserID, + create_project: Callable[[], Awaitable[dict[str, Any]]], + aiopg_engine: Engine, + storage_s3_client: StorageS3Client, +): + assert client.app + url = ( + client.app.router["copy_folders_from_project"] + .url_for() + .with_query(user_id=user_id) + ) + + # we will copy from src to dst + src_project = await create_project() + dst_project = await create_project() + + response = await client.post( + f"{url}", + json=jsonable_encoder( + FoldersBody(source=src_project, destination=dst_project, nodes_map={}) + ), + ) + data, error = await assert_status(response, web.HTTPCreated) + assert not error + assert data == jsonable_encoder(dst_project) + # check there is nothing in the dst project + async with aiopg_engine.acquire() as conn: + num_entries = await conn.scalar( + sa.select([sa.func.count()]) + .select_from(file_meta_data) + .where(file_meta_data.c.project_id == dst_project["uuid"]) + ) + assert num_entries == 0 + + +async def _get_updated_project(aiopg_engine: Engine, project_id: str) -> dict[str, Any]: + async with aiopg_engine.acquire() as conn: + result = await conn.execute( + sa.select([projects]).where(projects.c.uuid == project_id) + ) + row = await result.fetchone() + assert row + return dict(row) + + +@pytest.fixture +async def random_project_with_files( + aiopg_engine: Engine, + create_project: Callable[[], Awaitable[dict[str, Any]]], + create_project_node: Callable[[ProjectID], Awaitable[NodeID]], + create_simcore_file_id: Callable[[ProjectID, NodeID, str], SimcoreS3FileID], + upload_file: Callable[ + [ByteSize, str, str], Awaitable[tuple[Path, SimcoreS3FileID]] + ], + faker: Faker, +) -> tuple[dict[str, Any], dict[NodeID, dict[SimcoreS3FileID, Path]]]: + project = await create_project() + NUM_NODES = 12 + FILE_SIZES = [ + parse_obj_as(ByteSize, "7Mib"), + parse_obj_as(ByteSize, "110Mib"), + parse_obj_as(ByteSize, "1Mib"), + ] + src_projects_list: dict[NodeID, dict[SimcoreS3FileID, Path]] = {} + upload_tasks: deque[Awaitable] = deque() + for _node_index in range(NUM_NODES): + src_node_id = await create_project_node(ProjectID(project["uuid"])) + src_projects_list[src_node_id] = {} + + async def _upload_file_and_update_project(project, src_node_id): + src_file_name = faker.file_name() + src_file_uuid = create_simcore_file_id( + ProjectID(project["uuid"]), src_node_id, src_file_name + ) + src_file, _ = await upload_file( + choice(FILE_SIZES), src_file_name, src_file_uuid + ) + src_projects_list[src_node_id][src_file_uuid] = src_file + + upload_tasks.extend( + [ + _upload_file_and_update_project(project, src_node_id) + for _ in range(randint(0, 3)) + ] + ) + await asyncio.gather(*upload_tasks) + + project = await _get_updated_project(aiopg_engine, project["uuid"]) + return project, src_projects_list + + +async def test_copy_folders_from_valid_project( + client: TestClient, + user_id: UserID, + create_project: Callable[[], Awaitable[dict[str, Any]]], + create_simcore_file_id: Callable[[ProjectID, NodeID, str], SimcoreS3FileID], + aiopg_engine: Engine, + random_project_with_files: tuple[ + dict[str, Any], dict[NodeID, dict[SimcoreS3FileID, Path]] + ], +): + assert client.app + url = ( + client.app.router["copy_folders_from_project"] + .url_for() + .with_query(user_id=user_id) + ) + + # 1. 
create a src project with some files + src_project, src_projects_list = random_project_with_files + # 2. create a dst project without files + dst_project, nodes_map = clone_project_data(src_project) + dst_project = await create_project(**dst_project) + # copy the project files + response = await client.post( + f"{url}", + json=jsonable_encoder( + FoldersBody( + source=src_project, + destination=dst_project, + nodes_map={NodeID(i): NodeID(j) for i, j in nodes_map.items()}, + ) + ), + ) + data, error = await assert_status(response, web.HTTPCreated) + assert not error + assert data == jsonable_encoder( + await _get_updated_project(aiopg_engine, dst_project["uuid"]) + ) + # check that file meta data was effectively copied + for src_node_id in src_projects_list: + dst_node_id = nodes_map.get(NodeIDStr(f"{src_node_id}")) + assert dst_node_id + for src_file in src_projects_list[src_node_id].values(): + await assert_file_meta_data_in_db( + aiopg_engine, + file_id=create_simcore_file_id( + ProjectID(dst_project["uuid"]), NodeID(dst_node_id), src_file.name + ), + expected_entry_exists=True, + expected_file_size=src_file.stat().st_size, + expected_upload_expiration_date=None, + ) + + +current_dir = Path(sys.argv[0] if __name__ == "__main__" else __file__).resolve().parent + + +def _get_project_with_data() -> list[Project]: + projects = parse_file_as( + list[Project], current_dir / "../data/projects_with_data.json" + ) + assert projects + return projects + + +async def _create_and_delete_folders_from_project( + user_id: UserID, + project: dict[str, Any], + client: TestClient, + project_db_creator: Callable, + check_list_files: bool, +): + destination_project, nodes_map = clone_project_data(project) + await project_db_creator(**destination_project) + + # creating a copy + assert client.app + url = ( + client.app.router["copy_folders_from_project"] + .url_for() + .with_query(user_id=f"{user_id}") + ) + resp = await client.post( + f"{url}", + json=jsonable_encoder( + FoldersBody( + source=project, + destination=destination_project, + nodes_map={NodeID(i): NodeID(j) for i, j in nodes_map.items()}, + ) + ), + ) + + data, _error = await assert_status(resp, expected_cls=web.HTTPCreated) + + # data should be equal to the destination project, and all store entries should point to simcore.s3 + for key in data: + if key != "workbench": + assert data[key] == destination_project[key] + else: + for _node_id, node in data[key].items(): + if "outputs" in node: + for _o_id, o in node["outputs"].items(): + if "store" in o: + assert o["store"] == SimcoreS3DataManager.get_location_id() + project_id = data["uuid"] + + # list data to check all is here + if check_list_files: + url = ( + client.app.router["get_files_metadata"] + .url_for(location_id=f"{SimcoreS3DataManager.get_location_id()}") + .with_query(user_id=f"{user_id}", uuid_filter=f"{project_id}") + ) + resp = await client.get(f"{url}") + data, error = await assert_status(resp, web.HTTPOk) + assert not error + # DELETING + url = ( + client.app.router["delete_folders_of_project"] + .url_for(folder_id=project_id) + .with_query(user_id=f"{user_id}") + ) + resp = await client.delete(f"{url}") + + await assert_status(resp, expected_cls=web.HTTPNoContent) + + # list data is gone + if check_list_files: + url = ( + client.app.router["get_files_metadata"] + .url_for(location_id=f"{SimcoreS3DataManager.get_location_id()}") + .with_query(user_id=f"{user_id}", uuid_filter=f"{project_id}") + ) + resp = await client.get(f"{url}") + data, error = await assert_status(resp, 
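+            # after the folders were deleted, the same uuid_filter listing is
+            # expected to come back empty (checked right below)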
web.HTTPOk) + assert not error + assert not data + + +@pytest.fixture +def mock_check_project_exists(mocker: MockerFixture): + # NOTE: this avoid having to inject project in database + mock = mocker.patch( + "simcore_service_storage.dsm._check_project_exists", + autospec=True, + return_value=None, + ) + + +@pytest.mark.parametrize( + "project", + [pytest.param(prj, id=prj.name) for prj in _get_project_with_data()], +) +async def test_create_and_delete_folders_from_project( + client: TestClient, + user_id: UserID, + project: Project, + create_project: Callable[..., Awaitable[dict[str, Any]]], + mock_datcore_download, +): + project_as_dict = jsonable_encoder(project, exclude={"tags", "state", "prj_owner"}) + # HACK: some key names must be changed but not all + KEYS = {"creationDate", "lastChangeDate", "accessRights"} + for k in KEYS: + project_as_dict[camel_to_snake(k)] = project_as_dict.pop(k, None) + + await create_project(**project_as_dict) + await _create_and_delete_folders_from_project( + user_id, project_as_dict, client, create_project, check_list_files=True + ) + + +@pytest.mark.parametrize( + "project", + [pytest.param(prj, id=prj.name) for prj in _get_project_with_data()], +) +async def test_create_and_delete_folders_from_project_burst( + client: TestClient, + user_id: UserID, + project: Project, + create_project: Callable[..., Awaitable[dict[str, Any]]], + mock_datcore_download, +): + project_as_dict = jsonable_encoder( + project, exclude={"tags", "state", "prj_owner"}, by_alias=False + ) + await create_project(**project_as_dict) + await asyncio.gather( + *[ + _create_and_delete_folders_from_project( + user_id, project_as_dict, client, create_project, check_list_files=False + ) + for _ in range(100) + ] + ) + + +async def test_search_files_starting_with( + client: TestClient, + user_id: UserID, + upload_file: Callable[ + [ByteSize, str, Optional[str]], Awaitable[tuple[Path, SimcoreS3FileID]] + ], + faker: Faker, +): + assert client.app + url = ( + client.app.router["search_files_starting_with"] + .url_for() + .with_query(user_id=user_id, startswith="") + ) + + response = await client.post(f"{url}") + data, error = await assert_status(response, web.HTTPOk) + assert not error + list_fmds = parse_obj_as(list[FileMetaDataGet], data) + assert not list_fmds + + # let's upload some files now + file, file_id = await upload_file( + parse_obj_as(ByteSize, "10Mib"), faker.file_name(), None + ) + # search again should return something + response = await client.post(f"{url}") + data, error = await assert_status(response, web.HTTPOk) + assert not error + list_fmds = parse_obj_as(list[FileMetaDataGet], data) + assert len(list_fmds) == 1 + assert list_fmds[0].file_id == file_id + assert list_fmds[0].file_size == file.stat().st_size + # search again with part of the file uuid shall return the same + url.update_query(startswith=file_id[0:5]) + response = await client.post(f"{url}") + data, error = await assert_status(response, web.HTTPOk) + assert not error + list_fmds = parse_obj_as(list[FileMetaDataGet], data) + assert len(list_fmds) == 1 + assert list_fmds[0].file_id == file_id + assert list_fmds[0].file_size == file.stat().st_size + # search again with some other stuff shall return empty + url = url.update_query(startswith="Iamlookingforsomethingthatdoesnotexist") + response = await client.post(f"{url}") + data, error = await assert_status(response, web.HTTPOk) + assert not error + list_fmds = parse_obj_as(list[FileMetaDataGet], data) + assert not list_fmds diff --git 
a/services/storage/tests/unit/test_models.py b/services/storage/tests/unit/test_models.py new file mode 100644 index 00000000000..c45a0c6b124 --- /dev/null +++ b/services/storage/tests/unit/test_models.py @@ -0,0 +1,73 @@ +import uuid + +import pytest +from models_library.api_schemas_storage import S3BucketName +from models_library.projects import ProjectID +from models_library.projects_nodes_io import NodeID, SimcoreS3FileID, StorageFileID +from pydantic import ValidationError, parse_obj_as +from simcore_service_storage.models import FileMetaData +from simcore_service_storage.simcore_s3_dsm import SimcoreS3DataManager + + +@pytest.mark.parametrize( + "file_id", + ["test", "test/hop", "gogo", "//file.name"], +) +def test_file_id_raises_error(file_id: str): + with pytest.raises(ValidationError): + parse_obj_as(StorageFileID, file_id) + + +@pytest.mark.parametrize( + "file_id", + [ + "1c46752c-b096-11ea-a3c4-02420a00392e/e603724d-4af1-52a1-b866-0d4b792f8c4a/work.zip", + "api/7b6b4e3d-39ae-3559-8765-4f815a49984e/tmpf_qatpzx_!...***", + "api/6f788ad9-0ad8-3d0d-9722-72f08c24a212/output_data.json", + "N:package:ce145b61-7e4f-470b-a113-033653e86d3d", + ], +) +def test_file_id(file_id: str): + parsed_file_id = parse_obj_as(StorageFileID, file_id) + assert parsed_file_id + assert parsed_file_id == file_id + + +def test_fmd_build(): + file_id = parse_obj_as(SimcoreS3FileID, f"api/{uuid.uuid4()}/xx.dat") + fmd = FileMetaData.from_simcore_node( + user_id=12, + file_id=file_id, + bucket=S3BucketName("test-bucket"), + location_id=SimcoreS3DataManager.get_location_id(), + location_name=SimcoreS3DataManager.get_location_name(), + ) + + assert fmd.node_id + assert not fmd.project_id + assert fmd.file_name == "xx.dat" + assert fmd.object_name == file_id + assert fmd.file_uuid == file_id + assert fmd.file_id == file_id + assert fmd.location == SimcoreS3DataManager.get_location_name() + assert fmd.location_id == SimcoreS3DataManager.get_location_id() + assert fmd.bucket_name == "test-bucket" + + file_id = parse_obj_as(SimcoreS3FileID, f"{uuid.uuid4()}/{uuid.uuid4()}/xx.dat") + fmd = FileMetaData.from_simcore_node( + user_id=12, + file_id=file_id, + bucket=S3BucketName("test-bucket"), + location_id=SimcoreS3DataManager.get_location_id(), + location_name=SimcoreS3DataManager.get_location_name(), + ) + + assert fmd.node_id == NodeID(file_id.split("/")[1]) + assert fmd.project_id == ProjectID(file_id.split("/")[0]) + assert fmd.file_name == "xx.dat" + assert fmd.object_name == file_id + assert fmd.file_uuid == file_id + assert fmd.file_id == file_id + assert fmd.location == SimcoreS3DataManager.get_location_name() + assert fmd.location_id == SimcoreS3DataManager.get_location_id() + assert fmd.bucket_name == "test-bucket" diff --git a/services/storage/tests/unit/test_rest.py b/services/storage/tests/unit/test_rest.py deleted file mode 100644 index ce5a594536b..00000000000 --- a/services/storage/tests/unit/test_rest.py +++ /dev/null @@ -1,389 +0,0 @@ -# pylint: disable=protected-access -# pylint: disable=redefined-outer-name -# pylint: disable=unused-argument -# pylint: disable=unused-variable - -import asyncio -import json -import os -import sys -from pathlib import Path -from typing import Any, Callable -from urllib.parse import quote - -import pytest -import simcore_service_storage._meta -from aiohttp import web -from aiohttp.test_utils import TestClient -from aiopg.sa import Engine -from pytest_simcore.helpers.utils_assert import assert_status -from simcore_service_storage.access_layer import AccessRights -from 
simcore_service_storage.app_handlers import HealthCheck -from simcore_service_storage.application import create -from simcore_service_storage.constants import SIMCORE_S3_ID -from simcore_service_storage.dsm import APP_DSM_KEY, DataStorageManager -from simcore_service_storage.models import FileMetaData -from simcore_service_storage.settings import Settings -from tests.helpers.utils_project import clone_project_data -from tests.utils import USER_ID, has_datcore_tokens - -current_dir = Path(sys.argv[0] if __name__ == "__main__" else __file__).resolve().parent - -pytest_simcore_core_services_selection = ["postgres"] -pytest_simcore_ops_services_selection = ["minio", "adminer"] - - -def parse_db(dsm_mockup_db: dict[str, FileMetaData]): - id_name_map = {} - id_file_count = {} - for d in dsm_mockup_db.keys(): - md = dsm_mockup_db[d] - if not md.user_id in id_name_map: - id_name_map[md.user_id] = md.user_name - id_file_count[md.user_id] = 1 - else: - id_file_count[md.user_id] = id_file_count[md.user_id] + 1 - - return id_file_count, id_name_map - - -@pytest.fixture -def app_settings( - aiopg_engine: Engine, - postgres_host_config: dict[str, str], - minio_config: dict[str, Any], - monkeypatch: pytest.MonkeyPatch, -) -> Settings: - monkeypatch.setenv("STORAGE_LOG_LEVEL", "DEBUG") - monkeypatch.setenv("STORAGE_TESTING", "1") - - monkeypatch.setenv("SC_BOOT_MODE", "local-development") - test_app_settings = Settings.create_from_envs() - print(f"{test_app_settings.json(indent=2)=}") - return test_app_settings - - -@pytest.fixture -def client( - event_loop: asyncio.AbstractEventLoop, - aiohttp_client: Callable, - unused_tcp_port_factory: Callable[..., int], - app_settings: Settings, -) -> TestClient: - - app = create(app_settings) - - cli = event_loop.run_until_complete( - aiohttp_client(app, server_kwargs={"port": unused_tcp_port_factory()}) - ) - return cli - - -async def test_health_check(client: TestClient): - resp = await client.get("/v0/") - text = await resp.text() - - assert resp.status == 200, text - - payload = await resp.json() - data, error = tuple(payload.get(k) for k in ("data", "error")) - - assert data - assert not error - - app_health = HealthCheck.parse_obj(data) - assert app_health.name == simcore_service_storage._meta.app_name - assert app_health.version == simcore_service_storage._meta.api_version - - -async def test_locations(client: TestClient): - user_id = USER_ID - - resp = await client.get("/v0/locations?user_id={}".format(user_id)) - - payload = await resp.json() - assert resp.status == 200, str(payload) - - data, error = tuple(payload.get(k) for k in ("data", "error")) - - _locs = 2 if has_datcore_tokens() else 1 - assert len(data) == _locs - assert not error - - -async def test_s3_files_metadata( - client: TestClient, dsm_mockup_db: dict[str, FileMetaData] -): - id_file_count, _id_name_map = parse_db(dsm_mockup_db) - - # list files for every user - for _id in id_file_count: - resp = await client.get("/v0/locations/0/files/metadata?user_id={}".format(_id)) - payload = await resp.json() - assert resp.status == 200, str(payload) - - data, error = tuple(payload.get(k) for k in ("data", "error")) - assert not error - assert len(data) == id_file_count[_id] - - # list files fileterd by uuid - for d in dsm_mockup_db.keys(): - fmd = dsm_mockup_db[d] - assert fmd.project_id - uuid_filter = os.path.join(fmd.project_id, fmd.node_id) - resp = await client.get( - "/v0/locations/0/files/metadata?user_id={}&uuid_filter={}".format( - fmd.user_id, quote(uuid_filter, safe="") - ) - ) - payload 
= await resp.json() - assert resp.status == 200, str(payload) - - data, error = tuple(payload.get(k) for k in ("data", "error")) - assert not error - for d in data: - assert d["file_id"].startswith(uuid_filter) - - -async def test_s3_file_metadata(client, dsm_mockup_db): - # go through all files and get them - for d in dsm_mockup_db.keys(): - fmd = dsm_mockup_db[d] - resp = await client.get( - "/v0/locations/0/files/{}/metadata?user_id={}".format( - quote(fmd.file_uuid, safe=""), fmd.user_id - ) - ) - payload = await resp.json() - assert resp.status == 200, str(payload) - - data, error = tuple(payload.get(k) for k in ("data", "error")) - assert not error - assert data - - -async def test_download_link(client, dsm_mockup_db): - for d in dsm_mockup_db.keys(): - fmd = dsm_mockup_db[d] - resp = await client.get( - "/v0/locations/0/files/{}?user_id={}".format( - quote(fmd.file_uuid, safe=""), fmd.user_id - ) - ) - payload = await resp.json() - assert resp.status == 200, str(payload) - - data, error = tuple(payload.get(k) for k in ("data", "error")) - assert not error - assert data - - -async def test_upload_link(client, dsm_mockup_db): - for d in dsm_mockup_db.keys(): - fmd = dsm_mockup_db[d] - resp = await client.put( - "/v0/locations/0/files/{}?user_id={}".format( - quote(fmd.file_uuid, safe=""), fmd.user_id - ) - ) - payload = await resp.json() - assert resp.status == 200, str(payload) - - data, error = tuple(payload.get(k) for k in ("data", "error")) - assert not error - assert data - - -async def test_delete_file(client, dsm_mockup_db): - id_file_count, _id_name_map = parse_db(dsm_mockup_db) - - for d in dsm_mockup_db.keys(): - fmd = dsm_mockup_db[d] - resp = await client.delete( - "/v0/locations/0/files/{}?user_id={}".format( - quote(fmd.file_uuid, safe=""), fmd.user_id - ) - ) - payload = await resp.json() - assert resp.status == 200, str(payload) - - data, error = tuple(payload.get(k) for k in ("data", "error")) - assert not error - assert not data - - for _id in id_file_count: - resp = await client.get("/v0/locations/0/files/metadata?user_id={}".format(_id)) - payload = await resp.json() - assert resp.status == 200, str(payload) - - data, error = tuple(payload.get(k) for k in ("data", "error")) - assert not error - assert len(data) == 0 - - -async def test_action_check(client): - QUERY = "mguidon" - ACTION = "echo" - FAKE = {"path_value": "one", "query_value": "two", "body_value": {"a": 33, "b": 45}} - - resp = await client.post(f"/v0/check/{ACTION}?data={QUERY}", json=FAKE) - payload = await resp.json() - data, error = tuple(payload.get(k) for k in ("data", "error")) - - assert resp.status == 200, str(payload) - assert data - assert not error - - # TODO: validate response against specs - - assert data["path_value"] == ACTION - assert data["query_value"] == QUERY - - -def get_project_with_data() -> dict[str, Any]: - projects = [] - with open(current_dir / "../data/projects_with_data.json") as fp: - projects = json.load(fp) - - # TODO: add schema validation - return projects - - -@pytest.fixture -def mock_datcore_download(mocker, client): - # Use to mock downloading from DATCore - async def _fake_download_to_file_or_raise(session, url, dest_path): - print(f"Faking download: {url} -> {dest_path}") - Path(dest_path).write_text("FAKE: test_create_and_delete_folders_from_project") - - mocker.patch( - "simcore_service_storage.dsm.download_to_file_or_raise", - side_effect=_fake_download_to_file_or_raise, - ) - - dsm = client.app[APP_DSM_KEY] - assert dsm - assert isinstance(dsm, 
DataStorageManager) - - async def mock_download_link_datcore(*args, **kwargs): - return ["https://httpbin.org/image", "foo.txt"] - - mocker.patch.object(dsm, "download_link_datcore", mock_download_link_datcore) - - -@pytest.fixture -def mock_get_project_access_rights(mocker) -> None: - # NOTE: this avoid having to inject project in database - for module in ("dsm", "access_layer"): - mock = mocker.patch( - f"simcore_service_storage.{module}.get_project_access_rights" - ) - mock.return_value.set_result(AccessRights.all()) - - -async def _create_and_delete_folders_from_project( - project: dict[str, Any], client: TestClient -): - destination_project, nodes_map = clone_project_data(project) - - # CREATING - assert client.app - url = ( - client.app.router["copy_folders_from_project"].url_for().with_query(user_id="1") - ) - resp = await client.post( - f"{url}", - json={ - "source": project, - "destination": destination_project, - "nodes_map": nodes_map, - }, - ) - - data, _error = await assert_status(resp, expected_cls=web.HTTPCreated) - - # data should be equal to the destination project, and all store entries should point to simcore.s3 - for key in data: - if key != "workbench": - assert data[key] == destination_project[key] - else: - for _node_id, node in data[key].items(): - if "outputs" in node: - for _o_id, o in node["outputs"].items(): - if "store" in o: - assert o["store"] == SIMCORE_S3_ID - - # DELETING - project_id = data["uuid"] - url = ( - client.app.router["delete_folders_of_project"] - .url_for(folder_id=project_id) - .with_query(user_id="1") - ) - resp = await client.delete(f"{url}") - - await assert_status(resp, expected_cls=web.HTTPNoContent) - - -@pytest.mark.parametrize( - "project_name,project", [(prj["name"], prj) for prj in get_project_with_data()] -) -async def test_create_and_delete_folders_from_project( - client: TestClient, - dsm_mockup_db: dict[str, FileMetaData], - project_name: str, - project: dict[str, Any], - mock_get_project_access_rights, - mock_datcore_download, -): - source_project = project - await _create_and_delete_folders_from_project(source_project, client) - - -@pytest.mark.parametrize( - "project_name,project", [(prj["name"], prj) for prj in get_project_with_data()] -) -async def test_create_and_delete_folders_from_project_burst( - client, - dsm_mockup_db, - project_name, - project, - mock_get_project_access_rights, - mock_datcore_download, -): - source_project = project - - await asyncio.gather( - *[ - _create_and_delete_folders_from_project(source_project, client) - for _ in range(100) - ] - ) - - -async def test_s3_datasets_metadata(client: TestClient): - assert client.app - url = ( - client.app.router["get_datasets_metadata"] - .url_for(location_id=str(SIMCORE_S3_ID)) - .with_query(user_id="21") - ) - resp = await client.get(f"{url}") - payload = await resp.json() - assert resp.status == 200, str(payload) - data, error = tuple(payload.get(k) for k in ("data", "error")) - assert not error - - -async def test_s3_files_datasets_metadata(client: TestClient): - assert client.app - url = ( - client.app.router["get_files_metadata_dataset"] - .url_for(location_id=str(SIMCORE_S3_ID), dataset_id="aa") - .with_query(user_id="21") - ) - resp = await client.get(f"{url}") - payload = await resp.json() - assert resp.status == 200, str(payload) - data, error = tuple(payload.get(k) for k in ("data", "error")) - assert not error diff --git a/services/storage/tests/unit/test_route_s3_access.py b/services/storage/tests/unit/test_route_s3_access.py deleted file mode 
100644 index 6c8c46124d5..00000000000 --- a/services/storage/tests/unit/test_route_s3_access.py +++ /dev/null @@ -1,54 +0,0 @@ -# pylint: disable=redefined-outer-name -# pylint: disable=unused-argument -# pylint: disable=unused-variable - -import asyncio -from typing import Callable - -import pytest -from aiohttp import web -from aiohttp.test_utils import TestClient -from pytest_simcore.helpers.utils_assert import assert_status -from settings_library.s3 import S3Settings -from simcore_service_storage.application import create -from simcore_service_storage.settings import Settings - -pytest_simcore_core_services_selection = ["postgres"] -pytest_simcore_ops_services_selection = ["minio", "adminer"] - - -@pytest.fixture -def app_settings( - aiopg_engine, postgres_host_config: dict[str, str], minio_config -) -> Settings: - test_app_settings = Settings.create_from_envs() - print(f"{test_app_settings.json(indent=2)=}") - return test_app_settings - - -@pytest.fixture -def client( - event_loop: asyncio.AbstractEventLoop, - aiohttp_client: Callable, - unused_tcp_port_factory: Callable[..., int], - app_settings: Settings, -) -> TestClient: - app = create(app_settings) - return event_loop.run_until_complete( - aiohttp_client(app, server_kwargs={"port": unused_tcp_port_factory()}) - ) - - -async def test_simcore_s3_access_returns_default(client: TestClient): - assert client.app - url = ( - client.app.router["get_or_create_temporary_s3_access"] - .url_for() - .with_query(user_id=1) - ) - response = await client.post(f"{url}") - data, error = await assert_status(response, web.HTTPOk) - assert not error - assert data - received_settings = S3Settings.parse_obj(data) - assert received_settings diff --git a/services/storage/tests/unit/test_s3.py b/services/storage/tests/unit/test_s3.py index c30986620c5..72c87fbac32 100644 --- a/services/storage/tests/unit/test_s3.py +++ b/services/storage/tests/unit/test_s3.py @@ -1,17 +1,44 @@ -# pylint: disable=protected-access +# pylint: disable=redefined-outer-name +# pylint: disable=unused-argument +# pylint: disable=unused-variable + import pytest -from simcore_service_storage import s3 - - -@pytest.mark.parametrize( - "url, expected", - [ - ("https://ceph.com", "ceph.com"), - ("http://ciao.com", "ciao.com"), - ("http://local.address:8012", "local.address:8012"), - ("https://remote.stragen.com:4432", "remote.stragen.com:4432"), - ], -) -def test_minio_client_endpint(url: str, expected: str) -> None: - assert s3._minio_client_endpint(url) == expected +from aiohttp.test_utils import TestClient +from simcore_service_storage.s3 import get_s3_client +from simcore_service_storage.settings import Settings + + +@pytest.fixture(params=[True, False]) +def enable_s3(request: pytest.FixtureRequest) -> bool: + return request.param # type: ignore + + +@pytest.fixture +def mock_config( + mocked_s3_server_envs, monkeypatch: pytest.MonkeyPatch, enable_s3: bool +): + # NOTE: override services/storage/tests/conftest.py::mock_config + monkeypatch.setenv("STORAGE_POSTGRES", "null") + if not enable_s3: + # disable S3 + monkeypatch.setenv("STORAGE_S3", "null") + + +async def test_s3_client(enable_s3: bool, app_settings: Settings, client: TestClient): + assert client.app + if enable_s3: + assert app_settings.STORAGE_S3 + s3_client = get_s3_client(client.app) + assert s3_client + + response = await s3_client.client.list_buckets() + assert response + assert "Buckets" in response + assert len(response["Buckets"]) == 1 + assert "Name" in response["Buckets"][0] + assert 
response["Buckets"][0]["Name"] == app_settings.STORAGE_S3.S3_BUCKET_NAME + else: + assert not app_settings.STORAGE_S3 + with pytest.raises(KeyError): + get_s3_client(client.app) diff --git a/services/storage/tests/unit/test_s3_client.py b/services/storage/tests/unit/test_s3_client.py new file mode 100644 index 00000000000..fcd3dba965f --- /dev/null +++ b/services/storage/tests/unit/test_s3_client.py @@ -0,0 +1,557 @@ +# pylint: disable=redefined-outer-name +# pylint: disable=unused-argument +# pylint: disable=unused-variable + + +import asyncio +from contextlib import AsyncExitStack +from pathlib import Path +from random import choice +from typing import AsyncIterator, Awaitable, Callable, Final, Optional +from uuid import uuid4 + +import botocore.exceptions +import pytest +from aiohttp import ClientSession +from faker import Faker +from models_library.projects import ProjectID +from models_library.projects_nodes import NodeID +from models_library.projects_nodes_io import SimcoreS3FileID +from pydantic import ByteSize, parse_obj_as +from pytest_simcore.helpers.utils_parametrizations import byte_size_ids +from simcore_service_storage.exceptions import S3BucketInvalidError, S3KeyNotFoundError +from simcore_service_storage.models import S3BucketName +from simcore_service_storage.s3_client import StorageS3Client +from simcore_service_storage.settings import Settings +from tests.helpers.file_utils import ( + parametrized_file_size, + upload_file_to_presigned_link, +) + +DEFAULT_EXPIRATION_SECS: Final[int] = 10 + + +@pytest.fixture +def mock_config(mocked_s3_server_envs, monkeypatch: pytest.MonkeyPatch): + # NOTE: override services/storage/tests/conftest.py::mock_config + monkeypatch.setenv("STORAGE_POSTGRES", "null") + + +async def test_storage_storage_s3_client_creation(app_settings: Settings): + assert app_settings.STORAGE_S3 + async with AsyncExitStack() as exit_stack: + storage_s3_client = await StorageS3Client.create( + exit_stack, app_settings.STORAGE_S3 + ) + assert storage_s3_client + response = await storage_s3_client.client.list_buckets() + assert not response["Buckets"] + with pytest.raises(botocore.exceptions.HTTPClientError): + await storage_s3_client.client.list_buckets() + + +async def _clean_bucket_content( + storage_s3_client: StorageS3Client, bucket: S3BucketName +): + response = await storage_s3_client.client.list_objects_v2(Bucket=bucket) + while response["KeyCount"] > 0: + await storage_s3_client.client.delete_objects( + Bucket=bucket, + Delete={ + "Objects": [ + {"Key": obj["Key"]} for obj in response["Contents"] if "Key" in obj + ] + }, + ) + response = await storage_s3_client.client.list_objects_v2(Bucket=bucket) + + +async def _remove_all_buckets(storage_s3_client: StorageS3Client): + response = await storage_s3_client.client.list_buckets() + bucket_names = [ + bucket["Name"] for bucket in response["Buckets"] if "Name" in bucket + ] + await asyncio.gather( + *( + _clean_bucket_content(storage_s3_client, S3BucketName(bucket)) + for bucket in bucket_names + ) + ) + await asyncio.gather( + *( + storage_s3_client.client.delete_bucket(Bucket=bucket) + for bucket in bucket_names + ) + ) + + +@pytest.fixture +async def storage_s3_client( + app_settings: Settings, +) -> AsyncIterator[StorageS3Client]: + assert app_settings.STORAGE_S3 + async with AsyncExitStack() as exit_stack: + storage_s3_client = await StorageS3Client.create( + exit_stack, app_settings.STORAGE_S3 + ) + # check that no bucket is lying around + assert storage_s3_client + response = await 
storage_s3_client.client.list_buckets() + assert not response[ + "Buckets" + ], f"for testing purposes, there should be no bucket lying around! {response=}" + yield storage_s3_client + # cleanup + await _remove_all_buckets(storage_s3_client) + + +async def test_create_bucket(storage_s3_client: StorageS3Client, faker: Faker): + response = await storage_s3_client.client.list_buckets() + assert not response["Buckets"] + bucket = faker.pystr() + await storage_s3_client.create_bucket(bucket) + response = await storage_s3_client.client.list_buckets() + assert response["Buckets"] + assert len(response["Buckets"]) == 1 + assert "Name" in response["Buckets"][0] + assert response["Buckets"][0]["Name"] == bucket + # now we create the bucket again, it should silently work even if it exists already + await storage_s3_client.create_bucket(bucket) + response = await storage_s3_client.client.list_buckets() + assert response["Buckets"] + assert len(response["Buckets"]) == 1 + assert "Name" in response["Buckets"][0] + assert response["Buckets"][0]["Name"] == bucket + + +@pytest.fixture +async def storage_s3_bucket( + storage_s3_client: StorageS3Client, faker: Faker +) -> AsyncIterator[str]: + response = await storage_s3_client.client.list_buckets() + assert not response["Buckets"] + bucket_name = faker.pystr() + await storage_s3_client.create_bucket(bucket_name) + response = await storage_s3_client.client.list_buckets() + assert response["Buckets"] + assert bucket_name in [ + bucket_struct.get("Name") for bucket_struct in response["Buckets"] + ], f"failed creating {bucket_name}" + + yield bucket_name + # cleanup the bucket + await _clean_bucket_content(storage_s3_client, bucket_name) + # remove bucket + await storage_s3_client.client.delete_bucket(Bucket=bucket_name) + response = await storage_s3_client.client.list_buckets() + assert bucket_name not in [ + bucket_struct.get("Name") for bucket_struct in response["Buckets"] + ], f"{bucket_name} is already in S3, please check why" + + +async def test_create_single_presigned_upload_link( + storage_s3_client: StorageS3Client, + storage_s3_bucket: S3BucketName, + create_file_of_size: Callable[[ByteSize], Path], + create_simcore_file_id: Callable[[ProjectID, NodeID, str], SimcoreS3FileID], +): + file = create_file_of_size(parse_obj_as(ByteSize, "1Mib")) + file_id = create_simcore_file_id(uuid4(), uuid4(), file.name) + presigned_url = await storage_s3_client.create_single_presigned_upload_link( + storage_s3_bucket, file_id, expiration_secs=DEFAULT_EXPIRATION_SECS + ) + assert presigned_url + + await upload_file_to_presigned_link(file, presigned_url) + + # check it is there + s3_metadata = await storage_s3_client.get_file_metadata(storage_s3_bucket, file_id) + assert s3_metadata.size == file.stat().st_size + assert s3_metadata.last_modified + assert s3_metadata.e_tag + + +async def test_create_single_presigned_upload_link_invalid_raises( + storage_s3_client: StorageS3Client, + storage_s3_bucket: S3BucketName, + create_file_of_size: Callable[[ByteSize], Path], + create_simcore_file_id: Callable[[ProjectID, NodeID, str], SimcoreS3FileID], +): + file = create_file_of_size(parse_obj_as(ByteSize, "1Mib")) + file_id = create_simcore_file_id(uuid4(), uuid4(), file.name) + with pytest.raises(S3BucketInvalidError): + await storage_s3_client.create_single_presigned_upload_link( + S3BucketName("pytestinvalidbucket"), + file_id, + expiration_secs=DEFAULT_EXPIRATION_SECS, + ) + + +@pytest.fixture +def upload_file_single_presigned_link( + storage_s3_client: StorageS3Client, +
storage_s3_bucket: S3BucketName, + create_file_of_size: Callable[[ByteSize], Path], +) -> Callable[..., Awaitable[SimcoreS3FileID]]: + async def _uploader(file_id: Optional[SimcoreS3FileID] = None) -> SimcoreS3FileID: + file = create_file_of_size(parse_obj_as(ByteSize, "1Mib")) + if not file_id: + file_id = SimcoreS3FileID(file.name) + presigned_url = await storage_s3_client.create_single_presigned_upload_link( + storage_s3_bucket, file_id, expiration_secs=DEFAULT_EXPIRATION_SECS + ) + assert presigned_url + + await upload_file_to_presigned_link(file, presigned_url) + + # check the object is complete + s3_metadata = await storage_s3_client.get_file_metadata( + storage_s3_bucket, file_id + ) + assert s3_metadata.size == file.stat().st_size + return file_id + + return _uploader + + +async def test_delete_file( + storage_s3_client: StorageS3Client, + storage_s3_bucket: S3BucketName, + upload_file_single_presigned_link: Callable[..., Awaitable[SimcoreS3FileID]], +): + file_id = await upload_file_single_presigned_link() + + # delete the file + await storage_s3_client.delete_file(storage_s3_bucket, file_id) + + # check it is not available + with pytest.raises(S3KeyNotFoundError): + await storage_s3_client.get_file_metadata(storage_s3_bucket, file_id) + + +async def test_delete_file_invalid_raises( + storage_s3_client: StorageS3Client, + storage_s3_bucket: S3BucketName, + create_simcore_file_id: Callable[[ProjectID, NodeID, str], SimcoreS3FileID], + faker: Faker, +): + file_id = create_simcore_file_id(uuid4(), uuid4(), faker.file_name()) + with pytest.raises(S3BucketInvalidError): + await storage_s3_client.delete_file( + S3BucketName("pytestinvalidbucket"), file_id + ) + + # this does not raise + await storage_s3_client.delete_file(storage_s3_bucket, file_id) + + +async def test_delete_files_in_project_node( + storage_s3_client: StorageS3Client, + storage_s3_bucket: S3BucketName, + upload_file_single_presigned_link: Callable[..., Awaitable[SimcoreS3FileID]], + faker: Faker, +): + # we upload files in these paths + project_1 = uuid4() + project_2 = uuid4() + node_1 = uuid4() + node_2 = uuid4() + node_3 = uuid4() + upload_paths = ( + "", + f"{project_1}/", + f"{project_1}/{node_1}/", + f"{project_1}/{node_2}/", + f"{project_1}/{node_2}/", + f"{project_1}/{node_3}/", + f"{project_1}/{node_3}/", + f"{project_1}/{node_3}/", + f"{project_2}/", + f"{project_2}/{node_1}/", + f"{project_2}/{node_2}/", + f"{project_2}/{node_2}/", + f"{project_2}/{node_2}/", + f"{project_2}/{node_2}/", + f"{project_2}/{node_3}/", + f"{project_2}/{node_3}/states/", + f"{project_2}/{node_3}/some_folder_of_sort/", + ) + + uploaded_file_ids = await asyncio.gather( + *( + upload_file_single_presigned_link(file_id=f"{path}{faker.file_name()}") + for path in upload_paths + ) + ) + assert len(uploaded_file_ids) == len(upload_paths) + + async def _assert_deleted(*, deleted_ids: tuple[str, ...]): + for file_id in uploaded_file_ids: + if file_id.startswith(deleted_ids): + with pytest.raises(S3KeyNotFoundError): + await storage_s3_client.get_file_metadata( + storage_s3_bucket, file_id + ) + else: + s3_metadata = await storage_s3_client.get_file_metadata( + storage_s3_bucket, file_id + ) + assert s3_metadata.e_tag + + # now let's delete some files and check they are correctly deleted + await storage_s3_client.delete_files_in_project_node( + storage_s3_bucket, project_1, node_3 + ) + await _assert_deleted(deleted_ids=(f"{project_1}/{node_3}",)) + + # delete some stuff in project 2 + await storage_s3_client.delete_files_in_project_node( + 
storage_s3_bucket, project_2, node_3 + ) + await _assert_deleted( + deleted_ids=( + f"{project_1}/{node_3}", + f"{project_2}/{node_3}", + ) + ) + + # completely delete project 2 + await storage_s3_client.delete_files_in_project_node( + storage_s3_bucket, project_2, None + ) + await _assert_deleted( + deleted_ids=( + f"{project_1}/{node_3}", + f"{project_2}", + ) + ) + + +async def test_delete_files_in_project_node_invalid_raises( + storage_s3_client: StorageS3Client, + storage_s3_bucket: S3BucketName, + upload_file_single_presigned_link: Callable[..., Awaitable[SimcoreS3FileID]], + faker: Faker, +): + with pytest.raises(S3BucketInvalidError): + await storage_s3_client.delete_files_in_project_node( + S3BucketName("pytestinvalidbucket"), uuid4(), uuid4() + ) + # this should not raise + await storage_s3_client.delete_files_in_project_node( + storage_s3_bucket, uuid4(), uuid4() + ) + + +async def test_create_single_presigned_download_link( + storage_s3_client: StorageS3Client, + storage_s3_bucket: S3BucketName, + upload_file_single_presigned_link: Callable[..., Awaitable[SimcoreS3FileID]], + tmp_path: Path, + faker: Faker, +): + file_id = await upload_file_single_presigned_link() + + presigned_url = await storage_s3_client.create_single_presigned_download_link( + storage_s3_bucket, file_id, expiration_secs=DEFAULT_EXPIRATION_SECS + ) + + assert presigned_url + + dest_file = tmp_path / faker.file_name() + # download the file + async with ClientSession() as session: + response = await session.get(presigned_url) + response.raise_for_status() + with dest_file.open("wb") as fp: + fp.write(await response.read()) + assert dest_file.exists() + + s3_metadata = await storage_s3_client.get_file_metadata(storage_s3_bucket, file_id) + assert s3_metadata.e_tag + assert s3_metadata.last_modified + assert dest_file.stat().st_size == s3_metadata.size + + +async def test_create_single_presigned_download_link_invalid_raises( + storage_s3_client: StorageS3Client, + storage_s3_bucket: S3BucketName, + upload_file_single_presigned_link: Callable[..., Awaitable[SimcoreS3FileID]], + create_simcore_file_id: Callable[[ProjectID, NodeID, str], SimcoreS3FileID], + faker: Faker, +): + file_id = await upload_file_single_presigned_link() + + with pytest.raises(S3BucketInvalidError): + await storage_s3_client.create_single_presigned_download_link( + S3BucketName("invalidpytestbucket"), + file_id, + expiration_secs=DEFAULT_EXPIRATION_SECS, + ) + wrong_file_id = create_simcore_file_id(uuid4(), uuid4(), faker.file_name()) + with pytest.raises(S3KeyNotFoundError): + await storage_s3_client.create_single_presigned_download_link( + storage_s3_bucket, wrong_file_id, expiration_secs=DEFAULT_EXPIRATION_SECS + ) + + +@pytest.fixture +async def upload_file_with_aioboto3_managed_transfer( + storage_s3_client: StorageS3Client, + storage_s3_bucket: S3BucketName, + faker: Faker, + create_file_of_size: Callable[[ByteSize, Optional[str]], Path], + create_simcore_file_id: Callable[[ProjectID, NodeID, str], SimcoreS3FileID], +) -> Callable[[ByteSize], Awaitable[tuple[Path, SimcoreS3FileID]]]: + async def _uploader(file_size: ByteSize) -> tuple[Path, SimcoreS3FileID]: + file_name = faker.file_name() + file = create_file_of_size(file_size, file_name) + file_id = create_simcore_file_id(uuid4(), uuid4(), file_name) + response = await storage_s3_client.upload_file(storage_s3_bucket, file, file_id) + # there is no response from aioboto3... 
+ assert not response + # check the object is uploaded + response = await storage_s3_client.client.list_objects_v2( + Bucket=storage_s3_bucket + ) + assert "Contents" in response + list_objects = response["Contents"] + assert len(list_objects) >= 1 + # find our object now + for s3_obj in list_objects: + if s3_obj.get("Key") == file_id: + # found it! + assert "ETag" in s3_obj + assert "Key" in s3_obj + assert s3_obj["Key"] == file_id + assert "Size" in s3_obj + assert s3_obj["Size"] == file.stat().st_size + return file, file_id + assert False, "Object was not properly uploaded!" + + return _uploader + + +@pytest.mark.parametrize( + "file_size", + [parametrized_file_size("500Mib")], + ids=byte_size_ids, +) +async def test_upload_file( + file_size: ByteSize, + upload_file_with_aioboto3_managed_transfer: Callable[ + [ByteSize], Awaitable[tuple[Path, SimcoreS3FileID]] + ], +): + await upload_file_with_aioboto3_managed_transfer(file_size) + + +async def test_upload_file_invalid_raises( + storage_s3_client: StorageS3Client, + create_file_of_size: Callable[[ByteSize, Optional[str]], Path], + create_simcore_file_id: Callable[[ProjectID, NodeID, str], SimcoreS3FileID], + faker: Faker, +): + file = create_file_of_size(ByteSize(10), None) + file_id = create_simcore_file_id(uuid4(), uuid4(), file.name) + with pytest.raises(S3BucketInvalidError): + await storage_s3_client.upload_file( + S3BucketName("pytestinvalidbucket"), file, file_id + ) + + +@pytest.mark.parametrize( + "file_size", + [parametrized_file_size("500Mib")], + ids=byte_size_ids, +) +async def test_copy_file( + file_size: ByteSize, + upload_file_with_aioboto3_managed_transfer: Callable[ + [ByteSize], Awaitable[tuple[Path, SimcoreS3FileID]] + ], + storage_s3_client: StorageS3Client, + storage_s3_bucket: S3BucketName, + create_simcore_file_id: Callable[[ProjectID, NodeID, str], SimcoreS3FileID], + faker: Faker, +): + src_file, src_file_uuid = await upload_file_with_aioboto3_managed_transfer( + file_size + ) + dst_file_name = faker.file_name() + dst_file_uuid = create_simcore_file_id(uuid4(), uuid4(), dst_file_name) + await storage_s3_client.copy_file(storage_s3_bucket, src_file_uuid, dst_file_uuid) + + # check the object is uploaded + response = await storage_s3_client.client.list_objects_v2(Bucket=storage_s3_bucket) + assert "Contents" in response + list_objects = response["Contents"] + assert len(list_objects) == 2 + + list_file_uuids = [src_file_uuid, dst_file_uuid] + for s3_obj in list_objects: + assert "ETag" in s3_obj + assert "Key" in s3_obj + assert s3_obj["Key"] in list_file_uuids + list_file_uuids.pop(list_file_uuids.index(s3_obj["Key"])) + assert "Size" in s3_obj + assert s3_obj["Size"] == src_file.stat().st_size + + +async def test_copy_file_invalid_raises( + upload_file_with_aioboto3_managed_transfer: Callable[ + [ByteSize], Awaitable[tuple[Path, SimcoreS3FileID]] + ], + storage_s3_client: StorageS3Client, + storage_s3_bucket: S3BucketName, + create_simcore_file_id: Callable[[ProjectID, NodeID, str], SimcoreS3FileID], + faker: Faker, +): + _, src_file_uuid = await upload_file_with_aioboto3_managed_transfer(ByteSize(1024)) + dst_file_name = faker.file_name() + dst_file_uuid = create_simcore_file_id(uuid4(), uuid4(), dst_file_name) + with pytest.raises(S3BucketInvalidError): + await storage_s3_client.copy_file( + S3BucketName("pytestinvalidbucket"), src_file_uuid, dst_file_uuid + ) + with pytest.raises(S3KeyNotFoundError): + await storage_s3_client.copy_file( + storage_s3_bucket, SimcoreS3FileID("missing_file_uuid"), dst_file_uuid 
+ ) + + +async def test_list_files( + storage_s3_client: StorageS3Client, + storage_s3_bucket: S3BucketName, + upload_file_with_aioboto3_managed_transfer: Callable[ + [ByteSize], Awaitable[tuple[Path, SimcoreS3FileID]] + ], +): + list_files = await storage_s3_client.list_files(storage_s3_bucket, prefix="") + assert list_files == [] + + NUM_FILES = 12 + FILE_SIZE = parse_obj_as(ByteSize, "11Mib") + uploaded_files: list[tuple[Path, SimcoreS3FileID]] = [] + for _ in range(NUM_FILES): + uploaded_files.append( + await upload_file_with_aioboto3_managed_transfer(FILE_SIZE) + ) + + list_files = await storage_s3_client.list_files(storage_s3_bucket, prefix="") + assert len(list_files) == NUM_FILES + # test with prefix + file, file_id = choice(uploaded_files) + list_files = await storage_s3_client.list_files(storage_s3_bucket, prefix=file_id) + assert len(list_files) == 1 + assert list_files[0].file_id == file_id + assert list_files[0].size == file.stat().st_size + + +async def test_list_files_invalid_bucket_raises( + storage_s3_client: StorageS3Client, +): + with pytest.raises(S3BucketInvalidError): + await storage_s3_client.list_files( + S3BucketName("pytestinvalidbucket"), prefix="" + ) diff --git a/services/storage/tests/unit/test_s3_utils.py b/services/storage/tests/unit/test_s3_utils.py new file mode 100644 index 00000000000..d0beed84873 --- /dev/null +++ b/services/storage/tests/unit/test_s3_utils.py @@ -0,0 +1,34 @@ +# pylint: disable=redefined-outer-name +# pylint: disable=redefined-outer-name +# pylint: disable=unused-argument +# pylint: disable=unused-variable + + +import pytest +from pydantic import ByteSize, parse_obj_as +from pytest_simcore.helpers.utils_parametrizations import byte_size_ids +from simcore_service_storage.s3_utils import compute_num_file_chunks + + +@pytest.mark.xfail(reason="will work soon") +@pytest.mark.parametrize( + "file_size, expected_num_chunks, expected_chunk_size", + [ + (parse_obj_as(ByteSize, "5Mib"), 1, parse_obj_as(ByteSize, "10Mib")), + (parse_obj_as(ByteSize, "10Mib"), 1, parse_obj_as(ByteSize, "10Mib")), + (parse_obj_as(ByteSize, "20Mib"), 2, parse_obj_as(ByteSize, "10Mib")), + (parse_obj_as(ByteSize, "50Mib"), 5, parse_obj_as(ByteSize, "10Mib")), + (parse_obj_as(ByteSize, "150Mib"), 15, parse_obj_as(ByteSize, "10Mib")), + (parse_obj_as(ByteSize, "550Mib"), 55, parse_obj_as(ByteSize, "10Mib")), + (parse_obj_as(ByteSize, "560Gib"), 5735, parse_obj_as(ByteSize, "100Mib")), + (parse_obj_as(ByteSize, "5Tib"), 8739, parse_obj_as(ByteSize, "600Mib")), + (parse_obj_as(ByteSize, "15Tib"), 7680, parse_obj_as(ByteSize, "2Gib")), + ], + ids=byte_size_ids, +) +def test_compute_num_file_chunks( + file_size: ByteSize, expected_num_chunks: int, expected_chunk_size: ByteSize +): + num_chunks, chunk_size = compute_num_file_chunks(file_size) + assert num_chunks == expected_num_chunks + assert chunk_size == expected_chunk_size diff --git a/services/storage/tests/unit/test_temporary_handlers_utils.py b/services/storage/tests/unit/test_temporary_handlers_utils.py deleted file mode 100644 index faabfee271c..00000000000 --- a/services/storage/tests/unit/test_temporary_handlers_utils.py +++ /dev/null @@ -1,116 +0,0 @@ -from typing import Any - -import pytest -from models_library.api_schemas_storage import DatasetMetaDataGet, FileMetaDataGet -from simcore_service_storage.models import DatasetMetaData, FileMetaData, FileMetaDataEx -from simcore_service_storage.temporary_handlers_utils import ( - convert_to_api_dataset, - convert_to_api_fmd, -) - - -@pytest.mark.parametrize( - 
"internal_dataset_metadata, expected_dataset_metadata_get", - [ - (DatasetMetaData(**d), DatasetMetaDataGet.parse_obj(d)) - for d in DatasetMetaDataGet.Config.schema_extra["examples"] - ], -) -def test_convert_to_api_dataset( - internal_dataset_metadata: DatasetMetaData, - expected_dataset_metadata_get: DatasetMetaDataGet, -): - assert ( - convert_to_api_dataset(internal_dataset_metadata) - == expected_dataset_metadata_get - ) - - -_EXAMPLE_OLD_FILEMETADATA_EX = [ - # simcore S3 - { - "bucket_name": "master-simcore", - "created_at": "2022-05-30 15:14:00.005624", - "display_file_path": "New Study (1)/Docker Swarm Services - Grafana.pptx/Docker Swarm Services - Grafana.pptx", - "entity_tag": "8ee3ebd91fe8b873695defdc609b16f2", - "file_id": "1b3623fa-e02b-11ec-8777-02420a0194f6/dbc18a00-67bd-4064-92d4-0d696ed1982c/Docker Swarm Services - Grafana.pptx", - "file_name": "Docker Swarm Services - Grafana.pptx", - "file_size": 725842, - "file_uuid": "1b3623fa-e02b-11ec-8777-02420a0194f6/dbc18a00-67bd-4064-92d4-0d696ed1982c/Docker Swarm Services - Grafana.pptx", - "is_soft_link": False, - "last_modified": "2022-05-30 15:14:00+00", - "location_id": "0", - "location": "simcore.s3", - "node_id": "dbc18a00-67bd-4064-92d4-0d696ed1982c", - "node_name": "Docker Swarm Services - Grafana.pptx", - "object_name": "1b3623fa-e02b-11ec-8777-02420a0194f6/dbc18a00-67bd-4064-92d4-0d696ed1982c/Docker Swarm Services - Grafana.pptx", - "parent_id": "1b3623fa-e02b-11ec-8777-02420a0194f6/dbc18a00-67bd-4064-92d4-0d696ed1982c", - "project_id": "1b3623fa-e02b-11ec-8777-02420a0194f6", - "project_name": "New Study (1)", - "raw_file_path": "1b3623fa-e02b-11ec-8777-02420a0194f6/dbc18a00-67bd-4064-92d4-0d696ed1982c/Docker Swarm Services - Grafana.pptx", - "user_id": "3", - "user_name": None, - }, - # datcore - { - "bucket_name": "N:dataset:8fac2b22-b5c6-44f8-bf30-87a6c0bebad5", - "created_at": "2020-05-28T15:48:34.386302+00:00", - "display_file_path": "templatetemplate.json", - "entity_tag": None, - "file_id": "N:package:ce145b61-7e4f-470b-a113-033653e86d3d", - "file_name": "templatetemplate.json", - "file_size": 238, - "file_uuid": "Kember Cardiac Nerve Model/templatetemplate.json", - "is_soft_link": False, - "last_modified": "2020-05-28T15:48:37.507387+00:00", - "location_id": 1, - "location": "datcore", - "node_id": None, - "node_name": None, - "object_name": "Kember Cardiac Nerve Model/templatetemplate.json", - "parent_id": "", - "project_id": None, - "project_name": None, - "raw_file_path": None, - "user_id": None, - "user_name": None, - }, -] - -_KEYS_IN_NEW_INTERFACE = ( - "created_at", - "entity_tag", - "file_id", - "file_name", - "file_size", - "file_uuid", - "is_soft_link", - "last_modified", - "location_id", - "node_name", - "project_name", -) - - -def _test_parameters() -> list[tuple[FileMetaDataEx, FileMetaDataGet]]: - params = [] - for fmd_dict in _EXAMPLE_OLD_FILEMETADATA_EX: - parent_id = fmd_dict.pop("parent_id") - fmd = FileMetaData(**fmd_dict) - fmd_ex = FileMetaDataEx(fmd=fmd, parent_id=parent_id) - filtered_dict = { - key: fmd_dict[key] for key in fmd_dict if key in _KEYS_IN_NEW_INTERFACE - } - api_fmd = FileMetaDataGet.parse_obj(filtered_dict) - params.append((fmd_ex, api_fmd)) - return params - - -@pytest.mark.parametrize( - "internal_fmd, expected_api_fmd", - _test_parameters(), -) -def test_convert_to_api_fmd( - internal_fmd: dict[str, Any], expected_api_fmd: FileMetaDataGet -): - assert convert_to_api_fmd(internal_fmd) == expected_api_fmd diff --git a/services/storage/tests/unit/test_utils.py 
b/services/storage/tests/unit/test_utils.py index f376342b35f..e487c2a1c56 100644 --- a/services/storage/tests/unit/test_utils.py +++ b/services/storage/tests/unit/test_utils.py @@ -1,10 +1,17 @@ +import datetime import random from pathlib import Path -from typing import Optional +from typing import Callable, Optional +from uuid import uuid4 import pytest from aiohttp import ClientSession -from simcore_service_storage.models import FileMetaData +from faker import Faker +from models_library.projects import ProjectID +from models_library.projects_nodes_io import NodeID, SimcoreS3FileID +from pydantic import ByteSize, parse_obj_as +from simcore_service_storage.models import ETag, FileMetaData, S3BucketName +from simcore_service_storage.simcore_s3_dsm import SimcoreS3DataManager from simcore_service_storage.utils import ( MAX_CHUNK_SIZE, download_to_file_or_raise, @@ -27,20 +34,39 @@ async def test_download_files(tmpdir): @pytest.mark.parametrize( - "file_size, entity_tag, expected_validity", + "file_size, entity_tag, upload_expires_at, expected_validity", [ - (None, None, False), - (-1, None, False), - (0, None, False), - (random.randint(1, 1000000), None, False), - (None, "some_valid_entity_tag", False), - (-1, "some_valid_entity_tag", False), - (0, "some_valid_entity_tag", False), - (random.randint(1, 1000000), "some_valid_entity_tag", True), + (-1, None, None, False), + (0, None, None, False), + (random.randint(1, 1000000), None, None, False), + (-1, "some_valid_entity_tag", None, False), + (0, "some_valid_entity_tag", None, False), + ( + random.randint(1, 1000000), + "some_valid_entity_tag", + datetime.datetime.utcnow(), + False, + ), + (random.randint(1, 1000000), "some_valid_entity_tag", None, True), ], ) def test_file_entry_valid( - file_size: Optional[int], entity_tag: Optional[str], expected_validity: bool + file_size: ByteSize, + entity_tag: Optional[ETag], + upload_expires_at: Optional[datetime.datetime], + expected_validity: bool, + create_simcore_file_id: Callable[[ProjectID, NodeID, str], SimcoreS3FileID], + faker: Faker, ): - file_meta_data = FileMetaData(file_size=file_size, entity_tag=entity_tag) - assert is_file_entry_valid(file_meta_data) == expected_validity + file_id = create_simcore_file_id(uuid4(), uuid4(), faker.file_name()) + fmd = FileMetaData.from_simcore_node( + user_id=faker.pyint(min_value=1), + file_id=file_id, + bucket=S3BucketName("pytest-bucket"), + location_id=SimcoreS3DataManager.get_location_id(), + location_name=SimcoreS3DataManager.get_location_name(), + ) + fmd.file_size = parse_obj_as(ByteSize, file_size) + fmd.entity_tag = entity_tag + fmd.upload_expires_at = upload_expires_at + assert is_file_entry_valid(fmd) == expected_validity diff --git a/services/storage/tests/unit/test_utils_handlers.py b/services/storage/tests/unit/test_utils_handlers.py new file mode 100644 index 00000000000..3149c283cd4 --- /dev/null +++ b/services/storage/tests/unit/test_utils_handlers.py @@ -0,0 +1,61 @@ +# pylint: disable=redefined-outer-name +# pylint: disable=unused-argument +# pylint: disable=unused-variable + + +import pytest +from aiohttp import web +from aiohttp.typedefs import Handler +from pydantic import BaseModel, ValidationError +from pytest_mock import MockerFixture +from servicelib.aiohttp.aiopg_utils import DBAPIError +from simcore_service_storage.db_access_layer import InvalidFileIdentifier +from simcore_service_storage.exceptions import ( + FileAccessRightError, + FileMetaDataNotFoundError, + ProjectAccessRightError, + ProjectNotFoundError, + 
S3KeyNotFoundError, +) +from simcore_service_storage.utils_handlers import dsm_exception_handler + + +@pytest.fixture() +async def raising_handler( + mocker: MockerFixture, handler_exception: type[Exception] +) -> Handler: + mock = mocker.patch("aiohttp.typedefs.Handler", autospec=True) + mock.side_effect = handler_exception + return mock + + +@pytest.fixture +def mock_request(mocker: MockerFixture) -> web.Request: + mock = mocker.patch("aiohttp.web.Request", autospec=True) + return mock + + +class FakeErrorModel(BaseModel): + dummy: int = 1 + + +@pytest.mark.parametrize( + "handler_exception, expected_web_response", + [ + (InvalidFileIdentifier(identifier="x"), web.HTTPUnprocessableEntity), + (FileMetaDataNotFoundError(file_id="x"), web.HTTPNotFound), + (S3KeyNotFoundError(key="x", bucket="x"), web.HTTPNotFound), + (ProjectNotFoundError(project_id="x"), web.HTTPNotFound), + (FileAccessRightError(file_id="x", access_right="x"), web.HTTPForbidden), + (ProjectAccessRightError(project_id="x", access_right="x"), web.HTTPForbidden), + (ValidationError(errors=[], model=FakeErrorModel), web.HTTPUnprocessableEntity), + (DBAPIError, web.HTTPServiceUnavailable), + ], +) +async def test_dsm_exception_handler( + mock_request: web.Request, + raising_handler: Handler, + expected_web_response: type[web.HTTPClientError], +): + with pytest.raises(expected_web_response): + await dsm_exception_handler(mock_request, raising_handler) diff --git a/services/storage/tests/utils.py b/services/storage/tests/utils.py deleted file mode 100644 index e9e23bff044..00000000000 --- a/services/storage/tests/utils.py +++ /dev/null @@ -1,111 +0,0 @@ -import logging -import os -import sys -from pathlib import Path - -import pandas -import requests -import sqlalchemy as sa -from simcore_service_storage.models import ( - FileMetaData, - file_meta_data, - groups, - projects, - user_to_groups, - users, -) - -log = logging.getLogger(__name__) - - -DATABASE = "test" -USER = "admin" -PASS = "admin" - -ACCESS_KEY = "12345678" -SECRET_KEY = "12345678" - -BUCKET_NAME = "simcore-testing-bucket" -USER_ID = "0" - -PG_TABLES_NEEDED_FOR_STORAGE = [ - user_to_groups, - file_meta_data, - projects, - users, - groups, -] - -CURRENT_DIR = Path(sys.argv[0] if __name__ == "__main__" else __file__).resolve().parent -DATA_DIR = CURRENT_DIR / "data" - - -def has_datcore_tokens() -> bool: - # TODO: activate tests against BF services in the CI. 
- # - # CI shall add BF_API_KEY, BF_API_SECRET environs as secrets - # - if not os.environ.get("BF_API_KEY") or not os.environ.get("BF_API_SECRET"): - return False - return True - - -def is_responsive(url, code=200) -> bool: - """Check if something responds to ``url`` syncronously""" - try: - response = requests.get(url) - if response.status_code == code: - return True - except requests.exceptions.RequestException as _e: - pass - - return False - - -def insert_metadata(url: str, fmd: FileMetaData): - # FIXME: E1120:No value for argument 'dml' in method call - # pylint: disable=E1120 - ins = file_meta_data.insert().values( - file_uuid=fmd.file_uuid, - location_id=fmd.location_id, - location=fmd.location, - bucket_name=fmd.bucket_name, - object_name=fmd.object_name, - project_id=fmd.project_id, - project_name=fmd.project_name, - node_id=fmd.node_id, - node_name=fmd.node_name, - file_name=fmd.file_name, - user_id=fmd.user_id, - user_name=fmd.user_name, - file_id=fmd.file_id, - raw_file_path=fmd.raw_file_path, - display_file_path=fmd.display_file_path, - created_at=fmd.created_at, - last_modified=fmd.last_modified, - file_size=fmd.file_size, - entity_tag=fmd.entity_tag, - ) - - engine = sa.create_engine(url) - try: - conn = engine.connect() - conn.execute(ins) - finally: - engine.dispose() - - -def fill_tables_from_csv_files(url): - engine = None - - try: - engine = sa.create_engine(url) - for table in ["users", "file_meta_data", "projects"]: - with open(DATA_DIR / f"{table}.csv", "r") as file: - data_df = pandas.read_csv(file) - data_df.to_sql( - table, con=engine, index=False, index_label="id", if_exists="append" - ) - finally: - if engine is not None: - engine.dispose() diff --git a/services/web/server/src/simcore_service_webserver/exporter/formatters/models.py b/services/web/server/src/simcore_service_webserver/exporter/formatters/models.py index 0e535905710..b8c069353f4 100644 --- a/services/web/server/src/simcore_service_webserver/exporter/formatters/models.py +++ b/services/web/server/src/simcore_service_webserver/exporter/formatters/models.py @@ -1,18 +1,18 @@ import uuid from datetime import datetime from pathlib import Path -from typing import Callable, Dict, List, Union +from typing import Callable, Union import aiofiles from models_library.projects import Project -from models_library.projects_nodes_io import Location, StorageFileID +from models_library.projects_nodes_io import LocationID, StorageFileID from models_library.projects_state import ProjectStatus from pydantic import BaseModel, DirectoryPath, Field, parse_obj_as, validator from ..utils import makedirs from .base_models import BaseLoadingModel -ShuffledData = Dict[str, str] +ShuffledData = dict[str, str] class LinkAndPath2(BaseModel): @@ -22,7 +22,7 @@ class LinkAndPath2(BaseModel): ..., description="temporary directory where all data is stored, to be ignored from serialization", ) - storage_type: Location = Field( + storage_type: LocationID = Field( ..., description="usually 0 for S3 or 1 for Pennsieve", ) @@ -91,7 +91,7 @@ class ManifestFile(BaseLoadingModel): default_factory=datetime.utcnow, ) - attachments: List[str] = Field( + attachments: list[str] = Field( ..., description="list of paths for attachments found in the project directory" ) diff --git a/services/web/server/src/simcore_service_webserver/meta_modeling_projects.py b/services/web/server/src/simcore_service_webserver/meta_modeling_projects.py index a20b08c57a4..6e59564d318 100644 --- 
a/services/web/server/src/simcore_service_webserver/meta_modeling_projects.py +++ b/services/web/server/src/simcore_service_webserver/meta_modeling_projects.py @@ -8,11 +8,10 @@ import logging import re -from typing import List, Tuple from aiohttp import web from aiohttp.typedefs import Handler -from models_library.basic_regex import UUID_RE_BASE +from models_library.basic_regex import UUID_RE from models_library.projects import ProjectID from ._meta import api_version_prefix as VTAG @@ -28,7 +27,7 @@ # SEE https://github.com/ITISFoundation/osparc-simcore/blob/master/services/web/server/src/simcore_service_webserver/api/v0/openapi.yaml#L8563 -URL_PATTERN = re.compile(rf"^\/{VTAG}\/projects\/({UUID_RE_BASE})[\/]{{0,1}}") +URL_PATTERN = re.compile(rf"^\/{VTAG}\/projects\/({UUID_RE})[\/]{{0,1}}") def _match_project_id(request: web.Request): @@ -91,14 +90,14 @@ async def get_runnable_projects_ids( self, request: web.Request, project_uuid: ProjectID, - ) -> List[ProjectID]: + ) -> list[ProjectID]: return await get_runnable_projects_ids(request, project_uuid) async def get_or_create_runnable_projects( self, request: web.Request, project_uuid: ProjectID, - ) -> Tuple[List[ProjectID], List[CommitID]]: + ) -> tuple[list[ProjectID], list[CommitID]]: return await get_or_create_runnable_projects(request, project_uuid) diff --git a/services/web/server/src/simcore_service_webserver/storage_routes.py b/services/web/server/src/simcore_service_webserver/storage_routes.py index dfc02ed2392..609e957604d 100644 --- a/services/web/server/src/simcore_service_webserver/storage_routes.py +++ b/services/web/server/src/simcore_service_webserver/storage_routes.py @@ -4,7 +4,6 @@ """ import logging -from typing import List from aiohttp import web from servicelib.aiohttp import openapi @@ -14,7 +13,7 @@ log = logging.getLogger(__name__) -def create(specs: openapi.Spec) -> List[web.RouteDef]: +def create(specs: openapi.Spec) -> list[web.RouteDef]: # TODO: consider the case in which server creates routes for both v0 and v1!!! # TODO: should this be taken from servers instead? 
BASEPATH = "/v" + specs.info.version.split(".")[0] @@ -64,11 +63,6 @@ def create(specs: openapi.Spec) -> List[web.RouteDef]: operation_id = specs.paths[path].operations["get"].operation_id routes.append(web.get(BASEPATH + path, handle, name=operation_id)) - # TODO: Implements update - # path, handle = '/{location_id}/files/{file_id}/metadata', handlers.update_file_metadata - # operation_id = specs.paths[path].operations['patch'].operation_id - # routes.append( web.patch(BASEPATH+path, handle, name=operation_id) ) - _FILE_PATH = "/storage/locations/{location_id}/files/{file_id}" path, handle = ( _FILE_PATH, diff --git a/services/web/server/tests/integration/01/test_exporter.py b/services/web/server/tests/integration/01/test_exporter.py index 88437d240b3..e25dcfd25d1 100644 --- a/services/web/server/tests/integration/01/test_exporter.py +++ b/services/web/server/tests/integration/01/test_exporter.py @@ -16,17 +16,7 @@ from contextlib import contextmanager from copy import deepcopy from pathlib import Path -from typing import ( - Any, - AsyncIterator, - Awaitable, - Callable, - Dict, - Iterator, - List, - Set, - Tuple, -) +from typing import Any, AsyncIterator, Awaitable, Callable, Iterator from unittest import mock import aiofiles @@ -139,7 +129,7 @@ async def __delete_all_redis_keys__(redis_settings: RedisSettings): def client( event_loop: asyncio.AbstractEventLoop, aiohttp_client: Callable, - app_config: Dict, + app_config: dict, postgres_with_template_db: aiopg.sa.engine.Engine, mock_orphaned_services: mock.Mock, monkeypatch_setenv_from_app_config: Callable, @@ -196,7 +186,7 @@ async def login_user(client): return await log_client_in(client=client, user_data={"role": UserRole.USER.name}) -def get_exported_projects() -> List[Path]: +def get_exported_projects() -> list[Path]: # These files are generated from the front-end # when the formatter be finished exporter_dir = DATA_DIR / "exporter" @@ -210,7 +200,7 @@ def get_exported_projects() -> List[Path]: async def apply_access_rights( aiopg_engine: aiopg.sa.Engine, ) -> AsyncIterator[Callable[..., Awaitable[None]]]: - async def grant_rights_to_services(services: List[Tuple[str, str]]) -> None: + async def grant_rights_to_services(services: list[tuple[str, str]]) -> None: for service_key, service_version in services: metada_data_values = dict( key=service_key, @@ -271,7 +261,7 @@ async def grant_access_rights( @pytest.fixture(scope="session") -def push_services_to_registry(docker_registry: str, node_meta_schema: Dict) -> None: +def push_services_to_registry(docker_registry: str, node_meta_schema: dict) -> None: """Adds a itisfoundation/sleeper in docker registry""" # Used by V1 study _pull_push_service( @@ -300,7 +290,7 @@ def assemble_tmp_file_path(file_name: str) -> Iterator[Path]: async def query_project_from_db( aiopg_engine: aiopg.sa.Engine, project_uuid: str -) -> Dict[str, Any]: +) -> dict[str, Any]: async with aiopg_engine.acquire() as conn: project_result = await conn.execute( projects.select().where(projects.c.uuid == project_uuid) @@ -310,7 +300,7 @@ async def query_project_from_db( return dict(project) -def replace_uuids_with_sequences(original_project: Dict[str, Any]) -> Dict[str, Any]: +def replace_uuids_with_sequences(original_project: dict[str, Any]) -> dict[str, Any]: # first make a copy project = deepcopy(original_project) workbench = project["workbench"] @@ -345,7 +335,7 @@ def replace_uuids_with_sequences(original_project: Dict[str, Any]) -> Dict[str, return project -def dict_with_keys(dict_data: Dict[str, Any], kept_keys: 
Set[str]) -> Dict[str, Any]: +def dict_with_keys(dict_data: dict[str, Any], kept_keys: set[str]) -> dict[str, Any]: modified_dict = {} for key, value in dict_data.items(): if key in kept_keys: @@ -361,8 +351,8 @@ def dict_with_keys(dict_data: Dict[str, Any], kept_keys: Set[str]) -> Dict[str, def dict_without_keys( - dict_data: Dict[str, Any], skipped_keys: Set[str] -) -> Dict[str, Any]: + dict_data: dict[str, Any], skipped_keys: set[str] +) -> dict[str, Any]: modified_dict = {} for key, value in dict_data.items(): @@ -383,8 +373,8 @@ def assert_combined_entires_condition( def extract_original_files_for_node_sequence( - project: Dict[str, Any], normalized_project: Dict[str, Any] -) -> Dict[str, Dict[str, str]]: + project: dict[str, Any], normalized_project: dict[str, Any] +) -> dict[str, dict[str, str]]: """ Extracts path and store from ouput_1 field of each node and returns mapped to the normalized data node keys for simpler comparison @@ -402,12 +392,12 @@ def extract_original_files_for_node_sequence( async def extract_download_links_from_storage( app: aiohttp.web.Application, - original_files: Dict[str, Dict[str, Any]], + original_files: dict[str, dict[str, Any]], user_id: str, -) -> Dict[str, str]: +) -> dict[str, str]: async def _get_mapped_link( seq_key: str, location_id: LocationID, file_id: StorageFileID - ) -> Tuple[str, str]: + ) -> tuple[str, str]: link = await get_file_download_url( app=app, location_id=location_id, @@ -432,8 +422,8 @@ async def _get_mapped_link( async def download_files_and_get_checksums( - app: aiohttp.web.Application, download_links: Dict[str, str] -) -> Dict[str, str]: + app: aiohttp.web.Application, download_links: dict[str, str] +) -> dict[str, str]: with tempfile.TemporaryDirectory() as store_dir: download_paths = {} parallel_downloader = ParallelDownloader() @@ -456,10 +446,10 @@ async def download_files_and_get_checksums( async def get_checksums_for_files_in_storage( app: aiohttp.web.Application, - project: Dict[str, Any], - normalized_project: Dict[str, Any], + project: dict[str, Any], + normalized_project: dict[str, Any], user_id: str, -) -> Dict[str, str]: +) -> dict[str, str]: original_files = extract_original_files_for_node_sequence( project=project, normalized_project=normalized_project ) @@ -531,7 +521,7 @@ async def test_import_export_import_duplicate( assert url_export == URL(API_PREFIX + f"/projects/{imported_project_uuid}:xport") async with await client.post( - f"{url_export}", headers=headers, timeout=100000 + f"{url_export}", headers=headers, timeout=10 ) as export_response: assert export_response.status == 200, await export_response.text()
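
The new unit tests in services/storage/tests/unit/test_s3_client.py all follow the same lifecycle: build Settings from the environment, open an AsyncExitStack, create a StorageS3Client bound to STORAGE_S3, and drive the underlying aioboto3 client through it. The sketch below (not part of the diff) illustrates that lifecycle standalone, assuming STORAGE_S3 can be resolved from environment variables as the test fixtures assume (e.g. against a moto or minio endpoint); the bucket name "demo-bucket" is illustrative only.

    # minimal sketch, assuming STORAGE_S3 is configured via environment variables
    import asyncio
    from contextlib import AsyncExitStack

    from simcore_service_storage.s3_client import StorageS3Client
    from simcore_service_storage.settings import Settings


    async def main() -> None:
        settings = Settings.create_from_envs()
        assert settings.STORAGE_S3  # sketch assumes S3 is enabled, not "null"
        async with AsyncExitStack() as exit_stack:
            # the exit stack owns the aioboto3 client and closes it on exit
            s3_client = await StorageS3Client.create(exit_stack, settings.STORAGE_S3)
            # create_bucket is idempotent, as asserted in test_create_bucket
            await s3_client.create_bucket("demo-bucket")  # illustrative bucket name
            response = await s3_client.client.list_buckets()
            print([bucket.get("Name") for bucket in response["Buckets"]])


    if __name__ == "__main__":
        asyncio.run(main())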