diff --git a/lambdas/service/app.py b/lambdas/service/app.py index 405652c806..fea36acd82 100644 --- a/lambdas/service/app.py +++ b/lambdas/service/app.py @@ -232,7 +232,7 @@ # changes and reset the minor version to zero. Otherwise, increment only # the minor version for backwards compatible changes. A backwards # compatible change is one that does not require updates to clients. - 'version': '9.1' + 'version': '9.2' }, 'tags': [ { diff --git a/lambdas/service/openapi.json b/lambdas/service/openapi.json index 2e75a0f1da..54e1f3043d 100644 --- a/lambdas/service/openapi.json +++ b/lambdas/service/openapi.json @@ -3,7 +3,7 @@ "info": { "title": "azul_service", "description": "\n# Overview\n\nAzul is a REST web service for querying metadata associated with\nboth experimental and analysis data from a data repository. In order\nto deliver response times that make it suitable for interactive use\ncases, the set of metadata properties that it exposes for sorting,\nfiltering, and aggregation is limited. Azul provides a uniform view\nof the metadata over a range of diverse schemas, effectively\nshielding clients from changes in the schemas as they occur over\ntime. It does so, however, at the expense of detail in the set of\nmetadata properties it exposes and in the accuracy with which it\naggregates them.\n\nAzul denormalizes and aggregates metadata into several different\nindices for selected entity types. Metadata entities can be queried\nusing the [Index](#operations-tag-Index) endpoints.\n\nA set of indices forms a catalog. There is a default catalog called\n`dcp2` which will be used unless a\ndifferent catalog name is specified using the `catalog` query\nparameter. Metadata from different catalogs is completely\nindependent: a response obtained by querying one catalog does not\nnecessarily correlate to a response obtained by querying another\none. Two catalogs can contain metadata from the same sources or\ndifferent sources. 
It is only guaranteed that the body of a\nresponse by any given endpoint adheres to one schema,\nindependently of which catalog was specified in the request.\n\nAzul provides the ability to download data and metadata via the\n[Manifests](#operations-tag-Manifests) endpoints. The\n`curl` format manifests can be used to\ndownload data files. Other formats provide various views of the\nmetadata. Manifests can be generated for a selection of files using\nfilters. These filters are interchangeable with the filters used by\nthe [Index](#operations-tag-Index) endpoints.\n\nAzul also provides a [summary](#operations-Index-get_index_summary)\nview of indexed data.\n\n## Data model\n\nAny index, when queried, returns a JSON array of hits. Each hit\nrepresents a metadata entity. Nested in each hit is a summary of the\nproperties of entities associated with the hit. An entity is\nassociated either by a direct edge in the original metadata graph,\nor indirectly as a series of edges. The nested properties are\ngrouped by the type of the associated entity. The properties of all\ndata files associated with a particular sample, for example, are\nlisted under `hits[*].files` in a `/index/samples` response. It is\nimportant to note that while each _hit_ represents a discrete\nentity, the properties nested within that hit are the result of an\naggregation over potentially many associated entities.\n\nTo illustrate this, consider a data file that is part of two\nprojects (a project is a group of related experiments, typically by\none laboratory, institution or consortium). 
Querying the `files`\nindex for this file yields a hit looking something like:\n\n```\n{\n \"projects\": [\n {\n \"projectTitle\": \"Project One\",\n \"laboratory\": ...,\n ...\n },\n {\n \"projectTitle\": \"Project Two\",\n \"laboratory\": ...,\n ...\n }\n ],\n \"files\": [\n {\n \"format\": \"pdf\",\n \"name\": \"Team description.pdf\",\n ...\n }\n ]\n}\n```\n\nThis example hit contains two kinds of nested entities (a hit in an\nactual Azul response will contain more): There are the two project\nentities, and the file itself. These nested entities contain\nselected metadata properties extracted in a consistent way. This\nmakes filtering and sorting simple.\n\nAlso notice that there is only one file. When querying a particular\nindex, the corresponding entity will always be a singleton like\nthis.\n", - "version": "9.1" + "version": "9.2" }, "tags": [ { @@ -1153,6 +1153,22 @@ ], "additionalProperties": false }, + "dataUseRestriction": { + "type": "object", + "properties": { + "is": { + "type": "array", + "items": { + "type": "string", + "nullable": true + } + } + }, + "required": [ + "is" + ], + "additionalProperties": false + }, "developmentStage": { "type": "object", "properties": { @@ -2196,7 +2212,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". 
For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. 
`{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, developmentStage, donorCount, donorDisease, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. 
For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" }, { 
"name": "entity_type", @@ -2245,6 +2261,7 @@ "cellLineType", "contactName", "contentDescription", + "dataUseRestriction", "developmentStage", "donorCount", "donorDisease", @@ -2624,6 +2641,22 @@ ], "additionalProperties": false }, + "dataUseRestriction": { + "type": "object", + "properties": { + "is": { + "type": "array", + "items": { + "type": "string", + "nullable": true + } + } + }, + "required": [ + "is" + ], + "additionalProperties": false + }, "developmentStage": { "type": "object", "properties": { @@ -3667,7 +3700,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. 
Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, developmentStage, donorCount, donorDisease, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. 
How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. 
`{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" }, { "name": "entity_type", @@ -3716,6 +3749,7 @@ "cellLineType", "contactName", "contentDescription", + "dataUseRestriction", "developmentStage", "donorCount", "donorDisease", @@ -4210,6 +4244,22 @@ ], "additionalProperties": false }, + "dataUseRestriction": { + "type": "object", + "properties": { + "is": { + "type": "array", + "items": { + "type": "string", + "nullable": true + } + } + }, + "required": [ + "is" + ], + "additionalProperties": false + }, "developmentStage": { "type": "object", "properties": { @@ -5253,7 +5303,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. 
An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. 
`{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, developmentStage, donorCount, donorDisease, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. 
For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" }, { 
"name": "entity_type", @@ -5302,6 +5352,7 @@ "cellLineType", "contactName", "contentDescription", + "dataUseRestriction", "developmentStage", "donorCount", "donorDisease", @@ -5801,6 +5852,22 @@ ], "additionalProperties": false }, + "dataUseRestriction": { + "type": "object", + "properties": { + "is": { + "type": "array", + "items": { + "type": "string", + "nullable": true + } + } + }, + "required": [ + "is" + ], + "additionalProperties": false + }, "developmentStage": { "type": "object", "properties": { @@ -6844,7 +6911,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. 
Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, developmentStage, donorCount, donorDisease, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. 
How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. 
`{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" } ] }, @@ -7194,6 +7261,22 @@ ], "additionalProperties": false }, + "dataUseRestriction": { + "type": "object", + "properties": { + "is": { + "type": "array", + "items": { + "type": "string", + "nullable": true + } + } + }, + "required": [ + "is" + ], + "additionalProperties": false + }, "developmentStage": { "type": "object", "properties": { @@ -8237,7 +8320,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. 
How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. 
`{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, developmentStage, donorCount, donorDisease, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. 
For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" } ] 
} @@ -8513,6 +8596,22 @@ ], "additionalProperties": false }, + "dataUseRestriction": { + "type": "object", + "properties": { + "is": { + "type": "array", + "items": { + "type": "string", + "nullable": true + } + } + }, + "required": [ + "is" + ], + "additionalProperties": false + }, "developmentStage": { "type": "object", "properties": { @@ -9556,7 +9655,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. 
Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, developmentStage, donorCount, donorDisease, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. 
How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. 
`{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" }, { "name": "format", @@ -9937,6 +10036,22 @@ ], "additionalProperties": false }, + "dataUseRestriction": { + "type": "object", + "properties": { + "is": { + "type": "array", + "items": { + "type": "string", + "nullable": true + } + } + }, + "required": [ + "is" + ], + "additionalProperties": false + }, "developmentStage": { "type": "object", "properties": { @@ -10980,7 +11095,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. 
How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. 
`{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, developmentStage, donorCount, donorDisease, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. 
For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" }, { 
"name": "format", diff --git a/scripts/post_deploy_tdr.py b/scripts/post_deploy_tdr.py index fe3565c10b..c3cd9a6097 100644 --- a/scripts/post_deploy_tdr.py +++ b/scripts/post_deploy_tdr.py @@ -24,7 +24,6 @@ TDRPlugin, ) from azul.terra import ( - SourceRef as TDRSourceRef, TDRClient, TDRSourceSpec, ) @@ -91,20 +90,9 @@ def verify_source(self, catalog: CatalogName, source_spec: TDRSourceSpec ) -> None: - source = self.tdr.lookup_source(source_spec) - log.info('TDR client is authorized for API access to %s.', source_spec) - require(source.project == source_spec.subdomain, - 'Actual Google project of TDR source differs from configured one', - source.project, source_spec.subdomain) - # Uppercase is standard for multi-regions in the documentation but TDR - # returns 'us' in lowercase - require(source.location.lower() == config.tdr_source_location.lower(), - 'Actual storage location of TDR source differs from configured one', - source.location, config.tdr_source_location) - # FIXME: Eliminate azul.terra.TDRClient.TDRSource - # https://github.com/DataBiosphere/azul/issues/5524 - ref = TDRSourceRef(id=source.id, spec=source_spec) plugin = self.repository_plugin(catalog) + ref = plugin.resolve_source(str(source_spec)) + log.info('TDR client is authorized for API access to %s.', source_spec) subgraph_count = sum(plugin.list_partitions(ref).values()) require(subgraph_count > 0, 'Source spec is empty (bad prefix?)', source_spec) diff --git a/scripts/update_subgraph_counts.py b/scripts/update_subgraph_counts.py index 2b28d54c6e..9bcd733a51 100644 --- a/scripts/update_subgraph_counts.py +++ b/scripts/update_subgraph_counts.py @@ -45,8 +45,8 @@ RepositoryPlugin, ) from azul.terra import ( - SourceRef as TDRSourceRef, TDRClient, + TDRSourceRef, ) environment = load_module(module_name='environment', diff --git a/src/azul/plugins/metadata/hca/__init__.py b/src/azul/plugins/metadata/hca/__init__.py index 2edcd38ba0..57e5500bdd 100644 --- a/src/azul/plugins/metadata/hca/__init__.py 
+++ b/src/azul/plugins/metadata/hca/__init__.py @@ -228,7 +228,8 @@ def _field_mapping(self) -> MetadataPlugin._FieldMapping: 'estimated_cell_count': 'projectEstimatedCellCount', 'is_tissue_atlas_project': 'isTissueAtlasProject', 'tissue_atlas': 'tissueAtlas', - 'bionetwork_name': 'bionetworkName' + 'bionetwork_name': 'bionetworkName', + 'data_use_restriction': 'dataUseRestriction' }, 'sequencing_protocols': { 'instrument_manufacturer_model': 'instrumentManufacturerModel', @@ -332,7 +333,8 @@ def facets(self) -> Sequence[str]: 'publicationTitle', 'isTissueAtlasProject', 'tissueAtlas', - 'bionetworkName' + 'bionetworkName', + 'dataUseRestriction' ] @property diff --git a/src/azul/plugins/metadata/hca/indexer/transform.py b/src/azul/plugins/metadata/hca/indexer/transform.py index 7c833b742a..04df78c8a1 100644 --- a/src/azul/plugins/metadata/hca/indexer/transform.py +++ b/src/azul/plugins/metadata/hca/indexer/transform.py @@ -686,7 +686,8 @@ def _project_types(cls) -> FieldTypes: 'is_tissue_atlas_project': null_bool, 'tissue_atlas': [tissue_atlas], 'bionetwork_name': [null_str], - 'estimated_cell_count': null_int + 'estimated_cell_count': null_int, + 'data_use_restriction': null_str } def _project(self, project: api.Project) -> MutableJSON: @@ -733,7 +734,8 @@ def _project(self, project: api.Project) -> MutableJSON: for bionetwork in project.bionetworks), 'tissue_atlas': list(map(self._tissue_atlas, project.bionetworks)), 'bionetwork_name': sorted(bionetwork.name for bionetwork in project.bionetworks), - 'estimated_cell_count': project.estimated_cell_count + 'estimated_cell_count': project.estimated_cell_count, + 'data_use_restriction': project.data_use_restriction } @classmethod diff --git a/src/azul/plugins/metadata/hca/service/response.py b/src/azul/plugins/metadata/hca/service/response.py index 624fde4e72..bbee7aabb3 100644 --- a/src/azul/plugins/metadata/hca/service/response.py +++ b/src/azul/plugins/metadata/hca/service/response.py @@ -340,7 +340,8 @@ def 
make_projects(self, entry) -> MutableJSONs: 'estimatedCellCount': project['estimated_cell_count'], 'isTissueAtlasProject': project['is_tissue_atlas_project'], 'tissueAtlas': project.get('tissue_atlas'), - 'bionetworkName': project['bionetwork_name'] + 'bionetworkName': project['bionetwork_name'], + 'dataUseRestriction': project.get('data_use_restriction') } if self.entity_type == 'projects': translated_project['projectDescription'] = project.get('project_description', []) diff --git a/src/azul/plugins/repository/tdr.py b/src/azul/plugins/repository/tdr.py index 2db5ca53d4..92e0025e4b 100644 --- a/src/azul/plugins/repository/tdr.py +++ b/src/azul/plugins/repository/tdr.py @@ -53,8 +53,8 @@ longest_common_prefix, ) from azul.terra import ( - SourceRef as TDRSourceRef, TDRClient, + TDRSourceRef, TDRSourceSpec, ) from azul.time import ( @@ -193,7 +193,7 @@ def _drs_client(cls, return cls._user_authenticated_tdr(authentication).drs_client() def _lookup_source_id(self, spec: TDRSourceSpec) -> str: - return self.tdr.lookup_source(spec).id + return self.tdr.lookup_source(spec) def list_bundles(self, source: TDRSourceRef, diff --git a/src/azul/plugins/repository/tdr_anvil/__init__.py b/src/azul/plugins/repository/tdr_anvil/__init__.py index e0ea1e3c6a..d5873ea5d9 100644 --- a/src/azul/plugins/repository/tdr_anvil/__init__.py +++ b/src/azul/plugins/repository/tdr_anvil/__init__.py @@ -53,9 +53,9 @@ TDRBundle, TDRBundleFQID, TDRPlugin, - TDRSourceRef, ) from azul.terra import ( + TDRSourceRef, TDRSourceSpec, ) from azul.types import ( diff --git a/src/azul/plugins/repository/tdr_hca/__init__.py b/src/azul/plugins/repository/tdr_hca/__init__.py index f06b6526f3..b48b65b247 100644 --- a/src/azul/plugins/repository/tdr_hca/__init__.py +++ b/src/azul/plugins/repository/tdr_hca/__init__.py @@ -60,7 +60,7 @@ TDRPlugin, ) from azul.terra import ( - SourceRef as TDRSourceRef, + TDRSourceRef, TDRSourceSpec, ) from azul.types import ( diff --git a/src/azul/terra.py b/src/azul/terra.py 
index a1080330d7..bbe69397e7 100644 --- a/src/azul/terra.py +++ b/src/azul/terra.py @@ -211,7 +211,7 @@ def contains(self, other: 'SourceSpec') -> bool: ) -class SourceRef(BaseSourceRef[TDRSourceSpec, 'TDRSourceRef']): +class TDRSourceRef(BaseSourceRef[TDRSourceSpec, 'TDRSourceRef']): pass @@ -400,28 +400,32 @@ class TDRClient(SAMClient): A client for the Broad Institute's Terra Data Repository aka "Jade". """ - # FIXME: Eliminate azul.terra.TDRClient.TDRSource - # https://github.com/DataBiosphere/azul/issues/5524 - @attrs.frozen(kw_only=True) - class TDRSource: - project: str - id: str - location: str - @cache - def lookup_source(self, source_spec: TDRSourceSpec) -> TDRSource: + def lookup_source(self, source_spec: TDRSourceSpec) -> str: + """ + Validate that the repository's reported values for the snapshot's Google + project name and storage location match our expectations, and return the + snapshot's UUID. + """ source = self._lookup_source(source_spec) + actual_project = source['dataProject'] + require(actual_project == source_spec.subdomain, + 'Actual Google project of TDR source differs from configured one', + actual_project, source_spec.subdomain) storage = one( - storage - for dataset in (s['dataset'] for s in source['source']) - for storage in dataset['storage'] - if storage['cloudResource'] == 'bigquery' + resource + for resource in source['storage'] + if resource['cloudResource'] == 'bigquery' ) - return self.TDRSource(project=source['dataProject'], - id=source['id'], - location=storage['region']) - - def _retrieve_source(self, source: SourceRef) -> MutableJSON: + actual_location = storage['region'] + # Uppercase is standard for multi-regions in the documentation but TDR + # returns 'us' in lowercase + require(actual_location.lower() == config.tdr_source_location.lower(), + 'Actual storage location of TDR source differs from configured one', + actual_location, config.tdr_source_location) + return source['id'] + + def _retrieve_source(self, source: 
TDRSourceRef) -> MutableJSON: endpoint = self._repository_endpoint('snapshots', source.id) response = self._request('GET', endpoint) response = self._check_response(endpoint, response) @@ -438,8 +442,7 @@ def _lookup_source(self, source: TDRSourceSpec) -> MutableJSON: if total == 0: raise self._insufficient_access(str(endpoint)) elif total == 1: - source_id = one(response['items'])['id'] - return self._retrieve_source(SourceRef(id=source_id, spec=source)) + return one(response['items']) else: raise TerraNameConflictException(endpoint, source.name, response) @@ -632,7 +635,7 @@ def for_registered_user(cls, authentication: OAuth2) -> 'TDRClient': def drs_client(self) -> DRSClient: return DRSClient(http_client=self._http_client) - def get_duos(self, source: SourceRef) -> Optional[MutableJSON]: + def get_duos(self, source: TDRSourceRef) -> Optional[MutableJSON]: response = self._retrieve_source(source) try: duos_id = response['duosFirecloudGroup']['duosId'] diff --git a/src/humancellatlas/data/metadata/api.py b/src/humancellatlas/data/metadata/api.py index 812b041564..26c650c250 100644 --- a/src/humancellatlas/data/metadata/api.py +++ b/src/humancellatlas/data/metadata/api.py @@ -292,6 +292,7 @@ class Project(Entity): supplementary_links: OrderedSet[str] estimated_cell_count: int | None bionetworks: OrderedSet[Bionetwork] + data_use_restriction: str | None def __init__(self, json: JSON) -> None: super().__init__(json) @@ -317,6 +318,7 @@ def __init__(self, json: JSON) -> None: self.bionetworks = OrderedSet(Bionetwork(**bionetwork) for bionetwork in content.get('hca_bionetworks', ()) if bionetwork) + self.data_use_restriction = content.get('data_use_restriction') def _accessions(self, namespace: str) -> set[str]: return {a.accession for a in self.accessions if a.namespace == namespace} diff --git a/test/indexer/data/80baee6e-00a5-4fdc-bfe3-d339ff8a7178.dss.hca.json b/test/indexer/data/80baee6e-00a5-4fdc-bfe3-d339ff8a7178.dss.hca.json index 2f6db1b09e..aebfefb10b 100644 
--- a/test/indexer/data/80baee6e-00a5-4fdc-bfe3-d339ff8a7178.dss.hca.json +++ b/test/indexer/data/80baee6e-00a5-4fdc-bfe3-d339ff8a7178.dss.hca.json @@ -386,7 +386,7 @@ } }, "project/2d846095-8a33-4f3c-97d4-585bafac13b4": { - "describedBy": "https://schema.humancellatlas.org/type/project/15.0.0/project", + "describedBy": "https://schema.humancellatlas.org/type/project/19.0.0/project", "schema_type": "project", "project_core": { "project_short_name": "InfiltratingNeoplasticCellsHumanGlioblastoma", @@ -396,6 +396,7 @@ "supplementary_links": [ "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE84nnn/GSE84465/suppl/GSE84465_GBM_All_data.csv.gz,ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE84nnn/GSE84465/suppl/GSE84465_RAW.tar" ], + "data_use_restriction": "NRES", "insdc_project_accessions": [ "SRP079058" ], diff --git a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json index febff19e83..b1db16dbda 100644 --- a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json +++ b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json @@ -323,7 +323,8 @@ "is_tissue_atlas_project": 0, "tissue_atlas": [], "estimated_cell_count": 9223372036854774784, - "estimated_cell_count_": null + "estimated_cell_count_": null, + "data_use_restriction": "~null" } ], "matrices": [], @@ -571,7 +572,8 @@ "is_tissue_atlas_project": 0, "tissue_atlas": [], "estimated_cell_count": 9223372036854774784, - "estimated_cell_count_": null + "estimated_cell_count_": null, + "data_use_restriction": "~null" } ], "dates": [ @@ -814,7 +816,8 @@ "is_tissue_atlas_project": 0, "tissue_atlas": [], "estimated_cell_count": 9223372036854774784, - "estimated_cell_count_": null + "estimated_cell_count_": null, + "data_use_restriction": "~null" } ], "dates": [ @@ -1082,7 +1085,8 @@ "is_tissue_atlas_project": 0, 
"tissue_atlas": [], "estimated_cell_count": 9223372036854774784, - "estimated_cell_count_": null + "estimated_cell_count_": null, + "data_use_restriction": "~null" } ], "dates": [ @@ -1350,7 +1354,8 @@ "is_tissue_atlas_project": 0, "tissue_atlas": [], "estimated_cell_count": 9223372036854774784, - "estimated_cell_count_": null + "estimated_cell_count_": null, + "data_use_restriction": "~null" } ], "matrices": [], @@ -1669,7 +1674,8 @@ "is_tissue_atlas_project": [0], "tissue_atlas": [], "estimated_cell_count": 9223372036854774784, - "estimated_cell_count_": null + "estimated_cell_count_": null, + "data_use_restriction": ["~null"] } ], "matrices": [], @@ -1973,7 +1979,8 @@ "is_tissue_atlas_project": [0], "tissue_atlas": [], "estimated_cell_count": 9223372036854774784, - "estimated_cell_count_": null + "estimated_cell_count_": null, + "data_use_restriction": ["~null"] } ], "dates": [ @@ -2275,7 +2282,8 @@ "is_tissue_atlas_project": [0], "tissue_atlas": [], "estimated_cell_count": 9223372036854774784, - "estimated_cell_count_": null + "estimated_cell_count_": null, + "data_use_restriction": ["~null"] } ], "dates": [ @@ -2537,7 +2545,8 @@ "is_tissue_atlas_project": [0], "tissue_atlas": [], "estimated_cell_count": 9223372036854774784, - "estimated_cell_count_": null + "estimated_cell_count_": null, + "data_use_restriction": ["~null"] } ], "dates": [ @@ -2862,7 +2871,8 @@ "is_tissue_atlas_project": 0, "tissue_atlas": [], "estimated_cell_count": 9223372036854774784, - "estimated_cell_count_": null + "estimated_cell_count_": null, + "data_use_restriction": "~null" } ], "matrices": [], @@ -3139,7 +3149,8 @@ "is_tissue_atlas_project": 0, "tissue_atlas": [], "estimated_cell_count": 9223372036854774784, - "estimated_cell_count_": null + "estimated_cell_count_": null, + "data_use_restriction": "~null" } ], "dates": [ @@ -3420,7 +3431,8 @@ "is_tissue_atlas_project": [0], "tissue_atlas": [], "estimated_cell_count": 9223372036854774784, - "estimated_cell_count_": null + 
"estimated_cell_count_": null, + "data_use_restriction": ["~null"] } ], "dates": [ diff --git a/test/integration_test.py b/test/integration_test.py index 9ab4658126..69ed468561 100644 --- a/test/integration_test.py +++ b/test/integration_test.py @@ -147,9 +147,6 @@ from azul.plugins.metadata.anvil.bundle import ( Link, ) -from azul.plugins.repository.tdr import ( - TDRSourceRef, -) from azul.plugins.repository.tdr_anvil import ( BundleEntityType, TDRAnvilBundleFQID, @@ -171,6 +168,7 @@ from azul.terra import ( ServiceAccountCredentialsProvider, TDRClient, + TDRSourceRef, TDRSourceSpec, UserCredentialsProvider, ) diff --git a/test/service/data/pfb_manifest.results.json b/test/service/data/pfb_manifest.results.json index 1e087119f1..c17ddc5103 100644 --- a/test/service/data/pfb_manifest.results.json +++ b/test/service/data/pfb_manifest.results.json @@ -505,6 +505,9 @@ null ], "contributors": [], + "data_use_restriction": [ + null + ], "document_id": [ "6615efae-fca8-4dd2-a223-9cfcf30fe94d" ], @@ -848,6 +851,9 @@ null ], "contributors": [], + "data_use_restriction": [ + null + ], "document_id": [ "e8642221-4c2c-4fd7-b926-a68bce363c88" ], @@ -3282,6 +3288,9 @@ null ], "contributors": [], + "data_use_restriction": [ + null + ], "document_id": [ "90bf705c-d891-5ce2-aa54-094488b445c6" ], diff --git a/test/service/data/pfb_manifest.schema.json b/test/service/data/pfb_manifest.schema.json index 5c0181c6c7..513481e72a 100644 --- a/test/service/data/pfb_manifest.schema.json +++ b/test/service/data/pfb_manifest.schema.json @@ -1976,6 +1976,14 @@ "null", "long" ] + }, + { + "name": "data_use_restriction", + "namespace": "projects", + "type": { + "items": ["null", "string"], + "type": "array" + } } ], "name": "projects", diff --git a/test/service/test_index_projects.py b/test/service/test_index_projects.py index 1f40b5eb24..3a44d10f2c 100644 --- a/test/service/test_index_projects.py +++ b/test/service/test_index_projects.py @@ -84,7 +84,8 @@ def assert_file_type_summaries(hit): 
'tissueAtlas', 'isTissueAtlasProject', 'bionetworkName', - 'estimatedCellCount' + 'estimatedCellCount', + 'dataUseRestriction' } response_json = get_response_json() self.assertIn('hits', response_json) diff --git a/test/service/test_response.py b/test/service/test_response.py index a7b236500d..2274c050cd 100644 --- a/test/service/test_response.py +++ b/test/service/test_response.py @@ -73,13 +73,11 @@ from azul.plugins.metadata.hca.service.response import ( HCASearchResponseStage, ) -from azul.plugins.repository.tdr import ( - TDRSourceRef, -) from azul.service.elasticsearch_service import ( ResponsePagination, ) from azul.terra import ( + TDRSourceRef, TDRSourceSpec, ) from azul.types import ( @@ -259,6 +257,7 @@ def test_response_stage_files(self): "isTissueAtlasProject": [False], "tissueAtlas": [], "estimatedCellCount": None, + "dataUseRestriction": [None], } ], "protocols": [ @@ -590,7 +589,8 @@ def test_response_stage_projects(self): "bionetworkName": [None], "tissueAtlas": [], "isTissueAtlasProject": False, - "accessions": [] + "accessions": [], + "dataUseRestriction": None, } ], "protocols": [ @@ -854,6 +854,7 @@ def test_response_stage_projects_accessions(self): {"namespace": "insdc_project", "accession": "SRP000001"}, {"namespace": "insdc_study", "accession": "PRJNA000000"}, ], + "dataUseRestriction": None, } ], "protocols": [ @@ -3506,6 +3507,28 @@ def test_projects_response(self): } self.assertEqual({None: 2, 'Lung': 1, 'Retina': 1, 'Blood': 1}, terms) + def test_data_use_restriction(self): + field, value = 'dataUseRestriction', 'NRES' + params = { + 'catalog': self.catalog, + 'sort': field, + 'filters': json.dumps({field: {'is': [value]}}) + } + plugin = self.index_service.metadata_plugin(self.catalog) + for entity_type in plugin.exposed_indices: + url = self.base_url.set(path=('index', entity_type), args=params) + response = requests.get(url) + response.raise_for_status() + response = response.json() + facets = response['termFacets'] + terms = 
{term['term'] for term in facets[field]['terms']} + self.assertEqual({None, value}, terms) + hits = response['hits'] + self.assertGreater(len(hits), 0) + expected = value if entity_type == 'projects' else [value] + for hit in hits: + self.assertEqual(expected, one(hit['projects'])[field]) + class TestUnpopulatedIndexResponse(IndexResponseTestCase):