Commit cc11650

fixes

jim-sheldon committed May 18, 2022
1 parent 4a0e9c9 commit cc11650

Showing 8 changed files with 17 additions and 65 deletions.
4 changes: 2 additions & 2 deletions data-serving/scripts/prune-uploads/hooks/aggregate.py

@@ -8,8 +8,8 @@
 
 AWS_REGION = os.getenv("GDH_AGGREGATE_AWS_REGION", "eu-central-1")
 # Job definition names are of the form PREFIX-<env>
-PREFIX = "gdh-map-aggregation"
-JOB_QUEUE = "gdh-map-aggregation"
+PREFIX = os.getenv("JOB_DEF_PREFIX", "gdh-map-aggregation")
+JOB_QUEUE = os.getenv("AGG_JOB_QUEUE", "gdh-map-aggregation-fargate")
 
 
 def run(sources: list[dict[str, Any]], env: str, dry_run: bool = False):
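
The comment above notes that job definition names follow PREFIX-<env>. A minimal sketch of how the now-configurable names come together when submitting a job — submit_aggregation is a hypothetical helper, but the boto3 Batch call mirrors the one this codebase uses:

import os

import boto3

AWS_REGION = os.getenv("GDH_AGGREGATE_AWS_REGION", "eu-central-1")
PREFIX = os.getenv("JOB_DEF_PREFIX", "gdh-map-aggregation")
JOB_QUEUE = os.getenv("AGG_JOB_QUEUE", "gdh-map-aggregation-fargate")


def submit_aggregation(env: str, dry_run: bool = False) -> None:
    # Job definition names are of the form PREFIX-<env>,
    # e.g. gdh-map-aggregation-prod when env is "prod".
    jobdef = f"{PREFIX}-{env}"
    if dry_run:
        return
    batch = boto3.client("batch", region_name=AWS_REGION)
    batch.submit_job(jobName=jobdef, jobDefinition=jobdef, jobQueue=JOB_QUEUE)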
6 changes: 5 additions & 1 deletion data-serving/scripts/prune-uploads/hooks/country_export.py

@@ -3,13 +3,17 @@
 
 from functools import cache
 import logging
+import os
 from typing import Any
 import unicodedata
 
 import boto3
 import pycountry
 
 
+JOB_QUEUE = os.getenv("EXP_JOB_QUEUE", "export-queue")
+
+
 # We do not always use the pycountry names, here's a list of exceptions
 _QUIRKS = {
     "DEMOCRATIC REPUBLIC OF THE CONGO": "CD",
@@ -84,7 +88,7 @@ def run(sources: list[dict[str, Any]], env: str, dry_run: bool = False):
         logging.info(f"Submitting job for {jobdef} ...")
         if not dry_run:
             batch.submit_job(
-                jobName=jobdef, jobDefinition=jobdef, jobQueue="export-queue"
+                jobName=jobdef, jobDefinition=jobdef, jobQueue=JOB_QUEUE
             )
     except Exception as e:
         logging.exception(f"Error occurred while trying to submit {jobdef}")
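
Since JOB_QUEUE is resolved once at module import, EXP_JOB_QUEUE has to be set in the environment before the hook is loaded. A sketch of overriding the queue for a test run — the queue name and import path here are assumptions for illustration, not names from this repository:

import os

# Set the override before the hook module is imported;
# setting it afterwards has no effect on JOB_QUEUE.
os.environ["EXP_JOB_QUEUE"] = "export-queue-dev"  # hypothetical queue name

from hooks import country_export  # JOB_QUEUE now reads "export-queue-dev"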
58 changes: 6 additions & 52 deletions docs/data_landscape.md

@@ -6,68 +6,22 @@ What we have, and where it's stored. This is organised by somewhat-physical, som
 
 MongoDB Atlas stores line list case data (including revision history), user records, data ingestion source records including ingestion histories, maps of Mapbox administrative area codes to names and front-end session tokens for the line list portal. There are two projects:
 
-1. Covid19Map-Dev has one cluster, cluster-0, which is hosted in AWS us-east-1. It holds development data which is mostly based on historical snapshots of production.
-2. Covid19Map-Prod has one cluster, covid19-map-cluster01, also hosted in AWS us-east-1. It holds production data.
+1. Covid19Map-Dev has one cluster, cluster-0, which is hosted in AWS eu-central-1. It holds development data which is mostly based on historical snapshots of production.
+2. Covid19Map-Prod has one cluster, covid19-map-cluster01, also hosted in AWS eu-central-1. It holds production data.
 
 ## S3 stores
 
-Various buckets (data containers) are used for both temporary and long-term storage of G.h data. Unless otherwise noted, all S3 buckets are in eu-central-1.
-
-### Unknown use
-
-* config-bucket-612888738066 (contains logs relating to secrets management of the AWS Lambda infrastructure. This doesn't only relate to the old ADI implementation, so check whether this is still needed. In us-east-2)
-* dev-vocviz-sample (old map code, probably not required, in us-east-2)
-* ncov19 (us-east-1)
-
-### Aggregates
-
-Aggregated data from the line list used by the map visualisation.
-
-* covid-19-aggregates
-* covid-19-aggregates-dev
-
-### Export
-
-Country specific (country-) and full (data-) export files in various formats
-
-* covid-19-country-export
-* covid-19-country-export-dev
-* covid-19-data-export
-* covid-19-data-export-dev
-
-### Map
-
-Map is a static site exported to an S3 bucket
-
-* dev-covid-19.global.health (only one of dev/dev-map is used, in us-east-2)
-* dev-map.covid-19.global.health
-* map.covid-19.global.health
-* dev-react-map.covid-19.global.health (us-east-2, should move to dev-map)
-* react-map.covid-19.global.health (should move to map.covid-19.global.health)
-* qa-covid-19.global.health
-
-### Ingestion
-
-* gdh-credentials (used to authenticate against backend, should move to API keys)
-* gdh-sources (raw files downloaded from source URLs, was epid-ingestion-raw)
-
-### Miscellaneous
-
-* gdh-terraform-state-main (terraform state for our stack)
-* gdh-metrics (telemetry on UI and Map)
-* h1n1.global.health (us-east-2, H1N1 map)
+Various buckets (data containers) are used for both temporary and long-term storage of G.h data. All S3 buckets except the one storing terraform state are in eu-central-1.
 
 ## Application logs
 
-All of the "backend" components log to CloudWatch log streams in us-east-1 with no automatic rotation or expiration.
+All of the "backend" components log to CloudWatch log streams in eu-central-1 with no automatic rotation or expiration.
 
 ## Computing servers
 
-The kubernetes cluster (i.e. the backend services for the line list app) runs on four EC2 instances in us-east-1. No application data is stored here.
-
-Ingestion and export both run on AWS Batch "serverless" architecture, both in us-east-1. No application data is stored here.
+The kubernetes cluster (i.e. the backend services for the line list app) runs on Fargate in eu-central-1. No application data is stored here.
 
-Data export has until recently run on AWS Lambda, again no application data is stored here. This is on its way out but mentioned for completeness.
+Ingestion and export both run on AWS Batch in eu-central-1. No application data is stored here.
 
 ## Anything else?
4 changes: 2 additions & 2 deletions ingestion/monitoring/completeness.py

@@ -114,8 +114,8 @@ def setup_logger():
 setup_logger()
 endpoint_url = os.getenv("ENDPOINT_URL")
 objects = data_files(
-    os.getenv("COUNTRY_EXPORT_BUCKET", "covid-19-country-export"),
+    os.getenv("COUNTRY_EXPORT_BUCKET", "covid-19-country-export-eu"),
     endpoint_url=endpoint_url,
 )
 data = completeness_s3_many(objects, endpoint_url)
-upload(data, os.getenv("METRICS_BUCKET", "covid-19-aggregates"), endpoint_url)
+upload(data, os.getenv("METRICS_BUCKET", "covid-19-aggregates-eu"), endpoint_url)
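
The monitoring scripts in this commit share one pattern: bucket names default to the new -eu buckets but can be overridden through the environment, and ENDPOINT_URL can point boto3 at a local S3-compatible store for testing. A rough sketch of that pattern (the listing loop is illustrative, not the script's actual logic):

import os

import boto3

endpoint_url = os.getenv("ENDPOINT_URL")  # e.g. a local S3-compatible endpoint
bucket = os.getenv("COUNTRY_EXPORT_BUCKET", "covid-19-country-export-eu")

s3 = boto3.client("s3", endpoint_url=endpoint_url)
paginator = s3.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket=bucket):
    for obj in page.get("Contents", []):
        print(obj["Key"], obj["Size"])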
2 changes: 1 addition & 1 deletion ingestion/monitoring/daily_metrics.py

@@ -11,7 +11,7 @@
 import boto3
 
 
-BUCKET = "covid-19-aggregates"
+BUCKET = "covid-19-aggregates-eu"
 WEBHOOK_URL = os.environ.get("SLACK_WEBHOOK_METRICS_URL", None)
 
 logger = logging.getLogger(__name__)
2 changes: 1 addition & 1 deletion ingestion/monitoring/freshness.py

@@ -91,7 +91,7 @@ def setup_logger():
 setup_logger()
 if not (api_key := os.getenv("GDH_API_KEY")):
     raise ValueError("Set GDH_API_KEY to your Global.health API key")
-bucket = os.getenv("BUCKET", "covid-19-aggregates")
+bucket = os.getenv("BUCKET", "covid-19-aggregates-eu")
 s3_endpoint = os.getenv("S3_ENDPOINT")
 instance = os.getenv("GDH_URL", DEFAULT_INSTANCE)
 if sources := fetch_sources(api_key, instance):
@@ -54,7 +54,6 @@ describe('Bulk upload form', function () {
 
         // Case data
         cy.contains('www.bulksource.com');
-        cy.contains('sourceEntryId');
         cy.contains('superuser@test.com');
         cy.contains('Data upload IDs')
             .parent()
@@ -200,10 +200,6 @@ describe('Curator', function () {
                 'www.example.com',
             );
         });
-        cy.get('input[name="caseReference.sourceEntryId"]').should(
-            'have.value',
-            'testSourceEntryID123',
-        );
 
         // Demographics.
         cy.get('input[name="gender"]').should('have.value', 'Female');
@@ -308,7 +304,6 @@ describe('Curator', function () {
         cy.contains('td', 'www.example.com').click({ force: true });
         // Case data.
         cy.contains('www.example.com');
-        cy.contains('testSourceEntryID123');
         cy.contains('superuser@test.com');
         cy.contains('VERIFIED');
         // Demographics.
