Skip to content
This repository has been archived by the owner on Oct 15, 2024. It is now read-only.

Adding Athena data catalogs and prepared statements #1254

Conversation

swhite-oreilly
Copy link
Contributor

This PR adds support for AWS Athena data catalogs and prepared statements.

Testing

Athena resources were created using the following script, with these resources added to the config file:
AthenaDataCatalog
AthenaNamedQuery
AthenaPreparedStatement
AthenaWorkGroup

# Random hex suffixes: 20 random bytes (40 hex characters) for the long one,
# 10 random bytes (20 hex characters) for the short one.
RANDOM_STRING="$(openssl rand -hex 20)"
# NOTE(review): SHORT_RANDOM_STRING is not referenced by any later command
# visible in this script.
SHORT_RANDOM_STRING="$(openssl rand -hex 10)"

# Name of the S3 bucket used for the table data and the query results
INPUT_BUCKET="input-bucket-${RANDOM_STRING}"

# Get AWS account ID.
# The ID is used later as the Athena data catalog's catalog-id and to build
# the execution role ARN, so a failed or empty lookup is reported on stderr
# instead of being echoed as a blank ID.
if AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query "Account" --output text) \
    && [ -n "$AWS_ACCOUNT_ID" ]; then
    echo "AWS Account ID: $AWS_ACCOUNT_ID"
else
    echo "Failed to determine AWS account ID; steps that depend on it will not work" >&2
fi

# Create input bucket.
# A create-bucket request without a LocationConstraint is only accepted for
# us-east-1; every other region has to be named explicitly. Resolve the region
# from the environment first, then from the CLI configuration file.
BUCKET_REGION="${AWS_REGION:-${AWS_DEFAULT_REGION:-$(aws configure get region 2>/dev/null)}}"
BUCKET_ARGS=(--bucket "$INPUT_BUCKET")
if [ -n "$BUCKET_REGION" ] && [ "$BUCKET_REGION" != "us-east-1" ]; then
    # Pass --region as well so the request goes to the same region that the
    # LocationConstraint names.
    BUCKET_ARGS+=(
        --region "$BUCKET_REGION"
        --create-bucket-configuration "LocationConstraint=$BUCKET_REGION"
    )
fi

# Only report success when the bucket was actually created.
if aws s3api create-bucket "${BUCKET_ARGS[@]}"; then
    echo "Input bucket created: s3://$INPUT_BUCKET"
else
    echo "Failed to create input bucket: s3://$INPUT_BUCKET" >&2
fi

# Create a KMS key and keep its ARN; only report success if the call worked.
if KMS_KEY_ARN=$(aws kms create-key --query 'KeyMetadata.Arn' --output text); then
    echo "KMS key created: $KMS_KEY_ARN"
else
    echo "Failed to create KMS key" >&2
fi

# Extract the key ID from the ARN: it is everything after the last '/'.
# Parameter expansion replaces the former `basename | cut -d '/' -f 2`, whose
# cut stage only worked because cut passes through lines without a delimiter.
KMS_KEY_ID="${KMS_KEY_ARN##*/}"
echo "KMS key ID: $KMS_KEY_ID"

# Set encryption settings for the Data Catalog: encrypt at rest with SSE-KMS
# using the key created above. The success message is printed only when the
# call succeeds; failures are reported on stderr.
if aws glue put-data-catalog-encryption-settings \
    --data-catalog-encryption-settings '{
        "EncryptionAtRest": {
            "CatalogEncryptionMode": "SSE-KMS",
            "SseAwsKmsKeyId": "'"$KMS_KEY_ID"'"
        }
    }'; then
    echo "Data Catalog encryption settings set"
else
    echo "Failed to set Data Catalog encryption settings" >&2
fi

# Create the Glue database my_database.
# The call is made unconditionally; if it fails (for example because the
# database already exists) that is reported on stderr instead of claiming
# the database was created.
if aws glue create-database --database-input '{"Name": "my_database"}'; then
    echo "Database created"
else
    echo "Database not created (it may already exist)" >&2
fi

# Create a data catalog table.
# Registers my_table in my_database with two data columns (column1, column2)
# and one partition key (partition_column), stored as comma-delimited text
# under the data/ prefix of the input bucket.
# $INPUT_BUCKET is spliced into the JSON inside double quotes so the expansion
# is never word-split or globbed by the shell.
if aws glue create-table --database-name my_database --table-input '{
    "Name": "my_table",
    "Description": "My Glue Data Catalog table",
    "PartitionKeys": [
        {
            "Name": "partition_column",
            "Type": "string"
        }
    ],
    "StorageDescriptor": {
        "Columns": [
            {
                "Name": "column1",
                "Type": "string"
            },
            {
                "Name": "column2",
                "Type": "int"
            }
        ],
        "Location": "s3://'"$INPUT_BUCKET"'/data/",
        "InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
        "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
        "Compressed": false,
        "SerdeInfo": {
            "SerializationLibrary": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
            "Parameters": {
                "field.delim": ","
            }
        }
    }
}'; then
    echo "Data catalog table created"
else
    echo "Failed to create data catalog table" >&2
fi

# Create an Athena data catalog of type GLUE that points at this account's
# Glue catalog. (This is a data catalog, not a classifier, so the comment and
# the messages now say so.) The parameter is quoted so the account ID is
# passed as a single word.
if aws athena create-data-catalog \
    --name "athena-data-catalog" \
    --type GLUE \
    --parameters "catalog-id=$AWS_ACCOUNT_ID"; then
    echo "Athena data catalog created"
else
    echo "Failed to create Athena data catalog" >&2
fi

# Create a named query in Athena against my_database; report success only
# when the call succeeds.
if aws athena create-named-query \
    --name "athena-test-query" \
    --database "my_database" \
    --query-string "SELECT * FROM my_table"; then
    echo "Named query created"
else
    echo "Failed to create named query" >&2
fi


# Create Athena_DefaultRole with a trust policy that lets the Athena service
# (athena.amazonaws.com) assume it.
# The call is made unconditionally; if it fails (for example because the role
# already exists) that is reported on stderr instead of claiming the role was
# created.
if aws iam create-role --role-name Athena_DefaultRole --assume-role-policy-document '{
    "Version": "2012-10-17",
    "Statement": [{
        "Effect": "Allow",
        "Principal": {
            "Service": "athena.amazonaws.com"
        },
        "Action": "sts:AssumeRole"
    }]
}'; then
    echo "Athena_DefaultRole created"
else
    echo "Athena_DefaultRole not created (it may already exist)" >&2
fi

# Attach an inline policy named AthenaFullAccessPolicy to Athena_DefaultRole,
# allowing every athena:* action on all resources. The success message is
# printed only when the call succeeds.
if aws iam put-role-policy \
    --role-name Athena_DefaultRole \
    --policy-name AthenaFullAccessPolicy \
    --policy-document '{
        "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Action": [
                    "athena:*"
                ],
                "Resource": "*"
            }
        ]
    }'; then
    echo "Custom policy attached to Athena_DefaultRole"
else
    echo "Failed to attach custom policy to Athena_DefaultRole" >&2
fi

# Build the ARN of Athena_DefaultRole from the account ID; it is passed to the
# workgroup as its execution role.
ATHENA_DEFAULT_ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/Athena_DefaultRole"
printf '%s\n' "Athena Default Role ARN: ${ATHENA_DEFAULT_ROLE_ARN}"

# Create an Athena workgroup that uses the PySpark engine, writes query
# results (SSE_S3 encrypted) under the input bucket's query-results/ prefix,
# and runs with Athena_DefaultRole as its execution role.
# $INPUT_BUCKET is spliced into the JSON inside double quotes, matching how
# the role ARN is spliced in, so the expansion is never word-split or globbed.
if aws athena create-work-group \
    --name "my_workgroup" \
    --configuration '{
        "ResultConfiguration": {
            "OutputLocation": "s3://'"$INPUT_BUCKET"'/query-results/",
            "EncryptionConfiguration": {
                "EncryptionOption": "SSE_S3"
            }
        },
        "EnforceWorkGroupConfiguration": true,
        "EngineVersion": {
            "SelectedEngineVersion": "PySpark engine version 3"
        },
        "ExecutionRole": "'"$ATHENA_DEFAULT_ROLE_ARN"'"
    }'; then
    echo "Athena workgroup created"
else
    echo "Failed to create Athena workgroup" >&2
fi


# Create an Athena notebook in my_workgroup; report success only when the
# call succeeds.
if aws athena create-notebook \
    --work-group "my_workgroup" \
    --name "my_athena_notebook"; then
    echo "Athena notebook created"
else
    echo "Failed to create Athena notebook" >&2
fi

# Create a prepared statement in the "primary" Athena workgroup.
# The query previously filtered on created_at, but my_table as defined earlier
# in this script only has column1, column2 and partition_column, so the
# statement could not have run against it. It now filters on the integer
# column2 and names the database explicitly, since a prepared statement has no
# --database option.
if aws athena create-prepared-statement \
    --statement-name "my_prepared_statement" \
    --work-group "primary" \
    --query-statement "SELECT * FROM my_database.my_table WHERE column2 > ? LIMIT 10"; then
    echo "Prepared statement created"
else
    echo "Failed to create prepared statement" >&2
fi

# Show what now exists. These three listings take no extra arguments:
# data catalogs, named queries and workgroups, in that order.
for athena_listing in list-data-catalogs list-named-queries list-work-groups; do
    aws athena "$athena_listing"
done

# Notebook metadata is listed per workgroup; the notebook lives in my_workgroup
aws athena list-notebook-metadata --work-group "my_workgroup"

# Prepared statements are listed per workgroup; the statement lives in primary
aws athena list-prepared-statements --work-group "primary"

This PR adds support for AWS Athena data catalogs and prepared statements.
@swhite-oreilly swhite-oreilly requested a review from a team as a code owner August 7, 2024 15:22
@ekristen
Copy link
Contributor

@swhite-oreilly this has been implemented via ekristen/aws-nuke#269 of what is now the active fork of aws-nuke.

This project has now been deprecated in favor of this fork. Sven kindly granted me access to directly answer and close pull requests and issues so that we can notify users if their issues have been addressed or not. Please see the welcome issue for more information.

@ekristen ekristen closed this Sep 12, 2024
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants