From 4c06cc1844eb99f790acae0d85081af79619cf1b Mon Sep 17 00:00:00 2001 From: Vishal Dharmadhikari Date: Thu, 16 Mar 2023 17:06:14 +0000 Subject: [PATCH 1/2] Add: content cleanup automation example --- .../README.md | 246 ++++++++++++++ .../main.py | 300 ++++++++++++++++++ .../requirements.txt | 5 + 3 files changed, 551 insertions(+) create mode 100644 examples/python/cloud-function-content-cleanup-automation/README.md create mode 100644 examples/python/cloud-function-content-cleanup-automation/main.py create mode 100644 examples/python/cloud-function-content-cleanup-automation/requirements.txt diff --git a/examples/python/cloud-function-content-cleanup-automation/README.md b/examples/python/cloud-function-content-cleanup-automation/README.md new file mode 100644 index 000000000..095629d62 --- /dev/null +++ b/examples/python/cloud-function-content-cleanup-automation/README.md @@ -0,0 +1,246 @@ +# Looker Content Cleanup Automation with Cloud Functions & Cloud Scheduler + +## Overview + +This repository contains a [Google Cloud Function](https://cloud.google.com/functions) that leverages the Looker Python SDK to automate Looker content cleanup. It can be triggered to run at your desired cadence using [Cloud Scheduler](https://cloud.google.com/scheduler). + +Implementing an automated content cleanup process will help your instance avoid content bloat and make users more productive when searching for content. [Content bloat is the effect on an organization when time is wasted finding the relevant content to answer a question or recreating content that already exists](https://sarahsnewsletter.substack.com/p/the-thrill-of-deprecating-dashboards). + +### Content Cleanup Process + +The cleanup process implemented by this script is as follows: + +1. Schedule content clean up automation to run every 90 days (default). +2. Dashboards and Looks not used in the past 90 days (default) are archived (soft deleted). 
Soft deleting a piece of content means moving it to the [Trash folder](https://cloud.google.com/looker/docs/admin-spaces#trash) which only admins have access to. + - Soft deleted content can be restored to its original folder from either the UI or with the API ([Appendix](#appendix)). +3. Permanently delete content (i.e. remove from Trash folder) that's been soft-deleted and goes unclaimed for another 90 days (default). Before permanently deleting dashboards, the dashboard LookML is saved to a [Google Cloud Storage](https://cloud.google.com/storage) bucket. + - Permanently deleted dashboards which were backed up before deletion can be restored using the [import_dashboard_from_lookml](https://developers.looker.com/api/explorer/4.0/methods/Dashboard/import_dashboard_from_lookml?sdk=py) method. + - ⚠️ **WARNING**: Permanently deleted content is lost forever. You cannot undo this action! + +Running the automation every 90 days allows the script to handle both soft-deleting and permanently deleting content at the same time. That said, the days are configurable within the script. + +**NOTE**: this automation only works for Looks and user-defined dashboards. The unused content email notification will contain LookML dashboards which can be deleted by removing their dashboard lkml file in their LookML project. + +### Automation Diagram + +![diagram](https://user-images.githubusercontent.com/61256217/224140284-466c5c0d-0432-47de-a1b3-7c599423b2c7.png) + +1. Trigger automation at desired interval +2. Run queries and update content +3. Backup dashboards before permanent deletion +4. Send unused content and deleted content email notification + +## Requirements + +- Looker instance in which you have Admin permissions. 
+- Google Cloud Project with the following APIs enabled: + - Artifact Registry API + - Cloud Run Admin API + - Cloud Build API + - Cloud Functions API + - Cloud Logging API + - Cloud Pub/Sub API + - Cloud Scheduler API + - Secret Manager API + - Cloud Storage API + +## How it works + +The script executes the following steps each time it is run: + +1. Get two query IDs which run a System Activity query to identify content unused in the past 90 days (default) and content deleted more than 90 days ago (default), respectively. +2. Run both queries to get data for unused content and deleted content. +3. Soft delete unused content. +4. Permanently delete content in Trash folder. + - Dashboards will be backed up to a GCS bucket before being deleted. Backups are not available for Looks. +5. Send two emails containing the soft deleted and permanently deleted content in CSV format. + - Delivery format can be updated on [line 185 of main.py](main.py#L185) to any of the [accepted formats](https://developers.looker.com/api/explorer/4.0/methods/ScheduledPlan/scheduled_plan_run_once). + +### Dry Run / Safe Mode + +The script is currently in dry run / safe mode to avoid accidental content deletions while setting up this automation. This means the delete operations inside the soft delete and hard delete functions in `main.py` are disabled (`soft_delete_dashboard`, `soft_delete_look`, `hard_delete_dashboard`, `hard_delete_look`). + +In dry run mode, the automation will run the queries, send the schedules, and backup dashboards that are to be hard deleted without actually deleting any content. + +### Required before running the script + +In `main.py` search `todo` to: + +- Update `GCP_PROJECT_ID` and `GCS_BUCKET_NAME` to enable backing up dashboards to GCS before permanent deletion. +- Update `DAYS_BEFORE_SOFT_DELETE` (# of days content is unused before archival) and `DAYS_BEFORE_HARD_DELETE` (# of days in trash before permanent deletion). 
+- Update `NOTIFICATION_EMAIL_ADDRESS` (email address for content deletion notification). +- Toggle dry run of automation off/on depending on whether you want content to be deleted. + +## Setup + +Before deploying to production, please abide by the principle of least privilege and modify the service account used for this automation so that it meets your company's security standards and has been approved for this use. + +The following steps assume deployment using the Google Cloud UI Console. + +1. Obtain a [Looker API3 Key](https://docs.looker.com/admin-options/settings/users#api3_keys). + +2. In `main.py` update: + + 1. `GCP_PROJECT_ID` on [line 28](main.py#L28) + 2. `DAYS_BEFORE_SOFT_DELETE` on [line 29](main.py#L29) + 3. `DAYS_BEFORE_HARD_DELETE` on [line 30](main.py#L30) + 4. `NOTIFICATION_EMAIL_ADDRESS` on [line 31](main.py#L31) + +3. Go to [Cloud Secret Manager](https://cloud.google.com/secret-manager) and enable the Secret Manager API. Create the following secrets: + + 1. `looker-base-url`: secret value is your Looker instance URL (e.g. `https://my_looker_instance.cloud.looker.com/`) + 2. `looker-client-id`: secret value is the Client ID generated in Step 1. + 3. `looker-client-secret`: secret value is the Client Secret generated in Step 1. + +4. Go to Cloud Storage and create a new bucket. + +5. Cloud Storage bucket suggested settings, modify as necessary: + + 1. **Name your bucket**: `looker-automation-dashboards-backup` + + - Update `GCS_BUCKET_NAME` with this value on [line 32 of main.py](main.py#L32). + - Select `Continue` + + 2. **Choose where to store your data** + + - **Location type**: `Region`, `us-west1 (Oregon)` (or preferred type & region) + - Select `Continue` + + 3. 
**Choose a storage class for your data** + + - `Set a default class` --> `Coldline` or `Archive` + - Select `Continue` + + 4. **Choose how to control access to objects** + + - **Prevent public access**: `Enabled` + - **Access control**: `Uniform` + - Select `Continue` + + 5. Select `Create` + +6. Go to Cloud Functions and create a new function. + +7. Cloud Functions function suggested settings, modify as necessary: + + 1. **Basics** + + - **Environment**: `2nd gen` + - **Function name**: `looker-content-cleanup-automation` + - **Region**: `us-west1` (or preferred region) + - **Authentication**: `Require authentication` + - **Require HTTPS**: `Enabled` + - Select `Save` + + 2. **Runtime, build, connections and security settings** + + - **Runtime** + + - **Memory allocated**: `512 MB` + - **Timeout**: `3600` + - **Runtime service account**: `App Engine default service account` + + - **Security and Image Repo** + + - **Reference a Secret**: reference the `looker-base-url` secret created in Step 3 and map it to the `LOOKERSDK_BASE_URL` environment variable. + - **Secret**: `looker-base-url` + - **Reference method**: `Exposed as environment variable` + - **Name 1**: `LOOKERSDK_BASE_URL` + - Select `Done` + - **Reference a Secret**: reference the `looker-client-id` secret created in Step 3 and map it to the `LOOKERSDK_CLIENT_ID` environment variable. + - **Secret**: `looker-client-id` + - **Reference method**: `Exposed as environment variable` + - **Name 1**: `LOOKERSDK_CLIENT_ID` + - Select `Done` + - **Reference a Secret**: reference the `looker-client-secret` secret created in Step 3 and map it to the `LOOKERSDK_CLIENT_SECRET` environment variable. + - **Secret**: `looker-client-secret` + - **Reference method**: `Exposed as environment variable` + - **Name 1**: `LOOKERSDK_CLIENT_SECRET` + - Select `Done` + + - Select `Next` + + 3. 
**Code** + + - **Runtime**: `Python 3.9` + - Copy and paste the contents of `main.py` in this repository into the `main.py` file once inside Cloud Function's inline editor. + - **Entry point**: `main` + - **NOTE**: review the `todo` items listed in the script prior to deploying, otherwise the automation won't work as intended. + - Copy and paste the contents of `requirements.txt` in this repository to the `requirements.txt` file once inside Cloud Function's inline editor. + + 4. Deploy the function. + +8. Go to Cloud IAM > IAM and grant the `App Engine default service account` (`@appspot.gserviceaccount.com`) principal: + + 1. `Secret Manager Secret Accessor` role to access the secrets created in Step 3. + 2. `Storage Object Creator` role to backup dashboards to the GCS bucket created in Step 5. + +9. Test the automation function in dry run mode (run queries, backup dashboards, and send schedules, without soft deleting or hard deleting any content). + + - Check out [this article](https://cloud.google.com/functions/docs/quickstart-python#test_the_function) for detailed instructions. + +10. Go to Cloud Scheduler and select `Schedule a job` (or `Create job`). + +11. Cloud Scheduler job suggested settings, modify as necessary: + + 1. **Define the schedule** + + - **Name**: `trigger-looker-content-cleanup-automation` + - **Region**: `us-west1 (Oregon)` (same region as Cloud Function) + - **Frequency**: `0 0 1 */3 *` (every 3 months or update to desired frequency of how often the automation should run) + - **Timezone**: Select desired timezone the scheduled job should use + - Select `Continue` + + 2. **Configure the execution** + + - **Target type**: `HTTP` + - **URL**: Trigger URL from function created in Step 7 + - **HTTP method**: `POST` + - **Auth header**: `Add OIDC token` + - **Service account**: `App Engine default service account` + + 3. Select `Create` + +12. 
Test the schedule (Actions > Force run) to confirm it triggers the `looker-content-cleanup-automation` function in dry run mode. + +13. After validating everything is working as expected, make the `todo` changes to `main.py` to toggle off dry run mode. + +## Appendix + +### Restore soft deleted content + +```python +import looker_sdk +from looker_sdk import models40 + +config_file = "looker.ini" +sdk = looker_sdk.init40(config_file) + +def restore_soft_delete_dashboard(dashboard_id): + dashboard = models40.WriteDashboard(deleted=False) + try: + sdk.update_dashboard(str(dashboard_id), body=dashboard) + print(f"Successfully restored dashboard {dashboard_id}") + except Exception as e: + print(f"Error: {e}") + +def restore_soft_delete_look(look_id): + look = models40.WriteLookWithQuery(deleted=False) + try: + sdk.update_look(str(look_id), body=look) + print(f"Successfully restored look {look_id}") + except Exception as e: + print(f"Error: {e}") + +# Provide a list of look_ids to restore +looks_to_restore = [] + +for look in looks_to_restore: + restore_soft_delete_look(look) + +# Provide a list of dashboard_ids to restore +dashboards_to_restore = [1] + +for dashboard in dashboards_to_restore: + restore_soft_delete_dashboard(dashboard) +``` diff --git a/examples/python/cloud-function-content-cleanup-automation/main.py b/examples/python/cloud-function-content-cleanup-automation/main.py new file mode 100644 index 000000000..321798eb6 --- /dev/null +++ b/examples/python/cloud-function-content-cleanup-automation/main.py @@ -0,0 +1,300 @@ +""" This Cloud Function leverages the Looker Python SDK to automate Looker content cleanup. + +It accomplishes the following tasks: +1. Get unused content and deleted content data from a Looker System Activity query in JSON. +2. Archive (soft delete) dashboards and Looks which were last accessed more than 90 days ago. +3. Permanently (hard) delete dashboards and Looks which have been archived for more than 90 days. +4. 
def main(request):
    """Cloud Function entry point: archive unused content, then purge old trash.

    The run has two phases, each followed by an email notification:

    1. Soft delete: dashboards and Looks returned by the unused-content
       System Activity query (threshold ``DAYS_BEFORE_SOFT_DELETE``) are
       soft deleted.
    2. Hard delete: dashboards and Looks returned by the deleted-content
       query (threshold ``DAYS_BEFORE_HARD_DELETE``) are permanently
       deleted; each dashboard's LookML is backed up to GCS first.

    Args:
        request: The triggering HTTP request. It is not read.

    Returns:
        A fixed confirmation string.
    """
    # Phase 1: find content unused for longer than the threshold, archive it,
    # then email the list (the schedule re-uses the same query ID).
    unused_content_query_id = get_unused_content_query_id(
        DAYS_BEFORE_SOFT_DELETE)
    unused_content = get_unused_content(unused_content_query_id)
    unused_dashboard_ids = get_dashboard_ids(unused_content)
    unused_look_ids = get_look_ids(unused_content)

    for dashboard_id, _ in unused_dashboard_ids:
        soft_delete_dashboard(dashboard_id)

    for look_id in unused_look_ids:
        soft_delete_look(look_id)

    send_content_notification(
        unused_content_query_id,
        "soft",
        NOTIFICATION_EMAIL_ADDRESS
    )

    # Phase 2: find content that has sat in the Trash longer than the
    # threshold, permanently delete it, then email the list.
    deleted_content_query_id = get_deleted_content_query_id(
        DAYS_BEFORE_HARD_DELETE)
    deleted_content = get_deleted_content(deleted_content_query_id)
    deleted_dashboard_ids = get_dashboard_ids(deleted_content)
    deleted_look_ids = get_look_ids(deleted_content)

    for dashboard_id, dashboard_title in deleted_dashboard_ids:
        # todo: comment out backup_dashboard_lookml to disable backing up dashboard LookML to GCS feature before hard deleting the dashboard.
        backup_dashboard_lookml(dashboard_id, dashboard_title)
        hard_delete_dashboard(dashboard_id)

    for look_id in deleted_look_ids:
        hard_delete_look(look_id)

    send_content_notification(
        deleted_content_query_id,
        "hard",
        NOTIFICATION_EMAIL_ADDRESS
    )

    return "Successfully ran soft delete and hard delete content automation."


def get_unused_content_query_id(days: int):
    """Create the System Activity query that lists unused content.

    The query selects dashboards and Looks whose
    ``content_usage.days_since_last_accessed`` exceeds ``days``, excluding
    Looks used on a dashboard, public Looks, and content that already has a
    deleted date.

    Args:
        days: Minimum number of days since the content was last accessed.

    Returns:
        The ID of the created query. The same ID is used both to run the
        query and to send its results as a schedule.
    """
    unused_content_query = models40.WriteQuery(
        model="system__activity",
        view="content_usage",
        fields=[
            "content_usage.content_title",
            "content_usage.content_type",
            "content_usage.last_accessed_date",
            "dashboard.id",
            "look.id"
        ],
        pivots=None,
        fill_fields=None,
        filters={
            "content_usage.days_since_last_accessed": f">{days}",
            "content_usage.content_type": "dashboard,look",
            "_dashboard_linked_looks.is_used_on_dashboard": "No",
            "look.public": "No"
        },
        # Keep only rows where neither the dashboard nor the Look has a
        # deleted date, i.e. content that is not already in the Trash.
        filter_expression="if(is_null(${dashboard.deleted_date}) = no OR is_null(${look.deleted_date}) = no,no,yes)",
        sorts=["content_usage.last_accessed_date"],
        limit="50000"
    )
    created_query = sdk.create_query(body=unused_content_query)
    return created_query.id


def get_unused_content(query_id: str):
    """Run the unused-content query and return its rows.

    Args:
        query_id: ID returned by ``get_unused_content_query_id``.

    Returns:
        The query's JSON result decoded with ``json.loads``.
    """
    unused_content = json.loads(sdk.run_query(
        query_id=query_id,
        result_format="json",
        cache=True
    ))

    return unused_content


def get_deleted_content_query_id(days: int):
    """Create the System Activity query that lists long-trashed content.

    The query selects dashboards and Looks that have a deleted date and whose
    custom dimension ``days_since_moved_to_trash`` exceeds ``days``.

    Args:
        days: Minimum number of days the content has been in the Trash.

    Returns:
        The ID of the created query. The same ID is used both to run the
        query and to send its results as a schedule.
    """
    # Custom dimension: days elapsed since the dashboard/Look was trashed.
    # It is serialized with json.dumps instead of a hand-written JSON string
    # so the payload is always well-formed and carries no stray whitespace.
    days_since_moved_to_trash = {
        "category": "dimension",
        "expression": "diff_days(coalesce(${dashboard.deleted_date},${look.deleted_date}), now())",
        "label": "Days Since Moved to Trash",
        "value_format": None,
        "value_format_name": None,
        "dimension": "days_since_moved_to_trash",
        "_kind_hint": "dimension",
        "_type_hint": "number",
    }
    deleted_query = models40.WriteQuery(
        model="system__activity",
        view="content_usage",
        fields=[
            "content_usage.content_title",
            "content_usage.content_type",
            "content_usage.last_accessed_date",
            "dashboard.deleted_date",
            "dashboard.id",
            "look.deleted_date",
            "look.id"
        ],
        dynamic_fields=json.dumps([days_since_moved_to_trash]),
        pivots=None,
        fill_fields=None,
        filters={
            "content_usage.content_type": "dashboard,look",
            "days_since_moved_to_trash": f">{days}"
        },
        # Keep only rows where the dashboard or the Look has a deleted date,
        # i.e. content that is currently in the Trash.
        filter_expression="if(is_null(${dashboard.deleted_date}) = no OR is_null(${look.deleted_date}) = no,yes,no)",
        sorts=["content_usage.last_accessed_date"],
        limit="50000"
    )
    trashed_content_query = sdk.create_query(
        body=deleted_query
    )
    return trashed_content_query.id


def get_deleted_content(query_id: str):
    """Run the deleted-content query and return its rows.

    Args:
        query_id: ID returned by ``get_deleted_content_query_id``.

    Returns:
        The query's JSON result decoded with ``json.loads``.
    """
    deleted_content = json.loads(sdk.run_query(
        query_id=query_id,
        result_format="json",
        cache=True
    ))

    return deleted_content


def send_content_notification(query_id: str, delete_type: str, address: str):
    """Email the results of a content query as a one-off Looker schedule.

    Args:
        query_id: ID of the query whose results are sent (CSV format).
        delete_type: Label interpolated into the subject and message; the
            callers in this file pass ``"soft"`` or ``"hard"``.
        address: Destination email address for the schedule.

    Returns:
        The value returned by ``sdk.scheduled_plan_run_once``, or ``None``
        when sending failed (the error is printed, not raised, so a failed
        notification does not abort the rest of the run).
    """
    created_date = datetime.today().strftime('%Y-%m-%d')

    # Adjacent string literals are used instead of a backslash continuation
    # inside one literal, which would embed the source indentation in the
    # email body.
    message = (
        f"List of dashboards and Looks that were {delete_type} deleted on {created_date}. "
        "Note, LookML dashboards are unaffected by this automation, the dashboard lkml file has to be deleted from its LookML project."
    )
    scheduled_plan_destination_body = models40.ScheduledPlanDestination(
        format="csv",
        type="email",
        address=address,
        message=message,
        apply_formatting=False,
        apply_vis=False
    )
    content_notification = models40.WriteScheduledPlan(
        name=f"[Looker Automation] {delete_type.capitalize()} deleted content ({created_date}).",
        query_id=query_id,
        scheduled_plan_destination=[
            scheduled_plan_destination_body
        ]
    )

    try:
        return sdk.scheduled_plan_run_once(body=content_notification)
    except Exception as e:
        print(
            f"Error sending {delete_type} delete email notification ({created_date}): {e}")
        return None


def get_dashboard_ids(content: list) -> list[tuple[str, str]]:
    """Extract ``(dashboard_id, title)`` pairs from query result rows.

    Rows are kept when ``content_usage.content_type`` is ``'dashboard'`` and
    ``dashboard.id`` is not ``None``.

    Args:
        content: Rows (dicts) as returned by the content queries.

    Returns:
        A list of ``(str(dashboard.id), content_title)`` tuples in row order.
    """
    return [
        (str(row['dashboard.id']), row['content_usage.content_title'])
        for row in content
        if row['content_usage.content_type'] == 'dashboard'
        and row['dashboard.id'] is not None
    ]


def get_look_ids(content: list) -> list[str]:
    """Extract Look IDs from query result rows.

    Rows are kept when ``content_usage.content_type`` is ``'look'`` and
    ``look.id`` is not ``None``. The ``None`` check mirrors
    ``get_dashboard_ids``; without it a null ID would be turned into the
    literal string ``"None"`` and passed to the update/delete calls.

    Args:
        content: Rows (dicts) as returned by the content queries.

    Returns:
        A list of ``str(look.id)`` values in row order.
    """
    return [
        str(row['look.id'])
        for row in content
        if row['content_usage.content_type'] == 'look'
        and row['look.id'] is not None
    ]


def soft_delete_dashboard(dashboard_id: str) -> None:
    """Soft delete the given dashboard via ``sdk.update_dashboard``.

    While safe mode is on, the update body carries ``deleted=False``, so the
    dashboard is not actually archived even though the success message is
    printed. Errors are printed, not raised.

    Args:
        dashboard_id: ID of the dashboard to update.
    """
    # todo: to toggle off safe mode and soft delete dashboards, comment out `deleted=False` line and uncomment `deleted=True` line
    dashboard = models40.WriteDashboard(deleted=False)
    # dashboard = models40.WriteDashboard(deleted=True)
    try:
        sdk.update_dashboard(dashboard_id, body=dashboard)
        print(f"Successfully soft deleted dashboard: {dashboard_id}")
    except Exception as e:
        print(f"Error with soft deleting dashboard ({dashboard_id}): {e}")


def soft_delete_look(look_id: str) -> None:
    """Soft delete the given Look via ``sdk.update_look``.

    While safe mode is on, the update body carries ``deleted=False``, so the
    Look is not actually archived even though the success message is printed.
    Errors are printed, not raised.

    Args:
        look_id: ID of the Look to update.
    """
    # todo: to toggle off safe mode and soft delete Looks, comment out `deleted=False` line and uncomment `deleted=True` line
    look = models40.WriteLookWithQuery(deleted=False)
    # look = models40.WriteLookWithQuery(deleted=True)
    try:
        sdk.update_look(look_id, body=look)
        print(f"Successfully soft deleted Look: {look_id}")
    except Exception as e:
        print(f"Error with soft deleting Look ({look_id}): {e}")


def hard_delete_dashboard(dashboard_id: str) -> None:
    """Hard (permanently) delete a dashboard. There is no undo for this!

    While safe mode is on, the delete call is commented out, so only the
    success message is printed. Errors are printed, not raised.

    Args:
        dashboard_id: ID of the dashboard to delete.
    """
    try:
        # todo: to toggle off safe mode and hard delete dashboards, uncomment the delete_dashboard() method
        # sdk.delete_dashboard(dashboard_id)
        print(f"Successfully permanently deleted dashboard: {dashboard_id}")
    except Exception as e:
        print(f"Error permanently deleting dashboard ({dashboard_id}): {e}")


def hard_delete_look(look_id: str) -> None:
    """Hard (permanently) delete a Look. There is no undo for this!

    While safe mode is on, the delete call is commented out, so only the
    success message is printed. Errors are printed, not raised.

    Args:
        look_id: ID of the Look to delete.
    """
    try:
        # todo: to toggle off safe mode and hard delete Looks, uncomment the delete_look() method
        # sdk.delete_look(look_id)
        print(f"Successfully permanently deleted Look: {look_id}")
    except Exception as e:
        print(f"Error permanently deleting Look ({look_id}): {e}")


def backup_dashboard_lookml(dashboard_id: str, dashboard_title: str):
    """Save a user-defined dashboard's LookML to the GCS backup bucket.

    The object is written to
    ``dashboards_<YYYY-MM-DD>/<dashboard_id>-<dashboard_title>.json`` in
    ``GCS_BUCKET_NAME``.

    Args:
        dashboard_id: ID of the dashboard to back up.
        dashboard_title: Dashboard title, used in the object name.

    Returns:
        The bucket handle the LookML was uploaded to, or ``None`` when nothing
        was uploaded (LookML could not be fetched, was empty, or the upload
        failed). Failures are printed, not raised.
    """
    created_date = datetime.today().strftime('%Y-%m-%d')
    folder_name = f"dashboards_{created_date}"
    file_name = f'{dashboard_id}-{dashboard_title}'
    full_path = f"{folder_name}" + "/" + file_name + ".json"

    try:
        dashboard_lookml = sdk.dashboard_lookml(
            dashboard_id=dashboard_id)['lookml']
    except error.SDKError as e:
        print(
            f"Broken dashboard, dashboard LookML was not imported for dashboard {dashboard_id}: {e}")
        return None

    if not dashboard_lookml:
        return None

    try:
        # bucket() builds a local handle without an API request. get_bucket()
        # fetches bucket metadata, which needs storage.buckets.get -- a
        # permission an object-creator-only service account does not have.
        bucket = storage_client.bucket(GCS_BUCKET_NAME)
        blob = bucket.blob(full_path)
        blob.upload_from_string(dashboard_lookml)
    except exceptions.GoogleCloudError as e:
        print(f"Error uploading dashboard {dashboard_id} to GCS: {e}")
        return None

    print(f"Successful GCS back up of dashboard: {dashboard_id}")
    return bucket
[Google Cloud Function & BigQuery: Run a query in Looker, and write the result to a BigQuery table](cloud-function-write-to-bigquery) +- [Google Cloud Function, Cloud Scheduler, & GCS: Automate the Looker content cleanup process](cloud-function-content-cleanup-automation) ## Connection : Manage Database Connections