Skip to content

Commit

Permalink
Set up sitemaps (#2297)
Browse files Browse the repository at this point in the history
Set up sitemap generation on a cronjob, with the output going to a bucket.

The bucket is served via the load balancer on paths prefixed by
`/sitemap_`

This PR contains:
- New bucket for prod and test
- New bucket backend for serving assets
- Load balancer config to route requests to the bucket backend when
appropriate
- Cron config for prod and test
  • Loading branch information
another-rex committed Jun 12, 2024
1 parent c50b373 commit d4509b6
Show file tree
Hide file tree
Showing 14 changed files with 132 additions and 78 deletions.
24 changes: 24 additions & 0 deletions deployment/clouddeploy/gke-workers/base/generate-sitemap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# CronJob that runs the sitemap generator script every day at 08:30
# (in the cluster's CronJob time zone).
#
# BASE_URL is not defined in this base manifest; it is expected to be supplied
# as a container env var by the per-environment overlay patches.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: generate-sitemap
spec:
  schedule: "30 8 * * *"
  # Skip a scheduled run if the previous one has not finished yet.
  concurrencyPolicy: Forbid
  jobTemplate:
    spec:
      template:
        spec:
          containers:
          - name: generate-sitemap-cron
            image: cron
            imagePullPolicy: Always
            # `command` is executed without a shell, and Kubernetes only
            # expands env var references written as $(VAR_NAME). A bare
            # "$BASE_URL" would be handed to the script as a literal string.
            command: ["/usr/local/bin/generate_sitemap/generate_sitemap.py", "--base_url", "$(BASE_URL)"]
            resources:
              requests:
                cpu: 1
                memory: "4G"
              limits:
                cpu: 1
                memory: "6G"
          restartPolicy: OnFailure
1 change: 1 addition & 0 deletions deployment/clouddeploy/gke-workers/base/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ resources:
- alias-computation.yaml
- nvd-mirror.yaml
- backup.yaml
- generate-sitemap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Partial CronJob manifest for generate-sitemap: it carries no schedule or
# image, only environment variables for the generate-sitemap-cron container.
# Presumably applied as a kustomize patch over the base CronJob of the same
# name -- confirm against the overlay's kustomization.yaml.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: generate-sitemap
spec:
  jobTemplate:
    spec:
      template:
        spec:
          containers:
          - name: generate-sitemap-cron
            env:
            # Base URL handed to the sitemap generator (no trailing slash).
            - name: BASE_URL
              value: "https://test.osv.dev"
            # GCP project the job runs against.
            - name: GOOGLE_CLOUD_PROJECT
              value: oss-vdb-test
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ patches:
- path: nvd-mirror.yaml
- path: alias-computation.yaml
- path: backup.yaml
- path: generate-sitemap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Partial CronJob manifest for generate-sitemap: it carries no schedule or
# image, only environment variables for the generate-sitemap-cron container.
# Presumably applied as a kustomize patch over the base CronJob of the same
# name -- confirm against the overlay's kustomization.yaml.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: generate-sitemap
spec:
  jobTemplate:
    spec:
      template:
        spec:
          containers:
          - name: generate-sitemap-cron
            env:
            # Base URL handed to the sitemap generator (no trailing slash).
            - name: BASE_URL
              value: "https://osv.dev"
            # GCP project the job runs against.
            - name: GOOGLE_CLOUD_PROJECT
              value: oss-vdb
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ patches:
- path: nvd-mirror.yaml
- path: alias-computation.yaml
- path: backup.yaml
- path: generate-sitemap.yaml
1 change: 1 addition & 0 deletions deployment/terraform/environments/oss-vdb-test/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ module "osv_test" {
logs_bucket = "osv-test-logs"
cve_osv_conversion_bucket = "osv-test-cve-osv-conversion"
debian_osv_conversion_bucket = "osv-test-debian-osv"
osv_dev_sitemap_bucket = "test-osv-dev-sitemap"
backups_bucket = "osv-test-backup"
backups_bucket_retention_days = 5
affected_commits_backups_bucket = "osv-test-affected-commits"
Expand Down
1 change: 1 addition & 0 deletions deployment/terraform/environments/oss-vdb/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ module "osv" {
cve_osv_conversion_bucket = "cve-osv-conversion"
debian_osv_conversion_bucket = "debian-osv"
logs_bucket = "osv-logs"
osv_dev_sitemap_bucket = "osv-dev-sitemap"
backups_bucket = "osv-backup"
backups_bucket_retention_days = 60
affected_commits_backups_bucket = "osv-affected-commits"
Expand Down
11 changes: 11 additions & 0 deletions deployment/terraform/modules/osv/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,17 @@ resource "google_storage_bucket" "affected_commits_backups_bucket" {
}
}

# GCS bucket for the osv.dev sitemap; its name comes from
# var.osv_dev_sitemap_bucket.
resource "google_storage_bucket" "osv_dev_sitemap_bucket" {
  name          = var.osv_dev_sitemap_bucket
  project       = var.project_id
  location      = "US"
  storage_class = "STANDARD"

  # Access is controlled through bucket-level IAM only (no per-object ACLs).
  uniform_bucket_level_access = true

  lifecycle {
    # Make Terraform reject any plan that would destroy this bucket.
    prevent_destroy = true
  }
}

# Service account permissions
resource "google_service_account" "deployment_service" {
project = var.project_id
Expand Down
7 changes: 6 additions & 1 deletion deployment/terraform/modules/osv/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ variable "debian_osv_conversion_bucket" {
description = "Name of bucket to store converted debian advisories in."
}

# Required input (no default): name of the sitemap bucket.
variable "osv_dev_sitemap_bucket" {
  description = "Name of bucket to store the osv.dev sitemap."
  type        = string
}

variable "api_url" {
type = string
description = "URL to serve the OSV API on. Domain ownership and DNS settings has to be set up manually."
Expand All @@ -61,4 +66,4 @@ variable "esp_version" {
variable "website_domain" {
type = string
description = "Domain to serve the OSV website on. Domain ownership and DNS settings must be manually configured."
}
}
22 changes: 21 additions & 1 deletion deployment/terraform/modules/osv/website.tf
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,15 @@ resource "google_certificate_manager_certificate_map_entry" "website" {
}
}

# Create LB backend buckets
# Load balancer backend that serves objects from the sitemap GCS bucket.
resource "google_compute_backend_bucket" "osv_dev_sitemap_backend" {
  name        = "osv-dev-sitemap-backend"
  project     = var.project_id
  description = "Backend for GCS bucket containing osv.dev's sitemap"

  # Points at the bucket declared as google_storage_bucket.osv_dev_sitemap_bucket.
  bucket_name = google_storage_bucket.osv_dev_sitemap_bucket.id

  # Cloud CDN is left off for this backend.
  enable_cdn = false
}

# Load Balancer
module "gclb" {
source = "terraform-google-modules/lb-http/google//modules/serverless_negs"
Expand Down Expand Up @@ -145,8 +154,19 @@ resource "google_compute_url_map" "website" {
path_matcher {
name = "allpaths"
default_service = module.gclb.backend_services.cloudrun.id

// Sitemap specific URLs
route_rules {
priority = 1
match_rules {
prefix_match = "/sitemap_"
}
service = google_compute_backend_bucket.osv_dev_sitemap_backend.id
}

// Rest of the website should go to the cloud run
route_rules {
priority = 10
match_rules {
prefix_match = "/"
}
Expand Down Expand Up @@ -174,4 +194,4 @@ output "website_dns_records" {
name = "${var.website_domain}."
type = "AAAA"
}], google_certificate_manager_dns_authorization.website.dns_resource_record)
}
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# Copyright 2021 Google LLC
#!/usr/bin/env python3

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -12,8 +14,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Generate sitemap."""
import gzip
import shutil
import sys
import os
import osv
Expand All @@ -24,8 +24,9 @@

from xml.etree.ElementTree import Element, SubElement, ElementTree

_SITEMAPS_DIRECTORY = './sitemap'
_SITEMAP_INDEX_PATH = f'{_SITEMAPS_DIRECTORY}/index.xml'
_OUTPUT_DIRECTORY = './sitemap_output'
_SITEMAPS_PREFIX = 'sitemap_'
_SITEMAP_INDEX_PATH = f'./{_SITEMAPS_PREFIX}index.xml'
_SITEMAP_URL_LIMIT = 49999


Expand All @@ -48,18 +49,16 @@ def osv_get_ecosystems():

def get_sitemap_filename_for_ecosystem(ecosystem: str) -> str:
ecosystem_name = ecosystem.replace(' ', '_').replace('.', '__').strip()
return f'{_SITEMAPS_DIRECTORY}/{ecosystem_name}.xml'
return f'./{_SITEMAPS_PREFIX}{ecosystem_name}.xml'


def get_sitemap_url_for_ecosystem(ecosystem: str, base_url: str) -> str:
ecosystem_name = ecosystem.replace(' ', '_').replace('.', '__').strip()
return f'{base_url}/sitemap/{ecosystem_name}.xml'
return f'{base_url}/{_SITEMAPS_PREFIX}{ecosystem_name}.xml'


def generate_sitemap_for_ecosystem(ecosystem: str, base_url: str) -> None:
"""Generate a sitemap for the given ecosystem."""
os.makedirs(_SITEMAPS_DIRECTORY, exist_ok=True)

vulnerability_ids = fetch_vulnerability_ids(ecosystem)
filename = get_sitemap_filename_for_ecosystem(ecosystem)
urlset = Element(
Expand All @@ -78,22 +77,8 @@ def generate_sitemap_for_ecosystem(ecosystem: str, base_url: str) -> None:
tree.write(filename, encoding='utf-8', xml_declaration=True)


def compress_file(file_path: str) -> str:
"""Compress the file using gzip and return the path to the compressed file."""
base, _ = os.path.splitext(file_path)
compressed_file_path = f"{base}.gz"
with open(file_path, 'rb') as f_in:
with gzip.open(compressed_file_path, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
# Remove the original uncompressed file
os.remove(file_path)
return compressed_file_path


def generate_sitemap_index(ecosystems: set[str], base_url: str) -> None:
"""Generate a sitemap index."""
os.makedirs(_SITEMAPS_DIRECTORY, exist_ok=True)

sitemapindex = Element(
'sitemapindex', xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")

Expand All @@ -118,17 +103,21 @@ def generate_sitemaps(base_url: str) -> None:
}
for ecosystem in base_ecosystems:
generate_sitemap_for_ecosystem(ecosystem, base_url)
compress_file(get_sitemap_filename_for_ecosystem(ecosystem))

generate_sitemap_index(base_ecosystems, base_url)
compress_file(_SITEMAP_INDEX_PATH)


def main() -> int:
parser = argparse.ArgumentParser(description='Generate sitemaps.')
parser.add_argument(
'--base_url', required=True, help='The base URL for the sitemap entries.')
'--base_url',
required=True,
help='The base URL for the sitemap entries (without trailing /).')
args = parser.parse_args()

os.makedirs(_OUTPUT_DIRECTORY, exist_ok=True)
os.chdir(_OUTPUT_DIRECTORY)

generate_sitemaps(args.base_url)
return 0

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@
# limitations under the License.
import unittest
import tempfile
import os
import gzip
from unittest.mock import patch, MagicMock
import generate_sitemap
import osv
Expand All @@ -32,27 +30,6 @@ def temp_file(self):
self.test_file.close()
return self.test_file.name

def test_compress_file(self):
"""Test it compresses the file and removes the original file."""
input_filename = self.temp_file()

# Call the compress_file function
compressed_file_path = generate_sitemap.compress_file(input_filename)

# Verify that the original file is removed
self.assertFalse(os.path.exists(input_filename))

# Verify that the compressed file is created
self.assertTrue(os.path.exists(compressed_file_path))

# Verify the contents of the compressed file
with gzip.open(compressed_file_path, 'rb') as f:
content = f.read()
self.assertEqual(content, b'This is a test file.')

# Clean up compressed file created during the test
os.remove(compressed_file_path)

@patch.object(osv.Bug, 'query')
def test_fetch_vulnerability_ids(self, mock_query):
"""Test it returns the vulnerability ids for ecosystem"""
Expand All @@ -79,25 +56,21 @@ def test_osv_get_ecosystems(self, mock_query):

@patch('generate_sitemap.fetch_vulnerability_ids')
@patch('generate_sitemap.ElementTree')
@patch('generate_sitemap.os.makedirs')
def test_generate_sitemap_for_ecosystem(self, mock_makedirs,
mock_element_tree, mock_fetch_vulns):
def test_generate_sitemap_for_ecosystem(self, mock_element_tree,
mock_fetch_vulns):
"""Check it generates the sitemap for ecosystem"""
mock_fetch_vulns.return_value = ['vuln1', 'vuln2']
mock_tree = MagicMock()
mock_element_tree.return_value = mock_tree

generate_sitemap.generate_sitemap_for_ecosystem('Go', 'http://example.com')

mock_makedirs.assert_called_once_with('./sitemap', exist_ok=True)
mock_tree.write.assert_called_once_with(
'./sitemap/Go.xml', encoding='utf-8', xml_declaration=True)
'./sitemap_Go.xml', encoding='utf-8', xml_declaration=True)

@patch('generate_sitemap.fetch_vulnerability_ids')
@patch('generate_sitemap.ElementTree')
@patch('generate_sitemap.os.makedirs')
def test_generate_sitemap_for_ecosystem_with_space(self, mock_makedirs,
mock_element_tree,
def test_generate_sitemap_for_ecosystem_with_space(self, mock_element_tree,
mock_fetch_vulns):
""""
Check it creates the sitemap correctly where there is a space in the
Expand All @@ -110,15 +83,12 @@ def test_generate_sitemap_for_ecosystem_with_space(self, mock_makedirs,
generate_sitemap.generate_sitemap_for_ecosystem('Rocky Linux',
'http://example.com')

mock_makedirs.assert_called_once_with('./sitemap', exist_ok=True)
mock_tree.write.assert_called_once_with(
'./sitemap/Rocky_Linux.xml', encoding='utf-8', xml_declaration=True)
'./sitemap_Rocky_Linux.xml', encoding='utf-8', xml_declaration=True)

@patch('generate_sitemap.fetch_vulnerability_ids')
@patch('generate_sitemap.ElementTree')
@patch('generate_sitemap.os.makedirs')
def test_generate_sitemap_for_ecosystem_with_period(self, mock_makedirs,
mock_element_tree,
def test_generate_sitemap_for_ecosystem_with_period(self, mock_element_tree,
mock_fetch_vulns):
""""
Check it creates the sitemap correctly where there is a period in the
Expand All @@ -131,29 +101,25 @@ def test_generate_sitemap_for_ecosystem_with_period(self, mock_makedirs,
generate_sitemap.generate_sitemap_for_ecosystem('crates.io',
'http://example.com')

mock_makedirs.assert_called_once_with('./sitemap', exist_ok=True)
mock_tree.write.assert_called_once_with(
'./sitemap/crates__io.xml', encoding='utf-8', xml_declaration=True)
'./sitemap_crates__io.xml', encoding='utf-8', xml_declaration=True)

@patch('generate_sitemap.ElementTree')
@patch('generate_sitemap.os.makedirs')
def test_generate_sitemap_index(self, mock_makedirs, mock_element_tree):
def test_generate_sitemap_index(self, mock_element_tree):
"""Check it generates the sitemap index as expected"""
mock_tree = MagicMock()
mock_element_tree.return_value = mock_tree

generate_sitemap.generate_sitemap_index({'Go', 'UVI'}, 'http://example.com')

mock_makedirs.assert_called_once_with('./sitemap', exist_ok=True)
mock_tree.write.assert_called_once_with(
'./sitemap/index.xml', encoding='utf-8', xml_declaration=True)
'./sitemap_index.xml', encoding='utf-8', xml_declaration=True)

@patch('generate_sitemap.generate_sitemap_for_ecosystem')
@patch('generate_sitemap.generate_sitemap_index')
@patch('generate_sitemap.osv_get_ecosystems')
@patch('generate_sitemap.compress_file')
def test_generate_sitemap(self, mock_compress_file, mock_get_ecosystems,
mock_generate_index, mock_generate_sitemap):
def test_generate_sitemap(self, mock_get_ecosystems, mock_generate_index,
mock_generate_sitemap):
"""
Check the outer wrapper generates the ecosystems' sitemaps as well as
sitemap index.
Expand All @@ -166,11 +132,6 @@ def test_generate_sitemap(self, mock_compress_file, mock_get_ecosystems,
mock_generate_sitemap.assert_any_call('Go', 'http://example.com')
mock_generate_sitemap.assert_any_call('Android', 'http://example.com')

self.assertEqual(mock_compress_file.call_count, 3)
mock_compress_file.assert_any_call('./sitemap/Go.xml')
mock_compress_file.assert_any_call('./sitemap/Android.xml')
mock_compress_file.assert_any_call('./sitemap/index.xml')

mock_generate_index.assert_called_once_with({'Android', 'Go'},
'http://example.com')

Expand Down
Loading

0 comments on commit d4509b6

Please sign in to comment.