Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DISCO-3202 - Enrich the curated-recommendations endpoint with icon-urls #784

Merged
merged 1 commit into from
Feb 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions merino/curated_recommendations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
from merino.utils.http_client import create_http_client
from merino.utils.synced_gcs_blob import SyncedGcsBlob

from merino.providers.manifest import get_provider as get_manifest_provider

logger = logging.getLogger(__name__)

_provider: CuratedRecommendationsProvider
Expand Down Expand Up @@ -121,6 +123,7 @@ def init_provider() -> None:
http_client=create_http_client(base_url=""),
graph_config=CorpusApiGraphConfig(),
metrics_client=get_metrics_client(),
manifest_provider=get_manifest_provider(),
)

extended_expiration_corpus_backend = ExtendedExpirationCorpusBackend(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
ScheduledSurfaceId,
)
from merino.exceptions import BackendError
from merino.providers.manifest import Provider as ManifestProvider
from merino.utils.version import fetch_app_version_from_file

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -123,6 +124,7 @@ class CorpusApiBackend(CorpusBackend):
http_client: AsyncClient
graph_config: CorpusApiGraphConfig
metrics_client: aiodogstatsd.Client
manifest_provider: ManifestProvider

# time-to-live was chosen because 1 minute (+/- 10 s) is short enough that updates by curators
# such as breaking news or editorial corrections propagate fast enough, and that the request
Expand All @@ -137,10 +139,12 @@ def __init__(
http_client: AsyncClient,
graph_config: CorpusApiGraphConfig,
metrics_client: aiodogstatsd.Client,
manifest_provider: ManifestProvider,
):
self.http_client = http_client
self.graph_config = graph_config
self.metrics_client = metrics_client
self.manifest_provider = manifest_provider
self._cache = {}
self._background_tasks = set()

Expand Down Expand Up @@ -257,7 +261,9 @@ async def fetch(
before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def _fetch_from_backend(
self, surface_id: ScheduledSurfaceId, days_offset: int
self,
surface_id: ScheduledSurfaceId,
days_offset: int,
) -> list[CorpusItem]:
"""Issue a scheduledSurface query"""
query = """
Expand Down Expand Up @@ -321,6 +327,9 @@ async def _fetch_from_backend(
item["corpusItem"]["url"] = self.update_url_utm_source(
item["corpusItem"]["url"], str(utm_source)
)
# Add icon URL if available
if icon_url := self.manifest_provider.get_icon_url(item["corpusItem"]["url"]):
item["corpusItem"]["iconUrl"] = icon_url

curated_recommendations = [
CorpusItem(
Expand Down
1 change: 1 addition & 0 deletions merino/curated_recommendations/corpus_backends/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ class CorpusItem(BaseModel):
publisher: str
isTimeSensitive: bool
imageUrl: HttpUrl
iconUrl: HttpUrl | None = None


class CorpusBackend(Protocol):
Expand Down
34 changes: 33 additions & 1 deletion merino/providers/manifest/provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
import asyncio
import time
import logging
from urllib.parse import urlparse

from pydantic import HttpUrl

from merino.providers.manifest.backends.filemanager import GetManifestResultCode
from merino.utils import cron
Expand All @@ -20,6 +23,7 @@ class Provider:
"""Provide access to in-memory manifest data fetched from GCS."""

manifest_data: ManifestData | None
domain_lookup_table: dict[str, int]
cron_task: asyncio.Task
gruberb marked this conversation as resolved.
Show resolved Hide resolved
resync_interval_sec: int
cron_interval_sec: int
Expand All @@ -39,6 +43,7 @@ def __init__(
self.cron_interval_sec = cron_interval_sec
self.last_fetch_at = 0.0
self.manifest_data = ManifestData(domains=[])
self.domain_lookup_table = {}
self.data_fetched_event = asyncio.Event()

super().__init__()
Expand All @@ -63,8 +68,11 @@ async def _fetch_data(self) -> None:
result_code, data = await self.backend.fetch()

match GetManifestResultCode(result_code):
case GetManifestResultCode.SUCCESS:
case GetManifestResultCode.SUCCESS if data is not None:
self.manifest_data = data
self.domain_lookup_table = {
domain.domain: idx for idx, domain in enumerate(data.domains)
}
self.last_fetch_at = time.time()

case GetManifestResultCode.SKIP:
Expand All @@ -86,3 +94,27 @@ def _should_fetch(self) -> bool:
def get_manifest_data(self) -> ManifestData | None:
"""Return manifest data"""
return self.manifest_data

def get_icon_url(self, url: str | HttpUrl) -> str | None:
"""Get icon URL for a domain.

Args:
url: Full URL to extract domain from (string or HttpUrl)

Returns:
Icon URL if found, None otherwise
"""
try:
url_str = str(url)
# Remove www. and get the domain
domain = urlparse(url_str).netloc.replace("www.", "")
# Strip TLD by taking first part of domain
base_domain = domain.split(".")[0] if "." in domain else domain

idx = self.domain_lookup_table.get(base_domain, -1)
if idx >= 0 and self.manifest_data is not None:
return self.manifest_data.domains[idx].icon

gruberb marked this conversation as resolved.
Show resolved Hide resolved
except Exception as e:
logger.warning(f"Error getting icon for URL {url}: {e}")
return None
gruberb marked this conversation as resolved.
Show resolved Hide resolved
44 changes: 44 additions & 0 deletions tests/integration/api/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,16 @@

from logging import LogRecord
from typing import Iterator, Generator
from unittest.mock import AsyncMock

import pytest
import orjson
from starlette.testclient import TestClient
from aiodogstatsd import Client as AioDogstatsdClient

from merino.providers.manifest import Provider
from merino.providers.manifest.backends.manifest import ManifestBackend
from merino.providers.manifest.backends.protocol import GetManifestResultCode, ManifestData
from merino.utils.gcs.gcs_uploader import GcsUploader
from contextlib import nullcontext
from merino.curated_recommendations.fakespot_backend.protocol import (
Expand Down Expand Up @@ -134,3 +139,42 @@ def fakespot_feed() -> FakespotFeed:
footerCopy=FAKESPOT_FOOTER_COPY,
cta=FakespotCTA(ctaCopy=FAKESPOT_CTA_COPY, url=FAKESPOT_CTA_URL),
)


@pytest.fixture
def mock_manifest():
"""Mock manifest data with a known icon URL."""
return {
"domains": [
{
"rank": 1,
"domain": "spotify",
"categories": ["Entertainment"],
"serp_categories": [0],
"url": "https://www.spotify.com",
"title": "Spotify",
"icon": "https://test.com/spotify-favicon.ico",
}
]
}


@pytest.fixture
def mock_manifest_backend(mock_manifest):
"""Mock ManifestBackend that returns our test data."""
backend = ManifestBackend()
backend.fetch = AsyncMock(
return_value=(GetManifestResultCode.SUCCESS, ManifestData(**mock_manifest))
)
return backend


@pytest.fixture
def manifest_provider(mock_manifest_backend):
"""Override the manifest provider fixture with our mocked data."""
provider = Provider(
backend=mock_manifest_backend,
resync_interval_sec=86400,
cron_interval_sec=3600,
)
return provider
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,12 @@ def corpus_http_client(fixture_response_data, fixture_request_data) -> AsyncMock


@pytest.fixture()
def corpus_backend(corpus_http_client: AsyncMock) -> CorpusApiBackend:
def corpus_backend(corpus_http_client: AsyncMock, manifest_provider) -> CorpusApiBackend:
"""Mock corpus api backend."""
# Initialize the backend with the mock HTTP client
return CorpusApiBackend(
http_client=corpus_http_client,
graph_config=CorpusApiGraphConfig(),
metrics_client=get_metrics_client(),
manifest_provider=manifest_provider,
)
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ async def test_fetch(corpus_backend: CorpusApiBackend, fixture_response_data):
),
scheduledCorpusItemId="de614b6b-6df6-470a-97f2-30344c56c1b3",
corpusItemId="4095b364-02ff-402c-b58a-792a067fccf2",
iconUrl=None,
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@
)
from merino.curated_recommendations.protocol import CuratedRecommendation
from merino.main import app
from merino.providers.manifest import get_provider as get_manifest_provider
from merino.providers.manifest.backends.protocol import Domain
from tests.integration.api.conftest import fakespot_feed


Expand Down Expand Up @@ -121,6 +123,15 @@ def extended_expiration_corpus_backend(
)


@pytest.fixture(autouse=True)
def setup_manifest_provider(manifest_provider):
"""Set up the manifest provider dependency"""
app.dependency_overrides[get_manifest_provider] = lambda: manifest_provider
yield
if get_manifest_provider in app.dependency_overrides:
del app.dependency_overrides[get_manifest_provider]
gruberb marked this conversation as resolved.
Show resolved Hide resolved


@pytest.fixture(name="corpus_provider")
def provider(
corpus_backend: CorpusApiBackend,
Expand All @@ -140,7 +151,7 @@ def provider(


@pytest.fixture(autouse=True)
def setup_providers(corpus_provider):
def setup_suggest_providers(corpus_provider):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def setup_suggest_providers(corpus_provider):
def setup_curated_recommendations_provider(corpus_provider):

"""Set up the curated recommendations provider"""
app.dependency_overrides[get_provider] = lambda: corpus_provider

Expand Down Expand Up @@ -1529,3 +1540,75 @@ def mock_post_by_days_ago(*args, **kwargs):
else:
# Check that only today's items are returned if in control or not in the experiment.
assert set(days_ago_counter.keys()) == {0}


@pytest.mark.asyncio
async def test_curated_recommendations_enriched_with_icons(
manifest_provider,
corpus_http_client,
fixture_request_data,
):
"""Test the enrichment of a curated recommendation with an added icon-url."""
# Set up the manifest data first
manifest_provider.manifest_data.domains = [
Domain(
rank=2,
title="Microsoft – AI, Cloud, Productivity, Computing, Gaming & Apps",
url="https://www.microsoft.com",
domain="microsoft",
icon="https://merino-images.services.mozilla.com/favicons/microsoft-icon.png",
categories=["Business", "Information Technology"],
serp_categories=[0],
)
]
manifest_provider.domain_lookup_table = {"microsoft": 0}

mocked_response = {
"data": {
"scheduledSurface": {
"items": [
{
"id": "scheduledSurfaceItemId-ABC",
"corpusItem": {
"id": "corpusItemId-XYZ",
"url": "https://www.microsoft.com/some-article?utm_source=firefox-newtab-en-us",
"title": "Some MS Article",
"excerpt": "All about Microsoft something",
"topic": "tech",
"publisher": "ExamplePublisher",
"isTimeSensitive": False,
"imageUrl": "https://somewhere.com/test.jpg",
},
}
]
}
}
}
corpus_http_client.post.return_value = Response(
status_code=200,
json=mocked_response,
request=fixture_request_data,
)

async with AsyncClient(app=app, base_url="http://test") as ac:
response = await ac.post(
"/api/v1/curated-recommendations",
json={"locale": "en-US"},
)
assert response.status_code == 200

data = response.json()
items = data["data"]
assert len(items) == 1

item = items[0]
assert item["url"] == "https://www.microsoft.com/some-article?utm_source=firefox-newtab-en-us"

assert "iconUrl" in item
assert (
item["iconUrl"] == "https://merino-images.services.mozilla.com/favicons/microsoft-icon.png"
)

# Clean up
if get_provider in app.dependency_overrides:
del app.dependency_overrides[get_provider]
24 changes: 2 additions & 22 deletions tests/integration/api/v1/manifest/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,24 +11,6 @@
from merino.web.api_v1 import router


@pytest.fixture(scope="module")
def mock_manifest_2025() -> dict:
"""Mock manifest json for the year 2025"""
return {
"domains": [
{
"rank": 1,
"domain": "spotify",
"categories": ["Entertainment"],
"serp_categories": [0],
"url": "https://www.spotify.com",
"title": "Spotify",
"icon": "",
}
]
}


@pytest.fixture(autouse=True, name="cleanup")
def cleanup_tasks_fixture():
"""Return a method that cleans up existing cron tasks after initialization"""
Expand All @@ -46,9 +28,7 @@ async def cleanup_tasks(provider: Provider):


@pytest.mark.asyncio
async def test_get_manifest_success(
client_with_metrics, gcp_uploader, mock_manifest_2025, cleanup
):
async def test_get_manifest_success(client_with_metrics, gcp_uploader, mock_manifest, cleanup):
"""Uploads a manifest to the gcs bucket and verifies that the endpoint returns the uploaded file."""
# initialize provider on startup
await init_provider()
Expand All @@ -57,7 +37,7 @@ async def test_get_manifest_success(
app.include_router(router, prefix="/api/v1")

# upload a manifest file to GCS test container
gcp_uploader.upload_content(orjson.dumps(mock_manifest_2025), "top_picks_latest.json")
gcp_uploader.upload_content(orjson.dumps(mock_manifest), "top_picks_latest.json")

provider = get_provider()
await provider.data_fetched_event.wait()
Expand Down
Loading