Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🎉 Admin for related charts #3833

Draft
wants to merge 7 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions apps/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,12 @@ def cli_back() -> None:
"anomalist": "apps.anomalist.cli.cli",
},
},
{
"name": "Related Charts",
"commands": {
"related-charts": "apps.related_charts.cli.cli",
},
},
]
# Add subgroups (don't modify)
+ subgroups
Expand Down
4 changes: 1 addition & 3 deletions apps/housekeeper/charts.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,9 +214,7 @@ def _get_main_message_usage(chart, refs):
def send_extra_messages(chart, refs, **kwargs):
"""Provide more context in the thread"""
## 1/ Similar charts
similar_messages = (
f"🕵️ <{OWID_ENV.wizard_url}similar_charts?chart_search_text={chart['slug']}| → Explore similar charts>"
)
similar_messages = f"🕵️ <{OWID_ENV.wizard_url}similar_charts?slug={chart['slug']}| → Explore similar charts>"

## 2/ AI: Chart description, chart edit timeline, suggestion
log.info("Getting AI summary...")
Expand Down
Empty file added apps/related_charts/__init__.py
Empty file.
217 changes: 217 additions & 0 deletions apps/related_charts/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
import datetime as dt
from typing import Optional

import click
import pandas as pd
import structlog
from rich_click.rich_command import RichCommand
from sqlalchemy import text
from tqdm.auto import tqdm

from apps.wizard.app_pages.similar_charts import data
from etl import config
from etl.db import get_engine

config.enable_bugsnag()
log = structlog.get_logger()


def load_data(chart_slug: Optional[str]) -> tuple[pd.DataFrame, pd.DataFrame]:
"""
Load chart data and coview sessions DataFrame.

Returns:
charts: DataFrame indexed by slug, containing metadata (like chart_id, views_365d, etc.)
coviews_df: DataFrame with MultiIndex (slug1, slug2)
and columns ['coviews', 'pageviews'].
"""
log.info("Loading chart data...")
charts = data.get_raw_charts().set_index("slug", drop=False)

# If chart_slug is provided, verify it's in charts
if chart_slug and chart_slug not in charts.index:
log.warning("Chart slug not found in data. Exiting.", chart_slug=chart_slug)
return pd.DataFrame(), pd.DataFrame()

log.info("Loading coview sessions...")
coviews_df = data.get_coviews_sessions(
after_date=str(dt.date.today() - dt.timedelta(days=365)), min_sessions=3
).to_frame(name="coviews")

# If a single chart slug is given, filter for that slug1 only
if chart_slug:
coviews_df = coviews_df[coviews_df.index.get_level_values("slug1") == chart_slug]

# Filter out any coviews rows whose slug1 isn't in our charts
coviews_df = coviews_df[coviews_df.index.get_level_values("slug1").isin(charts.index)]

# Add pageviews of slug2 to coviews dataframe
coviews_df["pageviews_1"] = charts["views_365d"].reindex(coviews_df.index.get_level_values("slug1")).values
coviews_df["pageviews_2"] = charts["views_365d"].reindex(coviews_df.index.get_level_values("slug2")).values

return charts, coviews_df


def compute_recommendations(
charts: pd.DataFrame,
coviews_df: pd.DataFrame,
chart_slug: Optional[str],
top: int,
regularization: float,
score_method: str,
) -> pd.DataFrame:
"""
Given charts and coview data, compute a DataFrame of recommended pairs:
chosen_chart, related_chart, chartId, relatedChartId, etc.

The 'score' is computed as:
score = coviews - regularization * pageviews

Args:
charts: DataFrame of chart metadata.
coviews_df: DataFrame with columns ['coviews', 'pageviews'].
chart_slug: Optional single slug to process; otherwise compute for all slugs.
top: How many top-related charts to retrieve for each slug.
regularization: Factor to penalize high-view charts.
score_method: Scoring method to use for recommendations.

Returns:
A DataFrame of recommended chart pairs (chosen_chart, related_chart, score, etc.).
"""
# If we failed to load data (e.g., an invalid slug), return empty
if charts.empty or coviews_df.empty:
return pd.DataFrame()

# Compute the score
if score_method == "coviews":
coviews_df["score"] = coviews_df["coviews"] - regularization * coviews_df["pageviews_2"]
elif score_method == "jaccard":
coviews_df["score"] = coviews_df["coviews"] / (
coviews_df["pageviews_1"] + coviews_df["pageviews_2"] - coviews_df["coviews"]
)
# If a single chart slug is requested, ensure we only keep those rows
if chart_slug:
if chart_slug not in coviews_df.index.get_level_values("slug1"):
log.info("No coview data for this chart slug.", chart_slug=chart_slug)
return pd.DataFrame()
coviews_df = coviews_df.loc[[chart_slug]]

recommended_rows = []

# Group by 'slug1' so each group has all charts related to that slug1
grouped = coviews_df.groupby(level="slug1", sort=False)
log.info("Calculating related charts...")

for slug1, group in tqdm(grouped, desc="Calculating related charts"):
top_related = group.sort_values("score", ascending=False).head(top)
for related_slug, score in zip(top_related.index.get_level_values("slug2"), top_related["score"]):
recommended_rows.append({"chosen_chart": slug1, "related_chart": related_slug, "score": score})

if not recommended_rows:
return pd.DataFrame()

# Build the recommendations DataFrame
recommended_df = pd.DataFrame(recommended_rows)
recommended_df["chartId"] = recommended_df["chosen_chart"].map(charts["chart_id"])
recommended_df["relatedChartId"] = recommended_df["related_chart"].map(charts["chart_id"])
recommended_df["label"] = "good"
recommended_df["reviewer"] = "production"

# Warn if some related_chart slugs can't be mapped to chartIds
ix_missing = recommended_df["relatedChartId"].isnull()
if ix_missing.any():
log.warning("Chart ID not found for some related chart slugs.", n_missing=ix_missing.sum())
recommended_df = recommended_df[~ix_missing]

return recommended_df


def write_recommendations(
engine, recommended_df: pd.DataFrame, charts: pd.DataFrame, chart_slug: Optional[str]
) -> None:
"""
Writes the recommended DataFrame to the 'related_charts' table in the database.
If 'chart_slug' is specified, only deletes existing rows for that slug before inserting.
Otherwise, clears all 'production' rows first.
"""
if recommended_df.empty:
log.info("No related charts found. Nothing to write.")
return

with engine.begin() as conn:
if chart_slug:
log.info("Deleting existing 'production' reviews for this chart.", chart_slug=chart_slug)
conn.execute(
text("""
DELETE FROM related_charts
WHERE reviewer = 'production' AND chartId = :chartId
"""),
{"chartId": charts.loc[chart_slug, "chart_id"]},
)
else:
log.info("Deleting all existing 'production' reviews.")
conn.execute(text("DELETE FROM related_charts WHERE reviewer = 'production'"))

log.info("Inserting new related chart records.", rows=len(recommended_df))
recommended_df[["chartId", "relatedChartId", "label", "reviewer", "score"]].to_sql(
"related_charts", con=conn, if_exists="append", index=False
)


@click.command(name="related-charts", cls=RichCommand, help=__doc__)
@click.option(
"--chart-slug",
type=str,
help="Get related charts only for the chart with this slug.",
)
@click.option(
"--top",
type=int,
default=6,
help="Pick the top N related charts.",
)
@click.option(
"--regularization", type=float, default=0.001, help="Factor by which to penalize charts with high pageviews."
)
@click.option(
"--score",
type=click.Choice(["jaccard", "coviews"]),
default="jaccard",
help="Scoring method to use for recommendations.",
)
@click.option(
"--dry-run/--no-dry-run",
default=False,
help="If set, no changes will be written to the database.",
)
def cli(chart_slug: Optional[str], top: int, regularization: float, score: str, dry_run: bool) -> None:
"""
Generates a table of related charts (by coviews) and optionally writes them
to the database. If a single chart slug is provided, only that chart’s
related charts will be generated.
"""
engine = get_engine()

# 1. Load data (no score calculated here)
charts, coviews_df = load_data(chart_slug)

# 2. Compute recommendations (score is applied here)
recommended_df = compute_recommendations(charts, coviews_df, chart_slug, top, regularization, score)

if recommended_df.empty:
log.info("No recommendations generated. Exiting.")
return

# 3. Dry-run check
if dry_run:
log.info("Dry run mode enabled. No changes will be written to the database.")
log.info("Recommended DataFrame preview:", data=recommended_df.head())
return

# 4. Otherwise, write to DB
write_recommendations(engine, recommended_df, charts, chart_slug)
log.info("Related charts updated successfully.")


if __name__ == "__main__":
cli()
4 changes: 1 addition & 3 deletions apps/wizard/app_pages/insight_search/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,12 @@

import streamlit as st
import torch
from joblib import Memory
from sentence_transformers import SentenceTransformer, util
from structlog import get_logger

from etl.config import memory
from etl.paths import CACHE_DIR

memory = Memory(CACHE_DIR, verbose=0)

# Initialize log.
log = get_logger()

Expand Down
Loading
Loading