From 6f021e8a1a3e075e705afa94351a6266f20214cc Mon Sep 17 00:00:00 2001 From: Will Chen Date: Tue, 10 Sep 2024 16:33:41 -0700 Subject: [PATCH] Create MVP AI console (#934) --- .gitignore | 3 + .gitmodules | 4 + ai/README.md | 18 ++ ai/src/ai/common/diff.py | 36 +++ ai/src/ai/common/entity_store.py | 49 ++++ ai/src/ai/common/example.py | 137 +++++++++++ ai/src/ai/common/executor.py | 128 +++++++++++ ai/src/ai/common/model.py | 17 ++ ai/src/ai/common/producer.py | 16 ++ ai/src/ai/common/prompt_context.py | 15 ++ ai/src/ai/common/prompt_fragment.py | 31 +++ ai/src/ai/console/__init__.py | 0 ai/src/ai/console/pages/__init__.py | 0 ai/src/ai/console/pages/add_edit_eval_page.py | 53 +++++ .../pages/add_edit_expected_examples_page.py | 56 +++++ .../pages/add_edit_golden_examples_page.py | 126 ++++++++++ .../ai/console/pages/add_edit_model_page.py | 25 ++ .../ai/console/pages/add_edit_page_helper.py | 165 ++++++++++++++ .../console/pages/add_edit_producer_page.py | 74 ++++++ .../pages/add_edit_prompt_context_page.py | 78 +++++++ .../pages/add_edit_prompt_fragment_page.py | 84 +++++++ .../pages/create_golden_dataset_page.py | 81 +++++++ ai/src/ai/console/pages/eval_item_page.py | 111 +++++++++ ai/src/ai/console/pages/eval_page.py | 100 ++++++++ ai/src/ai/console/pages/evals_page.py | 48 ++++ .../console/pages/expected_examples_page.py | 51 +++++ .../ai/console/pages/golden_examples_page.py | 61 +++++ ai/src/ai/console/pages/models_page.py | 44 ++++ ai/src/ai/console/pages/producers_page.py | 64 ++++++ .../ai/console/pages/prompt_contexts_page.py | 55 +++++ .../ai/console/pages/prompt_fragments_page.py | 57 +++++ ai/src/ai/console/scaffold.py | 190 ++++++++++++++++ ai/src/ai/offline_common/eval.py | 215 ++++++++++++++++++ ai/src/ai/offline_common/golden_dataset.py | 9 + ai/src/console.py | 68 ++++++ ai/src/migrate_goldens.py | 55 +++++ ai/src/service.py | 29 ++- 37 files changed, 2342 insertions(+), 11 deletions(-) create mode 100644 ai/src/ai/common/diff.py create mode 100644 ai/src/ai/common/entity_store.py create mode 100644 ai/src/ai/common/example.py create mode 100644 ai/src/ai/common/executor.py create mode 100644 ai/src/ai/common/model.py create mode 100644 ai/src/ai/common/producer.py create mode 100644 ai/src/ai/common/prompt_context.py create mode 100644 ai/src/ai/common/prompt_fragment.py create mode 100644 ai/src/ai/console/__init__.py create mode 100644 ai/src/ai/console/pages/__init__.py create mode 100644 ai/src/ai/console/pages/add_edit_eval_page.py create mode 100644 ai/src/ai/console/pages/add_edit_expected_examples_page.py create mode 100644 ai/src/ai/console/pages/add_edit_golden_examples_page.py create mode 100644 ai/src/ai/console/pages/add_edit_model_page.py create mode 100644 ai/src/ai/console/pages/add_edit_page_helper.py create mode 100644 ai/src/ai/console/pages/add_edit_producer_page.py create mode 100644 ai/src/ai/console/pages/add_edit_prompt_context_page.py create mode 100644 ai/src/ai/console/pages/add_edit_prompt_fragment_page.py create mode 100644 ai/src/ai/console/pages/create_golden_dataset_page.py create mode 100644 ai/src/ai/console/pages/eval_item_page.py create mode 100644 ai/src/ai/console/pages/eval_page.py create mode 100644 ai/src/ai/console/pages/evals_page.py create mode 100644 ai/src/ai/console/pages/expected_examples_page.py create mode 100644 ai/src/ai/console/pages/golden_examples_page.py create mode 100644 ai/src/ai/console/pages/models_page.py create mode 100644 ai/src/ai/console/pages/producers_page.py create mode 100644 
ai/src/ai/console/pages/prompt_contexts_page.py
 create mode 100644 ai/src/ai/console/pages/prompt_fragments_page.py
 create mode 100644 ai/src/ai/console/scaffold.py
 create mode 100644 ai/src/ai/offline_common/eval.py
 create mode 100644 ai/src/ai/offline_common/golden_dataset.py
 create mode 100644 ai/src/console.py
 create mode 100644 ai/src/migrate_goldens.py

diff --git a/.gitignore b/.gitignore
index e6ff7466a..e4db60611 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,9 @@
 __pycache__
 *.log
 
+# This is a git submodule
+/ai/data/
+
 # Do not save generated files
 /ai/ft/outputs/
 /ai/outputs/
diff --git a/.gitmodules b/.gitmodules
index 6317b4346..2a17e79e4 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,7 @@
 [submodule "third_party/angular_components"]
   path = third_party/angular_components
   url = https://github.com/angular/components.git
+
+[submodule "ai/data"]
+  path = ai/data
+  url = git@hf.co:datasets/wwwillchen/mesop-data
diff --git a/ai/README.md b/ai/README.md
index f9fc5a291..a9a61be6d 100644
--- a/ai/README.md
+++ b/ai/README.md
@@ -9,6 +9,24 @@ All the commands should be run from the `ai/` directory.
 - All entry-points are in `src/*.py` - this includes the AI service and scripts.
 - `src/common` contains code that's shared between offline scripts and the online service.
 
+## AI Console
+
+**Setup**:
+
+```sh
+git clone git@hf.co:datasets/wwwillchen/mesop-data data
+```
+
+**Running**:
+
+Inside `ai/src/`, run the following command:
+
+```sh
+mesop console.py --port=32124
+```
+
+> Note: you can run this on a separate port to avoid conflicting with the main Mesop development app.
+
 ## Scripts
 
 These are scripts used to generate and process data for offline evaluation.
diff --git a/ai/src/ai/common/diff.py b/ai/src/ai/common/diff.py
new file mode 100644
index 000000000..4e82a5deb
--- /dev/null
+++ b/ai/src/ai/common/diff.py
@@ -0,0 +1,36 @@
+import re
+from typing import NamedTuple
+
+EDIT_HERE_MARKER = " # <--- EDIT HERE"
+
+
+class ApplyPatchResult(NamedTuple):
+  has_error: bool
+  result: str
+
+
+def apply_patch(original_code: str, patch: str) -> ApplyPatchResult:
+  # Extract the diff content
+  diff_pattern = r"<<<<<<< ORIGINAL(.*?)=======\n(.*?)>>>>>>> UPDATED"
+  matches = re.findall(diff_pattern, patch, re.DOTALL)
+  patched_code = original_code
+  if len(matches) == 0:
+    print("[WARN] No diff found:", patch)
+    return ApplyPatchResult(
+      True,
+      "[AI-001] Sorry! AI output was mis-formatted. Please try again.",
+    )
+  for original, updated in matches:
+    original = original.strip().replace(EDIT_HERE_MARKER, "")
+    updated = updated.strip().replace(EDIT_HERE_MARKER, "")
+
+    # Replace the original part with the updated part
+    new_patched_code = patched_code.replace(original, updated, 1)
+    if new_patched_code == patched_code:
+      return ApplyPatchResult(
+        True,
+        "[AI-002] Sorry! AI output could not be used.
Please try again.", + ) + patched_code = new_patched_code + + return ApplyPatchResult(False, patched_code) diff --git a/ai/src/ai/common/entity_store.py b/ai/src/ai/common/entity_store.py new file mode 100644 index 000000000..b17f68bed --- /dev/null +++ b/ai/src/ai/common/entity_store.py @@ -0,0 +1,49 @@ +import os +from typing import Generic, TypeVar + +from pydantic import BaseModel + +T = TypeVar("T", bound=BaseModel) + + +def get_data_path(dirname: str) -> str: + return os.path.join( + os.path.dirname(__file__), "..", "..", "..", "data", dirname + ) + + +class EntityStore(Generic[T]): + def __init__(self, entity_type: type[T], *, dirname: str): + self.entity_type = entity_type + self.directory_path = get_data_path(dirname) + + def get(self, id: str) -> T: + file_path = os.path.join(self.directory_path, f"{id}.json") + with open(file_path) as f: + entity_json = f.read() + entity = self.entity_type.model_validate_json(entity_json) + return entity + + def get_all(self) -> list[T]: + entities: list[T] = [] + for filename in os.listdir(self.directory_path): + if filename.endswith(".json"): + file_path = os.path.join(self.directory_path, filename) + with open(file_path) as f: + entity_json = f.read() + entities.append(self.entity_type.model_validate_json(entity_json)) + entities.sort(key=lambda x: x.id, reverse=True) + return entities + + def save(self, entity: T, overwrite: bool = False): + id = entity.id # type: ignore + entity_path = os.path.join(self.directory_path, f"{id}.json") + if not overwrite and os.path.exists(entity_path): + raise ValueError( + f"{self.entity_type.__name__} with id {id} already exists" + ) + with open(entity_path, "w") as f: + f.write(entity.model_dump_json(indent=4)) + + def delete(self, entity_id: str): + os.remove(os.path.join(self.directory_path, f"{entity_id}.json")) diff --git a/ai/src/ai/common/example.py b/ai/src/ai/common/example.py new file mode 100644 index 000000000..ec91a7a0b --- /dev/null +++ b/ai/src/ai/common/example.py @@ -0,0 +1,137 @@ +""" +An example is a single input/output pair. + - Examples are used for fine-tuning a model (i.e. golden example) or running an eval (i.e. expected example). + - There are two types of examples: + - **Golden Example**: A golden example is an example that is used to create a golden dataset. + - **Expected Example**: An expected example is an example that is used to evaluate a producer. + Internally, once an expected example has been run through an eval, we create an **evaluated example**, but you don't need to create this manually in the UI. 
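+  - On disk, each example is stored as a directory named by its id, holding an
+    example_input.json file plus optional input.py, output.py, and raw_output.txt
+    files (see ExampleStore below).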
+""" + +import os +import shutil +from typing import Generic, Literal, TypeVar + +from pydantic import BaseModel + + +class ExampleInput(BaseModel): + prompt: str + input_code: str | None = None + line_number_target: int | None = None + + +class BaseExample(BaseModel): + id: str + input: ExampleInput + + +class ExampleOutput(BaseModel): + output_code: str | None = None + raw_output: str | None = None + output_type: Literal["full", "diff"] = "diff" + + +class ExpectedExample(BaseExample): + expect_executable: bool = True + expect_type_checkable: bool = True + + +class ExpectResult(BaseModel): + name: Literal["executable", "type_checkable", "patchable"] + score: int # 0 or 1 + message: str | None = None + + +class EvaluatedExampleOutput(BaseModel): + time_spent_secs: float + tokens: int + output: ExampleOutput + expect_results: list[ExpectResult] + + +class EvaluatedExample(BaseModel): + expected: ExpectedExample + outputs: list[EvaluatedExampleOutput] + + +class GoldenExample(BaseExample): + output: ExampleOutput + + +T = TypeVar("T", bound=BaseExample) + + +class ExampleStore(Generic[T]): + def __init__(self, entity_type: type[T], *, dirname: str): + self.entity_type = entity_type + self.directory_path = os.path.join( + os.path.dirname(__file__), "..", "..", "..", "data", dirname + ) + + def get(self, id: str) -> T: + dir_path = os.path.join(self.directory_path, id) + json_path = os.path.join(dir_path, "example_input.json") + with open(json_path) as f: + entity_json = f.read() + entity = self.entity_type.model_validate_json(entity_json) + input = entity.input + input_py_path = os.path.join(dir_path, "input.py") + if os.path.exists(input_py_path): + with open(input_py_path) as f: + input.input_code = f.read() + if isinstance(entity, GoldenExample): + output_py_path = os.path.join(dir_path, "output.py") + if os.path.exists(output_py_path): + with open(output_py_path) as f: + entity.output.output_code = f.read() + raw_output_path = os.path.join(dir_path, "raw_output.txt") + if os.path.exists(raw_output_path): + with open(raw_output_path) as f: + entity.output.raw_output = f.read() + return entity + + def get_all(self) -> list[T]: + entities: list[T] = [] + for filename in os.listdir(self.directory_path): + entities.append(self.get(filename)) + return entities + + def save(self, entity: T, overwrite: bool = False): + id = entity.id + dir_path = os.path.join(self.directory_path, id) + + if not overwrite: + if os.path.exists(dir_path): + raise ValueError( + f"{self.entity_type.__name__} with id {id} already exists" + ) + else: + os.mkdir(dir_path) + json_path = os.path.join(dir_path, "example_input.json") + input_code = entity.input.input_code + if input_code: + input_py_path = os.path.join(dir_path, "input.py") + with open(input_py_path, "w") as f: + f.write(input_code) + entity.input.input_code = None + + if isinstance(entity, GoldenExample): + output_py_path = os.path.join(dir_path, "output.py") + with open(output_py_path, "w") as f: + f.write(entity.output.output_code) + raw_output_path = os.path.join(dir_path, "raw_output.txt") + with open(raw_output_path, "w") as f: + f.write(entity.output.raw_output) + entity.output.output_code = None + entity.output.raw_output = None + with open(json_path, "w") as f: + f.write(entity.model_dump_json(indent=4)) + + def delete(self, entity_id: str): + shutil.rmtree(os.path.join(self.directory_path, entity_id)) + + +expected_example_store = ExampleStore( + ExpectedExample, dirname="expected_examples" +) +golden_example_store = ExampleStore(GoldenExample, 
dirname="golden_examples") diff --git a/ai/src/ai/common/executor.py b/ai/src/ai/common/executor.py new file mode 100644 index 000000000..5e905b326 --- /dev/null +++ b/ai/src/ai/common/executor.py @@ -0,0 +1,128 @@ +from os import getenv +from typing import Iterator + +from openai import OpenAI +from openai.types.chat import ( + ChatCompletionMessageParam, +) + +from ai.common.diff import EDIT_HERE_MARKER, ApplyPatchResult, apply_patch +from ai.common.entity_store import get_data_path +from ai.common.example import ExampleInput +from ai.common.model import model_store +from ai.common.producer import producer_store +from ai.common.prompt_context import prompt_context_store +from ai.common.prompt_fragment import PromptFragment, prompt_fragment_store + + +class ProviderExecutor: + def __init__(self, model_name: str, prompt_fragments: list[PromptFragment]): + self.model_name = model_name + + self.prompt_fragments = [ + PromptFragment( + id=pf.id, + role=pf.role, + chain_of_thought=pf.chain_of_thought, + content_value=get_content_value(pf), + content_path=None, + ) + for pf in prompt_fragments + ] + + def format_messages( + self, input: ExampleInput + ) -> list[ChatCompletionMessageParam]: + code = input.input_code or "" + # Add sentinel token based on line_number (1-indexed) + if input.line_number_target is not None: + code_lines = code.splitlines() + if 1 <= input.line_number_target <= len(code_lines): + code_lines[input.line_number_target - 1] += EDIT_HERE_MARKER + code = "\n".join(code_lines) + + return [ + { + "role": pf.role, + "content": pf.content_value.replace("", code).replace( # type: ignore + "", input.prompt + ), + } + for pf in self.prompt_fragments + ] + + def execute(self, input: ExampleInput) -> str: ... + + def execute_stream(self, input: ExampleInput) -> Iterator[str]: ... 
+ + +class OpenaiExecutor(ProviderExecutor): + def __init__(self, model_name: str, prompt_fragments: list[PromptFragment]): + super().__init__(model_name, prompt_fragments) + self.client = OpenAI( + api_key=getenv("OPENAI_API_KEY"), + ) + + def execute(self, input: ExampleInput) -> str: + response = self.client.chat.completions.create( + model=self.model_name, + max_tokens=10_000, + messages=self.format_messages(input), + ) + return response.choices[0].message.content or "" + + def execute_stream(self, input: ExampleInput) -> Iterator[str]: + stream = self.client.chat.completions.create( + model=self.model_name, + max_tokens=10_000, + messages=self.format_messages(input), + stream=True, + ) + for chunk in stream: + content = chunk.choices[0].delta.content + yield content or "" + + +provider_executors: dict[str, type[ProviderExecutor]] = { + "openai": OpenaiExecutor, +} + + +class ProducerExecutor: + def __init__(self, producer_id: str): + self.producer = producer_store.get(producer_id) + + def get_provider_executor(self) -> ProviderExecutor: + prompt_context = prompt_context_store.get(self.producer.prompt_context_id) + prompt_fragments = [ + prompt_fragment_store.get(pfid) for pfid in prompt_context.fragment_ids + ] + model = model_store.get(self.producer.mesop_model_id) + provider_executor_type = provider_executors.get(model.provider) + if provider_executor_type is None: + raise ValueError(f"Provider {model.provider} not supported") + provider_executor = provider_executor_type(model.name, prompt_fragments) + return provider_executor + + def execute(self, input: ExampleInput): + return self.get_provider_executor().execute(input) + + def execute_stream(self, input: ExampleInput): + return self.get_provider_executor().execute_stream(input) + + def transform_output(self, input_code: str, output: str): + if self.producer.output_format == "diff": + return apply_patch(input_code, output) + elif self.producer.output_format == "full": + return ApplyPatchResult(True, output) + else: + raise ValueError(f"Unknown output format: {self.producer.output_format}") + + +def get_content_value(pf: PromptFragment) -> str | None: + if pf.content_value is not None: + return pf.content_value + if pf.content_path is not None: + with open(get_data_path(pf.content_path.replace("//", ""))) as f: + return f.read() + return None diff --git a/ai/src/ai/common/model.py b/ai/src/ai/common/model.py new file mode 100644 index 000000000..f9c877e30 --- /dev/null +++ b/ai/src/ai/common/model.py @@ -0,0 +1,17 @@ +from pydantic import BaseModel + +from ai.common.entity_store import EntityStore + + +class Model(BaseModel): + """ + Model represents an LLM. + Name should match the model name used by the provider for the API call. 
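+  e.g. provider="openai", name="gpt-4o".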
+ """ + + id: str + name: str + provider: str + + +model_store = EntityStore(Model, dirname="models") diff --git a/ai/src/ai/common/producer.py b/ai/src/ai/common/producer.py new file mode 100644 index 000000000..d23921baa --- /dev/null +++ b/ai/src/ai/common/producer.py @@ -0,0 +1,16 @@ +from typing import Literal + +from pydantic import BaseModel + +from ai.common.entity_store import EntityStore + + +class Producer(BaseModel): + id: str + mesop_model_id: str # using model_id has a conflict with Pydantic + prompt_context_id: str + output_format: Literal["full", "diff"] + temperature: float = 0.8 + + +producer_store = EntityStore(Producer, dirname="producers") diff --git a/ai/src/ai/common/prompt_context.py b/ai/src/ai/common/prompt_context.py new file mode 100644 index 000000000..7c25592e4 --- /dev/null +++ b/ai/src/ai/common/prompt_context.py @@ -0,0 +1,15 @@ +from pydantic import BaseModel + +from ai.common.entity_store import EntityStore + + +class PromptContext(BaseModel): + """ + PromptContext represents the context of a prompt. + """ + + id: str + fragment_ids: list[str] + + +prompt_context_store = EntityStore(PromptContext, dirname="prompt_contexts") diff --git a/ai/src/ai/common/prompt_fragment.py b/ai/src/ai/common/prompt_fragment.py new file mode 100644 index 000000000..9cd148399 --- /dev/null +++ b/ai/src/ai/common/prompt_fragment.py @@ -0,0 +1,31 @@ +from typing import Literal + +from pydantic import BaseModel, model_validator + +from ai.common.entity_store import EntityStore + + +class PromptFragment(BaseModel): + id: str + content_value: str | None = None + content_path: str | None = None + role: Literal["user", "assistant", "system"] + chain_of_thought: bool = False + + @model_validator(mode="after") + def check_content_value_or_path(self): + if self.content_value == "": + self.content_value = None + if self.content_path == "": + self.content_path = None + + content_value = self.content_value + content_path = self.content_path + if content_value is not None and content_path is not None: + raise ValueError("Only one of content_value or content_path is allowed") + if content_value is None and content_path is None: + raise ValueError("Either content_value or content_path is required") + return self + + +prompt_fragment_store = EntityStore(PromptFragment, dirname="prompt_fragments") diff --git a/ai/src/ai/console/__init__.py b/ai/src/ai/console/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ai/src/ai/console/pages/__init__.py b/ai/src/ai/console/pages/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ai/src/ai/console/pages/add_edit_eval_page.py b/ai/src/ai/console/pages/add_edit_eval_page.py new file mode 100644 index 000000000..ab3e35433 --- /dev/null +++ b/ai/src/ai/console/pages/add_edit_eval_page.py @@ -0,0 +1,53 @@ +import datetime +from typing import Any + +import mesop as me +from ai.common.producer import producer_store +from ai.console.pages.add_edit_page_helper import ( + create_add_edit_page, + form_field, + get_field_value, + update_state, +) +from ai.offline_common.eval import ( + Eval, +) +from ai.offline_common.eval import ( + eval_store as store, +) + + +def get_producer_ids(): + options: list[me.AutocompleteOption] = [] + + for producer in producer_store.get_all(): + options.append(me.AutocompleteOption(label=producer.id, value=producer.id)) + + return options + + +def form(): + form_field("id", "Eval id") + me.autocomplete( + value=get_field_value("producer_id"), + label="Producer id", + 
options=get_producer_ids(),
+    style=me.Style(width="100%"),
+    on_selection_change=lambda e: update_state("producer_id", e.value),
+  )
+
+
+def create_default_eval() -> dict[str, Any]:
+  id = datetime.datetime.now().replace(microsecond=0).isoformat()
+  return {"id": id, "producer_id": ""}
+
+
+create_add_edit_page(
+  store=store,
+  entity_type=Eval,
+  entity_name="Eval",
+  root_path="/evals",
+  form=form,
+  create_default_entity=create_default_eval,
+  disable_edit=True,
+)
diff --git a/ai/src/ai/console/pages/add_edit_expected_examples_page.py b/ai/src/ai/console/pages/add_edit_expected_examples_page.py
new file mode 100644
index 000000000..9cfc9d6cf
--- /dev/null
+++ b/ai/src/ai/console/pages/add_edit_expected_examples_page.py
@@ -0,0 +1,56 @@
+import mesop as me
+from ai.common.example import (
+  ExpectedExample,
+)
+from ai.common.example import (
+  expected_example_store as store,
+)
+from ai.console.pages.add_edit_page_helper import (
+  create_add_edit_page,
+  form_field,
+  get_field_value,
+  update_state,
+)
+
+
+def form():
+  form_field("id", "Unique identifier for the example")
+  form_field("input.prompt", "Input: prompt")
+  me.textarea(
+    value=get_field_value("input.input_code"),
+    appearance="outline",
+    label="Input code",
+    on_blur=lambda e: update_state(e.key, e.value),
+    key="input.input_code",
+    hint_label=f"Input code path: data/expected_examples/{get_field_value('id')}/input.py",
+    style=me.Style(width="min(100%, 360px)"),
+  )
+  form_field(
+    "input.line_number_target", "Input: line number target", type="number"
+  )
+
+  me.checkbox(
+    checked=bool(get_field_value("expect_executable")),
+    label="Expect executable",
+    key="expect_executable",
+    on_change=lambda e: update_state("expect_executable", e.checked),
+  )
+  me.checkbox(
+    checked=bool(get_field_value("expect_type_checkable")),
+    label="Expect type checkable",
+    key="expect_type_checkable",
+    on_change=lambda e: update_state("expect_type_checkable", e.checked),
+  )
+
+
+create_add_edit_page(
+  store=store,
+  entity_type=ExpectedExample,
+  entity_name="Expected Example",
+  root_path="/expected-examples",
+  form=form,
+  create_default_entity=lambda: {
+    "expect_executable": True,
+    "expect_type_checkable": True,
+  },
+)
diff --git a/ai/src/ai/console/pages/add_edit_golden_examples_page.py b/ai/src/ai/console/pages/add_edit_golden_examples_page.py
new file mode 100644
index 000000000..66d6f1746
--- /dev/null
+++ b/ai/src/ai/console/pages/add_edit_golden_examples_page.py
@@ -0,0 +1,126 @@
+import base64
+
+import requests
+
+import mesop as me
+from ai.common.diff import apply_patch
+from ai.common.example import (
+  GoldenExample,
+)
+from ai.common.example import (
+  golden_example_store as store,
+)
+from ai.console.pages.add_edit_page_helper import (
+  create_add_edit_page,
+  form_field,
+  get_field_value,
+  update_state,
+)
+from ai.offline_common.eval import SANDBOX_URL
+
+
+@me.stateclass
+class State:
+  preview_url: str
+  preview_error: str
+
+
+def load_preview(e: me.ClickEvent):
+  state = me.state(State)
+  code = get_field_value("output.output_code")
+  result = requests.post(
+    SANDBOX_URL + "/exec",
+    data={"code": base64.b64encode(code.encode("utf-8"))},
+  )
+  if result.status_code == 200:
+    url_path = result.content.decode("utf-8")
+    state.preview_url = SANDBOX_URL + url_path
+    state.preview_error = ""
+  else:
+    state.preview_error = result.content.decode("utf-8")
+
+
+def form():
+  state = me.state(State)
+  me.button("Load preview", on_click=load_preview, type="flat")
+  if state.preview_url:
+    me.link(
text="Open preview", + url=state.preview_url, + style=me.Style(color=me.theme_var("primary"), text_decoration="none"), + open_in_new_tab=True, + ) + if state.preview_error: + me.text( + state.preview_error, + style=me.Style(font_family="monospace", white_space="pre"), + ) + + form_field("id", "Unique identifier for the example") + form_field("input.prompt", "Input: prompt") + me.textarea( + value=get_field_value("input.input_code"), # type: ignore + appearance="outline", + label="Input code", + on_blur=lambda e: update_state(e.key, e.value), + key="input.input_code", + hint_label=f"Input code path: data/expected_examples/{get_field_value('id')}/input.py", + style=me.Style(width="100%"), + ) + form_field( + "input.line_number_target", "Input: line number target", type="number" + ) + me.select( + value=get_field_value("output.output_type"), # type: ignore + options=[ + me.SelectOption(label="Full", value="full"), + me.SelectOption(label="Diff", value="diff"), + ], + on_selection_change=lambda e: update_state(e.key, e.value), + key="output.output_type", + label="Output type", + style=me.Style(width="min(100%, 360px)"), + ) + me.textarea( + value=get_field_value("output.raw_output"), # type: ignore + appearance="outline", + label="Raw output", + on_blur=update_raw_output, + key="output.raw_output", + hint_label=f"Output code path: data/expected_examples/{get_field_value('id')}/raw_output.txt", + style=me.Style(width="100%"), + ) + me.textarea( + readonly=True, + value=get_field_value("output.output_code"), # type: ignore + appearance="outline", + label="Generated output code (read-only)", + hint_label=f"Output code path: data/expected_examples/{get_field_value('id')}/output.py", + style=me.Style(width="100%"), + ) + + +def update_raw_output(e: me.InputBlurEvent): + update_state(e.key, e.value) + output_type = get_field_value("output.output_type") + if output_type == "full": + update_state("output.output_code", e.value) + elif output_type == "diff": + result = apply_patch(get_field_value("input.input_code"), e.value) + update_state("output.output_code", result.result) + else: + raise ValueError(f"Unknown output type: {output_type}") + + +create_add_edit_page( + store=store, + entity_type=GoldenExample, + entity_name="Golden Example", + root_path="/golden-examples", + form=form, + create_default_entity=lambda: { + "output": { + "output_type": "diff", + }, + }, +) diff --git a/ai/src/ai/console/pages/add_edit_model_page.py b/ai/src/ai/console/pages/add_edit_model_page.py new file mode 100644 index 000000000..891ce0899 --- /dev/null +++ b/ai/src/ai/console/pages/add_edit_model_page.py @@ -0,0 +1,25 @@ +from ai.common.model import ( + Model, +) +from ai.common.model import ( + model_store as store, +) +from ai.console.pages.add_edit_page_helper import ( + create_add_edit_page, + form_field, +) + + +def form(): + form_field("provider", "Provider of the model") + form_field("name", "Descriptive name for the model") + form_field("id", "Unique identifier for the model") + + +create_add_edit_page( + store=store, + entity_type=Model, + entity_name="Model", + root_path="/models", + form=form, +) diff --git a/ai/src/ai/console/pages/add_edit_page_helper.py b/ai/src/ai/console/pages/add_edit_page_helper.py new file mode 100644 index 000000000..5e496fdeb --- /dev/null +++ b/ai/src/ai/console/pages/add_edit_page_helper.py @@ -0,0 +1,165 @@ +from functools import partial +from typing import Any, Callable, Type, TypeVar + +from pydantic import BaseModel + +import mesop as me +from ai.common.entity_store import 
EntityStore +from ai.common.example import BaseExample, ExampleStore +from ai.console.scaffold import page_scaffold + + +@me.stateclass +class State: + entity: dict[str, Any] + + +def form_field(field: str, description: str, type: str | None = None): + disabled = "id" in me.query_params and field == "id" + me.input( + disabled=disabled, + value=str(get_field_value(field)), + appearance="outline", + type=type, # type: ignore + label=field, + on_blur=lambda e: update_state(e.key, e.value), + key=field, + hint_label=description, + style=me.Style(width="min(100%, 360px)"), + ) + + +def update_state(key: str, value: Any): + state = me.state(State) + state.entity[key] = value + + +def get_field_value(field_name: str): + state = me.state(State) + # We do some hacky-ish logic to support both dot notation and nested dicts. + + # When the field is set in the current page, we set it with dot notation. + if field_name in state.entity: + return state.entity[field_name] or "" + + # Otherwise, if we loaded the entity from the store (i.e. filesystem), + # we access it as a nested dict. + keys = field_name.split(".") + value = state.entity + for key in keys: + if isinstance(value, dict): + value = value.get(key, "") + else: + return "" + return value or "" + + +T = TypeVar("T", bound=BaseModel) +E = TypeVar("E", bound=BaseExample) + + +def create_add_edit_page( + *, + store: EntityStore[T] | ExampleStore[E], + entity_type: Type[T] | Type[E], + entity_name: str, + root_path: str, + form: Callable[[], None], + create_default_entity: Callable[[], dict[str, Any]] | None = None, + disable_edit: bool = False, +): + def on_load_edit_page(e: me.LoadEvent): + me.set_theme_mode("system") + id = me.query_params.get("id") + assert id is not None + entity = store.get(id) + state = me.state(State) + state.entity = entity.model_dump() + + def delete(e: me.ClickEvent): + store.delete(me.state(State).entity["id"]) + reset_and_navigate() + + if not disable_edit: + + @me.page(path=root_path + "/edit", on_load=on_load_edit_page) + def edit_page(): # type: ignore + with page_scaffold(title=f"Edit {entity_name}"): + with me.box( + style=me.Style( + display="flex", flex_direction="column", gap=24, max_width=640 + ) + ): + form() + with me.box( + style=me.Style( + display="flex", + flex_direction="row", + justify_content="space-between", + gap=16, + ) + ): + me.button( + "Back", + type="stroked", + on_click=lambda e: reset_and_navigate(), + ) + me.button("Delete", type="flat", color="warn", on_click=delete) + me.button( + "Save", type="flat", on_click=partial(update, overwrite=True) + ) + + def on_load_add_page(e: me.LoadEvent): + me.set_theme_mode("system") + state = me.state(State) + if create_default_entity is not None: + state.entity = create_default_entity() + else: + state.entity = {} + + @me.page(path=root_path + "/add", on_load=on_load_add_page) + def add_page(): # type: ignore + with page_scaffold( + title=f"Add {entity_name}", + ): + with me.box( + style=me.Style( + display="flex", flex_direction="column", gap=24, max_width=640 + ) + ): + form() + with me.box( + style=me.Style( + display="flex", + flex_direction="row", + justify_content="space-between", + gap=16, + ) + ): + me.button( + "Back", + type="stroked", + on_click=lambda e: reset_and_navigate(), + ) + me.button("Add", type="flat", on_click=update) + + def update(e: me.ClickEvent, *, overwrite: bool = False): + state = me.state(State) + # convert dot notation to nested dicts + converted: dict[str, Any] = {} + for key in state.entity: + keys = key.split(".") + 
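+      # e.g. {"input.prompt": "hi"} becomes {"input": {"prompt": "hi"}}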
current = converted + for k in keys[:-1]: + if k not in current: + current[k] = {} + current = current[k] + current[keys[-1]] = state.entity[key] + + store.save(entity_type(**converted), overwrite=overwrite) # type: ignore + reset_and_navigate() + + def reset_and_navigate(): + state = me.state(State) + state.entity = {} + me.navigate(root_path or "/") diff --git a/ai/src/ai/console/pages/add_edit_producer_page.py b/ai/src/ai/console/pages/add_edit_producer_page.py new file mode 100644 index 000000000..e7a3a332b --- /dev/null +++ b/ai/src/ai/console/pages/add_edit_producer_page.py @@ -0,0 +1,74 @@ +import mesop as me +from ai.common.model import model_store +from ai.common.producer import ( + Producer, +) +from ai.common.producer import ( + producer_store as store, +) +from ai.common.prompt_context import prompt_context_store +from ai.console.pages.add_edit_page_helper import ( + create_add_edit_page, + form_field, + get_field_value, + update_state, +) + + +def get_model_ids(): + options: list[me.AutocompleteOption] = [] + + for model in model_store.get_all(): + options.append(me.AutocompleteOption(label=model.id, value=model.id)) + + return options + + +def get_prompt_context_ids(): + options: list[me.AutocompleteOption] = [] + + for prompt_context in prompt_context_store.get_all(): + options.append( + me.AutocompleteOption(label=prompt_context.id, value=prompt_context.id) + ) + + return options + + +def form(): + form_field("id", "Unique identifier for the producer") + me.autocomplete( + value=get_field_value("mesop_model_id"), + label="Model id", + options=get_model_ids(), + style=me.Style(width="min(100%, 360px)"), + on_selection_change=lambda e: update_state("mesop_model_id", e.value), + ) + me.autocomplete( + value=get_field_value("prompt_context_id"), + label="Prompt context id", + options=get_prompt_context_ids(), + style=me.Style(width="min(100%, 360px)"), + on_selection_change=lambda e: update_state("prompt_context_id", e.value), + ) + + me.select( + value=get_field_value("output_format"), + label="Output format", + options=[ + me.SelectOption(label="Full", value="full"), + me.SelectOption(label="Diff", value="diff"), + ], + on_selection_change=lambda e: update_state("output_format", e.value), + style=me.Style(width="min(100%, 360px)"), + ) + form_field("temperature", "temperature (default 0.8)", type="number") + + +create_add_edit_page( + store=store, + entity_type=Producer, + entity_name="Producer", + root_path="/producers", + form=form, +) diff --git a/ai/src/ai/console/pages/add_edit_prompt_context_page.py b/ai/src/ai/console/pages/add_edit_prompt_context_page.py new file mode 100644 index 000000000..4ca1f1b33 --- /dev/null +++ b/ai/src/ai/console/pages/add_edit_prompt_context_page.py @@ -0,0 +1,78 @@ +from functools import partial + +import mesop as me +from ai.common.prompt_context import ( + PromptContext, +) +from ai.common.prompt_context import ( + prompt_context_store as store, +) +from ai.common.prompt_fragment import ( + prompt_fragment_store, +) +from ai.console.pages.add_edit_page_helper import ( + create_add_edit_page, + form_field, + get_field_value, + update_state, +) + + +def update_fragment_id(e: me.SelectSelectionChangeEvent, index: int): + fragment_ids = get_field_value("fragment_ids") + fragment_ids[index] = e.value + update_state("fragment_ids", fragment_ids) + + +def delete_fragment_id(e: me.ClickEvent, index: int): + fragment_ids = get_field_value("fragment_ids") + fragment_ids.pop(index) + update_state("fragment_ids", fragment_ids) + + +def 
append_fragment_id(e: me.SelectSelectionChangeEvent): + fragment_ids = get_field_value("fragment_ids") + if fragment_ids is None or fragment_ids == "": + fragment_ids = [] + fragment_ids.append(e.value) + update_state("fragment_ids", fragment_ids) + + +def form(): + form_field("id", "Unique identifier") + fragment_ids = get_field_value("fragment_ids") + for index, fragment_id in enumerate(fragment_ids): + with me.box(style=me.Style(display="flex", gap=8)): + me.select( + value=fragment_id, + label="Fragment IDs", + options=get_fragment_options(), + style=me.Style(width="360px"), + on_selection_change=partial(update_fragment_id, index=index), + ) + me.button( + "Remove", + on_click=partial(delete_fragment_id, index=index), + ) + me.select( + label="Fragment IDs", + options=get_fragment_options(), + style=me.Style(width="min(100%, 360px)"), + on_selection_change=append_fragment_id, + ) + + +def get_fragment_options(): + return [ + me.SelectOption(label=fragment.id, value=fragment.id) + for fragment in prompt_fragment_store.get_all() + ] + + +create_add_edit_page( + store=store, + entity_type=PromptContext, + entity_name="Prompt Context", + root_path="/prompt-contexts", + form=form, +) diff --git a/ai/src/ai/console/pages/add_edit_prompt_fragment_page.py b/ai/src/ai/console/pages/add_edit_prompt_fragment_page.py new file mode 100644 index 000000000..f5ea466be --- /dev/null +++ b/ai/src/ai/console/pages/add_edit_prompt_fragment_page.py @@ -0,0 +1,84 @@ +import os + +import mesop as me +from ai.common.prompt_fragment import ( + PromptFragment, +) +from ai.common.prompt_fragment import ( + prompt_fragment_store as store, +) +from ai.console.pages.add_edit_page_helper import ( + create_add_edit_page, + form_field, + get_field_value, + update_state, +) + + +def get_autocomplete_options(): + options: list[me.AutocompleteOption] = [] + prompt_contents_dir = os.path.join( + os.path.dirname(__file__), "..", "..", "..", "..", "data", "prompt_contents" + ) + if os.path.exists(prompt_contents_dir): + for filename in os.listdir(prompt_contents_dir): + file_path = os.path.join(prompt_contents_dir, filename) + if os.path.isfile(file_path): + options.append( + me.AutocompleteOption( + label=filename, value="//prompt_contents/" + filename + ) + ) + + return options + + +def form(): + form_field("id", "Unique identifier") + me.select( + value=get_field_value("role"), + label="Role", + options=[ + me.SelectOption(label="User", value="user"), + me.SelectOption(label="Assistant", value="assistant"), + me.SelectOption(label="System", value="system"), + ], + on_selection_change=lambda e: update_state("role", e.value), + style=me.Style(width="min(100%, 360px)"), + ) + me.divider() + me.text("Content (set either value or path)") + me.textarea( + value=get_field_value("content_value"), + appearance="outline", + label="Content value", + on_blur=lambda e: update_state(e.key, e.value), + key="content_value", + # TODO: potentially support golden example variables + # for more powerful few-shot prompting, e.g. 
+ hint_label="Variables: ", + style=me.Style(width="min(100%)"), + ) + me.autocomplete( + value=get_field_value("content_path"), + label="Content path", + options=get_autocomplete_options(), + hint_label="(absolute) path to a file containing the content", + style=me.Style(width="min(100%, 360px)"), + on_selection_change=lambda e: update_state("content_path", e.value), + ) + me.divider() + me.checkbox( + checked=bool(get_field_value("chain_of_thought")), + label="Chain of thought", + on_change=lambda e: update_state("chain_of_thought", e.checked), + ) + + +create_add_edit_page( + store=store, + entity_type=PromptFragment, + entity_name="Prompt Fragment", + root_path="/prompt-fragments", + form=form, +) diff --git a/ai/src/ai/console/pages/create_golden_dataset_page.py b/ai/src/ai/console/pages/create_golden_dataset_page.py new file mode 100644 index 000000000..acb501b98 --- /dev/null +++ b/ai/src/ai/console/pages/create_golden_dataset_page.py @@ -0,0 +1,81 @@ +import mesop as me +from ai.common.prompt_context import prompt_context_store +from ai.console.scaffold import page_scaffold +from ai.offline_common.golden_dataset import create_golden_dataset + + +def on_load(e: me.LoadEvent): + me.set_theme_mode("system") + + +def get_prompt_context_options(): + return [ + me.SelectOption(label=context.id, value=context.id) + for context in prompt_context_store.get_all() + ] + + +@me.stateclass +class State: + prompt_context_id: str + dataset_name: str + dataset_path: str + + +def select_prompt_context(e: me.SelectSelectionChangeEvent): + state = me.state(State) + state.prompt_context_id = e.value + + +def on_dataset_name_blur(e: me.InputBlurEvent): + state = me.state(State) + state.dataset_name = e.value + + +@me.page(path="/create-golden-dataset", on_load=on_load) +def create_golden_dataset_page(): + state = me.state(State) + with page_scaffold( + current_path="/create-golden-dataset", title="Create golden dataset" + ): + me.input( + label="Dataset name", + on_blur=on_dataset_name_blur, + ) + me.select( + label="Prompt Context", + options=get_prompt_context_options(), + style=me.Style(width="min(100%, 360px)"), + on_selection_change=select_prompt_context, + ) + with me.box( + style=me.Style( + padding=me.Padding(bottom=16), + display="flex", + justify_content="space-between", + ) + ): + me.button( + "Back", + on_click=lambda e: me.navigate("/golden-examples"), + type="stroked", + color="accent", + ) + me.button( + "Create dataset", + on_click=create_dataset, + type="flat", + color="accent", + ) + if state.dataset_path: + me.text(state.dataset_path) + + +def create_dataset(e: me.ClickEvent): + state = me.state(State) + prompt_context_id = state.prompt_context_id + dataset_name = state.dataset_name + prompt_context = prompt_context_store.get(prompt_context_id) + dataset_path = create_golden_dataset(prompt_context, dataset_name) + state.dataset_path = dataset_path + print("dataset_path", dataset_path) diff --git a/ai/src/ai/console/pages/eval_item_page.py b/ai/src/ai/console/pages/eval_item_page.py new file mode 100644 index 000000000..eae5f8ee3 --- /dev/null +++ b/ai/src/ai/console/pages/eval_item_page.py @@ -0,0 +1,111 @@ +import base64 + +import requests + +import mesop as me +from ai.console.scaffold import page_scaffold +from ai.offline_common.eval import ( + SANDBOX_URL, + get_eval_example, +) + + +def on_load(e: me.LoadEvent): + me.set_theme_mode("system") + state = me.state(State) + example = get_eval_example( + me.query_params["eval-id"], me.query_params["example-id"] + ) + code = 
example.outputs[0].output.output_code or "" + result = requests.post( + SANDBOX_URL + "/exec", + data={"code": base64.b64encode(code.encode("utf-8"))}, + ) + if result.status_code == 200: + url_path = result.content.decode("utf-8") + state.loaded_url = SANDBOX_URL + url_path + state.error = "" + else: + state.error = result.content.decode("utf-8") + + +@me.stateclass +class State: + loaded_url: str + error: str + + +@me.page(title="Mesop AI Console - Eval", path="/eval-item", on_load=on_load) +def eval_item_page(): + state = me.state(State) + example = get_eval_example( + me.query_params["eval-id"], me.query_params["example-id"] + ) + with page_scaffold(current_path="/eval", title="Eval"): + with me.box( + style=me.Style( + display="grid", + grid_template_columns="80px 1fr", + gap=8, + justify_items="start", + margin=me.Margin(bottom=8), + ) + ): + with me.box( + style=me.Style( + display="grid", + grid_template_columns="repeat(2, calc(calc(100vw - 310px)/2))", + gap=16, + align_items="start", + ) + ): + # Header + me.text("Result", style=me.Style(font_weight="bold")) + me.text("Preview", style=me.Style(font_weight="bold")) + + # Body + with me.box( + style=me.Style( + display="flex", + flex_direction="column", + gap=8, + height="calc(100vh - 160px)", + overflow_y="auto", + ) + ): + me.text("ID", style=me.Style(font_weight="bold")) + me.text(example.expected.id) + + me.text("Results", style=me.Style(font_weight="bold")) + for result in example.outputs[0].expect_results: + with me.box( + style=me.Style(display="flex", flex_direction="row", gap=8) + ): + me.text(result.name) + me.text(str(result.score)) + + me.text( + result.message, + style=me.Style(font_family="monospace", white_space="pre"), + ) + + me.text("Code") + me.markdown( + "```\n" + (example.outputs[0].output.output_code or "") + "\n```", + style=me.Style(font_size=14), + ) + me.divider() + me.text("Raw output") + me.markdown( + "```\n" + (example.outputs[0].output.raw_output or "") + "\n```", + style=me.Style(font_size=14), + ) + with me.box( + style=me.Style(display="flex", flex_direction="column", gap=8) + ): + if state.error: + me.text("Error") + me.text(state.error) + me.embed( + src=state.loaded_url, style=me.Style(width="100%", height="80vh") + ) diff --git a/ai/src/ai/console/pages/eval_page.py b/ai/src/ai/console/pages/eval_page.py new file mode 100644 index 000000000..3a9c5b391 --- /dev/null +++ b/ai/src/ai/console/pages/eval_page.py @@ -0,0 +1,100 @@ +import mesop as me +from ai.console.scaffold import page_scaffold +from ai.offline_common.eval import EvalRunner, get_eval_examples +from ai.offline_common.eval import eval_store as store + + +def on_load(e: me.LoadEvent): + me.set_theme_mode("system") + + +def run_eval(e: me.ClickEvent): + eval = store.get(me.query_params["id"]) + EvalRunner(eval).run() + + +@me.page(title="Mesop AI Console - Eval", path="/eval", on_load=on_load) +def eval_page(): + eval = store.get(me.query_params["id"]) + examples = get_eval_examples(eval.id) + with page_scaffold(current_path="/eval", title="Eval"): + with me.box( + style=me.Style( + display="grid", + grid_template_columns="80px 1fr", + gap=8, + justify_items="start", + ) + ): + me.text("ID", style=me.Style(font_weight="bold")) + me.text(eval.id) + me.text("State", style=me.Style(font_weight="bold")) + me.text(eval.state) + me.text("Examples", style=me.Style(font_weight="bold")) + me.text(str(len(examples))) + if eval.eval_outcome: + me.text("Score", style=me.Style(font_weight="bold")) + with me.tooltip( + message=f"Score: 
{eval.eval_outcome.score} / Max score: {eval.eval_outcome.max_score}"
+        ):
+          me.text(
+            f"{eval.eval_outcome.score / eval.eval_outcome.max_score * 100:.0f}%"
+          )
+    with me.box(style=me.Style(padding=me.Padding(top=32))):
+      if eval.state == "pending":
+        me.button(
+          "Run eval",
+          on_click=run_eval,
+          type="flat",
+          color="accent",
+        )
+
+    if eval.state == "complete":
+      with me.box(
+        style=me.Style(
+          display="grid",
+          grid_template_columns="220px 300px 32px 48px 1fr",
+          gap=16,
+          align_items="center",
+        )
+      ):
+        # Header
+        me.text("ID", style=me.Style(font_weight="bold"))
+        me.text("Prompt", style=me.Style(font_weight="bold"))
+        me.text("Secs", style=me.Style(font_weight="bold"))
+        me.text("Tokens", style=me.Style(font_weight="bold"))
+        me.text("Expect results", style=me.Style(font_weight="bold"))
+        # Body
+        for example in examples:
+          # use a link because back navigation drops the query params
+          me.link(
+            text=example.expected.id,
+            style=me.Style(
+              font_size=16,
+              text_decoration="none",
+              color=me.theme_var("primary"),
+            ),
+            url=f"/eval-item?example-id={example.expected.id}&eval-id={eval.id}",
+          )
+          me.text(example.expected.input.prompt)
+          me.text(f"{example.outputs[0].time_spent_secs:.1f}")
+          me.text(str(example.outputs[0].tokens))
+          with me.box(
+            style=me.Style(display="flex", flex_direction="row", gap=12)
+          ):
+            for result in example.outputs[0].expect_results:
+              with me.tooltip(message=result.message or ""):
+                with me.box(
+                  style=me.Style(
+                    display="flex",
+                    flex_direction="column",
+                    gap=8,
+                    background=me.theme_var("error-container")
+                    if result.score == 0
+                    else None,
+                    padding=me.Padding.all(4),
+                    border_radius=8,
+                  )
+                ):
+                  me.text(result.name[:5], style=me.Style(font_weight="bold"))
+                  me.text(str(result.score))
diff --git a/ai/src/ai/console/pages/evals_page.py b/ai/src/ai/console/pages/evals_page.py
new file mode 100644
index 000000000..d3e0899ea
--- /dev/null
+++ b/ai/src/ai/console/pages/evals_page.py
@@ -0,0 +1,48 @@
+import mesop as me
+from ai.console.scaffold import page_scaffold
+from ai.offline_common.eval import eval_store as store
+
+
+def on_load(e: me.LoadEvent):
+  me.set_theme_mode("system")
+
+
+@me.page(title="Mesop AI Console - Evals", path="/evals", on_load=on_load)
+def evals_page():
+  with page_scaffold(current_path="/evals", title="Evals"):
+    evals = store.get_all()
+    with me.box(
+      style=me.Style(
+        display="grid",
+        grid_template_columns="repeat(2, 1fr)",
+        gap=16,
+        align_items="center",
+      )
+    ):
+      # Header
+      me.text("ID", style=me.Style(font_weight="bold"))
+      me.text("Producer", style=me.Style(font_weight="bold"))
+      # Body
+      for eval in evals:
+        me.button(
+          eval.id,
+          on_click=lambda e: me.navigate("/eval", query_params={"id": e.key}),
+          key=eval.id,
+          style=me.Style(font_size=16),
+        )
+        me.button(
+          eval.producer_id,
+          on_click=lambda e: me.navigate(
+            "/producers/edit", query_params={"id": e.key}
+          ),
+          key=eval.producer_id,
+          style=me.Style(font_size=16),
+        )
+
+    with me.box(style=me.Style(padding=me.Padding(top=32))):
+      me.button(
+        "Create eval",
+        on_click=lambda e: me.navigate("/evals/add"),
+        type="flat",
+        color="accent",
+      )
diff --git a/ai/src/ai/console/pages/expected_examples_page.py b/ai/src/ai/console/pages/expected_examples_page.py
new file mode 100644
index 000000000..dc334c477
--- /dev/null
+++ b/ai/src/ai/console/pages/expected_examples_page.py
@@ -0,0 +1,51 @@
+import mesop as me
+from ai.common.example import expected_example_store as store
+from ai.console.scaffold import page_scaffold
+
+
+def on_load(e: me.LoadEvent):
+  me.set_theme_mode("system")
+
+
+@me.page(path="/expected-examples", on_load=on_load)
+def expected_examples_page():
+  with page_scaffold(
+    current_path="/expected-examples", title="Expected Examples"
+  ):
+    with me.box(style=me.Style(padding=me.Padding(bottom=16))):
+      me.button(
+        "Add Expected Example",
+        on_click=lambda e: me.navigate("/expected-examples/add"),
+        type="flat",
+        color="accent",
+      )
+
+    examples = store.get_all()
+    with me.box(
+      style=me.Style(
+        display="grid",
+        grid_template_columns="repeat(4, 1fr)",
+        gap=16,
+        align_items="center",
+        overflow_y="auto",
+        height="100%",
+      )
+    ):
+      # Header
+      me.text("ID", style=me.Style(font_weight="bold"))
+      me.text("Prompt", style=me.Style(font_weight="bold"))
+      me.text("Has input code", style=me.Style(font_weight="bold"))
+      me.text("Has line # target", style=me.Style(font_weight="bold"))
+      # Body
+      for example in examples:
+        me.button(
+          example.id,
+          on_click=lambda e: me.navigate(
+            "/expected-examples/edit", query_params={"id": e.key}
+          ),
+          key=example.id,
+          style=me.Style(font_size=16),
+        )
+        me.text(example.input.prompt)
+        me.text(str(bool(example.input.input_code)))
+        me.text(str(bool(example.input.line_number_target)))
diff --git a/ai/src/ai/console/pages/golden_examples_page.py b/ai/src/ai/console/pages/golden_examples_page.py
new file mode 100644
index 000000000..ad401350b
--- /dev/null
+++ b/ai/src/ai/console/pages/golden_examples_page.py
@@ -0,0 +1,61 @@
+import mesop as me
+from ai.common.example import golden_example_store as store
+from ai.console.scaffold import page_scaffold
+
+
+def on_load(e: me.LoadEvent):
+  me.set_theme_mode("system")
+
+
+@me.page(path="/golden-examples", on_load=on_load)
+def golden_examples_page():
+  with page_scaffold(current_path="/golden-examples", title="Golden Examples"):
+    examples = store.get_all()
+    with me.box(
+      style=me.Style(
+        padding=me.Padding(bottom=16),
+        display="flex",
+        justify_content="space-between",
+      )
+    ):
+      me.button(
+        "Add Golden Example",
+        on_click=lambda e: me.navigate("/golden-examples/add"),
+        type="flat",
+        color="accent",
+      )
+      with me.tooltip(message="Create a golden dataset for fine-tuning"):
+        me.button(
+          "Create golden dataset",
+          on_click=lambda e: me.navigate("/create-golden-dataset"),
+          type="flat",
+          color="accent",
+        )
+    with me.box(
+      style=me.Style(
+        display="grid",
+        grid_template_columns="200px 1fr 48px 48px",
+        gap=12,
+        align_items="center",
+        overflow_y="auto",
+        height="100%",
+      )
+    ):
+      # Header
+      me.text("ID", style=me.Style(font_weight="bold"))
+      me.text("Prompt", style=me.Style(font_weight="bold"))
+      me.text("Has input code", style=me.Style(font_weight="bold"))
+      me.text("Has line # target", style=me.Style(font_weight="bold"))
+      # Body
+      for example in examples:
+        me.button(
+          example.id[0:20] + "..."
if len(example.id) > 20 else example.id, + on_click=lambda e: me.navigate( + "/golden-examples/edit", query_params={"id": e.key} + ), + key=example.id, + style=me.Style(font_size=16), + ) + me.text(example.input.prompt) + me.text(str(bool(example.input.input_code))) + me.text(str(bool(example.input.line_number_target))) diff --git a/ai/src/ai/console/pages/models_page.py b/ai/src/ai/console/pages/models_page.py new file mode 100644 index 000000000..2e9b468f9 --- /dev/null +++ b/ai/src/ai/console/pages/models_page.py @@ -0,0 +1,44 @@ +import mesop as me +from ai.common.model import model_store as store +from ai.console.scaffold import page_scaffold + + +def on_load(e: me.LoadEvent): + me.set_theme_mode("system") + + +@me.page(title="Mesop AI Console - Models", path="/models", on_load=on_load) +def models_page(): + with page_scaffold(current_path="/models", title="Models"): + models = store.get_all() + with me.box( + style=me.Style( + display="grid", + grid_template_columns="repeat(3, 1fr)", + gap=16, + align_items="center", + ) + ): + # Header + me.text("ID", style=me.Style(font_weight="bold")) + me.text("Name", style=me.Style(font_weight="bold")) + me.text("Provider", style=me.Style(font_weight="bold")) + # Body + for model in models: + me.button( + model.id, + on_click=lambda e: me.navigate( + "/models/edit", query_params={"id": e.key} + ), + key=model.id, + style=me.Style(font_size=16), + ) + me.text(model.name) + me.text(model.provider) + with me.box(style=me.Style(padding=me.Padding(top=32))): + me.button( + "Add Model", + on_click=lambda e: me.navigate("/models/add"), + type="flat", + color="accent", + ) diff --git a/ai/src/ai/console/pages/producers_page.py b/ai/src/ai/console/pages/producers_page.py new file mode 100644 index 000000000..34cc5bebe --- /dev/null +++ b/ai/src/ai/console/pages/producers_page.py @@ -0,0 +1,64 @@ +import mesop as me +from ai.common.producer import producer_store as store +from ai.console.scaffold import page_scaffold + + +def on_load(e: me.LoadEvent): + me.set_theme_mode("system") + + +@me.page( + title="Mesop AI Console - Producers", path="/producers", on_load=on_load +) +def producers_page(): + with page_scaffold(current_path="/producers", title="Producers"): + producers = store.get_all() + with me.box( + style=me.Style( + display="grid", + grid_template_columns="repeat(5, 1fr)", + gap=16, + align_items="center", + ) + ): + # Header + me.text("ID", style=me.Style(font_weight="bold")) + me.text("Model", style=me.Style(font_weight="bold")) + me.text("Prompt Context", style=me.Style(font_weight="bold")) + me.text("Output Format", style=me.Style(font_weight="bold")) + me.text("Temperature", style=me.Style(font_weight="bold")) + # Body + for producer in producers: + me.button( + producer.id, + on_click=lambda e: me.navigate( + "/producers/edit", query_params={"id": e.key} + ), + key=producer.id, + style=me.Style(font_size=16), + ) + me.button( + producer.mesop_model_id, + on_click=lambda e: me.navigate( + "/models/edit", query_params={"id": e.key} + ), + key=producer.mesop_model_id, + style=me.Style(font_size=16), + ) + me.button( + producer.prompt_context_id, + on_click=lambda e: me.navigate( + "/prompt-contexts/edit", query_params={"id": e.key} + ), + key=producer.prompt_context_id, + style=me.Style(font_size=16), + ) + me.text(producer.output_format) + me.text(str(producer.temperature)) + with me.box(style=me.Style(padding=me.Padding(top=32))): + me.button( + "Add Producer", + on_click=lambda e: me.navigate("/producers/add"), + type="flat", + 
color="accent", + ) diff --git a/ai/src/ai/console/pages/prompt_contexts_page.py b/ai/src/ai/console/pages/prompt_contexts_page.py new file mode 100644 index 000000000..2d4c8ac83 --- /dev/null +++ b/ai/src/ai/console/pages/prompt_contexts_page.py @@ -0,0 +1,55 @@ +import mesop as me +from ai.common.prompt_context import prompt_context_store +from ai.console.scaffold import page_scaffold + + +def on_load(e: me.LoadEvent): + me.set_theme_mode("system") + + +@me.page( + title="Mesop AI Console - Prompt Contexts", + path="/prompt-contexts", + on_load=on_load, +) +def prompt_contexts_page(): + with page_scaffold(current_path="/prompt-contexts", title="Prompt Contexts"): + prompt_contexts = prompt_context_store.get_all() + with me.box( + style=me.Style( + display="grid", + grid_template_columns="400px 400px", + gap=16, + align_items="center", + ) + ): + # Header + me.text("ID", style=me.Style(font_weight="bold")) + me.text("Fragments", style=me.Style(font_weight="bold")) + # Body + for prompt_context in prompt_contexts: + me.button( + prompt_context.id, + on_click=lambda e: me.navigate( + "/prompt-contexts/edit", query_params={"id": e.key} + ), + key=prompt_context.id, + style=me.Style(font_size=16, flex_wrap="wrap", word_wrap="anywhere"), + ) + with me.box(style=me.Style(display="flex-wrap", flex_direction="row")): + for fragment_id in prompt_context.fragment_ids: + me.button( + fragment_id, + on_click=lambda e: me.navigate( + "/prompt-fragments/edit", query_params={"id": e.key} + ), + key=fragment_id, + style=me.Style(font_size=16), + ) + with me.box(style=me.Style(padding=me.Padding(top=32))): + me.button( + "Add Prompt Context", + on_click=lambda e: me.navigate("/prompt-contexts/add"), + type="flat", + color="accent", + ) diff --git a/ai/src/ai/console/pages/prompt_fragments_page.py b/ai/src/ai/console/pages/prompt_fragments_page.py new file mode 100644 index 000000000..24f4feaa1 --- /dev/null +++ b/ai/src/ai/console/pages/prompt_fragments_page.py @@ -0,0 +1,57 @@ +import mesop as me +from ai.common.prompt_fragment import prompt_fragment_store +from ai.console.scaffold import page_scaffold + + +def on_load(e: me.LoadEvent): + me.set_theme_mode("system") + + +@me.page( + title="Mesop AI Console - Prompt Fragments", + path="/prompt-fragments", + on_load=on_load, +) +def prompt_fragments_page(): + with page_scaffold( + current_path="/prompt-fragments", title="Prompt Fragments" + ): + prompt_fragments = prompt_fragment_store.get_all() + with me.box( + style=me.Style( + display="grid", + grid_template_columns="1fr 1fr 1fr 48px", + gap=16, + align_items="center", + ) + ): + # Header + me.text("ID", style=me.Style(font_weight="bold")) + me.text("Contents", style=me.Style(font_weight="bold")) + me.text("Role", style=me.Style(font_weight="bold")) + with me.tooltip(message="Chain of Thought"): + me.text("CoT", style=me.Style(font_weight="bold")) + # Body + for prompt_fragment in prompt_fragments: + me.button( + prompt_fragment.id, + on_click=lambda e: me.navigate( + "/prompt-fragments/edit", query_params={"id": e.key} + ), + key=prompt_fragment.id, + style=me.Style(font_size=16), + ) + if prompt_fragment.content_value: + me.text("Value: " + prompt_fragment.content_value[:10]) + elif prompt_fragment.content_path: + me.text("Path: " + prompt_fragment.content_path) + + me.text(prompt_fragment.role) + me.text(str(prompt_fragment.chain_of_thought)) + with me.box(style=me.Style(padding=me.Padding(top=32))): + me.button( + "Add Prompt Fragment", + on_click=lambda e: me.navigate("/prompt-fragments/add"), + 
type="flat", + color="accent", + ) diff --git a/ai/src/ai/console/scaffold.py b/ai/src/ai/console/scaffold.py new file mode 100644 index 000000000..179eb296b --- /dev/null +++ b/ai/src/ai/console/scaffold.py @@ -0,0 +1,190 @@ +import mesop as me + + +@me.stateclass +class State: + sidenav_menu_open: bool + + +def toggle_menu_button(e: me.ClickEvent): + s = me.state(State) + s.sidenav_menu_open = not s.sidenav_menu_open + + +def is_mobile(): + return me.viewport_size().width < 640 + + +@me.content_component +def page_scaffold(current_path: str = "", title: str = "Mesop AI Console"): + with me.box(style=me.Style(display="flex", height="100%")): + if is_mobile(): + with me.content_button( + type="icon", + style=me.Style(top=6, left=8, position="absolute", z_index=9), + on_click=toggle_menu_button, + ): + me.icon("menu") + with me.sidenav( + opened=me.state(State).sidenav_menu_open, + style=me.Style( + background=me.theme_var("surface-container-low"), + ), + ): + sidenav(current_path) + else: + sidenav(current_path) + with me.box( + style=me.Style( + background=me.theme_var("surface-container-low"), + display="flex", + flex_direction="column", + flex_grow=1, + ) + ): + header(title) + with me.box( + style=me.Style( + background=me.theme_var("background"), + flex_grow=1, + padding=me.Padding( + left=16, + right=16, + top=16, + ), + border_radius=16, + overflow_y="auto", + display="flex", + flex_direction="column", + ) + ): + me.slot() + + +def toggle_theme(e: me.ClickEvent): + if me.theme_brightness() == "light": + me.set_theme_mode("dark") + else: + me.set_theme_mode("light") + + +def header(title: str): + with me.box( + style=me.Style( + height=64, + width="100%", + padding=me.Padding.all(16), + display="flex", + align_items="center", + ), + ): + me.text( + title, + style=me.Style( + color=me.theme_var("on-background"), + font_size=22, + font_weight=500, + letter_spacing="0.8px", + padding=me.Padding(left=36) if is_mobile() else None, + ), + ) + + with me.content_button( + type="icon", + style=me.Style(position="absolute", right=4, top=8), + on_click=toggle_theme, + ): + me.icon("light_mode" if me.theme_brightness() == "dark" else "dark_mode") + + +def sidenav(current_path: str): + with me.box( + style=me.Style( + width=240, + min_width=240, + max_width=240, + height="100%", + background=me.theme_var("surface-container-low"), + padding=me.Padding.all(16), + ) + ): + with me.box( + style=me.Style( + padding=me.Padding(top=24), + display="flex", + flex_direction="column", + gap=12, + ), + ): + nav_link("Home", icon="home", path="/", current_path=current_path) + nav_link("Evals", icon="labs", path="/evals", current_path=current_path) + nav_link( + "Producers", + icon="precision_manufacturing", + path="/producers", + current_path=current_path, + ) + nav_link( + "Models", icon="model", path="/models", current_path=current_path + ) + me.text( + "Prompts", + style=me.Style( + font_weight=500, font_size=16, margin=me.Margin(top=4, left=4) + ), + ) + nav_link( + "Prompt Contexts", + icon="notebook", + path="/prompt-contexts", + current_path=current_path, + ) + nav_link( + "Prompt Fragments", + icon="description", + path="/prompt-fragments", + current_path=current_path, + ) + me.text( + "Examples", + style=me.Style( + font_weight=500, font_size=16, margin=me.Margin(top=4, left=4) + ), + ) + nav_link( + "Expected Examples", + icon="labs", + path="/expected-examples", + current_path=current_path, + ) + nav_link( + "Golden Examples", + icon="school", + path="/golden-examples", + current_path=current_path, 
+ ) + + +def nav_link( + label: str, icon: str, path: str, current_path: str, nested: bool = False +): + with me.box( + style=me.Style( + cursor="pointer", + margin=me.Margin(left=32) if nested else None, + padding=me.Padding.all(12), + border_radius=12, + display="flex", + align_items="center", + gap=12, + background=me.theme_var("secondary-container") + if path == current_path + else None, + font_weight=500, + font_size=16, + ), + key=path, + on_click=lambda e: me.navigate(e.key), + ): + me.icon(icon) + me.text(label) diff --git a/ai/src/ai/offline_common/eval.py b/ai/src/ai/offline_common/eval.py new file mode 100644 index 000000000..56035d9cf --- /dev/null +++ b/ai/src/ai/offline_common/eval.py @@ -0,0 +1,215 @@ +import base64 +import concurrent.futures +import os +import subprocess +import time +from concurrent.futures import ThreadPoolExecutor +from typing import Literal + +import requests +from pydantic import BaseModel + +from ai.common.entity_store import EntityStore, get_data_path +from ai.common.example import ( + EvaluatedExample, + EvaluatedExampleOutput, + ExampleOutput, + ExpectedExample, + ExpectResult, + expected_example_store, +) +from ai.common.executor import ProducerExecutor + +SANDBOX_URL = "http://localhost:8080" + + +class EvalOutcome(BaseModel): + examples_run: int + examples_succeeded: int + score: float # sum of scores across expect_results in examples + max_score: float # potential max score across expect_results in examples + + +class Eval(BaseModel): + id: str + producer_id: str + state: Literal["pending", "running", "complete", "failed"] = "pending" + eval_outcome: EvalOutcome | None = None + + +eval_store = EntityStore(Eval, dirname="evals") + + +def get_eval_example(eval_id: str, example_id: str) -> EvaluatedExample: + eval_path = get_data_path(os.path.join("evals", eval_id)) + if not os.path.exists(eval_path): + raise ValueError(f"Eval {eval_id} example {example_id} not found") + with open(os.path.join(eval_path, example_id, "evaluated_example.json")) as f: + evaluated_example = EvaluatedExample.model_validate_json(f.read()) + with open(os.path.join(eval_path, example_id, "output.txt")) as f: + evaluated_example.outputs[0].output.raw_output = f.read() + with open(os.path.join(eval_path, example_id, "patched.py")) as f: + evaluated_example.outputs[0].output.output_code = f.read() + return evaluated_example + + +def get_eval_examples(eval_id: str) -> list[EvaluatedExample]: + eval_path = get_data_path(os.path.join("evals", eval_id)) + if not os.path.exists(eval_path): + return [] + examples: list[EvaluatedExample] = [] + for file in os.listdir(eval_path): + with open(os.path.join(eval_path, file, "evaluated_example.json")) as f: + examples.append(EvaluatedExample.model_validate_json(f.read())) + return examples + + +class EvalRunner: + def __init__(self, eval: Eval): + self.eval = eval + self.producer_executor = ProducerExecutor(self.eval.producer_id) + self.eval_path = get_data_path(os.path.join("evals", self.eval.id)) + + def run(self): + os.makedirs(self.eval_path, exist_ok=True) + + examples = expected_example_store.get_all() + eval_outcome = EvalOutcome( + examples_run=0, examples_succeeded=0, score=0, max_score=0 + ) + + try: + with ThreadPoolExecutor() as executor: + future_to_example = { + executor.submit(self.eval_example, example): example + for example in examples + } + for future in concurrent.futures.as_completed(future_to_example): + evaluated_example = future.result() + eval_outcome.examples_run += 1 + for result in 
evaluated_example.outputs[0].expect_results:
+            eval_outcome.score += result.score
+            eval_outcome.max_score += 1
+          if all(
+            result.score == 1
+            for result in evaluated_example.outputs[0].expect_results
+          ):
+            eval_outcome.examples_succeeded += 1
+          print("---")
+          print("Processed:", evaluated_example.expected.id)
+          print(
+            f"Examples succeeded: {eval_outcome.examples_succeeded}/{len(examples)}"
+          )
+          print("---")
+    except Exception as e:
+      self.eval.state = "failed"
+      eval_store.save(self.eval, overwrite=True)
+      raise e
+
+    self.eval.state = "complete"
+    self.eval.eval_outcome = eval_outcome
+    eval_store.save(self.eval, overwrite=True)
+
+  def eval_example(self, example: ExpectedExample) -> EvaluatedExample:
+    example_path = os.path.join(self.eval_path, example.id)
+    os.makedirs(example_path)
+
+    start_time = time.time()
+    output = self.producer_executor.execute(example.input)
+    end_time = time.time()
+    time_elapsed = end_time - start_time
+    with open(os.path.join(example_path, "output.txt"), "w") as f:
+      f.write(output)
+
+    evaluated_example_output = EvaluatedExampleOutput(
+      time_spent_secs=time_elapsed,
+      tokens=int(len(output) / 4),  # rough estimate: ~4 chars per token
+      output=ExampleOutput(
+        output_type=self.producer_executor.producer.output_format,
+      ),
+      expect_results=[],
+    )
+
+    patched_code = self.producer_executor.transform_output(
+      input_code=example.input.input_code or "", output=output
+    )
+    evaluated_example_output.expect_results.append(
+      ExpectResult(
+        name="patchable",
+        score=0 if patched_code.has_error else 1,
+        message=patched_code.result if patched_code.has_error else "Success",
+      )
+    )
+    if not patched_code.has_error:
+      patched_code_path = os.path.join(example_path, "patched.py")
+      with open(patched_code_path, "w") as f:
+        f.write(patched_code.result)
+
+      self.check_executable(evaluated_example_output, patched_code_path)
+      self.check_type_checkable(evaluated_example_output, patched_code_path)
+    evaluated_example = EvaluatedExample(
+      expected=example,
+      outputs=[evaluated_example_output],
+    )
+
+    with open(os.path.join(example_path, "evaluated_example.json"), "w") as f:
+      f.write(evaluated_example.model_dump_json(indent=4))
+    return evaluated_example
+
+  def check_type_checkable(
+    self,
+    evaluated_example_output: EvaluatedExampleOutput,
+    patched_code_path: str,
+  ):
+    try:
+      subprocess.run(
+        [
+          "yarn",
+          "pyright",
+          patched_code_path,
+        ],
+        capture_output=True,
+        text=True,
+        check=True,
+      )
+      evaluated_example_output.expect_results.append(
+        ExpectResult(
+          name="type_checkable",
+          score=1,
+          message="Success",
+        )
+      )
+    except subprocess.CalledProcessError as e:
+      evaluated_example_output.expect_results.append(
+        ExpectResult(
+          name="type_checkable", score=0, message=e.stdout + e.stderr
+        )
+      )
+
+  def check_executable(
+    self,
+    evaluated_example_output: EvaluatedExampleOutput,
+    patched_code_path: str,
+  ):
+    with open(patched_code_path) as f:
+      code = f.read()
+    result = requests.post(
+      SANDBOX_URL + "/exec-py",
+      data={"code": base64.b64encode(code.encode("utf-8"))},
+    )
+    if result.status_code == 200:
+      evaluated_example_output.expect_results.append(
+        ExpectResult(
+          name="executable",
+          score=1,
+          message="Success",
+        )
+      )
+    else:
+      evaluated_example_output.expect_results.append(
+        ExpectResult(
+          name="executable",
+          score=0,
+          message=result.text,
+        )
+      )
diff --git a/ai/src/ai/offline_common/golden_dataset.py b/ai/src/ai/offline_common/golden_dataset.py
new file mode 100644
index 000000000..955839d7d
--- /dev/null
+++
b/ai/src/ai/offline_common/golden_dataset.py
@@ -0,0 +1,9 @@
+from ai.common.example import GoldenExample
+from ai.common.prompt_context import PromptContext
+
+
+def create_golden_dataset(
+  examples: list[GoldenExample], prompt_context: PromptContext
+) -> str:
+  print("create_golden_dataset", examples, prompt_context)
+  return "not yet implemented"
diff --git a/ai/src/console.py b/ai/src/console.py
new file mode 100644
index 000000000..05077ac87
--- /dev/null
+++ b/ai/src/console.py
@@ -0,0 +1,68 @@
+import ai.console.scaffold as scaffold
+import mesop as me
+from ai.console.pages import add_edit_eval_page as add_edit_eval_page
+from ai.console.pages import (
+  add_edit_expected_examples_page as add_edit_expected_examples_page,
+)
+from ai.console.pages import (
+  add_edit_golden_examples_page as add_edit_golden_examples_page,
+)
+from ai.console.pages import add_edit_model_page as add_edit_model_page
+from ai.console.pages import add_edit_producer_page as add_edit_producer_page
+from ai.console.pages import (
+  add_edit_prompt_context_page as add_edit_prompt_context_page,
+)
+from ai.console.pages import (
+  add_edit_prompt_fragment_page as add_edit_prompt_fragment_page,
+)
+from ai.console.pages import (
+  create_golden_dataset_page as create_golden_dataset_page,
+)
+from ai.console.pages import eval_item_page as eval_item_page
+from ai.console.pages import eval_page as eval_page
+from ai.console.pages import evals_page as evals_page
+from ai.console.pages import expected_examples_page as expected_examples_page
+from ai.console.pages import golden_examples_page as golden_examples_page
+from ai.console.pages import models_page as models_page
+from ai.console.pages import producers_page as producers_page
+from ai.console.pages import prompt_contexts_page as prompt_contexts_page
+from ai.console.pages import prompt_fragments_page as prompt_fragments_page
+
+
+def on_load(e: me.LoadEvent):
+  me.set_theme_mode("system")
+
+
+@me.page(title="Mesop AI Console", path="/", on_load=on_load)
+def index_page():
+  with scaffold.page_scaffold(current_path="/", title="Home"):
+    me.markdown(
+      """
+# Mesop AI Console Overview
+
+## Principles
+
+- **Version Control**: Mesop AI Console is a UI on top of the [mesop-data](https://huggingface.co/datasets/wwwillchen/mesop-data) Git repo.
+  - If you make changes, you should `cd ai/data`, commit the changes, and push them (or open a pull request).
+
+## Core Entities
+
+- **Producer**: A producer fully specifies how to call a model, including configurations like temperature, prompt context, and how to process its outputs (e.g. taking the diff and applying it to the input code).
+  - A producer can be used for inference (online) or evaluation (offline).
+  - Producer = Model + Prompt Context + Settings (e.g. temperature, output format)
+
+- **Prompt Context**: A prompt context is a prompt template with variables that are filled in at execution time.
+  - A prompt context consists of one or more prompt fragments.
+
+- **Prompt Fragment**: A prompt fragment is a chunk of a prompt for a specific role, e.g. `user` or `system`.
+  - Note: you can have multiple fragments with the same role, which are effectively concatenated together.
+
+- **Example**: An example is a single input/output pair.
+  - Examples are used for fine-tuning a model (i.e. golden example) or running an eval (i.e. expected example).
+  - There are two types of examples:
+    - **Golden Example**: A golden example is an example that is used to create a golden dataset.
+ - **Expected Example**: An expected example is an example that is used to evaluate a producer. + Internally, once an expected example has been run through an eval, we create an **evaluated example**, but you don't need to create this manually in the UI. + """, + style=me.Style(line_height=1.5), + ) diff --git a/ai/src/migrate_goldens.py b/ai/src/migrate_goldens.py new file mode 100644 index 000000000..7967622c4 --- /dev/null +++ b/ai/src/migrate_goldens.py @@ -0,0 +1,55 @@ +import json +import os + +from ai.common.example import ( + ExampleInput, + ExampleOutput, + GoldenExample, + golden_example_store, +) + +OLD_GOLDENS_DIR = os.path.join(os.path.dirname(__file__), "..", "ft", "goldens") +NEW_GOLDENS_DIR = os.path.join( + os.path.dirname(__file__), "..", "data", "golden_examples" +) + + +def migrate_goldens(): + for filename in os.listdir(OLD_GOLDENS_DIR): + old_dir_path = os.path.join(OLD_GOLDENS_DIR, filename) + if not os.path.isdir(old_dir_path): + continue + with open(os.path.join(old_dir_path, "diff.txt")) as f: + diff = f.read() + with open(os.path.join(old_dir_path, "prompt.txt")) as f: + prompt = f.read() + source = None + if os.path.exists(os.path.join(old_dir_path, "source.py")): + with open(os.path.join(old_dir_path, "source.py")) as f: + source = f.read() + with open(os.path.join(old_dir_path, "patched.py")) as f: + patched = f.read() + line_number = None + if os.path.exists(os.path.join(old_dir_path, "metadata.json")): + with open(os.path.join(old_dir_path, "metadata.json")) as f: + metadata = json.load(f) + line_number = metadata.get("line_number", None) + golden_example = GoldenExample( + id=filename, + input=ExampleInput( + prompt=prompt, input_code=source, line_number_target=line_number + ), + output=ExampleOutput( + output_code=patched, raw_output=diff, output_type="diff" + ), + # diff=diff, + # prompt=prompt, + # source=source, + # patched=patched, + # metadata=metadata, + ) + golden_example_store.save(golden_example) + + +if __name__ == "__main__": + migrate_goldens() diff --git a/ai/src/service.py b/ai/src/service.py index 968824ae9..546ad28d2 100644 --- a/ai/src/service.py +++ b/ai/src/service.py @@ -8,7 +8,10 @@ from flask import Flask, Response, request, stream_with_context -from ai.common.llm_lib import adjust_mesop_app_stream, apply_patch +from ai.common.example import ExampleInput +from ai.common.executor import ( + ProducerExecutor, +) app = Flask(__name__) @@ -53,6 +56,9 @@ def save_interaction_endpoint() -> Response | dict[str, str]: return {"folder": folder_name} +DEFAULT_PRODUCER_ID = "openai-gpt4o-mini-ft-2024-08-default" + + @app.route("/adjust-mesop-app", methods=["POST"]) def adjust_mesop_app_endpoint(): data = request.json @@ -65,23 +71,24 @@ def adjust_mesop_app_endpoint(): return Response("Both 'code' and 'prompt' are required", status=400) def generate(): - stream = adjust_mesop_app_stream( - code=code, - user_input=prompt, - line_number=line_number, + executor = ProducerExecutor(DEFAULT_PRODUCER_ID) + stream = executor.execute_stream( + ExampleInput( + input_code=code, prompt=prompt, line_number_target=line_number + ) ) - diff = "" + + acc = "" for chunk in stream: - if chunk: - diff += chunk - yield f"data: {json.dumps({'type': 'progress', 'data': chunk})}\n\n" + acc += chunk + yield f"data: {json.dumps({'type': 'progress', 'data': chunk})}\n\n" - result = apply_patch(code, diff) + result = executor.transform_output(input_code=code, output=acc) if result.has_error: yield f"data: {json.dumps({'type': 'error', 'error': result.result})}\n\n" return 
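+      # Terminal SSE event: deliver the fully patched code plus the raw diff.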
- yield f"data: {json.dumps({'type': 'end', 'code': result.result, 'diff': diff})}\n\n" + yield f"data: {json.dumps({'type': 'end', 'code': result.result, 'diff': acc})}\n\n" return Response( stream_with_context(generate()), content_type="text/event-stream"
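Taken together, the console overview above describes a small, composable data model. The sketch below shows how the pieces could fit at runtime. It is illustrative only: the entity class names and constructor fields (`Model`, `Producer`, `PromptFragment`, `PromptContext`, and all the IDs used) are assumptions inferred from the console page columns in this patch, not the exact pydantic schemas in `ai/common/*.py`; only the store objects, `ProducerExecutor`, and `ExampleInput` appear verbatim in the patch.

```python
# Illustrative sketch; class names and fields are assumptions inferred from
# the console pages (models, producers, prompt contexts, prompt fragments).
from ai.common.example import ExampleInput
from ai.common.executor import ProducerExecutor
from ai.common.model import Model, model_store
from ai.common.producer import Producer, producer_store
from ai.common.prompt_context import PromptContext, prompt_context_store
from ai.common.prompt_fragment import PromptFragment, prompt_fragment_store

# A model is a provider-specific LLM endpoint (columns: ID, Name, Provider).
model_store.save(Model(id="gpt-4o-mini", name="gpt-4o-mini", provider="openai"))

# Fragments are per-role prompt chunks; a context stitches them together.
prompt_fragment_store.save(
  PromptFragment(
    id="system-base",
    role="system",
    content_value="You are an expert Mesop developer.",
    chain_of_thought=False,
  )
)
prompt_context_store.save(
  PromptContext(id="default-context", fragment_ids=["system-base"])
)

# Producer = Model + Prompt Context + Settings.
producer_store.save(
  Producer(
    id="demo-producer",
    mesop_model_id="gpt-4o-mini",
    prompt_context_id="default-context",
    output_format="diff",
    temperature=0.8,
  )
)

# The same producer drives online inference and offline evals.
executor = ProducerExecutor("demo-producer")
raw_output = executor.execute(ExampleInput(prompt="Make the button red"))
patched = executor.transform_output(input_code="", output=raw_output)
if not patched.has_error:
  print(patched.result)
```

If the field names differ in practice, the console's add/edit pages (e.g. `/producers/add`) are the authoritative way to create these entities, since each page is backed by the same entity stores.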