add autofeedback ICL and cli log10 feedback predict #115

Merged · 7 commits · Mar 8, 2024

12 changes: 11 additions & 1 deletion README.md
@@ -141,13 +141,23 @@ Read more here for options for logging using library wrapper, langchain callback

### 🤖👷 Prompt engineering copilot

Optimizing prompts requires a lot of manual effort. Log10 provides a copilot that can help you with suggestions on how to [optimize your prompt](https://log10.io/docs/prompt_engineering/auto_prompt#how-to-use-auto-prompting-in-log10-python-library).

### 👷🔢 Feedback

Add feedback to your completions. Check out the Python [example](/examples/feedback/simple_feedback.py)
or use the CLI commands `log10 feedback-task create` and `log10 feedback create`. Please see our [docs](https://log10.io/docs/feedback) for more details.

#### AutoFeedback
Use our AutoFeedback feature to generate feedback automatically, leveraging your existing feedback together with AI. Here’s a quick guide:

* Summary feedback: Use the [TLDR summary feedback](/log10/feedback/_summary_feedback_utils.py) rubric to rate summaries. E.g. `log10 feedback predict --task_id $FEEDBACK_TASK_ID --content '{"prompt": "this is the article", "response": "summary of the article."}'`.
  * You can pass a file containing the content with `--file`, or pass a completion from your Log10 logs with `--completion_id`.
* Custom Feedback Rubrics: Integrate your own feedback criteria for personalized assessments.
* Getting Started: To explore all options and usage details, run `log10 feedback predict --help`.

Feel free to integrate AutoFeedback into your workflow to enhance the feedback and evaluation process.
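
For illustration, here is a minimal Python sketch of the same flow using the `AutoFeedbackICL` class introduced in this PR (see `log10/feedback/autofeedback.py` below); the task and completion IDs are placeholders, and the default predictor is the TLDR summary rubric.

```python
from log10.feedback.autofeedback import AutoFeedbackICL

# Build a grader from existing human feedback on a task
# (the IDs below are placeholders, not real values).
afb = AutoFeedbackICL(task_id="<FEEDBACK_TASK_ID>", num_samples=5)

# Grade a logged completion by ID, or pass raw text via `text=` instead.
print(afb.predict(completion_id="<COMPLETION_ID>"))
```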
Review comment (Contributor):
Maybe we inline a command line example?


### 🔍🐞 Prompt chain debugging

Prompt chains such as those in [Langchain](https://github.com/hwchase17/langchain) can be difficult to debug. Log10 provides prompt provenance, session tracking and call stack functionality to help debug chains.
2 changes: 2 additions & 0 deletions log10/__main__.py
@@ -1,6 +1,7 @@
import click

from log10.completions.completions import download_completions, get_completion, list_completions
from log10.feedback.autofeedback import auto_feedback_icl
from log10.feedback.feedback import create_feedback, download_feedback, get_feedback, list_feedback
from log10.feedback.feedback_task import create_feedback_task, get_feedback_task, list_feedback_task

@@ -44,6 +45,7 @@ def feedback_task():
feedback.add_command(list_feedback, "list")
feedback.add_command(get_feedback, "get")
feedback.add_command(download_feedback, "download")
feedback.add_command(auto_feedback_icl, "predict")

cli.add_command(feedback_task)
feedback_task.add_command(create_feedback_task, "create")
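
As a quick sanity check of this wiring, a sketch using click's test runner (assuming the top-level click group in `log10/__main__.py` is the `cli` object referenced above) prints the new subcommand's help text:

```python
# Sketch only: confirm the new `predict` subcommand is registered under `feedback`.
# Note: importing log10.__main__ may require Log10/OpenAI environment variables to be set.
from click.testing import CliRunner

from log10.__main__ import cli  # assumes the top-level click group is named `cli`

runner = CliRunner()
result = runner.invoke(cli, ["feedback", "predict", "--help"])
print(result.output)  # expected to list --task_id, --content, --file, --completion_id, --num_samples
```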
2 changes: 1 addition & 1 deletion log10/completions/completions.py
@@ -179,7 +179,7 @@ def get_completion(id):
Get a completion by id
"""
res = _get_completion(id)
rich.print_json(json.dumps(res.json(), indent=4))
rich.print_json(json.dumps(res.json()["data"], indent=4))


def _write_completions(res, output_file, compact_mode):
92 changes: 92 additions & 0 deletions log10/feedback/_summary_feedback_utils.py
@@ -0,0 +1,92 @@
from magentic import SystemMessage, UserMessage, chatprompt
from magentic.chat_model.openai_chat_model import OpenaiChatModel
from magentic.chatprompt import escape_braces


# define prompts for tldr dataset
SUMMARY_SYSTEM_PROMPT = """You are an evaluator of summaries of articles on reddit. You are tasked with grading the summaries for accuracy, coherence, coverage and overall.

Coherence
For this axis, answer the question “how coherent is the summary on its own?” A summary is
coherent if, when read by itself, it’s easy to understand and free of English errors. A summary is
not coherent if it’s difficult to understand what the summary is trying to say. Generally, it’s more
important that the summary is understandable than it being free of grammar errors.
Rubric:
Score of 1: The summary is impossible to understand.
Score of 4: The summary has mistakes or confusing phrasing that make it a bit hard to understand.
Score of 7: The summary is perfectly clear.

Accuracy
For this axis, answer the question “does the factual information in the summary accurately match
the post?” A summary is accurate if it doesn’t say things that aren’t in the article, it doesn’t mix up
people, and generally is not misleading. If the summary says anything at all that is not mentioned
in the post or contradicts something in the post, it should be given a maximum score of 5. (If you
are confused about how to use ‘6’, see the FAQ!)
Rubric:
Score of 1: The summary is completely wrong, made up, or exactly contradicts what is written in
the post.
Score of 4: The summary says at least one substantial thing that is not mentioned in the post, or
that contradicts something in the post.
(Score of 5: The summary says anything, no matter how small, that is not mentioned in the post,
or that contradicts something in the post.)
Score of 7: The summary has no incorrect statements or misleading implications.

Coverage
For this axis, answer the question “how well does the summary cover the important information
in the post?” A summary has good coverage if it mentions the main information from the post
that’s important to understand the situation described in the post. A summary has poor coverage if
someone reading only the summary would be missing several important pieces of information
about the situation in the post. A summary with good coverage should also match the purpose of
the original post (e.g. to ask for advice).
Rubric:
Score of 1: The summary contains no information relevant to the post.
Score of 4: The summary is missing at least 1 important piece of information required to understand the situation.
Score of 7: The summary covers all of the important information required to understand the
situation.

Overall quality
For this axis, answer the question “how good is the summary overall at representing the post?”
This can encompass all of the above axes of quality, as well as others you feel are important. If
it’s hard to find ways to make the summary better, give the summary a high score. If there are lots
of different ways the summary can be made better, give the summary a low score.
Rubric:
Score of 1: The summary is terrible.
Score of 4: The summary is an okay representation of the post, but could be significantly improved.
Score of 7: The summary is an excellent representation of the post."""

SUMMARY_USER_MESSAGE = """
Assign scores and write an explanation note for the summary in the test post in json format based on what you think the evaluators would have assigned it.
Do not generate a new summary but just grade the summary that is presented in the last test example
Here is an example format for the final output:
{"note": "This summary is pretty concise but the key points are conveyed here", "axes": {"overall": "6", "accuracy": "6", "coverage": "5", "coherence": "6"}}

Only answer with the scores and note for the final test post and not the example posts.
Remember to not add any additional text beyond the json output
For e.g. don't say things such as "Here is my assessment:" or "Here is the extracted JSON:"
"""

SUMMARY_USER_MESSAGE = escape_braces(SUMMARY_USER_MESSAGE)


@chatprompt(
SystemMessage(SUMMARY_SYSTEM_PROMPT),
UserMessage(SUMMARY_USER_MESSAGE),
UserMessage("Examples: \n{examples}\n\nTest: \n{prompt}"),
model=OpenaiChatModel("gpt-4-0125-preview", temperature=0.2),
)
def summary_feedback_llm_call(examples, prompt) -> str: ...


def flatten_messages(completion: dict) -> dict:
request_messages = completion.get("request", {}).get("messages", [])
if len(request_messages) > 1 and request_messages[1].get("content", ""):
prompt = request_messages[1].get("content")
else:
prompt = ""

response_choices = completion.get("response", {}).get("choices", [])
if response_choices and response_choices[0].get("message", {}):
response = response_choices[0].get("message", {}).get("content", "")
else:
response = ""
return {"prompt": prompt, "response": response}
108 changes: 108 additions & 0 deletions log10/feedback/autofeedback.py
@@ -0,0 +1,108 @@
import json
import logging
import random
from types import FunctionType

import click
import openai
from rich.console import Console

from log10.completions.completions import _get_completion
from log10.feedback._summary_feedback_utils import flatten_messages, summary_feedback_llm_call
from log10.feedback.feedback import _get_feedback_list
from log10.load import log10, log10_session


log10(openai)

logger = logging.getLogger("LOG10")
logger.setLevel(logging.INFO)


class AutoFeedbackICL:
"""
Generate feedback with in context learning (ICL) based on existing feedback.
"""

_examples: list[dict] = []
_predict_func: FunctionType = None

def __init__(self, task_id: str, num_samples: int = 5, predict_func: FunctionType = summary_feedback_llm_call):
self.num_samples = num_samples
self.task_id = task_id
self._predict_func = predict_func

def _get_examples(self):
logger.info(f"Getting {self.num_samples} feedback for task {self.task_id}")
feedback_data = _get_feedback_list(offset=0, limit="", task_id=self.task_id)
assert feedback_data, f"No feedback found for task {self.task_id}."
assert (
len(feedback_data) >= self.num_samples
), f"Insufficient feedback for task {self.task_id}, found {len(feedback_data)} feedback. Sample size {self.num_samples}."
sampled_feedback = random.sample(feedback_data, self.num_samples)
few_shot_examples = []
for fb in sampled_feedback:
feedback_values = fb["json_values"]
completion_id = fb["matched_completion_ids"][0]
try:
res = _get_completion(completion_id)
except Exception as e:
print(e)
continue
completion = res.json()["data"]
prompt = completion["request"]["messages"][1]["content"]
response = completion["response"]["choices"][0]["message"]["content"]
few_shot_examples.append(
{
"completion_id": completion_id,
"prompt": prompt,
"response": response,
"feedback": json.dumps(feedback_values),
}
)
logger.info(f"Sampled completion ids: {[d['completion_id'] for d in few_shot_examples]}")
return few_shot_examples

def predict(self, text: str = None, completion_id: str = None) -> str:
if not self._examples:
self._examples = self._get_examples()

# This assumes the completion is a summarization task: the prompt is the article and the response is the summary
if completion_id and not text:
completion = _get_completion(completion_id)
pr = flatten_messages(completion.json()["data"])
text = json.dumps(pr)

logger.info(f"{text=}")

predict_func_name = self._predict_func.__name__
logger.info(f"Using predict llm_call: {predict_func_name}")
with log10_session(tags=["autofeedback_icl", predict_func_name]):
ret = self._predict_func(examples="\n".join([str(d) for d in self._examples]), prompt=text)
return ret


@click.command()
@click.option("--task_id", help="Feedback task ID")
@click.option("--content", help="Completion content")
@click.option("--file", "-f", help="File containing completion content")
@click.option("--completion_id", help="Completion ID")
@click.option("--num_samples", default=5, help="Number of samples to use for few-shot learning")
def auto_feedback_icl(task_id: str, content: str, file: str, completion_id: str, num_samples: int):
options_count = sum([1 for option in [content, file, completion_id] if option])
if options_count > 1:
click.echo("Only one of --content, --file, or --completion_id should be provided.")
return

console = Console()
auto_feedback_icl = AutoFeedbackICL(task_id, num_samples=num_samples)
if completion_id:
results = auto_feedback_icl.predict(completion_id=completion_id)
console.print_json(results)
return

if file:
with open(file, "r") as f:
content = f.read()
results = auto_feedback_icl.predict(text=content)
console.print_json(results)
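
Beyond the CLI, the `predict_func` parameter of `AutoFeedbackICL` suggests a path for custom rubrics: any callable that accepts `examples` and `prompt` keyword arguments and returns a string should plug in, mirroring `summary_feedback_llm_call`. A hedged sketch, assuming the task already has logged feedback and using a placeholder task ID:

```python
from log10.feedback.autofeedback import AutoFeedbackICL


def my_rubric_call(examples: str, prompt: str) -> str:
    # Stand-in for a custom LLM grading call; replace with your own rubric logic.
    return '{"note": "stub grade", "axes": {"overall": "4"}}'


afb = AutoFeedbackICL(task_id="<FEEDBACK_TASK_ID>", predict_func=my_rubric_call)
print(afb.predict(text='{"prompt": "article text", "response": "candidate summary"}'))
```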
44 changes: 28 additions & 16 deletions log10/feedback/feedback.py
@@ -59,10 +59,10 @@ def create(
res = self._post_request(self.feedback_create_url, json_payload)
return res

def list(self, offset: int = 0, limit: int = 25, task_id: str = None) -> httpx.Response:
def list(self, offset: int = 0, limit: int = 50, task_id: str = None) -> httpx.Response:
base_url = self._log10_config.url
api_url = "/api/v1/feedback"
url = f"{base_url}{api_url}?organization_id={self._log10_config.org_id}&offset={offset}&limit={limit}"
url = f"{base_url}{api_url}?organization_id={self._log10_config.org_id}&offset={offset}&limit={limit}&task_id={task_id}"

# GET feedback
try:
@@ -107,22 +107,34 @@ def create_feedback(task_id, values, completion_tags_selector, comment):


def _get_feedback_list(offset, limit, task_id):
# TODO: update when api support filtering by task_id
# get all feedback and then filter by task_id
if task_id:
offset = ""
limit = ""
total_fetched = 0
feedback_data = []
total_feedback = 0
if limit:
limit = int(limit)

try:
res = Feedback().list(offset=offset, limit=limit)
while True:
fetch_limit = limit - total_fetched if limit else 50
res = Feedback().list(offset=offset, limit=fetch_limit, task_id=task_id)
new_data = res.json().get("data", [])
if total_feedback == 0:
total_feedback = res.json().get("total", 0)
if not limit:
limit = total_feedback
feedback_data.extend(new_data)

current_fetched = len(new_data)
total_fetched += current_fetched
offset += current_fetched
if total_fetched >= limit or total_fetched >= total_feedback:
break
except Exception as e:
click.echo(f"Error fetching feedback {e}")
if hasattr(e, "response") and hasattr(e.response, "json") and "error" in e.response.json():
click.echo(e.response.json()["error"])
return
feedback_data = res.json()["data"]
# TODO: update when api support filtering by task_id
if task_id:
feedback_data = [feedback for feedback in feedback_data if feedback["task_id"] == task_id]
return []

return feedback_data


@@ -135,7 +147,7 @@ def _get_feedback_list(
)
@click.option(
"--task_id",
required=False,
default="",
type=str,
help="The specific Task ID to filter feedback. If not provided, feedback for all tasks will be fetched.",
)
@@ -189,15 +201,15 @@ def get_feedback(id):
@click.command()
@click.option(
"--offset",
default="",
default=0,
help="The starting index from which to begin the feedback fetch. Leave empty to start from the beginning.",
)
@click.option(
"--limit", default="", help="The maximum number of feedback items to retrieve. Leave empty to retrieve all."
)
@click.option(
"--task_id",
required=False,
default="",
type=str,
help="The specific Task ID to filter feedback. If not provided, feedback for all tasks will be fetched.",
)