Merge pull request #44 from datakind/feat/recipe-runner
Tactical fixes for the approaching demo ...

Recipe code execution for different contexts
HDX attribution links
Update system prompt
Here is a little video demo: Data Recipes Demo

Not terribly stable, work to follow.
dividor authored Jun 7, 2024
2 parents bf78594 + bd9561e commit 59d7b4e
Showing 10 changed files with 578 additions and 325 deletions.
40 changes: 27 additions & 13 deletions actions/actions_plugins/recipe-server/actions.py
@@ -4,6 +4,7 @@
import json
import logging
import os
import re
import subprocess
import sys
from functools import lru_cache
@@ -166,24 +167,34 @@ def run_recipe(custom_id: str, recipe: dict, user_input, chat_history):
with open(recipe_path, "w") as f:
f.write(code)

cmd = f"python {recipe_path}"
os.chdir(recipes_work_dir)
cmd = f"python {custom_id}.py"
run_output = subprocess.run(cmd, shell=True, capture_output=True, text=True)

result["output"] = run_output.stdout
result["errors"] = run_output.stderr
result["attribution"] = "Data was sroued from HDX"
# result["errors"] = run_output.stderr

# Run the recipe here
# exec(recipe)
# TODO - this is terrible, just for the demo; extract JSON between "{" and "}"
# Match { }
if result["output"].find("{") != -1:
result["output"] = result["output"][result["output"].find("{") :]
result["output"] = result["output"][: result["output"].rfind("}") + 1]
print("Output: ", result["output"])
j = json.loads(result["output"].replace("'", '"'))
attribution = j["attribution"]
else:
attribution = "Data was sourced from HDX"

result["attribution"] = attribution

print("Recipe executed successfully.")
print(result)
return result
return run_output.stdout + " >> ATTRIBUTION: " + attribution
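The TODO above flags the brace-matching extraction as fragile. As a sketch of a sturdier alternative (an assumption on my part, not part of this commit, and it presumes the recipe prints genuine double-quoted JSON rather than a Python repr), json.JSONDecoder.raw_decode can pull the first well-formed JSON object out of stdout:

import json

def extract_first_json(stdout: str):
    # Try each "{" as a candidate start; raw_decode parses one complete
    # JSON value from that offset and ignores any trailing log output.
    decoder = json.JSONDecoder()
    for start, char in enumerate(stdout):
        if char == "{":
            try:
                obj, _ = decoder.raw_decode(stdout, start)
                return obj
            except json.JSONDecodeError:
                continue
    return None

run_recipe could then read the attribution from extract_first_json(result["output"]) and fall back to the default string when it returns None.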


@lru_cache(maxsize=100)
# @lru_cache(maxsize=100)
@action()
def get_memory_recipe(user_input, chat_history, generate_intent=True) -> str:
def get_memory_recipe(user_input, chat_history, generate_intent="true") -> str:
"""
Performs a search in the memory for a given intent and returns the best match found.
@@ -200,9 +211,9 @@ def get_memory_recipe(user_input, chat_history, generate_intent=True) -> str:
logging.info("Python HTTP trigger function processed a request.")
# Retrieve the CSV file from the request

generate_intent = False
generate_intent = "false"

if generate_intent is not None and generate_intent is True:
if generate_intent is not None and generate_intent == "true":
# chat history is passed from promptflow as a string representation of a list and this has to be converted back to a list for the intent generation to work!
history_list = ast.literal_eval(chat_history)
history_list.append({"inputs": {"question": user_input}})
@@ -220,12 +231,14 @@ def get_memory_recipe(user_input, chat_history, generate_intent=True) -> str:
# Get data from memory or recipe tables
table_data = get_memory_recipe_metadata(custom_id, mem_type)
if mem_type == "recipe":
# Run the recipe
result = run_recipe(custom_id, table_data, user_input, chat_history)
else:
# Take the result directly from memory
result = process_memory_recipe_results(result, table_data)
print(result)

result = re.escape(str(result))
print(result)

return str(result)

result = "Sorry, no recipe or found"
@@ -239,7 +252,8 @@ def get_memory_recipe(user_input, chat_history, generate_intent=True) -> str:
# query = "What's the total population of AFG"
# query = "what's the population of Mali"
# query = "what recipes do you have"
query = "Create a chart that demonstrates the number of organizations working in Sila within each sector"
# query = "Create a chart that demonstrates the number of organizations working in Sila within each sector"
query = "plot a map showing food security in IPC Phase 3 across regions in Chad"
# history = str(
# [
# {
1 change: 1 addition & 0 deletions actions/actions_plugins/recipe-server/package.yaml
@@ -31,3 +31,4 @@ dependencies:
- pandas=2.2.2
- seaborn=0.13.2
- geopandas=0.10.2
- hdx_python_api=6.2.4
4 changes: 2 additions & 2 deletions assistants/plugin_assistants/GPT-4o_Assistant.json

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions db/recipedb/2-demo-data-langchain.sql

Large diffs are not rendered by default.

777 changes: 483 additions & 294 deletions db/recipedb/3-demo-data-recipes.sql

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions db/recipedb/4-demo-data-memories.sql

Large diffs are not rendered by default.

26 changes: 19 additions & 7 deletions recipes-management/recipe_sync.py
@@ -1107,8 +1107,8 @@ def update_metadata_file_results(recipe_folder, result):

# See if result.stdout is a JSON file, if so extract "file"
try:
result = json.loads(str(result.stdout))
png_file = result["file"]
result_json = json.loads(str(result.stdout))
png_file = result_json["file"]
except json.JSONDecodeError:
print("Extract png file location from stdout")
png_file = re.search(r"(\w+\.png)", result.stdout).group(1)
@@ -1140,9 +1140,9 @@ def update_metadata_file_results(recipe_folder, result):
"image_validation_prompt.jinja2"
)
prompt = image_validation_prompt.render(user_input=metadata["intent"])
result = call_llm("", prompt, image=png_file_path)
if "answer" in result:
if result["answer"] == "yes":
llm_result = call_llm("", prompt, image=png_file_path)
if "answer" in llm_result:
if llm_result["answer"] == "yes":
print("Image validation passed")
else:
print(
@@ -1154,6 +1154,15 @@ def update_metadata_file_results(recipe_folder, result):
metadata["sample_result"] = result.stdout
metadata["sample_result_type"] = "text"

# Is there an attribution
if "attribution" in result.stdout:
print(result.stdout)
attribution = re.search(r"'attribution': (.*)\}", result.stdout).group(1)
attribution = attribution.replace("'", "")
metadata["sample_attribution"] = attribution
else:
metadata["sample_attribution"] = ""

with open(metadata_path, "w") as file:
json.dump(metadata, file, indent=4)
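The attribution regex a few lines up assumes stdout ends with a Python-dict-style repr and strips quotes by hand. A hedged sketch of a less brittle parse (assuming the recipe really does print a dict literal; parse_attribution is an illustrative helper, not part of this commit):

import ast

def parse_attribution(stdout: str) -> str:
    # Evaluate the trailing dict literal safely instead of regex-matching
    # quotes; literal_eval accepts Python literals only, never code.
    start = stdout.find("{")
    end = stdout.rfind("}")
    if start == -1 or end == -1:
        return ""
    try:
        payload = ast.literal_eval(stdout[start : end + 1])
    except (ValueError, SyntaxError):
        return ""
    if isinstance(payload, dict):
        return str(payload.get("attribution", ""))
    return ""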

@@ -1367,7 +1376,8 @@ def save_as_memory(recipe_folder):
source,
created_by,
updated_by,
last_updated
last_updated,
attribution
)
VALUES (
:custom_id,
@@ -1378,7 +1388,8 @@ def save_as_memory(recipe_folder):
:source,
:created_by,
:updated_by,
NOW()
NOW(),
:attribution
)
"""
)
@@ -1393,6 +1404,7 @@ def save_as_memory(recipe_folder):
"source": "Recipe sample result",
"created_by": metadata["created_by"],
"updated_by": metadata["created_by"],
"attribution": metadata["sample_attribution"],
}
conn.execute(query_template, params)

1 change: 1 addition & 0 deletions recipes-management/requirements.txt
@@ -13,4 +13,5 @@ langchain-community==0.2.1
matplotlib==3.9.0
geopandas==0.10.2
seaborn==0.13.2
hdx_python_api==6.2.4

35 changes: 35 additions & 0 deletions recipes-management/skills.py
@@ -13,6 +13,8 @@
import psycopg2
import requests
from dotenv import load_dotenv
from hdx.api.configuration import Configuration
from hdx.data.resource import Resource

# This is copied or mounted into Docker image
from utils import *
@@ -28,6 +30,39 @@
load_dotenv()


def get_hdx_dataset_url(resource_id):
"""
Retrieves the dataset URL based on the given resource ID.
Args:
resource_id (str): The ID of the resource.
Returns:
str: The dataset URL.
Raises:
Exception: If the resource cannot be fetched or the dataset ID cannot be obtained.
"""

try:
Configuration.create(hdx_site='prod', user_agent='Data Recipes AI', hdx_read_only=True)
except Exception:
print('HDX already activated')

print(resource_id)

# Fetch the resource
resource = Resource.read_from_hdx(resource_id)

# Get the dataset ID
dataset_id = resource['package_id']

# Construct the dataset URL
dataset_url = f'https://data.humdata.org/dataset/{dataset_id}'

return dataset_url
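
A usage sketch for the helper above — the resource ID here is a made-up placeholder, not one taken from this repo:

# Hypothetical HDX resource ID, purely for illustration.
url = get_hdx_dataset_url("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee")
print(url)  # e.g. https://data.humdata.org/dataset/<package_id>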


def get_connection():
"""
This function gets a connection to the database
2 changes: 1 addition & 1 deletion utils/recipes.py
@@ -30,7 +30,7 @@
load_dotenv()

# Lower numbers are more similar
similarity_cutoff = {"memory": 0.2, "recipe": 0.3, "helper_function": 0.2}
similarity_cutoff = {"memory": 0.3, "recipe": 0.3, "helper_function": 0.2}

conn_params = {
"RECIPES_OPENAI_API_TYPE": os.getenv("RECIPES_OPENAI_API_TYPE"),
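Since lower scores mean closer matches, raising the memory cutoff from 0.2 to 0.3 lets looser matches through. A minimal sketch of how such a gate is typically applied — the function name and standalone dict below are illustrative, not code from this repo:

def passes_cutoff(distance: float, mem_type: str) -> bool:
    # Lower distance = more similar; accept matches at or below the cutoff.
    similarity_cutoff = {"memory": 0.3, "recipe": 0.3, "helper_function": 0.2}
    return distance <= similarity_cutoff[mem_type]

print(passes_cutoff(0.25, "memory"))  # True after this change; False at the old 0.2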
