diff --git a/.env.example b/.env.example index 80de2759..402cab9e 100644 --- a/.env.example +++ b/.env.example @@ -106,6 +106,12 @@ REMOTE_DB_CONN_STRING= # If using Fast API RECIPE_SERVER_API=http://server:8080/ +#==================================================# +# E2E Tests # +#==================================================# +# Note, on Github use http://host.docker.internal:8000 +CHAT_URL="http://chat:8000/" + #==================================================# # Chainlit Settings # #==================================================# diff --git a/.github/workflows/code_quality_checks.yml b/.github/workflows/code_quality_checks.yml index 02138fef..d3ab4c63 100644 --- a/.github/workflows/code_quality_checks.yml +++ b/.github/workflows/code_quality_checks.yml @@ -1,6 +1,6 @@ name: Code quality checks -on: [push, pull_request] +on: [push] jobs: build: diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index 1d07681c..74002313 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -1,4 +1,4 @@ -name: End-to-End Tests +name: End-to-End tests #on: [push, pull_request] @@ -10,7 +10,7 @@ on: [push] jobs: test: - runs-on: ubuntu-latest + runs-on: ubuntu-latest environment: "GitHub Actions 1" env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} @@ -49,93 +49,86 @@ jobs: IMAGE_HOST: ${{ secrets.IMAGE_HOST }} RECIPE_SERVER_API: ${{ secrets.RECIPE_SERVER_API_FROM_GH_HOST }} + CHAT_URL: ${{ secrets.CHAT_URL }} CHAINLIT_AUTH_SECRET: ${{ secrets.CHAINLIT_AUTH_SECRET }} USER_LOGIN: ${{ secrets.USER_LOGIN }} USER_PASSWORD: ${{ secrets.USER_PASSWORD }} - COLUMNS: 150 - steps: - name: Checkout uses: actions/checkout@v3 - #- name: Checkout integration tests data - # uses: actions/checkout@master - # with: - # repository: datakind/recipes-ai-test-data - # ssh-key: ${{ secrets.GITHUB_SSH_PRIVATE_KEY}} - # path: recipes-ai-test-data - - name: Expose GitHub Runtime uses: crazy-max/ghaction-github-runtime@v2 - - name: Spin up DB and recipes server + - name: Spin up Docker containers run: | env > .env echo "Installing demo data ..." + + # Get demo data pip3 install gdown==5.2.0 cd data && python3 download_demo_data.py && cd .. - ls data/datadb - mkdir -p ./flows/chainlit-ui-evaluation/recipes/public - # TODO this should be enhanced to use a buildx bake to leverage layer caching for faster builds, or push to repo and simply have a pull for the run # TODO docker-compose files should be refactored to use scopes instead of different versions for each environment echo "Starting docker containers for dbs and server ..." - docker-compose -f ./docker-compose-github.yml pull - docker-compose -f ./docker-compose-github.yml up -d --build + docker-compose pull + docker-compose up -d --build + + # TODO: For some reason, maybe buildkit, in Github docker compose builds the image differently, and it doesn't work. Individual image build works. + docker build --build-arg OPENAI_API_KEY=$OPENAI_API_KEY --build-arg CHAT_URL=$CHAT_URL --build-arg OPENAI_API_ENDPOINT=$OPENAI_API_ENDPOINT --no-cache -t promptflow -f ./flows/chainlit-ui-evaluation/Dockerfile . 
+ docker run --env RECIPES_MODEL_MAX_TOKENS=${RECIPES_MODEL_MAX_TOKENS} --env RECIPES_MODEL_TEMP=${RECIPES_MODEL_TEMP} --env RECIPES_OPENAI_API_TYPE=${ASSISTANTS_API_TYPE} --env RECIPES_OPENAI_API_KEY=${ASSISTANTS_API_KEY} --env RECIPES_MODEL=${RECIPES_MODEL} --env RECIPES_BASE_URL=${RECIPES_BASE_URL} --env USER_LOGIN=${USER_LOGIN} --env USER_PASSWORD=${USER_PASSWORD} --env CHAT_URL=${CHAT_URL} --network=data-recipes-ai_default -d --name promptflow promptflow + + - name: Check logs + run: | + + docker ps + echo "logs datadb ..." - docker-compose -f docker-compose-github.yml logs datadb + docker compose logs datadb + + echo "logs promptflow ..." + docker logs promptflow + + echo "logs chat ..." + docker compose logs chat + + echo "logs server ..." + docker compose logs server + docker ps - # TODO The promptflow docker build wasn't working in GH actions, so deploying promptflow to host for now - - name: Set up promtpflow and run tests - uses: actions/setup-python@v4 - with: - python-version: "3.11.4" - - run: | - echo "Installing promptflow packages ..." - pip3 install promptflow==1.12.0 - pip3 install promptflow-tools==1.4.0 - pip3 install chainlit==1.1.305 - pip3 install langchain==0.2.1 - pip3 install langchain_openai==0.1.7 - pip3 install psycopg2_binary==2.9.9 - pip3 install keyrings.alt - pip3 list - - run: | - - echo "Setting up folders ..." - cd flows/chainlit-ui-evaluation/ - cp ../../ui/chat-chainlit-assistant/app.py . - cp -r ../../utils . - cp -r ../../templates . - cp ../../management/skills.py . - - echo "Setting up Promptflow connections ..." - pf connection create --file ./openai.yaml --set api_key=$OPENAI_API_KEY --name open_ai_connection - pf connection create --file ./azure_openai.yaml --set api_key=$OPENAI_API_KEY --set api_base=$OPENAI_API_ENDPOINT --name azure_openai - - # Test running one node with default inputs. Good for debugging GH actions - #pf flow test --flow . --node call_assistant - #python3 call_assistant.py --chat_history '[{"author": "user","content": "Hi!"}, {"author": "user","content": "What is the total population of Mali"}]' - #python3 call_assistant.py --chat_history '[{"author": "user","content": "plot a line chart of fatalities by month for Chad using HDX data as an image"}]' - # This runs a few, with the script kill, like promptflow, but prints all debug. Good for testing. - python3 call_assistant_debug.py - - echo "Starting Promptflow batch run using data.jsonl ..." - pf run create --flow . --data ./data.jsonl --stream --column-mapping query='${data.query}' context='${data.context}' chat_history='${data.chat_history}' --name base_run - - - run: | - echo "Promptflow results ..." - cd flows/chainlit-ui-evaluation/ - pf run show-details -n base_run + sleep 10 + + # Debugging GitHUb actions interactively, by connecting to the runner ... + # Get ssh connection details for runner + # See here https://github.com/marketplace/actions/debugging-with-ssh + # Basically, uncomment this, then get connection string in actions output, then connect with + # + # ssh -i + # + #- name: DEBUG - Setup upterm session + # uses: lhotari/action-upterm@v1 + + # - name: DEBUG - Run Selenium outside of promptflow + # run: | + # docker exec promptflow python call_assistant.py + + - name: Run tests + run: | + env > .env + docker exec promptflow pf run create --flow . 
--data ./data.jsonl --stream --column-mapping query='${data.query}' context='${data.context}' chat_history='${data.chat_history}' --name base_run + + - name: Show results + run: | + docker exec promptflow COLUMNS=150 pf run show-details -n base_run echo "Getting metrics ..." - pf run show-metrics -n base_run - ##pf run visualize -n base_run + docker exec promptflow pf run show-metrics -n base_run + ##docker exec promptflow pf run visualize -n base_run echo "Checking results ..." - python3 check_evaluation_results.py + docker exec promptflow python3 check_evaluation_results.py \ No newline at end of file diff --git a/.github/workflows/get_memory_test.yml b/.github/workflows/get_memory_test.yml index 04a680a5..f5858c0b 100644 --- a/.github/workflows/get_memory_test.yml +++ b/.github/workflows/get_memory_test.yml @@ -75,22 +75,32 @@ jobs: ls data/datadb echo "Starting docker containers for dbs and server ..." - docker-compose -f ./docker-compose-github.yml pull - docker-compose -f ./docker-compose-github.yml up -d --build - echo "logs datadb ..." - docker-compose -f docker-compose-github.yml logs datadb + docker-compose pull + docker-compose up -d --build datadb recipedb server datadb + + sleep 10 + + docker ps + + + - name: Check logs + run: | + docker ps + echo "logs datadb ..." + docker compose logs datadb + + echo "logs recipedb ..." + docker compose logs recipedb + + echo "logs server ..." + docker compose logs server + + sleep 10 + - name: Run tests - uses: actions/setup-python@v4 - with: - python-version: "3.11.4" - - run: | - pip3 install pytest==8.2.2 - pip3 install requests==2.32.3 - pip3 install python-dotenv==1.0.1 - - - run: | + run: | echo "exec into container ..." - docker exec recipes-ai-server bash -c "cd tests/ && pytest" + docker compose exec server bash -c "cd tests/ && pytest" \ No newline at end of file diff --git a/.gitignore b/.gitignore index fa58168a..d8bbfc7a 100644 --- a/.gitignore +++ b/.gitignore @@ -27,4 +27,8 @@ data server/fastapi/recipes/ assistants/chat_ui/files/file_search/custom assistants/chat_ui/files/code_interpreter/custom - +flows/chainlit-ui-evaluation/app.py +flows/chainlit-ui-evaluation/data.new.jsonl +flows/chainlit-ui-evaluation/recipes/ +flows/chainlit-ui-evaluation/temp.png +ui/chat-chainlit-assistant/.files/ diff --git a/CONTRIBUTION.md b/CONTRIBUTION.md index 29aa7acb..0cbc1d17 100644 --- a/CONTRIBUTION.md +++ b/CONTRIBUTION.md @@ -1,4 +1,4 @@ -# Contributing to DOT +# Contributing to Data Recipes AI Hi! Thanks for your interest in contributing to Data Recipes AI, we're really excited to see you! In this document we'll try to summarize everything that you need to know to do a good job. @@ -40,6 +40,8 @@ GitHub has an action to run the pre-commit tests to ensure code adheres to stand ## Tests +### Unit tests + You should write tests for every feature you add or bug you solve in the code. Having automated tests for every line of our code lets us make big changes without worries: there will always be tests to verify if the changes introduced @@ -53,11 +55,9 @@ the desired feature. You can use `pytest` to run your tests, no matter which type of test it is. -### End-to-end tests +### End-to-end tests (using Selenium and Promptflow) -End-to-end tests have been configured in GitHub actions which use promptflow to call a wrapper around the chainlit UI, or order to test when memories/recipes are used as well as when the assistant does some on-the-fly analysis. 
To do this, the chainlit class is patched heavily, and there are limitations in how
-cleanly this could be done, so it isn't an exact replica of the true application, but does capture changes
-with the flow as well as test the assistant directly. The main body of integration tests will test recipes server and the assistant independently.
+End-to-end tests have been configured in GitHub actions which use promptflow to call a wrapper around the chainlit UI, in order to test when memories/recipes are used as well as when the assistant does some on-the-fly analysis. To do this, the chainlit class is patched heavily, and there are limitations in how cleanly this could be done, so it isn't an exact replica of the true application, but it does capture changes to the flow as well as testing the assistant directly. The main body of integration tests will test the recipes server and the assistant independently.

 Additionally, there were some limitations when implementing in GitHub actions, where workarounds were implemented until a later date, namely: promptflow is run on the GitHub actions host rather than in docker, and the promptflow wrapper to call chainlit has to run as a script and kill the script based on a STDOUT string. These should be fixed in future.

@@ -66,14 +66,30 @@ Code for e2e tests can be found in `flows/chainlit-ui-evaluation` as run by `.gi

 The tests work using promptflow evaluation and a call to an LLM to gauge groundedness, due to the fact that LLM assistants can produce slightly different results if not providing answers from memory/recipes. The promptflow evaluation test data can be found in `flows/chainlit-ui-evaluation/data.jsonl`.

-A useful way to test a new scenario and to get the 'expected' output for `data.jsonl`, is to add it to `call_assistant_debug.py`.
+See "Running Promptflow evaluation locally" below for how to run e2e tests locally.
+
+#### Running Promptflow evaluation locally
+
+First, you will need to build the environment to include Prompt Flow ...
+
+`docker compose -f docker-compose.yml -f docker-compose-dev.yml up -d --build`
+
+Then ...
+
+1. Install the DevContainers VSCode extension
+2. Build data recipes using the `docker compose` command mentioned above
+3. Open the command palette in VSCode (CMD + Shift + P on Mac; CTRL + Shift + P on Windows) and select

-TODO, future work:
+   `Dev Containers: Attach to remote container`.

-- Add promptflow to docker-compose-github.yml and update action to use this env (time was short and wasn't working). This will reduce overhead and complexity
-- Figure out how to make call_assistant.py exit async look so it doesn't have to run in a wrapper that then kills process
-- Push docker containers to a registry so flow doesn't run build every time
-- Bug the chainlit folks to see if they can do something more formal around testing, to avoid complex monkey patching
+   Select the promptflow container. This opens a new VSCode window - use it for the next steps.
+4. Install the Promptflow add-in
+5. Open folder `/app`
+6. Click on `flow.dag.yaml`
+7. Top left of main pane, click on 'Visual editor'
+   - If you are taken to the promptflow 'Install dependencies' screen, change the Python runtime to `/azureml-envs/prompt-flow/runtime/bin/python`, then close and re-open `flow.dag.yaml`
+8. On the Groundedness node, select your new connection
+9. You can now run by clicking the play icon. 
See Promptflow documentation for more details ## GitHub Workflow diff --git a/README.md b/README.md index 7a7961a2..2477d513 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ For more information on the recipes concept, please see see [here](https://towar Data recipes have two types: (i) Exact memories, eg '*What is the population of Mali?*' which can be served directly to the user when they ask this question; (ii) Generic skills which can be run when requested for a scenario not in memory, eg a skill for 'What is the population of country X?' which can be called when the user asks something like '*What is the population of Nigeria?*'. In both cases the match to the user's intent is made using semantic search with LLM-reranking. -Given the rapidly changing landscape of LLMs, we have tried as much as possible to implement data recipes in such a way that it can be integrated with various semantic architectures and frameworks. By implementing recipes using a recipes server (powered by [Robocorps actions server](https://github.com/robocorp/robocorp#readme)), it can be called from [Open AI assistant](https://platform.openai.com/docs/assistants/overview) actions and [Copilot Studio](https://www.microsoft.com/en-us/microsoft-copilot/microsoft-copilot-studio) as well from any custom code. Also included in this repo is an example of using recipes via OpenAI format plugins, as supported by frameworks such as [semantic kernel](https://learn.microsoft.com/en-us/semantic-kernel/overview/?tabs=Csharp). +Given the rapidly changing landscape of LLMs, we have tried as much as possible to implement data recipes in such a way that it can be integrated with various semantic architectures and frameworks. By implementing recipes using a recipes server (powered by FastAPI), it can be called from [Open AI assistant](https://platform.openai.com/docs/assistants/overview) actions and [Copilot Studio](https://www.microsoft.com/en-us/microsoft-copilot/microsoft-copilot-studio) as well from any custom code. Also included in this repo is an example of using recipes via OpenAI format plugins, as supported by frameworks such as [semantic kernel](https://learn.microsoft.com/en-us/semantic-kernel/overview/?tabs=Csharp). Data recipes supports datasources accessed via API, but in some cases it is preferable to ingest data in order to leverage LLM SQL capabilities. We include an initial set of data sources specific to humanitarian response in the ingestion module, which can be extended to include additional sources as required. diff --git a/assets/system.png b/assets/system.png index cd11ac44..7b1e7126 100644 Binary files a/assets/system.png and b/assets/system.png differ diff --git a/docker-compose-github.yml b/docker-compose-github.yml deleted file mode 100644 index d90c5ef6..00000000 --- a/docker-compose-github.yml +++ /dev/null @@ -1,73 +0,0 @@ -services: - # promptflow: - # build: - # context: . 
- # dockerfile: ./flows/chainlit-ui-evaluation/Dockerfile - # args: - # OPENAI_API_KEY: ${OPENAI_API_KEY} - # OPENAI_API_ENDPOINT: ${ASSISTANTS_BASE_URL} - # container_name: recipes-ai-promptflow - # env_file: - # - .env - # volumes: - # - ./flows:/app - # - ./utils:/app/chainlit-ui-evaluation/utils - # - ./templates:/app/chainlit-ui-evaluation/templates - # - ./recipes/public:/app/chainlit-ui-evaluation/recipes/public - # - ./management/skills.py:/app/chainlit-ui-evaluation/recipes/skills.py - # - ./ui/chat-chainlit-assistant/app.py:/app/chainlit-ui-evaluation/app.py - - datadb: - platform: linux/amd64 - image: postgis/postgis:12-3.4 - container_name: recipes-ai-datadb - environment: - POSTGRES_DB: ${POSTGRES_DATA_DB} - POSTGRES_USER: ${POSTGRES_DATA_USER} - POSTGRES_PASSWORD: ${POSTGRES_DATA_PASSWORD} - POSTGRES_PORT: ${POSTGRES_DATA_PORT} - restart: always - volumes: - - ./data/datadb:/var/lib/postgresql/data - ports: - - 5433:5432 - - recipedb: - platform: linux/amd64 - image: ankane/pgvector:latest - container_name: recipes-ai-recipesdb - environment: - POSTGRES_DB: ${POSTGRES_RECIPE_DB} - POSTGRES_USER: ${POSTGRES_RECIPE_USER} - POSTGRES_PASSWORD: ${POSTGRES_RECIPE_PASSWORD} - POSTGRES_PORT: ${POSTGRES_RECIPE_PORT} - restart: always - ports: - - 5435:5432 - volumes: - - ./db/recipedb:/docker-entrypoint-initdb.d - - ./data/recipesdb:/var/lib/postgresql/data - - server: - platform: linux/amd64 - container_name: recipes-ai-server - build: - context: . - dockerfile: ./server/fastapi/Dockerfile - args: - DATA_DB_CONN_STRING: ${DATA_DB_CONN_STRING} - ports: - - 4001:8080 - env_file: - - .env - volumes: - - ./server/fastapi:/app - - ./flows/chainlit-ui-evaluation/recipes/public:/app/recipes/public - - ./templates:/app/templates - - ./utils:/app/utils - - ./management/skills.py:/app/recipes/skills.py - - ./tests:/app/tests - - ./utils:/app/tests/utils - -volumes: - pgdata2: \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index ce2aff11..ad7569ed 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,35 +1,38 @@ #version: "3.4" services: + datadb: + platform: linux/amd64 image: postgis/postgis:12-3.4 container_name: recipes-ai-datadb environment: POSTGRES_DB: ${POSTGRES_DATA_DB} POSTGRES_USER: ${POSTGRES_DATA_USER} POSTGRES_PASSWORD: ${POSTGRES_DATA_PASSWORD} + POSTGRES_PORT: ${POSTGRES_DATA_PORT} restart: always - ports: - - 5433:5432 volumes: - ./data/datadb:/var/lib/postgresql/data - env_file: - - .env + ports: + - 5433:5432 + recipedb: + platform: linux/amd64 image: ankane/pgvector:latest container_name: recipes-ai-recipesdb environment: POSTGRES_DB: ${POSTGRES_RECIPE_DB} POSTGRES_USER: ${POSTGRES_RECIPE_USER} POSTGRES_PASSWORD: ${POSTGRES_RECIPE_PASSWORD} + POSTGRES_PORT: ${POSTGRES_RECIPE_PORT} restart: always ports: - 5435:5432 volumes: - ./db/recipedb:/docker-entrypoint-initdb.d - ./data/recipesdb:/var/lib/postgresql/data - env_file: - - .env + # recipes-server: # platform: linux/amd64 # container_name: recipes-ai-recipes-server @@ -70,6 +73,8 @@ services: - ./templates:/app/templates - ./utils:/app/utils - ./management/skills.py:/app/recipes/skills.py + - ./tests:/app/tests + - ./utils:/app/tests/utils chat: platform: linux/amd64 container_name: recipes-ai-chat @@ -90,6 +95,7 @@ services: - ./assistants/chat_ui/create_update_assistant.py:/app/create_update_assistant.py - ./assistants/chat_ui/files:/app/files ingestion: + platform: linux/amd64 container_name: recipes-ai-ingestion build: context: . 
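The volume mounts added to the `server` service above (`./tests:/app/tests` and `./utils:/app/tests/utils`) are what allow the memory workflow to run `docker compose exec server bash -c "cd tests/ && pytest"` against code mounted in place. The actual tests live in `./tests` and are not part of this diff; as a rough sketch of the shape such a test could take (the file name, the root route, and the availability of `requests` in the server image are assumptions, with the URL taken from `RECIPE_SERVER_API=http://server:8080/` in `.env.example`):

```python
# tests/test_server_smoke.py -- illustrative sketch only; the real tests in ./tests are not shown in this diff.
import os

import requests

# Inside the compose network the FastAPI recipes server is reachable at http://server:8080/
# (see RECIPE_SERVER_API in .env.example); fall back to that if the env var is unset.
SERVER_URL = os.getenv("RECIPE_SERVER_API", "http://server:8080/")


def test_server_responds():
    # Any non-5xx response means the server container is up and serving requests.
    response = requests.get(SERVER_URL, timeout=10)
    assert response.status_code < 500
```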
diff --git a/flows/chainlit-ui-evaluation/Dockerfile b/flows/chainlit-ui-evaluation/Dockerfile index 82d06ed3..d12472d2 100644 --- a/flows/chainlit-ui-evaluation/Dockerfile +++ b/flows/chainlit-ui-evaluation/Dockerfile @@ -1,4 +1,18 @@ -FROM mcr.microsoft.com/azureml/promptflow/promptflow-runtime-stable:latest +# See here: https://microsoft.github.io/promptflow/cloud/azureai/runtime-change-log.html#v4 +FROM mcr.microsoft.com/azureml/promptflow/promptflow-runtime-stable:20240411.v4 + +# Directly: +# docker build --build-arg OPENAI_API_KEY=$OPENAI_API_KEY --build-arg OPENAI_API_ENDPOINT=$OPENAI_API_ENDPOINT --no-cache -t test -f ./flows/chainlit-ui-evaluation/Dockerfile . +# docker run -d --name test test +# docker exec -it test /bin/bash +# +# Docker compose: +# docker-compose -f docker-compose-github.yml build promptflow +# docker-compose -f docker-compose-github.yml up -d promptflow +# docker-compose exec -f docker-compose-github.yml -it promptflow /bin/bash +# +# export DOCKER_BUILDKIT=0 +# ARG OPENAI_API_KEY ENV OPENAI_API_KEY=$OPENAI_API_KEY @@ -6,6 +20,9 @@ ENV OPENAI_API_KEY=$OPENAI_API_KEY ARG OPENAI_API_ENDPOINT ENV OPENAI_API_ENDPOINT=$OPENAI_API_ENDPOINT +ARG CHAT_URL +ENV CHAT_URL=$CHAT_URL + COPY ./flows /app COPY ./utils /app/chainlit-ui-evaluation/utils COPY ./templates /app/chainlit-ui-evaluation/templates @@ -14,17 +31,40 @@ COPY ./ui/chat-chainlit-assistant/app.py /app/chainlit-ui-evaluation/app.py RUN pip3 install --upgrade pip +WORKDIR /app/chainlit-ui-evaluation + # Needed for running chainlit code if using Mock tests RUN pip3 install chainlit==1.1.305 RUN pip3 install langchain==0.2.1 RUN pip3 install langchain_openai==0.1.7 RUN pip3 install psycopg2_binary==2.9.9 - -# Needed for promptflow connections -RUN pip install keyrings.alt - -WORKDIR /app/chainlit-ui-evaluation +RUN pip3 install keyrings.alt # Set up Connections RUN pf connection create --file ./openai.yaml --set api_key=$OPENAI_API_KEY --name open_ai_connection -RUN pf connection create --file ./azure_openai.yaml --set api_key=$OPENAI_API_KEY --set api_base=$OPENAI_API_ENDPOINT --name azure_openai +RUN pf connection create --file ./azure_openai.yaml --set api_key=$OPENAI_API_KEY --set api_base=$OPENAI_API_ENDPOINT --name azure_open_ai_connection + +RUN echo "DEBUG DOCKER" +RUN which python +RUN env + +# # ======= SELENIUM BEGIN ======== + +# Update package lists for the Ubuntu system +RUN apt-get update + +# Download Chrome Debian package +RUN wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb + +# Install the Chrome Debian package +RUN apt install ./google-chrome-stable_current_amd64.deb -y + +# Print the version of Google Chrome installed +RUN google-chrome --version + +RUN pip install selenium==4.22.0 +RUN pip install chromedriver-py==126.0.6478.126 + +# # ======= SELENIUM END ======== + +ENV PATH=$PATH:/azureml-envs/prompt-flow/runtime/bin:/opt/miniconda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin \ No newline at end of file diff --git a/flows/chainlit-ui-evaluation/azure_openai.yaml b/flows/chainlit-ui-evaluation/azure_openai.yaml index 5b916e77..ef5bed6c 100644 --- a/flows/chainlit-ui-evaluation/azure_openai.yaml +++ b/flows/chainlit-ui-evaluation/azure_openai.yaml @@ -1,5 +1,5 @@ $schema: https://azuremlschemas.azureedge.net/promptflow/latest/AzureOpenAIConnection.schema.json -name: open_ai_connection +name: azure_open_ai_connection type: azure_open_ai api_key: "" api_base: "" diff --git a/flows/chainlit-ui-evaluation/call_assistant.py 
b/flows/chainlit-ui-evaluation/call_assistant.py index 99c876cd..49081175 100644 --- a/flows/chainlit-ui-evaluation/call_assistant.py +++ b/flows/chainlit-ui-evaluation/call_assistant.py @@ -1,543 +1,344 @@ -import argparse -import asyncio -import hashlib -import inspect import json import os -import signal -import subprocess -import threading +import sys import time +import requests from promptflow.core import tool +from selenium import webdriver +from selenium.common.exceptions import NoSuchElementException +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.common.by import By +from selenium.webdriver.common.keys import Keys from utils.llm import call_llm -FINISH_PHRASE = "all done" -OUTPUT_TAG = "ASSISTANT_OUTPUT" -IMAGE_DIR = "./recipes/public/" +# Time interval to poll for results in UI +POLL_TIME = 1 +# TIme waiting for response before exiting +TIMEOUT_TIME = 120 +RETRY_WAIT = 5 -@tool -def call_assistant(query: str, chat_history: str) -> dict: - """ - Calls the assistant API with the given input and retrieves the response. - - TODO: This spawns a shall and runs the mock version of chainlit, monitors output, then - kills the process. This is a workaround because running the exact chainlit code does not - exit all asynchronous threads and hangs. This workaround is temporary, brittle and hard to maintain - and should be replaced!! To debug, see call_assistant_debug.py, which at least removes a few layers. - - Args: - query: What the user asked - chat_history (list): A list containing the chat history, of the format ... - - [ - { - "author": "user", - "content": "Hi" - }, - { - "author": "assistant", - "content": "Hello! How can I help you today?", - }, - { - "author": "assistant", - "content": "What's the total population of Mali?", - } - ] - - Returns: - dict: A dictionary containing the response from the assistant, function name, function arguments, - function output, and the number of tokens in the function output. - """ - - print(chat_history) +# Web elements used +LOGIN_EMAIL_FIELD = "email" +LOGIN_PASSWORD_FIELD = "password" +LOGIN_BUTTON_XPATH = '//button[contains(., "Continue")]' +CHAT_INPUT_CLASS = "chat-input" +# Used to identify changes +MARKDOWN_BODY_CLASS = "markdown-body" - chat_history = json.loads(chat_history) - - # Add user query to chat history - chat_history.append({"author": "user", "content": query}) - - # chat_history = [ - # {"author": "user", "content": "Hi"}, - # { - # "author": "assistant", - # "content": "Hello! How can I help you today?", - # }, - # { - # "author": "assistant", - # "content": "Hi again!", - # }, - # ] +# Used to extract messages +MESSAGES_CLASS = "message-content" - chat_history = json.dumps(chat_history) - chat_history = chat_history.replace('"', '\\"') - chat_history = chat_history.replace("'", "\\'") +TMP_IMAGE = "temp.png" +CHAT_URL = os.getenv("CHAT_URL") - print("History:", chat_history) +IMAGE_SUMMARIZATION_PROMPT = "Summarize the image" - result = run_chainlit_mock(chat_history) - response = {"response": result} - - return response +def set_chrome_options() -> Options: + """Sets chrome options for Selenium. + Chrome options for headless browser is enabled. 
+ """ + chrome_options = Options() + chrome_options.add_argument("--headless") + chrome_options.add_argument("--no-sandbox") + chrome_options.add_argument("--disable-dev-shm-usage") + chrome_prefs = {} + chrome_options.experimental_options["prefs"] = chrome_prefs + chrome_prefs["profile.default_content_settings"] = {"images": 2} + return chrome_options -def setup_mock_class(): +def check_element_exists(element, by, value): """ - Creates and returns a mock class for testing purposes. + Checks if an element exists on a web page. + + Args: + driver: The WebDriver instance + by: The method used to locate the element (e.g., By.ID, By.XPATH, etc.) + value: The value used to locate the element Returns: - cl_mock (MockChainlit): The mock class instance. + True if the element exists, False otherwise. """ - - class MockMessage: - """ - A class representing a mock message. - - Attributes: - author (str): The author of the message. - content (str): The content of the message. - elements (list): The elements of the message. - disable_feedback (bool): Flag indicating whether feedback is disabled. - - Methods: - send(): Sends the message. - stream_token(content): Streams a token. - update(): Updates the message. - """ - - def __init__( - self, author=None, content=None, elements=None, disable_feedback=False - ): - if content is None: - content = "" - self.author = author - self.content = content - self.disable_feedback = disable_feedback - self.elements = elements if elements is not None else [] - - async def send(self): - """ - Sends the message. - - Returns: - MockMessage: The sent message. - """ - print( - f"Sending message: Author: {self.author}, Content: {self.content}, Elements: {self.elements}" - ) - return self - - async def stream_token(self, content): - """ - Streams a token. - - Args: - content (str): The content of the token. - - Returns: - MockMessage: The updated message. - """ - # print(f"Streaming token: Author: {self.author}, Content: {content}") - self.content += content - return self - - async def update(self): - """ - Updates the message. - - Returns: - MockMessage: The updated message. - """ - print( - f"Updating message: Author: {self.author}, Content: {self.content}, Elements: {self.elements}" - ) - return self - - class MockUserSession: - """ - A class representing a mock user session. - - Attributes: - session_data (dict): A dictionary to store session data. - - Methods: - get(key): Retrieves the value associated with the given key from the session data. - set(key, value): Sets the value associated with the given key in the session data. - """ - - def __init__(self): - self.session_data = {} - - def get(self, key): - """ - Retrieves the value associated with the given key from the session data. - - Args: - key (str): The key to retrieve the value for. - - Returns: - The value associated with the given key, or None if the key is not found. - """ - return self.session_data.get(key, None) - - def set(self, key, value): - """ - Sets the value associated with the given key in the session data. - - Args: - key (str): The key to set the value for. - value: The value to be associated with the key. - """ - self.session_data[key] = value - - class MockChainlit: - """ - A mock implementation of the Chainlit class. - """ - - def __init__(self): - self.Message = MockMessage - self.user_session = MockUserSession() - self.__name__ = "chainlit" - self.step = None - - def Text(self, name, content, display): - """ - Creates a text element. - - Args: - text (str): The text content. 
- - Returns: - dict: A dictionary containing the text element. - """ - return {"type": "Text", "text": content} - - def Image(self, path, display, size): - """ - Creates an Image element. - - Args: - path (str): The path to the image file. - display (str): The display mode for the image. - size (str): The size of the image. - - Returns: - dict: A dictionary containing the text element. - """ - - print(path) - cksum, image_path = get_image_cksum(path) - - return { - "type": "Image", - "path": path, - "cksum": cksum, - "image_path": image_path, - } - - def instrument_openai(self): - """ - Instruments the OpenAI MOCK. - - This method is responsible for instrumenting the OpenAI MOCK. - It prints a message indicating that the OpenAI MOCK is being instrumented. - - Parameters: - None - - Returns: - None - """ - print("Instrumenting OpenAI MOCK") - - cl_mock = MockChainlit() - - return cl_mock + try: + element.find_element(by, value) + return True + except NoSuchElementException: + return False -# Method to run a supplied function to override chainlit's run_sync method -def run_async_coroutine(coroutine): +def poll_page(element_id=CHAT_INPUT_CLASS): """ - Runs an asynchronous coroutine in a separate event loop and returns the result. + Polls the page for new messages until a new message appears or a timeout occurs. + It does this by monitoring the chat box to see when it becomes enabled. Args: - coroutine: The coroutine to be executed asynchronously. - - Returns: - The result of the coroutine execution. + driver: The WebDriver instance used to interact with the web page. + element_name (str): The name of the element class to search for new messages. Raises: - asyncio.TimeoutError: If the coroutine execution times out. + status: True if a new message appears, False otherwise. + Returns: + None """ - def start_loop(loop): - asyncio.set_event_loop(loop) - loop.run_forever() + chat_box = driver.find_element(By.ID, element_id) + is_disabled = chat_box.get_attribute("disabled") - new_loop = asyncio.new_event_loop() - t = threading.Thread(target=start_loop, args=(new_loop,)) - t.start() - future = asyncio.run_coroutine_threadsafe(coroutine, new_loop) - try: - return future.result(timeout=10) - except asyncio.TimeoutError: - print("Coroutine execution timed out.") - return None + # Loop waiting for the chat box to become enabled, indicating agent output complete + tot_time = 0 + while is_disabled is not None: + print(f" ... {tot_time} s") + time.sleep(POLL_TIME) + chat_box = driver.find_element(By.ID, element_id) + is_disabled = chat_box.get_attribute("disabled") + tot_time += POLL_TIME + if tot_time > TIMEOUT_TIME: + print(f"ERROR: Timed out waiting for chat box {element_id} to re-enable ") + return False + + return True -def get_image_cksum(image_path): +def get_history(): """ - Calculate the MD5 checksum of an image file. + Retrieves the chat history from the web driver. Args: - image_path (str): The path to the image file. + driver: The web driver object. Returns: - str: The MD5 checksum of the image file. - + A list containing the chat history elements. 
""" - image_name = image_path.split("/")[-1] - image_path = f"{IMAGE_DIR}/{image_name}" + markdown_body_elements = driver.find_elements(By.CLASS_NAME, MESSAGES_CLASS) + history = [] + for element in markdown_body_elements: + history.append(element) + + # Remove the first greeting + history = history[1:] - with open(image_path, "rb") as f: - image = f.read() - cksum = hashlib.md5(image).hexdigest() - return cksum, image_path + return history -def process_images(result): +def send_message(message, num_tries=0, tot_tries=3): """ - Process the images in the result to replace them with their checksums. + Sends a message to the chat box and retrieves the bot's response. Args: - result (str): The result containing images. + message: The message to send to the chat box. + num_tries: The number of times to try sending the message Returns: - str: The result with images replaced by their checksums. - + A list of outputs generated by the bot in response to the message. """ - print(f"Processing images ...\n\n{result}") - if ".png" in result: - image_location = result - if "http" in result: - image_location = result[result.find("http") : result.find(".png") + 4] + history = get_history() + len_history_original = len(history) - cksum, image_path = get_image_cksum(image_location) + try: + print(f"\nYOU: {message}") + chat_box = driver.find_element(By.ID, CHAT_INPUT_CLASS) + chat_box.send_keys(message) + chat_box.send_keys(Keys.RETURN) + except Exception as e: + print(f"Error sending message: {e}") + if num_tries <= tot_tries: + print(f"Retrying ... {num_tries}") + time.sleep(RETRY_WAIT) + return send_message(message, num_tries + 1, tot_tries) + else: + print(f"Failed to send message after {tot_tries} tries") + return ["ERROR: TIMED OUT SENDING MESSAGE"] - print( - f"Processing image: {image_path}, cksum: {cksum}, location: {image_location}" - ) + # Poll page waiting for output to complete + poll_page(CHAT_INPUT_CLASS) - if os.getenv("RECIPES_MODEL") == "gpt-4o": - # image_validation_prompt = environment.get_template( - # "image_validation_prompt.jinja2" - # ) - # prompt = image_validation_prompt.render(user_input=metadata["intent"]) + history = get_history() + len_history_new = len(history) - prompt = "Describe this image in detail. Is it relevant to the user query?" + num_new_outputs = len_history_new - len_history_original - llm_result = call_llm("", prompt, image=image_path) + # Get all the new outputs and output them + history[-1 * num_new_outputs] + outputs = [] - image_str = f"Image cksum: {cksum}\nImage description: {llm_result}" + for i in range(num_new_outputs - 1, 0, -1): + record = history[-1 * i] - result = result.replace(image_location, image_str) + if check_element_exists(record, By.TAG_NAME, "img"): + image = record.find_element(By.TAG_NAME, "img") + url = image.get_attribute("src") + output = get_image_summary(url) + else: + output = record.text - return result + outputs.append(output) + # Print the last response + print(f"\n🤖 BOT: {output}") -def dump_stderr(process): + return outputs + + +def download_image(url, filename): """ - Print the stderr output of a process. + Downloads an image from the specified URL and saves it to the specified filename. Args: - process: The process to print the stderr output for. + url (str): The URL of the image to download. + filename (str): The name of the file to save the image as. Returns: - None. 
- + None """ - all_error = process.stderr.read() - if len(all_error) > 0: - print("STDERR:") - print(all_error) + response = requests.get(url) + file = open(filename, "wb") + file.write(response.content) + file.close() + return -def run_chainlit_mock(chat_history: str) -> str: + +def get_image_summary(url): """ - This function is used to run the chainlit script and monitor its output. - TODO It is a temporary workaround because running the exact chainlit code - does not exit all asynchronous threads and hangs. This workaround is temporary - and should be replaced by breaking e2e testing into data recipes API and - the assistant. Testing both independently is way less complicated. + Downloads an image from the given URL, calls LLM to summarize the image, + and returns the generated description. Args: - chat_history (str): A string containing the chat history. + url (str): The URL of the image to download and summarize. Returns: - result (str): The result of the chainlit script running with input history - + str: The LLM-generated description of the image. """ + print(" Downloading image ...") + download_image(url, TMP_IMAGE) + print(" Calling LLM to summarize ...") + summary = call_llm("", IMAGE_SUMMARIZATION_PROMPT, image=TMP_IMAGE, debug=True) + summary = summary["content"] + summary = ( + f"*AN IMAGE WAS OUTPUT, HERE IS ITS LLM-GENERATED DESCRIPTION* ... {summary}" + ) + print(summary) + return summary - all_output = "" - result = "" - print("Monitoring chainlit output") - - if not chat_history.startswith("'"): - chat_history = f"'{chat_history}'" - print(f"python3 call_assistant.py --chat_history {chat_history}") - process = subprocess.Popen( - ["python3", "call_assistant.py", "--chat_history", chat_history], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - print(process) - while True: - output = process.stdout.readline() - if output == b"" and process.poll() is not None: - print( - "Process finished with No output, try running call_assistant by hand to debug." - ) - dump_stderr(process) - break - if output: - all_output += output.decode("utf-8") - print(output.strip()) - if FINISH_PHRASE in str(output).lower(): - print("Killing process") - os.kill(process.pid, signal.SIGKILL) - if OUTPUT_TAG in all_output: - result = all_output.split(OUTPUT_TAG)[1].strip() - result = process_images(result) - dump_stderr(process) - print("Result:", result) - else: - dump_stderr(process) - result = "Unparsable output" - break - time.sleep(0.1) - return result - - -def run_sync(func, *args, **kwargs): +def login(num_tries=0, tot_tries=3): """ - Run a function synchronously or asynchronously depending on its type. + Logs into the application using the provided driver. Args: - func: The function to be executed. - *args: Positional arguments to be passed to the function. - **kwargs: Keyword arguments to be passed to the function. + num_tries: The number of times to try logging in. + tot_tries: The total number of times to try logging in. Returns: - The result of the function execution. - - Raises: - None. 
- + None """ - if inspect.iscoroutinefunction(func): - # Use the alternative approach for coroutine functions - coroutine = func(*args, **kwargs) - return run_async_coroutine(coroutine) - elif asyncio.iscoroutine(func): - # Directly pass the coroutine object - return run_async_coroutine(func) - else: - # Handle synchronous function - return func(*args, **kwargs) - - -async def test_using_app_code_async(chat_history, timeout=5): - - cl_mock = setup_mock_class() - import app as app - - app.run_sync = run_sync - app.cl = cl_mock - app.images_loc = IMAGE_DIR + "/" - - await app.start_chat() - - sync_openai_client = app.cl.user_session.get("sync_openai_client") - - thread_id = app.cl.user_session.get("thread_id") - # Here build history - chat_history = chat_history.replace("\\", "") - # Extract test between [] - chat_history = chat_history[chat_history.find("[") : chat_history.rfind("]") + 1] - print(">>>>>>>> Chat history:", chat_history) - history = json.loads(chat_history) - last_message = history[-1] - app_chat_history = app.cl.user_session.get("chat_history") - for message in history: - role = message["author"] - msg = message["content"] - await app.add_message_to_thread(thread_id, role, msg) - app_chat_history.append({"role": role, "content": msg}) - app.cl.user_session.set("chat_history", history) + print("Logging in ...") - print("<<<<<<<< Last message:", last_message) - - msg = cl_mock.Message(author="user", content=last_message["content"], elements=[]) - await app.process_message(msg) - - messages = sync_openai_client.beta.threads.messages.list(thread_id) - print("Messages:", messages.data[0].content[0]) - if messages.data[0].content[0].type == "image_file": - file_id = messages.data[0].content[0].image_file.file_id - file_path = f"{IMAGE_DIR}/{file_id}.png" - result = file_path - else: - result = messages.data[0].content[0].text.value + try: + login_box = driver.find_element(By.NAME, LOGIN_EMAIL_FIELD) + password_box = driver.find_element(By.NAME, LOGIN_PASSWORD_FIELD) + login_box.send_keys(os.getenv("USER_LOGIN")) + password_box.send_keys(os.getenv("USER_PASSWORD")) + button = driver.find_element(By.XPATH, LOGIN_BUTTON_XPATH) + button.click() + + time.sleep(5) + + # Check for login success + if check_element_exists(driver, By.ID, CHAT_INPUT_CLASS): + print("Login successful") + time.sleep(10) + else: + print("Login failed") + + except Exception as e: + print(f"Error logging in: {e}") + if num_tries <= tot_tries: + print(f"Retrying ... {num_tries}") + time.sleep(RETRY_WAIT) + return login(num_tries + 1, tot_tries) + else: + print(f"Failed to login after {tot_tries} tries") + sys.exit() - return result +@tool +def call_assistant(query, chat_history): + """ + Calls the assistant using the provided user input and history. It will first play the + messages in the history (as the user) and then send the user input to the assistant. -def test_using_app_code(chat_history): - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - result = loop.run_until_complete(test_using_app_code_async(chat_history)) - loop.close() - return result + Args: + query (str): The user input to send to the assistant. + chat_history (str): A JSON list of previous messages sent to the assistant. + Returns: + str: The response from the assistant. + """ -def main(): + global driver + driver = webdriver.Chrome(options=set_chrome_options()) + driver.get(CHAT_URL) - parser = argparse.ArgumentParser( - description="Process check in and check out operations (i.e. 
extracting recipes and recipes from the database for quality checks and edits)." - ) + login() - parser.add_argument( - "--chat_history", - type=str, - required=True, - help=""" - A list containing the chat history, of the format (but in one line) ... + # First replay history + chat_history = json.loads(chat_history) + for message in chat_history: + send_message(message) - '[{\"author\": \"user\",\"content\": \"Hi\"},{\"author\":\"assistant\",\"content\": \"Hello! How can I help you today?\"},{\"author\": \"assistant\",\"content\": \"What is the total population of Mali?\"}]' - """, - ) + # Wait for a bit + time.sleep(10) - args = parser.parse_args() - chat_history = args.chat_history + # Now send the user input + response = send_message(query) - if chat_history: - print("Running app code ...") - result = test_using_app_code(chat_history) - print(OUTPUT_TAG) - print(result) - print(OUTPUT_TAG) + if response is None: + response = "ERROR: No response from assistant" - # Do not remove this line - print(FINISH_PHRASE) + return response if __name__ == "__main__": - main() + + # chat_history = [ + # "Hello! How can I assist you today?", + # "What is the total population of Mali", + # "plot a line chart of fatalities by month for Chad using HDX data as an image", + # "Plot population pyramids for Nigeria", + # "How many rows does the population table have for Nigeria", + # "Plot f{x}=10", + # ] + # user_input = chat_history[4] + # print(user_input) + + # user_input="Is your data updated in real time?" + # call_assistant(user_input, "[]") + # sys.exit() + + # read data.jsonl + with open("data.jsonl") as f: + data = f.readlines() + + data_new = [] + for d in data: + d = json.loads(d) + user_input = d["query"] + chat_history = d["chat_history"] + output = call_assistant(user_input, "[]") + d["context"] = str(output) + data_new.append(d) + + with open("data.new.jsonl", "w") as f: + for d in data_new: + f.write(json.dumps(d) + "\n") + print( + "\n\nReview data.new.jsonl for the output and copy to data.jsonl if satisfied." + ) diff --git a/flows/chainlit-ui-evaluation/call_assistant_debug.py b/flows/chainlit-ui-evaluation/call_assistant_debug.py deleted file mode 100644 index e8159735..00000000 --- a/flows/chainlit-ui-evaluation/call_assistant_debug.py +++ /dev/null @@ -1,50 +0,0 @@ -import sys - -from call_assistant import run_chainlit_mock - -# -# Note, you can also debug by running directly, eg ... -# -# python3 call_assistant.py --chat_history '[{"author": "user","content": "How many rows does the population table have for Nigeria"}]' -# -# But this will hang. Below use the wrapper to terminate the process. -# - - -def main(): - """ - This function is the entry point of the program. - It demonstrates different scenarios by calling the `run_chainlit_mock` function with different inputs. 
- """ - - # Assistant smalltalk - run_chainlit_mock('[{"author": "user","content": "Hi"}]') - - sys.exit(0) - - # Memories, text output - run_chainlit_mock( - '[{"author": "user","content": "what is the population of Mali?"}]' - ) - - # Memories, image output - run_chainlit_mock( - '[{"author": "user","content": "plot a line chart of fatalities by month for Chad using HDX data as an image"}]' - ) - - # Recipe run, image output - run_chainlit_mock( - '[{"author": "user","content": "plot population pyramids for Nigeria"}]' - ) - - # Assistant on-the-fly SQL analysis of DB, text output - run_chainlit_mock( - '[{"author": "user","content": "How many rows does the population table have for Nigeria"}]' - ) - - # Assistant simple analysis and code interpretor, image output - run_chainlit_mock('[{"author": "user","content": "Plot f{x}=10"}]') - - -if __name__ == "__main__": - main() diff --git a/flows/chainlit-ui-evaluation/check_evaluation_results.py b/flows/chainlit-ui-evaluation/check_evaluation_results.py index 352ed21e..b94410ba 100644 --- a/flows/chainlit-ui-evaluation/check_evaluation_results.py +++ b/flows/chainlit-ui-evaluation/check_evaluation_results.py @@ -1,4 +1,3 @@ -# This script runs after GitHub action to check promptflow evaluation import json import os import subprocess @@ -19,6 +18,7 @@ def check_result(run_name="base_run", cutoff=100.0): Raises: SystemExit: If the evaluation result is below the cutoff value. """ + cmd = f"pf run show-metrics -n {run_name}" print(cmd) # Run cmd and capture output diff --git a/flows/chainlit-ui-evaluation/data.jsonl b/flows/chainlit-ui-evaluation/data.jsonl index 1e0b72d8..0b8637dc 100644 --- a/flows/chainlit-ui-evaluation/data.jsonl +++ b/flows/chainlit-ui-evaluation/data.jsonl @@ -1,7 +1,7 @@ -{"test_scenario":"Assistant smalltalk, salutation", "query": "Hi", "chat_history": "[]", "context": "Hello! 
How can I assist you today?"} -{"test_scenario":"Text answer from memory", "query": "What is the total population of Mali", "chat_history": "[{\"author\": \"user\",\"content\": \"Hi!\"}]", "context": "The answer is:\n\n \n The answer is: **17,907,114.0**\n\n Metadata for the answer:\n {\"params\": {\"country_code\": \"MLI\"}, \"attribution\": \"https://data.humdata.org/dataset/ce21c7db-d8f0-40f8-adc2-452d2d2d105c\", \"data_url\": \"https://data.humdata.org/dataset/ce21c7db-d8f0-40f8-adc2-452d2d2d105c/resource/6f243ba2-4d4a-4663-a7c4-e917dbbde73a/download/mli_pop_adm0_v2.csv\", \"time_period\": {\"start\": \"2018-01-01\", \"end\": \"2018-12-31T23:59:59\"}}"} -{"test_scenario":"Image answer from memory", "query": "plot a line chart of fatalities by month for Chad using HDX data as an image", "chat_history": "[]", "context": "The answer is:\n\n Image cksum: 6a410014fde98dc5bde69c24e6d64cc1\nImage description: {'content': 'The image is a line graph titled \"Fatalities by Month for Chad.\" It displays the total number of fatalities over time, with the x-axis representing the months from January 2008 to January 2024, and the y-axis representing the total number of fatalities, ranging from 0 to 500.\\n\\nKey observations from the graph:\\n- There are several peaks indicating months with high fatalities.\\n- Notable spikes occur around mid-2008, early 2009, mid-2015, early 2021, and mid-2021.\\n- The highest peak appears to be around early 2021, reaching close to 500 fatalities.\\n- There are periods of relatively low fatalities, particularly between 2010 and 2014.\\n\\nTo determine if this image is relevant to the user query, more context about the query is needed. If the query pertains to historical data on fatalities in Chad, trends in violence or conflict, or similar topics, then this image is highly relevant.'}\n \n\n Metadata for the answer:\n {\"params\": {\"country_code\": \"TCD\"}, \"attribution\": \"https://data.humdata.org/dataset/b009f9b0-aa65-49c5-b188-a33daade0f4a\", \"data_url\": \"https://data.humdata.org/dataset/b009f9b0-aa65-49c5-b188-a33daade0f4a/resource/bb78c035-ec19-4503-b325-0673749c2eb4/download/chad_hrp_political_violence_events_and_fatalities_by_month-year_as-of-29may2024.xlsx\"}"} -{"test_scenario":"Image answer from recipe", "query": "Plot population pyramids for Nigeria", "chat_history": "[]", "context": "The answer is:\n\n Image cksum: 7940162caf0e79eba9caae30c2955a6e\nImage description: {'content': \"The image is a population pyramid for Nigeria (NGA). It is a bar chart that displays the distribution of various age groups in the population, divided by gender. The x-axis represents the population in millions, with males on the left side (in blue) and females on the right side (in pink). 
The y-axis represents the age range, divided into 5-year intervals from 0-4 up to 80+.\\n\\nKey features of the population pyramid:\\n- The base of the pyramid (0-4 age range) is the widest, indicating a high number of young children.\\n- As the age range increases, the width of the bars decreases, showing a tapering effect typical of a youthful population.\\n- The population decreases steadily with age, with the smallest population in the 80+ age range.\\n- The pyramid shows a relatively balanced distribution between males and females across most age groups.\\n\\nThis image is relevant to a user query related to demographic analysis, population studies, or understanding the age and gender distribution of Nigeria's population.\"}\n \n\n Metadata for the answer:\n {'params': {'adm0_code': 'NGA'}, 'attribution': 'https://data.humdata.org/dataset/a7c3de5e-ff27-4746-99cd-05f2ad9b1066', 'data_url': 'https://data.humdata.org/dataset/a7c3de5e-ff27-4746-99cd-05f2ad9b1066/resource/562e7757-0683-4d61-87bd-a7c94af2ee38/download/nga_admpop_adm2_2020.csv', 'time_period': {'start': '2020-01-01', 'end': '2020-12-31T23:59:59'}}"} -{"test_scenario":"Assistant on-the-fly SQL, text answer", "query": "How many rows does the population table have for Nigeria", "chat_history": "[]", "context": "There are **43,794** rows of data in the population table for Nigeria."} -{"test_scenario":"Assistant created image (simple)", "query": "Plot f{x}=10", "chat_history": "[]", "context": "Image cksum: 3f4dafc66e68dc03e3ef6d2f02a85bc7\nImage description: {'content': 'The image is a plot of the function \\\\( f(x) = 10 \\\\). Here are the details of the plot:\\n\\n- The title of the plot is \"Plot of f(x) = 10\".\\n- The x-axis ranges from -10 to 10.\\n- The y-axis ranges from 0 to 10.\\n- The function \\\\( f(x) = 10 \\\\) is represented by a horizontal orange line at \\\\( y = 10 \\\\).\\n- There is a legend in the plot that labels the orange line as \"f(x) = 10\".\\n- The x-axis is labeled \"x\" and the y-axis is labeled \"f(x)\".\\n- The plot has grid lines for better readability.\\n\\nThe plot is relevant if the user query is about visualizing or understanding the function \\\\( f(x) = 10 \\\\), which is a constant function.'}"} -{"test_scenario":"Assistant answering from uploaded documents", "query": "Is your data updated in realtime?", "chat_history": "[]", "context": "The data is not updated in real-time. For data sources configured as API data sources, the system will call them on-demand to pull in the latest data from the remote system. However, for data sources where data is ingested, like HAPI, the update frequency depends on how often the ingestion process is run, which is controlled by the user of the humanitarian AI assistant"} \ No newline at end of file +{"test_scenario": "Assistant smalltalk, salutation", "query": "Hi", "chat_history": "[]", "context": "['Hello! How can I assist you today?']"} +{"test_scenario": "Text answer from memory", "query": "What is the total population of Mali", "chat_history": "[]", "context": "['The answer is: 17,907,114.0', '\u2705 A human approved this data recipe; Source; Raw data; 2018-01-01 to 2018-12-31']"} +{"test_scenario": "Image answer from memory", "query": "plot a line chart of fatalities by month for Chad using HDX data as an image", "chat_history": "[]", "context": "['*AN IMAGE WAS OUTPUT, HERE IS ITS LLM-GENERATED DESCRIPTION* ... 
The image is a line graph titled \"Fatalities by Month for Chad,\" depicting the total number of fatalities per month from January 2008 to January 2024. The x-axis represents the months, while the y-axis represents the total fatalities. \\n\\nKey observations:\\n- There are several peaks indicating months with high fatalities.\\n- Notable spikes occur around mid-2008, early 2009, mid-2015, early 2021, and mid-2021.\\n- The highest peak appears to be in early 2021, with fatalities exceeding 500.\\n- There are periods of relatively low fatalities, particularly between 2010 and 2014.\\n- Overall, there is an increasing trend in fatalities over the years, with more frequent spikes in recent years.', '\u2705 A human approved this data recipe; Source; Raw data']"} +{"test_scenario": "Image answer from recipe", "query": "Plot population pyramids for Nigeria", "chat_history": "[]", "context": "['*AN IMAGE WAS OUTPUT, HERE IS ITS LLM-GENERATED DESCRIPTION* ... The image is a population pyramid for Nigeria (NGA). It displays the population distribution by age and gender. The horizontal axis represents the population in millions, with males on the left (in blue) and females on the right (in pink). The vertical axis represents different age ranges, starting from 0-4 at the bottom to 80+ at the top.\\n\\nKey observations:\\n- The pyramid has a broad base, indicating a large number of young people (0-4 age range).\\n- The population decreases steadily with increasing age, forming a typical pyramid shape.\\n- There are more males than females in the younger age groups, but the difference is not very pronounced.\\n- The population in older age groups (60 and above) is significantly smaller compared to younger age groups.', '\u2705 A human approved this data recipe; Source; Raw data; 2020-01-01 to 2020-12-31']"} +{"test_scenario": "Assistant on-the-fly SQL, text answer", "query": "How many rows does the population table have for Nigeria", "chat_history": "[]", "context": "['The population table has 43,794 rows for Nigeria.']"} +{"test_scenario": "Assistant created image (simple)", "query": "Plot f{x}=10", "chat_history": "[]", "context": "['*AN IMAGE WAS OUTPUT, HERE IS ITS LLM-GENERATED DESCRIPTION* ... The image is a plot of the function \\\\( f(x) = 10 \\\\). The graph shows a horizontal line at \\\\( f(x) = 10 \\\\) across the range of \\\\( x \\\\) values from -10 to 10. The x-axis is labeled \"x\" and the y-axis is labeled \"f(x)\". There is a legend on the right side of the plot indicating that the orange line represents \\\\( f(x) = 10 \\\\). The plot title is \"Plot of f(x) = 10\".']"} +{"test_scenario": "Assistant answers using vector store", "query": "Is your data updated in real time?", "chat_history": "[]", "context": "['The data is updated in real-time for data sources configured as API data sources. The system calls these APIs on-demand to pull in the latest data from the remote system. 
However, for data sources where data is ingested, such as HAPI, the update frequency depends on how often the ingestion process is run, which is controlled by the user of the humanitarian AI assistant[0].', '\ud83e\udd16 Caution: LLM Analysis; Sources: [0] HDIP FAQs (External) .pdf']"} diff --git a/flows/chainlit-ui-evaluation/flow.dag.yaml b/flows/chainlit-ui-evaluation/flow.dag.yaml index 9b6236ac..32181ba3 100644 --- a/flows/chainlit-ui-evaluation/flow.dag.yaml +++ b/flows/chainlit-ui-evaluation/flow.dag.yaml @@ -4,25 +4,28 @@ environment: inputs: query: type: string - default: plot f{x}=10 + default: What is the total population of Mali context: type: string - default: '"The answer is:\n\n \n The answer is: - **17,907,114.0**\n\n Metadata for the - answer:\n {\"params\": {\"country_code\": \"MLI\"}, - \"attribution\": - \"https://data.humdata.org/dataset/ce21c7db-d8f0-40f8-adc2-452d2d2d105c\", - \"data_url\": - \"https://data.humdata.org/dataset/ce21c7db-d8f0-40f8-adc2-452d2d2d105c/resource/6f243ba2-4d4a-4663-a7c4-e917dbbde73a/download/mli_pop_adm0_v2.csv\", - \"time_period\": {\"start\": \"2018-01-01\", \"end\": - \"2018-12-31T23:59:59\"}}"' + default: "\"The answer is:\\n\\n \\n The answer is: + **17,907,114.0**\\n\\n Metadata for the + answer:\\n {\\\"params\\\": {\\\"country_code\\\": + \\\"MLI\\\"}, \\\"attribution\\\": + \\\"https://data.humdata.org/dataset/ce21c7db-d8f0-40f8-adc2-452d2d2d105c\ + \\\", \\\"data_url\\\": + \\\"https://data.humdata.org/dataset/ce21c7db-d8f0-40f8-adc2-452d2d2d105c\ + /resource/6f243ba2-4d4a-4663-a7c4-e917dbbde73a/download/mli_pop_adm0_v2.c\ + sv\\\", \\\"time_period\\\": {\\\"start\\\": \\\"2018-01-01\\\", + \\\"end\\\": \\\"2018-12-31T23:59:59\\\"}}\", \"output\": \"['The answer + is: 17,907,114.0', '\\u2705 A human approved this data recipe; Source; Raw + data; 2018-01-01 to 2018-12-31']\"}" chat_history: type: string default: "[]" outputs: agent_output: type: string - reference: ${call_assistant.output.response} + reference: ${call_assistant.output} groundedness_score: type: string reference: ${groundedness_score.output} @@ -48,10 +51,10 @@ nodes: path: templates/groundedness_score.jinja2 inputs: deployment_name: gpt-4-turbo - answer: ${call_assistant.output.response} + answer: ${call_assistant.output} context: ${inputs.context} temperature: 1 - model: gpt-4-turbo-preview + model: gpt-4o connection: open_ai_connection api: chat - name: concat_scores diff --git a/requirements.txt b/requirements.txt index 40e76259..560f87b9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,9 +23,9 @@ plotly==5.22.0 psycopg2_binary==2.9.9 pyarrow==16.1.0 python-dotenv==1.0.0 -robocorp -robocorp-actions -robocorp-truststore +#robocorp +#robocorp-actions +#robocorp-truststore seaborn==0.13.2 sqlalchemy==2.0.30 pytest==8.2.2 diff --git a/templates/groundedness_score.jinja2 b/templates/groundedness_score.jinja2 index e3a5dad2..857f5eda 100644 --- a/templates/groundedness_score.jinja2 +++ b/templates/groundedness_score.jinja2 @@ -27,6 +27,12 @@ Independent Examples: ## Example Task #4 Output: 1 +Key points: + +- if the CONTEXT is a basic greeting, like "Hello" and the ANSWER is a basic greeting also, groundedness is 5 +- Focus on facts, not on the language used to present the facts +- If the ANSWER presents a fact, and the CONTEXT presents the same fact, groundedness is 5 + Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context. 
## Actual Task Input: diff --git a/ui/chat-chainlit-assistant/Dockerfile b/ui/chat-chainlit-assistant/Dockerfile index 3339cd9c..1e8a7c92 100644 --- a/ui/chat-chainlit-assistant/Dockerfile +++ b/ui/chat-chainlit-assistant/Dockerfile @@ -7,7 +7,6 @@ WORKDIR /app RUN pip install --upgrade pip RUN pip install -r requirements.txt - COPY ./utils/ ./utils COPY ./templates ./templates # diff --git a/ui/chat-chainlit-assistant/actions.py b/ui/chat-chainlit-assistant/actions.py new file mode 100755 index 00000000..e69de29b diff --git a/ui/chat-chainlit-assistant/ingestion.config b/ui/chat-chainlit-assistant/ingestion.config new file mode 100755 index 00000000..e69de29b diff --git a/utils/llm.py b/utils/llm.py index d5fda88d..c2fc8a67 100644 --- a/utils/llm.py +++ b/utils/llm.py @@ -25,6 +25,7 @@ template_dir = "../templates" if not os.path.exists(template_dir): template_dir = "./templates" + environment = Environment(loader=FileSystemLoader(template_dir)) sql_prompt_template = environment.get_template("gen_sql_prompt.jinja2") @@ -85,7 +86,7 @@ def get_models(): return embedding_model, chat -def call_llm(instructions, prompt, image=None): +def call_llm(instructions, prompt, image=None, debug=False): """ Call the LLM (Language Learning Model) API with the given instructions and prompt. @@ -93,6 +94,7 @@ def call_llm(instructions, prompt, image=None): instructions (str): The instructions to provide to the LLM API. prompt (str): The prompt to provide to the LLM API. chat (Langchain Open AI model): Chat model used for AI judging + debug (bool): Whether to print debug information. Defaults to False. Returns: dict or None: The response from the LLM API as a dictionary, or None if an error occurred. @@ -107,7 +109,7 @@ def call_llm(instructions, prompt, image=None): # Multimodal if image: if os.getenv("RECIPES_MODEL") == "gpt-4o": - print("Sending image to LLM ...") + print(" Sending image to LLM ...") with open(image, "rb") as image_file: encoded_string = base64.b64encode(image_file.read()).decode() @@ -164,13 +166,14 @@ def call_llm(instructions, prompt, image=None): response["message"] = message else: # Finally just send it back - print("LLM response unparsable, using raw results") - print(response) + if debug: + print("LLM response unparsable, using raw results") + print(response) response = {"content": response} return response except Exception as e: - # print(response) + print(response) print("Error calling LLM: ", e) response = None