Commit 6c35800

Merge pull request #65 from datakind/feat/selenium-e2e-3

Feat/selenium e2e tests

dividor authored Jul 11, 2024
2 parents 94cd770 + 4928df3
Showing 23 changed files with 467 additions and 703 deletions.
6 changes: 6 additions & 0 deletions .env.example
@@ -106,6 +106,12 @@ REMOTE_DB_CONN_STRING=
# If using Fast API
RECIPE_SERVER_API=http://server:8080/

#==================================================#
# E2E Tests #
#==================================================#
# Note, on GitHub use http://host.docker.internal:8000
CHAT_URL="http://chat:8000/"

#==================================================#
# Chainlit Settings #
#==================================================#
2 changes: 1 addition & 1 deletion .github/workflows/code_quality_checks.yml
@@ -1,6 +1,6 @@
name: Code quality checks

on: [push, pull_request]
on: [push]

jobs:
build:
119 changes: 56 additions & 63 deletions .github/workflows/e2e_tests.yml
@@ -1,4 +1,4 @@
name: End-to-End Tests
name: End-to-End tests

#on: [push, pull_request]

@@ -10,7 +10,7 @@ on: [push]

jobs:
test:
runs-on: ubuntu-latest
runs-on: ubuntu-latest
environment: "GitHub Actions 1"
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
@@ -49,93 +49,86 @@ jobs:

IMAGE_HOST: ${{ secrets.IMAGE_HOST }}
RECIPE_SERVER_API: ${{ secrets.RECIPE_SERVER_API_FROM_GH_HOST }}
CHAT_URL: ${{ secrets.CHAT_URL }}

CHAINLIT_AUTH_SECRET: ${{ secrets.CHAINLIT_AUTH_SECRET }}
USER_LOGIN: ${{ secrets.USER_LOGIN }}
USER_PASSWORD: ${{ secrets.USER_PASSWORD }}

COLUMNS: 150

steps:

- name: Checkout
uses: actions/checkout@v3

#- name: Checkout integration tests data
# uses: actions/checkout@master
# with:
# repository: datakind/recipes-ai-test-data
# ssh-key: ${{ secrets.GITHUB_SSH_PRIVATE_KEY}}
# path: recipes-ai-test-data

- name: Expose GitHub Runtime
uses: crazy-max/ghaction-github-runtime@v2

- name: Spin up DB and recipes server
- name: Spin up Docker containers
run: |
env > .env
echo "Installing demo data ..."
# Get demo data
pip3 install gdown==5.2.0
cd data && python3 download_demo_data.py && cd ..
ls data/datadb
mkdir -p ./flows/chainlit-ui-evaluation/recipes/public
# TODO this should be enhanced to use a buildx bake to leverage layer caching for faster builds, or push to repo and simply have a pull for the run
# TODO docker-compose files should be refactored to use scopes instead of different versions for each environment
echo "Starting docker containers for dbs and server ..."
docker-compose -f ./docker-compose-github.yml pull
docker-compose -f ./docker-compose-github.yml up -d --build
docker-compose pull
docker-compose up -d --build
# TODO: For some reason, maybe buildkit, in GitHub docker compose builds the image differently, and it doesn't work. Individual image build works.
docker build --build-arg OPENAI_API_KEY=$OPENAI_API_KEY --build-arg CHAT_URL=$CHAT_URL --build-arg OPENAI_API_ENDPOINT=$OPENAI_API_ENDPOINT --no-cache -t promptflow -f ./flows/chainlit-ui-evaluation/Dockerfile .
docker run --env RECIPES_MODEL_MAX_TOKENS=${RECIPES_MODEL_MAX_TOKENS} --env RECIPES_MODEL_TEMP=${RECIPES_MODEL_TEMP} --env RECIPES_OPENAI_API_TYPE=${ASSISTANTS_API_TYPE} --env RECIPES_OPENAI_API_KEY=${ASSISTANTS_API_KEY} --env RECIPES_MODEL=${RECIPES_MODEL} --env RECIPES_BASE_URL=${RECIPES_BASE_URL} --env USER_LOGIN=${USER_LOGIN} --env USER_PASSWORD=${USER_PASSWORD} --env CHAT_URL=${CHAT_URL} --network=data-recipes-ai_default -d --name promptflow promptflow
- name: Check logs
run: |
docker ps
echo "logs datadb ..."
docker-compose -f docker-compose-github.yml logs datadb
docker compose logs datadb
echo "logs promptflow ..."
docker logs promptflow
echo "logs chat ..."
docker compose logs chat
echo "logs server ..."
docker compose logs server
docker ps
# TODO The promptflow docker build wasn't working in GH actions, so deploying promptflow to host for now
- name: Set up promptflow and run tests
uses: actions/setup-python@v4
with:
python-version: "3.11.4"
- run: |
echo "Installing promptflow packages ..."
pip3 install promptflow==1.12.0
pip3 install promptflow-tools==1.4.0
pip3 install chainlit==1.1.305
pip3 install langchain==0.2.1
pip3 install langchain_openai==0.1.7
pip3 install psycopg2_binary==2.9.9
pip3 install keyrings.alt
pip3 list
- run: |
echo "Setting up folders ..."
cd flows/chainlit-ui-evaluation/
cp ../../ui/chat-chainlit-assistant/app.py .
cp -r ../../utils .
cp -r ../../templates .
cp ../../management/skills.py .
echo "Setting up Promptflow connections ..."
pf connection create --file ./openai.yaml --set api_key=$OPENAI_API_KEY --name open_ai_connection
pf connection create --file ./azure_openai.yaml --set api_key=$OPENAI_API_KEY --set api_base=$OPENAI_API_ENDPOINT --name azure_openai
# Test running one node with default inputs. Good for debugging GH actions
#pf flow test --flow . --node call_assistant
#python3 call_assistant.py --chat_history '[{"author": "user","content": "Hi!"}, {"author": "user","content": "What is the total population of Mali"}]'
#python3 call_assistant.py --chat_history '[{"author": "user","content": "plot a line chart of fatalities by month for Chad using HDX data as an image"}]'
# This runs a few, with the script kill, like promptflow, but prints all debug. Good for testing.
python3 call_assistant_debug.py
echo "Starting Promptflow batch run using data.jsonl ..."
pf run create --flow . --data ./data.jsonl --stream --column-mapping query='${data.query}' context='${data.context}' chat_history='${data.chat_history}' --name base_run
- run: |
echo "Promptflow results ..."
cd flows/chainlit-ui-evaluation/
pf run show-details -n base_run
sleep 10
# Debugging GitHub actions interactively, by connecting to the runner ...
# Get ssh connection details for runner
# See here https://github.com/marketplace/actions/debugging-with-ssh
# Basically, uncomment this, then get connection string in actions output, then connect with
#
# ssh -i <YOUR GITHUB SSH KEY> <CONN STRING ON ACTIONS>
#
#- name: DEBUG - Setup upterm session
# uses: lhotari/action-upterm@v1

# - name: DEBUG - Run Selenium outside of promptflow
# run: |
# docker exec promptflow python call_assistant.py

- name: Run tests
run: |
env > .env
docker exec promptflow pf run create --flow . --data ./data.jsonl --stream --column-mapping query='${data.query}' context='${data.context}' chat_history='${data.chat_history}' --name base_run
- name: Show results
run: |
docker exec --env COLUMNS=150 promptflow pf run show-details -n base_run
echo "Getting metrics ..."
pf run show-metrics -n base_run
##pf run visualize -n base_run
docker exec promptflow pf run show-metrics -n base_run
##docker exec promptflow pf run visualize -n base_run
echo "Checking results ..."
python3 check_evaluation_results.py
docker exec promptflow python3 check_evaluation_results.py
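
To debug this workflow outside CI, the same container setup can be replayed locally — a condensed sketch, assuming `OPENAI_API_KEY`, `CHAT_URL`, `OPENAI_API_ENDPOINT`, `USER_LOGIN` and `USER_PASSWORD` are exported locally (the workflow also passes the full set of `RECIPES_*` variables shown above):

```bash
# Sketch: build and start the promptflow test container as the workflow does.
# The env list here is trimmed; see the docker run line in the workflow above.
docker compose pull
docker compose up -d --build
docker build --build-arg OPENAI_API_KEY=$OPENAI_API_KEY \
  --build-arg CHAT_URL=$CHAT_URL \
  --build-arg OPENAI_API_ENDPOINT=$OPENAI_API_ENDPOINT \
  --no-cache -t promptflow -f ./flows/chainlit-ui-evaluation/Dockerfile .
docker run --env USER_LOGIN=$USER_LOGIN --env USER_PASSWORD=$USER_PASSWORD \
  --env CHAT_URL=$CHAT_URL --network=data-recipes-ai_default \
  -d --name promptflow promptflow
```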
38 changes: 24 additions & 14 deletions .github/workflows/get_memory_test.yml
@@ -75,22 +75,32 @@ jobs:
ls data/datadb
echo "Starting docker containers for dbs and server ..."
docker-compose -f ./docker-compose-github.yml pull
docker-compose -f ./docker-compose-github.yml up -d --build
echo "logs datadb ..."
docker-compose -f docker-compose-github.yml logs datadb
docker-compose pull
docker-compose up -d --build datadb recipedb server
sleep 10
docker ps
- name: Check logs
run: |
docker ps
echo "logs datadb ..."
docker compose logs datadb
echo "logs recipedb ..."
docker compose logs recipedb
echo "logs server ..."
docker compose logs server
sleep 10
- name: Run tests
uses: actions/setup-python@v4
with:
python-version: "3.11.4"
- run: |
pip3 install pytest==8.2.2
pip3 install requests==2.32.3
pip3 install python-dotenv==1.0.1
- run: |
run: |
echo "exec into container ..."
docker exec recipes-ai-server bash -c "cd tests/ && pytest"
docker compose exec server bash -c "cd tests/ && pytest"
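
The same tests can be reproduced outside CI — a minimal sketch, assuming the compose stack from the repository root:

```bash
# Sketch: bring up the databases and server, then run the test suite in-container.
docker compose up -d --build datadb recipedb server
docker compose exec server bash -c "cd tests/ && pytest"
```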
6 changes: 5 additions & 1 deletion .gitignore
@@ -27,4 +27,8 @@ data
server/fastapi/recipes/
assistants/chat_ui/files/file_search/custom
assistants/chat_ui/files/code_interpreter/custom

flows/chainlit-ui-evaluation/app.py
flows/chainlit-ui-evaluation/data.new.jsonl
flows/chainlit-ui-evaluation/recipes/
flows/chainlit-ui-evaluation/temp.png
ui/chat-chainlit-assistant/.files/
38 changes: 27 additions & 11 deletions CONTRIBUTION.md
@@ -1,4 +1,4 @@
# Contributing to DOT
# Contributing to Data Recipes AI

Hi! Thanks for your interest in contributing to Data Recipes AI - we're really excited to see you! In this document we'll try to summarize everything that you need to know to do a good job.

@@ -40,6 +40,8 @@ GitHub has an action to run the pre-commit tests to ensure code adheres to stand

## Tests

### Unit tests

You should write tests for every feature you add or bug you solve in the code.
Having automated tests for every line of our code lets us make big changes
without worries: there will always be tests to verify if the changes introduced
@@ -53,11 +55,9 @@ the desired feature.

You can use `pytest` to run your tests, no matter which type of test it is.

### End-to-end tests
### End-to-end tests (using Selenium and Promptflow)

End-to-end tests have been configured in GitHub actions which use promptflow to call a wrapper around the chainlit UI, in order to test when memories/recipes are used as well as when the assistant does some on-the-fly analysis. To do this, the chainlit class is patched heavily, and there are limitations in how cleanly this could be done, so it isn't an exact replica of the true application, but it does capture changes to the flow as well as test the assistant directly. The main body of integration tests will test the recipes server and the assistant independently.

Additionally, there were some limitations when implementing in GitHub actions, where workarounds were put in place until a later date, namely: promptflow is run on the GitHub actions host rather than in docker, and the promptflow wrapper that calls chainlit has to run as a script which is killed based on a STDOUT string. These should be fixed in the future.
@@ -66,14 +66,30 @@ Code for e2e tests can be found in `flows/chainlit-ui-evaluation` as run by `.gi

The tests work using promptflow evaluation and a call to an LLM to gauge groundedness, because LLM assistants can produce slightly different results if not providing answers from memory/recipes. The promptflow evaluation test data can be found in `flows/chainlit-ui-evaluation/data.jsonl`.
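
As a rough illustration of the shape of that file (this record is hypothetical - the field names mirror the `query`/`context`/`chat_history` column mapping used in the promptflow run, but the values are made up):

```bash
# Hypothetical example record for data.jsonl; field names follow the
# pf run column mapping (query / context / chat_history), values are illustrative.
cat >> flows/chainlit-ui-evaluation/data.jsonl << 'EOF'
{"query": "What is the total population of Mali?", "context": "The answer should quote a single total population figure for Mali", "chat_history": "[]"}
EOF
```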

A useful way to test a new scenario and to get the 'expected' output for `data.jsonl` is to add it to `call_assistant_debug.py`.
See "Running Promptflow evaluation locally" below for how to run e2e tests locally.

TODO, future work:

- Add promptflow to docker-compose-github.yml and update action to use this env (time was short and wasn't working). This will reduce overhead and complexity
- Figure out how to make call_assistant.py exit its async loop so it doesn't have to run in a wrapper that then kills the process
- Push docker containers to a registry so flow doesn't run build every time
- Bug the chainlit folks to see if they can do something more formal around testing, to avoid complex monkey patching

#### Running Promptflow evaluation locally

First, you will need to build the environment to include Prompt Flow ...

`docker compose -f docker-compose.yml -f docker-compose-dev.yml up -d --build`

Then ...

1. Install the DevContainers VSCode extension
2. Build data recipes using the `docker compose` command mentioned above
3. Open the command palette in VSCode (CMD + Shift + P on Mac; CTRL + Shift + P on Windows) and select `Dev Containers: Attach to remote container`. Select the promptflow container. This opens a new VSCode window - use it for the next steps.
4. Install the Promptflow add-in
5. Open folder `/app`
6. Click on `flow.dag.yaml`
7. Top left of main pane, click on 'Visual editor'
   - If you are taken to the promptflow 'Install dependencies' screen, change the Python runtime to `/azureml-envs/prompt-flow/runtime/bin/python`, then close and re-open `flow.dag.yaml`
8. On the Groundedness node, select your new connection
9. You can now run by clicking the play icon. See Promptflow documentation for more details
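
If you'd rather drive the run from a shell than the visual editor, the same steps can be done with the `pf` CLI inside the promptflow container - a sketch, assuming the container is named `promptflow`, its working directory is the flow folder (`/app`), and `OPENAI_API_KEY` is set on the host:

```bash
# Sketch: CLI alternative to the visual editor, inside the promptflow container.
# Create the LLM connection used by the Groundedness node (assumes OpenAI keys).
docker exec promptflow pf connection create --file ./openai.yaml \
  --set api_key=$OPENAI_API_KEY --name open_ai_connection

# Optional: smoke-test a single node before the full batch run
docker exec promptflow pf flow test --flow . --node call_assistant

# Batch evaluation over data.jsonl, then inspect results
docker exec promptflow pf run create --flow . --data ./data.jsonl --stream \
  --column-mapping query='${data.query}' context='${data.context}' chat_history='${data.chat_history}' \
  --name base_run
docker exec promptflow pf run show-details -n base_run
docker exec promptflow pf run show-metrics -n base_run
```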

## GitHub Workflow

2 changes: 1 addition & 1 deletion README.md
@@ -17,7 +17,7 @@ For more information on the recipes concept, please see [here](https://towar

Data recipes have two types: (i) Exact memories, eg '*What is the population of Mali?*' which can be served directly to the user when they ask this question; (ii) Generic skills which can be run when requested for a scenario not in memory, eg a skill for 'What is the population of country X?' which can be called when the user asks something like '*What is the population of Nigeria?*'. In both cases the match to the user's intent is made using semantic search with LLM-reranking.

Given the rapidly changing landscape of LLMs, we have tried as much as possible to implement data recipes in such a way that it can be integrated with various semantic architectures and frameworks. By implementing recipes using a recipes server (powered by [Robocorps actions server](https://github.com/robocorp/robocorp#readme)), it can be called from [Open AI assistant](https://platform.openai.com/docs/assistants/overview) actions and [Copilot Studio](https://www.microsoft.com/en-us/microsoft-copilot/microsoft-copilot-studio) as well from any custom code. Also included in this repo is an example of using recipes via OpenAI format plugins, as supported by frameworks such as [semantic kernel](https://learn.microsoft.com/en-us/semantic-kernel/overview/?tabs=Csharp).
Given the rapidly changing landscape of LLMs, we have tried as much as possible to implement data recipes in such a way that it can be integrated with various semantic architectures and frameworks. By implementing recipes using a recipes server (powered by FastAPI), it can be called from [Open AI assistant](https://platform.openai.com/docs/assistants/overview) actions and [Copilot Studio](https://www.microsoft.com/en-us/microsoft-copilot/microsoft-copilot-studio) as well as from any custom code. Also included in this repo is an example of using recipes via OpenAI format plugins, as supported by frameworks such as [semantic kernel](https://learn.microsoft.com/en-us/semantic-kernel/overview/?tabs=Csharp).

Data recipes supports datasources accessed via API, but in some cases it is preferable to ingest data in order to leverage LLM SQL capabilities. We include an initial set of data sources specific to humanitarian response in the ingestion module, which can be extended to include additional sources as required.

Binary file modified assets/system.png
73 changes: 0 additions & 73 deletions docker-compose-github.yml

This file was deleted.

