diff --git a/.env.example b/.env.example
index 79c8bb3f..f361fc4b 100644
--- a/.env.example
+++ b/.env.example
@@ -128,7 +128,7 @@ PROXY=
 # Anthropic  #
 #============#
-ANTHROPIC_API_KEY=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
+ANTHROPIC_API_KEY=
 # ANTHROPIC_MODELS=claude-3-opus-20240229,claude-3-sonnet-20240229,claude-3-haiku-20240307,claude-2.1,claude-2,claude-1.2,claude-1,claude-1-100k,claude-instant-1,claude-instant-1-100k
 # ANTHROPIC_REVERSE_PROXY=
@@ -193,9 +193,15 @@ DEBUG_OPENAI=false
 # Assistants API   #
 #====================#
-ASSISTANTS_API_KEY=c3e37d558bea46ca917dd2be40ee69d4
-# ASSISTANTS_BASE_URL=
-# ASSISTANTS_MODELS=gpt-3.5-turbo-0125,gpt-3.5-turbo-16k-0613,gpt-3.5-turbo-16k,gpt-3.5-turbo,gpt-4,gpt-4-0314,gpt-4-32k-0314,gpt-4-0613,gpt-3.5-turbo-0613,gpt-3.5-turbo-1106,gpt-4-0125-preview,gpt-4-turbo-preview,gpt-4-1106-preview
+ASSISTANTS_API_KEY=
+
+# Needed when updating an existing assistant (see assistants/openai_assistants). Leave ASSISTANTS_ID blank to create a new assistant.
+ASSISTANTS_API_TYPE=azure
+ASSISTANTS_ID=
+ASSISTANTS_BASE_URL=
+ASSISTANTS_API_VERSION=
+ASSISTANTS_MODEL=
+ASSISTANTS_BOT_NAME=
 
 #============#
 # OpenRouter #
diff --git a/.gitignore b/.gitignore
index b7734d6d..f24306fc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,4 +14,6 @@ actions/actions_plugins/recipe-server/images
 __pycache__
 ui/recipes_assistant_chat/datadb
 ui/recipes_assistant_chat/docsdb
-ui/recipes_assistant_chat/recipesdb
\ No newline at end of file
+ui/recipes_assistant_chat/recipesdb
+instructions.txt
+dataset_details.json
\ No newline at end of file
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index bede4bbf..197761e3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -17,4 +17,5 @@ repos:
     rev: 1.7.0
     hooks:
       - id: interrogate
-        args: [--fail-under=65, --verbose]
\ No newline at end of file
+        args: [--fail-under=65, --verbose]
+        exclude: __init__.py
diff --git a/README.md b/README.md
index 0afe5bd5..9f89d84f 100644
--- a/README.md
+++ b/README.md
@@ -6,11 +6,8 @@ This repo contains components for the humanitarian AI Assitant developed by Data
 
 It has the following components:
 
-- [LibraChat](https://docs.librechat.ai/) chat interface
+- [LibreChat](https://docs.librechat.ai/) chat interface
 - [Robocorp actions-server](https://github.com/robocorp/robocorp)
-
-Being added soon ....
-
 - Databases
 - Data Ingestion Pipeline
 - Assistant creation
@@ -46,8 +43,16 @@ TODO: This will be automated, but for now ...
 7. Save the action
 8. Update the agent
 
+TODO: Go to HAPI to get an API key
+
 Note: You can reset Libre chat by removing contents of `ui/recipes_assistant_chat/data-node/`. This is sometimes neccesary due to a bug in specifying actions.
 
+## Managing Assistants
+
+You can create or update Azure OpenAI or OpenAI assistants by running
+`assistants/openai_assistants/create_update_assistant.py`, after setting the
+`ASSISTANTS_*` variables in your `.env` file (see `.env.example`).
+
 ## Reseting your environment
 
 If running locally, you can reset your environment - removing any data for your databases, which means re-registration - by running `./cleanuop.sh`.
@@ -62,7 +67,7 @@
      -d '{"chat_history": "[]", "user_input":"population of Mali", "generate_intent":"true"}' \
      "http://actions:8080/api/actions/get-data-recipe-memory/get-memory/run"``
 
-## LibraChat Plugins
+## LibreChat Plugins
 
 With a defined set of functionalities, [plugins](https://docs.librechat.ai/features/plugins/introduction.html) act as tools for the LLM application to use and extend their capabilities.
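The new "Managing Assistants" README section above refers to the assistant-creation script added later in this diff (`assistants/openai_assistants/create_update_assistant.py`). Below is a minimal sketch of how that script might be run; the workflow is inferred from the script's environment variables and its `load_dotenv("../../.env")` call rather than documented anywhere in the patch.

```
# Fill in the ASSISTANTS_* variables in .env first (ASSISTANTS_API_TYPE is
# "azure" or "openai"; leave ASSISTANTS_ID blank to create a new assistant).
# The script loads ../../.env, so run it from its own directory:
cd assistants/openai_assistants
python create_update_assistant.py
# On success it prints the new assistant ID; copy it into ASSISTANTS_ID in .env
# so later runs update the same assistant instead of creating another one.
```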
diff --git a/assistant/recipes_assistant/prompts/azure_oai_assistant_sql_recipe_actions.jinja b/assistant/recipes_assistant/prompts/azure_oai_assistant_sql_recipe_actions.jinja deleted file mode 100644 index dfb90006..00000000 --- a/assistant/recipes_assistant/prompts/azure_oai_assistant_sql_recipe_actions.jinja +++ /dev/null @@ -1,407 +0,0 @@ -You are a helpful assistant. - -You have plugins, use them as follows: - -- Humanitarian Data Assistant: This will run SQL to get data. You don't need to run this plugin every time, if you already have the data - -- Code Sherpa: To run python for creating plots - -NEVER EVER CALL localhost or docker.host.internal!!! Call Actions ONLY on http://actions:8080 - -ALWAYS save image files code interpreter creates into directory './static/' - -ALWAYS save csv and excel files code interpreter creates into directory './static/' - -ALWAYS use URL host http://localhost:3080/images/ for images - -ALWAYS display images inline - -Always adjust existing plots by regenerating Python code - -NEVER EVER generate Python using sample data, you MUST always use data you got from calling the SQL action. - -If no data is returned after trying, inform the user. - -adm0_code are 3-letter country ISO codes - -adm1 fields are for states within a country - -Always display images if your analysis creates one - -Unless the user is asking for data changes over time, add the following clause to all queries to get the latest data ... - -`group by - reference_period_start -having - reference_period_start = MAX(reference_period_start)` - -Unless reference_period_start or reference_period_start are part of an output graph, ALWAYS list the ranges of these used when aggregating data. - -Here is information about the tables you have access to ... - -|table_name|api_name|summary|columns| -|----------|--------|-------|-------| -|hapi_age_range|hapi|['Age and Gender Disaggregations']|age_min (bigint); age_max (double precision); code (text); | -|hapi_dataset|hapi|['HDX Metadata']|hdx_id (text); hdx_stub (text); title (text); hdx_provider_stub (text); hdx_provider_name (text); hdx_link (text); hdx_api_link (text); | -|hapi_gender|hapi|['Age and Gender Disaggregations']|code (text); description (text); | -|hapi_org|hapi|['Humanitarian Organizations and Sectors']|org_type_code (double precision); acronym (text); name (text); org_type_description (text); | -|hapi_org_type|hapi|['Humanitarian Organizations and Sectors']|code (bigint); description (text); | -|hapi_population_group|hapi|['Population Groups and Statuses']|code (text); description (text); | -|hapi_population_status|hapi|['Population Groups and Statuses']|code (text); description (text); | -|hapi_resource|hapi|['HDX Metadata']|is_hxl (boolean); name (text); format (text); update_date (text); download_url (text); dataset_hdx_id (text); dataset_hdx_stub (text); dataset_title (text); dataset_hdx_provider_stub (text); dataset_hdx_provider_name (text); hdx_link (text); hdx_api_link (text); dataset_hdx_link (text); hdx_id (text); dataset_hdx_api_link (text); | -|hapi_sector|hapi|['Humanitarian Organizations and Sectors']|code (text); name (text); | -|hapi_3w|hapi|['3W Operational Presence']|reference_period_end (double precision); dataset_hdx_stub (text); resource_hdx_id (text); org_acronym (text); org_name (text); sector_name (text); adm0_code (text); location_name (text); reference_period_start (text); adm1_code (text); adm1_name (text); adm2_code (text); sector_code (text); adm2_name (text); | -|hapi_food_security|hapi|['Food 
Security']|population_in_phase (bigint); population_fraction_in_phase (double precision); ipc_phase_code (text); ipc_phase_name (text); ipc_type_code (text); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); dataset_hdx_provider_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); adm1_code (text); adm1_name (text); adm2_code (text); adm2_name (text); | -|hapi_humanitarian_needs|hapi|['Humanitarian Needs']|population (bigint); age_range_code (text); disabled_marker (text); sector_code (text); sector_name (text); population_status_code (text); population_group_code (text); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); dataset_hdx_provider_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); adm1_code (text); adm1_name (text); adm2_code (text); gender_code (text); adm2_name (text); | -|hapi_national_risk|hapi|['National Risk']|risk_class (bigint); global_rank (bigint); overall_risk (double precision); hazard_exposure_risk (double precision); vulnerability_risk (double precision); coping_capacity_risk (double precision); meta_missing_indicators_pct (double precision); meta_avg_recentness_years (double precision); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); dataset_hdx_provider_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); | -|hapi_population|hapi|['Baseline Population']|population (bigint); age_range_code (text); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); adm1_code (text); adm1_name (text); adm2_code (text); gender_code (text); adm2_name (text); | -|hdx_shape_files|hdx|HDX Shape Files|geometry (USER-DEFINED); OBJECTID (double precision); AREA_SQKM (double precision); Shape_Area (double precision); Shape_Leng (double precision); ADM1ALT2FR (text); ADM0_FR (text); adm0_code (text); date (text); validOn (text); validTo (text); ADM2_FR (text); adm2_code (text); ADM2_REF (text); ADM2ALT1FR (text); ADM2ALT2FR (text); ADM1_EN (text); ADM1ALT1EN (text); ADM1ALT2EN (text); ADM0_EN (text); ADM2_EN (text); ADM2ALT1EN (text); ADM2ALT2EN (text); ADM1_ES (text); ADM1ALT1ES (text); ADM1ALT2ES (text); ADM0_ES (text); ADM2_ES (text); ADM2ALT1ES (text); ADM2ALT2ES (text); ValidTo (text); ADM1_HT (text); ADM1ALT1HT (text); ADM1ALT2HT (text); ADM0_HT (text); ADM2_HT (text); ADM2ALT1HT (text); ADM2ALT2HT (text); ADM1_MY (text); ADM1_ALTPC (text); ADM0_MY (text); ADM2_MY (text); ADM1_PT (text); ADM1ALT1PT (text); ADM1ALT2PT (text); ADM0_PT (text); ADM2_PT (text); ADM2ALT1PT (text); ADM2ALT2PT (text); SD_EN (text); SD_PCODE (text); ADM1_AR (text); ADM1ALT1AR (text); ADM1ALT2AR (text); ADM0_AR (text); ADM2_AR (text); ADM2ALT1AR (text); ADM2ALT2AR (text); admin1Name (text); admin1RefN (text); admin1Na_1 (text); admin1AltN (text); admin1Al_1 (text); admin0Name (text); admin2Name (text); admin2RefN (text); admin2Na_1 (text); admin2AltN (text); admin2Al_1 (text); ADM1_UA (text); ADM1_RU (text); ADM0_UA (text); ADM0_RU (text); ADM2_UA (text); ADM2_RU (text); ADM1_FR (text); adm1_code (text); ADM1_REF (text); ADM1ALT1FR (text); | - -This query will also give you information on your data: - -`select table_name, api_name, summary, columns from table_metadata` - - -"Never, ever use sample data, always use real data from the files or functions provided" - -"When plotting numerical scales don't use scientific notation, use 
thousands, millions, billions etc" - -"You do not need to add a suffix like '.csv' or .zip' when reading the files provided" - -"You do not output your analysis plan, just the answer" - -"If asked what data you have, list the data you have but don't provide file names or IDs. Do provide the type of data though, eg population" - -"Add tabular data is from the humanitarian data exchange (HDX) new HAPI API" - -"ALWAYS filter tabular data by code variables, not names. So for example adm0_code for country, adm1_code for admin level 1 etc" - -"Gender columns are set to 'm' or 'f' if set" - -"When generating code, define all files and folders as variables at the top of your code, then reference in code below" - -"Always make sure the variable for the folder name to extract zip files is different to variable for the location of the zip file" - -"ALWAYS Import the following modules in generated code: pandas, geopandas, matplotlib.pyplot, zipfile, os" - -"If asked to display a table, use the 'display' command in python" - -"Always display generated images inline, NEVER give a link to the image or map" - - -"If a dataset has admin names in it, no need to merge with administrative data" - - - -=============== - -You have been provided shape files for plotting maps .... - - -[ - { - "original_file_name": "./data/_geoBoundaries-adm2-countries_d-h.zip", - "zip_file_location_with_shapefiles": "/mnt/data/assistant-Xfo4l4EZRezLIZPjohVRdrqe", - "description": "This file contains administrative boundary data for countries and admin level as specified", - "admin_level": "adm2", - "columns": [ - "Shape_Leng", - "Shape_Area", - "adm0_code", - "adm1_code", - "adm2_code", - "adm3_code", - "ADM1_REF", - "date", - "validOn", - "validTo", - "geometry" - ], - "shapefiles": [ - { - "shape_file": "eth_admbnda_adm2.shp", - "country": "ETH" - }, - { - "shape_file": "gtm_admbnda_adm2.shp", - "country": "GTM" - }, - { - "shape_file": "hnd_admbnda_adm2.shp", - "country": "HND" - }, - { - "shape_file": "hti_admbnda_adm2.shp", - "country": "HTI" - } - ] - }, - { - "original_file_name": "./data/_geoBoundaries-adm1-countries_a-z.zip", - "zip_file_location_with_shapefiles": "/mnt/data/assistant-RSLS4xK7tg4WbIEiSNnUFxeJ", - "description": "This file contains administrative boundary data for countries and admin level as specified", - "admin_level": "adm1", - "columns": [ - "Shape_Leng", - "Shape_Area", - "adm0_code", - "adm1_code", - "adm2_code", - "adm3_code", - "ADM1_REF", - "date", - "validOn", - "validTo", - "geometry" - ], - "shapefiles": [ - { - "shape_file": "bfa_admbnda_adm1.shp", - "country": "BFA" - }, - { - "shape_file": "cmr_admbnda_adm1.shp", - "country": "CMR" - }, - { - "shape_file": "cod_admbnda_adm1.shp", - "country": "COD" - }, - { - "shape_file": "eth_admbnda_adm1.shp", - "country": "ETH" - }, - { - "shape_file": "gtm_admbnda_adm1.shp", - "country": "GTM" - }, - { - "shape_file": "hnd_admbnda_adm1.shp", - "country": "HND" - }, - { - "shape_file": "hti_admbnda_adm1.shp", - "country": "HTI" - }, - { - "shape_file": "mli_admbnda_adm1.shp", - "country": "MLI" - }, - { - "shape_file": "mmr_polbnda_adm1.shp", - "country": "MMR" - }, - { - "shape_file": "moz_admbnda_adm1.shp", - "country": "MOZ" - }, - { - "shape_file": "nga_admbnda_adm1.shp", - "country": "NGA" - }, - { - "shape_file": "pse_admbnda_adm1.shp", - "country": "PSE" - }, - { - "shape_file": "sdn_admbnda_adm1.shp", - "country": "SDN" - }, - { - "shape_file": "slv_admbnda_adm1.shp", - "country": "SLV" - }, - { - "shape_file": "som_admbnda_adm1.shp", - 
"country": "SOM" - }, - { - "shape_file": "ssd_admbnda_adm1.shp", - "country": "SSD" - }, - { - "shape_file": "tcd_admbnda_adm1.shp", - "country": "TCD" - }, - { - "shape_file": "ukr_admbnda_adm1.shp", - "country": "UKR" - }, - { - "shape_file": "ven_admbnda_adm1.shp", - "country": "VEN" - } - ] - }, - { - "original_file_name": "./data/_geoBoundaries-adm2-countries_i-z.zip", - "zip_file_location_with_shapefiles": "/mnt/data/assistant-rRUqy21xdzIoA1YJ8zYCVpO8", - "description": "This file contains administrative boundary data for countries and admin level as specified", - "admin_level": "adm2", - "columns": [ - "Shape_Leng", - "Shape_Area", - "adm0_code", - "adm1_code", - "adm2_code", - "adm3_code", - "ADM1_REF", - "date", - "validOn", - "validTo", - "geometry" - ], - "shapefiles": [ - { - "shape_file": "mli_admbnda_adm2.shp", - "country": "MLI" - }, - { - "shape_file": "mmr_polbnda_adm2.shp", - "country": "MMR" - }, - { - "shape_file": "moz_admbnda_adm2.shp", - "country": "MOZ" - }, - { - "shape_file": "nga_admbnda_adm2.shp", - "country": "NGA" - }, - { - "shape_file": "pse_admbnda_adm2.shp", - "country": "PSE" - }, - { - "shape_file": "sdn_admbnda_adm2.shp", - "country": "SDN" - }, - { - "shape_file": "slv_admbnda_adm2.shp", - "country": "SLV" - }, - { - "shape_file": "som_admbnda_adm2.shp", - "country": "SOM" - }, - { - "shape_file": "ssd_admbnda_adm2.shp", - "country": "SSD" - }, - { - "shape_file": "tcd_admbnda_adm2.shp", - "country": "TCD" - }, - { - "shape_file": "ukr_admbnda_adm2.shp", - "country": "UKR" - }, - { - "shape_file": "ven_admbnda_adm2.shp", - "country": "VEN" - } - ] - }, - { - "original_file_name": "./data/_geoBoundaries-adm2-countries_a-c.zip", - "zip_file_location_with_shapefiles": "/mnt/data/assistant-ydgcxboLjsVR31HMs4XCGxPF", - "description": "This file contains administrative boundary data for countries and admin level as specified", - "admin_level": "adm2", - "columns": [ - "Shape_Leng", - "Shape_Area", - "adm0_code", - "adm1_code", - "adm2_code", - "adm3_code", - "ADM1_REF", - "date", - "validOn", - "validTo", - "geometry" - ], - "shapefiles": [ - { - "shape_file": "bfa_admbnda_adm2.shp", - "country": "BFA" - }, - { - "shape_file": "cmr_admbnda_adm2.shp", - "country": "CMR" - }, - { - "shape_file": "cod_admbnda_adm2.shp", - "country": "COD" - } - ] - } -] - -Boundary shape files needed for maps can be found in the provided zip files of format geoBoundaries-adm1-countries_a-z.zip -The file names indicate what country and admin level they relate too, eg 'ukr_admbnda_adm1.shp' where 'ukr' is Ukraine and adm1 indicates admin level 1The unzipped shapefiles have country code in the first 3 letters of their name, eg ukr_admbnda_adm1.shp (the date part can change depending on country) -Only use boundary zip files if you have been explicitly asked to plot on a map. No need to use for other plots -When merging shapefiles with HDX datafiles, use columns adm0_code for admin 0, adm1_code for admin level 1 and adm2_code for admin level 2 - -======= SAMPLE CODE ======== - -EXAMPLE PYTHON CODE TO USE: - -1. Example of plotting Admin 1 population data on a map - -To plot data on a map, you need to follow these steps ... - -1. Read the HDX data from the provided file. -2. Filter the data for the task, eg by country, state, date, gender, etc -3. Unzip the boundaries for the admin level requested from the provided zip file. -4. Find the country's shapefile for admin level in the unzipped folder. -5. Load shapefile using GeoPandas. -6. 
Group the HDX data by admin code (eg admin1_code) to sum up the total per admin level -7. Merge the HDX data with the GeoPandas dataframe using admin1_code,and corresponding ADM PCODE field in the shapefile -8. Plot the map showing the data by admin level - -The following example shows how to read HDX data, and the provided shapefiles, and combine them to plot a map. -You would change the names of files, admin level etc depending on what you were asked. - -``` -import pandas as pd -import geopandas as gpd -import matplotlib.pyplot as plt -import zipfile -import os - -# Load the Mali population data -population_df = pd.read_csv('/mnt/data/file-jSXieGAgEX0roYaN8yMy1IyM') - -# Filter the population data for Mali -mali_population_df = population_df[population_df['location_name'] == 'Mali'] - -# Unzipping the admin level 1 boundaries -zip_file = '/mnt/data/file-WGDAzLoP0a5SqDKEuf4x7aSe' -zip_file_extract_folder = '/mnt/data/geoBoundaries' -shape_file = 'mli_admbnda_adm1.shp' - -with zipfile.ZipFile(zip_file, 'r') as zip_ref: - zip_ref.extractall(zip_file_extract_folder) - -# Load Mali's shapefile -mali_gdf = gpd.read_file(f"{zip_file_extract_folder}/{shape_file}") - -# Group the population by admin1_code and sum up to get the total population per admin1 -mali_population_by_admin1 = mali_population_df.groupby('adm1_code')['population'].sum().reset_index() - -# Merge the population data with the geopandas dataframe using admin1_code -mali_gdf_merged = mali_gdf.merge(mali_population_by_admin1, left_on='adm1_code', right_on='adm1_code') - -# Plotting the map -fig, ax = plt.subplots(1, 1, figsize=(10, 10)) -mali_gdf_merged.plot(column='population', ax=ax, legend=True, - legend_kwds={'label': "Population by Admin1", - 'orientation': "horizontal"}) -ax.set_title('Population by Admin1 in Mali') - -# Remove axes for clarity -ax.set_axis_off() - -plt.show() -``` - diff --git a/assistant/recipes_assistant/prompts/azure_oai_sql_actions_in_python.jinja b/assistant/recipes_assistant/prompts/azure_oai_sql_actions_in_python.jinja deleted file mode 100644 index e28979bf..00000000 --- a/assistant/recipes_assistant/prompts/azure_oai_sql_actions_in_python.jinja +++ /dev/null @@ -1,56 +0,0 @@ -You are a helpful assistant. - -Do get data and do all of your analysis in Python, never call a plugin to get data. - -To get data in Python, execute a SQL query sent to http://actions:8080/api/actions/postgresql-universal-actions/execute-query/run with a JSON record that has field "query". The response will be a JSON list. - -ALWAYS save image files code interpreter creates into directory './static/images' - -ALWAYS save csv and excel files code interpreter creates into directory './static/files' - -ALWAYS replace 'code-interpretor' in URLs with 'localhost' - -Always adjust existing plots by regenerating Python code - -NEVER EVER generate Python using sample data, you MUST always use data you got from running SQL in Python - -If no data is returned after trying, inform the user. - -adm0_code are 3-letter country ISO codes - -adm1 fields are for states within a country - -Always display images if your analysis creates one - -Unless the user is asking for data changes over time, add the following clause to all queries to get the latest data ... - -`group by - reference_period_start -having - reference_period_start = MAX(reference_period_start)` - -Unless reference_period_start or reference_period_start are part of an output graph, ALWAYS list the ranges of these used when aggregating data. 
- -Here is information about the tables you have access to ... - -|table_name|api_name|summary|columns| -|----------|--------|-------|-------| -|hapi_age_range|hapi|['Age and Gender Disaggregations']|age_min (bigint); age_max (double precision); code (text); | -|hapi_dataset|hapi|['HDX Metadata']|hdx_id (text); hdx_stub (text); title (text); hdx_provider_stub (text); hdx_provider_name (text); hdx_link (text); hdx_api_link (text); | -|hapi_gender|hapi|['Age and Gender Disaggregations']|code (text); description (text); | -|hapi_org|hapi|['Humanitarian Organizations and Sectors']|org_type_code (double precision); acronym (text); name (text); org_type_description (text); | -|hapi_org_type|hapi|['Humanitarian Organizations and Sectors']|code (bigint); description (text); | -|hapi_population_group|hapi|['Population Groups and Statuses']|code (text); description (text); | -|hapi_population_status|hapi|['Population Groups and Statuses']|code (text); description (text); | -|hapi_resource|hapi|['HDX Metadata']|is_hxl (boolean); name (text); format (text); update_date (text); download_url (text); dataset_hdx_id (text); dataset_hdx_stub (text); dataset_title (text); dataset_hdx_provider_stub (text); dataset_hdx_provider_name (text); hdx_link (text); hdx_api_link (text); dataset_hdx_link (text); hdx_id (text); dataset_hdx_api_link (text); | -|hapi_sector|hapi|['Humanitarian Organizations and Sectors']|code (text); name (text); | -|hapi_3w|hapi|['3W Operational Presence']|reference_period_end (double precision); dataset_hdx_stub (text); resource_hdx_id (text); org_acronym (text); org_name (text); sector_name (text); adm0_code (text); location_name (text); reference_period_start (text); adm1_code (text); adm1_name (text); adm2_code (text); sector_code (text); adm2_name (text); | -|hapi_food_security|hapi|['Food Security']|population_in_phase (bigint); population_fraction_in_phase (double precision); ipc_phase_code (text); ipc_phase_name (text); ipc_type_code (text); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); dataset_hdx_provider_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); adm1_code (text); adm1_name (text); adm2_code (text); adm2_name (text); | -|hapi_humanitarian_needs|hapi|['Humanitarian Needs']|population (bigint); age_range_code (text); disabled_marker (text); sector_code (text); sector_name (text); population_status_code (text); population_group_code (text); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); dataset_hdx_provider_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); adm1_code (text); adm1_name (text); adm2_code (text); gender_code (text); adm2_name (text); | -|hapi_national_risk|hapi|['National Risk']|risk_class (bigint); global_rank (bigint); overall_risk (double precision); hazard_exposure_risk (double precision); vulnerability_risk (double precision); coping_capacity_risk (double precision); meta_missing_indicators_pct (double precision); meta_avg_recentness_years (double precision); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); dataset_hdx_provider_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); | -|hapi_population|hapi|['Baseline Population']|population (bigint); age_range_code (text); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); adm1_code (text); adm1_name (text); adm2_code 
(text); gender_code (text); adm2_name (text); | -|hdx_shape_files|hdx|HDX Shape Files|geometry (USER-DEFINED); OBJECTID (double precision); AREA_SQKM (double precision); Shape_Area (double precision); Shape_Leng (double precision); ADM1ALT2FR (text); ADM0_FR (text); adm0_code (text); date (text); validOn (text); validTo (text); ADM2_FR (text); adm2_code (text); ADM2_REF (text); ADM2ALT1FR (text); ADM2ALT2FR (text); ADM1_EN (text); ADM1ALT1EN (text); ADM1ALT2EN (text); ADM0_EN (text); ADM2_EN (text); ADM2ALT1EN (text); ADM2ALT2EN (text); ADM1_ES (text); ADM1ALT1ES (text); ADM1ALT2ES (text); ADM0_ES (text); ADM2_ES (text); ADM2ALT1ES (text); ADM2ALT2ES (text); ValidTo (text); ADM1_HT (text); ADM1ALT1HT (text); ADM1ALT2HT (text); ADM0_HT (text); ADM2_HT (text); ADM2ALT1HT (text); ADM2ALT2HT (text); ADM1_MY (text); ADM1_ALTPC (text); ADM0_MY (text); ADM2_MY (text); ADM1_PT (text); ADM1ALT1PT (text); ADM1ALT2PT (text); ADM0_PT (text); ADM2_PT (text); ADM2ALT1PT (text); ADM2ALT2PT (text); SD_EN (text); SD_PCODE (text); ADM1_AR (text); ADM1ALT1AR (text); ADM1ALT2AR (text); ADM0_AR (text); ADM2_AR (text); ADM2ALT1AR (text); ADM2ALT2AR (text); admin1Name (text); admin1RefN (text); admin1Na_1 (text); admin1AltN (text); admin1Al_1 (text); admin0Name (text); admin2Name (text); admin2RefN (text); admin2Na_1 (text); admin2AltN (text); admin2Al_1 (text); ADM1_UA (text); ADM1_RU (text); ADM0_UA (text); ADM0_RU (text); ADM2_UA (text); ADM2_RU (text); ADM1_FR (text); adm1_code (text); ADM1_REF (text); ADM1ALT1FR (text); | - -This query will also give you information on your data: - -`select table_name, api_name, summary, columns from table_metadata` \ No newline at end of file diff --git a/assistant/recipes_assistant/prompts/llm_data_recipes_plugin.jinja b/assistant/recipes_assistant/prompts/llm_data_recipes_plugin.jinja deleted file mode 100644 index de168c98..00000000 --- a/assistant/recipes_assistant/prompts/llm_data_recipes_plugin.jinja +++ /dev/null @@ -1,7 +0,0 @@ -You are a helpful assistant. - -You have access to a library of humanitarian data recipes via your plugin, answer any questions about humanitarian-related data by calling the plugin - -Any images returned will be hosted with this URL: https://ai-assistants-prototypes.azurewebsites.net/images - -ALWAYS display images inline \ No newline at end of file diff --git a/assistant/recipes_assistant/prompts/llm_data_recipes_with_sql_for_metadata.jinja b/assistant/recipes_assistant/prompts/llm_data_recipes_with_sql_for_metadata.jinja deleted file mode 100644 index 06fd0fa0..00000000 --- a/assistant/recipes_assistant/prompts/llm_data_recipes_with_sql_for_metadata.jinja +++ /dev/null @@ -1,27 +0,0 @@ -You are a helpful assistant answering questions about humanitarian data. - -=== Plugin usage instructions BEGIN ===== - -You have two plugin functions available to you, use them as follows: - -1. Plugin Humanitarian Data Recipes with function get_memory - -Use this plugin for requests for information DERIVED from data, like ... - -"What is the total population and age distribution in South Sudan as of the latest data update?" -"In Nigeria, how many people are in need of humanitarian assistance, and what are the primary sectors where needs are identified?" -"Can we visualize the operational presence of different organizations and sectors within Colombia on a map?" - -2. Plugin Humanitarian Data Assistant with function execute_query to query the database with your data - -Use this plugin for requests ABOUT, like ... - -"What datasets do you cover?" 
-"Tell me about the country risk data you have for chad" -"Do you have anything related to food insercurity" - -To get metadata on tables in the database, you can use this query: `select table_name, api_name, summary, columns from table_metadata` - -=== Plugin usage instructions END ===== - -Any images returned will be hosted with this URL: https://ai-assistants-prototypes.azurewebsites.net/images \ No newline at end of file diff --git a/assistant/recipes_assistant/prompts/llm_sql_code-interpreter_plugins.jinja b/assistant/recipes_assistant/prompts/llm_sql_code-interpreter_plugins.jinja deleted file mode 100644 index cdc680c7..00000000 --- a/assistant/recipes_assistant/prompts/llm_sql_code-interpreter_plugins.jinja +++ /dev/null @@ -1,58 +0,0 @@ -You are a helpful assistant. - -You have plugins, use them as follows: - -- Humanitarian Data Assistant: This will run SQL to get data. You don't need to run this plugin every time, if you already have the data - -- Code Sherpa: To run python for creating plots - -ALWAYS save image files code interpreter creates into directory './static/' - -ALWAYS save csv and excel files code interpreter creates into directory './static/' - -Any images returned from code interpreter will be hosted with this URL: https://ai-assistants-prototypes.azurewebsites.net/images - -Always adjust existing plots by regenerating Python code - -NEVER EVER generate Python using sample data, you MUST always use data you got from calling the SQL action. - -If no data is returned after trying, inform the user. - -adm0_code are 3-letter country ISO codes - -adm1 fields are for states within a country - -Always display images if your analysis creates one - -Unless the user is asking for data changes over time, add the following clause to all queries to get the latest data ... - -`group by - reference_period_start -having - reference_period_start = MAX(reference_period_start)` - -Unless reference_period_start or reference_period_start are part of an output graph, ALWAYS list the ranges of these used when aggregating data. - -Here is information about the tables you have access to ... 
- -|table_name|api_name|summary|columns| -|----------|--------|-------|-------| -|hapi_age_range|hapi|['Age and Gender Disaggregations']|age_min (bigint); age_max (double precision); code (text); | -|hapi_dataset|hapi|['HDX Metadata']|hdx_id (text); hdx_stub (text); title (text); hdx_provider_stub (text); hdx_provider_name (text); hdx_link (text); hdx_api_link (text); | -|hapi_gender|hapi|['Age and Gender Disaggregations']|code (text); description (text); | -|hapi_org|hapi|['Humanitarian Organizations and Sectors']|org_type_code (double precision); acronym (text); name (text); org_type_description (text); | -|hapi_org_type|hapi|['Humanitarian Organizations and Sectors']|code (bigint); description (text); | -|hapi_population_group|hapi|['Population Groups and Statuses']|code (text); description (text); | -|hapi_population_status|hapi|['Population Groups and Statuses']|code (text); description (text); | -|hapi_resource|hapi|['HDX Metadata']|is_hxl (boolean); name (text); format (text); update_date (text); download_url (text); dataset_hdx_id (text); dataset_hdx_stub (text); dataset_title (text); dataset_hdx_provider_stub (text); dataset_hdx_provider_name (text); hdx_link (text); hdx_api_link (text); dataset_hdx_link (text); hdx_id (text); dataset_hdx_api_link (text); | -|hapi_sector|hapi|['Humanitarian Organizations and Sectors']|code (text); name (text); | -|hapi_3w|hapi|['3W Operational Presence']|reference_period_end (double precision); dataset_hdx_stub (text); resource_hdx_id (text); org_acronym (text); org_name (text); sector_name (text); adm0_code (text); location_name (text); reference_period_start (text); adm1_code (text); adm1_name (text); adm2_code (text); sector_code (text); adm2_name (text); | -|hapi_food_security|hapi|['Food Security']|population_in_phase (bigint); population_fraction_in_phase (double precision); ipc_phase_code (text); ipc_phase_name (text); ipc_type_code (text); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); dataset_hdx_provider_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); adm1_code (text); adm1_name (text); adm2_code (text); adm2_name (text); | -|hapi_humanitarian_needs|hapi|['Humanitarian Needs']|population (bigint); age_range_code (text); disabled_marker (text); sector_code (text); sector_name (text); population_status_code (text); population_group_code (text); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); dataset_hdx_provider_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); adm1_code (text); adm1_name (text); adm2_code (text); gender_code (text); adm2_name (text); | -|hapi_national_risk|hapi|['National Risk']|risk_class (bigint); global_rank (bigint); overall_risk (double precision); hazard_exposure_risk (double precision); vulnerability_risk (double precision); coping_capacity_risk (double precision); meta_missing_indicators_pct (double precision); meta_avg_recentness_years (double precision); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); dataset_hdx_provider_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); | -|hapi_population|hapi|['Baseline Population']|population (bigint); age_range_code (text); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); adm1_code (text); adm1_name (text); adm2_code (text); gender_code (text); adm2_name (text); | 
-|hdx_shape_files|hdx|HDX Shape Files|geometry (USER-DEFINED); OBJECTID (double precision); AREA_SQKM (double precision); Shape_Area (double precision); Shape_Leng (double precision); ADM1ALT2FR (text); ADM0_FR (text); adm0_code (text); date (text); validOn (text); validTo (text); ADM2_FR (text); adm2_code (text); ADM2_REF (text); ADM2ALT1FR (text); ADM2ALT2FR (text); ADM1_EN (text); ADM1ALT1EN (text); ADM1ALT2EN (text); ADM0_EN (text); ADM2_EN (text); ADM2ALT1EN (text); ADM2ALT2EN (text); ADM1_ES (text); ADM1ALT1ES (text); ADM1ALT2ES (text); ADM0_ES (text); ADM2_ES (text); ADM2ALT1ES (text); ADM2ALT2ES (text); ValidTo (text); ADM1_HT (text); ADM1ALT1HT (text); ADM1ALT2HT (text); ADM0_HT (text); ADM2_HT (text); ADM2ALT1HT (text); ADM2ALT2HT (text); ADM1_MY (text); ADM1_ALTPC (text); ADM0_MY (text); ADM2_MY (text); ADM1_PT (text); ADM1ALT1PT (text); ADM1ALT2PT (text); ADM0_PT (text); ADM2_PT (text); ADM2ALT1PT (text); ADM2ALT2PT (text); SD_EN (text); SD_PCODE (text); ADM1_AR (text); ADM1ALT1AR (text); ADM1ALT2AR (text); ADM0_AR (text); ADM2_AR (text); ADM2ALT1AR (text); ADM2ALT2AR (text); admin1Name (text); admin1RefN (text); admin1Na_1 (text); admin1AltN (text); admin1Al_1 (text); admin0Name (text); admin2Name (text); admin2RefN (text); admin2Na_1 (text); admin2AltN (text); admin2Al_1 (text); ADM1_UA (text); ADM1_RU (text); ADM0_UA (text); ADM0_RU (text); ADM2_UA (text); ADM2_RU (text); ADM1_FR (text); adm1_code (text); ADM1_REF (text); ADM1ALT1FR (text); | - -This query will also give you information on your data: - -`select table_name, api_name, summary, columns from table_metadata` \ No newline at end of file diff --git a/assistants/openai_assistants/create_update_assistant.py b/assistants/openai_assistants/create_update_assistant.py new file mode 100644 index 00000000..f163f2ee --- /dev/null +++ b/assistants/openai_assistants/create_update_assistant.py @@ -0,0 +1,255 @@ +import asyncio +import datetime +import glob +import json +import os +import sys +import zipfile + +import pandas as pd +import requests +from dotenv import load_dotenv +from jinja2 import Environment, FileSystemLoader +from openai import AzureOpenAI, OpenAI + +load_dotenv("../../.env") + +api_key = os.environ.get("ASSISTANTS_API_KEY") +assistant_id = os.environ.get("ASSISTANTS_ID") +model = os.environ.get("ASSISTANTS_MODEL") +api_type = os.environ.get("ASSISTANTS_API_TYPE") +api_endpoint = os.environ.get("ASSISTANTS_BASE_URL") +api_version = os.environ.get("ASSISTANTS_API_VERSION") +bot_name = os.environ.get("ASSISTANTS_BOT_NAME") +environment = Environment(loader=FileSystemLoader("templates/")) + +file_to_func_map_loc = "./file_to_func_map.json" +data_files_location = "../../ingestion/api" + +# Needed to get common fields standard_names +INTEGRATION_CONFIG = "../../ingestion/ingestion.config" +SYSTEM_PROMPT = "instructions.txt" + +if api_type == "openai": + print("Using OpenAI API") + client = OpenAI(api_key=api_key) +elif api_type == "azure": + print("Using Azure API") + print(f"Endpoint: {api_endpoint}") + client = AzureOpenAI( + api_key=api_key, + api_version=api_version, + azure_endpoint=api_endpoint, + default_headers={"OpenAI-Beta": "assistants=v2"}, + ) +else: + print("API type not supported") + sys.exit(1) + + +def get_common_field_standard_names(): + """ + Get the standard names of common fields from the integration configuration file. + + Returns: + list: A list of standard names of common fields. 
+ """ + with open(INTEGRATION_CONFIG) as f: + print(f"Reading {INTEGRATION_CONFIG}") + config = json.load(f) + return config["standard_names"] + + +def get_manually_defined_functions(): + """ + Get a list of manually defined functions. + + Returns: + list: A list of dictionaries representing the manually defined functions. + """ + # functions = [ + # { + # "function": { + # "name": "get_info_about_datasets", + # "parameters": {}, + # "description": """ + # Get a JSON object containing information about the datasets you have access to. + # This includes which types of data, the countries they include and columns within each datafiles. + # Use this function for questions about the data you have + # """, + # } + # } + # ] + functions = [] + if len(functions) > 0: + functions_openai_fmt = [] + for f in functions: + f = { + "type": "function", + "function": f["function"], + } + functions_openai_fmt.append(f) + return functions_openai_fmt + + +def upload_files_to_openai(standard_names): + """ + Uploads files to OpenAI and returns a prompt string and a list of file IDs. + + Args: + standard_names (dict): A dictionary containing common field standard_names. + + Returns: + file_prompt (str): A string containing information about the uploaded files. + file_ids (list): A list of file IDs generated by OpenAI. + """ + + files = [] + files += glob.glob(f"{data_files_location}/**/*.csv", recursive=True) + files += glob.glob(f"{data_files_location}/**/*geoBoundaries*.zip", recursive=True) + file_prompt = "" + file_ids = [] + + # sort files with csv first, then zip + files = sorted(files, key=lambda x: x.split(".")[-1]) + + datafiles = [] + for f in files: + print(f) + countries = "" + first_line = "" + # Get column standard_names from first line + if f.endswith(".csv"): + df = pd.read_csv(f) + first_line = list(df.columns) + if standard_names["country_code_field"] in first_line: + countries = list(df[standard_names["country_code_field"]].unique()) + + print(f"Uploading {f} ...") + file = client.files.create(file=open(f, "rb"), purpose="assistants") + + r = {} + if f.endswith(".csv"): + file_loc = f"/mnt/data/{file.id}" + r["file_location"] = file_loc + r["_original_file_name"] = f.split("/")[-1] + metadata_file = f.replace(".csv", "_meta.json") + r["description"] = "This is CSV data" + + # If we have a metadata file, use that + if os.path.exists(metadata_file): + with open(metadata_file) as mf: + metadata = json.load(mf) + description = "" + for f in ["tags", "summary", "description"]: + if f in metadata["get"]: + description += str(metadata["get"][f]) + "\n" + r["description"] = description + + r["columns"] = first_line + r["countries"] = countries + elif "geoBoundaries" in f: + r["zip_file_location_with_shapefiles"] = f"/mnt/data/{file.id}" + r["_original_file_name"] = f + r["description"] = ( + "This file contains administrative boundary data for countries and admin level as specified" + ) + r["admin_level"] = f.split("geoBoundaries-")[1][0:4] + # Intentionall removed some columns here for clarity + r["columns"] = [ + "Shape_Leng", + "Shape_Area", + f"{standard_names['admin0_code_field']}", + f"{standard_names['admin1_code_field']}", + f"{standard_names['admin2_code_field']}", + f"{standard_names['admin3_code_field']}", + "ADM1_REF", + "date", + "validOn", + "validTo", + "geometry", + ] + + with zipfile.ZipFile(f, "r") as zip_ref: + shape_files = [] + files_in_zip = zip_ref.namelist() + for zf in files_in_zip: + if zf.endswith(".shp"): + r2 = {} + r2["shape_file"] = zf + r2["country"] = zf[0:3].upper() 
+                        shape_files.append(r2)
+
+                r["shapefiles"] = shape_files
+
+        datafiles.append(r)
+        print(json.dumps(datafiles, indent=4))
+
+        file_ids.append(file.id)
+
+    file_prompt = json.dumps(datafiles, indent=4)
+
+    return file_prompt, file_ids
+
+
+def create_update_assistant():
+    """
+    Creates or updates a humanitarian response assistant.
+
+    To force creation of a new assistant, be sure that ASSISTANTS_ID is not set in the .env file.
+
+    """
+
+    standard_names = get_common_field_standard_names()
+    files_prompt, file_ids = upload_files_to_openai(standard_names)
+
+    # Load code examples
+    template = environment.get_template("sample_code.jinja")
+    sample_code = template.render(admin1_code_name=standard_names["admin1_code_field"])
+
+    # Populate system prompt
+    template = environment.get_template("assistant_instructions.jinja")
+    instructions = template.render(
+        admin0_code_field=standard_names["admin0_code_field"],
+        admin1_code_field=standard_names["admin1_code_field"],
+        admin2_code_field=standard_names["admin2_code_field"],
+        admin3_code_field=standard_names["admin3_code_field"],
+        sample_code=sample_code,
+        files_prompt=files_prompt,
+    )
+
+    # Save for debugging
+    with open(SYSTEM_PROMPT, "w") as f:
+        f.write(instructions)
+
+    tools = [{"type": "code_interpreter"}]
+
+    # Find if agent exists. v1 needs a try/except for this, TODO upgrade to v2 API
+    try:
+        print(
+            f"Updating existing assistant {assistant_id} {bot_name} and model {model} ..."
+        )
+        assistant = client.beta.assistants.update(
+            assistant_id,
+            name=bot_name,
+            instructions=instructions,
+            tools=tools,
+            model=model,
+            file_ids=file_ids,
+        )
+    except Exception:
+        print(f"Creating assistant with model {model} ...")
+        assistant = client.beta.assistants.create(
+            name=bot_name,
+            instructions=instructions,
+            tools=tools,
+            model=model,
+            file_ids=file_ids,
+        )
+        print("Assistant created!! Here is the assistant ID:")
+        print(assistant.id)
+        print("Now save the ID in your .env file so next time it's updated")
+
+
+if __name__ == "__main__":
+    create_update_assistant()
diff --git a/assistants/openai_assistants/templates/assistant_instructions.jinja b/assistants/openai_assistants/templates/assistant_instructions.jinja
new file mode 100644
index 00000000..ea1cb68a
--- /dev/null
+++ b/assistants/openai_assistants/templates/assistant_instructions.jinja
@@ -0,0 +1,60 @@
+
+ "You are a helpful humanitarian response analyst. You answer data-related questions using only the data sources provided in your functions"
+
+ "You only answer questions about humanitarian data, nothing else"
+
+ "Never, ever use sample data, always use real data from the files or functions provided"
+
+ "When plotting numerical scales don't use scientific notation, use thousands, millions, billions etc"
+
+ "Here is the column mapping for locations between tabular datasets and shapefiles:
+ administrative levels 0 : {{ admin0_code_field }}
+ administrative levels 1 : {{ admin1_code_field }}
+ administrative levels 2 : {{ admin2_code_field }}
+ administrative levels 3 : {{ admin3_code_field }}"
+
+ "You have been provided files to analyze, these are found in '/mnt/data/'."
+
+ "You do not need to add a suffix like '.csv' or '.zip' when reading the files provided"
+
+ "You do not output your analysis plan, just the answer"
+
+ "If asked what data you have, list the data you have but don't provide file names or IDs. Do provide the type of data though, eg population"
+
+ "All tabular data is from the humanitarian data exchange (HDX) new HAPI API"
+
+ "ALWAYS filter tabular data by code variables, not names. So for example {{ admin0_code_field }} for country, {{ admin1_code_field }} for admin level 1 etc"
+
+ "Gender columns are set to 'm' or 'f' if set"
+
+ "When generating code, define all files and folders as variables at the top of your code, then reference in code below"
+
+ "Always make sure the variable for the folder name to extract zip files is different to the variable for the location of the zip file"
+
+ "ALWAYS Import the following modules in generated code: pandas, geopandas, matplotlib.pyplot, zipfile, os"
+
+ "If asked to display a table, use the 'display' command in python"
+
+ "Always display generated images inline, NEVER give a link to the image or map"
+
+ "If you generate code, run it"
+
+ "If a dataset has admin names in it, no need to merge with administrative data"
+
+
+
+===============
+
+These are the data files you have access to:
+
+{{ files_prompt }}
+
+
+Boundary shape files needed for maps can be found in the provided zip files of format geoBoundaries-adm1-countries_a-z.zip
+The file names indicate what country and admin level they relate to, eg 'ukr_admbnda_adm1.shp' where 'ukr' is Ukraine and adm1 indicates admin level 1. The unzipped shapefiles have country code in the first 3 letters of their name, eg ukr_admbnda_adm1.shp (the date part can change depending on country)
+Only use boundary zip files if you have been explicitly asked to plot on a map. No need to use for other plots
+When merging shapefiles with HDX datafiles, use columns {{ admin0_code_field }} for admin 0, {{ admin1_code_field }} for admin level 1 and {{ admin2_code_field }} for admin level 2
+
+======= SAMPLE CODE ========
+
+{{ sample_code }}
\ No newline at end of file
diff --git a/assistants/openai_assistants/templates/sample_code.jinja b/assistants/openai_assistants/templates/sample_code.jinja
new file mode 100644
index 00000000..b140c561
--- /dev/null
+++ b/assistants/openai_assistants/templates/sample_code.jinja
@@ -0,0 +1,60 @@
+EXAMPLE PYTHON CODE TO USE:
+
+1. Example of plotting Admin 1 population data on a map
+
+To plot data on a map, you need to follow these steps ...
+
+1. Read the HDX data from the provided file.
+2. Filter the data for the task, eg by country, state, date, gender, etc
+3. Unzip the boundaries for the admin level requested from the provided zip file.
+4. Find the country's shapefile for admin level in the unzipped folder.
+5. Load shapefile using GeoPandas.
+6. Group the HDX data by admin code (eg admin1_code) to sum up the total per admin level
+7. Merge the HDX data with the GeoPandas dataframe using admin1_code, and the corresponding ADM PCODE field in the shapefile
+8. Plot the map showing the data by admin level
+
+The following example shows how to read HDX data, and the provided shapefiles, and combine them to plot a map.
+You would change the names of files, admin level etc depending on what you were asked.
+ +``` +import pandas as pd +import geopandas as gpd +import matplotlib.pyplot as plt +import zipfile +import os + +# Load the Mali population data +population_df = pd.read_csv('/mnt/data/file-jSXieGAgEX0roYaN8yMy1IyM') + +# Filter the population data for Mali +mali_population_df = population_df[population_df['location_name'] == 'Mali'] + +# Unzipping the admin level 1 boundaries +zip_file = '/mnt/data/file-WGDAzLoP0a5SqDKEuf4x7aSe' +zip_file_extract_folder = '/mnt/data/geoBoundaries' +shape_file = 'mli_admbnda_adm1.shp' + +with zipfile.ZipFile(zip_file, 'r') as zip_ref: + zip_ref.extractall(zip_file_extract_folder) + +# Load Mali's shapefile +mali_gdf = gpd.read_file(f"{zip_file_extract_folder}/{shape_file}") + +# Group the population by admin1_code and sum up to get the total population per admin1 +mali_population_by_admin1 = mali_population_df.groupby('{{ admin1_code_name }}')['population'].sum().reset_index() + +# Merge the population data with the geopandas dataframe using admin1_code +mali_gdf_merged = mali_gdf.merge(mali_population_by_admin1, left_on='{{ admin1_code_name }}', right_on='{{ admin1_code_name }}') + +# Plotting the map +fig, ax = plt.subplots(1, 1, figsize=(10, 10)) +mali_gdf_merged.plot(column='population', ax=ax, legend=True, + legend_kwds={'label': "Population by Admin1", + 'orientation': "horizontal"}) +ax.set_title('Population by Admin1 in Mali') + +# Remove axes for clarity +ax.set_axis_off() + +plt.show() +``` diff --git a/ingestion/apis.config b/ingestion/apis.config deleted file mode 100644 index 5f5d19b0..00000000 --- a/ingestion/apis.config +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "api_name": "hapi", - "api_descr": "New HDX API", - "openapi_def": "https://stage.hapi-humdata-org.ahconu.org/openapi.json", - "excluded_endpoints": [ - "/api/v1/encode_identifier" - ] - } -] \ No newline at end of file diff --git a/ingestion/ingest.py b/ingestion/ingest.py index 611c1b0b..f0f7b72b 100644 --- a/ingestion/ingest.py +++ b/ingestion/ingest.py @@ -12,35 +12,7 @@ from shapefiles import download_hdx_boundaries from sqlalchemy import create_engine, text -APIS_CONFIG = "apis.config" - -# We use this to map column names to be the same in all files -# Note that shapefile geopandas columns must be less than 10 characters -# TODO, move this to API config -admin0_code = "adm0_code" -admin1_code = "adm1_code" -admin2_code = "adm2_code" -admin3_code = "adm3_code" -admin0_name = "adm0_name" -admin1_name = "adm1_name" -admin2_name = "adm2_name" -admin3_name = "adm3_name" -col_map = { - "location_code": admin0_code, - "admin1_code": admin1_code, - "admin2_code": admin2_code, - "admin3_code": admin3_code, - "admin1_name": admin1_name, - "admin2_name": admin2_name, - "admin3_name": admin3_name, - "ADM0_PCODE": admin0_code, - "ADM1_PCODE": admin1_code, - "ADM2_PCODE": admin2_code, - "ADM3_PCODE": admin3_code, - "admin0Pcod": admin0_code, - "admin1Pcod": admin1_code, - "admin2Pcod": admin2_code, -} +INTEGRATION_CONFIG = "ingestion.config" def get_api_def(api): @@ -84,7 +56,9 @@ def get_api_data(endpoint, params): return msg -def download_openapi_data(api_host, openapi_def, excluded_endpoints, save_path): +def download_openapi_data( + api_host, openapi_def, excluded_endpoints, save_path, query_extra="" +): """ Downloads data based on the functions specified in the openapi.json definition file. @@ -92,9 +66,11 @@ def download_openapi_data(api_host, openapi_def, excluded_endpoints, save_path): extend to other approaches. 
Args: - api_hiost: Host URL + api_host: Host URL openapi_def (str): The path to the openapi JSON file. save_path (str): Where to save the data + excluded_endpoints (list): List of endpoints to exclude + query_extra (str): Extra query parameters to add to the request """ @@ -116,13 +92,17 @@ def download_openapi_data(api_host, openapi_def, excluded_endpoints, save_path): print(f"Skipping endpoint with no 'get' method {endpoint}") continue url = f"https://{api_host}/{endpoint}" + print(url) data = [] offset = 0 output = [] while len(output) > 0 or offset == 0: - output = get_api_data(url, {"limit": limit, "offset": offset}) + query = {"limit": limit, "offset": offset} + if query_extra: + query.update(query_extra) + output = get_api_data(url, query) if "No data" in output: break print(output) @@ -149,13 +129,30 @@ def download_openapi_data(api_host, openapi_def, excluded_endpoints, save_path): def read_apis_config(): - with open(APIS_CONFIG) as f: - print("Reading apis.config") - apis = json.load(f) - return apis + """ + Read the APIs configuration from the integration config file. + + Returns: + apis (dict): A dictionary containing the API configurations. + field_map (dict): A dictionary containing the field mappings. + standard_names (dict): A dictionary containing the standard names. + """ + with open(INTEGRATION_CONFIG) as f: + print(f"Reading {INTEGRATION_CONFIG}") + config = json.load(f) + apis = config["openapi_interfaces"] + field_map = config["field_map"] + standard_names = config["standard_names"] + return apis, field_map, standard_names def is_running_in_docker(): + """ + Check if the code is running inside a Docker container. + + Returns: + bool: True if running inside a Docker container, False otherwise. + """ return os.path.exists("/.dockerenv") @@ -234,7 +231,34 @@ def get_cols_string(table, conn): return cols_str -def upload_openapi_csv_files(files_dir, conn, api_name): +def process_openapi_data(api_name, files_dir, field_map, standard_names): + """ + Process OpenAPI data by reading CSV files from a directory, mapping field names, + filtering specific data based on the API name, and saving the modified data back to the CSV files. + + Args: + api_name (str): The name of the OpenAPI. + files_dir (str): The directory path where the CSV files are located. + field_map (dict): A dictionary mapping original field names to new field names. + standard_names (dict): A dictionary containing the standard names for the fields. + + Returns: + None + """ + datafiles = os.listdir(files_dir) + for f in datafiles: + if f.endswith(".csv"): + filename = f"{files_dir}/{f}" + df = pd.read_csv(filename) + df = map_field_names(df, field_map) + # TODO: This is a temporary workaround to account for HAPI having + # aggregate and disaggregated data in the same tables, where the hierarchy differs by country + if api_name == "hapi": + df = filter_hdx_df(df, standard_names["admin0_code_field"]) + df.to_csv(filename, index=False) + + +def save_openapi_data(files_dir, conn, api_name): """ Uploads CSV files from a directory to Postgres. It assumes files_dir contains CSV files as well as a metadata json file for each CSV file. 
@@ -252,11 +276,6 @@ def upload_openapi_csv_files(files_dir, conn, api_name): for f in datafiles: if f.endswith(".csv"): df = pd.read_csv(f"{files_dir}/{f}") - df = map_code_cols(df, col_map) - # TODO: This is a temporary workaround to account for HAPI having - # aggregate and disaggregated data in the same tables, where the hierarchy differs by country - if api_name == "hapi": - df = filter_hdx_df(df) table = f"{api_name}_{sanitize_name(f)}" print(f"Creating table {table} from {f}") df.to_sql(table, conn, if_exists="replace", index=False) @@ -284,6 +303,15 @@ def upload_openapi_csv_files(files_dir, conn, api_name): def empty_folder(folder): + """ + Remove all files and subdirectories within the specified folder. + + Args: + folder (str): The path to the folder. + + Raises: + IsADirectoryError: If the specified folder is a directory. + """ for f in os.listdir(folder): try: os.remove(f"{folder}/{f}") @@ -327,32 +355,32 @@ def upload_hdx_shape_files(files_dir, conn): connection.commit() -def map_code_cols(df, col_map): +def map_field_names(df, field_map): """ Map columns in a DataFrame to a new set of column names. Args: df (pandas.DataFrame): The DataFrame to be mapped. - col_map (dict): A dictionary containing the mapping of old column names to new column names. + field_map (dict): A dictionary containing the mapping of old column names to new column names. Returns: pandas.DataFrame: The mapped DataFrame. """ - for c in col_map: + for c in field_map: if c in df.columns: - df.rename(columns={c: col_map[c]}, inplace=True) + df.rename(columns={c: field_map[c]}, inplace=True) return df -def filter_hdx_df(df, **kwargs): +def filter_hdx_df(df, admin0_code_field): """ Filter a pandas DataFrame by removing columns where all values are null and removing rows where any value is null. Hack to get around the fact HDX mixes total values in with disaggregated values in the API Args: df (pandas.DataFrame): The DataFrame to be filtered. - **kwargs: Additional keyword arguments. + admin0_code_field (str): The name of the column containing the admin0 code. Returns: pandas.DataFrame: The filtered DataFrame. 
@@ -363,10 +391,10 @@
         return df_orig
 
     dfs = []
-    if admin0_code in df.columns:
-        for country in df[admin0_code].unique():
+    if admin0_code_field in df.columns:
+        for country in df[admin0_code_field].unique():
             df2 = df.copy()
-            df2 = df2[df2[admin0_code] == country]
+            df2 = df2[df2[admin0_code_field] == country]
 
             # Remove any columns where all null
             df2 = df2.dropna(axis=1, how="all")
@@ -382,7 +410,7 @@
 
 
 def main():
-    apis = read_apis_config()
+    apis, field_map, standard_names = read_apis_config()
     conn = connect_to_db()
 
     for api in apis:
@@ -392,19 +420,41 @@
         api_host = api["openapi_def"].split("/")[2]
         excluded_endpoints = api["excluded_endpoints"]
 
+        query_extra = ""
+        if "authentication" in api:
+            print("Authentication required for", api_name)
+            if api["authentication"]["type"] == "bearer_token":
+                print("Bearer token required for", api_name)
+            elif api["authentication"]["type"] == "api_key":
+                print("API key required for", api_name)
+            elif api["authentication"]["type"] == "basic":
+                print("Basic authentication required for", api_name)
+            elif api["authentication"]["type"] == "query_parameter":
+                print("Query parameter required for", api_name)
+                query_extra = {
+                    api["authentication"]["name"]: api["authentication"]["value"]
+                }
+            else:
+                print("Unknown authentication type for", api_name)
+
         # Extract data from remote APIs which are defined in apis.config
-        download_openapi_data(api_host, openapi_def, excluded_endpoints, save_path)
+        download_openapi_data(
+            api_host, openapi_def, excluded_endpoints, save_path, query_extra
+        )
+
+        # Standardize column names
+        process_openapi_data(api_name, save_path, field_map, standard_names)
 
         # Upload CSV files to the database, with supporting metadata
-        upload_openapi_csv_files(save_path, conn, api_name)
+        save_openapi_data(save_path, conn, api_name)
 
-        # Download shapefiles from HDX
+        # Download shapefiles from HDX.
Note, this also standardizes column names download_hdx_boundaries( datafile="./api/hapi/api_v1_themes_population.csv", - datafile_country_col="location_code", + datafile_country_col=standard_names["country_code_field"], target_dir="./api/hdx/", - col_map=col_map, - map_code_cols=map_code_cols, + field_map=field_map, + map_field_names=map_field_names, ) # Upload shapefiles to the database diff --git a/ingestion/ingestion.config b/ingestion/ingestion.config new file mode 100644 index 00000000..01d80635 --- /dev/null +++ b/ingestion/ingestion.config @@ -0,0 +1,43 @@ +{ + "standard_names":{ + "country_code_field": "adm0_code", + "admin0_code_field": "adm0_code", + "admin1_code_field": "adm1_code", + "admin2_code_field": "adm2_code", + "admin3_code_field": "adm3_code" + }, + "field_map": { + "location_code": "adm0_code", + "admin1_code": "adm1_code", + "admin2_code": "adm2_code", + "admin3_code": "adm3_code", + "admin1_name": "adm1_name", + "admin2_name": "adm2_name", + "admin3_name": "adm3_name", + "ADM0_PCODE": "adm0_code", + "ADM1_PCODE": "adm1_code", + "ADM2_PCODE": "adm2_code", + "ADM3_PCODE": "adm3_code", + "admin0Pcod": "adm0_code", + "admin1Pcod": "adm1_code", + "admin2Pcod": "adm2_code" + }, + "openapi_interfaces": [ + { + "api_name": "hapi", + "api_descr": "New HDX API", + "openapi_def": "https://stage.hapi-humdata-org.ahconu.org/openapi.json", + "authentication": { + "type": "query_parameter", + "name": "app_identifier", + "value": "SERJUDptYXR0aGV3QGRhdGFraW5kLm9yZw==" + }, + "excluded_endpoints": [ + "/api/v1/encode_identifier", + "/api/v1/admin1", + "/api/v1/admin1", + "/api/v1/gender" + ] + } + ] +} \ No newline at end of file diff --git a/ingestion/shapefiles.py b/ingestion/shapefiles.py index 3e38dfa0..8f2099d0 100644 --- a/ingestion/shapefiles.py +++ b/ingestion/shapefiles.py @@ -92,7 +92,9 @@ def get_hdx_shapefile(country_code, admin_level): return response -def normalize_hdx_boundaries(datafile, col_map, map_code_cols, datafile_country_col): +def normalize_hdx_boundaries( + datafile, field_map, map_field_names, datafile_country_col +): """ HDX Boundaries have inconsistent naming conventions and pcode variable names. This function attempts to standardize them for easier use in HDeXpert. @@ -100,8 +102,8 @@ def normalize_hdx_boundaries(datafile, col_map, map_code_cols, datafile_country_ Args: datafile (str): Path to the data file containing location codes. Default is "./data/hdx_population.csv". files_prefix (str): The prefix to use for the files. - col_map (dict): A dictionary of column names mapping, used to rename columns to standard names. - map_code_cols (function): A function to map the code columns to standard names. + field_map (dict): A dictionary of column names mapping, used to rename columns to standard names. + map_field_names (function): A function to map the code columns to standard names. datafile_country_col (str): The column name in the data file containing the country codes. 
Returns: @@ -135,7 +137,7 @@ def normalize_hdx_boundaries(datafile, col_map, map_code_cols, datafile_country_ shp_file = [f for f in shp_file if f"{admin}.shp" in f] shp_file = shp_file[0] gdf = gpd.read_file(shp_file) - gdf = map_code_cols(gdf, col_map) + gdf = map_field_names(gdf, field_map) shp_file = shp_file.split(admin)[0] + admin + ".shp" shp_file = shp_file.replace("./tmp", "") shp_file = f"{output_dir}/{shp_file[1:]}" @@ -147,8 +149,8 @@ def download_hdx_boundaries( datafile="./api/hapi/hapi_population.csv", datafile_country_col="location_code", target_dir="./api/hdx/", - col_map={}, - map_code_cols=None, + field_map={}, + map_field_names=None, ): """ Downloads HDX boundaries for all countries and administrative levels. @@ -159,8 +161,8 @@ def download_hdx_boundaries( datafile (str): Path to the data file containing location codes. Default is "./data/hdx_population.csv". datafile_country_col (str): The column name in the data file containing the country codes. files_prefix (str): The prefix to use for the files. - col_map (dict): A dictionary of column names mapping, used to rename columns to standard names. - map_code_cols (function): A function to map the code columns to standard names. + field_map (dict): A dictionary of column names mapping, used to rename columns to standard names. + map_field_names (function): A function to map the code columns to standard names. Returns: None @@ -175,8 +177,6 @@ def download_hdx_boundaries( df = pd.read_csv(datafile) countries = df[datafile_country_col].unique() countries = [c.lower() for c in countries] - # TODO: Remove Columbia, it's too big - # countries = [c for c in countries if "col" not in c] for country in countries: for admin in ["admin1", "admin2"]: @@ -185,7 +185,7 @@ def download_hdx_boundaries( # Align field names with other datasets output_dir = normalize_hdx_boundaries( - datafile, col_map, map_code_cols, datafile_country_col + datafile, field_map, map_field_names, datafile_country_col ) # Copy normalized files to target_dir @@ -198,10 +198,7 @@ def download_hdx_boundaries( for admin in ["adm0", "adm1", "adm2"]: files = glob.glob(f"{output_dir}/*{admin}*") if len(files) > 0: - if admin != "adm2": - ranges = ["a-z"] - else: - ranges = ["a-c", "d-h", "i-z"] + ranges = ["a-c", "d-h", "i-z"] for letter_range in ranges: letters = letter_range.split("-") letters = [chr(i) for i in range(ord(letters[0]), ord(letters[1]) + 1)] diff --git a/ui/recipes_assistant_chat/tools/.well-known/haa_datarecipes.json b/ui/recipes_assistant_chat/tools/.well-known/haa_datarecipes.json index 25a6a6eb..1fd5c12d 100644 --- a/ui/recipes_assistant_chat/tools/.well-known/haa_datarecipes.json +++ b/ui/recipes_assistant_chat/tools/.well-known/haa_datarecipes.json @@ -13,7 +13,7 @@ "is_user_authenticated": false }, "data": { - "generate_intent": "true" + "generate_intent": "false" }, "logo_url": "", "contact_email": "Matthew@Datakind.org", diff --git a/ui/recipes_assistant_chat/tools/.well-known/openapi/haa_datarecipes.yaml b/ui/recipes_assistant_chat/tools/.well-known/openapi/haa_datarecipes.yaml index 5abdb1bc..88438a13 100644 --- a/ui/recipes_assistant_chat/tools/.well-known/openapi/haa_datarecipes.yaml +++ b/ui/recipes_assistant_chat/tools/.well-known/openapi/haa_datarecipes.yaml @@ -31,7 +31,7 @@ paths: enum: - "true" - "false" - default: "true" + #default: "true" required: - user_input - chat_history diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/utils/utils.py b/utils/utils.py new file 
mode 100644 index 00000000..8cd6dcc1 --- /dev/null +++ b/utils/utils.py @@ -0,0 +1,61 @@ +import json +import os +import re +import sys + + +def replace_env_variables(value): + """ + Recursively replaces environment variable placeholders in a given value. + + Args: + value (dict, list, str): The value to process. + + Returns: + The processed value with environment variable placeholders replaced. + + """ + if isinstance(value, dict): + return {k: replace_env_variables(v) for k, v in value.items()} + elif isinstance(value, list): + return [replace_env_variables(v) for v in value] + elif isinstance(value, str): + matches = re.findall(r"\{\{ (.+?) \}\}", value) + for match in matches: + value = value.replace("{{ " + match + " }}", os.getenv(match)) + return value + else: + return value + + +def read_integration_config(integration_config_file): + """ + Read the APIs configuration from the integration config file. + + Args: + integration_config_file (str): The path to the integration config file. + + Returns: + apis (dict): A dictionary containing the API configurations. + field_map (dict): A dictionary containing the field mappings. + standard_names (dict): A dictionary containing the standard names. + """ + with open(integration_config_file) as f: + print(f"Reading {integration_config_file}") + config = json.load(f) + config = replace_env_variables(config) + apis = config["openapi_interfaces"] + field_map = config["field_map"] + standard_names = config["standard_names"] + + return apis, field_map, standard_names + + +def is_running_in_docker(): + """ + Check if the code is running inside a Docker container. + + Returns: + bool: True if running inside a Docker container, False otherwise. + """ + return os.path.exists("/.dockerenv")
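For orientation, here is a minimal usage sketch of the new utilities, not part of the patch. It assumes utils/utils.py is importable as utils.utils and that ingestion/ingestion.config exists at the path shown; the HAPI_APP_IDENTIFIER variable name and the {{ ... }} placeholder are illustrative only, since the shipped config currently hardcodes its app_identifier value.

import os

from utils.utils import is_running_in_docker, read_integration_config

# Illustrative only: a value replace_env_variables would substitute if the config
# contained a "{{ HAPI_APP_IDENTIFIER }}" placeholder (hypothetical variable name).
os.environ["HAPI_APP_IDENTIFIER"] = "example-identifier"

# Returns the API definitions, the column-name mapping, and the standard names
apis, field_map, standard_names = read_integration_config("./ingestion/ingestion.config")

for api in apis:
    print(api["api_name"], "-", api["api_descr"])
    print("  excluded endpoints:", len(api["excluded_endpoints"]))

print("standard country code column:", standard_names["country_code_field"])
print("running in docker:", is_running_in_docker())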