From f40ac3151b3e044023f496efd1b674a989a86f52 Mon Sep 17 00:00:00 2001
From: semio
Date: Sun, 12 May 2024 13:17:15 +0800
Subject: [PATCH] add result data analysis notebook

---
 .../notebooks/result_data_analysis.py | 675 ++++++++++++++++++
 1 file changed, 675 insertions(+)
 create mode 100644 automation-api/yival_experiments/notebooks/result_data_analysis.py

diff --git a/automation-api/yival_experiments/notebooks/result_data_analysis.py b/automation-api/yival_experiments/notebooks/result_data_analysis.py
new file mode 100644
index 0000000..3c97c0e
--- /dev/null
+++ b/automation-api/yival_experiments/notebooks/result_data_analysis.py
@@ -0,0 +1,675 @@

# # Result Data Analysis
#
# This notebook produces the tables listed in https://docs.google.com/spreadsheets/d/1ln5ui3f13AfAQkBuEMbNomBXlZLhkQPYVEpBlZjUtu0/edit?pli=1#gid=0
#
# Results are from the experiments in April and May 2024.

# We are going to use DuckDB via the jupysql extension.
# %load_ext sql

# %sql duckdb://

import pandas as pd
from lib.pilot.helpers import read_ai_eval_spreadsheet, get_questions, get_model_configs, get_prompt_variants
from lib.config import read_config
import matplotlib.pyplot as plt
import seaborn as sns

# load environment and config
config = read_config()

# ## Prepare data

# +
# results from experiment 1:
result_1 = pd.read_csv('./data/Gapminder AI evaluations - Results202404.csv')

# results from experiment 2:
result_2 = pd.read_csv('./data/Gapminder AI evaluations - Latest Results.csv')

result = pd.concat([result_1, result_2], ignore_index=True)
# -

# load ai eval spreadsheet
ai_eval_sheet = read_ai_eval_spreadsheet()

result

# clean up column names: lowercase, with underscores instead of spaces
result.columns = result.columns.map(lambda x: x.lower().replace(' ', '_'))

result

# + magic_args="--save result_to_analyze " language="sql"
# -- map evaluation results to numeric scores; exclude model configuration mc026
# select
#     *,
#     CASE
#         WHEN Result = 'correct' THEN 3
#         WHEN Result = 'wrong' THEN 2
#         WHEN Result = 'very_wrong' THEN 1
#         WHEN Result = 'fail' THEN 0
#         ELSE 0
#     END AS score
# from result where model_configuration_id not like 'mc026'

# + magic_args="--with result_to_analyze --save result_chn_prompt_renamed" language="sql"
# -- merge Chinese prompt variants with their English counterparts by dropping the '_zh' suffix
# select
#     * exclude (prompt_variation_id),
#     replace(prompt_variation_id, '_zh', '') as prompt_variation_id
# from result_to_analyze
# -

# models
all_models = ai_eval_sheet.gen_ai_model_configs.data.df

all_models.tail()

# prompts
all_prompts = ai_eval_sheet.prompt_variations.data.df

all_prompts.tail()

# questions in the eval sheet
eval_questions = ai_eval_sheet.questions.data.df

eval_questions.columns

# all questions in the contentful export
all_questions = pd.read_csv('./data/contentful_questions_data.csv')

# + magic_args="--save questions_and_topics" language="sql"
# SELECT
#     e."question_id",
#     e."published_version_of_question",
#     e."language",
#     l.wrongPercentage AS human_wrong_percentage,
#     str_split(l.included_in_tests_within_these_topic_ids, ';') AS topic_list,
#     filter(topic_list, (x -> contains(x, 'sdg')))[1] AS sdg_topic,
#     filter(
#         topic_list,
#         (
#             x -> list_contains(
#                 main.list_value(
#                     'refugees',
#                     'population',
#                     'sustainable-development-misconception-study-2020',
#                     '2017_gapminder_test',
#                     'climate-misconception-study-2024'
#                 ),
#                 x
#             )
#         )
#     ) AS other_topics
# FROM
#     eval_questions AS e
#     LEFT JOIN all_questions AS l ON (
#         replace(e."question_id", '_text', '') = CAST(l.globalId AS VARCHAR)
#     )
# ORDER BY
#     e."language",
#     l.globalId;
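# For readers less familiar with DuckDB's list functions, here is a rough pandas
# equivalent of the topic extraction above. This is an illustrative sketch only
# (it reuses `all_questions` and assumes the ';'-separated
# `included_in_tests_within_these_topic_ids` column from the contentful export);
# the notebook itself relies on the SQL version.

# +
OTHER_TOPIC_IDS = {
    'refugees',
    'population',
    'sustainable-development-misconception-study-2020',
    '2017_gapminder_test',
    'climate-misconception-study-2024',
}

def split_topics(ids):
    # the topic ids come as one ';'-separated string per question
    return [] if pd.isna(ids) else str(ids).split(';')

topic_list = all_questions['included_in_tests_within_these_topic_ids'].map(split_topics)
sdg_topic = topic_list.map(lambda ts: next((t for t in ts if 'sdg' in t), None))
other_topics = topic_list.map(lambda ts: [t for t in ts if t in OTHER_TOPIC_IDS])
# -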
language="sql" +# -- only keep question id and topic list. +# select +# question_id, +# first(human_wrong_percentage) as human_wrong_percentage, +# first(topic_list) as topic_list, +# first(sdg_topic) as sdg_topic, +# first(other_topics) as other_topics +# from questions_and_topics +# group by question_id +# - + + + + + + + + + +# ## Experiment Total + +# + language="sql" +# select +# 'AI' as name, +# count(*) as total_count, +# count(*) filter (result != 'fail') as total_count_exclude_indecisive, +# count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate, +# 100 - correct_rate as wrong_rate, +# count(*) filter (result = 'fail') / total_count * 100 as indecisive_rate +# from result_to_analyze +# - + + + +# ## Break down by Model + +# + language="sql" +# select +# m.model_id as model_id, +# count(*) as total_count, +# count(*) filter (result != 'fail') as total_count_exclude_indecisive, +# count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate, +# 100 - correct_rate as wrong_rate, +# count(*) filter (result = 'fail') / total_count * 100 as indecisive_rate +# from result_to_analyze r left join all_models m on r.model_configuration_id = m.model_config_id +# GROUP BY m.model_id +# - + + + +# ## break down by prompt and prompt family + +# + magic_args="by_prompt_family <<" language="sql" +# select +# p.prompt_family as prompt_family, +# count(DISTINCT p.variation_id) / 2 as number_of_prompts, -- treat chinese prompt and english prompt the same. +# count(*) as total_count, +# count(*) filter (result != 'fail') as total_count_exclude_indecisive, +# count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate, +# 100 - correct_rate as wrong_rate, +# count(*) filter (result = 'fail') / total_count * 100 as indecisive_rate +# from result_to_analyze r left join all_prompts p on r.prompt_variation_id = p.variation_id +# GROUP BY p.prompt_family +# ORDER BY correct_rate desc +# - + +by_prompt_family.DataFrame().set_index('prompt_family') + +# + magic_args="by_prompt <<" language="sql" +# select +# any_value(p.prompt_family) as prompt_family, +# prompt_variation_id, +# count(*) as total_count, +# count(*) filter (result != 'fail') as total_count_exclude_indecisive, +# count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate, +# 100 - correct_rate as wrong_rate, +# count(*) filter (result = 'fail') / total_count * 100 as indecisive_rate +# from result_chn_prompt_renamed r left join all_prompts p on r.prompt_variation_id = p.variation_id +# GROUP BY r.prompt_variation_id +# ORDER BY correct_rate desc +# - + +by_prompt.DataFrame().to_csv('./data/outputs/new_total_by_prompts.csv', index=False) + + + + + +# ## break down by topics + +# + magic_args="by_topics_1 <<" language="sql" +# select +# q.sdg_topic as sdg_topic, +# count(DISTINCT q.question_id) as number_of_questions, -- treat chinese prompt and english prompt the same. 
# ## Break down by topics

# + magic_args="by_topics_1 <<" language="sql"
# select
#     q.sdg_topic as sdg_topic,
#     count(DISTINCT q.question_id) as number_of_questions, -- each question is asked many times (models x prompts), so count distinct ids
#     count(*) as total_count,
#     count(*) filter (result != 'fail') as total_count_exclude_indecisive,
#     count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate,
#     100 - correct_rate as wrong_rate,
#     count(*) filter (result = 'fail') / total_count * 100 as indecisive_rate
# from result_to_analyze r left join q_and_t q on r.question_id = q.question_id
# GROUP BY q.sdg_topic
# ORDER BY sdg_topic
# -

by_topics_1.DataFrame().set_index('sdg_topic')

# +
# break down by the other (non-SDG) topics

# + magic_args="--save res_with_other_topics" language="sql"
# select
#     r.*,
#     unnest(q.other_topics) as topic
# from result_to_analyze r left join q_and_t q on r.question_id = q.question_id
# -

# + magic_args="--with res_with_other_topics by_topics_2 <<" language="sql"
# select
#     topic,
#     count(DISTINCT question_id) as number_of_questions, -- each question is asked many times (models x prompts), so count distinct ids
#     count(*) as total_count,
#     count(*) filter (result != 'fail') as total_count_exclude_indecisive,
#     count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate,
#     100 - correct_rate as wrong_rate,
#     count(*) filter (result = 'fail') / total_count * 100 as indecisive_rate
# from res_with_other_topics
# GROUP BY topic
# ORDER BY topic
# -

by_topics_2.DataFrame().set_index('topic')

# ## Model vs Prompt Family

# +
# I need to check the variance caused by Prompt Family for each Model.
# So I will first compute the answer variance of each question, then take the average variance over all questions.

# + magic_args="--save res_with_prompt_family" language="sql"
# select
#     r.*,
#     p.prompt_family
# from result_to_analyze r left join all_prompts p on r.prompt_variation_id = p.variation_id

# + magic_args="--save res_with_prompt_family_exclude_ind" language="sql"
# select * from res_with_prompt_family where score != 0

# + magic_args="--save model_prompt_stat1" language="sql"
# select
#     prompt_family,
#     model_configuration_id,
#     question_id,
#     count(*) as total_amount,
#     count(*) filter (score = 3) / total_amount * 100 as correct_rate,
#     -- stddev_pop(score) / mean (score) * 100 as variance
#     count(DISTINCT score) as variance -- "variance" = number of distinct scores, a simple spread measure, not statistical variance
# from
#     res_with_prompt_family_exclude_ind
# group by
#     prompt_family,
#     model_configuration_id,
#     question_id
# order by
#     "correct_rate" desc

# + magic_args="--save model_prompt_stat2" language="sql"
# select
#     prompt_family,
#     model_configuration_id,
#     question_id,
#     count(*) as total_amount,
#     count(*) filter (score = 0) / total_amount * 100 as indecisive_rate
# from
#     res_with_prompt_family
# group by
#     prompt_family,
#     model_configuration_id,
#     question_id

# + magic_args="model_prompt_stats <<" language="sql"
# select
#     r2.prompt_family,
#     r2.model_configuration_id,
#     mean (correct_rate) as cr, -- mean correct rate
#     mean (indecisive_rate) as ir, -- mean indecisive rate
#     mode (variance) as variance
# from
#     model_prompt_stat1 r2
#     left join model_prompt_stat2 r3 on r2.prompt_family = r3.prompt_family
#     and r2.model_configuration_id = r3.model_configuration_id
#     and r2.question_id = r3.question_id
# group by
#     r2.prompt_family,
#     r2.model_configuration_id
# order by
#     r2.model_configuration_id,
#     r2.prompt_family
# -

tmp_df1 = model_prompt_stats.DataFrame().set_index(['prompt_family', 'model_configuration_id'])

tmp_df1

tmp_df1.describe()

# ## Model vs Topic
# Same as above: calculate the variance per question first, then take the average.
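# A sketch of the same two-step aggregation in pandas (illustrative only; the
# names here are hypothetical and, unlike the SQL below, this quick version does
# not exclude indecisive 'fail' answers):

# +
# step 1: per-question correct rate for each model
per_question = (
    result.assign(correct=result['result'] == 'correct')
          .groupby(['question_id', 'model_configuration_id'])['correct']
          .mean().mul(100).rename('correct_rate').reset_index()
)
# step 2: average the per-question rates within each group (topics would be
# joined on question_id first, as the SQL below does)
per_model = per_question.groupby('model_configuration_id')['correct_rate'].mean()
per_model.head()
# -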
+ +# + magic_args="--save model_question_stat1" language="sql" +# select +# question_id, +# model_configuration_id, +# count(*) filter ( +# score = 3 +# ) / count(*) * 100 as correct_rate, +# -- stddev_pop(score) / mean(score) * 100 as variance +# count(DISTINCT score) as variance +# from +# (select * from result_to_analyze where score != 0) +# group by +# question_id, +# model_configuration_id + +# + magic_args="--save model_question_stat2" language="sql" +# select +# question_id, +# model_configuration_id, +# count(*) filter ( +# score = 0 +# ) / count(*) * 100 as indecisive_rate +# from +# result_to_analyze +# group by +# question_id, +# model_configuration_id + +# + magic_args="--save model_question_stat_all" language="sql" +# select +# r1.*, +# r2.indecisive_rate +# from +# model_question_stat1 r1 +# left join model_question_stat2 r2 on +# r1.question_id = r2.question_id +# and r1.model_configuration_id = r2.model_configuration_id + +# + magic_args="--save model_topic_stat" language="sql" +# select +# r.*, +# q.sdg_topic, +# q.other_topics, +# q.human_wrong_percentage, +# case +# when q.sdg_topic is null then other_topics +# else list_append(q.other_topics, q.sdg_topic) +# end as all_topics +# +# from +# model_question_stat_all r +# left join q_and_t q on +# r.question_id = q.question_id + +# + magic_args="--with model_topic_stat model_topic_res <<" language="sql" +# select +# model_configuration_id, +# topic, +# count(*) as "number of qs", +# mean (correct_rate) as correct_rate, +# mean (indecisive_rate) as indecisive_rate, +# mode (variance) as variance +# from +# ( +# select +# * exclude (all_topics, sdg_topic, other_topics), +# unnest (all_topics) as topic +# from +# model_topic_stat +# ) +# group by +# topic, +# model_configuration_id +# order by +# topic, +# model_configuration_id +# - + +model_topic_res_df = model_topic_res.DataFrame().set_index(['model_configuration_id', 'topic']) + +model_topic_res_df.to_csv('./data/outputs/new_model_vs_topic.csv') + +model_topic_res_df.describe() + + + + + +# ## Topic vs Prompt Family + +# + +# we will reuse the res_with_prompt_family_exclude_ind and res_with_prompt_family queries defined above. 
+ +# + magic_args="--save question_prompt_family_stat1" language="sql" +# select +# question_id, +# prompt_family, +# count(*) filter (score = 3) / count(*) * 100 as correct_rate, +# -- stddev_pop(score) / mean (score) * 100 as variance +# count(DISTINCT score) as variance +# from +# res_with_prompt_family_exclude_ind +# group by +# question_id, +# prompt_family + +# + magic_args="--save question_prompt_family_stat2" language="sql" +# select +# question_id, +# prompt_family, +# count(*) filter (score = 1) / count(*) * 100 as indecisive_rate +# from +# res_with_prompt_family +# group by +# question_id, +# prompt_family +# - + + + +# + magic_args="--save question_prompt_family_stat_all" language="sql" +# select +# r1.question_id, +# r1.prompt_family, +# mean (correct_rate) as correct_rate, +# mean (indecisive_rate) as indecisive_rate, +# mode (variance) as variance +# from +# question_prompt_family_stat1 r1 +# left join question_prompt_family_stat2 r2 on r1.question_id = r2.question_id +# and r1.prompt_family = r2.prompt_family +# group by +# r1.question_id, +# r1.prompt_family + +# + magic_args="--save topic_prompt_family_stat" language="sql" +# select +# r.*, +# q.sdg_topic, +# q.other_topics, +# case +# when q.sdg_topic is null then other_topics +# else list_append (q.other_topics, q.sdg_topic) +# end as all_topics +# from +# question_prompt_family_stat_all r +# left join q_and_t q on r.question_id = q.question_id + +# + magic_args="--with topic_prompt_family_stat topic_prompt_family_res <<" language="sql" +# select +# topic, +# -- count(*) as "number of qs", +# prompt_family, +# mean (correct_rate) as correct_rate, +# mean (indecisive_rate) as indecisive_rate, +# median (variance) as variance +# from +# (select +# * exclude (all_topics, sdg_topic, other_topics), +# unnest(all_topics) as topic +# from topic_prompt_family_stat) +# group by +# topic, +# prompt_family +# order by +# topic, +# prompt_family +# - + +topic_prompt_family_df = topic_prompt_family_res.DataFrame().set_index(['topic', 'prompt_family']) + +topic_prompt_family_df.to_csv('./data/outputs/new_topic_vs_prompt.csv') + +topic_prompt_family_df.describe() + + + +# ## Questions where AI worse than human and monkey + +# + language="sql" +# select * from model_topic_stat; + +# + magic_args="model_topic_human_diff <<" language="sql" +# select +# question_id, +# model_configuration_id, +# (100 - correct_rate) as ai_wrong_percentage, +# human_wrong_percentage, +# ai_wrong_percentage - human_wrong_percentage as diff, +# sdg_topic, +# other_topics +# from model_topic_stat +# where diff > 0 +# order by +# "sdg_topic", +# cast(other_topics as varchar), +# "model_configuration_id" +# - + +model_topic_human_diff_df = model_topic_human_diff.DataFrame() + +model_topic_human_diff_df.to_csv('./data/outputs/new_ai_worse_human.csv', index=False) + + + + + +# + magic_args="model_topic_monkey_diff <<" language="sql" +# select +# question_id, +# model_configuration_id, +# (100 - correct_rate) as ai_wrong_percentage, +# 100 * (2/3) as monkey_wrong_percentage, +# ai_wrong_percentage - monkey_wrong_percentage as diff, +# sdg_topic, +# other_topics +# from model_topic_stat +# where diff > 0 +# order by +# "sdg_topic", +# cast(other_topics as varchar), +# "model_configuration_id" +# - + +model_topic_monkey_diff_df = model_topic_monkey_diff.DataFrame() + +model_topic_monkey_diff_df.to_csv('./data/outputs/new_ai_worse_monkey.csv', index=False) + + + + + +# + +# summary stats for human and monkey vs ai + +# + magic_args="summary_human_ai <<" 
language="sql" +# select +# question_id, +# count(*) as num_of_models, +# mean(diff) as average_diff, +# from +# model_topic_human_diff_df +# group by +# question_id +# ORDER BY +# num_of_models desc, +# average_diff desc +# - + +summary_human_ai.DataFrame() + +summary_human_ai.DataFrame().to_csv('./data/outputs/new_summary_human_ai.csv') + + + +# + magic_args="summary_monkey_ai <<" language="sql" +# select +# question_id, +# count(*) as num_of_models, +# mean(diff) as average_diff, +# from +# model_topic_monkey_diff_df +# group by +# question_id +# ORDER BY +# num_of_models desc, +# average_diff desc +# - + +summary_monkey_ai.DataFrame().to_csv('./data/outputs/new_summary_monkey_ai.csv') + + + + + +# ## Question vs Prompt Family + +# + magic_args="question_prompt_family_stat << " language="sql" +# select * from question_prompt_family_stat_all +# - + +question_prompt_family_stat_df = question_prompt_family_stat.DataFrame() + +question_prompt_family_stat_df.to_csv('./data/outputs/new_question_prompt_family_stat.csv') + + + + + + + +# # Check raw outputs + +outputs1 = pd.read_excel('../output/archives/20240401/results.xlsx') +outputs2 = pd.read_excel('../output/results.xlsx') + +outputs = pd.concat([outputs1, outputs2], ignore_index=True) + +outputs + + + +# alibaba = %sql select * from outputs where model_id = 'qwen-max-0403' +# err = %sql select * from outputs where model_id = 'qwen-max-0403' and raw_output like '%Error%' + +err.DataFrame().head(10) + +# + +# Issue: Seems the gpt 4 evaluator grades some Error and indecisive answers as "correct".. +# - + + + +err.DataFrame().shape + +alibaba.DataFrame().shape + +60 / 30348 # still have 0.1% of API Error + +# + magic_args="err2 <<" language="sql" +# select * from outputs where model_id = 'qwen-max-0403' and +# (raw_output like '%抱歉%' +# OR raw_output like '%遗憾%' +# OR raw_output like '%对不起%' +# OR raw_output like '%无法%') -- these are answers including the word "Sorry" or "I can't" +# - + +err2.DataFrame() + +err2.DataFrame().shape + + + +