From 49541c64c68319f83b3086d105d5bd2e617187db Mon Sep 17 00:00:00 2001
From: Young <afe.young@gmail.com>
Date: Wed, 17 Jul 2024 06:08:59 +0000
Subject: [PATCH 01/10] Init todo

---
 rdagent/app/quant_factor_benchmark/eval.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/rdagent/app/quant_factor_benchmark/eval.py b/rdagent/app/quant_factor_benchmark/eval.py
index 3088be30..9c2c3d47 100644
--- a/rdagent/app/quant_factor_benchmark/eval.py
+++ b/rdagent/app/quant_factor_benchmark/eval.py
@@ -27,3 +27,12 @@
 
 # 5.run the eval
 res = eval_method.eval()
+
+# TODO:
+# - Run it:
+# - factor input data generator;
+#   - f_{gt}(input) => value_{gt}
+#   - f_{llm}(input) => value_{llm}
+#   - we have a legal issue with releasing the input data
+# - Eval result:
+#   -  check https://github.com/peteryang1/fincov2/blob/master/src/scripts/benchmark/analysis.py

From aa4c7e52db3570741ce974af73dfd2a4d2280752 Mon Sep 17 00:00:00 2001
From: Taozhi Wang <taozhi.mark.wang@gmail.com>
Date: Tue, 23 Jul 2024 02:42:28 +0000
Subject: [PATCH 02/10] Evaluation & dataset

---
 RD-Agent                                         |  1 +
 rdagent/app/quant_factor_benchmark/eval.py       | 15 +++++++++++----
 rdagent/components/benchmark/conf.py             |  2 +-
 rdagent/components/benchmark/eval_method.py      | 16 +++++++++-------
 .../qlib/factor_experiment_loader/json_loader.py | 12 ++++++++----
 5 files changed, 30 insertions(+), 16 deletions(-)
 create mode 160000 RD-Agent

diff --git a/RD-Agent b/RD-Agent
new file mode 160000
index 00000000..61d67d85
--- /dev/null
+++ b/RD-Agent
@@ -0,0 +1 @@
+Subproject commit 61d67d8518072d69d5169853d49dd6ff88e6055c
diff --git a/rdagent/app/quant_factor_benchmark/eval.py b/rdagent/app/quant_factor_benchmark/eval.py
index 9c2c3d47..d9cb4948 100644
--- a/rdagent/app/quant_factor_benchmark/eval.py
+++ b/rdagent/app/quant_factor_benchmark/eval.py
@@ -1,4 +1,5 @@
-from rdagent.scenarios.qlib.factor_task_loader.json_loader import (
+from rdagent.app.qlib_rd_loop.conf import PROP_SETTING
+from rdagent.scenarios.qlib.factor_experiment_loader.json_loader import (
     FactorTestCaseLoaderFromJsonFile,
 )
 
@@ -6,6 +7,11 @@
 from rdagent.components.benchmark.eval_method import FactorImplementEval
 from rdagent.core.utils import import_class
 
+from rdagent.core.utils import import_class
+from rdagent.core.scenario import Scenario
+from rdagent.scenarios.qlib.experiment.factor_experiment import QlibFactorScenario
+
+
 # 1.read the settings
 bs = BenchmarkSettings()
 
@@ -14,13 +20,14 @@
 
 # 3.declare the method to be tested and pass the arguments.
 
-method_cls = import_class(bs.bench_method_cls)
-generate_method = method_cls()
-
+scen: Scenario = import_class(PROP_SETTING.factor_scen)()
+generate_method = import_class(bs.bench_method_cls)(scen=scen)
+ 
 # 4.declare the eval method and pass the arguments.
 eval_method = FactorImplementEval(
     method=generate_method,
     test_cases=test_cases,
+    scen=scen,
     catch_eval_except=True,
     test_round=bs.bench_test_round,
 )
diff --git a/rdagent/components/benchmark/conf.py b/rdagent/components/benchmark/conf.py
index f854eec3..7ebebc5d 100644
--- a/rdagent/components/benchmark/conf.py
+++ b/rdagent/components/benchmark/conf.py
@@ -18,7 +18,7 @@ class BenchmarkSettings(BaseSettings):
     bench_test_round: int = 10
     bench_test_case_n: Optional[int] = None  # how many test cases to run; If not given, all test cases will be run
 
-    bench_method_cls: str = "rdagent.factor_implementation.CoSTEER.CoSTEERFG"
+    bench_method_cls: str = "rdagent.components.coder.factor_coder.CoSTEER.FactorCoSTEER"
     bench_method_extra_kwargs: dict = field(
         default_factory=dict,
     )  # extra kwargs for the method to be tested except the task list
diff --git a/rdagent/components/benchmark/eval_method.py b/rdagent/components/benchmark/eval_method.py
index 054b1c97..b45451eb 100644
--- a/rdagent/components/benchmark/eval_method.py
+++ b/rdagent/components/benchmark/eval_method.py
@@ -20,6 +20,7 @@
 from rdagent.core.developer import Developer
 from rdagent.core.exception import CoderException
 from rdagent.core.experiment import Task, Workspace
+from rdagent.core.scenario import Scenario
 from rdagent.core.utils import multiprocessing_wrapper
 
 
@@ -114,17 +115,18 @@ def __init__(
         test_cases: TestCase,
         method: Developer,
         *args,
+        scen: Scenario,
         test_round: int = 10,
         **kwargs,
     ):
         online_evaluator_l = [
-            FactorSingleColumnEvaluator(),
-            FactorOutputFormatEvaluator(),
-            FactorRowCountEvaluator(),
-            FactorIndexEvaluator(),
-            FactorMissingValuesEvaluator(),
-            FactorEqualValueCountEvaluator(),
-            FactorCorrelationEvaluator(hard_check=False),
+            FactorSingleColumnEvaluator(scen),
+            FactorOutputFormatEvaluator(scen),
+            FactorRowCountEvaluator(scen),
+            FactorIndexEvaluator(scen),
+            FactorMissingValuesEvaluator(scen),
+            FactorEqualValueCountEvaluator(scen),
+            FactorCorrelationEvaluator(hard_check=False, scen=scen),
         ]
         super().__init__(online_evaluator_l, test_cases, method, *args, **kwargs)
         self.test_round = test_round
diff --git a/rdagent/scenarios/qlib/factor_experiment_loader/json_loader.py b/rdagent/scenarios/qlib/factor_experiment_loader/json_loader.py
index 99395c36..28c37b10 100644
--- a/rdagent/scenarios/qlib/factor_experiment_loader/json_loader.py
+++ b/rdagent/scenarios/qlib/factor_experiment_loader/json_loader.py
@@ -8,7 +8,7 @@
     FactorTask,
 )
 from rdagent.components.loader.experiment_loader import FactorExperimentLoader
-from rdagent.core.experiment import Loader
+from rdagent.core.experiment import Experiment, Loader
 from rdagent.scenarios.qlib.experiment.factor_experiment import QlibFactorExperiment
 
 
@@ -47,7 +47,7 @@ class FactorTestCaseLoaderFromJsonFile:
     def load(self, json_file_path: Path) -> list:
         with open(json_file_path, "r") as file:
             factor_dict = json.load(file)
-        TestData = TestCase()
+        TestData = TestCase(target_task=Experiment(sub_tasks=[]))
         for factor_name, factor_data in factor_dict.items():
             task = FactorTask(
                 factor_name=factor_name,
@@ -55,9 +55,13 @@ def load(self, json_file_path: Path) -> list:
                 factor_formulation=factor_data["formulation"],
                 variables=factor_data["variables"],
             )
-            gt = FactorFBWorkspace(task, code=factor_data["gt_code"])
+            gt = FactorFBWorkspace(task)
+            code = {
+                "factor.py": factor_data["gt_code"]
+            }
+            gt.inject_code(**code)
             gt.execute()
-            TestData.target_task.append(task)
+            TestData.target_task.sub_tasks.append(task)
             TestData.ground_truth.append(gt)
 
         return TestData

From c51a6f08a2b73f547697940c44022b4b5567b769 Mon Sep 17 00:00:00 2001
From: Taozhi Wang <taozhi.mark.wang@gmail.com>
Date: Tue, 23 Jul 2024 08:06:14 +0000
Subject: [PATCH 03/10] Generate new data

---
 rdagent/components/benchmark/analysis.py  | 63 +++++++++++++++++++++++
 rdagent/components/benchmark/example.json |  6 +--
 2 files changed, 66 insertions(+), 3 deletions(-)
 create mode 100644 rdagent/components/benchmark/analysis.py

diff --git a/rdagent/components/benchmark/analysis.py b/rdagent/components/benchmark/analysis.py
new file mode 100644
index 00000000..1aedb7cc
--- /dev/null
+++ b/rdagent/components/benchmark/analysis.py
@@ -0,0 +1,63 @@
+import pickle
+import os
+import pandas as pd
+import matplotlib.pyplot as plt
+
+# Function to load and process each pickle file
+def process_pickle_file(file_path):
+    try:
+        with open(file_path, 'rb') as file:
+            data = pickle.load(file)
+        # NOTE: analysis() reads data[0] as the execution feedback text, so data must be indexable
+        print(f"Data from {file_path} processed successfully.")
+        return data
+    except Exception as e:
+        print(f"Error processing {file_path}: {e}")
+        return None
+    
+def analysis(folder_path):
+    success_count = 0
+    fail_count = 0
+
+    # Logging the errors
+    error_log = open("error_log.log", "w")
+
+    # List to store data for visualization
+    data_frames = []
+
+    # Processing each file in the directory
+    for file_name in os.listdir(folder_path):
+        file_path = os.path.join(folder_path, file_name)
+        data = process_pickle_file(file_path)
+        if data is not None:
+            data_frames.append(data)
+
+    for df in data_frames:
+        if 'Execution succeeded' in df[0]:
+            success_count += 1
+        else:
+            fail_count += 1
+            error_log.write(f"{file_path}: \n{df[0]}\n")
+
+    # Writing summary
+    print(f"Number of successful files: {success_count}")
+    print(f"Number of failed files: {fail_count}")
+
+    # Closing the error log file
+    error_log.close()
+
+def view_pickle_file(folder_path):
+    for file_name in os.listdir(folder_path):
+        file_path = os.path.join(folder_path, file_name)
+
+        print(f'the path of this file is: {file_path}\n')
+        with open(file_path, 'rb') as file:
+            data = pickle.load(file)
+            for i in range(len(data)):
+                print(data[i])
+
+
+if __name__ == '__main__':
+    folder_path = '/data/userdata/v-taozhiwang/RD-Agent/git_ignore_folder/factor_implementation_execution_cache'
+    
+    analysis(folder_path)
\ No newline at end of file
diff --git a/rdagent/components/benchmark/example.json b/rdagent/components/benchmark/example.json
index b69ffd8e..f1e5a85c 100644
--- a/rdagent/components/benchmark/example.json
+++ b/rdagent/components/benchmark/example.json
@@ -6,7 +6,7 @@
             "20-day turnover rate": "Average turnover rate over the past 20 days.",
             "Market Capitalization": "Total market value of a company's outstanding shares."
         },
-        "gt_code": "import pandas as pd\n\ndata_f = pd.read_hdf('daily_f.h5')\n\ndata = data_f.reset_index()\nwindow_size = 20\n\nnominator=data.groupby('instrument')[['30\u65e5\u6362\u624b\u7387']].rolling(window=window_size).mean().reset_index(0, drop=True)\n# transfer to series\nnew=nominator['30\u65e5\u6362\u624b\u7387']\ndata['Turnover_Rate_Factor']=new/data['\u6d41\u901aA\u80a1']\n\n# # set the datetime and instrument as index and drop the original index\nresult=pd.DataFrame(data['Turnover_Rate_Factor']).set_index(data_f.index)\n\n# transfer the result to series\nresult=result['Turnover_Rate_Factor']\nresult.to_hdf(\"result.h5\", key=\"data\")\n"
+        "gt_code": "import pandas as pd\n\ndata_f = pd.read_hdf('daily_f.h5')\n\ndata = data_f.reset_index()\nwindow_size = 20\n\nnominator=data.groupby('instrument')[['TurnoverRate_30D']].rolling(window=window_size).mean().reset_index(0, drop=True)\n# transfer to series\nnew=nominator['TurnoverRate_30D']\ndata['Turnover_Rate_Factor']=new/data['TradableACapital']\n\n# set the datetime and instrument as index and drop the original index\nresult=pd.DataFrame(data['Turnover_Rate_Factor']).set_index(data_f.index)\n\n# transfer the result to series\nresult=result['Turnover_Rate_Factor']\nresult.to_hdf(\"result.h5\", key=\"data\")" 
     },
     "PctTurn20": {
         "description": "A factor representing the percentage change in turnover rate over the past 20 trading days, market-value neutralized.",
@@ -16,7 +16,7 @@
             "Turnover_{i, t}": "Turnover of stock i at day t.",
             "Turnover_{i, t-20}": "Turnover of stock i at day t-20."
         },
-        "gt_code": "import pandas as pd\nfrom statsmodels import api as sm\n\n\ndef fill_mean(s: pd.Series) -> pd.Series:\n    return s.fillna(s.mean()).fillna(0.0)\n\n\ndef market_value_neutralize(s: pd.Series, mv: pd.Series) -> pd.Series:\n    s = s.groupby(\"datetime\", group_keys=False).apply(fill_mean)\n    mv = mv.groupby(\"datetime\", group_keys=False).apply(fill_mean)\n\n    df_f = mv.to_frame(\"\u5e02\u503c\")\n    df_f[\"const\"] = 1\n    X = df_f[[\"\u5e02\u503c\", \"const\"]]\n\n    # Perform the Ordinary Least Squares (OLS) regression\n    model = sm.OLS(s, X)\n    results = model.fit()\n\n    # Calculate the residuals\n    df_f[\"residual\"] = results.resid\n    df_f[\"norm_resi\"] = df_f.groupby(level=\"datetime\", group_keys=False)[\"residual\"].apply(\n        lambda x: (x - x.mean()) / x.std(),\n    )\n    return df_f[\"norm_resi\"]\n\n\n# get_turnover\ndf_pv = pd.read_hdf(\"daily_pv.h5\", key=\"data\")\ndf_f = pd.read_hdf(\"daily_f.h5\", key=\"data\")\nturnover = df_pv[\"$money\"] / df_f[\"\u6d41\u901a\u5e02\u503c\"]\n\nf = turnover.groupby(\"instrument\").pct_change(periods=20)\n\nf_neutralized = market_value_neutralize(f, df_f[\"\u6d41\u901a\u5e02\u503c\"])\n\nf_neutralized.to_hdf(\"result.h5\", key=\"data\")\n"
+        "gt_code": "import pandas as pd\nfrom statsmodels import api as sm\n\ndef fill_mean(s: pd.Series) -> pd.Series:\n    return s.fillna(s.mean()).fillna(0.0)\n\ndef market_value_neutralize(s: pd.Series, mv: pd.Series) -> pd.Series:\n    s = s.groupby(\"datetime\", group_keys=False).apply(fill_mean)\n    mv = mv.groupby(\"datetime\", group_keys=False).apply(fill_mean)\n\n    df_f = mv.to_frame(\"MarketValue\")\n    df_f[\"const\"] = 1\n    X = df_f[[\"MarketValue\", \"const\"]]\n\n    # Perform the Ordinary Least Squares (OLS) regression\n    model = sm.OLS(s, X)\n    results = model.fit()\n\n    # Calculate the residuals\n    df_f[\"residual\"] = results.resid\n    df_f[\"norm_resi\"] = df_f.groupby(level=\"datetime\", group_keys=False)[\"residual\"].apply(\n        lambda x: (x - x.mean()) / x.std(),\n    )\n    return df_f[\"norm_resi\"]\n\n\n# get_turnover\ndf_pv = pd.read_hdf(\"daily_pv.h5\", key=\"data\")\ndf_f = pd.read_hdf(\"daily_f.h5\", key=\"data\")\nturnover = df_pv[\"$money\"] / df_f[\"TradableMarketValue\"]\n\nf = turnover.groupby(\"instrument\").pct_change(periods=20)\n\nf_neutralized = market_value_neutralize(f, df_f[\"TradableMarketValue\"])\n\nf_neutralized.to_hdf(\"result.h5\", key=\"data\")"
     },
     "PB_ROE": {
         "description": "Constructed using the ranking difference between PB and ROE, with PB and ROE replacing original PB and ROE to obtain reconstructed factor values.",
@@ -25,6 +25,6 @@
             "\\text{rank}(PB_t)": "Ranking PB on cross-section at time t.",
             "\\text{rank}(ROE_t)": "Ranking single-quarter ROE on cross-section at time t."
         },
-        "gt_code": "#!/usr/bin/env python\n\nimport pandas as pd\n\ndata_f = pd.read_hdf('daily_f.h5')\n\ndata = data_f.reset_index()\n\n# Calculate the rank of PB and ROE\ndata['PB_rank'] = data.groupby('datetime')['B/P'].rank()\ndata['ROE_rank'] = data.groupby('datetime')['ROE'].rank()\n\n# Calculate the difference between the ranks\ndata['PB_ROE'] = data['PB_rank'] - data['ROE_rank']\n\n# set the datetime and instrument as index and drop the original index\nresult=pd.DataFrame(data['PB_ROE']).set_index(data_f.index)\n\n# transfer the result to series\nresult=result['PB_ROE']\nresult.to_hdf(\"result.h5\", key=\"data\")\n"
+        "gt_code": "#!/usr/bin/env python\n\nimport pandas as pd\n\ndata_f = pd.read_hdf('daily_f.h5')\n\ndata = data_f.reset_index()\n\n# Calculate the rank of PB and ROE\ndata['PB_rank'] = data.groupby('datetime')['B/P'].rank()\ndata['ROE_rank'] = data.groupby('datetime')['ROE'].rank()\n\n# Calculate the difference between the ranks\ndata['PB_ROE'] = data['PB_rank'] - data['ROE_rank']\n\n# set the datetime and instrument as index and drop the original index\nresult=pd.DataFrame(data['PB_ROE']).set_index(data_f.index)\n\n# transfer the result to series\nresult=result['PB_ROE']\nresult.to_hdf(\"result.h5\", key=\"data\")"
     }
 }
\ No newline at end of file

From 90bd7e3923d1ab103a98b737d4405ec4bacad1b6 Mon Sep 17 00:00:00 2001
From: Taozhi Wang <taozhi.mark.wang@gmail.com>
Date: Wed, 24 Jul 2024 01:28:53 +0000
Subject: [PATCH 04/10] dataset generation

---
 .../components/benchmark/generate_dataset.py  | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 rdagent/components/benchmark/generate_dataset.py

diff --git a/rdagent/components/benchmark/generate_dataset.py b/rdagent/components/benchmark/generate_dataset.py
new file mode 100644
index 00000000..6e0520d1
--- /dev/null
+++ b/rdagent/components/benchmark/generate_dataset.py
@@ -0,0 +1,72 @@
+import pandas as pd
+import numpy as np
+import random
+import string
+
+def create_new_hdf5_file(file_path, new_path):
+    """ Create a new HDF5 file with random data. """
+    # Load the dataset
+    data = pd.read_hdf(file_path, key='data')
+
+    columns = [] # TODO select the column we want to keep
+    selected_data = data[columns]
+
+    # Generate new data for each column
+    new_data = pd.DataFrame(index=selected_data.index)
+
+    for column in selected_data.columns:
+        if column == 'B/P':
+            mean = selected_data[column].mean().values[0]
+            std = selected_data[column].std().values[0]
+        else:
+            mean = selected_data[column].mean()
+            std = selected_data[column].std()
+        new_data[column] = np.random.normal(mean, std, size=selected_data.shape[0])
+
+    # Save the new dataset
+    new_data.to_hdf(new_path, key='data', mode='w')
+
+    print("New dataset created and saved successfully!")
+
+def change_head(path):
+    data = pd.read_hdf(path, key='data')
+    columns = [
+        'B/P', 'ROE',
+        'TotalCapitalStock', 'TradableACapital', 'TotalMarketValue', 'TradableMarketValue', 'StockPrice', 
+        'E/P', 'ECut/P', 'EBIT/EV', 'EBITDA/EV', 'ROA_Q', 'MACrossover', 'QuarterlyUnrestrictedShareholdersRatioChange', 
+        'HSZZ_ALPHA_3M', 'ROE_Q', 'ROA_TTM', 'S_ROAG', 'ExternalFinancingScale_2Y', 'ConMarketConf_5D', 
+        'NetProfitSequentialQuarterlyChange', 'SemiannualUnrestrictedShareholdersRatioChange', 'STD_12M', 
+        'LarSmaDiffSellValue', 'OperatingCashFlowRatio', 'TurnoverRate_30D', 'HSZZ_R2_3M', 'Sales_Growth_3Y', 
+        'PricePosition_30D', 'NetProfitMargin', 'OperatingProfitYOY', 'SalesToCashRatio', 
+        'FutureUnrestrictedRatio_3M', 'HSZZ_ALPHA_12M', 'Idiosyncrasy', 'RatingChange', 'TSKEW', 
+        'WeeklyConsensusChangeJR_1W', 'HSZZ_BETA_3M', 'PricePosition_180D', 'MedSellValue', 
+        'UnlimitedShareholdersAverageAmount', 'T_ROEG', 'QuarterlyAverageShareholdersRatioChange', 
+        'FixedAssetTurnover', 'MonthlyRatingChange_1M', 'FutureUnrestrictedRatio_6M', 'TurnoverRate_30D_90D', 
+        'Sales2EV', 'ILLIQ_1M', 'Profit_Growth_TTM', 'HighLow_1M', 'OperationCash_TTM', 
+        'FutureUnrestrictedRatio_6MOver30DAvgTurnover', 'TurnoverRate_30D_180D', 'GrossProfitMargin', 
+        'AnalystMomentumScore', 'ShareExpansionRatio_2Y', 'ROIC', 'TurnoverRate_60D', 
+        'ExternalFinancingAdjustedGrowthRate', 'Weighted_Strength_3M', 'Weighted_Strength_1M', 
+        'FutureUnrestrictedRatio_1MOver30DAvgTurnover', 'OperatingCashFlowOverRevenue_TTM', 'ConMarketConf_10D', 
+        'HSZZ_ResidualStd_3M', 'RevenueSequentialYOY', 'RevenueYOY', 'EXTE'
+    ]
+
+    data.columns = columns
+    data.to_hdf(path, key='data', mode='w')
+    print("Head changed successfully!")
+
+def view_hdf5_file(filename):
+    with pd.HDFStore(filename, 'r') as store:
+        print("Keys in the file:", store.keys())
+        data = store['data']
+        print(data.head())
+        print("\nSummary statistics:\n", data.describe())  
+        print(data.index)
+
+
+
+if __name__ == '__main__':
+    path = ''
+    new_path = ''
+    create_new_hdf5_file(file_path=path, new_path=new_path)
+    change_head(new_path)
+    view_hdf5_file(new_path)
\ No newline at end of file

From 864f5a0c2cff0d83765f0dd67a90597bc3bb0754 Mon Sep 17 00:00:00 2001
From: Taozhi Wang <taozhi.mark.wang@gmail.com>
Date: Wed, 24 Jul 2024 06:04:24 +0000
Subject: [PATCH 05/10] add the result

---
 rdagent/app/quant_factor_benchmark/eval.py    | 24 +++++++
 rdagent/components/benchmark/conf.py          |  2 +-
 .../components/benchmark/generate_dataset.py  | 72 -------------------
 .../factor_coder/CoSTEER/evolving_agent.py    |  2 +-
 .../components/coder/factor_coder/config.py   |  2 +-
 5 files changed, 27 insertions(+), 75 deletions(-)
 delete mode 100644 rdagent/components/benchmark/generate_dataset.py

diff --git a/rdagent/app/quant_factor_benchmark/eval.py b/rdagent/app/quant_factor_benchmark/eval.py
index d9cb4948..5924401c 100644
--- a/rdagent/app/quant_factor_benchmark/eval.py
+++ b/rdagent/app/quant_factor_benchmark/eval.py
@@ -1,3 +1,7 @@
+import os
+from pathlib import Path
+import pickle
+import time
 from rdagent.app.qlib_rd_loop.conf import PROP_SETTING
 from rdagent.scenarios.qlib.factor_experiment_loader.json_loader import (
     FactorTestCaseLoaderFromJsonFile,
@@ -11,6 +15,7 @@
 from rdagent.core.scenario import Scenario
 from rdagent.scenarios.qlib.experiment.factor_experiment import QlibFactorScenario
 
+from pprint import pprint
 
 # 1.read the settings
 bs = BenchmarkSettings()
@@ -35,6 +40,25 @@
 # 5.run the eval
 res = eval_method.eval()
 
+# 6.save the result
+pprint(res)
+
+res_workspace = (Path().cwd() / "git_ignore_folder" / "eval_results").absolute()
+print(str(res_workspace))
+
+# Save results
+timestamp = time.strftime("%Y%m%d-%H%M%S", time.localtime(time.time()))
+
+if not os.path.exists(str(res_workspace)):
+    os.makedirs(str(res_workspace))
+
+df_file_path = res_workspace / ("result_" + timestamp + ".csv")
+res_pkl_path = res_workspace / ("res_promptV2" + timestamp + ".pkl")
+res_pkl_path = res_workspace / ("res_promptV2" + timestamp + ".pkl")
+with open(str(res_pkl_path), "wb") as file:
+    # file.write(str(res))
+    pickle.dump(res, file)
+
 # TODO:
 # - Run it:
 # - factor input data generator;
diff --git a/rdagent/components/benchmark/conf.py b/rdagent/components/benchmark/conf.py
index 7ebebc5d..cefd6391 100644
--- a/rdagent/components/benchmark/conf.py
+++ b/rdagent/components/benchmark/conf.py
@@ -15,7 +15,7 @@ class BenchmarkSettings(BaseSettings):
 
     bench_data_path: Path = DIRNAME / "example.json"
 
-    bench_test_round: int = 10
+    bench_test_round: int = 1
     bench_test_case_n: Optional[int] = None  # how many test cases to run; If not given, all test cases will be run
 
     bench_method_cls: str = "rdagent.components.coder.factor_coder.CoSTEER.FactorCoSTEER"
diff --git a/rdagent/components/benchmark/generate_dataset.py b/rdagent/components/benchmark/generate_dataset.py
deleted file mode 100644
index 6e0520d1..00000000
--- a/rdagent/components/benchmark/generate_dataset.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import pandas as pd
-import numpy as np
-import random
-import string
-
-def create_new_hdf5_file(file_path, new_path):
-    """ Create a new HDF5 file with random data. """
-    # Load the dataset
-    data = pd.read_hdf(file_path, key='data')
-
-    columns = [] # TODO select the column we want to keep
-    selected_data = data[columns]
-
-    # Generate new data for each column
-    new_data = pd.DataFrame(index=selected_data.index)
-
-    for column in selected_data.columns:
-        if column == 'B/P':
-            mean = selected_data[column].mean().values[0]
-            std = selected_data[column].std().values[0]
-        else:
-            mean = selected_data[column].mean()
-            std = selected_data[column].std()
-        new_data[column] = np.random.normal(mean, std, size=selected_data.shape[0])
-
-    # Save the new dataset
-    new_data.to_hdf(new_path, key='data', mode='w')
-
-    print("New dataset created and saved successfully!")
-
-def change_head(path):
-    data = pd.read_hdf(path, key='data')
-    columns = [
-        'B/P', 'ROE',
-        'TotalCapitalStock', 'TradableACapital', 'TotalMarketValue', 'TradableMarketValue', 'StockPrice', 
-        'E/P', 'ECut/P', 'EBIT/EV', 'EBITDA/EV', 'ROA_Q', 'MACrossover', 'QuarterlyUnrestrictedShareholdersRatioChange', 
-        'HSZZ_ALPHA_3M', 'ROE_Q', 'ROA_TTM', 'S_ROAG', 'ExternalFinancingScale_2Y', 'ConMarketConf_5D', 
-        'NetProfitSequentialQuarterlyChange', 'SemiannualUnrestrictedShareholdersRatioChange', 'STD_12M', 
-        'LarSmaDiffSellValue', 'OperatingCashFlowRatio', 'TurnoverRate_30D', 'HSZZ_R2_3M', 'Sales_Growth_3Y', 
-        'PricePosition_30D', 'NetProfitMargin', 'OperatingProfitYOY', 'SalesToCashRatio', 
-        'FutureUnrestrictedRatio_3M', 'HSZZ_ALPHA_12M', 'Idiosyncrasy', 'RatingChange', 'TSKEW', 
-        'WeeklyConsensusChangeJR_1W', 'HSZZ_BETA_3M', 'PricePosition_180D', 'MedSellValue', 
-        'UnlimitedShareholdersAverageAmount', 'T_ROEG', 'QuarterlyAverageShareholdersRatioChange', 
-        'FixedAssetTurnover', 'MonthlyRatingChange_1M', 'FutureUnrestrictedRatio_6M', 'TurnoverRate_30D_90D', 
-        'Sales2EV', 'ILLIQ_1M', 'Profit_Growth_TTM', 'HighLow_1M', 'OperationCash_TTM', 
-        'FutureUnrestrictedRatio_6MOver30DAvgTurnover', 'TurnoverRate_30D_180D', 'GrossProfitMargin', 
-        'AnalystMomentumScore', 'ShareExpansionRatio_2Y', 'ROIC', 'TurnoverRate_60D', 
-        'ExternalFinancingAdjustedGrowthRate', 'Weighted_Strength_3M', 'Weighted_Strength_1M', 
-        'FutureUnrestrictedRatio_1MOver30DAvgTurnover', 'OperatingCashFlowOverRevenue_TTM', 'ConMarketConf_10D', 
-        'HSZZ_ResidualStd_3M', 'RevenueSequentialYOY', 'RevenueYOY', 'EXTE'
-    ]
-
-    data.columns = columns
-    data.to_hdf(path, key='data', mode='w')
-    print("Head changed successfully!")
-
-def view_hdf5_file(filename):
-    with pd.HDFStore(filename, 'r') as store:
-        print("Keys in the file:", store.keys())
-        data = store['data']
-        print(data.head())
-        print("\nSummary statistics:\n", data.describe())  
-        print(data.index)
-
-
-
-if __name__ == '__main__':
-    path = ''
-    new_path = ''
-    create_new_hdf5_file(file_path=path, new_path=new_path)
-    change_head(new_path)
-    view_hdf5_file(new_path)
\ No newline at end of file
diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evolving_agent.py b/rdagent/components/coder/factor_coder/CoSTEER/evolving_agent.py
index feee0ef7..75661805 100644
--- a/rdagent/components/coder/factor_coder/CoSTEER/evolving_agent.py
+++ b/rdagent/components/coder/factor_coder/CoSTEER/evolving_agent.py
@@ -14,6 +14,6 @@ def filter_evolvable_subjects_by_feedback(self, evo: EvolvableSubjects, feedback
         assert len(evo.sub_workspace_list) == len(feedback)
 
         for index in range(len(evo.sub_workspace_list)):
-            if not feedback[index].final_decision:
+            if feedback[index] and not feedback[index].final_decision:
                 evo.sub_workspace_list[index].clear()
         return evo
diff --git a/rdagent/components/coder/factor_coder/config.py b/rdagent/components/coder/factor_coder/config.py
index cc3c301c..9a62b4e6 100644
--- a/rdagent/components/coder/factor_coder/config.py
+++ b/rdagent/components/coder/factor_coder/config.py
@@ -10,7 +10,7 @@ class FactorImplementSettings(BaseSettings):
     class Config:
         env_prefix = "FACTOR_CODER_"  # Use FACTOR_CODER_ as prefix for environment variables
 
-    coder_use_cache: bool = False
+    coder_use_cache: bool = True
     data_folder: str = str(
         (Path().cwd() / "git_ignore_folder" / "factor_implementation_source_data").absolute(),
     )

From f9b57b96cf041c5a7c271e91c5ab3c0b711e4b57 Mon Sep 17 00:00:00 2001
From: Taozhi Wang <taozhi.mark.wang@gmail.com>
Date: Wed, 24 Jul 2024 08:01:51 +0000
Subject: [PATCH 06/10] Analysis

---
 .../app/quant_factor_benchmark/analysis.py    | 290 ++++++++++++++++++
 .../quant_factor_benchmark/design/__init__.py |   0
 .../design/benchmark.py                       |  72 +++++
 3 files changed, 362 insertions(+)
 create mode 100644 rdagent/app/quant_factor_benchmark/analysis.py
 create mode 100644 rdagent/app/quant_factor_benchmark/design/__init__.py
 create mode 100644 rdagent/app/quant_factor_benchmark/design/benchmark.py

diff --git a/rdagent/app/quant_factor_benchmark/analysis.py b/rdagent/app/quant_factor_benchmark/analysis.py
new file mode 100644
index 00000000..7dd94cc9
--- /dev/null
+++ b/rdagent/app/quant_factor_benchmark/analysis.py
@@ -0,0 +1,290 @@
+
+from pathlib import Path
+import pickle
+import pandas as pd
+
+from rdagent.app.quant_factor_benchmark.design.benchmark import summarize_res as summarize_res
+
+
+results = {
+    "1 round experiment": "git_ignore_folder/eval_results/res_promptV220240724-060037.pkl",
+}
+
+
+def load_data(file_path):
+    file_path = Path(file_path)
+
+    if not (file_path.is_file() and file_path.suffix == ".pkl" and file_path.name.startswith("res_")):
+        raise ValueError("You may get a invalid file path")
+
+    sum_df = []
+    with file_path.open("rb") as f:
+        res = pickle.load(f)
+
+    sum_df.append(summarize_res(res))
+    sum_df = pd.concat(sum_df, axis=1)
+    if sum_df.shape[0] == 0:
+        raise ValueError("No data in the file")
+    print(file_path, sum_df.shape)
+
+    index = [
+        "FactorSingleColumnEvaluator",
+        "FactorOutputFormatEvaluator",
+        "FactorRowCountEvaluator",
+        "FactorIndexEvaluator",
+        "FactorMissingValuesEvaluator",
+        "FactorEqualValueCountEvaluator",
+        "FactorCorrelationEvaluator",
+        "run factor error",
+    ]
+
+    # reindex in case of failing to run evaluator.
+    # If all implemented factors fail to run, some evaluators may not appear in the result.
+    sum_df = sum_df.reindex(index, axis=0)
+
+    sum_df_clean = sum_df.T.groupby(level=0).apply(lambda x: x.reset_index(drop=True))
+
+    # sum_df.columns
+
+    def get_run_error(sum_df_clean):
+        run_error = sum_df_clean["run factor error"]
+
+        run_error = run_error.unstack()
+
+        run_error = run_error.T.fillna(False).astype(bool)  # null indicate no exception
+
+        succ_rate = ~run_error
+        succ_rate = succ_rate.mean(axis=0).to_frame("success rate")
+        # make it display in a percentage rate
+        # succ_rate["success rate"] = succ_rate["success rate"].map(lambda x: f"{x:.2%}")
+        return succ_rate
+
+    succ_rate = get_run_error(sum_df_clean)
+
+    def reformat_succ_rate(display_df):
+        """You may get dataframe like this:
+
+                                    success rate
+        250-day_high_distance             80.00%
+        Corr_Close_Turnover               20.00%
+        EP_TTM                            20.00%
+        High_Frequency_Skewness           60.00%
+        Momentum                          50.00%
+        Morning_30-min_Return             30.00%
+        UID                                0.00%
+        Weighted_Earnings_Frequency       10.00%
+        """
+        index_map = {
+            # =========================research benchmark===========================
+            "One_Month_Volatility": ("One_Month_Volatility", "Volume&Price", "Easy"),
+            "Vol20": ("Vol20", "Volume&Price", "Medium"),
+            "Alpha#70": ("Alpha#70", "Volume&Price", "Hard"),
+            "DailyRDvar": ("DailyRDvar", "High-Frequency", "Easy"),
+            "AdjRDvar": ("AdjRDvar", "High-Frequency", "Medium"),
+            "AdjRDskew": ("AdjRDskew", "High-Frequency", "Hard"),
+            "PEG": ("PEG", "Fundamentals", "Easy"),
+            "Turnover_STD_1M": ("Turnover_STD_1M", "Fundamentals", "Medium"),
+            "turnover_correlation_with_price": (
+                "turnover_correlation_with_price",
+                "Fundamentals",
+                "Hard",
+            ),
+            "minute_pv_corr": ("minute_pv_corr", "High-Frequency", "New Discovery"),
+            "Liquidity_Factor": ("Liquidity_Factor", "Fundamentals", "New Discovery"),
+            # =========================project benchmark===========================
+            "250-day_high_distance": ("250-day_high_distance", "Volume&Price", "Medium"),
+            "Corr_Close_Turnover": ("Corr_Close_Turnover", "Fundamentals", "Easy"),
+            "EP_TTM": ("EP_TTM", "Fundamentals", "Medium"),
+            "High_Frequency_Skewness": ("High_Frequency_Skewness", "High-Frequency", "Easy"),
+            "Momentum": ("Momentum", "Volume&Price", "Easy"),
+            "Morning_30-min_Return": ("Morning_30-min_Return", "High-Frequency", "Medium"),
+            "UID": ("UID", "High-Frequency", "Hard"),
+            "Weighted_Earnings_Frequency": ("Weighted_Earnings_Frequency", "Volume&Price", "Hard"),
+            "Turnover_Rate_Factor": ("Turnover_Rate_Factor", "Fundamentals", "New Discovery"),
+            "PctTurn20": ("PctTurn20", "Volume&Price", "New Discovery"),
+            "PB_ROE": ("PB_ROE", "Fundamentals", "New Discovery"),
+        }
+
+        new_idx = []
+        display_df = display_df[display_df.index.isin(index_map.keys())]
+        # display_df = display_df.reindex(index_map.keys())
+        for idx in display_df.index:
+            new_idx.append(index_map[idx])
+        display_df.index = pd.MultiIndex.from_tuples(
+            new_idx,
+            names=["Factor", "Category", "Difficulty"],
+        )
+
+        display_df = display_df.swaplevel(0, 2).swaplevel(0, 1).sort_index(axis=0)
+
+        def sort_key_func(x):
+            order_v = []
+            for i in x:
+                order_v.append({"Easy": 0, "Medium": 1, "Hard": 2, "New Discovery": 3}.get(i, i))
+            return order_v
+
+        return display_df.sort_index(key=sort_key_func)
+
+    succ_rate_f = reformat_succ_rate(succ_rate)
+    succ_rate_f
+
+    sum_df_clean["FactorRowCountEvaluator"]
+
+    def get_run_error(eval_series):
+        eval_series = eval_series.unstack()
+
+        succ_rate = eval_series.T.fillna(False).astype(bool)  # false indicate failure
+
+        succ_rate = succ_rate.mean(axis=0).to_frame("success rate")
+        # make it display in a percentage rate
+        # succ_rate["success rate"] = succ_rate["success rate"].map(lambda x: f"{x:.2%}")
+        return succ_rate
+
+    format_issue = (
+        sum_df_clean["FactorRowCountEvaluator"] & sum_df_clean["FactorIndexEvaluator"]
+    )
+
+    format_succ_rate = get_run_error(format_issue)
+    format_succ_rate_f = reformat_succ_rate(format_succ_rate)
+
+    corr = sum_df_clean["FactorCorrelationEvaluator"] * format_issue
+
+    corr = corr.unstack().T.mean(axis=0).to_frame("corr(only success)")
+    corr_res = reformat_succ_rate(corr)
+
+    corr_max = sum_df_clean["FactorCorrelationEvaluator"] * format_issue
+
+    corr_max = corr_max.unstack().T.max(axis=0).to_frame("corr(only success)")
+    corr_max_res = reformat_succ_rate(corr_max)
+
+    value_max = sum_df_clean["FactorMissingValuesEvaluator"] * format_issue
+    value_max = value_max.unstack().T.max(axis=0).to_frame("max_value")
+    value_max_res = reformat_succ_rate(value_max)
+
+    value_avg = (
+        (sum_df_clean["FactorMissingValuesEvaluator"] * format_issue)
+        .unstack()
+        .T.mean(axis=0)
+        .to_frame("avg_value")
+    )
+    value_avg_res = reformat_succ_rate(value_avg)
+
+    result_all = pd.concat(
+        {
+            "avg. Correlation (value only)": corr_res.iloc[:, 0],
+            "avg. Format successful rate": format_succ_rate_f.iloc[:, 0],
+            "avg. Run successful rate": succ_rate_f.iloc[:, 0],
+            "max. Correlation": corr_max_res.iloc[:, 0],
+            "max. accuracy": value_max_res.iloc[:, 0],
+            "avg. accuracy": value_avg_res.iloc[:, 0],
+        },
+        axis=1,
+    )
+
+    def result_all_key_order(x):
+        order_v = []
+        for i in x:
+            order_v.append(
+                {
+                    "avg. Run successful rate": 0,
+                    "avg. Format successful rate": 1,
+                    "avg. Correlation (value only)": 2,
+                    "max. Correlation": 3,
+                    "max. accuracy": 4,
+                    "avg. accuracy": 5,
+                }.get(i, i),
+            )
+        return order_v
+
+    df = result_all.sort_index(axis=1, key=result_all_key_order)
+    print(df)
+
+    df["Selected Correlation"] = [0, 0, 0] # @TODO
+
+    # Calculate the mean of each column
+    mean_values = df.fillna(0.0).mean()
+    mean_df = pd.DataFrame(mean_values).T
+
+    # TODO: set it as multi-index
+    # Assign the MultiIndex to the DataFrame
+    mean_df.index = pd.MultiIndex.from_tuples([("-", "-", "Average")], names=["Factor", "Category", "Difficulty"])
+
+    # Append the mean values to the end of the dataframe
+    df_w_mean = pd.concat([df, mean_df]).astype("float")
+
+    return df_w_mean
+
+
+def display_df(df):
+    # This depends on jupyter
+    def _single_formatter(column):
+        if column.endswith("rate"):
+
+            def _f(x):
+                return "{:.2%}".format(x) if pd.notnull(x) else "-"
+
+        else:
+
+            def _f(x):
+                return "{:.4}".format(x) if pd.notnull(x) else "-"
+
+        return _f
+
+    def get_formatters():
+        # Show NaN or None as '-'.  Don't convert the value to string
+        fmts = {column: _single_formatter(column) for column in df.columns}
+        return fmts
+
+    # TODO display
+    df_w_mean.drop(["max. accuracy", "avg. accuracy"], axis=1).style.format(get_formatters()).background_gradient(
+        axis=0, vmax=1, vmin=0, cmap=__import__("seaborn").light_palette("green", as_cmap=True)
+    )
+
+
+final_res = {}
+for k, p in results.items():
+    df = load_data(p)
+    print(df)
+    final_res[k] = df.iloc[-1, :]
+
+final_res = pd.DataFrame(final_res)
+
+
+# TODO plot it with seaborn and save it as a file
+final_res.drop(["max. accuracy", "avg. accuracy"], axis=0).T
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+plt.rcParams["axes.unicode_minus"] = False
+
+
+def change_fs(font_size):
+    font_size = font_size
+    plt.rc("font", size=font_size)  # controls default text sizes
+    plt.rc("axes", titlesize=font_size)  # fontsize of the axes title
+    plt.rc("axes", labelsize=font_size)  # fontsize of the x and y labels
+    plt.rc("xtick", labelsize=font_size)  # fontsize of the tick labels
+    plt.rc("ytick", labelsize=font_size)  # fontsize of the tick labels
+    plt.rc("legend", fontsize=font_size)  # legend fontsize
+    plt.rc("figure", titlesize=font_size)  # fontsize of the figure title
+
+
+change_fs(20)
+
+
+# Prepare the data for plotting
+plot_data = final_res.drop(["max. accuracy", "avg. accuracy"], axis=0).T
+plot_data = plot_data.reset_index().melt("index", var_name="a", value_name="b")
+
+# Create the plot
+plt.figure(figsize=(10, 6))
+sns.barplot(x="index", y="b", hue="a", data=plot_data)
+
+# Set the labels and title
+plt.xlabel("Method")
+plt.ylabel("Value")
+plt.title("Comparison of Different Methods")
+
+# Save the plot as a file
+plt.savefig("comparison_plot.png")
diff --git a/rdagent/app/quant_factor_benchmark/design/__init__.py b/rdagent/app/quant_factor_benchmark/design/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/rdagent/app/quant_factor_benchmark/design/benchmark.py b/rdagent/app/quant_factor_benchmark/design/benchmark.py
new file mode 100644
index 00000000..084a52e0
--- /dev/null
+++ b/rdagent/app/quant_factor_benchmark/design/benchmark.py
@@ -0,0 +1,72 @@
+from collections import defaultdict
+from copy import deepcopy
+from pathlib import Path
+from typing import Dict, List, Tuple, Union
+
+import pandas as pd
+from tqdm import tqdm
+
+from rdagent.core.exception import RunnerException
+from rdagent.components.coder.factor_coder.CoSTEER.evaluators import (
+    FactorCorrelationEvaluator,
+    FactorEqualValueCountEvaluator,
+    FactorEvaluator,
+    FactorIndexEvaluator,
+    FactorMissingValuesEvaluator,
+    FactorOutputFormatEvaluator,
+    FactorRowCountEvaluator,
+    FactorSingleColumnEvaluator,
+)
+
+
+"""
+Define EVAL_RES_ONLINE with an example
+{
+    <factor_name: str>: [
+        (
+            <FactorEvaluator: object>,
+            if successfully run(call `.execute()`):
+                [
+                    (Evaluator,
+                        if successfully evaluate it:
+                            (feedback, metric),
+                        else:
+                            EvaluationException
+                    ),
+                    ... other evaluators ...
+                ]
+            else:
+                <Run Exception>
+        )
+        ... more similar tuples ...
+    ]
+}
+"""
+EVAL_RES_ONLINE = Dict[
+    str,
+    List[Tuple[FactorEvaluator, Union[object, RunnerException]]],
+]
+
+def summarize_res(res: EVAL_RES_ONLINE) -> pd.DataFrame:
+    # None: indicate that it raises exception and get no results
+    sum_res = {}
+    for factor_name, runs in res.items():
+        for fi, err_or_res_l in runs:
+            # NOTE:  str(fi) may not be unique!!  Because the workspace can be skipped when hitting the cache.
+            uniq_key = f"{str(fi)},{id(fi)}"
+
+            key = (factor_name, uniq_key)
+            val = {}
+            if isinstance(err_or_res_l, Exception):
+                val["run factor error"] = str(err_or_res_l.__class__)
+            else:
+                val["run factor error"] = None
+                for ev_obj, err_or_res in err_or_res_l:
+                    if isinstance(err_or_res, Exception):
+                        val[str(ev_obj)] = None
+                    else:
+                        feedback, metric = err_or_res
+                        val[str(ev_obj)] = metric
+            sum_res[key] = val
+
+    return pd.DataFrame(sum_res)

From db82b67bebafa9fcfad466a12703308381ff44d2 Mon Sep 17 00:00:00 2001
From: Taozhi Wang <taozhi.mark.wang@gmail.com>
Date: Wed, 24 Jul 2024 08:27:06 +0000
Subject: [PATCH 07/10] Factor update

---
 .../app/quant_factor_benchmark/analysis.py    | 44 ++++---------
 rdagent/components/benchmark/analysis.py      | 63 -------------------
 rdagent/components/benchmark/example.json     |  6 ++
 3 files changed, 18 insertions(+), 95 deletions(-)
 delete mode 100644 rdagent/components/benchmark/analysis.py

diff --git a/rdagent/app/quant_factor_benchmark/analysis.py b/rdagent/app/quant_factor_benchmark/analysis.py
index 7dd94cc9..81a7c4c9 100644
--- a/rdagent/app/quant_factor_benchmark/analysis.py
+++ b/rdagent/app/quant_factor_benchmark/analysis.py
@@ -1,15 +1,27 @@
 
+import json
 from pathlib import Path
 import pickle
 import pandas as pd
 
 from rdagent.app.quant_factor_benchmark.design.benchmark import summarize_res as summarize_res
+from rdagent.components.benchmark.conf import BenchmarkSettings
 
 
 results = {
     "1 round experiment": "git_ignore_folder/eval_results/res_promptV220240724-060037.pkl",
 }
 
+# Get index map from the json file
+index_map = {}
+bs = BenchmarkSettings()
+def load(json_file_path: Path) -> None:
+    with open(json_file_path, "r") as file:
+        factor_dict = json.load(file)
+    for factor_name, factor_data in factor_dict.items():
+        index_map[factor_name] = (factor_name, factor_data["Category"], factor_data["Difficulty"])
+
+load(bs.bench_data_path)
 
 def load_data(file_path):
     file_path = Path(file_path)
@@ -74,36 +86,6 @@ def reformat_succ_rate(display_df):
         UID                                0.00%
         Weighted_Earnings_Frequency       10.00%
         """
-        index_map = {
-            # =========================research benchmark===========================
-            "One_Month_Volatility": ("One_Month_Volatility", "Volume&Price", "Easy"),
-            "Vol20": ("Vol20", "Volume&Price", "Medium"),
-            "Alpha#70": ("Alpha#70", "Volume&Price", "Hard"),
-            "DailyRDvar": ("DailyRDvar", "High-Frequency", "Easy"),
-            "AdjRDvar": ("AdjRDvar", "High-Frequency", "Medium"),
-            "AdjRDskew": ("AdjRDskew", "High-Frequency", "Hard"),
-            "PEG": ("PEG", "Fundamentals", "Easy"),
-            "Turnover_STD_1M": ("Turnover_STD_1M", "Fundamentals", "Medium"),
-            "turnover_correlation_with_price": (
-                "turnover_correlation_with_price",
-                "Fundamentals",
-                "Hard",
-            ),
-            "minute_pv_corr": ("minute_pv_corr", "High-Frequency", "New Discovery"),
-            "Liquidity_Factor": ("Liquidity_Factor", "Fundamentals", "New Discovery"),
-            # =========================project benchmark===========================
-            "250-day_high_distance": ("250-day_high_distance", "Volume&Price", "Medium"),
-            "Corr_Close_Turnover": ("Corr_Close_Turnover", "Fundamentals", "Easy"),
-            "EP_TTM": ("EP_TTM", "Fundamentals", "Medium"),
-            "High_Frequency_Skewness": ("High_Frequency_Skewness", "High-Frequency", "Easy"),
-            "Momentum": ("Momentum", "Volume&Price", "Easy"),
-            "Morning_30-min_Return": ("Morning_30-min_Return", "High-Frequency", "Medium"),
-            "UID": ("UID", "High-Frequency", "Hard"),
-            "Weighted_Earnings_Frequency": ("Weighted_Earnings_Frequency", "Volume&Price", "Hard"),
-            "Turnover_Rate_Factor": ("Turnover_Rate_Factor", "Fundamentals", "New Discovery"),
-            "PctTurn20": ("PctTurn20", "Volume&Price", "New Discovery"),
-            "PB_ROE": ("PB_ROE", "Fundamentals", "New Discovery"),
-        }
 
         new_idx = []
         display_df = display_df[display_df.index.isin(index_map.keys())]
@@ -199,8 +181,6 @@ def result_all_key_order(x):
     df = result_all.sort_index(axis=1, key=result_all_key_order)
     print(df)
 
-    df["Selected Correlation"] = [0, 0, 0] # @TODO
-
     # Calculate the mean of each column
     mean_values = df.fillna(0.0).mean()
     mean_df = pd.DataFrame(mean_values).T
diff --git a/rdagent/components/benchmark/analysis.py b/rdagent/components/benchmark/analysis.py
deleted file mode 100644
index 1aedb7cc..00000000
--- a/rdagent/components/benchmark/analysis.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import pickle
-import os
-import pandas as pd
-import matplotlib.pyplot as plt
-
-# Function to load and process each pickle file
-def process_pickle_file(file_path):
-    try:
-        with open(file_path, 'rb') as file:
-            data = pickle.load(file)
-        # Assuming data is a DataFrame or similar
-        print(f"Data from {file_path} processed successfully.")
-        return data
-    except Exception as e:
-        print(f"Error processing {file_path}: {e}")
-        return None
-    
-def analysis(folder_path):
-    success_count = 0
-    fail_count = 0
-
-    # Logging the errors
-    error_log = open("error_log.log", "w")
-
-    # List to store data for visualization
-    data_frames = []
-
-    # Processing each file in the directory
-    for file_name in os.listdir(folder_path):
-        file_path = os.path.join(folder_path, file_name)
-        data = process_pickle_file(file_path)
-        if data is not None:
-            data_frames.append(data)
-
-    for df in data_frames:
-        if 'Execution succeeded' in df[0]:
-            success_count += 1
-        else:
-            fail_count += 1
-            error_log.write(f"{file_path}: \n{df[0]}\n")
-
-    # Writing summary
-    print(f"Number of successful files: {success_count}")
-    print(f"Number of failed files: {fail_count}")
-
-    # Closing the error log file
-    error_log.close()
-
-def view_pickle_file(folder_path):
-    for file_name in os.listdir(folder_path):
-        file_path = os.path.join(folder_path, file_name)
-
-        print(f'the path of this file is: {file_path}\n')
-        with open(file_path, 'rb') as file:
-            data = pickle.load(file)
-            for i in range(len(data)):
-                print(data[i])
-
-
-if __name__ == '__main__':
-    folder_path = '/data/userdata/v-taozhiwang/RD-Agent/git_ignore_folder/factor_implementation_execution_cache'
-    
-    analysis(folder_path)
\ No newline at end of file
diff --git a/rdagent/components/benchmark/example.json b/rdagent/components/benchmark/example.json
index f1e5a85c..742927da 100644
--- a/rdagent/components/benchmark/example.json
+++ b/rdagent/components/benchmark/example.json
@@ -6,6 +6,8 @@
             "20-day turnover rate": "Average turnover rate over the past 20 days.",
             "Market Capitalization": "Total market value of a company's outstanding shares."
         },
+        "Category": "Fundamentals",
+        "Difficulty": "Easy",
         "gt_code": "import pandas as pd\n\ndata_f = pd.read_hdf('daily_f.h5')\n\ndata = data_f.reset_index()\nwindow_size = 20\n\nnominator=data.groupby('instrument')[['TurnoverRate_30D']].rolling(window=window_size).mean().reset_index(0, drop=True)\n# transfer to series\nnew=nominator['TurnoverRate_30D']\ndata['Turnover_Rate_Factor']=new/data['TradableACapital']\n\n# set the datetime and instrument as index and drop the original index\nresult=pd.DataFrame(data['Turnover_Rate_Factor']).set_index(data_f.index)\n\n# transfer the result to series\nresult=result['Turnover_Rate_Factor']\nresult.to_hdf(\"result.h5\", key=\"data\")" 
     },
     "PctTurn20": {
@@ -16,6 +18,8 @@
             "Turnover_{i, t}": "Turnover of stock i at day t.",
             "Turnover_{i, t-20}": "Turnover of stock i at day t-20."
         },
+        "Category": "Volume&Price",
+        "Difficulty": "Medium",
         "gt_code": "import pandas as pd\nfrom statsmodels import api as sm\n\ndef fill_mean(s: pd.Series) -> pd.Series:\n    return s.fillna(s.mean()).fillna(0.0)\n\ndef market_value_neutralize(s: pd.Series, mv: pd.Series) -> pd.Series:\n    s = s.groupby(\"datetime\", group_keys=False).apply(fill_mean)\n    mv = mv.groupby(\"datetime\", group_keys=False).apply(fill_mean)\n\n    df_f = mv.to_frame(\"MarketValue\")\n    df_f[\"const\"] = 1\n    X = df_f[[\"MarketValue\", \"const\"]]\n\n    # Perform the Ordinary Least Squares (OLS) regression\n    model = sm.OLS(s, X)\n    results = model.fit()\n\n    # Calculate the residuals\n    df_f[\"residual\"] = results.resid\n    df_f[\"norm_resi\"] = df_f.groupby(level=\"datetime\", group_keys=False)[\"residual\"].apply(\n        lambda x: (x - x.mean()) / x.std(),\n    )\n    return df_f[\"norm_resi\"]\n\n\n# get_turnover\ndf_pv = pd.read_hdf(\"daily_pv.h5\", key=\"data\")\ndf_f = pd.read_hdf(\"daily_f.h5\", key=\"data\")\nturnover = df_pv[\"$money\"] / df_f[\"TradableMarketValue\"]\n\nf = turnover.groupby(\"instrument\").pct_change(periods=20)\n\nf_neutralized = market_value_neutralize(f, df_f[\"TradableMarketValue\"])\n\nf_neutralized.to_hdf(\"result.h5\", key=\"data\")"
     },
     "PB_ROE": {
@@ -25,6 +29,8 @@
             "\\text{rank}(PB_t)": "Ranking PB on cross-section at time t.",
             "\\text{rank}(ROE_t)": "Ranking single-quarter ROE on cross-section at time t."
         },
+        "Category": "High-Frequency",
+        "Difficulty": "Hard",
         "gt_code": "#!/usr/bin/env python\n\nimport pandas as pd\n\ndata_f = pd.read_hdf('daily_f.h5')\n\ndata = data_f.reset_index()\n\n# Calculate the rank of PB and ROE\ndata['PB_rank'] = data.groupby('datetime')['B/P'].rank()\ndata['ROE_rank'] = data.groupby('datetime')['ROE'].rank()\n\n# Calculate the difference between the ranks\ndata['PB_ROE'] = data['PB_rank'] - data['ROE_rank']\n\n# set the datetime and instrument as index and drop the original index\nresult=pd.DataFrame(data['PB_ROE']).set_index(data_f.index)\n\n# transfer the result to series\nresult=result['PB_ROE']\nresult.to_hdf(\"result.h5\", key=\"data\")"
     }
 }
\ No newline at end of file

From 52dc9385dab781492b4b4e2513011d21e93c6d33 Mon Sep 17 00:00:00 2001
From: Taozhi Wang <taozhi.mark.wang@gmail.com>
Date: Thu, 25 Jul 2024 02:52:39 +0000
Subject: [PATCH 08/10] Updates

---
 .../app/quant_factor_benchmark/analysis.py    |  4 +-
 .../quant_factor_benchmark/design/__init__.py |  0
 .../design/benchmark.py                       | 72 -------------------
 rdagent/app/quant_factor_benchmark/eval.py    | 28 +-------
 rdagent/components/benchmark/conf.py          |  5 +-
 rdagent/components/benchmark/eval_method.py   | 36 +++++++++-
 .../components/coder/factor_coder/config.py   |  2 +-
 7 files changed, 43 insertions(+), 104 deletions(-)
 delete mode 100644 rdagent/app/quant_factor_benchmark/design/__init__.py
 delete mode 100644 rdagent/app/quant_factor_benchmark/design/benchmark.py

diff --git a/rdagent/app/quant_factor_benchmark/analysis.py b/rdagent/app/quant_factor_benchmark/analysis.py
index 81a7c4c9..6e6a20ff 100644
--- a/rdagent/app/quant_factor_benchmark/analysis.py
+++ b/rdagent/app/quant_factor_benchmark/analysis.py
@@ -4,12 +4,12 @@
 import pickle
 import pandas as pd
 
-from rdagent.app.quant_factor_benchmark.design.benchmark import summarize_res as summarize_res
+from rdagent.components.benchmark.eval_method import summarize_res as summarize_res
 from rdagent.components.benchmark.conf import BenchmarkSettings
 
 
 results = {
-    "1 round experiment": "git_ignore_folder/eval_results/res_promptV220240724-060037.pkl",
+    "1 round experiment": "log/2024-07-25_02-32-24-552766/1445941/2024-07-25_02-48-07-756153.pkl",
 }
 
 # Get index map from the json file
diff --git a/rdagent/app/quant_factor_benchmark/design/__init__.py b/rdagent/app/quant_factor_benchmark/design/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/rdagent/app/quant_factor_benchmark/design/benchmark.py b/rdagent/app/quant_factor_benchmark/design/benchmark.py
deleted file mode 100644
index 084a52e0..00000000
--- a/rdagent/app/quant_factor_benchmark/design/benchmark.py
+++ /dev/null
@@ -1,72 +0,0 @@
-from collections import defaultdict
-from copy import deepcopy
-from pathlib import Path
-from typing import Dict, List, Tuple, Union
-
-import pandas as pd
-from tqdm import tqdm
-
-from rdagent.core.exception import RunnerException
-from rdagent.components.coder.factor_coder.CoSTEER.evaluators import (
-    FactorCorrelationEvaluator,
-    FactorEqualValueCountEvaluator,
-    FactorEvaluator,
-    FactorIndexEvaluator,
-    FactorMissingValuesEvaluator,
-    FactorOutputFormatEvaluator,
-    FactorRowCountEvaluator,
-    FactorSingleColumnEvaluator,
-)
-
-
-"""
-Define EVAL_RES_ONLINE with an example
-{
-    <factor_name: str>: [
-        (
-            <FactorEvaluator: object>,
-            if successfully run(call `.execute()`):
-                [
-                    (Evaluator,
-                        if successfully evaluate it:
-                            (feedback, metric),
-                        else:
-                            EvaluationException
-                    ),
-                    ... other evaluators ...
-                ]
-            else:
-                <Run Exception>
-        )
-        ... more similar tuples ...
-    ]
-}
-"""
-EVAL_RES_ONLINE = Dict[
-    str,
-    List[Tuple[FactorEvaluator, Union[object, RunnerException]]],
-]
-
-def summarize_res(res: EVAL_RES_ONLINE) -> pd.DataFrame:
-    # None: indicate that it raises exception and get no results
-    sum_res = {}
-    for factor_name, runs in res.items():
-        for fi, err_or_res_l in runs:
-            # NOTE:  str(fi) may not be unique!!  Because the workspace can be skipped when hitting the cache.
-            uniq_key = f"{str(fi)},{id(fi)}"
-
-            key = (factor_name, uniq_key)
-            val = {}
-            if isinstance(err_or_res_l, Exception):
-                val["run factor error"] = str(err_or_res_l.__class__)
-            else:
-                val["run factor error"] = None
-                for ev_obj, err_or_res in err_or_res_l:
-                    if isinstance(err_or_res, Exception):
-                        val[str(ev_obj)] = None
-                    else:
-                        feedback, metric = err_or_res
-                        val[str(ev_obj)] = metric
-            sum_res[key] = val
-
-    return pd.DataFrame(sum_res)
diff --git a/rdagent/app/quant_factor_benchmark/eval.py b/rdagent/app/quant_factor_benchmark/eval.py
index 5924401c..a22f928f 100644
--- a/rdagent/app/quant_factor_benchmark/eval.py
+++ b/rdagent/app/quant_factor_benchmark/eval.py
@@ -3,6 +3,7 @@
 import pickle
 import time
 from rdagent.app.qlib_rd_loop.conf import PROP_SETTING
+from rdagent.log import rdagent_logger as logger
 from rdagent.scenarios.qlib.factor_experiment_loader.json_loader import (
     FactorTestCaseLoaderFromJsonFile,
 )
@@ -41,29 +42,4 @@
 res = eval_method.eval()
 
 # 6.save the result
-pprint(res)
-
-res_workspace = (Path().cwd() / "git_ignore_folder" / "eval_results").absolute()
-print(str(res_workspace))
-
-# Save results
-timestamp = time.strftime("%Y%m%d-%H%M%S", time.localtime(time.time()))
-
-if not os.path.exists(str(res_workspace)):
-    os.makedirs(str(res_workspace))
-
-df_file_path = res_workspace / ("result_" + timestamp + ".csv")
-res_pkl_path = res_workspace / ("res_promptV2" + timestamp + ".pkl")
-res_pkl_path = res_workspace / ("res_promptV2" + timestamp + ".pkl")
-with open(str(res_pkl_path), "wb") as file:
-    # file.write(str(res))
-    pickle.dump(res, file)
-
-# TODO:
-# - Run it:
-# - factor input data generator;
-#   - f_{gt}(input) => value_{gt}
-#   - f_{llm}(input) => value_{llm}
-#   - we have legal issue to release Input
-# - Eval result:
-#   -  check https://github.com/peteryang1/fincov2/blob/master/src/scripts/benchmark/analysis.py
+logger.log_object(res)
diff --git a/rdagent/components/benchmark/conf.py b/rdagent/components/benchmark/conf.py
index cefd6391..a05bbe39 100644
--- a/rdagent/components/benchmark/conf.py
+++ b/rdagent/components/benchmark/conf.py
@@ -11,11 +11,14 @@
 
 
 class BenchmarkSettings(BaseSettings):
+    class Config:
+        env_prefix = "BENCHMARK_"  # Use BENCHMARK_ as prefix for environment variables
+
     ground_truth_dir: Path = DIRNAME / "ground_truth"
 
     bench_data_path: Path = DIRNAME / "example.json"
 
-    bench_test_round: int = 1
+    bench_test_round: int = 10
     bench_test_case_n: Optional[int] = None  # how many test cases to run; If not given, all test cases will be run
 
     bench_method_cls: str = "rdagent.components.coder.factor_coder.CoSTEER.FactorCoSTEER"
diff --git a/rdagent/components/benchmark/eval_method.py b/rdagent/components/benchmark/eval_method.py
index b45451eb..4d96afd3 100644
--- a/rdagent/components/benchmark/eval_method.py
+++ b/rdagent/components/benchmark/eval_method.py
@@ -1,7 +1,8 @@
 from collections import defaultdict
 from pathlib import Path
-from typing import List, Tuple, Union
+from typing import Dict, List, Tuple, Union
 
+import pandas as pd
 from tqdm import tqdm
 
 from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS
@@ -18,12 +19,17 @@
 from rdagent.components.coder.factor_coder.factor import FactorFBWorkspace
 from rdagent.core.conf import RD_AGENT_SETTINGS
 from rdagent.core.developer import Developer
-from rdagent.core.exception import CoderException
+from rdagent.core.exception import CoderException, RunnerException
 from rdagent.core.experiment import Task, Workspace
 from rdagent.core.scenario import Scenario
 from rdagent.core.utils import multiprocessing_wrapper
 
 
+EVAL_RES = Dict[
+    str,
+    List[Tuple[FactorEvaluator, Union[object, RunnerException]]],
+]
+
 class TestCase:
     def __init__(
         self,
@@ -165,3 +171,29 @@ def eval(self):
             res[gt_case.target_task.factor_name].append((gen_factor, eval_res))
 
         return res
+
+    @staticmethod
+    def summarize_res(res: EVAL_RES) -> pd.DataFrame:
+        # None: indicate that it raises exception and get no results
+        sum_res = {}
+        for factor_name, runs in res.items():
+            for fi, err_or_res_l in runs:
+                # NOTE:  str(fi) may not be unique!!  Because the workspace can be skipped when hitting the cache.
+                uniq_key = f"{str(fi)},{id(fi)}"
+
+                key = (factor_name, uniq_key)
+                val = {}
+                if isinstance(err_or_res_l, Exception):
+                    val["run factor error"] = str(err_or_res_l.__class__)
+                else:
+                    val["run factor error"] = None
+                    for ev_obj, err_or_res in err_or_res_l:
+                        if isinstance(err_or_res, Exception):
+                            val[str(ev_obj)] = None
+                        else:
+                            feedback, metric = err_or_res
+                            val[str(ev_obj)] = metric
+                sum_res[key] = val
+
+        return pd.DataFrame(sum_res)
+
diff --git a/rdagent/components/coder/factor_coder/config.py b/rdagent/components/coder/factor_coder/config.py
index 9a62b4e6..cc3c301c 100644
--- a/rdagent/components/coder/factor_coder/config.py
+++ b/rdagent/components/coder/factor_coder/config.py
@@ -10,7 +10,7 @@ class FactorImplementSettings(BaseSettings):
     class Config:
         env_prefix = "FACTOR_CODER_"  # Use FACTOR_CODER_ as prefix for environment variables
 
-    coder_use_cache: bool = True
+    coder_use_cache: bool = False
     data_folder: str = str(
         (Path().cwd() / "git_ignore_folder" / "factor_implementation_source_data").absolute(),
     )

From 702c830b8759cc6613618e9abbaed6de8d7ad9e6 Mon Sep 17 00:00:00 2001
From: Taozhi Wang <taozhi.mark.wang@gmail.com>
Date: Thu, 25 Jul 2024 07:10:51 +0000
Subject: [PATCH 09/10] Reformat analysis.py

---
 .../app/quant_factor_benchmark/analysis.py    | 390 +++++++-----------
 1 file changed, 152 insertions(+), 238 deletions(-)

diff --git a/rdagent/app/quant_factor_benchmark/analysis.py b/rdagent/app/quant_factor_benchmark/analysis.py
index 6e6a20ff..75aea00d 100644
--- a/rdagent/app/quant_factor_benchmark/analysis.py
+++ b/rdagent/app/quant_factor_benchmark/analysis.py
@@ -1,169 +1,60 @@
-
 import json
-from pathlib import Path
 import pickle
 import pandas as pd
+from pathlib import Path
+import matplotlib.pyplot as plt
+import seaborn as sns
 
-from rdagent.components.benchmark.eval_method import summarize_res as summarize_res
+from rdagent.components.benchmark.eval_method import FactorImplementEval
 from rdagent.components.benchmark.conf import BenchmarkSettings
 
-
-results = {
-    "1 round experiment": "log/2024-07-25_02-32-24-552766/1445941/2024-07-25_02-48-07-756153.pkl",
-}
-
-# Get index map from the json file
-index_map = {}
-bs = BenchmarkSettings()
-def load(json_file_path: Path) -> None:
-    with open(json_file_path, "r") as file:
-        factor_dict = json.load(file)
-    for factor_name, factor_data in factor_dict.items():
-        index_map[factor_name] = (factor_name, factor_data["Category"], factor_data["Difficulty"])
-
-load(bs.bench_data_path)
-
-def load_data(file_path):
-    file_path = Path(file_path)
-
-    if not (file_path.is_file() and file_path.suffix == ".pkl" and file_path.name.startswith("res_")):
-        raise ValueError("You may get a invalid file path")
-
-    sum_df = []
-    with file_path.open("rb") as f:
-        res = pickle.load(f)
-
-    sum_df.append(summarize_res(res))
-    sum_df = pd.concat(sum_df, axis=1)
-    if sum_df.shape[0] == 0:
-        raise ValueError("No data in the file")
-    print(file_path, sum_df.shape)
-
-    index = [
-        "FactorSingleColumnEvaluator",
-        "FactorOutputFormatEvaluator",
-        "FactorRowCountEvaluator",
-        "FactorIndexEvaluator",
-        "FactorMissingValuesEvaluator",
-        "FactorEqualValueCountEvaluator",
-        "FactorCorrelationEvaluator",
-        "run factor error",
-    ]
-
-    # reindex in case of failing to run evaluator.
-    # If all implemented factors fail to run, some evaluators may not appear in the result.
-    sum_df = sum_df.reindex(index, axis=0)
-
-    sum_df_clean = sum_df.T.groupby(level=0).apply(lambda x: x.reset_index(drop=True))
-
-    # sum_df.columns
-
-    def get_run_error(sum_df_clean):
-        run_error = sum_df_clean["run factor error"]
-
-        run_error = run_error.unstack()
-
-        run_error = run_error.T.fillna(False).astype(bool)  # null indicate no exception
-
-        succ_rate = ~run_error
-        succ_rate = succ_rate.mean(axis=0).to_frame("success rate")
-        # make it display in a percentage rate
-        # succ_rate["success rate"] = succ_rate["success rate"].map(lambda x: f"{x:.2%}")
-        return succ_rate
-
-    succ_rate = get_run_error(sum_df_clean)
-
-    def reformat_succ_rate(display_df):
-        """You may get dataframe like this:
-
-                                    success rate
-        250-day_high_distance             80.00%
-        Corr_Close_Turnover               20.00%
-        EP_TTM                            20.00%
-        High_Frequency_Skewness           60.00%
-        Momentum                          50.00%
-        Morning_30-min_Return             30.00%
-        UID                                0.00%
-        Weighted_Earnings_Frequency       10.00%
-        """
-
+class BenchmarkAnalyzer:
+    def __init__(self, settings):
+        self.settings = settings
+        self.index_map = self.load_index_map()
+
+    def load_index_map(self):
+        index_map = {}
+        with open(self.settings.bench_data_path, "r") as file:
+            factor_dict = json.load(file)
+        for factor_name, data in factor_dict.items():
+            index_map[factor_name] = (factor_name, data["Category"], data["Difficulty"])
+        return index_map
+
+    def load_data(self, file_path):
+        file_path = Path(file_path)
+        if not (file_path.is_file() and file_path.suffix == ".pkl"):
+            raise ValueError("Invalid file path")
+        
+        with file_path.open("rb") as f:
+            res = pickle.load(f)
+        
+        return res
+
+    def process_results(self, results):
+        final_res = {}
+        for experiment, path in results.items():
+            data = self.load_data(path)
+            summarized_data = FactorImplementEval.summarize_res(data)
+            processed_data = self.analyze_data(summarized_data)
+            final_res[experiment] = processed_data.iloc[-1, :]
+        return final_res
+    
+    def reformat_succ_rate(self, display_df):
         new_idx = []
-        display_df = display_df[display_df.index.isin(index_map.keys())]
-        # display_df = display_df.reindex(index_map.keys())
+        display_df = display_df[display_df.index.isin(self.index_map.keys())]
         for idx in display_df.index:
-            new_idx.append(index_map[idx])
+            new_idx.append(self.index_map[idx])
+
         display_df.index = pd.MultiIndex.from_tuples(
             new_idx,
             names=["Factor", "Category", "Difficulty"],
         )
-
         display_df = display_df.swaplevel(0, 2).swaplevel(0, 1).sort_index(axis=0)
 
-        def sort_key_func(x):
-            order_v = []
-            for i in x:
-                order_v.append({"Easy": 0, "Medium": 1, "Hard": 2, "New Discovery": 3}.get(i, i))
-            return order_v
-
-        return display_df.sort_index(key=sort_key_func)
-
-    succ_rate_f = reformat_succ_rate(succ_rate)
-    succ_rate_f
-
-    sum_df_clean["FactorRowCountEvaluator"]
-
-    def get_run_error(eval_series):
-        eval_series = eval_series.unstack()
-
-        succ_rate = eval_series.T.fillna(False).astype(bool)  # false indicate failure
-
-        succ_rate = succ_rate.mean(axis=0).to_frame("success rate")
-        # make it display in a percentage rate
-        # succ_rate["success rate"] = succ_rate["success rate"].map(lambda x: f"{x:.2%}")
-        return succ_rate
-
-    format_issue = (
-        sum_df_clean["FactorRowCountEvaluator"] & sum_df_clean["FactorIndexEvaluator"]
-    )
-
-    format_succ_rate = get_run_error(format_issue)
-    format_succ_rate_f = reformat_succ_rate(format_succ_rate)
-
-    corr = sum_df_clean["FactorCorrelationEvaluator"] * format_issue
-
-    corr = corr.unstack().T.mean(axis=0).to_frame("corr(only success)")
-    corr_res = reformat_succ_rate(corr)
-
-    corr_max = sum_df_clean["FactorCorrelationEvaluator"] * format_issue
-
-    corr_max = corr_max.unstack().T.max(axis=0).to_frame("corr(only success)")
-    corr_max_res = reformat_succ_rate(corr_max)
-
-    value_max = sum_df_clean["FactorMissingValuesEvaluator"] * format_issue
-    value_max = value_max.unstack().T.max(axis=0).to_frame("max_value")
-    value_max_res = reformat_succ_rate(value_max)
-
-    value_avg = (
-        (sum_df_clean["FactorMissingValuesEvaluator"] * format_issue)
-        .unstack()
-        .T.mean(axis=0)
-        .to_frame("avg_value")
-    )
-    value_avg_res = reformat_succ_rate(value_avg)
-
-    result_all = pd.concat(
-        {
-            "avg. Correlation (value only)": corr_res.iloc[:, 0],
-            "avg. Format successful rate": format_succ_rate_f.iloc[:, 0],
-            "avg. Run successful rate": succ_rate_f.iloc[:, 0],
-            "max. Correlation": corr_max_res.iloc[:, 0],
-            "max. accuracy": value_max_res.iloc[:, 0],
-            "avg. accuracy": value_avg_res.iloc[:, 0],
-        },
-        axis=1,
-    )
-
-    def result_all_key_order(x):
+        return display_df.sort_index(key=lambda x: [{"Easy": 0, "Medium": 1, "Hard": 2, "New Discovery": 3}.get(i, i) for i in x])
+    
+    def result_all_key_order(self, x):
         order_v = []
         for i in x:
             order_v.append(
@@ -178,93 +69,116 @@ def result_all_key_order(x):
             )
         return order_v
 
-    df = result_all.sort_index(axis=1, key=result_all_key_order)
-    print(df)
-
-    # Calculate the mean of each column
-    mean_values = df.fillna(0.0).mean()
-    mean_df = pd.DataFrame(mean_values).T
-
-    # TODO: set it as multi-index
-    # Assign the MultiIndex to the DataFrame
-    mean_df.index = pd.MultiIndex.from_tuples([("-", "-", "Average")], names=["Factor", "Category", "Difficulty"])
-
-    # Append the mean values to the end of the dataframe
-    df_w_mean = pd.concat([df, mean_df]).astype("float")
-
-    return df_w_mean
-
-
-def display_df(df):
-    # This depends on jupyter
-    def _single_formatter(column):
-        if column.endswith("rate"):
-
-            def _f(x):
-                return "{:.2%}".format(x) if pd.notnull(x) else "-"
-
-        else:
-
-            def _f(x):
-                return "{:.4}".format(x) if pd.notnull(x) else "-"
-
-        return _f
-
-    def get_formatters():
-        # Show NaN or None as '-'.  Don't convert the value to string
-        fmts = {column: _single_formatter(column) for column in df.columns}
-        return fmts
-
-    # TODO display
-    df_w_mean.drop(["max. accuracy", "avg. accuracy"], axis=1).style.format(get_formatters()).background_gradient(
-        axis=0, vmax=1, vmin=0, cmap=__import__("seaborn").light_palette("green", as_cmap=True)
-    )
-
-
-final_res = {}
-for k, p in results.items():
-    df = load_data(p)
-    print(df)
-    final_res[k] = df.iloc[-1, :]
-
-final_res = pd.DataFrame(final_res)
-
-
-# TODO plot it with seaborn and save it as a file
-final_res.drop(["max. accuracy", "avg. accuracy"], axis=0).T
-
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-plt.rcParams["axes.unicode_minus"] = False
-
-
-def change_fs(font_size):
-    font_size = font_size
-    plt.rc("font", size=font_size)  # controls default text sizes
-    plt.rc("axes", titlesize=font_size)  # fontsize of the axes title
-    plt.rc("axes", labelsize=font_size)  # fontsize of the x and y labels
-    plt.rc("xtick", labelsize=font_size)  # fontsize of the tick labels
-    plt.rc("ytick", labelsize=font_size)  # fontsize of the tick labels
-    plt.rc("legend", fontsize=font_size)  # legend fontsize
-    plt.rc("figure", titlesize=font_size)  # fontsize of the figure title
-
-
-change_fs(20)
-
+    def analyze_data(self, sum_df):
+        index = [
+            "FactorSingleColumnEvaluator",
+            "FactorOutputFormatEvaluator",
+            "FactorRowCountEvaluator",
+            "FactorIndexEvaluator",
+            "FactorMissingValuesEvaluator",
+            "FactorEqualValueCountEvaluator",
+            "FactorCorrelationEvaluator",
+            "run factor error",
+        ]
+        sum_df = sum_df.reindex(index, axis=0)
+        sum_df_clean = sum_df.T.groupby(level=0).apply(lambda x: x.reset_index(drop=True))
+
+        run_error = sum_df_clean["run factor error"].unstack().T.fillna(False).astype(bool)
+        succ_rate = ~run_error
+        succ_rate = succ_rate.mean(axis=0).to_frame("success rate")
 
-# Prepare the data for plotting
-plot_data = final_res.drop(["max. accuracy", "avg. accuracy"], axis=0).T
-plot_data = plot_data.reset_index().melt("index", var_name="a", value_name="b")
+        succ_rate_f = self.reformat_succ_rate(succ_rate)
+        succ_rate_f
 
-# Create the plot
-plt.figure(figsize=(10, 6))
-sns.barplot(x="index", y="b", hue="a", data=plot_data)
+        sum_df_clean["FactorRowCountEvaluator"]
 
-# Set the labels and title
-plt.xlabel("Method")
-plt.ylabel("Value")
-plt.title("Comparison of Different Methods")
+        format_issue = (
+            sum_df_clean["FactorRowCountEvaluator"] & sum_df_clean["FactorIndexEvaluator"]
+        )
+        eval_series = format_issue.unstack()
+        succ_rate = eval_series.T.fillna(False).astype(bool)  # False indicates failure
+        format_succ_rate = succ_rate.mean(axis=0).to_frame("success rate")
+        format_succ_rate_f = self.reformat_succ_rate(format_succ_rate)
+
+        corr = sum_df_clean["FactorCorrelationEvaluator"] * format_issue
+        corr = corr.unstack().T.mean(axis=0).to_frame("corr(only success)")
+        corr_res = self.reformat_succ_rate(corr)
+        corr_max = sum_df_clean["FactorCorrelationEvaluator"] * format_issue
+
+        corr_max = corr_max.unstack().T.max(axis=0).to_frame("corr(only success)")
+        corr_max_res = self.reformat_succ_rate(corr_max)
+
+        value_max = sum_df_clean["FactorMissingValuesEvaluator"] * format_issue
+        value_max = value_max.unstack().T.max(axis=0).to_frame("max_value")
+        value_max_res = self.reformat_succ_rate(value_max)
+
+        value_avg = (
+            (sum_df_clean["FactorMissingValuesEvaluator"] * format_issue)
+            .unstack()
+            .T.mean(axis=0)
+            .to_frame("avg_value")
+        )
+        value_avg_res = self.reformat_succ_rate(value_avg)
+
+        result_all = pd.concat(
+            {
+                "avg. Correlation (value only)": corr_res.iloc[:, 0],
+                "avg. Format successful rate": format_succ_rate_f.iloc[:, 0],
+                "avg. Run successful rate": succ_rate_f.iloc[:, 0],
+                "max. Correlation": corr_max_res.iloc[:, 0],
+                "max. accuracy": value_max_res.iloc[:, 0],
+                "avg. accuracy": value_avg_res.iloc[:, 0],
+            },
+            axis=1,
+        )
 
-# Save the plot as a file
-plt.savefig("comparison_plot.png")
+        df = result_all.sort_index(axis=1, key=self.result_all_key_order)
+        print(df)
+
+        # Calculate the mean of each column
+        mean_values = df.fillna(0.0).mean()
+        mean_df = pd.DataFrame(mean_values).T
+
+        # Assign the MultiIndex to the DataFrame
+        mean_df.index = pd.MultiIndex.from_tuples([("-", "-", "Average")], names=["Factor", "Category", "Difficulty"])
+
+        # Append the mean values to the end of the dataframe
+        df_w_mean = pd.concat([df, mean_df]).astype("float")
+
+        return df_w_mean
+
+
+
+class Plotter:
+    @staticmethod
+    def change_fs(font_size):
+        plt.rc("font", size=font_size)
+        plt.rc("axes", titlesize=font_size)
+        plt.rc("axes", labelsize=font_size)
+        plt.rc("xtick", labelsize=font_size)
+        plt.rc("ytick", labelsize=font_size)
+        plt.rc("legend", fontsize=font_size)
+        plt.rc("figure", titlesize=font_size)
+
+    @staticmethod
+    def plot_data(data, file_name):
+        plt.figure(figsize=(10, 6))
+        sns.barplot(x="index", y="b", hue="a", data=data)
+        plt.xlabel("Method")
+        plt.ylabel("Value")
+        plt.title("Comparison of Different Methods")
+        plt.savefig(file_name)
+
+if __name__ == "__main__":
+    settings = BenchmarkSettings()
+    benchmark = BenchmarkAnalyzer(settings)
+    results = {
+        "1 round experiment": "git_ignore_folder/eval_results/res_promptV220240724-060037.pkl",
+    }
+    final_results = benchmark.process_results(results)
+    final_results_df = pd.DataFrame(final_results)
+
+    Plotter.change_fs(20)
+    plot_data = final_results_df.drop(["max. accuracy", "avg. accuracy"], axis=0).T
+    plot_data = plot_data.reset_index().melt("index", var_name="a", value_name="b")
+    Plotter.plot_data(plot_data, "rdagent/app/quant_factor_benchmark/comparison_plot.png")

From ac80c93172759412d5fe603965ffe797a9406a32 Mon Sep 17 00:00:00 2001
From: Taozhi Wang <taozhi.mark.wang@gmail.com>
Date: Thu, 25 Jul 2024 07:46:50 +0000
Subject: [PATCH 10/10] CI fix

---
 RD-Agent | 1 -
 1 file changed, 1 deletion(-)
 delete mode 160000 RD-Agent

diff --git a/RD-Agent b/RD-Agent
deleted file mode 160000
index 61d67d85..00000000
--- a/RD-Agent
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 61d67d8518072d69d5169853d49dd6ff88e6055c