feat: add suite plots (#95)

abcsys · Aug 31, 2024 · 4a3831b · 4a3831b
1 parent 2045be3
commit 4a3831b
Show file tree

Hide file tree

Showing 9 changed files with 304 additions and 4 deletions.
diff --git a/benchmark/__init__.py b/benchmark/__init__.py
@@ -16,4 +16,4 @@
 from benchmark.classic import (
     benchmarks, classic_benchmarks, challenges
 )
-from benchmark.suite import suites
+from benchmark.suite import suites, suite_plots
diff --git a/benchmark/run.py b/benchmark/run.py
@@ -12,7 +12,9 @@ def run(args) -> dict:
     # create a deep copy of args before making changes
     args = copy.deepcopy(args)
 
-    if args.input_file:
+    if args.plot:
+        benchmark_func = run_plot
+    elif args.input_file:
         args.name = args.input_file.split('/')[-1].split(".")[0]
         benchmark_func = run_from_file
     elif args.suite:
@@ -25,6 +27,19 @@ def run(args) -> dict:
     return benchmark_func(args)
 
 
+def run_plot(args):
+    """
+    Draw the benchmark suite plot(s) selected by ``args.plot``.
+
+    ``args.plot`` is either 'all' or a comma-separated list of plot
+    names; underscores are accepted in place of hyphens. Every requested
+    name is validated before anything is drawn, so a typo in the list
+    does not leave a partial set of figures behind.
+
+    Raises:
+        ValueError: if a requested name is not in ``bm.suite_plots``.
+    """
+    if args.plot == 'all':
+        names = list(bm.suite_plots)
+    else:
+        names = [plot.strip().replace('_', '-')
+                 for plot in args.plot.split(',')]
+        for name in names:
+            if name not in bm.suite_plots:
+                raise ValueError(f"Plot {name} not found.")
+
+    for name in names:
+        bm.suite_plots[name](args)
+
+
 def run_from_file(args) -> dict:
     """
     Entity pairs should follow the Libem result format:
@@ -82,6 +97,10 @@ def args() -> argparse.Namespace:
                         help="Number of pairs to run through. "
                              "Set as <= 0 to run through the entire dataset.",
                         type=int, default=5)
+    parser.add_argument("--plot", dest='plot', nargs='?',
+                        help="The benchmark suite plot name(s), separated by comma,  "
+                             "or 'all' to plot all.",
+                        type=str, default='')
 
     # dataset configurations
     parser.add_argument("--no-shuffle", dest='shuffle',
@@ -167,6 +186,12 @@ def validate(args):
     if args.suite and args.input_file:
         raise ValueError("Cannot specify both "
                          "suite and input file.")
+    if args.plot and args.suite:
+        raise ValueError("Cannot specify both "
+                         "plot and suite.")
+    if args.plot and args.input_file:
+        raise ValueError("Cannot specify both "
+                         "plot and input file.")
 
     if args.batch_size <= 0:
         raise ValueError("Batch size cannot be <= 0.")

diff --git a/benchmark/suite/__init__.py b/benchmark/suite/__init__.py
@@ -18,4 +18,18 @@
     'gpt-4o': gpt_4o.run,
     'gpt-4o-mini': gpt_4o_mini.run,
     'llama3': llama3.run,
-}
+}
+
+from benchmark.suite.plot import (
+    batch_size,
+    model_cost_trend,
+    model_f1,
+    model_throughput
+)
+
+# Registry of suite plots: maps a plot name (hyphen-separated) to the
+# function that draws it. Each function is called with the parsed args.
+suite_plots = {
+    'batch-size': batch_size.plot,
+    'model-cost-trend': model_cost_trend.plot,
+    'model-f1': model_f1.plot,
+    'model-throughput': model_throughput.plot
+}
diff --git a/benchmark/suite/batch.py b/benchmark/suite/batch.py
@@ -20,7 +20,7 @@ def run(args):
 
     batch_sizes = [1, 4, 16, 64, 128, 256, 512]
 
-    print(f"Benchmark: Varying the batch size on the {args.name} benchmarks:")
+    print(f"Benchmark: Varying the batch size on the {args.name} benchmark:")
     start = time.time()
 
     reports = {}

diff --git a/benchmark/suite/plot/batch_size.py b/benchmark/suite/plot/batch_size.py
@@ -0,0 +1,65 @@
+import os
+import numpy as np
+import pandas as pd
+import seaborn as sns
+
+from datetime import datetime
+from matplotlib import pyplot as plt, ticker
+
+import benchmark as bm
+from benchmark.suite.plot.util import load
+
+
+def plot(args):
+    '''
+    Plot, for each batch size, the throughput gain and the cost savings
+    relative to the batch size 1 baseline of a batch suite result.
+
+    kwargs:
+        benchmark (str): benchmark to plot, 
+                         empty to plot the last batch suite result.
+
+    Raises:
+        ValueError: if no matching batch suite result file exists, or
+                    the result has no batch size 1 row to compare to.
+    '''
+    # get the newest batch suite result
+    # os.scandir yields entries in arbitrary order, so walk them in name
+    # order; the names presumably start with a sortable timestamp (the
+    # 20-character prefix skipped below), which leaves the newest match
+    # in result_file.
+    result_file = None
+    with os.scandir(bm.result_dir) as entries:
+        for file in sorted(entries, key=lambda entry: entry.name):
+            # extract suite and benchmark name from result file name
+            suite, name = file.name[20:31], file.name[32:-4]
+            if suite != 'suite-batch':
+                continue
+            if args.kwargs and 'benchmark' in args.kwargs:
+                if name == args.kwargs['benchmark']:
+                    result_file = file
+            else:
+                result_file = file
+
+    if not result_file:
+        raise ValueError("No batch suite results found.")
+    results = pd.read_csv(result_file)
+
+    # calculate difference from baseline
+    baseline_rows = results[results['batch_size'] == 1]
+    if baseline_rows.empty:
+        raise ValueError("No batch size 1 baseline found in "
+                         f"{result_file.name}.")
+    baseline = baseline_rows.iloc[0]
+
+    def delta(x):
+        # throughput gain and cost savings are both positive when the
+        # batch size improves on the baseline
+        x['throughput'] = x['throughput'] - baseline['throughput']
+        x['cost'] = baseline['cost'] - x['cost']
+        return x
+
+    results = results.apply(delta, axis=1)
+
+    # plot
+    sns.set_theme(font_scale=1.4)
+    fig, (ax1, ax2) = plt.subplots(1, 2)
+    fig.set_size_inches(12, 5)
+    sns.barplot(results, x='batch_size', y='throughput', color="#2ecc71", ax=ax1, width=0.8)
+    ax1.axhline(0, color='#e74c3c')
+    ax1.set(xlabel="Batch Size vs Throughput", ylabel="Throughput (pps)", title="")
+    ax1.get_xaxis().set_major_formatter(
+        ticker.FuncFormatter(lambda x, p: int(x)))
+    sns.barplot(results, x='batch_size', y='cost', color="#2ecc71", ax=ax2, width=0.8)
+    ax2.set(xlabel="Batch Size vs Cost Savings", ylabel="Cost Savings ($)", title="")
+    ax2.get_xaxis().set_major_formatter(
+        ticker.FuncFormatter(lambda x, p: int(x)))
+    plt.tight_layout()
+
+    output_file = os.path.join(
+                    bm.figure_dir,
+                    f"{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}-"
+                    "batch-size.svg")
+    plt.savefig(output_file, format='svg', bbox_inches="tight")
+    # release the figure so repeated plots do not accumulate in pyplot
+    plt.close(fig)
+    print(f"Batch size plot saved to: {output_file}")
diff --git a/benchmark/suite/plot/model_cost_trend.py b/benchmark/suite/plot/model_cost_trend.py
@@ -0,0 +1,71 @@
+import os
+import seaborn as sns
+
+from datetime import datetime
+from matplotlib import pyplot as plt, ticker
+
+import benchmark as bm
+from benchmark.suite.plot.util import load
+
+
+def plot(args):
+    '''
+    Plot the mean number of pairs processed per dollar for each model
+    as a line chart, with a fixed "AWS ER" reference line at 4000.
+
+    kwargs:
+        models (list[str]): models to include in the plot.
+        benchmarks (list[str]): benchmarks to include in the plot.
+
+    Raises:
+        ValueError: if there are no results to plot, or none of them
+                    has a positive cost.
+    '''
+    results = load(args)
+    if results.empty:
+        raise ValueError("No suite results left to plot.")
+
+    # calculate pairs per $
+    def ppd(x):
+        x['num_pairs'] = x['tp'] + x['fp'] + x['tn'] + x['fn']
+        if x['cost'] > 0:
+            x['ppd'] = x['num_pairs'] / x['cost']
+        else:
+            # marker for rows without a usable cost, dropped below
+            x['ppd'] = -1
+        return x
+    results = results.apply(ppd, axis=1)
+
+    # order by ppd
+    results = results[results['ppd'] >= 0]
+    if results.empty:
+        raise ValueError("No suite results with a positive cost found.")
+    results = results.groupby('model')[['ppd']].mean().sort_values('ppd')['ppd'].reset_index()
+    # dummy column needed for markers
+    results['label'] = 1
+
+    # calculate range and positioning numbers
+    y_max = results['ppd'].max()
+    y_range = y_max - results['ppd'].min()
+    y_unit = y_range / 25
+    x_unit = len(results) / 110
+
+    # plot
+    sns.set_theme(font_scale=1.4)
+    fig, ax = plt.subplots()
+    fig.set_size_inches(9, 5)
+
+    # draw AWS baseline
+    plt.axhline(4000, color='#3498db', linewidth=2, linestyle='--')
+    ax.text(len(results) - 1.7, 4000 + y_unit, "AWS ER", color='#3498db', size=14)
+
+    # plot line graph with value labels
+    sns.lineplot(results, x='model', y='ppd', color='#2ecc71', style='label', markers=True, 
+                 ax=ax, linewidth=3, legend=False, markersize=8)
+    for position, row in results.iterrows():
+        text = f"{format(int(row['ppd']), ',')}"
+        # shift left by roughly half the label width to centre it
+        x_space = len(text) * x_unit
+        ax.text(position - x_space, row['ppd'] + y_unit, text, color='#000', size=15)
+
+    plt.xticks(rotation=90)
+    plt.locator_params(axis='y', nbins=5)
+    ax.get_yaxis().set_major_formatter(
+        ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
+    # the axis starts just below zero, so the top must clear the largest
+    # value (not the max - min range) plus room for its label
+    plt.ylim(-y_unit, y_max + y_unit * 3)
+    plt.title('')
+    plt.xlabel('')
+    plt.ylabel("Pairs per Dollar")
+
+    output_file = os.path.join(
+                    bm.figure_dir,
+                    f"{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}-"
+                    "model-cost-trend.svg")
+    plt.savefig(output_file, format='svg', bbox_inches="tight")
+    # release the figure so repeated plots do not accumulate in pyplot
+    plt.close(fig)
+    print(f"Model cost trend plot saved to: {output_file}")
diff --git a/benchmark/suite/plot/model_f1.py b/benchmark/suite/plot/model_f1.py
@@ -0,0 +1,39 @@
+import os
+import numpy as np
+import pandas as pd
+import seaborn as sns
+
+from datetime import datetime
+from matplotlib import pyplot as plt
+
+import benchmark as bm
+from benchmark.suite.plot.util import load
+
+
+def plot(args):
+    '''
+    Plot the mean F1 score of each model as a bar chart, with the
+    models ordered by ascending mean F1 score.
+
+    kwargs:
+        models (list[str]): models to include in the plot.
+        benchmarks (list[str]): benchmarks to include in the plot.
+    '''
+    results = load(args)
+
+    # order by f1 score
+    order = results.groupby('model')[['f1']].mean().sort_values('f1').index
+
+    # plot
+    fig = plt.figure(figsize=(9, 5))
+    sns.set_theme(font_scale=1.4)
+    sns.barplot(results, x='model', y='f1', estimator=np.mean, capsize=.2, color='#2ecc71', order=order, width=0.7)
+    plt.xticks(rotation=90)
+    plt.ylim(0, 100)
+    plt.title('')
+    plt.xlabel('')
+    plt.ylabel("F1 Score (%)")
+
+    output_file = os.path.join(
+                    bm.figure_dir,
+                    f"{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}-"
+                    "model-f1.svg")
+    plt.savefig(output_file, format='svg', bbox_inches="tight")
+    # release the figure so repeated plots do not accumulate in pyplot
+    plt.close(fig)
+    print(f"Model F1 plot saved to: {output_file}")
diff --git a/benchmark/suite/plot/model_throughput.py b/benchmark/suite/plot/model_throughput.py
@@ -0,0 +1,38 @@
+import os
+import numpy as np
+import pandas as pd
+import seaborn as sns
+
+from datetime import datetime
+from matplotlib import pyplot as plt
+
+import benchmark as bm
+from benchmark.suite.plot.util import load
+
+
+def plot(args):
+    '''
+    Plot the mean throughput of each model as a bar chart, with the
+    models ordered by ascending mean throughput.
+
+    kwargs:
+        models (list[str]): models to include in the plot.
+        benchmarks (list[str]): benchmarks to include in the plot.
+    '''
+    results = load(args)
+
+    # order by throughput
+    order = results.groupby('model')[['throughput']].mean().sort_values('throughput').index
+
+    # plot
+    fig = plt.figure(figsize=(9, 5))
+    sns.set_theme(font_scale=1.4)
+    sns.barplot(results, x='model', y='throughput', estimator=np.mean, capsize=.2, color='#2ecc71', order=order, width=0.7)
+    plt.xticks(rotation=90)
+    plt.title('')
+    plt.xlabel('')
+    plt.ylabel("Throughput (pps)")
+
+    output_file = os.path.join(
+                    bm.figure_dir,
+                    f"{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}-"
+                    "model-throughput.svg")
+    plt.savefig(output_file, format='svg', bbox_inches="tight")
+    # release the figure so repeated plots do not accumulate in pyplot
+    plt.close(fig)
+    print(f"Model throughput plot saved to: {output_file}")
diff --git a/benchmark/suite/plot/util.py b/benchmark/suite/plot/util.py
@@ -0,0 +1,48 @@
+import os
+import pandas as pd
+
+import benchmark as bm
+
+
+def load(args) -> pd.DataFrame:
+    '''
+    Load the newest suite result of each model into one DataFrame,
+    tagging every row with a 'model' column.
+
+    kwargs:
+        models (list[str]): models to include in the plot.
+        benchmarks (list[str]): benchmarks to include in the plot, 
+                                leave empty to include all benchmarks.
+
+    Raises:
+        ValueError: if no suite result file matches the models.
+    '''
+
+    models = ['gpt-3.5-turbo', 'gpt-4', 'gpt-4-turbo', 
+              'gpt-4o', 'gpt-4o-mini', 'llama3', 'llama3.1']
+    if args.kwargs and 'models' in args.kwargs:
+        models = args.kwargs['models']
+
+    benchmarks = []
+    if args.kwargs and 'benchmarks' in args.kwargs:
+        benchmarks = args.kwargs['benchmarks']
+
+    # load in files, get newest result for each model if available
+    # os.scandir yields entries in arbitrary order, so walk them in name
+    # order; the names presumably start with a sortable timestamp (the
+    # 20-character prefix skipped below), so a later file of the same
+    # model overwrites an earlier one.
+    files = {}
+    with os.scandir(bm.result_dir) as entries:
+        for file in sorted(entries, key=lambda entry: entry.name):
+            # extract suite and model name from result file name
+            suite, name = file.name[20:25], file.name[26:-4]
+            if suite == 'suite' and name in models:
+                files[name] = file
+
+    if not files:
+        raise ValueError("No suite results found. Run through "
+                         "at least one benchmark suite before plotting.")
+
+    # load all files into DataFrame
+    dfs = []
+    for model, file in files.items():
+        df = pd.read_csv(file)
+        df.loc[:, 'model'] = model
+        dfs.append(df)
+    results = pd.concat(dfs)
+
+    # filter benchmarks
+    if len(benchmarks) > 0:
+        results = results[results['benchmark'].isin(benchmarks)]
+
+    return results