Dev 0.2.0 #16

Merged (10 commits) on Jan 2, 2021
10 changes: 7 additions & 3 deletions Makefile
@@ -39,12 +39,16 @@ install-test: clean-build clean-pyc ## install the package and test dependencies

.PHONY: test
test: ## run tests quickly with the default Python
python -m pytest -m "not slow" tests

.PHONY: test-slow
test-slow: ## run all tests (including slow tests)
python -m pytest tests

.PHONY: lint
lint: ## check style with flake8 and isort
flake8 fibber tests
isort -c --recursive fibber tests
isort -c fibber tests

.PHONY: install-develop
install-develop: clean-build clean-pyc ## install the package in editable mode and dependencies for development
@@ -57,7 +61,7 @@ test-all: ## run tests on every Python version with tox
.PHONY: fix-lint
fix-lint: ## fix lint issues using autoflake, autopep8, and isort
find fibber tests -name '*.py' | xargs autoflake --in-place --remove-all-unused-imports --remove-unused-variables
autopep8 --in-place --recursive --ignore W503 --aggressive fibber tests
autopep8 --in-place --recursive --aggressive fibber tests
isort --atomic fibber tests

.PHONY: coverage
@@ -136,7 +140,7 @@ ifeq ($(CHANGELOG_LINES),0)
endif

.PHONY: check-release
check-release: check-master check-history ## Check if the release can be made
check-release: test-slow check-master check-history ## Check if the release can be made

.PHONY: release
release: check-release bumpversion-release publish bumpversion-patch
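The `test` / `test-slow` split selects tests with a pytest marker named `slow`, and `check-release` now depends on `test-slow`. A minimal sketch of how such a marker is typically registered and used (fibber's own test configuration may differ):

```python
# conftest.py -- a sketch; assumes the project registers its "slow" marker here.
def pytest_configure(config):
    # Registering the marker keeps `pytest -m "not slow"` free of unknown-marker warnings.
    config.addinivalue_line(
        "markers", "slow: marks a test as slow (deselect with -m 'not slow')")


# tests/test_example.py -- hypothetical test module
import pytest

@pytest.mark.slow
def test_expensive_end_to_end():
    # Collected by `make test-slow` (plain `pytest tests`); skipped by `make test`.
    assert True
```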
44 changes: 22 additions & 22 deletions README.md
@@ -176,40 +176,40 @@ The output is a tuple of (str, list, list).
# Original Text
'The Avengers is a good movie. Although it is 3 hours long, every scene has something to watch.'

# 5 paraphrases
# 5 paraphrase_list
['the avengers is a good movie. even it is 2 hours long, there is not enough to watch.',
'the avengers is a good movie. while it is 3 hours long, it is still very watchable.',
'the avengers is a good movie and although it is 2 ¹⁄₂ hours long, it is never very interesting.',
'avengers is not a good movie. while it is three hours long, it is still something to watch.',
'the avengers is a bad movie. while it is three hours long, it is still something to watch.']

# Evaluation metrics of these 5 paraphrases.
# Evaluation metrics of these 5 paraphrase_list.

{'EditingDistance': 8,
'USESemanticSimilarity': 0.9523628950119019,
'GloVeSemanticSimilarity': 0.9795315341042675,
'GPT2GrammarQuality': 1.492070198059082,
'BertClfPrediction': 0},
'USESemanticSimilarityMetric': 0.9523628950119019,
'GloVeSemanticSimilarityMetric': 0.9795315341042675,
'GPT2GrammarQualityMetric': 1.492070198059082,
'BertClassifier': 0},
{'EditingDistance': 9,
'USESemanticSimilarity': 0.9372092485427856,
'GloVeSemanticSimilarity': 0.9575780832312993,
'GPT2GrammarQuality': 0.9813404679298401,
'BertClfPrediction': 1},
'USESemanticSimilarityMetric': 0.9372092485427856,
'GloVeSemanticSimilarityMetric': 0.9575780832312993,
'GPT2GrammarQualityMetric': 0.9813404679298401,
'BertClassifier': 1},
{'EditingDistance': 11,
'USESemanticSimilarity': 0.9265919327735901,
'GloVeSemanticSimilarity': 0.9710499628056698,
'GPT2GrammarQuality': 1.325406551361084,
'BertClfPrediction': 0},
'USESemanticSimilarityMetric': 0.9265919327735901,
'GloVeSemanticSimilarityMetric': 0.9710499628056698,
'GPT2GrammarQualityMetric': 1.325406551361084,
'BertClassifier': 0},
{'EditingDistance': 7,
'USESemanticSimilarity': 0.8913971185684204,
'GloVeSemanticSimilarity': 0.9800737898362042,
'GPT2GrammarQuality': 1.2504483461380005,
'BertClfPrediction': 1},
'USESemanticSimilarityMetric': 0.8913971185684204,
'GloVeSemanticSimilarityMetric': 0.9800737898362042,
'GPT2GrammarQualityMetric': 1.2504483461380005,
'BertClassifier': 1},
{'EditingDistance': 8,
'USESemanticSimilarity': 0.9124080538749695,
'GloVeSemanticSimilarity': 0.9744155151490856,
'GPT2GrammarQuality': 1.1626977920532227,
'BertClfPrediction': 0}]
'USESemanticSimilarityMetric': 0.9124080538749695,
'GloVeSemanticSimilarityMetric': 0.9744155151490856,
'GPT2GrammarQualityMetric': 1.1626977920532227,
'BertClassifier': 0}]
```
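The metric names above can be used to post-process the results directly. A small sketch (not a fibber API; the original label of 1 is an assumption) that picks the paraphrase flipping the BertClassifier prediction while keeping the highest USESemanticSimilarityMetric score:

```python
def pick_best_flip(paraphrase_list, metric_list, original_label=1):
    # Keep paraphrases whose BertClassifier prediction differs from the original label,
    # then return the one with the highest USE semantic similarity.
    flipped = [
        (metrics["USESemanticSimilarityMetric"], paraphrase)
        for paraphrase, metrics in zip(paraphrase_list, metric_list)
        if metrics["BertClassifier"] != original_label
    ]
    return max(flipped)[1] if flipped else None
```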

**(5) You can ask fibber to randomly pick a sentence from the dataset and paraphrase it.**
2 changes: 1 addition & 1 deletion fibber/__init__.py
@@ -2,7 +2,7 @@

__author__ = 'MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__version__ = '0.1.4.dev0'
__version__ = '0.2.0.dev0'

import os

8 changes: 4 additions & 4 deletions fibber/benchmark/benchmark.py
@@ -5,11 +5,11 @@
from fibber import log
from fibber.benchmark.benchmark_utils import update_detailed_result
from fibber.datasets import builtin_datasets, get_dataset, subsample_dataset, verify_dataset
from fibber.metrics.attack_aggregation import add_sentence_level_adversarial_attack_metrics
from fibber.metrics.metric_base import MetricBase
from fibber.metrics.attack_aggregation_utils import add_sentence_level_adversarial_attack_metrics
from fibber.metrics.metric_utils import MetricBundle
from fibber.paraphrase_strategies import (
BertSamplingStrategy, IdentityStrategy, RandomStrategy, TextFoolerStrategy)
from fibber.paraphrase_strategies.strategy_base import StrategyBase

logger = log.setup_custom_logger(__name__)
log.remove_logger_tf_handler(logger)
@@ -102,7 +102,7 @@ def __init__(self,

if customized_clf:
self._metric_bundle.add_classifier(str(customized_clf), customized_clf)
self._metric_bundle.set_target_classifier(str(customized_clf))
self._metric_bundle.set_target_classifier_by_name(str(customized_clf))

add_sentence_level_adversarial_attack_metrics(
self._metric_bundle, gpt2_ppl_threshold=5, use_sim_threshold=0.85)
@@ -136,7 +136,7 @@ def run_benchmark(self,
paraphrase_strategy = built_in_strategies[paraphrase_strategy](
{}, self._dataset_name, strategy_gpu_id, self._output_dir, self._metric_bundle)
else:
assert isinstance(paraphrase_strategy, MetricBase)
assert isinstance(paraphrase_strategy, StrategyBase)

# get experiment name
if exp_name is None:
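The customized-classifier wiring earlier in this file uses two MetricBundle calls; a minimal sketch built only from the calls visible in this diff (the classifier object's interface is assumed):

```python
def register_custom_classifier(metric_bundle, customized_clf):
    # Register the user-supplied classifier under its string name and make it the
    # target of the adversarial attack metrics, mirroring Benchmark.__init__ above.
    name = str(customized_clf)
    metric_bundle.add_classifier(name, customized_clf)
    metric_bundle.set_target_classifier_by_name(name)
    return name
```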
20 changes: 10 additions & 10 deletions fibber/metrics/__init__.py
@@ -1,16 +1,16 @@
from fibber.metrics.bert_clf_prediction import BertClfPrediction
from fibber.metrics.edit_distance import EditDistance
from fibber.metrics.glove_semantic_similarity import GloVeSemanticSimilarity
from fibber.metrics.gpt2_grammar_quality import GPT2GrammarQuality
from fibber.metrics.bert_classifier import BertClassifier
from fibber.metrics.edit_distance_metric import EditDistanceMetric
from fibber.metrics.glove_semantic_similarity_metric import GloVeSemanticSimilarityMetric
from fibber.metrics.gpt2_grammar_quality_metric import GPT2GrammarQualityMetric
from fibber.metrics.metric_base import MetricBase
from fibber.metrics.metric_utils import MetricBundle
from fibber.metrics.use_semantic_similarity import USESemanticSimilarity
from fibber.metrics.use_semantic_similarity_metric import USESemanticSimilarityMetric

__all__ = [
"BertClfPrediction",
"EditDistance",
"GloVeSemanticSimilarity",
"GPT2GrammarQuality",
"BertClassifier",
"EditDistanceMetric",
"GloVeSemanticSimilarityMetric",
"GPT2GrammarQualityMetric",
"MetricBase",
"USESemanticSimilarity",
"USESemanticSimilarityMetric",
"MetricBundle"]
fibber/metrics/attack_aggregation.py → fibber/metrics/attack_aggregation_utils.py
@@ -6,7 +6,7 @@

import numpy as np

from fibber.metrics.edit_distance import EditDistance
from fibber.metrics.edit_distance_metric import EditDistanceMetric
from fibber.metrics.metric_utils import DIRECTION_HIGHER_BETTER, DIRECTION_LOWER_BETTER


@@ -17,10 +17,10 @@ def paraphrase_classification_accuracy_agg_fn_constructor(
The aggregation function outputs the after attack accuracy of the BERT classifier.

Args:
gpt2_ppl_threshold (float): the threshold for ``GPT2GrammarQuality`` metric. The
gpt2_ppl_threshold (float): the threshold for ``GPT2GrammarQualityMetric`` metric. The
adversarial example should have the perplexity ratio measured by GPT2 lower than
this threshold.
use_sim_threshold (float): the threshold for ``USESemanticSimilarity`` metric. The
use_sim_threshold (float): the threshold for ``USESemanticSimilarityMetric`` metric. The
adversarial example should have the USE similarity higher than this threshold.
target_clf (str): the metric name of the target classifier.
"""
@@ -30,16 +30,16 @@ def agg_fn(data_record):
return 0
for item in data_record["paraphrase_metrics"]:
if (item[target_clf] != data_record["label"]
and item["GPT2GrammarQuality"] < gpt2_ppl_threshold
and item["USESemanticSimilarity"] > use_sim_threshold):
and item["GPT2GrammarQualityMetric"] < gpt2_ppl_threshold
and item["USESemanticSimilarityMetric"] > use_sim_threshold):
return 0
return 1

return agg_fn


def editing_distance_element_worker(x):
editing_distance_metric = EditDistance()
editing_distance_metric = EditDistanceMetric()
return editing_distance_metric.measure_example(x[0], x[1])


@@ -63,11 +63,12 @@ def pairwise_editing_distance_fn(data_record):
return float(np.mean(distance))


def get_best_adv_by_sim(data_record, gpt2_ppl_threshold=5, use_sim_threshold=0.85):
def get_best_adv_by_sim(data_record, target_clf, gpt2_ppl_threshold=5, use_sim_threshold=0.85):
"""Find the adversarial example with best similarity.

Args:
data_record (dict): a data record with paraphrases and metrics.
target_clf (str): the targeted classifier.
gpt2_ppl_threshold (float): the gpt2 perplexity ratio threshold for a legitimate
adversarial example.
use_sim_threshold (float): the USE cosine similarity threshold for a legitimate adversarial
@@ -76,26 +77,27 @@ def get_best_adv_by_sim(data_record, gpt2_ppl_threshold=5, use_sim_threshold=0.8
(dict): the metrics of the best adversarial example. None if no legitimate adversarial
example is found.
"""
if data_record["original_text_metrics"]["BertClfPrediction"] != data_record["label"]:
if data_record["original_text_metrics"][target_clf] != data_record["label"]:
return None
best_score = -1
best_metrics = None
for metrics in data_record["paraphrase_metrics"]:
if (metrics["BertClfPrediction"] == data_record["label"]
or metrics["GPT2GrammarQuality"] > gpt2_ppl_threshold
or metrics["USESemanticSimilarity"] < use_sim_threshold):
if (metrics[target_clf] == data_record["label"]
or metrics["GPT2GrammarQualityMetric"] > gpt2_ppl_threshold
or metrics["USESemanticSimilarityMetric"] < use_sim_threshold):
continue
if metrics["USESemanticSimilarity"] > best_score:
best_score = metrics["USESemanticSimilarity"]
if metrics["USESemanticSimilarityMetric"] > best_score:
best_score = metrics["USESemanticSimilarityMetric"]
best_metrics = metrics
return best_metrics


def get_best_adv_by_ppl(data_record, gpt2_ppl_threshold=5, use_sim_threshold=0.85):
def get_best_adv_by_ppl(data_record, target_clf, gpt2_ppl_threshold=5, use_sim_threshold=0.85):
"""Find the adversarial example with lowest perplexity.

Args:
data_record (dict): a data record with paraphrases and metrics.
target_clf (str): the targeted classifier.
gpt2_ppl_threshold (float): the gpt2 perplexity ratio threshold for a legitimate
adversarial example.
use_sim_threshold (float): the USE cosine similarity threshold for a legitimate adversarial
@@ -104,22 +106,22 @@ def get_best_adv_by_ppl(data_record, gpt2_ppl_threshold=5, use_sim_threshold=0.8
(dict): the metrics of the best adversarial example. None if no legitimate adversarial
example is found.
"""
if data_record["original_text_metrics"]["BertClfPrediction"] != data_record["label"]:
if data_record["original_text_metrics"][target_clf] != data_record["label"]:
return None
best_score = 1e8
best_metrics = None
for metrics in data_record["paraphrase_metrics"]:
if (metrics["BertClfPrediction"] == data_record["label"]
or metrics["GPT2GrammarQuality"] > gpt2_ppl_threshold
or metrics["USESemanticSimilarity"] < use_sim_threshold):
if (metrics[target_clf] == data_record["label"]
or metrics["GPT2GrammarQualityMetric"] > gpt2_ppl_threshold
or metrics["USESemanticSimilarityMetric"] < use_sim_threshold):
continue
if metrics["GPT2GrammarQuality"] < best_score:
best_score = metrics["GPT2GrammarQuality"]
if metrics["GPT2GrammarQualityMetric"] < best_score:
best_score = metrics["GPT2GrammarQualityMetric"]
best_metrics = metrics
return best_metrics


def get_best_adv_metric_fn_constructor(get_best_adv_fn, metric_name,
def get_best_adv_metric_fn_constructor(get_best_adv_fn, metric_name, target_clf,
gpt2_ppl_threshold=5, use_sim_threshold=0.85):
"""Returns an aggregation function that extracts the value of a specified metric for the best
adversarial example.
@@ -130,6 +132,7 @@ def get_best_adv_metric_fn_constructor(get_best_adv_fn, metric_name,
get_best_adv_fn (fn): a function that returns the metric dict of the best adversarial
example.
metric_name (str): a metric name.
target_clf (str): the targeted classifier.
gpt2_ppl_threshold (float): the gpt2 perplexity ratio threshold for a legitimate
adversarial example.
use_sim_threshold (float): the USE cosine similarity threshold for a legitimate adversarial
Expand All @@ -138,7 +141,8 @@ def get_best_adv_metric_fn_constructor(get_best_adv_fn, metric_name,
(fn): an aggregation function that takes data_record as an input.
"""
def agg_fn(data_record):
best_metrics = get_best_adv_fn(data_record, gpt2_ppl_threshold, use_sim_threshold)
best_metrics = get_best_adv_fn(data_record, target_clf,
gpt2_ppl_threshold, use_sim_threshold)
if best_metrics is not None:
return best_metrics[metric_name]
return math.nan
@@ -178,6 +182,7 @@ def add_sentence_level_adversarial_attack_metrics(
for metric_name in metric_bundle.get_metric_names():
metric_bundle.add_advanced_aggregation_fn(
"%s_best_sim_adv_%s" % (classifier_name, metric_name),
get_best_adv_metric_fn_constructor(get_best_adv_by_sim, metric_name),
get_best_adv_metric_fn_constructor(
get_best_adv_by_sim, metric_name, metric_bundle.get_target_classifier_name()),
metric_bundle.get_metric_direction(metric_name)
)
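The per-record checks in this file (after-attack accuracy plus the perplexity and similarity thresholds for a legitimate adversarial example) can be illustrated with a self-contained toy record; no fibber imports, the record layout mirrors the keys read in this file, and all values are made up:

```python
# A record counts as successfully attacked (returns 0) if the original text is already
# misclassified, or if any paraphrase flips the target classifier while staying under
# the GPT2 perplexity-ratio threshold and above the USE similarity threshold.
def record_after_attack_correct(data_record, target_clf="BertClassifier",
                                gpt2_ppl_threshold=5, use_sim_threshold=0.85):
    if data_record["original_text_metrics"][target_clf] != data_record["label"]:
        return 0  # already misclassified before the attack
    for item in data_record["paraphrase_metrics"]:
        if (item[target_clf] != data_record["label"]
                and item["GPT2GrammarQualityMetric"] < gpt2_ppl_threshold
                and item["USESemanticSimilarityMetric"] > use_sim_threshold):
            return 0  # a legitimate adversarial paraphrase was found
    return 1  # the classifier survives every legitimate paraphrase


example_record = {
    "label": 1,
    "original_text_metrics": {"BertClassifier": 1},
    "paraphrase_metrics": [
        {"BertClassifier": 0, "GPT2GrammarQualityMetric": 1.3,
         "USESemanticSimilarityMetric": 0.93},
    ],
}
print(record_after_attack_correct(example_record))  # 0: the attack succeeded
```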