diff --git a/asf_public_discourse_home_decarbonisation/analysis/topic_analysis/topic_analysis_heat_pump_keywords.py b/asf_public_discourse_home_decarbonisation/analysis/topic_analysis/topic_analysis_heat_pump_keywords.py
index bfe5d34..7dc19a7 100644
--- a/asf_public_discourse_home_decarbonisation/analysis/topic_analysis/topic_analysis_heat_pump_keywords.py
+++ b/asf_public_discourse_home_decarbonisation/analysis/topic_analysis/topic_analysis_heat_pump_keywords.py
@@ -1,3 +1,4 @@
+# %%
 """
 This script is used to perform topic analysis on the data from the MSE and Buildhub forums,
 focusing on the mentions of "heat pump" related keywords.
@@ -27,6 +28,7 @@
 from asf_public_discourse_home_decarbonisation.utils.topic_analysis_utils import (
     create_bar_plot_most_common_topics,
     get_outputs_from_topic_model,
+    distribution_of_length_outliers_and_others,
 )
 
 # %% [markdown]
@@ -41,57 +43,14 @@
 bh_data = get_bh_data(category="all", collection_date="24_02_01")
 
 # %%
-# Replacing abbreviations
-mse_data["title"] = mse_data["title"].apply(
-    lambda x: x.lower()
-    .replace("ashps", "air source heat pumps")
-    .replace("ashp", "air source heat pump")
-    .replace("gshps", "ground source heat pumps")
-    .replace("gshp", "ground source heat pump")
-    .replace("hps", "heat pumps")
-    .replace("hp", "heat pump")
-    .replace("ufh", "under floor heating")
-)
-mse_data["text"] = mse_data["text"].apply(
-    lambda x: x.lower()
-    .replace("ashps", "air source heat pumps")
-    .replace("ashp", "air source heat pump")
-    .replace("gshps", "ground source heat pumps")
-    .replace("gshp", "ground source heat pump")
-    .replace("hps", "heat pumps")
-    .replace("hp", "heat pump")
-    .replace("ufh", "under floor heating")
-)
-# Replacing abbreviations
-bh_data["title"] = (
-    bh_data["title"]
-    .astype(str)
-    .apply(
-        lambda x: x.lower()
-        .replace("ashps", "air source heat pumps")
-        .replace("ashp", "air source heat pump")
-        .replace("gshps", "ground source heat pumps")
-        .replace("gshp", "ground source heat pump")
-        .replace("hps", "heat pumps")
-        .replace("hp", "heat pump")
-        .replace("ufh", "under floor heating")
-    )
-)
-bh_data["text"] = (
-    bh_data["text"]
-    .astype(str)
-    .apply(
-        lambda x: x.lower()
-        .replace("ashps", "air source heat pumps")
-        .replace("ashp", "air source heat pump")
-        .replace("gshps", "ground source heat pumps")
-        .replace("gshp", "ground source heat pump")
-        .replace("hps", "heat pumps")
-        .replace("hp", "heat pump")
-        .replace("ufh", "under floor heating")
-    )
+from asf_public_discourse_home_decarbonisation.pipeline.bert_topic_analysis.evaluate_bertopic_results import (
+    process_abbreviations,
 )
 
+# %%
+mse_data = process_abbreviations(mse_data)
+bh_data = process_abbreviations(bh_data)
+
 # %%
 mse_data["category"].unique()
 
@@ -123,7 +82,6 @@
 topic_model = BERTopic(umap_model=umap_model)
 topics, probs = topic_model.fit_transform(docs)
 
-
 # %%
 topics, topics_info, doc_info = get_outputs_from_topic_model(topic_model, docs)
 
@@ -131,7 +89,7 @@
 topics_info.head()
 
 # %%
-doc_info.head()
+distribution_of_length_outliers_and_others(doc_info)
 
 # %%
 create_bar_plot_most_common_topics(topics_info=topics_info, top_n_topics=16)
@@ -159,7 +117,6 @@
 # %%
 topic_model.visualize_term_rank()
 
-
 # %%
 
@@ -188,7 +145,6 @@
 topic_model = BERTopic(umap_model=umap_model)
 topics, probs = topic_model.fit_transform(docs)
 
-
 # %%
 topics, topics_info, doc_info = get_outputs_from_topic_model(topic_model, docs)
 
@@ -198,6 +154,9 @@
 # %%
 doc_info.head()
 
+# %%
+distribution_of_length_outliers_and_others(doc_info)
+
 # %%
 create_bar_plot_most_common_topics(topics_info=topics_info, top_n_topics=16)
diff --git a/asf_public_discourse_home_decarbonisation/pipeline/bert_topic_analysis/evaluate_bertopic_results.py b/asf_public_discourse_home_decarbonisation/pipeline/bert_topic_analysis/evaluate_bertopic_results.py
index 4a82435..08e36af 100644
--- a/asf_public_discourse_home_decarbonisation/pipeline/bert_topic_analysis/evaluate_bertopic_results.py
+++ b/asf_public_discourse_home_decarbonisation/pipeline/bert_topic_analysis/evaluate_bertopic_results.py
@@ -42,6 +42,7 @@
 )
 from asf_public_discourse_home_decarbonisation.config.plotting_configs import (
     set_plotting_styles,
+    NESTA_COLOURS,
 )
 
 import logging
@@ -79,6 +80,12 @@ def argparser() -> argparse.Namespace:
         help="Path to data file, if not standard forum data.",
         default=None,
     )
+    parser.add_argument(
+        "--process_abbreviations",
+        action=argparse.BooleanOptionalAction,
+        help="Whether to process abbreviations",
+        default=True,
+    )
     args = parser.parse_args()
     return args
 
@@ -105,6 +112,34 @@ def get_configuration_params_file(path_to_config_file: str) -> tuple:
     return data_source_params, model_and_additional_params
 
 
+def process_abbreviations(data: pd.DataFrame) -> pd.DataFrame:
+    """
+    Replaces abbreviations in the title and text columns of the dataframe with their full forms.
+
+    Args:
+        data (pd.DataFrame): dataframe to process
+
+    Returns:
+        pd.DataFrame: dataframe with abbreviations replaced
+    """
+    for col in ["title", "text"]:
+        data[col] = (
+            data[col]
+            .astype(str)
+            .apply(
+                lambda x: x.lower()
+                .replace("ashps", "air source heat pumps")
+                .replace("ashp", "air source heat pump")
+                .replace("gshps", "ground source heat pumps")
+                .replace("gshp", "ground source heat pump")
+                .replace("hps", "heat pumps")
+                .replace("hp", "heat pump")
+                .replace("ufh", "under floor heating")
+            )
+        )
+    return data
+
+
 def create_boxplots_with_results(output_configs: dict):
     """
     Creates and saves a figure with boxplots showing the distribution of results for:
@@ -122,9 +157,15 @@
     n_docs = output_configs["n_docs"]
     n_runs = output_configs["n_runs"]
     keywords = output_configs["keywords"]
-    number_of_outliers_df = pd.DataFrame(output_configs["outliers"])
-    number_of_topics_df = pd.DataFrame(output_configs["topics"])
-    avg_probablity_df = pd.DataFrame(output_configs["probabilities"])
+    number_of_outliers_df = pd.DataFrame.from_dict(
+        output_configs["outliers"], orient="index"
+    ).T
+    number_of_topics_df = pd.DataFrame.from_dict(
+        output_configs["topics"], orient="index"
+    ).T
+    avg_probablity_df = pd.DataFrame.from_dict(
+        output_configs["probabilities"], orient="index"
+    ).T
 
     # Plot horizontal boxplots for each dataframe: number_of_topics_df, number_of_outliers_df and avg_probablity_df
     fig, axes = plt.subplots(3, 1, figsize=(10, number_of_topics_df.shape[1] * 3))
@@ -160,6 +201,92 @@
     plt.close()
 
 
+def plot_distribution_of_outliers(output_configs: dict):
+    """
+    Creates and saves a figure with a bar chart (or a histogram, when there are
+    more than 20 runs) for each model being assessed, showing the share of documents
+    (y-axis) against the number of runs in which a document is an outlier (x-axis).
+
+    Args:
+        output_configs (dict): dictionary with results information
+    """
+
+    # Extracting information from output_configs dictionary
+    source_name = output_configs["data_source"]
+    category = output_configs["category"]
+    n_docs = output_configs["n_docs"]
+    n_runs = output_configs["n_runs"]
+    keywords = output_configs["keywords"]
+    outliers_vs_docs_dict = output_configs["distribution_outliers"]
+
+    fig, axes = plt.subplots(
+        len(outliers_vs_docs_dict), 1, figsize=(10, len(outliers_vs_docs_dict) * 3)
+    )
+
+    if n_runs <= 20:
+        for i, (model_name, outliers_vs_docs) in enumerate(
+            outliers_vs_docs_dict.items()
+        ):
+            outliers_vs_docs = (
+                outliers_vs_docs.groupby("outlier_count", as_index=False)
+                .nunique()[["outlier_count", "Document"]]
+                .rename(columns={"Document": "num_docs"})
+            )
+            bars = axes[i].bar(
+                outliers_vs_docs["outlier_count"],
+                outliers_vs_docs["num_docs"] / n_docs * 100,
+                color=NESTA_COLOURS[0],
+                edgecolor="white",
+            )
+            axes[i].set_xlabel("# of runs in which a document is an outlier")
+            axes[i].set_title(model_name)
+
+            # Add numbers above the bars
+            for bar in bars:
+                height = bar.get_height()
+                axes[i].text(
+                    bar.get_x() + bar.get_width() / 2,
+                    height,
+                    f"{height:.2f}",
+                    ha="center",
+                    va="bottom",
+                )
+    else:
+        for i, (model_name, outliers_vs_docs) in enumerate(
+            outliers_vs_docs_dict.items()
+        ):
+            axes[i].hist(
+                outliers_vs_docs["outlier_count"],
+                bins="auto",
+                density=True,
+                color=NESTA_COLOURS[0],
+                edgecolor="white",
+            )
+            axes[i].set_xlabel("# of runs in which a document is an outlier")
+            axes[i].set_title(model_name)
+
+    fig.suptitle(
+        "Percentage of docs vs. Number of runs being outlier\nSource: {}, Category: {}\n Keywords filter: {}, # Docs: {}, # Runs: {}".format(
+            source_name,
+            category,
+            keywords,
+            n_docs,
+            n_runs,
+        )
+    )
+
+    plt.tight_layout()
+
+    plt.savefig(
+        os.path.join(
+            TOPIC_ANALYSIS_EVALUATION_PATH,
+            f"outliers_distribution_{source_name}_{category}_{keywords}_{n_runs}runs.png",
+        )
+    )
+
+    plt.close()
+
+
 def run_topic_model_evaluation(n_runs: int, docs: list, model_configs: dict) -> tuple:
     """
     Evaluates a topic model according to a list of model configurations on a specific number of runs.
@@ -186,6 +313,10 @@ def run_topic_model_evaluation(n_runs: int, docs: list, model_configs: dict) ->
     runs_number_of_outliers = []
     runs_avg_prob = []
 
+    docs_outlier_count = pd.DataFrame(columns=["Document", "outlier_count"])
+    docs_outlier_count["Document"] = docs
+    docs_outlier_count["outlier_count"] = 0
+
     # Getting model configurations
     # default values found here: https://maartengr.github.io/BERTopic/api/bertopic.html#bertopic._bertopic.BERTopic.__init__
     nr_topics = model_configs.get("nr_topics", None)
@@ -222,6 +353,18 @@
             )
             # Appending average probability of the documents (not in the outliers' cluster) belonging to topics
             runs_avg_prob.append(doc_info[doc_info["Topic"] > -1]["Probability"].mean())
+
+            # Updating number of times a doc is an outlier
+            docs_outlier_count = docs_outlier_count.merge(
+                right=doc_info[["Document", "Topic"]], on="Document"
+            )
+            docs_outlier_count["outlier_count"] = docs_outlier_count.apply(
+                lambda x: (
+                    x["outlier_count"] + 1 if x["Topic"] == -1 else x["outlier_count"]
+                ),
+                axis=1,
+            )
+            docs_outlier_count.drop(columns="Topic", inplace=True)
         except:
             logger.info(f"Run {i} failed due to no topics being found. Skipping...")
 
@@ -229,11 +372,16 @@
         runs_number_of_topics,
         runs_number_of_outliers,
         runs_avg_prob,
+        docs_outlier_count,
     )
 
 
 def read_and_filter_data(
-    source_name: str, category: str, path_to_data_file: str, keywords: list
+    source_name: str,
+    category: str,
+    path_to_data_file: str,
+    keywords: list,
+    proc_abbreviations: bool,
 ) -> pd.DataFrame:
     """
     Loads and filters data before applying topic analysis.
@@ -243,6 +391,7 @@
         category (str): category e.g. "all"
         path_to_data_file (str): path to data file if not the standard forum data
         keywords (list): list of keywords to filter the data
+        proc_abbreviations (bool): whether to process abbreviations in the data
 
     Returns:
         pd.DataFrame: filtered data
@@ -255,6 +404,9 @@
     else:
         data = get_bh_data(category)
 
+    if proc_abbreviations:
+        data = process_abbreviations(data)
+
     # filter the data based on the keywords
     if keywords is not None:
         data = data[
@@ -273,6 +425,7 @@
     n_runs = args.n_runs
    path_to_config_file = args.path_to_config_file
     path_to_data_file = args.path_to_data_file
+    proc_abbreviations = args.process_abbreviations
 
     (
         data_source_params,
@@ -290,12 +443,13 @@
         category = slice.get("category")
         keywords = slice.get("keywords")
         data = read_and_filter_data(
-            source_name, category, path_to_data_file, keywords
+            source_name, category, path_to_data_file, keywords, proc_abbreviations
         )
 
         number_of_topics_dict = dict()
         number_of_outliers_dict = dict()
         avg_probablity_dict = dict()
+        distribution_outliers_dict = dict()
 
         # for each model specification run and evaluate the model
         for model_param in model_and_additional_params:
@@ -315,11 +469,13 @@
                 runs_number_of_topics,
                 runs_number_of_outliers,
                 runs_avg_prob,
+                distribution_outliers,
             ) = run_topic_model_evaluation(n_runs, docs, model_param)
 
             number_of_topics_dict[model_name] = runs_number_of_topics
             number_of_outliers_dict[model_name] = runs_number_of_outliers
             avg_probablity_dict[model_name] = runs_avg_prob
+            distribution_outliers_dict[model_name] = distribution_outliers
 
         # Create config dictionary with outputs
         output_configs = {
@@ -331,7 +487,10 @@
             "outliers": number_of_outliers_dict,
             "topics": number_of_topics_dict,
             "probabilities": avg_probablity_dict,
+            "distribution_outliers": distribution_outliers_dict,
         }
 
         # Plotting results
         create_boxplots_with_results(output_configs)
+
+        plot_distribution_of_outliers(output_configs)
diff --git a/asf_public_discourse_home_decarbonisation/utils/topic_analysis_utils.py b/asf_public_discourse_home_decarbonisation/utils/topic_analysis_utils.py
index 3d28098..0bef86a 100644
--- a/asf_public_discourse_home_decarbonisation/utils/topic_analysis_utils.py
+++ b/asf_public_discourse_home_decarbonisation/utils/topic_analysis_utils.py
@@ -55,3 +55,32 @@ def get_outputs_from_topic_model(topic_model, docs: list) -> pd.DataFrame:
     doc_info = topic_model.get_document_info(docs)
 
     return topics, topics_info, doc_info
+
+
+def distribution_of_length_outliers_and_others(doc_info: pd.DataFrame):
+    """
+    Creates a histogram showing the distribution of document lengths
+    for documents in the outliers' cluster versus all other documents.
+
+    Args:
+        doc_info (pd.DataFrame): dataframe with information about documents.
+ """ + doc_info["doc_length"] = doc_info["Document"].str.len() + + plt.figure(figsize=(6, 4)) + plt.hist( + doc_info[doc_info["Topic"] != -1]["doc_length"], + bins=30, + density=True, + alpha=0.6, + ) + plt.hist( + doc_info[doc_info["Topic"] == -1]["doc_length"], + bins=30, + color="red", + density=True, + alpha=0.6, + ) + plt.legend(["Other clusters", "Outlier cluster"]) + plt.xlabel("Length of documents") + plt.ylabel("Density")