Merge remote-tracking branch 'origin/main'
noaahh committed Jun 5, 2024
2 parents ce146d0 + dafef79 commit 932688d
Showing 9 changed files with 460 additions and 152 deletions.
246 changes: 97 additions & 149 deletions notebooks/embedding_analysis.ipynb
@@ -50,15 +50,99 @@
"metadata": {},
"outputs": [],
"source": [
"WL_DATA_DIR = os.getenv('DATA_DIR', 'data')\n",
"WL_DATA_DIR = os.path.abspath(os.path.join(parent_dir, WL_DATA_DIR, 'weak_labelled'))\n",
"DATA_DIR = os.getenv('DATA_DIR', 'data')\n",
"EMBEDDING_DATA_DIR = os.path.abspath(os.path.join(parent_dir, DATA_DIR, 'embeddings'))\n",
"\n",
"weak_labelled = {}\n",
"\n",
"print(f\"Reading weak labelled data from {WL_DATA_DIR}\")\n",
"for file in os.listdir(WL_DATA_DIR):\n",
" weak_labelled[file] = pd.read_parquet(os.path.join(WL_DATA_DIR, file))\n",
" print(f\"- Read {file}\")"
"print(f\"Reading weak labelled data from {EMBEDDING_DATA_DIR}\")\n",
"\n",
"embedding_model_dirs = [d for d in os.listdir(EMBEDDING_DATA_DIR) if os.path.isdir(os.path.join(EMBEDDING_DATA_DIR, d))]\n",
"embeddings = {}\n",
"\n",
"for dir in embedding_model_dirs:\n",
" print(f\"- Opening Embeddings from {dir}\")\n",
" curr_embeddings = {}\n",
" for file in os.listdir(os.path.join(EMBEDDING_DATA_DIR, dir)):\n",
" if file.endswith('.pkl'):\n",
" filename = file.split('.')[0]\n",
" curr_embeddings[filename] = pd.read_pickle(os.path.join(EMBEDDING_DATA_DIR, dir, file))\n",
" print(f\" - Read {file}\")\n",
" embeddings[dir] = curr_embeddings"
+ ]
+ },
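For orientation, the loading loop added above implies an on-disk layout of `EMBEDDING_DATA_DIR/<model_name>/<partition>.pkl`. Below is a compact, hedged equivalent of that loader; the helper name and the example keys in the trailing comment are ours, not the notebook's, and `os.path.splitext` is used instead of `split('.')` so dotted filenames keep their full stem:

```python
import os
import pandas as pd

def load_embeddings(root: str) -> dict:
    """Build {model_name: {partition_name: unpickled object}} from root/<model>/<partition>.pkl."""
    result = {}
    for model_name in sorted(os.listdir(root)):
        model_dir = os.path.join(root, model_name)
        if not os.path.isdir(model_dir):
            continue  # skip stray files at the top level
        result[model_name] = {
            os.path.splitext(fname)[0]: pd.read_pickle(os.path.join(model_dir, fname))
            for fname in os.listdir(model_dir)
            if fname.endswith('.pkl')
        }
    return result

# e.g. embeddings = load_embeddings(EMBEDDING_DATA_DIR)
# -> {'mini_lm': {'train_partition': <array-like>, ...}, ...}  (keys depend on the files present)
```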
+ {
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"PARTITIONS_DATA_DIR = os.path.abspath(os.path.join(parent_dir, DATA_DIR, 'partitions'))\n",
"\n",
"print(f\"Reading partitions data from {PARTITIONS_DATA_DIR}\") \n",
"\n",
"partitions = {}\n",
"\n",
"for file in os.listdir(PARTITIONS_DATA_DIR):\n",
" if file.endswith('.parquet'):\n",
" filename = file.split('.')[0]\n",
" partitions[filename] = pd.read_parquet(os.path.join(PARTITIONS_DATA_DIR, file))\n",
" print(f'- Read {file}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Merging content, title and label to embedding vectors"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"merged_partitions = {}\n",
"\n",
"for embedding_model in embeddings:\n",
" merged_partitions[embedding_model] = {}\n",
" print(f'For {embedding_model}:')\n",
" for partition in partitions:\n",
" curr_partition_name = partition.split('_')[0]\n",
" embeddings_keys = embeddings[embedding_model].keys()\n",
" \n",
" for embedding_key in embeddings_keys:\n",
" if curr_partition_name == embedding_key.split('_')[0]:\n",
" partition_data = partitions[partition]\n",
" embedding_data = embeddings[embedding_model][embedding_key]\n",
" partition_data['embedding'] = embedding_data.tolist()\n",
" \n",
" merged_partitions[embedding_model][partition] = partition_data\n",
" \n",
" print(f\"- Merged {embedding_key} with {partition}\")"
]
},
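The merge cell above pairs each partition with the embedding whose key shares the same prefix before the first underscore, then writes the vectors into the partition DataFrame in place. A sketch of the same matching rule with two defensive additions, a row-count check and a copy; these guards are our assumption of good practice, not part of the commit:

```python
def merge_embeddings(partitions: dict, embeddings: dict) -> dict:
    """Pair partition DataFrames and embedding arrays by their prefix before the first '_'."""
    merged = {}
    for model, model_embeddings in embeddings.items():
        merged[model] = {}
        for part_name, part_df in partitions.items():
            prefix = part_name.split('_')[0]
            for emb_key, vectors in model_embeddings.items():
                if emb_key.split('_')[0] != prefix:
                    continue
                if len(vectors) != len(part_df):  # rows must align one-to-one
                    raise ValueError(f"{emb_key}: {len(vectors)} vectors vs {len(part_df)} rows in {part_name}")
                out = part_df.copy()  # avoid mutating the shared partition DataFrame
                out['embedding'] = list(vectors)
                merged[model][part_name] = out
    return merged
```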
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# concatenate all partitions for each embedding model\n",
+ "for embedding_model in merged_partitions:\n",
+ " print(f\"Concatenating partitions for {embedding_model}\")\n",
+ " partitions_data = pd.concat(merged_partitions[embedding_model].values(), ignore_index=True)\n",
+ " merged_partitions[embedding_model] = partitions_data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "merged_partitions['mini_lm']"
]
},
{
@@ -81,7 +165,6 @@
"import pandas as pd\n",
"from sklearn.decomposition import PCA\n",
"\n",
"\n",
"def break_content(text, length=50):\n",
" lines = []\n",
" while len(text) > length:\n",
@@ -100,18 +183,16 @@
"\n",
" df = weak_labelled[key]\n",
"\n",
" embeddings = np.vstack(df['embedding_vec'].values)\n",
" labels = np.array(df['label'].values)\n",
" embeddings = np.vstack(df['embedding'].values)\n",
" content = df['content'].apply(lambda x: break_content(x)).values\n",
"\n",
" pca = PCA(n_components=3)\n",
" reduced_embeddings = pca.fit_transform(embeddings)\n",
"\n",
" pca_df = pd.DataFrame(reduced_embeddings, columns=['PCA1', 'PCA2', 'PCA3'])\n",
" pca_df['Label'] = labels\n",
" pca_df['Content'] = content\n",
"\n",
" fig = px.scatter_3d(pca_df, x='PCA1', y='PCA2', z='PCA3', color='Label',\n",
" fig = px.scatter_3d(pca_df, x='PCA1', y='PCA2', z='PCA3',\n",
" title=f'PCA of Embedding Vectors for {key}',\n",
" size_max=5, opacity=0.6, height=800,\n",
" hover_data={'Content': True})\n",
@@ -123,8 +204,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### KNN Weak Labels\n",
"For this first view onto the embedding space we will look at how the KNN labels the sentiments."
"### MiniLM Embedding Space"
]
},
{
@@ -137,21 +217,17 @@
"\n",
"knn_key = 'mlp_weak_labeling_weaklabels.parquet'\n",
"\n",
"knn_wl_ds = create_dataset('knn', weak_labelled[knn_key],\n",
" weak_labelled[knn_key]['embedding_vec'],\n",
" weak_labelled[knn_key]['label'])\n",
"mini_lm_ds = create_dataset('mini_lm', merged_partitions['mini_lm'], merged_partitions['mini_lm']['embedding'], content=merged_partitions['mini_lm']['content'])\n",
"\n",
"px_session = launch_px(knn_wl_ds, None)\n",
"px_session = launch_px(mini_lm_ds, None)\n",
"px_session.view()"
]
},
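`create_dataset` and `launch_px` are project helpers whose definitions are not part of this diff. Assuming they wrap Arize Phoenix (which the `px_session` naming suggests), a standalone equivalent might look roughly like the sketch below; treat the exact class names as assumptions, since Phoenix renamed `px.Dataset` to `px.Inferences` across releases:

```python
import phoenix as px

df = merged_partitions['mini_lm']  # assumed to contain 'embedding' and 'content' columns
schema = px.Schema(
    embedding_feature_column_names={
        "embedding": px.EmbeddingColumnNames(
            vector_column_name="embedding",  # column of equal-length vectors
            raw_data_column_name="content",  # text shown alongside each point
        )
    }
)
inferences = px.Inferences(dataframe=df, schema=schema, name="mini_lm")  # px.Dataset in older releases
session = px.launch_app(primary=inferences)
session.view()
```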
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cluster distancing itself the farthest seems to consist almost solely of **music album reviews** that were labeled with `1`, meaning a positive sentiment.\n",
"\n",
"Otherwise, the labels don't show a specific pattern or clustering. However, the embedding positions in the space are relatively clearly clustered: We are using the embedding vectors of huggingface's `all-MiniLM-L6-v2` BERT sentence transformer. This sentence transformer was trained on sentence pairs that appear as a Q&A. The resulting vector embedding therefore describes the semantic content of such a sentence - This is exactly what we can see in the embedding space; Reviews of the `amazon-polarity` dataset are clustered together according to their product niche, as for example already mentioned with the cluster containing music reviews.\n",
"The embedding positions in the space are relatively clearly clustered. In this first example we are using the embedding vectors of huggingface's `all-MiniLM-L6-v2` BERT sentence transformer. This sentence transformer was trained on sentence pairs that appear as a Q&A. The resulting vector embedding therefore describes the semantic content of such a sentence - This is exactly what we can see in the embedding space; Reviews of the `amazon-polarity` dataset are clustered together according to their product niche, as for example already mentioned with the cluster containing music reviews.\n",
"\n",
"But we can also observe other semantic relationships:\n",
"- Video game reviews lie between music and book reviews: This axis could perhaps describe interactivity; music can be enjoyed passively, games do have some interactions between cutscenes while books capture ones concentration and attention entirely.\n",
@@ -172,7 +248,7 @@
"metadata": {},
"outputs": [],
"source": [
"plot_pca(weak_labelled, knn_key)"
"plot_pca(merged_partitions, 'mini_lm')"
]
},
{
@@ -185,134 +261,6 @@
"- Movies\n",
"- Tech Gadgets"
]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Using a sentence transformer with higher dimensionality"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "log_reg_key = 'log_reg_weak_labeling_weaklabels.parquet'\n",
- "\n",
- "log_reg_wl_ds = create_dataset('log_reg', weak_labelled[log_reg_key],\n",
- " weak_labelled[log_reg_key]['embedding_vec'],\n",
- " weak_labelled[log_reg_key]['label'])\n",
- "\n",
- "px_session = launch_px(log_reg_wl_ds, None)\n",
- "px_session.view()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "plot_pca(weak_labelled, log_reg_key)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Multi-Layer Perceptron Weak Labelling"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "mlp_key = 'mlp_weak_labeling_weaklabels.parquet'\n",
- "\n",
- "mlp_wl_ds = create_dataset('mlp_reg', weak_labelled[mlp_key],\n",
- " weak_labelled[mlp_key]['embedding_vec'],\n",
- " weak_labelled[mlp_key]['label'])\n",
- "\n",
- "px_session = launch_px(mlp_wl_ds, None)\n",
- "px_session.view()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "plot_pca(weak_labelled, mlp_key)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Random Forest Weak Labelling"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "rf_key = 'rf_weak_labeling_weaklabels.parquet'\n",
- "\n",
- "rf_wl_ds = create_dataset('rf_reg', weak_labelled[rf_key],\n",
- " weak_labelled[rf_key]['embedding_vec'],\n",
- " weak_labelled[rf_key]['label'])\n",
- "\n",
- "px_session = launch_px(rf_wl_ds, None)\n",
- "px_session.view()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "plot_pca(weak_labelled, rf_key)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Support Vector Machine Weak Labelling"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "svm_key = 'svm_weak_labeling_weaklabels.parquet'\n",
- "\n",
- "svm_wl_ds = create_dataset('svm_reg', weak_labelled[svm_key],\n",
- " weak_labelled[svm_key]['embedding_vec'],\n",
- " weak_labelled[svm_key]['label'])\n",
- "\n",
- "px_session = launch_px(svm_wl_ds, None)\n",
- "px_session.view()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "plot_pca(weak_labelled, svm_key)"
- ]
- }
],
"metadata": {