Merge remote-tracking branch 'origin/main'
noaahh committed Jun 5, 2024
2 parents ce146d0 + dafef79 commit 932688d
Showing 9 changed files with 460 additions and 152 deletions.
246 changes: 97 additions & 149 deletions notebooks/embedding_analysis.ipynb
@@ -50,15 +50,99 @@
"metadata": {},
"outputs": [],
"source": [
"WL_DATA_DIR = os.getenv('DATA_DIR', 'data')\n",
"WL_DATA_DIR = os.path.abspath(os.path.join(parent_dir, WL_DATA_DIR, 'weak_labelled'))\n",
"DATA_DIR = os.getenv('DATA_DIR', 'data')\n",
"EMBEDDING_DATA_DIR = os.path.abspath(os.path.join(parent_dir, DATA_DIR, 'embeddings'))\n",
"\n",
"weak_labelled = {}\n",
"\n",
"print(f\"Reading weak labelled data from {WL_DATA_DIR}\")\n",
"for file in os.listdir(WL_DATA_DIR):\n",
" weak_labelled[file] = pd.read_parquet(os.path.join(WL_DATA_DIR, file))\n",
" print(f\"- Read {file}\")"
"print(f\"Reading weak labelled data from {EMBEDDING_DATA_DIR}\")\n",
"\n",
"embedding_model_dirs = [d for d in os.listdir(EMBEDDING_DATA_DIR) if os.path.isdir(os.path.join(EMBEDDING_DATA_DIR, d))]\n",
"embeddings = {}\n",
"\n",
"for dir in embedding_model_dirs:\n",
" print(f\"- Opening Embeddings from {dir}\")\n",
" curr_embeddings = {}\n",
" for file in os.listdir(os.path.join(EMBEDDING_DATA_DIR, dir)):\n",
" if file.endswith('.pkl'):\n",
" filename = file.split('.')[0]\n",
" curr_embeddings[filename] = pd.read_pickle(os.path.join(EMBEDDING_DATA_DIR, dir, file))\n",
" print(f\" - Read {file}\")\n",
" embeddings[dir] = curr_embeddings"
+ ]
+ },
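For orientation, the loading loop added above implies an on-disk layout of `EMBEDDING_DATA_DIR/<model_name>/<partition>.pkl`. Below is a compact, hedged equivalent of that loader; the helper name and the example keys in the trailing comment are ours, not the notebook's, and `os.path.splitext` is used instead of `split('.')` so dotted filenames keep their full stem:

```python
import os
import pandas as pd

def load_embeddings(root: str) -> dict:
    """Build {model_name: {partition_name: unpickled object}} from root/<model>/<partition>.pkl."""
    result = {}
    for model_name in sorted(os.listdir(root)):
        model_dir = os.path.join(root, model_name)
        if not os.path.isdir(model_dir):
            continue  # skip stray files at the top level
        result[model_name] = {
            os.path.splitext(fname)[0]: pd.read_pickle(os.path.join(model_dir, fname))
            for fname in os.listdir(model_dir)
            if fname.endswith('.pkl')
        }
    return result

# e.g. embeddings = load_embeddings(EMBEDDING_DATA_DIR)
# -> {'mini_lm': {'train_partition': <array-like>, ...}, ...}  (keys depend on the files present)
```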
+ {
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"PARTITIONS_DATA_DIR = os.path.abspath(os.path.join(parent_dir, DATA_DIR, 'partitions'))\n",
"\n",
"print(f\"Reading partitions data from {PARTITIONS_DATA_DIR}\") \n",
"\n",
"partitions = {}\n",
"\n",
"for file in os.listdir(PARTITIONS_DATA_DIR):\n",
" if file.endswith('.parquet'):\n",
" filename = file.split('.')[0]\n",
" partitions[filename] = pd.read_parquet(os.path.join(PARTITIONS_DATA_DIR, file))\n",
" print(f'- Read {file}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Merging content, title and label to embedding vectors"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"merged_partitions = {}\n",
"\n",
"for embedding_model in embeddings:\n",
" merged_partitions[embedding_model] = {}\n",
" print(f'For {embedding_model}:')\n",
" for partition in partitions:\n",
" curr_partition_name = partition.split('_')[0]\n",
" embeddings_keys = embeddings[embedding_model].keys()\n",
" \n",
" for embedding_key in embeddings_keys:\n",
" if curr_partition_name == embedding_key.split('_')[0]:\n",
" partition_data = partitions[partition]\n",
" embedding_data = embeddings[embedding_model][embedding_key]\n",
" partition_data['embedding'] = embedding_data.tolist()\n",
" \n",
" merged_partitions[embedding_model][partition] = partition_data\n",
" \n",
" print(f\"- Merged {embedding_key} with {partition}\")"
]
},
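The merge cell above pairs each partition with the embedding whose key shares the same prefix before the first underscore, then writes the vectors into the partition DataFrame in place. A sketch of the same matching rule with two defensive additions, a row-count check and a copy; these guards are our assumption of good practice, not part of the commit:

```python
def merge_embeddings(partitions: dict, embeddings: dict) -> dict:
    """Pair partition DataFrames and embedding arrays by their prefix before the first '_'."""
    merged = {}
    for model, model_embeddings in embeddings.items():
        merged[model] = {}
        for part_name, part_df in partitions.items():
            prefix = part_name.split('_')[0]
            for emb_key, vectors in model_embeddings.items():
                if emb_key.split('_')[0] != prefix:
                    continue
                if len(vectors) != len(part_df):  # rows must align one-to-one
                    raise ValueError(f"{emb_key}: {len(vectors)} vectors vs {len(part_df)} rows in {part_name}")
                out = part_df.copy()  # avoid mutating the shared partition DataFrame
                out['embedding'] = list(vectors)
                merged[model][part_name] = out
    return merged
```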
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# concatenate all partitions for each embedding model\n",
+ "for embedding_model in merged_partitions:\n",
+ " print(f\"Concatenating partitions for {embedding_model}\")\n",
+ " partitions_data = pd.concat(merged_partitions[embedding_model].values(), ignore_index=True)\n",
+ " merged_partitions[embedding_model] = partitions_data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "merged_partitions['mini_lm']"
]
},
{
@@ -81,7 +165,6 @@
"import pandas as pd\n",
"from sklearn.decomposition import PCA\n",
"\n",
"\n",
"def break_content(text, length=50):\n",
" lines = []\n",
" while len(text) > length:\n",
@@ -100,18 +183,16 @@
"\n",
" df = weak_labelled[key]\n",
"\n",
" embeddings = np.vstack(df['embedding_vec'].values)\n",
" labels = np.array(df['label'].values)\n",
" embeddings = np.vstack(df['embedding'].values)\n",
" content = df['content'].apply(lambda x: break_content(x)).values\n",
"\n",
" pca = PCA(n_components=3)\n",
" reduced_embeddings = pca.fit_transform(embeddings)\n",
"\n",
" pca_df = pd.DataFrame(reduced_embeddings, columns=['PCA1', 'PCA2', 'PCA3'])\n",
" pca_df['Label'] = labels\n",
" pca_df['Content'] = content\n",
"\n",
" fig = px.scatter_3d(pca_df, x='PCA1', y='PCA2', z='PCA3', color='Label',\n",
" fig = px.scatter_3d(pca_df, x='PCA1', y='PCA2', z='PCA3',\n",
" title=f'PCA of Embedding Vectors for {key}',\n",
" size_max=5, opacity=0.6, height=800,\n",
" hover_data={'Content': True})\n",
@@ -123,8 +204,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### KNN Weak Labels\n",
"For this first view onto the embedding space we will look at how the KNN labels the sentiments."
"### MiniLM Embedding Space"
]
},
{
@@ -137,21 +217,17 @@
"\n",
"knn_key = 'mlp_weak_labeling_weaklabels.parquet'\n",
"\n",
"knn_wl_ds = create_dataset('knn', weak_labelled[knn_key],\n",
" weak_labelled[knn_key]['embedding_vec'],\n",
" weak_labelled[knn_key]['label'])\n",
"mini_lm_ds = create_dataset('mini_lm', merged_partitions['mini_lm'], merged_partitions['mini_lm']['embedding'], content=merged_partitions['mini_lm']['content'])\n",
"\n",
"px_session = launch_px(knn_wl_ds, None)\n",
"px_session = launch_px(mini_lm_ds, None)\n",
"px_session.view()"
]
},
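`create_dataset` and `launch_px` are project helpers whose definitions are not part of this diff. Assuming they wrap Arize Phoenix (which the `px_session` naming suggests), a standalone equivalent might look roughly like the sketch below; treat the exact class names as assumptions, since Phoenix renamed `px.Dataset` to `px.Inferences` across releases:

```python
import phoenix as px

df = merged_partitions['mini_lm']  # assumed to contain 'embedding' and 'content' columns
schema = px.Schema(
    embedding_feature_column_names={
        "embedding": px.EmbeddingColumnNames(
            vector_column_name="embedding",  # column of equal-length vectors
            raw_data_column_name="content",  # text shown alongside each point
        )
    }
)
inferences = px.Inferences(dataframe=df, schema=schema, name="mini_lm")  # px.Dataset in older releases
session = px.launch_app(primary=inferences)
session.view()
```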
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cluster distancing itself the farthest seems to consist almost solely of **music album reviews** that were labeled with `1`, meaning a positive sentiment.\n",
"\n",
"Otherwise, the labels don't show a specific pattern or clustering. However, the embedding positions in the space are relatively clearly clustered: We are using the embedding vectors of huggingface's `all-MiniLM-L6-v2` BERT sentence transformer. This sentence transformer was trained on sentence pairs that appear as a Q&A. The resulting vector embedding therefore describes the semantic content of such a sentence - This is exactly what we can see in the embedding space; Reviews of the `amazon-polarity` dataset are clustered together according to their product niche, as for example already mentioned with the cluster containing music reviews.\n",
"The embedding positions in the space are relatively clearly clustered. In this first example we are using the embedding vectors of huggingface's `all-MiniLM-L6-v2` BERT sentence transformer. This sentence transformer was trained on sentence pairs that appear as a Q&A. The resulting vector embedding therefore describes the semantic content of such a sentence - This is exactly what we can see in the embedding space; Reviews of the `amazon-polarity` dataset are clustered together according to their product niche, as for example already mentioned with the cluster containing music reviews.\n",
"\n",
"But we can also observe other semantic relationships:\n",
"- Video game reviews lie between music and book reviews: This axis could perhaps describe interactivity; music can be enjoyed passively, games do have some interactions between cutscenes while books capture ones concentration and attention entirely.\n",
@@ -172,7 +248,7 @@
"metadata": {},
"outputs": [],
"source": [
"plot_pca(weak_labelled, knn_key)"
"plot_pca(merged_partitions, 'mini_lm')"
]
},
{
@@ -185,134 +261,6 @@
"- Movies\n",
"- Tech Gadgets"
]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Using a sentence transformer with higher dimensionality"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "log_reg_key = 'log_reg_weak_labeling_weaklabels.parquet'\n",
- "\n",
- "log_reg_wl_ds = create_dataset('log_reg', weak_labelled[log_reg_key],\n",
- " weak_labelled[log_reg_key]['embedding_vec'],\n",
- " weak_labelled[log_reg_key]['label'])\n",
- "\n",
- "px_session = launch_px(log_reg_wl_ds, None)\n",
- "px_session.view()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "plot_pca(weak_labelled, log_reg_key)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Multi-Layer Perceptron Weak Labelling"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "mlp_key = 'mlp_weak_labeling_weaklabels.parquet'\n",
- "\n",
- "mlp_wl_ds = create_dataset('mlp_reg', weak_labelled[mlp_key],\n",
- " weak_labelled[mlp_key]['embedding_vec'],\n",
- " weak_labelled[mlp_key]['label'])\n",
- "\n",
- "px_session = launch_px(mlp_wl_ds, None)\n",
- "px_session.view()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "plot_pca(weak_labelled, mlp_key)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Random Forest Weak Labelling"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "rf_key = 'rf_weak_labeling_weaklabels.parquet'\n",
- "\n",
- "rf_wl_ds = create_dataset('rf_reg', weak_labelled[rf_key],\n",
- " weak_labelled[rf_key]['embedding_vec'],\n",
- " weak_labelled[rf_key]['label'])\n",
- "\n",
- "px_session = launch_px(rf_wl_ds, None)\n",
- "px_session.view()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "plot_pca(weak_labelled, rf_key)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Support Vector Machine Weak Labelling"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "svm_key = 'svm_weak_labeling_weaklabels.parquet'\n",
- "\n",
- "svm_wl_ds = create_dataset('svm_reg', weak_labelled[svm_key],\n",
- " weak_labelled[svm_key]['embedding_vec'],\n",
- " weak_labelled[svm_key]['label'])\n",
- "\n",
- "px_session = launch_px(svm_wl_ds, None)\n",
- "px_session.view()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "plot_pca(weak_labelled, svm_key)"
- ]
- }
],
"metadata": {