feat(text-clustering): add interpretability
leogail committed Oct 8, 2024
1 parent ba81856 commit 171f8c5
Showing 3 changed files with 238 additions and 80 deletions.
23 changes: 13 additions & 10 deletions services/text-clustering/Dockerfile
@@ -2,7 +2,7 @@
FROM python:3.9-slim-bullseye AS dvc-files
WORKDIR /dvc
RUN apt update && apt -y install git
RUN pip install dvc[webdav]==3.39.0
RUN pip install dvc[webdav]==3.55.2
RUN --mount=type=secret,id=webdav_login \
--mount=type=secret,id=webdav_password \
--mount=type=secret,id=webdav_url \
@@ -12,7 +12,7 @@ RUN --mount=type=secret,id=webdav_login \
dvc remote modify --local webdav-remote user "$(cat /run/secrets/webdav_login)" && \
dvc remote modify --local webdav-remote password "$(cat /run/secrets/webdav_password)"
RUN dvc doctor
COPY ./v1/all-MiniLM-L6-v2.dvc /dvc
COPY ./v1/sinr_vector_scientific_abstract.pk.dvc /dvc
RUN dvc pull -v


@@ -23,17 +23,20 @@ ENV NUMBA_CACHE_DIR=/tmp/numba_cache

USER root
# Install all python dependencies
RUN pip install --no-build-isolation \
--index-url https://download.pytorch.org/whl/cpu \
--extra-index-url https://pypi.org/simple \
sentence-transformers==2.2.2 \
umap-learn==0.5.5 \
scikit-learn==1.4.1.post1 \
prometheus-client==0.19.0
# RUN apt install python3.9-dev \
# build-essential \
# libomp-dev

RUN apt update && apt -y install python3.9-dev build-essential libomp-dev

RUN pip install sinr==1.3.1 \
scikit-learn==1.5.1 \
prometheus-client==0.19.0 \
hdbscan==0.8.38.post1

# Declare files to copy in .dockerignore
COPY --chown=daemon:daemon . /app/public/
RUN mv /app/public/config.json /app && chmod a+w /app/config.json
RUN mkdir /tmp/retrieve

COPY --chown=daemon:daemon --from=dvc-files /dvc/all-MiniLM-L6-v2 /app/public/v1/all-MiniLM-L6-v2
COPY --chown=daemon:daemon --from=dvc-files /dvc/sinr_vector_scientific_abstract.pk /app/public/v1/sinr_vector_scientific_abstract.pk
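
The runtime stage copies the DVC-pulled SINr vectors next to the service code. As a minimal sanity check (the path and working directory are assumptions, mirroring the load call in v1/clustering.py), the artifact can be loaded like this:

import sinr.graph_embeddings as ge

# Assumes the working directory is /app/public, where the .pk file was copied.
sinr_vec = ge.SINrVectors('v1/sinr_vector_scientific_abstract')
sinr_vec.load()
print(sinr_vec.get_number_of_dimensions())  # dimensionality of the interpretable space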
159 changes: 117 additions & 42 deletions services/text-clustering/v1/clustering.py
@@ -1,20 +1,25 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sinr.graph_embeddings as ge
import unicodedata
import numpy as np
import json
import sys
import umap.umap_ as umap
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_distances
# Two hdbscan algos: the hdbscan library and the sklearn implementation
# import hdbscan
from sklearn.cluster import HDBSCAN
import hdbscan

# from prometheus_client import CollectorRegistry, Counter, push_to_gateway
# registry = CollectorRegistry()
# c = Counter('documents', 'Number of documents processed', registry=registry)
# job_name='clustering'

#model
sinr_vec = ge.SINrVectors('v1/sinr_vector_scientific_abstract')
sinr_vec.load()
dim_model = sinr_vec.get_number_of_dimensions()
stopword_list = ["study","abstract","result","prospective","nested"]


def center_reduce(matrix):
"""
@@ -33,7 +38,73 @@ def center_reduce(matrix):

return matrix_center_reduce
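
The body of center_reduce is collapsed in this hunk; given the StandardScaler import above, a minimal equivalent sketch (an assumption, not the committed implementation) is:

from sklearn.preprocessing import StandardScaler
import numpy as np

def center_reduce_sketch(matrix):
    # Center each feature to zero mean and scale it to unit variance.
    return StandardScaler().fit_transform(np.asarray(matrix))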

model = SentenceTransformer('./v1/all-MiniLM-L6-v2')
#normalize text
def uniformize(input_str):
text = ''.join(
c for c in unicodedata.normalize('NFKD', input_str)
if ( unicodedata.category(c) != 'Mn' and c.isalpha() ) or c == "'" or c == ' '
)
return ' '.join(text.lower().split())
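
For illustration, uniformize strips diacritics, punctuation and digits while keeping letters, apostrophes and spaces, then lowercases and collapses whitespace:

# Assuming the uniformize defined above is in scope:
print(uniformize("Étude : l'effet du café 2024 !"))  # -> "etude l'effet du cafe"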


def embedding_sentence(sentence, model, dim_model = dim_model):
len_sentence = 0
embedding = np.zeros(dim_model)
for word in sentence.split(" "):
try:
embedding += model.get_my_vector(word)
len_sentence +=1
except:
continue

if len_sentence > 0:
return embedding/len_sentence, True
return embedding, False
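
embedding_sentence averages the SINr vectors of the words it recognizes and reports whether at least one word was found; out-of-vocabulary words are skipped. A toy illustration with a hypothetical two-word vocabulary (assumes the embedding_sentence defined above is in scope):

import numpy as np

class ToyModel:
    # Hypothetical stand-in for a SINr model: only two words have vectors.
    vectors = {"cancer": np.array([1.0, 0.0]), "detection": np.array([0.0, 1.0])}
    def get_my_vector(self, word):
        return self.vectors[word]  # raises KeyError for unknown words

emb, ok = embedding_sentence("early cancer detection", ToyModel(), dim_model=2)
print(emb, ok)  # [0.5 0.5] True: mean of the two known vectors, "early" is skipped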


def custom_flatten(matrix):
return [item.replace("_", " ") for row in matrix for item in row if (item not in stopword_list and len(item)>2)]


def describe_kbest_dim(model, vec, k, max_words=10, thereshold=0.21):
best_dims = np.argsort(vec)[-k:]
descriptors = []
for dimension in reversed(best_dims):
descriptors.append(model.get_dimension_stereotypes_idx(dimension,max_words))

res = []
for descriptor in descriptors:
res.append([y for x,y in descriptor.get_interpreters() if x>thereshold])
return custom_flatten(res)
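
describe_kbest_dim uses np.argsort to pick the k most activated dimensions, then keeps only the stereotype words whose interpretability score exceeds the threshold. The dimension-selection step in isolation:

import numpy as np

vec = np.array([0.1, 0.9, 0.4, 0.7])
best_dims = np.argsort(vec)[-2:]   # indices of the 2 largest values: array([3, 1])
for dim in reversed(best_dims):
    print(int(dim))                # 1 then 3: strongest dimension first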


def extract_informations_from_cluster(texts, clusterer, model=sinr_vec):
"""Use describe_kbest_dim to get information on clusterer
Args:
texts (list): embeddings sinr
clusterer (HDBSCAN): the clusterer
"""
cluster_dict = {}
for i in range(len(texts)):
label = int(clusterer.labels_[i])+1
if label == 0:
continue

if label not in cluster_dict:
cluster_dict[label] = []
cluster_dict[label].append(texts[i])

for label in cluster_dict:
# compute a mean vector of the cluster to extract information from best dimensions
information_cluster = np.array(cluster_dict[label]).mean(axis=0)

cluster_dict[label] = describe_kbest_dim(model, information_cluster, 5)

return cluster_dict
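
HDBSCAN labels noise points as -1, so the +1 shift maps noise to 0, which the loop skips; each remaining cluster is then summarized by the top dimensions of its mean vector. A minimal illustration of the shift (hypothetical labels):

import numpy as np

labels = np.array([-1, 0, 0, 1])   # hypothetical HDBSCAN output
print((labels + 1).tolist())       # [0, 1, 1, 2]: noise becomes 0, clusters start at 1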




## WS
# Datas
@@ -43,7 +114,6 @@ def center_reduce(matrix):
all_data.append(data)

len_data = len(all_data)

texts=[]
indice_out_cluster = []
for i in range(len_data):
@@ -53,66 +123,71 @@ def center_reduce(matrix):
try:
line = all_data[i]

if "value" in line :
if "value" in line and isinstance(line["value"],str) :
value = line["value"]
if type(value)==list:
texts.append(model.encode(" ".join(value)))
elif type(value)==str:
texts.append(model.encode(value))
embedding, isnt_noise = embedding_sentence(value, sinr_vec)
if isnt_noise:
texts.append(embedding)
else:
indice_out_cluster.append(i)



else:
indice_out_cluster.append(i)

except:
except Exception as e:
sys.stderr.write(str(e))
indice_out_cluster.append(i)

# Reduce DIM from 700+ to 8
embeddings = umap.UMAP(n_neighbors=30,
n_components=8,
min_dist=0.0,
metric='cosine',
init='spectral').fit_transform(center_reduce(texts))

# Without dimension reduction
embeddings = np.array(texts)
embeddings = center_reduce(embeddings)
cosine_dist_matrix = cosine_distances(embeddings, embeddings)
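
Both clusterers below are configured with metric='precomputed', so they consume this square pairwise matrix rather than the raw embeddings. The shape contract, roughly (sizes hypothetical):

from sklearn.metrics.pairwise import cosine_distances
import numpy as np

embeddings = np.random.rand(100, 50)             # 100 documents, 50 dimensions
dist = cosine_distances(embeddings, embeddings)
print(dist.shape)                                # (100, 100): one row and column per document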


## HDBSCAN with hdbscan library
# clusterer = hdbscan.HDBSCAN(algorithm='best',
# prediction_data=True,
# approx_min_span_tree=True,
# gen_min_span_tree=True,
# min_cluster_size=int(max(10,len_data/50)),
# cluster_selection_epsilon = 0.02,
# min_samples=1,
# p=None,
# metric='precomputed',
# cluster_selection_method='eom')
clusterer = hdbscan.HDBSCAN(algorithm='best',
prediction_data=False, # the model is only run once
approx_min_span_tree=False, # approximation in the computation => True speeds it up
gen_min_span_tree=False,
min_cluster_size=int(max(5, len_data/50)),
cluster_selection_epsilon = min(0.05, max(0.001, len_data/1000000)),
min_samples= int(min(10, 1 + len_data/1000)),
p=None,
metric='precomputed',
cluster_selection_method='eom',
n_jobs=-1)
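
The cluster-size and density parameters now scale with the number of documents. As a worked example, with a hypothetical len_data of 2000:

len_data = 2000                                                        # hypothetical corpus size
min_cluster_size = int(max(5, len_data / 50))                          # 40
cluster_selection_epsilon = min(0.05, max(0.001, len_data / 1000000))  # 0.002
min_samples = int(min(10, 1 + len_data / 1000))                        # 3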

# HDBSCAN with scikit-learn
clusterer = HDBSCAN(
algorithm='auto',
metric='precomputed',
min_cluster_size=int(max(10,len_data/100)),
cluster_selection_epsilon = 0.01,
min_samples=1,
cluster_selection_method="eom",
n_jobs=-1)

# clusterer = HDBSCAN(
# algorithm='auto',
# metric='precomputed',
# min_cluster_size=int(max(5,len_data/100)),
# cluster_selection_epsilon = 0,
# min_samples=1,
# cluster_selection_method="eom",
# n_jobs=-1)

clusterer.fit(cosine_dist_matrix)

cluster2info = extract_informations_from_cluster(texts, clusterer)

# extract infos
res = []
indice_in_cluster=0
for i in range(len_data):
if i in indice_out_cluster :
all_data[i]["value"] = {"cluster":0, "weight":"1.0"}
all_data[i]["value"] = {"cluster":0, "keywords":[], "weight":0}
else:
all_data[i]["value"]={"cluster":int(clusterer.labels_[indice_in_cluster]+1), "weight":str(clusterer.probabilities_[indice_in_cluster])}
label_cluster = int(clusterer.labels_[indice_in_cluster]+1)
if label_cluster == 0:
all_data[i]["value"] = {"cluster":0, "keywords":[], "weight":0}

else:
all_data[i]["value"]={
"cluster":label_cluster,
"keywords":cluster2info[label_cluster],
"weight":str(clusterer.probabilities_[indice_in_cluster])}
indice_in_cluster +=1 # Incremented only for rows that were fed to the clusterer; rows filtered out earlier have no entry in labels_/probabilities_
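
Each record therefore carries a cluster id, the interpretable keywords extracted for that cluster, and the membership probability; a clustered row might look like this (values hypothetical):

{"cluster": 3, "keywords": ["oncology", "tumor detection", "screening"], "weight": "0.87"}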


