feat(text-clustering): add interpretability
leogail committed Oct 8, 2024
1 parent ba81856 commit 171f8c5
Showing 3 changed files with 238 additions and 80 deletions.
23 changes: 13 additions & 10 deletions services/text-clustering/Dockerfile
@@ -2,7 +2,7 @@
FROM python:3.9-slim-bullseye AS dvc-files
WORKDIR /dvc
RUN apt update && apt -y install git
RUN pip install dvc[webdav]==3.39.0
RUN pip install dvc[webdav]==3.55.2
RUN --mount=type=secret,id=webdav_login \
--mount=type=secret,id=webdav_password \
--mount=type=secret,id=webdav_url \
@@ -12,7 +12,7 @@ RUN --mount=type=secret,id=webdav_login \
dvc remote modify --local webdav-remote user "$(cat /run/secrets/webdav_login)" && \
dvc remote modify --local webdav-remote password "$(cat /run/secrets/webdav_password)"
RUN dvc doctor
COPY ./v1/all-MiniLM-L6-v2.dvc /dvc
COPY ./v1/sinr_vector_scientific_abstract.pk.dvc /dvc
RUN dvc pull -v


@@ -23,17 +23,20 @@ ENV NUMBA_CACHE_DIR=/tmp/numba_cache

USER root
# Install all python dependencies
RUN pip install --no-build-isolation \
--index-url https://download.pytorch.org/whl/cpu \
--extra-index-url https://pypi.org/simple \
sentence-transformers==2.2.2 \
umap-learn==0.5.5 \
scikit-learn==1.4.1.post1 \
prometheus-client==0.19.0
# RUN apt install python3.9-dev \
# build-essential \
# libomp-dev

RUN apt update && apt -y install python3.9-dev build-essential libomp-dev

RUN pip install sinr==1.3.1 \
scikit-learn==1.5.1 \
prometheus-client==0.19.0 \
hdbscan==0.8.38.post1

# Declare files to copy in .dockerignore
COPY --chown=daemon:daemon . /app/public/
RUN mv /app/public/config.json /app && chmod a+w /app/config.json
RUN mkdir /tmp/retrieve

COPY --chown=daemon:daemon --from=dvc-files /dvc/all-MiniLM-L6-v2 /app/public/v1/all-MiniLM-L6-v2
COPY --chown=daemon:daemon --from=dvc-files /dvc/sinr_vector_scientific_abstract.pk /app/public/v1/sinr_vector_scientific_abstract.pk
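
The runtime stage copies the DVC-pulled SINr vectors next to the service code. As a minimal sanity check (the path and working directory are assumptions, mirroring the load call in v1/clustering.py), the artifact can be loaded like this:

import sinr.graph_embeddings as ge

# Assumes the working directory is /app/public, where the .pk file was copied.
sinr_vec = ge.SINrVectors('v1/sinr_vector_scientific_abstract')
sinr_vec.load()
print(sinr_vec.get_number_of_dimensions())  # dimensionality of the interpretable space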
159 changes: 117 additions & 42 deletions services/text-clustering/v1/clustering.py
@@ -1,20 +1,25 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sinr.graph_embeddings as ge
import unicodedata
import numpy as np
import json
import sys
import umap.umap_ as umap
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_distances
# Two hdbscan algos: the hdbscan library and the sklearn implementation
# import hdbscan
from sklearn.cluster import HDBSCAN
import hdbscan

# from prometheus_client import CollectorRegistry, Counter, push_to_gateway
# registry = CollectorRegistry()
# c = Counter('documents', 'Number of documents processed', registry=registry)
# job_name='clustering'

#model
sinr_vec = ge.SINrVectors('v1/sinr_vector_scientific_abstract')
sinr_vec.load()
dim_model = sinr_vec.get_number_of_dimensions()
stopword_list = ["study","abstract","result","prospective","nested"]


def center_reduce(matrix):
"""
@@ -33,7 +38,73 @@ def center_reduce(matrix):

return matrix_center_reduce
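
The body of center_reduce is collapsed in this hunk; given the StandardScaler import above, a minimal equivalent sketch (an assumption, not the committed implementation) is:

from sklearn.preprocessing import StandardScaler
import numpy as np

def center_reduce_sketch(matrix):
    # Center each feature to zero mean and scale it to unit variance.
    return StandardScaler().fit_transform(np.asarray(matrix))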

model = SentenceTransformer('./v1/all-MiniLM-L6-v2')
#normalize text
def uniformize(input_str):
text = ''.join(
c for c in unicodedata.normalize('NFKD', input_str)
if ( unicodedata.category(c) != 'Mn' and c.isalpha() ) or c == "'" or c == ' '
)
return ' '.join(text.lower().split())
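
For illustration, uniformize strips diacritics, punctuation and digits while keeping letters, apostrophes and spaces, then lowercases and collapses whitespace:

# Assuming the uniformize defined above is in scope:
print(uniformize("Étude : l'effet du café 2024 !"))  # -> "etude l'effet du cafe"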


def embedding_sentence(sentence, model, dim_model = dim_model):
len_sentence = 0
embedding = np.zeros(dim_model)
for word in sentence.split(" "):
try:
embedding += model.get_my_vector(word)
len_sentence +=1
except:
continue

if len_sentence > 0:
return embedding/len_sentence, True
return embedding, False
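
embedding_sentence averages the SINr vectors of the words it recognizes and reports whether at least one word was found; out-of-vocabulary words are skipped. A toy illustration with a hypothetical two-word vocabulary (assumes the embedding_sentence defined above is in scope):

import numpy as np

class ToyModel:
    # Hypothetical stand-in for a SINr model: only two words have vectors.
    vectors = {"cancer": np.array([1.0, 0.0]), "detection": np.array([0.0, 1.0])}
    def get_my_vector(self, word):
        return self.vectors[word]  # raises KeyError for unknown words

emb, ok = embedding_sentence("early cancer detection", ToyModel(), dim_model=2)
print(emb, ok)  # [0.5 0.5] True: mean of the two known vectors, "early" is skipped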


def custom_flatten(matrix):
return [item.replace("_", " ") for row in matrix for item in row if (item not in stopword_list and len(item)>2)]


def describe_kbest_dim(model, vec, k, max_words=10, thereshold=0.21):
best_dims = np.argsort(vec)[-k:]
descriptors = []
for dimension in reversed(best_dims):
descriptors.append(model.get_dimension_stereotypes_idx(dimension,max_words))

res = []
for descriptor in descriptors:
res.append([y for x,y in descriptor.get_interpreters() if x>thereshold])
return custom_flatten(res)
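
describe_kbest_dim uses np.argsort to pick the k most activated dimensions, then keeps only the stereotype words whose interpretability score exceeds the threshold. The dimension-selection step in isolation:

import numpy as np

vec = np.array([0.1, 0.9, 0.4, 0.7])
best_dims = np.argsort(vec)[-2:]   # indices of the 2 largest values: array([3, 1])
for dim in reversed(best_dims):
    print(int(dim))                # 1 then 3: strongest dimension first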


def extract_informations_from_cluster(texts, clusterer, model=sinr_vec):
"""Use describe_kbest_dim to get information on clusterer
Args:
texts (list): embeddings sinr
clusterer (HDBSCAN): the clusterer
"""
cluster_dict = {}
for i in range(len(texts)):
label = int(clusterer.labels_[i])+1
if label == 0:
continue

if label not in cluster_dict:
cluster_dict[label] = []
cluster_dict[label].append(texts[i])

for label in cluster_dict:
# compute a mean vector of the cluster to extract information from best dimensions
information_cluster = np.array(cluster_dict[label]).mean(axis=0)

cluster_dict[label] = describe_kbest_dim(model, information_cluster, 5)

return cluster_dict
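
HDBSCAN labels noise points as -1, so the +1 shift maps noise to 0, which the loop skips; each remaining cluster is then summarized by the top dimensions of its mean vector. A minimal illustration of the shift (hypothetical labels):

import numpy as np

labels = np.array([-1, 0, 0, 1])   # hypothetical HDBSCAN output
print((labels + 1).tolist())       # [0, 1, 1, 2]: noise becomes 0, clusters start at 1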




## WS
# Datas
@@ -43,7 +114,6 @@ def center_reduce(matrix):
all_data.append(data)

len_data = len(all_data)

texts=[]
indice_out_cluster = []
for i in range(len_data):
@@ -53,66 +123,71 @@ def center_reduce(matrix):
try:
line = all_data[i]

if "value" in line :
if "value" in line and isinstance(line["value"],str) :
value = line["value"]
if type(value)==list:
texts.append(model.encode(" ".join(value)))
elif type(value)==str:
texts.append(model.encode(value))
embedding, isnt_noise = embedding_sentence(value, sinr_vec)
if isnt_noise:
texts.append(embedding)
else:
indice_out_cluster.append(i)



else:
indice_out_cluster.append(i)

except:
except Exception as e:
sys.stderr.write(str(e))
indice_out_cluster.append(i)

# Reduce DIM from 700+ to 8
embeddings = umap.UMAP(n_neighbors=30,
n_components=8,
min_dist=0.0,
metric='cosine',
init='spectral').fit_transform(center_reduce(texts))

# Without dimension reduction
embeddings = np.array(texts)
embeddings = center_reduce(embeddings)
cosine_dist_matrix = cosine_distances(embeddings, embeddings)
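
Both clusterers below are configured with metric='precomputed', so they consume this square pairwise matrix rather than the raw embeddings. The shape contract, roughly (sizes hypothetical):

from sklearn.metrics.pairwise import cosine_distances
import numpy as np

embeddings = np.random.rand(100, 50)             # 100 documents, 50 dimensions
dist = cosine_distances(embeddings, embeddings)
print(dist.shape)                                # (100, 100): one row and column per document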


## HDBSCAN with hdbscan library
# clusterer = hdbscan.HDBSCAN(algorithm='best',
# prediction_data=True,
# approx_min_span_tree=True,
# gen_min_span_tree=True,
# min_cluster_size=int(max(10,len_data/50)),
# cluster_selection_epsilon = 0.02,
# min_samples=1,
# p=None,
# metric='precomputed',
# cluster_selection_method='eom')
clusterer = hdbscan.HDBSCAN(algorithm='best',
prediction_data=False, # the model is only run once
approx_min_span_tree=False, # approximation in the computation => True speeds it up
gen_min_span_tree=False,
min_cluster_size=int(max(5, len_data/50)),
cluster_selection_epsilon = min(0.05, max(0.001, len_data/1000000)),
min_samples= int(min(10, 1 + len_data/1000)),
p=None,
metric='precomputed',
cluster_selection_method='eom',
n_jobs=-1)
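
The cluster-size and density parameters now scale with the number of documents. As a worked example, with a hypothetical len_data of 2000:

len_data = 2000                                                        # hypothetical corpus size
min_cluster_size = int(max(5, len_data / 50))                          # 40
cluster_selection_epsilon = min(0.05, max(0.001, len_data / 1000000))  # 0.002
min_samples = int(min(10, 1 + len_data / 1000))                        # 3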

# HDBSCAN with scikit-learn
clusterer = HDBSCAN(
algorithm='auto',
metric='precomputed',
min_cluster_size=int(max(10,len_data/100)),
cluster_selection_epsilon = 0.01,
min_samples=1,
cluster_selection_method="eom",
n_jobs=-1)

# clusterer = HDBSCAN(
# algorithm='auto',
# metric='precomputed',
# min_cluster_size=int(max(5,len_data/100)),
# cluster_selection_epsilon = 0,
# min_samples=1,
# cluster_selection_method="eom",
# n_jobs=-1)

clusterer.fit(cosine_dist_matrix)

cluster2info = extract_informations_from_cluster(texts, clusterer)

# extract infos
res = []
indice_in_cluster=0
for i in range(len_data):
if i in indice_out_cluster :
all_data[i]["value"] = {"cluster":0, "weight":"1.0"}
all_data[i]["value"] = {"cluster":0, "keywords":[], "weight":0}
else:
all_data[i]["value"]={"cluster":int(clusterer.labels_[indice_in_cluster]+1), "weight":str(clusterer.probabilities_[indice_in_cluster])}
label_cluster = int(clusterer.labels_[indice_in_cluster]+1)
if label_cluster == 0:
all_data[i]["value"] = {"cluster":0, "keywords":[], "weight":0}

else:
all_data[i]["value"]={
"cluster":label_cluster,
"keywords":cluster2info[label_cluster],
"weight":str(clusterer.probabilities_[indice_in_cluster])}
indice_in_cluster +=1 # Incremented only for rows that were fed to the clusterer; rows filtered out earlier have no entry in labels_/probabilities_
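
Each record therefore carries a cluster id, the interpretable keywords extracted for that cluster, and the membership probability; a clustered row might look like this (values hypothetical):

{"cluster": 3, "keywords": ["oncology", "tumor detection", "screening"], "weight": "0.87"}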


