[text clustering] add interpretation #182

Closed · wants to merge 2 commits
23 changes: 13 additions & 10 deletions services/text-clustering/Dockerfile
@@ -2,7 +2,7 @@
FROM python:3.9-slim-bullseye AS dvc-files
WORKDIR /dvc
RUN apt update && apt -y install git
RUN pip install dvc[webdav]==3.39.0
RUN pip install dvc[webdav]==3.55.2
RUN --mount=type=secret,id=webdav_login \
--mount=type=secret,id=webdav_password \
--mount=type=secret,id=webdav_url \
@@ -12,7 +12,7 @@ RUN --mount=type=secret,id=webdav_login \
dvc remote modify --local webdav-remote user "$(cat /run/secrets/webdav_login)" && \
dvc remote modify --local webdav-remote password "$(cat /run/secrets/webdav_password)"
RUN dvc doctor
COPY ./v1/all-MiniLM-L6-v2.dvc /dvc
COPY ./v1/sinr_vector_scientific_abstract.pk.dvc /dvc
RUN dvc pull -v


@@ -23,17 +23,20 @@ ENV NUMBA_CACHE_DIR=/tmp/numba_cache

USER root
# Install all python dependencies
RUN pip install --no-build-isolation \
--index-url https://download.pytorch.org/whl/cpu \
--extra-index-url https://pypi.org/simple \
sentence-transformers==2.2.2 \
umap-learn==0.5.5 \
scikit-learn==1.4.1.post1 \
prometheus-client==0.19.0
# RUN apt install python3.9-dev \
# build-essential \
# libomp-dev

RUN apt update && apt -y install python3.9-dev build-essential libomp-dev

RUN pip install sinr==1.3.1 \
scikit-learn==1.5.1 \
prometheus-client==0.19.0 \
hdbscan==0.8.38.post1

# Declare files to copy in .dockerignore
COPY --chown=daemon:daemon . /app/public/
RUN mv /app/public/config.json /app && chmod a+w /app/config.json
RUN mkdir /tmp/retrieve

COPY --chown=daemon:daemon --from=dvc-files /dvc/all-MiniLM-L6-v2 /app/public/v1/all-MiniLM-L6-v2
COPY --chown=daemon:daemon --from=dvc-files /dvc/sinr_vector_scientific_abstract.pk /app/public/v1/sinr_vector_scientific_abstract.pk
6 changes: 0 additions & 6 deletions services/text-clustering/v1/all-MiniLM-L6-v2.dvc

This file was deleted.

159 changes: 117 additions & 42 deletions services/text-clustering/v1/clustering.py
@@ -1,20 +1,25 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sinr.graph_embeddings as ge
import unicodedata
import numpy as np
import json
import sys
import umap.umap_ as umap
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_distances
# Two HDBSCAN implementations: the hdbscan library and scikit-learn's
# import hdbscan
from sklearn.cluster import HDBSCAN
import hdbscan

# from prometheus_client import CollectorRegistry, Counter, push_to_gateway
# registry = CollectorRegistry()
# c = Counter('documents', 'Number of documents processed', registry=registry)
# job_name='clustering'

# SINR word-embedding model trained on scientific abstracts
sinr_vec = ge.SINrVectors('v1/sinr_vector_scientific_abstract')
sinr_vec.load()
dim_model = sinr_vec.get_number_of_dimensions()
stopword_list = ["study","abstract","result","prospective","nested"]
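# generic terms dropped from the cluster keyword lists (see custom_flatten)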


def center_reduce(matrix):
"""
@@ -33,7 +38,73 @@ def center_reduce(matrix):

return matrix_center_reduce

model = SentenceTransformer('./v1/all-MiniLM-L6-v2')
# normalize text
def uniformize(input_str):
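"""Strip accents and non-letter characters, lowercase, and collapse whitespace.

e.g. "Étude naïve!" -> "etude naive"
"""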
text = ''.join(
c for c in unicodedata.normalize('NFKD', input_str)
if ( unicodedata.category(c) != 'Mn' and c.isalpha() ) or c == "'" or c == ' '
)
return ' '.join(text.lower().split())


def embedding_sentence(sentence, model, dim_model = dim_model):
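"""Embed a sentence as the mean of the SINR vectors of its words.

Returns (vector, True) if at least one word was found in the model's
vocabulary, and (zero vector, False) otherwise.
"""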
len_sentence = 0
embedding = np.zeros(dim_model)
for word in sentence.split(" "):
try:
embedding += model.get_my_vector(word)
len_sentence +=1
except:
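# word not in the SINR vocabulary (or any other lookup error): skip it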
continue

if len_sentence > 0:
return embedding/len_sentence, True
return embedding, False


def custom_flatten(matrix):
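"""Flatten a list of keyword lists, dropping stopwords and tokens of 2 characters or fewer, and replacing underscores with spaces."""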
return [item.replace("_", " ") for row in matrix for item in row if (item not in stopword_list and len(item)>2)]


def describe_kbest_dim(model, vec, k, max_words=10, threshold=0.21):
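"""Return keywords describing the k highest-valued dimensions of vec.

For each of the k best dimensions (in decreasing order), the stereotype words
whose score exceeds `threshold` are kept, up to `max_words` per dimension.
"""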
best_dims = np.argsort(vec)[-k:]
descriptors = []
for dimension in reversed(best_dims):
descriptors.append(model.get_dimension_stereotypes_idx(dimension,max_words))

res = []
for descriptor in descriptors:
res.append([y for x,y in descriptor.get_interpreters() if x>threshold])
return custom_flatten(res)


def extract_informations_from_cluster(texts, clusterer, model=sinr_vec):
"""Use describe_kbest_dim to get information on clusterer

Args:
texts (list): embeddings sinr
clusterer (HDBSCAN): the clusterer
"""
cluster_dict = {}
for i in range(len(texts)):
label = int(clusterer.labels_[i])+1
if label == 0:
continue

if label not in cluster_dict:
cluster_dict[label] = []
cluster_dict[label].append(texts[i])

for label in cluster_dict:
# compute a mean vector of the cluster to extract information from best dimensions
information_cluster = np.array(cluster_dict[label]).mean(axis=0)

cluster_dict[label] = describe_kbest_dim(model, information_cluster, 5)

return cluster_dict




## WS
# Datas
@@ -43,7 +114,6 @@ def center_reduce(matrix):
all_data.append(data)

len_data = len(all_data)

texts=[]
indice_out_cluster = []
for i in range(len_data):
@@ -53,66 +123,71 @@
try:
line = all_data[i]

if "value" in line :
if "value" in line and isinstance(line["value"],str) :
value = line["value"]
if type(value)==list:
texts.append(model.encode(" ".join(value)))
elif type(value)==str:
texts.append(model.encode(value))
embedding, isnt_noise = embedding_sentence(value, sinr_vec)
if isnt_noise:
texts.append(embedding)
else:
indice_out_cluster.append(i)



else:
indice_out_cluster.append(i)

except:
except Exception as e:
sys.stderr.write(str(e))
indice_out_cluster.append(i)

# Reduce DIM from 700+ to 8
embeddings = umap.UMAP(n_neighbors=30,
n_components=8,
min_dist=0.0,
metric='cosine',
init='spectral').fit_transform(center_reduce(texts))

# Without dimension reduction
embeddings = np.array(texts)
embeddings = center_reduce(embeddings)
cosine_dist_matrix = cosine_distances(embeddings, embeddings)
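# the clusterers below use metric='precomputed', so cosine distances are computed once here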


## HDBSCAN with hdbscan library
# clusterer = hdbscan.HDBSCAN(algorithm='best',
# prediction_data=True,
# approx_min_span_tree=True,
# gen_min_span_tree=True,
# min_cluster_size=int(max(10,len_data/50)),
# cluster_selection_epsilon = 0.02,
# min_samples=1,
# p=None,
# metric='precomputed',
# cluster_selection_method='eom')
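# min_cluster_size, cluster_selection_epsilon and min_samples are derived from len_data so the clustering adapts to the corpus size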
clusterer = hdbscan.HDBSCAN(algorithm='best',
prediction_data=False, # the model is only run once
approx_min_span_tree=False, # approximation of the computation => True speeds it up
gen_min_span_tree=False,
min_cluster_size=int(max(5, len_data/50)),
cluster_selection_epsilon = min(0.05, max(0.001, len_data/1000000)),
min_samples= int(min(10, 1 + len_data/1000)),
p=None,
metric='precomputed',
cluster_selection_method='eom',
n_jobs=-1)

# HDBSCAN with scikit-learn
clusterer = HDBSCAN(
algorithm='auto',
metric='precomputed',
min_cluster_size=int(max(10,len_data/100)),
cluster_selection_epsilon = 0.01,
min_samples=1,
cluster_selection_method="eom",
n_jobs=-1)

# clusterer = HDBSCAN(
# algorithm='auto',
# metric='precomputed',
# min_cluster_size=int(max(5,len_data/100)),
# cluster_selection_epsilon = 0,
# min_samples=1,
# cluster_selection_method="eom",
# n_jobs=-1)

clusterer.fit(cosine_dist_matrix)

cluster2info = extract_informations_from_cluster(texts, clusterer)

# Build the per-document output: cluster label (0 = unclustered), cluster keywords and membership weight
res = []
indice_in_cluster=0
for i in range(len_data):
if i in indice_out_cluster :
all_data[i]["value"] = {"cluster":0, "weight":"1.0"}
all_data[i]["value"] = {"cluster":0, "keywords":[], "weight":0}
else:
all_data[i]["value"]={"cluster":int(clusterer.labels_[indice_in_cluster]+1), "weight":str(clusterer.probabilities_[indice_in_cluster])}
label_cluster = int(clusterer.labels_[indice_in_cluster]+1)
if label_cluster == 0:
all_data[i]["value"] = {"cluster":0, "keywords":[], "weight":0}

else:
all_data[i]["value"]={
"cluster":label_cluster,
"keywords":cluster2info[label_cluster],
"weight":str(clusterer.probabilities_[indice_in_cluster])}
indice_in_cluster += 1 # only incremented for rows that were embedded: out-of-cluster rows are not in the clusterer's labels/probabilities

