From 4ebad8b0bf3f193f03ebf6f0afaf728c0445bb5a Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 22 Apr 2024 11:40:39 +0530 Subject: [PATCH 01/11] cluster with kmeans --- src/monocheck/clustering.py | 7 +++++++ src/monocheck/pipeline.py | 8 +++++--- 2 files changed, 12 insertions(+), 3 deletions(-) create mode 100644 src/monocheck/clustering.py diff --git a/src/monocheck/clustering.py b/src/monocheck/clustering.py new file mode 100644 index 0000000..f90d0a6 --- /dev/null +++ b/src/monocheck/clustering.py @@ -0,0 +1,7 @@ +import numpy as np +from sklearn.cluster import KMeans + +def cluster(features: np.ndarray, no_of_clusters:int= 2): + kmeans = KMeans(n_clusters= no_of_clusters, random_state=22) + kmeans.fit(features) + return kmeans.labels_ \ No newline at end of file diff --git a/src/monocheck/pipeline.py b/src/monocheck/pipeline.py index 9ec6b5d..be88760 100644 --- a/src/monocheck/pipeline.py +++ b/src/monocheck/pipeline.py @@ -8,6 +8,7 @@ from monocheck.prepare import load_image from monocheck.feature_extraction import extract_features from monocheck.dimension_reduction import reduce_dimension +from monocheck.clustering import cluster def pipeline(image_paths:List[Path]): imgs_array = [load_image(image_path).squeeze(0) for image_path in image_paths] @@ -18,12 +19,13 @@ def pipeline(image_paths:List[Path]): imgs_features = extract_features(imgs_array, model) imgs_features = imgs_features.reshape(-1,4096) reduced_imgs_features = reduce_dimension(imgs_features) - return reduced_imgs_features + clustering_labels = cluster(reduced_imgs_features) + return clustering_labels if __name__ == "__main__": imgs_path = [Path("image.jpg"), Path("image2.jpg")] - imgs_feat = pipeline(imgs_path) - print(imgs_feat) + imgs_labels = pipeline(imgs_path) + print(imgs_labels) From 92467f67734b865cfc84febc1a440bb6b0e0e0a8 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 22 Apr 2024 11:41:54 +0530 Subject: [PATCH 02/11] update dependencie --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index a7d2901..c6bcdef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ dependencies = [ "numpy", "pillow", "tensorflow", + "scikit-learn", ] [project.optional-dependencies] From 0618ca59e7127a56a8f608803dcdcb91386502d5 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 22 Apr 2024 11:50:25 +0530 Subject: [PATCH 03/11] group clusters with file names --- src/monocheck/clustering.py | 17 ++++++++++++++++- src/monocheck/pipeline.py | 8 ++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/monocheck/clustering.py b/src/monocheck/clustering.py index f90d0a6..b48d22a 100644 --- a/src/monocheck/clustering.py +++ b/src/monocheck/clustering.py @@ -1,7 +1,22 @@ import numpy as np +from typing import List +from pathlib import Path + from sklearn.cluster import KMeans + def cluster(features: np.ndarray, no_of_clusters:int= 2): kmeans = KMeans(n_clusters= no_of_clusters, random_state=22) kmeans.fit(features) - return kmeans.labels_ \ No newline at end of file + return kmeans.labels_ + +def group_clusters(files_paths: List[Path], cluster_labels:np.ndarray): + groups = {} + for file, cluster in zip(files_paths, cluster_labels): + file = str(file) + if cluster not in groups.keys(): + groups[cluster] = [] + groups[cluster].append(file) + else: + groups[cluster].append(file) + return groups \ No newline at end of file diff --git a/src/monocheck/pipeline.py b/src/monocheck/pipeline.py index be88760..c446ee3 100644 --- a/src/monocheck/pipeline.py +++ 
b/src/monocheck/pipeline.py @@ -8,7 +8,7 @@ from monocheck.prepare import load_image from monocheck.feature_extraction import extract_features from monocheck.dimension_reduction import reduce_dimension -from monocheck.clustering import cluster +from monocheck.clustering import cluster, group_clusters def pipeline(image_paths:List[Path]): imgs_array = [load_image(image_path).squeeze(0) for image_path in image_paths] @@ -16,11 +16,15 @@ def pipeline(image_paths:List[Path]): model = VGG16() model = Model(inputs = model.inputs, outputs = model.layers[-2].output) + imgs_features = extract_features(imgs_array, model) imgs_features = imgs_features.reshape(-1,4096) reduced_imgs_features = reduce_dimension(imgs_features) + """ cluster the image features with kmeans """ clustering_labels = cluster(reduced_imgs_features) - return clustering_labels + """ group images based on labels, with key: label and values: image paths""" + cluster_groups = group_clusters(image_paths, clustering_labels) + return cluster_groups if __name__ == "__main__": imgs_path = [Path("image.jpg"), Path("image2.jpg")] From b0d179ac5658bf17ca17d656887c7d06964ff4ca Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 22 Apr 2024 12:03:01 +0530 Subject: [PATCH 04/11] save clustering res to json --- src/monocheck/clustering.py | 1 + src/monocheck/pipeline.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/monocheck/clustering.py b/src/monocheck/clustering.py index b48d22a..03fa933 100644 --- a/src/monocheck/clustering.py +++ b/src/monocheck/clustering.py @@ -14,6 +14,7 @@ def group_clusters(files_paths: List[Path], cluster_labels:np.ndarray): groups = {} for file, cluster in zip(files_paths, cluster_labels): file = str(file) + cluster = int(cluster) # int32 -> int conversion if cluster not in groups.keys(): groups[cluster] = [] groups[cluster].append(file) diff --git a/src/monocheck/pipeline.py b/src/monocheck/pipeline.py index c446ee3..e6c9a91 100644 --- a/src/monocheck/pipeline.py +++ b/src/monocheck/pipeline.py @@ -1,4 +1,5 @@ import numpy as np +import json from pathlib import Path from typing import List @@ -26,10 +27,16 @@ def pipeline(image_paths:List[Path]): cluster_groups = group_clusters(image_paths, clustering_labels) return cluster_groups + + if __name__ == "__main__": imgs_path = [Path("image.jpg"), Path("image2.jpg")] - imgs_labels = pipeline(imgs_path) - print(imgs_labels) + grouped_clusters = pipeline(imgs_path) + file_name = 'grouped_clusters.json' + with open(file_name, 'w') as file: + json.dump(grouped_clusters, file, indent=4) + + print(f"Data has been successfully saved to {file_name}.") From f119d93b0d4d6a86d14e15982b3d8d675adc3012 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 22 Apr 2024 12:19:56 +0530 Subject: [PATCH 05/11] save clustering res to json --- src/monocheck/pipeline.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/monocheck/pipeline.py b/src/monocheck/pipeline.py index e6c9a91..49f01b2 100644 --- a/src/monocheck/pipeline.py +++ b/src/monocheck/pipeline.py @@ -11,7 +11,7 @@ from monocheck.dimension_reduction import reduce_dimension from monocheck.clustering import cluster, group_clusters -def pipeline(image_paths:List[Path]): +def pipeline(image_paths:List[Path], output_file_path:Path=Path('grouped_clusters.json')): imgs_array = [load_image(image_path).squeeze(0) for image_path in image_paths] imgs_array = np.stack(imgs_array, axis=0) model = VGG16() @@ -32,11 +32,7 @@ def pipeline(image_paths:List[Path]): if __name__ == "__main__": 
imgs_path = [Path("image.jpg"), Path("image2.jpg")] grouped_clusters = pipeline(imgs_path) - file_name = 'grouped_clusters.json' - with open(file_name, 'w') as file: - json.dump(grouped_clusters, file, indent=4) - print(f"Data has been successfully saved to {file_name}.") From e021958a123a3d7ab025ee60d2d3716da08d6ae9 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 22 Apr 2024 14:37:37 +0530 Subject: [PATCH 06/11] save clustering res to json; --- src/monocheck/pipeline.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/monocheck/pipeline.py b/src/monocheck/pipeline.py index 49f01b2..3616e93 100644 --- a/src/monocheck/pipeline.py +++ b/src/monocheck/pipeline.py @@ -1,5 +1,6 @@ import numpy as np import json + from pathlib import Path from typing import List @@ -25,6 +26,10 @@ def pipeline(image_paths:List[Path], output_file_path:Path=Path('grouped_cluster clustering_labels = cluster(reduced_imgs_features) """ group images based on labels, with key: label and values: image paths""" cluster_groups = group_clusters(image_paths, clustering_labels) + + """ save the result """ + with open(output_file_path, 'w') as file: + json.dump(cluster_groups, file, indent=4) return cluster_groups From ec9e04049e8a85b4e133230e05f8059cddc7f178 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 22 Apr 2024 15:21:38 +0530 Subject: [PATCH 07/11] acknowledgment after steps --- src/monocheck/clustering.py | 2 ++ src/monocheck/dimension_reduction.py | 1 + src/monocheck/feature_extraction.py | 1 + 3 files changed, 4 insertions(+) diff --git a/src/monocheck/clustering.py b/src/monocheck/clustering.py index 03fa933..13a4b00 100644 --- a/src/monocheck/clustering.py +++ b/src/monocheck/clustering.py @@ -8,6 +8,8 @@ def cluster(features: np.ndarray, no_of_clusters:int= 2): kmeans = KMeans(n_clusters= no_of_clusters, random_state=22) kmeans.fit(features) + + print("[SUCCESS]: Clustering succesfully done") return kmeans.labels_ def group_clusters(files_paths: List[Path], cluster_labels:np.ndarray): diff --git a/src/monocheck/dimension_reduction.py b/src/monocheck/dimension_reduction.py index e792b8e..d41d51a 100644 --- a/src/monocheck/dimension_reduction.py +++ b/src/monocheck/dimension_reduction.py @@ -7,5 +7,6 @@ def reduce_dimension(images_feature: np.ndarray, components:int = 100): pca.fit(images_feature) reduced_images_feature = pca.transform(images_feature) + print("[SUCCESS]: Image features dimensions reduction successfully done.") return reduced_images_feature diff --git a/src/monocheck/feature_extraction.py b/src/monocheck/feature_extraction.py index daf9070..05e1b98 100644 --- a/src/monocheck/feature_extraction.py +++ b/src/monocheck/feature_extraction.py @@ -6,6 +6,7 @@ def extract_features(images_input: np.ndarray, model:VGG16): images_features = model.predict(images_input) + print("[SUCCESS]: Image features extraction done.") return images_features From a2b1e194e47070ae30c784afa6c347eaec8a9636 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 22 Apr 2024 15:52:07 +0530 Subject: [PATCH 08/11] do features extraction in batches of 1000 --- src/monocheck/feature_extraction.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/monocheck/feature_extraction.py b/src/monocheck/feature_extraction.py index 05e1b98..dfb8f9c 100644 --- a/src/monocheck/feature_extraction.py +++ b/src/monocheck/feature_extraction.py @@ -4,10 +4,22 @@ def extract_features(images_input: np.ndarray, model:VGG16): - images_features = model.predict(images_input) + """ total number of images""" + 
num_images = images_input.shape[0] + """ batch size """ + batch_size = 1000 + all_features = [] + """ predict in batches """ + for start in range(0, num_images, batch_size): + end = min(start + batch_size, num_images) # Ensure the last batch is handled properly + batch_features = model.predict(images_input[start:end]) + all_features.append(batch_features) + print(f"[{end}/{num_images}] image features extraction done.") + + all_features = np.vstack(all_features) + print("[SUCCESS]: Image features extraction done.") - return images_features - + return all_features From 9909655826d8d092a0f9da58c16b4ec5a29854c7 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 22 Apr 2024 16:06:01 +0530 Subject: [PATCH 09/11] store image features as pickle --- .gitignore | 4 ++++ src/monocheck/pipeline.py | 22 +++++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index b6e4761..e82e609 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +ocr_output/ +array.pkl +grouped_clusters.json + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/src/monocheck/pipeline.py b/src/monocheck/pipeline.py index 3616e93..aec0abc 100644 --- a/src/monocheck/pipeline.py +++ b/src/monocheck/pipeline.py @@ -1,6 +1,6 @@ import numpy as np import json - +import pickle from pathlib import Path from typing import List @@ -12,15 +12,26 @@ from monocheck.dimension_reduction import reduce_dimension from monocheck.clustering import cluster, group_clusters + +IMAGE_FEATURES_PICKLE = Path('array.pkl') + def pipeline(image_paths:List[Path], output_file_path:Path=Path('grouped_clusters.json')): imgs_array = [load_image(image_path).squeeze(0) for image_path in image_paths] imgs_array = np.stack(imgs_array, axis=0) model = VGG16() model = Model(inputs = model.inputs, outputs = model.layers[-2].output) - - imgs_features = extract_features(imgs_array, model) - imgs_features = imgs_features.reshape(-1,4096) + """ load image features pickle if exists """ + if not IMAGE_FEATURES_PICKLE.exists(): + imgs_features = extract_features(imgs_array, model) + imgs_features = imgs_features.reshape(-1,4096) + """ save the image features as pickle file """ + with open(IMAGE_FEATURES_PICKLE, 'wb') as file: + pickle.dump(imgs_features, file) + else: + with open(IMAGE_FEATURES_PICKLE, 'rb') as file: + imgs_features = pickle.load(file) + reduced_imgs_features = reduce_dimension(imgs_features) """ cluster the image features with kmeans """ clustering_labels = cluster(reduced_imgs_features) @@ -35,7 +46,8 @@ def pipeline(image_paths:List[Path], output_file_path:Path=Path('grouped_cluster if __name__ == "__main__": - imgs_path = [Path("image.jpg"), Path("image2.jpg")] + # imgs_path = [Path("image.jpg"), Path("image2.jpg")] + imgs_path = list(Path("ocr_output").rglob("*.jpg")) grouped_clusters = pipeline(imgs_path) From dcc10dbef046aaaf559649f7eb0c95f2bd46789b Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 22 Apr 2024 16:24:34 +0530 Subject: [PATCH 10/11] view clusters --- .gitignore | 1 + src/monocheck/clustering.py | 34 +++++++++++++++++++++++++++++++++- src/monocheck/pipeline.py | 6 ++---- 3 files changed, 36 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index e82e609..19b2b8d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ ocr_output/ array.pkl grouped_clusters.json +clustering_output/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/src/monocheck/clustering.py b/src/monocheck/clustering.py index 13a4b00..679303e 100644 --- 
a/src/monocheck/clustering.py +++ b/src/monocheck/clustering.py @@ -1,8 +1,11 @@ import numpy as np +import matplotlib.pyplot as plt + from typing import List from pathlib import Path from sklearn.cluster import KMeans +from keras.preprocessing.image import load_img def cluster(features: np.ndarray, no_of_clusters:int= 2): @@ -22,4 +25,33 @@ def group_clusters(files_paths: List[Path], cluster_labels:np.ndarray): groups[cluster].append(file) else: groups[cluster].append(file) - return groups \ No newline at end of file + return groups + + +def view_clusters(grouped_clusters, output_dir='clustering_output'): + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + clusters = list(grouped_clusters.keys()) + for cluster in clusters: + plt.figure(figsize=(30,30)) + + """ file names """ + files = grouped_clusters[cluster] + """ Only allow up to 100 images to be shown at a time """ + if len(files) > 100: + files = files[:100] + + """ Plot each image in the cluster """ + for index, file in enumerate(files): + plt.subplot(10, 10, index + 1) + img = load_img(file) + img = np.array(img) + plt.imshow(img) + plt.axis('off') + + """ save the image""" + plt.savefig(f'{output_dir}/cluster_{cluster}.png') + plt.close() + + print(f"Cluster {cluster} saved to {output_dir}/cluster_{cluster}.jpg") + diff --git a/src/monocheck/pipeline.py b/src/monocheck/pipeline.py index aec0abc..f537d3b 100644 --- a/src/monocheck/pipeline.py +++ b/src/monocheck/pipeline.py @@ -10,8 +10,7 @@ from monocheck.prepare import load_image from monocheck.feature_extraction import extract_features from monocheck.dimension_reduction import reduce_dimension -from monocheck.clustering import cluster, group_clusters - +from monocheck.clustering import cluster, group_clusters, view_clusters IMAGE_FEATURES_PICKLE = Path('array.pkl') @@ -46,10 +45,9 @@ def pipeline(image_paths:List[Path], output_file_path:Path=Path('grouped_cluster if __name__ == "__main__": - # imgs_path = [Path("image.jpg"), Path("image2.jpg")] imgs_path = list(Path("ocr_output").rglob("*.jpg")) grouped_clusters = pipeline(imgs_path) - + view_clusters(grouped_clusters) From 6e2727488ba88ce574fe3842ba192a5c6507db16 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 22 Apr 2024 16:56:36 +0530 Subject: [PATCH 11/11] view clusters in pdf --- .gitignore | 1 + src/monocheck/clustering.py | 62 +++++++++++++++++++++---------------- 2 files changed, 37 insertions(+), 26 deletions(-) diff --git a/.gitignore b/.gitignore index 19b2b8d..cfb2f71 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ ocr_output/ array.pkl grouped_clusters.json clustering_output/ +output_images/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/src/monocheck/clustering.py b/src/monocheck/clustering.py index 679303e..c5a1bda 100644 --- a/src/monocheck/clustering.py +++ b/src/monocheck/clustering.py @@ -1,13 +1,15 @@ import numpy as np -import matplotlib.pyplot as plt from typing import List from pathlib import Path +from PIL import Image from sklearn.cluster import KMeans -from keras.preprocessing.image import load_img +from reportlab.lib.pagesizes import letter +from reportlab.pdfgen import canvas + def cluster(features: np.ndarray, no_of_clusters:int= 2): kmeans = KMeans(n_clusters= no_of_clusters, random_state=22) kmeans.fit(features) @@ -27,31 +29,39 @@ def group_clusters(files_paths: List[Path], cluster_labels:np.ndarray): groups[cluster].append(file) return groups - -def view_clusters(grouped_clusters, output_dir='clustering_output'): - output_dir = 
Path(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) +def view_clusters(grouped_clusters, save_path='output_images'): + """ save each clustering in different pdfs """ clusters = list(grouped_clusters.keys()) for cluster in clusters: - plt.figure(figsize=(30,30)) - - """ file names """ files = grouped_clusters[cluster] - """ Only allow up to 100 images to be shown at a time """ - if len(files) > 100: - files = files[:100] - - """ Plot each image in the cluster """ + images_per_page = 10 + page_width, page_height = letter # Default letter size + + """ Create a PDF canvas """ + c = canvas.Canvas(f'{save_path}/cluster_{cluster}.pdf', pagesize=letter) + + y_position = page_height - 72 # Initial top margin offset for index, file in enumerate(files): - plt.subplot(10, 10, index + 1) - img = load_img(file) - img = np.array(img) - plt.imshow(img) - plt.axis('off') - - """ save the image""" - plt.savefig(f'{output_dir}/cluster_{cluster}.png') - plt.close() - - print(f"Cluster {cluster} saved to {output_dir}/cluster_{cluster}.jpg") - + if index % images_per_page == 0 and index != 0: + c.showPage() # Add a new page if the current one is filled + y_position = page_height - 72 # Reset position at the top of a new page + + """ Open image and get its original size """ + img = Image.open(file) + img_width, img_height = img.size + + """ Check if the image width exceeds the page width """ + if img_width > page_width - 144: + scale_factor = (page_width - 144) / img_width + img_width *= scale_factor + img_height *= scale_factor + + """ Draw image on the canvas at original size """ + c.drawImage(file, 72, y_position - img_height, width=img_width, height=img_height) + + """ Update y_position for the next image """ + y_position -= (img_height + 10) # Move down by the image height plus some margin + + """save pdf""" + c.save() + print(f"Cluster {cluster} images saved to {save_path}/cluster_{cluster}.pdf")
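
A follow-up note on PATCH 11 (not part of the series): the PDF version of view_clusters writes to save_path='output_images' but, unlike the PATCH 10 version it replaces, never creates that directory, so c.save() will fail on a fresh checkout when it tries to write the first PDF. A minimal sketch of a guard that could run at the top of the function, before canvas.Canvas(...) is built; the helper name ensure_output_dir is illustrative, not something the patches define:

    from pathlib import Path

    def ensure_output_dir(save_path: str = 'output_images') -> Path:
        """Create the PDF output directory if it does not already exist and return it."""
        out_dir = Path(save_path)
        out_dir.mkdir(parents=True, exist_ok=True)
        return out_dir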
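
Similarly, on PATCH 09: the pipeline reuses array.pkl whenever it exists, even if the set of images under ocr_output/ has changed since the features were extracted, so a stale cache silently feeds the clustering. One way to key the cache to the input list is sketched below; the helper names and the hashing scheme are assumptions for illustration, not part of the series:

    import hashlib
    import pickle
    from pathlib import Path
    from typing import List

    def features_cache_path(image_paths: List[Path], cache_dir: Path = Path('.')) -> Path:
        """Derive the pickle name from the sorted image paths, so a changed
        image set gets a fresh cache file instead of reusing a stale one."""
        joined = '\n'.join(sorted(str(p) for p in image_paths))
        digest = hashlib.sha1(joined.encode('utf-8')).hexdigest()[:12]
        return cache_dir / f'features_{digest}.pkl'

    def load_cached_features(cache_file: Path):
        """Return the cached feature array, or None when no cache exists yet."""
        if cache_file.exists():
            with open(cache_file, 'rb') as fh:
                return pickle.load(fh)
        return None

    def save_cached_features(cache_file: Path, features) -> None:
        """Persist the extracted features, mirroring what the pipeline does with array.pkl."""
        with open(cache_file, 'wb') as fh:
            pickle.dump(features, fh)

With something like this, pipeline() could call features_cache_path(image_paths) in place of the fixed IMAGE_FEATURES_PICKLE constant.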
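
Finally, cluster() (PATCH 01) defaults to no_of_clusters=2, so every run splits the images into exactly two groups. If the number of groups is not known up front, one common extension is to scan a small range of k and keep the value with the best silhouette score; the sketch below only uses scikit-learn, which PATCH 02 already adds as a dependency, but the function itself is an assumption, not something the series implements:

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    def pick_cluster_count(features: np.ndarray, max_clusters: int = 10, random_state: int = 22) -> int:
        """Return the k in [2, max_clusters] with the highest silhouette score."""
        best_k, best_score = 2, -1.0
        upper = min(max_clusters, len(features) - 1)  # silhouette needs 2 <= k <= n_samples - 1
        for k in range(2, upper + 1):
            labels = KMeans(n_clusters=k, random_state=random_state, n_init=10).fit_predict(features)
            score = silhouette_score(features, labels)
            if score > best_score:
                best_k, best_score = k, score
        return best_k

The returned value could then be passed on, e.g. cluster(reduced_imgs_features, no_of_clusters=pick_cluster_count(reduced_imgs_features)).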