From 4ebad8b0bf3f193f03ebf6f0afaf728c0445bb5a Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 22 Apr 2024 11:40:39 +0530 Subject: [PATCH 01/11] cluster with kmeans --- src/monocheck/clustering.py | 7 +++++++ src/monocheck/pipeline.py | 8 +++++--- 2 files changed, 12 insertions(+), 3 deletions(-) create mode 100644 src/monocheck/clustering.py diff --git a/src/monocheck/clustering.py b/src/monocheck/clustering.py new file mode 100644 index 0000000..f90d0a6 --- /dev/null +++ b/src/monocheck/clustering.py @@ -0,0 +1,7 @@ +import numpy as np +from sklearn.cluster import KMeans + +def cluster(features: np.ndarray, no_of_clusters:int= 2): + kmeans = KMeans(n_clusters= no_of_clusters, random_state=22) + kmeans.fit(features) + return kmeans.labels_ \ No newline at end of file diff --git a/src/monocheck/pipeline.py b/src/monocheck/pipeline.py index 9ec6b5d..be88760 100644 --- a/src/monocheck/pipeline.py +++ b/src/monocheck/pipeline.py @@ -8,6 +8,7 @@ from monocheck.prepare import load_image from monocheck.feature_extraction import extract_features from monocheck.dimension_reduction import reduce_dimension +from monocheck.clustering import cluster def pipeline(image_paths:List[Path]): imgs_array = [load_image(image_path).squeeze(0) for image_path in image_paths] @@ -18,12 +19,13 @@ def pipeline(image_paths:List[Path]): imgs_features = extract_features(imgs_array, model) imgs_features = imgs_features.reshape(-1,4096) reduced_imgs_features = reduce_dimension(imgs_features) - return reduced_imgs_features + clustering_labels = cluster(reduced_imgs_features) + return clustering_labels if __name__ == "__main__": imgs_path = [Path("image.jpg"), Path("image2.jpg")] - imgs_feat = pipeline(imgs_path) - print(imgs_feat) + imgs_labels = pipeline(imgs_path) + print(imgs_labels) From 92467f67734b865cfc84febc1a440bb6b0e0e0a8 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 22 Apr 2024 11:41:54 +0530 Subject: [PATCH 02/11] update dependencie --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index a7d2901..c6bcdef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ dependencies = [ "numpy", "pillow", "tensorflow", + "scikit-learn", ] [project.optional-dependencies] From 0618ca59e7127a56a8f608803dcdcb91386502d5 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 22 Apr 2024 11:50:25 +0530 Subject: [PATCH 03/11] group clusters with file names --- src/monocheck/clustering.py | 17 ++++++++++++++++- src/monocheck/pipeline.py | 8 ++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/monocheck/clustering.py b/src/monocheck/clustering.py index f90d0a6..b48d22a 100644 --- a/src/monocheck/clustering.py +++ b/src/monocheck/clustering.py @@ -1,7 +1,22 @@ import numpy as np +from typing import List +from pathlib import Path + from sklearn.cluster import KMeans + def cluster(features: np.ndarray, no_of_clusters:int= 2): kmeans = KMeans(n_clusters= no_of_clusters, random_state=22) kmeans.fit(features) - return kmeans.labels_ \ No newline at end of file + return kmeans.labels_ + +def group_clusters(files_paths: List[Path], cluster_labels:np.ndarray): + groups = {} + for file, cluster in zip(files_paths, cluster_labels): + file = str(file) + if cluster not in groups.keys(): + groups[cluster] = [] + groups[cluster].append(file) + else: + groups[cluster].append(file) + return groups \ No newline at end of file diff --git a/src/monocheck/pipeline.py b/src/monocheck/pipeline.py index be88760..c446ee3 100644 --- a/src/monocheck/pipeline.py +++ 
b/src/monocheck/pipeline.py @@ -8,7 +8,7 @@ from monocheck.prepare import load_image from monocheck.feature_extraction import extract_features from monocheck.dimension_reduction import reduce_dimension -from monocheck.clustering import cluster +from monocheck.clustering import cluster, group_clusters def pipeline(image_paths:List[Path]): imgs_array = [load_image(image_path).squeeze(0) for image_path in image_paths] @@ -16,11 +16,15 @@ def pipeline(image_paths:List[Path]): model = VGG16() model = Model(inputs = model.inputs, outputs = model.layers[-2].output) + imgs_features = extract_features(imgs_array, model) imgs_features = imgs_features.reshape(-1,4096) reduced_imgs_features = reduce_dimension(imgs_features) + """ cluster the image features with kmeans """ clustering_labels = cluster(reduced_imgs_features) - return clustering_labels + """ group images based on labels, with key: label and values: image paths""" + cluster_groups = group_clusters(image_paths, clustering_labels) + return cluster_groups if __name__ == "__main__": imgs_path = [Path("image.jpg"), Path("image2.jpg")] From b0d179ac5658bf17ca17d656887c7d06964ff4ca Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 22 Apr 2024 12:03:01 +0530 Subject: [PATCH 04/11] save clustering res to json --- src/monocheck/clustering.py | 1 + src/monocheck/pipeline.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/monocheck/clustering.py b/src/monocheck/clustering.py index b48d22a..03fa933 100644 --- a/src/monocheck/clustering.py +++ b/src/monocheck/clustering.py @@ -14,6 +14,7 @@ def group_clusters(files_paths: List[Path], cluster_labels:np.ndarray): groups = {} for file, cluster in zip(files_paths, cluster_labels): file = str(file) + cluster = int(cluster) # int32 -> int conversion if cluster not in groups.keys(): groups[cluster] = [] groups[cluster].append(file) diff --git a/src/monocheck/pipeline.py b/src/monocheck/pipeline.py index c446ee3..e6c9a91 100644 --- a/src/monocheck/pipeline.py +++ b/src/monocheck/pipeline.py @@ -1,4 +1,5 @@ import numpy as np +import json from pathlib import Path from typing import List @@ -26,10 +27,16 @@ def pipeline(image_paths:List[Path]): cluster_groups = group_clusters(image_paths, clustering_labels) return cluster_groups + + if __name__ == "__main__": imgs_path = [Path("image.jpg"), Path("image2.jpg")] - imgs_labels = pipeline(imgs_path) - print(imgs_labels) + grouped_clusters = pipeline(imgs_path) + file_name = 'grouped_clusters.json' + with open(file_name, 'w') as file: + json.dump(grouped_clusters, file, indent=4) + + print(f"Data has been successfully saved to {file_name}.") From f119d93b0d4d6a86d14e15982b3d8d675adc3012 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 22 Apr 2024 12:19:56 +0530 Subject: [PATCH 05/11] save clustering res to json --- src/monocheck/pipeline.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/monocheck/pipeline.py b/src/monocheck/pipeline.py index e6c9a91..49f01b2 100644 --- a/src/monocheck/pipeline.py +++ b/src/monocheck/pipeline.py @@ -11,7 +11,7 @@ from monocheck.dimension_reduction import reduce_dimension from monocheck.clustering import cluster, group_clusters -def pipeline(image_paths:List[Path]): +def pipeline(image_paths:List[Path], output_file_path:Path=Path('grouped_clusters.json')): imgs_array = [load_image(image_path).squeeze(0) for image_path in image_paths] imgs_array = np.stack(imgs_array, axis=0) model = VGG16() @@ -32,11 +32,7 @@ def pipeline(image_paths:List[Path]): if __name__ == "__main__": 
imgs_path = [Path("image.jpg"), Path("image2.jpg")] grouped_clusters = pipeline(imgs_path) - file_name = 'grouped_clusters.json' - with open(file_name, 'w') as file: - json.dump(grouped_clusters, file, indent=4) - print(f"Data has been successfully saved to {file_name}.") From e021958a123a3d7ab025ee60d2d3716da08d6ae9 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 22 Apr 2024 14:37:37 +0530 Subject: [PATCH 06/11] save clustering res to json; --- src/monocheck/pipeline.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/monocheck/pipeline.py b/src/monocheck/pipeline.py index 49f01b2..3616e93 100644 --- a/src/monocheck/pipeline.py +++ b/src/monocheck/pipeline.py @@ -1,5 +1,6 @@ import numpy as np import json + from pathlib import Path from typing import List @@ -25,6 +26,10 @@ def pipeline(image_paths:List[Path], output_file_path:Path=Path('grouped_cluster clustering_labels = cluster(reduced_imgs_features) """ group images based on labels, with key: label and values: image paths""" cluster_groups = group_clusters(image_paths, clustering_labels) + + """ save the result """ + with open(output_file_path, 'w') as file: + json.dump(cluster_groups, file, indent=4) return cluster_groups From ec9e04049e8a85b4e133230e05f8059cddc7f178 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 22 Apr 2024 15:21:38 +0530 Subject: [PATCH 07/11] acknowledgment after steps --- src/monocheck/clustering.py | 2 ++ src/monocheck/dimension_reduction.py | 1 + src/monocheck/feature_extraction.py | 1 + 3 files changed, 4 insertions(+) diff --git a/src/monocheck/clustering.py b/src/monocheck/clustering.py index 03fa933..13a4b00 100644 --- a/src/monocheck/clustering.py +++ b/src/monocheck/clustering.py @@ -8,6 +8,8 @@ def cluster(features: np.ndarray, no_of_clusters:int= 2): kmeans = KMeans(n_clusters= no_of_clusters, random_state=22) kmeans.fit(features) + + print("[SUCCESS]: Clustering succesfully done") return kmeans.labels_ def group_clusters(files_paths: List[Path], cluster_labels:np.ndarray): diff --git a/src/monocheck/dimension_reduction.py b/src/monocheck/dimension_reduction.py index e792b8e..d41d51a 100644 --- a/src/monocheck/dimension_reduction.py +++ b/src/monocheck/dimension_reduction.py @@ -7,5 +7,6 @@ def reduce_dimension(images_feature: np.ndarray, components:int = 100): pca.fit(images_feature) reduced_images_feature = pca.transform(images_feature) + print("[SUCCESS]: Image features dimensions reduction successfully done.") return reduced_images_feature diff --git a/src/monocheck/feature_extraction.py b/src/monocheck/feature_extraction.py index daf9070..05e1b98 100644 --- a/src/monocheck/feature_extraction.py +++ b/src/monocheck/feature_extraction.py @@ -6,6 +6,7 @@ def extract_features(images_input: np.ndarray, model:VGG16): images_features = model.predict(images_input) + print("[SUCCESS]: Image features extraction done.") return images_features From a2b1e194e47070ae30c784afa6c347eaec8a9636 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 22 Apr 2024 15:52:07 +0530 Subject: [PATCH 08/11] do features extraction in batches of 1000 --- src/monocheck/feature_extraction.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/monocheck/feature_extraction.py b/src/monocheck/feature_extraction.py index 05e1b98..dfb8f9c 100644 --- a/src/monocheck/feature_extraction.py +++ b/src/monocheck/feature_extraction.py @@ -4,10 +4,22 @@ def extract_features(images_input: np.ndarray, model:VGG16): - images_features = model.predict(images_input) + """ total number of images""" + 
num_images = images_input.shape[0] + """ batch size """ + batch_size = 1000 + all_features = [] + """ predict in batches """ + for start in range(0, num_images, batch_size): + end = min(start + batch_size, num_images) # Ensure the last batch is handled properly + batch_features = model.predict(images_input[start:end]) + all_features.append(batch_features) + print(f"[{end}/{num_images}] image features extraction done.") + + all_features = np.vstack(all_features) + print("[SUCCESS]: Image features extraction done.") - return images_features - + return all_features From 9909655826d8d092a0f9da58c16b4ec5a29854c7 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 22 Apr 2024 16:06:01 +0530 Subject: [PATCH 09/11] store image features as pickle --- .gitignore | 4 ++++ src/monocheck/pipeline.py | 22 +++++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index b6e4761..e82e609 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +ocr_output/ +array.pkl +grouped_clusters.json + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/src/monocheck/pipeline.py b/src/monocheck/pipeline.py index 3616e93..aec0abc 100644 --- a/src/monocheck/pipeline.py +++ b/src/monocheck/pipeline.py @@ -1,6 +1,6 @@ import numpy as np import json - +import pickle from pathlib import Path from typing import List @@ -12,15 +12,26 @@ from monocheck.dimension_reduction import reduce_dimension from monocheck.clustering import cluster, group_clusters + +IMAGE_FEATURES_PICKLE = Path('array.pkl') + def pipeline(image_paths:List[Path], output_file_path:Path=Path('grouped_clusters.json')): imgs_array = [load_image(image_path).squeeze(0) for image_path in image_paths] imgs_array = np.stack(imgs_array, axis=0) model = VGG16() model = Model(inputs = model.inputs, outputs = model.layers[-2].output) - - imgs_features = extract_features(imgs_array, model) - imgs_features = imgs_features.reshape(-1,4096) + """ load image features pickle if exists """ + if not IMAGE_FEATURES_PICKLE.exists(): + imgs_features = extract_features(imgs_array, model) + imgs_features = imgs_features.reshape(-1,4096) + """ save the image features as pickle file """ + with open(IMAGE_FEATURES_PICKLE, 'wb') as file: + pickle.dump(imgs_features, file) + else: + with open(IMAGE_FEATURES_PICKLE, 'rb') as file: + imgs_features = pickle.load(file) + reduced_imgs_features = reduce_dimension(imgs_features) """ cluster the image features with kmeans """ clustering_labels = cluster(reduced_imgs_features) @@ -35,7 +46,8 @@ def pipeline(image_paths:List[Path], output_file_path:Path=Path('grouped_cluster if __name__ == "__main__": - imgs_path = [Path("image.jpg"), Path("image2.jpg")] + # imgs_path = [Path("image.jpg"), Path("image2.jpg")] + imgs_path = list(Path("ocr_output").rglob("*.jpg")) grouped_clusters = pipeline(imgs_path) From dcc10dbef046aaaf559649f7eb0c95f2bd46789b Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 22 Apr 2024 16:24:34 +0530 Subject: [PATCH 10/11] view clusters --- .gitignore | 1 + src/monocheck/clustering.py | 34 +++++++++++++++++++++++++++++++++- src/monocheck/pipeline.py | 6 ++---- 3 files changed, 36 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index e82e609..19b2b8d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ ocr_output/ array.pkl grouped_clusters.json +clustering_output/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/src/monocheck/clustering.py b/src/monocheck/clustering.py index 13a4b00..679303e 100644 --- 
a/src/monocheck/clustering.py +++ b/src/monocheck/clustering.py @@ -1,8 +1,11 @@ import numpy as np +import matplotlib.pyplot as plt + from typing import List from pathlib import Path from sklearn.cluster import KMeans +from keras.preprocessing.image import load_img def cluster(features: np.ndarray, no_of_clusters:int= 2): @@ -22,4 +25,33 @@ def group_clusters(files_paths: List[Path], cluster_labels:np.ndarray): groups[cluster].append(file) else: groups[cluster].append(file) - return groups \ No newline at end of file + return groups + + +def view_clusters(grouped_clusters, output_dir='clustering_output'): + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + clusters = list(grouped_clusters.keys()) + for cluster in clusters: + plt.figure(figsize=(30,30)) + + """ file names """ + files = grouped_clusters[cluster] + """ Only allow up to 100 images to be shown at a time """ + if len(files) > 100: + files = files[:100] + + """ Plot each image in the cluster """ + for index, file in enumerate(files): + plt.subplot(10, 10, index + 1) + img = load_img(file) + img = np.array(img) + plt.imshow(img) + plt.axis('off') + + """ save the image""" + plt.savefig(f'{output_dir}/cluster_{cluster}.png') + plt.close() + + print(f"Cluster {cluster} saved to {output_dir}/cluster_{cluster}.jpg") + diff --git a/src/monocheck/pipeline.py b/src/monocheck/pipeline.py index aec0abc..f537d3b 100644 --- a/src/monocheck/pipeline.py +++ b/src/monocheck/pipeline.py @@ -10,8 +10,7 @@ from monocheck.prepare import load_image from monocheck.feature_extraction import extract_features from monocheck.dimension_reduction import reduce_dimension -from monocheck.clustering import cluster, group_clusters - +from monocheck.clustering import cluster, group_clusters, view_clusters IMAGE_FEATURES_PICKLE = Path('array.pkl') @@ -46,10 +45,9 @@ def pipeline(image_paths:List[Path], output_file_path:Path=Path('grouped_cluster if __name__ == "__main__": - # imgs_path = [Path("image.jpg"), Path("image2.jpg")] imgs_path = list(Path("ocr_output").rglob("*.jpg")) grouped_clusters = pipeline(imgs_path) - + view_clusters(grouped_clusters) From 6e2727488ba88ce574fe3842ba192a5c6507db16 Mon Sep 17 00:00:00 2001 From: Tenzin Date: Mon, 22 Apr 2024 16:56:36 +0530 Subject: [PATCH 11/11] view clusters in pdf --- .gitignore | 1 + src/monocheck/clustering.py | 62 +++++++++++++++++++++---------------- 2 files changed, 37 insertions(+), 26 deletions(-) diff --git a/.gitignore b/.gitignore index 19b2b8d..cfb2f71 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ ocr_output/ array.pkl grouped_clusters.json clustering_output/ +output_images/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/src/monocheck/clustering.py b/src/monocheck/clustering.py index 679303e..c5a1bda 100644 --- a/src/monocheck/clustering.py +++ b/src/monocheck/clustering.py @@ -1,13 +1,15 @@ import numpy as np -import matplotlib.pyplot as plt from typing import List from pathlib import Path +from PIL import Image from sklearn.cluster import KMeans -from keras.preprocessing.image import load_img +from reportlab.lib.pagesizes import letter +from reportlab.pdfgen import canvas + def cluster(features: np.ndarray, no_of_clusters:int= 2): kmeans = KMeans(n_clusters= no_of_clusters, random_state=22) kmeans.fit(features) @@ -27,31 +29,39 @@ def group_clusters(files_paths: List[Path], cluster_labels:np.ndarray): groups[cluster].append(file) return groups - -def view_clusters(grouped_clusters, output_dir='clustering_output'): - output_dir = 
Path(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) +def view_clusters(grouped_clusters, save_path='output_images'): + """ save each clustering in different pdfs """ clusters = list(grouped_clusters.keys()) for cluster in clusters: - plt.figure(figsize=(30,30)) - - """ file names """ files = grouped_clusters[cluster] - """ Only allow up to 100 images to be shown at a time """ - if len(files) > 100: - files = files[:100] - - """ Plot each image in the cluster """ + images_per_page = 10 + page_width, page_height = letter # Default letter size + + """ Create a PDF canvas """ + c = canvas.Canvas(f'{save_path}/cluster_{cluster}.pdf', pagesize=letter) + + y_position = page_height - 72 # Initial top margin offset for index, file in enumerate(files): - plt.subplot(10, 10, index + 1) - img = load_img(file) - img = np.array(img) - plt.imshow(img) - plt.axis('off') - - """ save the image""" - plt.savefig(f'{output_dir}/cluster_{cluster}.png') - plt.close() - - print(f"Cluster {cluster} saved to {output_dir}/cluster_{cluster}.jpg") - + if index % images_per_page == 0 and index != 0: + c.showPage() # Add a new page if the current one is filled + y_position = page_height - 72 # Reset position at the top of a new page + + """ Open image and get its original size """ + img = Image.open(file) + img_width, img_height = img.size + + """ Check if the image width exceeds the page width """ + if img_width > page_width - 144: + scale_factor = (page_width - 144) / img_width + img_width *= scale_factor + img_height *= scale_factor + + """ Draw image on the canvas at original size """ + c.drawImage(file, 72, y_position - img_height, width=img_width, height=img_height) + + """ Update y_position for the next image """ + y_position -= (img_height + 10) # Move down by the image height plus some margin + + """save pdf""" + c.save() + print(f"Cluster {cluster} images saved to {save_path}/cluster_{cluster}.pdf")
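
A follow-up note on PATCH 11 (not part of the series): the PDF version of view_clusters writes to save_path='output_images' but, unlike the PATCH 10 version it replaces, never creates that directory, so c.save() will fail on a fresh checkout when it tries to write the first PDF. A minimal sketch of a guard that could run at the top of the function, before canvas.Canvas(...) is built; the helper name ensure_output_dir is illustrative, not something the patches define:

    from pathlib import Path

    def ensure_output_dir(save_path: str = 'output_images') -> Path:
        """Create the PDF output directory if it does not already exist and return it."""
        out_dir = Path(save_path)
        out_dir.mkdir(parents=True, exist_ok=True)
        return out_dir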
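
Similarly, on PATCH 09: the pipeline reuses array.pkl whenever it exists, even if the set of images under ocr_output/ has changed since the features were extracted, so a stale cache silently feeds the clustering. One way to key the cache to the input list is sketched below; the helper names and the hashing scheme are assumptions for illustration, not part of the series:

    import hashlib
    import pickle
    from pathlib import Path
    from typing import List

    def features_cache_path(image_paths: List[Path], cache_dir: Path = Path('.')) -> Path:
        """Derive the pickle name from the sorted image paths, so a changed
        image set gets a fresh cache file instead of reusing a stale one."""
        joined = '\n'.join(sorted(str(p) for p in image_paths))
        digest = hashlib.sha1(joined.encode('utf-8')).hexdigest()[:12]
        return cache_dir / f'features_{digest}.pkl'

    def load_cached_features(cache_file: Path):
        """Return the cached feature array, or None when no cache exists yet."""
        if cache_file.exists():
            with open(cache_file, 'rb') as fh:
                return pickle.load(fh)
        return None

    def save_cached_features(cache_file: Path, features) -> None:
        """Persist the extracted features, mirroring what the pipeline does with array.pkl."""
        with open(cache_file, 'wb') as fh:
            pickle.dump(features, fh)

With something like this, pipeline() could call features_cache_path(image_paths) in place of the fixed IMAGE_FEATURES_PICKLE constant.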
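
Finally, cluster() (PATCH 01) defaults to no_of_clusters=2, so every run splits the images into exactly two groups. If the number of groups is not known up front, one common extension is to scan a small range of k and keep the value with the best silhouette score; the sketch below only uses scikit-learn, which PATCH 02 already adds as a dependency, but the function itself is an assumption, not something the series implements:

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    def pick_cluster_count(features: np.ndarray, max_clusters: int = 10, random_state: int = 22) -> int:
        """Return the k in [2, max_clusters] with the highest silhouette score."""
        best_k, best_score = 2, -1.0
        upper = min(max_clusters, len(features) - 1)  # silhouette needs 2 <= k <= n_samples - 1
        for k in range(2, upper + 1):
            labels = KMeans(n_clusters=k, random_state=random_state, n_init=10).fit_predict(features)
            score = silhouette_score(features, labels)
            if score > best_score:
                best_k, best_score = k, score
        return best_k

The returned value could then be passed on, e.g. cluster(reduced_imgs_features, no_of_clusters=pick_cluster_count(reduced_imgs_features)).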