Skip to content

Commit

Permalink
Merge pull request #4 from tenzin3/feat/clustering
Browse files Browse the repository at this point in the history
Feat/clustering
  • Loading branch information
tenzin3 authored Apr 25, 2024
2 parents c9486f7 + 6e27274 commit 66cbd2a
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 10 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
ocr_output/
array.pkl
grouped_clusters.json
clustering_output/
output_images/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ dependencies = [
"numpy",
"pillow",
"tensorflow",
"scikit-learn",
]

[project.optional-dependencies]
Expand Down
67 changes: 67 additions & 0 deletions src/monocheck/clustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import numpy as np

from typing import List
from pathlib import Path
from PIL import Image

from sklearn.cluster import KMeans


from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

def cluster(features: np.ndarray, no_of_clusters: int = 2):
    """Assign every feature vector to a k-means cluster.

    Args:
        features: Array handed unchanged to ``KMeans.fit``.
        no_of_clusters: Number of clusters to form (defaults to 2).

    Returns:
        The fitted estimator's ``labels_`` array, i.e. one cluster label
        per fitted sample.
    """
    # random_state is pinned so repeated runs on the same features agree.
    fitted_model = KMeans(random_state=22, n_clusters=no_of_clusters).fit(features)
    labels = fitted_model.labels_

    print("[SUCCESS]: Clustering succesfully done")
    return labels

def group_clusters(files_paths: List[Path], cluster_labels: np.ndarray):
    """Group file paths by the cluster label assigned to each of them.

    Args:
        files_paths: Paths of the clustered files, in the same order as
            ``cluster_labels``.
        cluster_labels: One label per entry of ``files_paths``.

    Returns:
        A plain ``dict`` mapping each label (as a built-in ``int``) to the
        list of file paths (as ``str``) that received it, in input order.

    Raises:
        ValueError: If the number of paths and labels differ. Pairing them
            anyway would silently drop the surplus entries and could attach
            labels to the wrong files.
    """
    if len(files_paths) != len(cluster_labels):
        raise ValueError(
            f"Got {len(files_paths)} file paths but {len(cluster_labels)} cluster labels"
        )

    groups = {}
    for path, label in zip(files_paths, cluster_labels):
        # NumPy integer labels are not JSON serialisable, and Path objects are
        # not either, so both are converted to built-in types here.
        groups.setdefault(int(label), []).append(str(path))
    return groups

def view_clusters(grouped_clusters, save_path='output_images'):
    """Write one PDF per cluster showing that cluster's images top to bottom.

    Each cluster is saved as ``<save_path>/cluster_<label>.pdf`` on
    letter-sized pages with a 72-point margin. Images are stacked vertically
    with a 10-point gap. Pixel dimensions are used directly as PDF points,
    and an image is scaled down (aspect ratio preserved) when it is wider or
    taller than the printable area.

    A new page is started when the current page already holds 10 images or
    when the next image would extend past the bottom margin.

    Args:
        grouped_clusters: Mapping of cluster label to a list of image file
            paths.
        save_path: Directory receiving the PDFs. It is created if missing.
    """
    images_per_page = 10
    margin = 72   # page margin in PDF points
    gap = 10      # vertical space between consecutive images
    page_width, page_height = letter
    max_width = page_width - 2 * margin
    max_height = page_height - 2 * margin

    # Canvas.save() cannot create missing directories, so make sure the
    # target exists before any PDF is written.
    Path(save_path).mkdir(parents=True, exist_ok=True)

    for cluster, files in grouped_clusters.items():
        pdf_path = f'{save_path}/cluster_{cluster}.pdf'
        pdf = canvas.Canvas(pdf_path, pagesize=letter)

        y_position = page_height - margin
        images_on_page = 0
        for file in files:
            # Only the size is needed here; the context manager releases the
            # file handle immediately instead of leaking one per image.
            with Image.open(file) as img:
                img_width, img_height = img.size

            # Shrink (never enlarge) so the image fits the printable area.
            scale_factor = 1.0
            if img_width > max_width:
                scale_factor = max_width / img_width
            if img_height * scale_factor > max_height:
                scale_factor = max_height / img_height
            img_width *= scale_factor
            img_height *= scale_factor

            page_is_full = images_on_page >= images_per_page
            runs_off_page = y_position - img_height < margin
            # Never break before the first image of a page: that would only
            # emit a blank page.
            if images_on_page and (page_is_full or runs_off_page):
                pdf.showPage()
                y_position = page_height - margin
                images_on_page = 0

            pdf.drawImage(file, margin, y_position - img_height,
                          width=img_width, height=img_height)
            y_position -= img_height + gap
            images_on_page += 1

        pdf.save()
        print(f"Cluster {cluster} images saved to {pdf_path}")
1 change: 1 addition & 0 deletions src/monocheck/dimension_reduction.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,6 @@ def reduce_dimension(images_feature: np.ndarray, components:int = 100):
pca.fit(images_feature)
reduced_images_feature = pca.transform(images_feature)

print("[SUCCESS]: Image features dimensions reduction successfully done.")
return reduced_images_feature

19 changes: 16 additions & 3 deletions src/monocheck/feature_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,22 @@


def extract_features(images_input: np.ndarray, model:VGG16, batch_size: int = 1000):
    """Run ``model.predict`` over the images in batches and stack the results.

    Predicting in slices bounds how many images are handed to the model at
    once; progress is printed after every batch.

    Args:
        images_input: Array of images indexed along its first axis.
        model: Object exposing ``predict(batch)``.
        batch_size: Maximum number of images per ``predict`` call
            (defaults to 1000).

    Returns:
        The per-batch predictions joined with ``np.vstack``, in input order.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``images_input``
            holds no images (there would be nothing to stack).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    num_images = images_input.shape[0]
    if num_images == 0:
        raise ValueError("images_input contains no images")

    all_features = []
    for start in range(0, num_images, batch_size):
        # The final batch may be shorter than batch_size.
        end = min(start + batch_size, num_images)
        all_features.append(model.predict(images_input[start:end]))
        print(f"[{end}/{num_images}] image features extraction done.")

    all_features = np.vstack(all_features)

    print("[SUCCESS]: Image features extraction done.")
    return all_features


38 changes: 31 additions & 7 deletions src/monocheck/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import numpy as np
import json
import pickle
from pathlib import Path
from typing import List

Expand All @@ -8,22 +10,44 @@
from monocheck.prepare import load_image
from monocheck.feature_extraction import extract_features
from monocheck.dimension_reduction import reduce_dimension
from monocheck.clustering import cluster, group_clusters, view_clusters

IMAGE_FEATURES_PICKLE = Path('array.pkl')


def pipeline(image_paths:List[Path], output_file_path:Path=Path('grouped_clusters.json')):
    """Cluster images by their extracted features and save the grouping as JSON.

    Steps: obtain image features (from the ``IMAGE_FEATURES_PICKLE`` cache
    when usable, otherwise by running the images through VGG16 truncated at
    its second-to-last layer), reduce their dimensionality, cluster them,
    group the image paths by cluster label and write the result to
    ``output_file_path``.

    Args:
        image_paths: Images to cluster. Cached features are matched to these
            paths by position.
        output_file_path: Destination of the JSON file with the grouping.

    Returns:
        The mapping produced by ``group_clusters`` (cluster label to image
        paths). Note that ``json.dump`` writes the integer labels as string
        keys in the saved file.
    """
    imgs_features = None
    if IMAGE_FEATURES_PICKLE.exists():
        # NOTE(security): pickle.load can execute arbitrary code. Only use a
        # cache file that this pipeline wrote itself.
        with open(IMAGE_FEATURES_PICKLE, 'rb') as file:
            imgs_features = pickle.load(file)
        # A cache built from a different number of images cannot belong to
        # image_paths; pairing it with them would mislabel files.
        # NOTE(review): same-sized but different image sets are not detected.
        if len(imgs_features) != len(image_paths):
            print(f"[WARNING]: Cached features in {IMAGE_FEATURES_PICKLE} do not match "
                  "the number of images; recomputing.")
            imgs_features = None

    if imgs_features is None:
        # Images and the network are only loaded when features must actually
        # be computed.
        imgs_array = [load_image(image_path).squeeze(0) for image_path in image_paths]
        imgs_array = np.stack(imgs_array, axis=0)
        model = VGG16()
        model = Model(inputs = model.inputs, outputs = model.layers[-2].output)

        imgs_features = extract_features(imgs_array, model)
        imgs_features = imgs_features.reshape(-1,4096)
        with open(IMAGE_FEATURES_PICKLE, 'wb') as file:
            pickle.dump(imgs_features, file)

    reduced_imgs_features = reduce_dimension(imgs_features)
    # Cluster the reduced features with k-means.
    clustering_labels = cluster(reduced_imgs_features)
    # Group images by label: key is the label, value the image paths.
    cluster_groups = group_clusters(image_paths, clustering_labels)

    with open(output_file_path, 'w') as file:
        json.dump(cluster_groups, file, indent=4)
    return cluster_groups



if __name__ == "__main__":
    # Sorted so the path order is reproducible between runs: the cached
    # feature rows are matched to images by position, and rglob's order
    # depends on the filesystem.
    imgs_path = sorted(Path("ocr_output").rglob("*.jpg"))
    if not imgs_path:
        raise SystemExit("No .jpg images found under ocr_output/")
    grouped_clusters = pipeline(imgs_path)
    view_clusters(grouped_clusters)



Expand Down

0 comments on commit 66cbd2a

Please sign in to comment.