fix multiprocessing on Windows

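The `fork` start method is not available on Windows, so `spawn` is used there as a fallback. Because `spawn` re-imports the worker function's module in every child process, the routine that converts PDFs to images now needs to live in a separate file with no Django imports. Thanks to Dylan and Davis for pointing out this issue.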
IonMich · Jan 24, 2024 · babca96 · babca96
1 parent c30b585
commit babca96
Show file tree

Hide file tree

Showing 3 changed files with 102 additions and 92 deletions.
diff --git a/submissions/convert.py b/submissions/convert.py
@@ -0,0 +1,91 @@
+from PIL import Image
+import fitz
+
+def convert_pdf_to_images(filepath, dpi, top_percent=0.25, left_percent=0.5, crop_box=None, skip_pages=(0,1,3)):
+    """
+    Renders the pages of a pdf file as cropped RGB images.
+
+    Pages whose number is in skip_pages are not rendered; None is stored in
+    their place, so the returned list has one entry per page of the pdf.
+    If crop_box has an entry for a page, that page is cropped to that box.
+    Otherwise, the page is cropped to its top left corner, with the kept
+    height and width given by top_percent and left_percent.
+
+    Parameters
+    ----------
+    filepath : str
+        The path to the pdf file.
+    dpi : int
+        The dpi at which each page is rendered.
+    top_percent : float
+        The fraction of the page height to keep, measured from the top
+        (the default 0.25 keeps the top quarter).
+    left_percent : float
+        The fraction of the page width to keep, measured from the left
+        (the default 0.5 keeps the left half).
+    crop_box : mapping or None
+        Maps a page number to a mapping with the keys "x", "y", "width" and
+        "height". Each value must be convertible with float() and is read as
+        a percentage of the page width ("x", "width") or of the page height
+        ("y", "height"). Pages without an entry use the top left crop.
+    skip_pages : tuple
+        Page numbers (compared against page.number) to skip.
+
+    Returns
+    -------
+    images : list
+        One entry per page: a PIL image, or None for a skipped page.
+    """
+    images = []
+    # The context manager closes the document even if rendering a page fails.
+    with fitz.open(filepath) as doc:
+        for page in doc:
+            if page.number in skip_pages:
+                images.append(None)
+                continue
+            rect = page.rect  # the page rectangle
+
+            if crop_box is not None and page.number in crop_box:
+                box = crop_box[page.number]
+                x_perc = float(box["x"])
+                y_perc = float(box["y"])
+                w_perc = float(box["width"])
+                h_perc = float(box["height"])
+                page_width = rect.x1 - rect.x0
+                page_height = rect.y1 - rect.y0
+                # x0/y0 are moved first, so x1/y1 become "box origin + box size".
+                rect.x0 = rect.x0 + x_perc * page_width / 100
+                rect.y0 = rect.y0 + y_perc * page_height / 100
+                rect.x1 = rect.x0 + w_perc * page_width / 100
+                rect.y1 = rect.y0 + h_perc * page_height / 100
+            else:
+                rect.x1 = rect.x0 + (rect.x1 - rect.x0) * left_percent
+                rect.y1 = rect.y0 + (rect.y1 - rect.y0) * top_percent
+
+            pix = page.get_pixmap(dpi=dpi, clip=rect)
+            images.append(Image.frombytes(mode="RGB", size=[pix.width, pix.height], data=pix.samples))
+
+    return images
+
+def convert_pdf_to_images_multi(i, cpu, submissions_pdfs, dpi, top_percent, left_percent, crop_box, skip_pages):
+    """
+    Converts the i-th of cpu contiguous slices of submissions_pdfs to images.
+
+    The list is split into cpu contiguous slices whose sizes differ by at
+    most one, so the work is spread over all workers even when there are
+    fewer pdfs than workers. Concatenating the results for i = 0..cpu-1
+    yields one entry per pdf, in the order of submissions_pdfs.
+
+    Parameters
+    ----------
+    i : int
+        Index of this worker, from 0 to cpu - 1.
+    cpu : int
+        Total number of workers the list is split between.
+    submissions_pdfs : list
+        Paths of all pdf files; only this worker's slice is converted.
+    dpi, top_percent, left_percent, crop_box, skip_pages
+        Passed unchanged to convert_pdf_to_images.
+
+    Returns
+    -------
+    images : list
+        For each pdf of the slice, the list returned by convert_pdf_to_images.
+    """
+    total = len(submissions_pdfs)
+    # Integer arithmetic keeps slices contiguous, ordered and non-overlapping;
+    # the last slice always ends at total.
+    start = i * total // cpu
+    end = (i + 1) * total // cpu
+    print(f"Process {i}: {start} to {end-1}")
+    return [
+        convert_pdf_to_images(pdf_path, dpi, top_percent, left_percent, crop_box, skip_pages)
+        for pdf_path in submissions_pdfs[start:end]
+    ]
+
+def multiprocessed_pdf_conversion(vectors):
+    """
+    Runs convert_pdf_to_images_multi in a process pool and merges the results.
+
+    The 'fork' start method is tried first; where it is not available the
+    'spawn' start method is used instead. A local multiprocessing context is
+    used, so the start method of the rest of the application is left as is.
+
+    Parameters
+    ----------
+    vectors : iterable of tuple
+        One argument tuple per call of convert_pdf_to_images_multi.
+
+    Returns
+    -------
+    images : list
+        The per-call results concatenated in the order of vectors.
+    """
+    from multiprocessing import get_context
+    try:
+        # get_context raises ValueError when the start method is unavailable.
+        context = get_context('fork')
+    except ValueError:
+        print("Forking the process failed. Trying spawn.")
+        context = get_context('spawn')
+
+    # starmap blocks until every result is ready; leaving the with-block then
+    # shuts the workers down, also when a worker raised.
+    with context.Pool() as pool:
+        print("Pool initialized")
+        results = pool.starmap(convert_pdf_to_images_multi, vectors)
+    print("Done")
+
+    images = []
+    for result in results:
+        images.extend(result)
+
+    return images
diff --git a/submissions/models.py b/submissions/models.py
@@ -2,7 +2,6 @@
 import random
 import string
 import uuid
-from multiprocessing import Pool, cpu_count, set_start_method
 
 from django.conf import settings
 from django.contrib.auth.models import User
@@ -18,25 +17,13 @@
 from students.models import Student
 from submissions.digits_classify import (classify, import_students_from_db,
                                          import_onnx_model)
-from submissions.utils import (CommaSeparatedFloatField, convert_pdf_to_images,
-                               get_quiz_pdf_path, open_UploadedFile_as_PDF,
-                               submission_image_upload_to,
+from submissions.utils import (CommaSeparatedFloatField, get_quiz_pdf_path, 
+                               open_UploadedFile_as_PDF, submission_image_upload_to,
                                submission_upload_to)
+from submissions.convert import (convert_pdf_to_images, 
+                                 multiprocessed_pdf_conversion)
 
 
-def convert_pdf_to_images_multi(i, cpu, submissions_pdfs, dpi, top_percent, left_percent, crop_box, skip_pages):
-    images = []
-    segment_size = len(submissions_pdfs) // cpu
-    start = i * segment_size
-    end = (i + 1) * segment_size
-    if i == cpu - 1:
-        end = len(submissions_pdfs)
-    print(f"Process {i}: {start} to {end-1}")
-    for j in range(start, end):
-        images.append(convert_pdf_to_images(submissions_pdfs[j], dpi, top_percent, left_percent, crop_box, skip_pages))
-
-    return images
-
 class Submission(models.Model):
     id = models.UUIDField(
         primary_key=True, 
@@ -279,27 +266,14 @@ def get_images_for_classify(cls, assignment, dpi, top_percent=0.25, left_percent
     # @profile
     @classmethod
     def get_images_for_classify_multi(cls, assignment, dpi, top_percent=0.25, left_percent=0.5, crop_box=None, skip_pages=(0,1,3)):
-
-        submissions = PaperSubmission.objects.filter(assignment=assignment)
-
-        try:
-            #TODO: verify that this is ok on all platforms
-            set_start_method('fork')
-        except RuntimeError:
-            pass
+        from multiprocessing import cpu_count
         cpu = cpu_count()
+        submissions = PaperSubmission.objects.filter(assignment=assignment)
         submissions_pdfs = [sub.pdf.path for sub in submissions]
         vectors = [(i, cpu, submissions_pdfs, dpi, top_percent, left_percent, crop_box, skip_pages) for i in range(cpu)]
-        print("Starting %i processes for %i subs..." % (cpu, len(submissions)))
-        pool = Pool()
-        results = pool.starmap(convert_pdf_to_images_multi, vectors)
-        pool.close()
-        pool.join()
-        print("Done")
-
-        images = []
-        for result in results:
-            images.extend(result)
+
+        print("%i submissions..." % (len(submissions)))
+        images = multiprocessed_pdf_conversion(vectors)
 
         len_images = [len(imgs) for imgs in images]
         image_sub_pks = [[sub.pk for i in range(len_imgs_sub)] for sub, len_imgs_sub in zip(submissions, len_images)]

diff --git a/submissions/utils.py b/submissions/utils.py
@@ -3,10 +3,11 @@
 import re
 
 import fitz
+
 from django.core.validators import RegexValidator
 from django.db import models
 from django.utils.translation import gettext as _
-from PIL import Image
+
 
 comma_separated_float_list_re = re.compile('^[,\s]*([-+]?\d*\.?\d+[,\s]*)+$')
 validate_comma_separated_float_list = RegexValidator(
@@ -81,59 +82,3 @@ def split_pdfs(pdf_fpath=None, file_idx=0, n_pages=2):
     Split a PDF into multiple PDFs each of size n_pages.
     """
     raise NotImplementedError
-
-def convert_pdf_to_images(filepath, dpi, top_percent=0.25, left_percent=0.5, crop_box=None, skip_pages=(0,1,3)):
-    """
-    Converts a pdf file to a list of images.
-    If crop_box is not None, then the images are cropped to the specified box.
-    Otherwise, the images are cropped to the top left corner of the page,
-    with the width and height specified by top_percent and left_percent.
-
-    Parameters
-    ----------
-    filepath : str
-        The path to the pdf file.
-    dpi : int
-        The dpi of the images.
-    top_percent : float
-        The percentage of the top of the page to keep.
-    left_percent : float
-        The percentage of the left of the page to crop.
-    crop_box : tuple
-        A tuple of the form (left, top, right, bottom) that specifies the crop box.
-    skip_pages : tuple
-        A tuple of page numbers to skip.
-
-    Returns
-    -------
-    images : list
-        A list of images.
-    """
-    images = []
-    doc = fitz.open(filepath)
-    for page in doc:
-        if page.number in skip_pages:
-            images.append(None)
-            continue
-        rect = page.rect  # the page rectangle
-
-        if crop_box is not None and page.number in crop_box:
-            x_perc = float(crop_box[page.number]["x"])
-            y_perc = float(crop_box[page.number]["y"])
-            w_perc = float(crop_box[page.number]["width"])
-            h_perc = float(crop_box[page.number]["height"])
-            page_width = rect.x1 - rect.x0
-            page_height = rect.y1 - rect.y0
-            rect.x0 = rect.x0 + x_perc * page_width / 100
-            rect.y0 = rect.y0 + y_perc * page_height / 100
-            rect.x1 = rect.x0 + w_perc * page_width / 100
-            rect.y1 = rect.y0 + h_perc * page_height / 100
-        else:
-            rect.x1 = rect.x0 + (rect.x1 - rect.x0) * left_percent
-            rect.y1 = rect.y0 + (rect.y1 - rect.y0) * top_percent
-
-        pix = page.get_pixmap(dpi=dpi, clip=rect)
-        images.append(Image.frombytes(mode="RGB", size=[pix.width, pix.height], data=pix.samples))
-
-    return images
-