Skip to content

Commit

Permalink
fix multiprocessing on Windows
Browse files Browse the repository at this point in the history
The `fork` start method is not available on Windows, so `spawn` is used there as a fallback. The routine that converts PDFs to images now needs to live in a separate file with no Django imports.

Thanks to Dylan and Davis for pointing out this issue.
  • Loading branch information
IonMich committed Jan 24, 2024
1 parent c30b585 commit babca96
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 92 deletions.
91 changes: 91 additions & 0 deletions submissions/convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
from PIL import Image
import fitz

def convert_pdf_to_images(filepath, dpi, top_percent=0.25, left_percent=0.5, crop_box=None, skip_pages=(0,1,3)):
    """
    Render the pages of a pdf file as cropped PIL images.

    A page whose number is a key of ``crop_box`` is clipped to the box stored
    under that key. Every other non-skipped page is clipped to its top left
    corner, sized by ``left_percent`` and ``top_percent``.

    Parameters
    ----------
    filepath : str
        The path to the pdf file.
    dpi : int
        The resolution passed to ``page.get_pixmap``.
    top_percent : float
        Fraction of the page height to keep, measured from the top
        (0.25 keeps the top quarter).
    left_percent : float
        Fraction of the page width to keep, measured from the left.
    crop_box : mapping or None
        Maps a page number to a mapping with the keys "x", "y", "width" and
        "height". Each value is converted with ``float`` and interpreted as
        a percentage (0-100) of the page width or height.
    skip_pages : tuple
        Page numbers (compared against ``page.number``) that are not rendered.

    Returns
    -------
    images : list
        One entry per page of the document, in page order: a PIL image for
        a rendered page, ``None`` for a page listed in ``skip_pages``.
    """
    images = []
    # The context manager closes the document even when rendering fails,
    # so file handles are not leaked when many pdfs are processed.
    with fitz.open(filepath) as doc:
        for page in doc:
            if page.number in skip_pages:
                # Keep a placeholder so list indices still match page numbers.
                images.append(None)
                continue
            rect = page.rect  # the page rectangle

            if crop_box is not None and page.number in crop_box:
                box = crop_box[page.number]
                x_perc = float(box["x"])
                y_perc = float(box["y"])
                w_perc = float(box["width"])
                h_perc = float(box["height"])
                page_width = rect.x1 - rect.x0
                page_height = rect.y1 - rect.y0
                rect.x0 = rect.x0 + x_perc * page_width / 100
                rect.y0 = rect.y0 + y_perc * page_height / 100
                # x1/y1 are measured from the already shifted x0/y0.
                rect.x1 = rect.x0 + w_perc * page_width / 100
                rect.y1 = rect.y0 + h_perc * page_height / 100
            else:
                rect.x1 = rect.x0 + (rect.x1 - rect.x0) * left_percent
                rect.y1 = rect.y0 + (rect.y1 - rect.y0) * top_percent

            pix = page.get_pixmap(dpi=dpi, clip=rect)
            images.append(Image.frombytes(mode="RGB", size=(pix.width, pix.height), data=pix.samples))

    return images

def convert_pdf_to_images_multi(i, cpu, submissions_pdfs, dpi, top_percent, left_percent, crop_box, skip_pages):
    """
    Convert the share of ``submissions_pdfs`` that belongs to worker ``i``.

    The list is split into ``cpu`` contiguous, non-overlapping slices whose
    lengths differ by at most one, so concatenating the results of workers
    ``0 .. cpu - 1`` yields one entry per pdf in the original order.

    Parameters
    ----------
    i : int
        Index of this worker, in ``range(cpu)``.
    cpu : int
        Total number of workers the list is divided among.
    submissions_pdfs : list
        Paths of all pdf files to convert.
    dpi, top_percent, left_percent, crop_box, skip_pages
        Passed unchanged to ``convert_pdf_to_images``.

    Returns
    -------
    images : list
        One ``convert_pdf_to_images`` result per pdf in this worker's slice.
    """
    total = len(submissions_pdfs)
    # Balanced split: worker i gets [i*total//cpu, (i+1)*total//cpu). This
    # avoids handing the whole remainder (or, with fewer pdfs than workers,
    # every pdf) to the last worker.
    start = i * total // cpu
    end = (i + 1) * total // cpu
    print(f"Process {i}: {start} to {end-1}")
    return [
        convert_pdf_to_images(submissions_pdfs[j], dpi, top_percent, left_percent, crop_box, skip_pages)
        for j in range(start, end)
    ]

def multiprocessed_pdf_conversion(vectors):
    """
    Run ``convert_pdf_to_images_multi`` over ``vectors`` in a process pool.

    The 'fork' start method is tried first; if it is rejected with a
    ``ValueError`` the 'spawn' start method is used instead.

    Parameters
    ----------
    vectors : iterable of tuple
        One argument tuple per task; each tuple is unpacked into a call of
        ``convert_pdf_to_images_multi``.

    Returns
    -------
    images : list
        The lists returned by the tasks, concatenated in task order.
    """
    from multiprocessing import Pool, set_start_method
    try:
        set_start_method('fork', force=True)
    except ValueError:
        print("Forking the process failed. Trying spawn.")
        set_start_method('spawn', force=True)

    # The context manager terminates the workers if starmap raises, so a
    # failed conversion does not leave orphaned processes behind.
    with Pool() as pool:
        print("Pool initialized")
        results = pool.starmap(convert_pdf_to_images_multi, vectors)
        # Normal path: shut the workers down gracefully before leaving.
        pool.close()
        pool.join()
    print("Done")

    return [image for result in results for image in result]
44 changes: 9 additions & 35 deletions submissions/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import random
import string
import uuid
from multiprocessing import Pool, cpu_count, set_start_method

from django.conf import settings
from django.contrib.auth.models import User
Expand All @@ -18,25 +17,13 @@
from students.models import Student
from submissions.digits_classify import (classify, import_students_from_db,
import_onnx_model)
from submissions.utils import (CommaSeparatedFloatField, convert_pdf_to_images,
get_quiz_pdf_path, open_UploadedFile_as_PDF,
submission_image_upload_to,
from submissions.utils import (CommaSeparatedFloatField, get_quiz_pdf_path,
open_UploadedFile_as_PDF, submission_image_upload_to,
submission_upload_to)
from submissions.convert import (convert_pdf_to_images,
multiprocessed_pdf_conversion)


def convert_pdf_to_images_multi(i, cpu, submissions_pdfs, dpi, top_percent, left_percent, crop_box, skip_pages):
    """
    Convert the share of ``submissions_pdfs`` that belongs to worker ``i``.

    The list is split into ``cpu`` contiguous, non-overlapping slices whose
    lengths differ by at most one, so concatenating the results of workers
    ``0 .. cpu - 1`` yields one entry per pdf in the original order.

    Parameters
    ----------
    i : int
        Index of this worker, in ``range(cpu)``.
    cpu : int
        Total number of workers the list is divided among.
    submissions_pdfs : list
        Paths of all pdf files to convert.
    dpi, top_percent, left_percent, crop_box, skip_pages
        Passed unchanged to ``convert_pdf_to_images``.

    Returns
    -------
    images : list
        One ``convert_pdf_to_images`` result per pdf in this worker's slice.
    """
    total = len(submissions_pdfs)
    # Balanced split: worker i gets [i*total//cpu, (i+1)*total//cpu). This
    # avoids handing the whole remainder (or, with fewer pdfs than workers,
    # every pdf) to the last worker.
    start = i * total // cpu
    end = (i + 1) * total // cpu
    print(f"Process {i}: {start} to {end-1}")
    return [
        convert_pdf_to_images(submissions_pdfs[j], dpi, top_percent, left_percent, crop_box, skip_pages)
        for j in range(start, end)
    ]

class Submission(models.Model):
id = models.UUIDField(
primary_key=True,
Expand Down Expand Up @@ -279,27 +266,14 @@ def get_images_for_classify(cls, assignment, dpi, top_percent=0.25, left_percent
# @profile
@classmethod
def get_images_for_classify_multi(cls, assignment, dpi, top_percent=0.25, left_percent=0.5, crop_box=None, skip_pages=(0,1,3)):

submissions = PaperSubmission.objects.filter(assignment=assignment)

try:
#TODO: verify that this is ok on all platforms
set_start_method('fork')
except RuntimeError:
pass
from multiprocessing import cpu_count
cpu = cpu_count()
submissions = PaperSubmission.objects.filter(assignment=assignment)
submissions_pdfs = [sub.pdf.path for sub in submissions]
vectors = [(i, cpu, submissions_pdfs, dpi, top_percent, left_percent, crop_box, skip_pages) for i in range(cpu)]
print("Starting %i processes for %i subs..." % (cpu, len(submissions)))
pool = Pool()
results = pool.starmap(convert_pdf_to_images_multi, vectors)
pool.close()
pool.join()
print("Done")

images = []
for result in results:
images.extend(result)

print("%i submissions..." % (len(submissions)))
images = multiprocessed_pdf_conversion(vectors)

len_images = [len(imgs) for imgs in images]
image_sub_pks = [[sub.pk for i in range(len_imgs_sub)] for sub, len_imgs_sub in zip(submissions, len_images)]
Expand Down
59 changes: 2 additions & 57 deletions submissions/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
import re

import fitz

from django.core.validators import RegexValidator
from django.db import models
from django.utils.translation import gettext as _
from PIL import Image


# Matches a list of floats separated by commas and/or whitespace.
# Raw string: "\s", "\d" and "\." are invalid escapes in a normal literal.
comma_separated_float_list_re = re.compile(r'^[,\s]*([-+]?\d*\.?\d+[,\s]*)+$')
validate_comma_separated_float_list = RegexValidator(
Expand Down Expand Up @@ -81,59 +82,3 @@ def split_pdfs(pdf_fpath=None, file_idx=0, n_pages=2):
Split a PDF into multiple PDFs each of size n_pages.
"""
raise NotImplementedError

def convert_pdf_to_images(filepath, dpi, top_percent=0.25, left_percent=0.5, crop_box=None, skip_pages=(0,1,3)):
    """
    Render the pages of a pdf file as cropped PIL images.

    A page whose number is a key of ``crop_box`` is clipped to the box stored
    under that key. Every other non-skipped page is clipped to its top left
    corner, sized by ``left_percent`` and ``top_percent``.

    Parameters
    ----------
    filepath : str
        The path to the pdf file.
    dpi : int
        The resolution passed to ``page.get_pixmap``.
    top_percent : float
        Fraction of the page height to keep, measured from the top
        (0.25 keeps the top quarter).
    left_percent : float
        Fraction of the page width to keep, measured from the left.
    crop_box : mapping or None
        Maps a page number to a mapping with the keys "x", "y", "width" and
        "height". Each value is converted with ``float`` and interpreted as
        a percentage (0-100) of the page width or height.
    skip_pages : tuple
        Page numbers (compared against ``page.number``) that are not rendered.

    Returns
    -------
    images : list
        One entry per page of the document, in page order: a PIL image for
        a rendered page, ``None`` for a page listed in ``skip_pages``.
    """
    images = []
    # The context manager closes the document even when rendering fails,
    # so file handles are not leaked when many pdfs are processed.
    with fitz.open(filepath) as doc:
        for page in doc:
            if page.number in skip_pages:
                # Keep a placeholder so list indices still match page numbers.
                images.append(None)
                continue
            rect = page.rect  # the page rectangle

            if crop_box is not None and page.number in crop_box:
                box = crop_box[page.number]
                x_perc = float(box["x"])
                y_perc = float(box["y"])
                w_perc = float(box["width"])
                h_perc = float(box["height"])
                page_width = rect.x1 - rect.x0
                page_height = rect.y1 - rect.y0
                rect.x0 = rect.x0 + x_perc * page_width / 100
                rect.y0 = rect.y0 + y_perc * page_height / 100
                # x1/y1 are measured from the already shifted x0/y0.
                rect.x1 = rect.x0 + w_perc * page_width / 100
                rect.y1 = rect.y0 + h_perc * page_height / 100
            else:
                rect.x1 = rect.x0 + (rect.x1 - rect.x0) * left_percent
                rect.y1 = rect.y0 + (rect.y1 - rect.y0) * top_percent

            pix = page.get_pixmap(dpi=dpi, clip=rect)
            images.append(Image.frombytes(mode="RGB", size=(pix.width, pix.height), data=pix.samples))

    return images

0 comments on commit babca96

Please sign in to comment.