added ability to operate in lenient mode, i.e., tolerating trailing junk data
waikato-datamining · Apr 17, 2023 · 90e2c52 · 90e2c52
1 parent 91e4d88
commit 90e2c52
Show file tree

Hide file tree

Showing 21 changed files with 303 additions and 58 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -1,6 +1,12 @@
 Changelog
 =========
 
+0.0.5 (2023-04-17)
+------------------
+
+- added support for *lenient mode* via `strict` and `check_size` parameters
+
+
 0.0.4 (2023-03-30)
 ------------------
 

diff --git a/README.md b/README.md
@@ -2,7 +2,12 @@
 
 Python 3 library for checking whether an image is complete or not. 
 It is either looking for EOF markers or checking the length of the file against one stored in the file.
-Can also operate on bytes or BytesIO objects.
+
+Can also operate on bytes or BytesIO objects. 
+
+By default, the library operates in **strict** mode, i.e., no trailing junk data 
+is tolerated. However, by supplying the parameters `strict` and `check_size` this
+can be turned into **lenient** mode. `check_size` is only used for the formats gif, jpg, and png.
 
 
 ## Supported image formats
@@ -41,29 +46,46 @@ Can also operate on bytes or BytesIO objects.
 
 ## Examples
 
-* auto detection
+### Auto detection
 
-  ```python
-  from image_complete.auto import is_image_complete
+```python
+from image_complete.auto import is_image_complete
 
-  # using file names
-  print(is_image_complete("/some/where/hello_world.jpg"))
-  print(is_image_complete("/some/where/image.png"))
-
-  # using bytes or BytesIO
-  with open("/some/where/image.bmp", "rb") as fp:
-      b = fp.read()
-  print(is_image_complete(b))
-  ```
-
-* JPG specific
-
-  ```python
-  from image_complete.jpg import is_jpg_complete, is_jpg
-
-  f = "/some/where/hello_world.jpg"
-  if is_jpg(f):
-      print(is_jpg_complete(f))
-  else:
-      print("Not a JPG!")
-  ```
+# using file names
+print(is_image_complete("/some/where/hello_world.jpg"))
+print(is_image_complete("/some/where/image.png"))
+
+# using bytes or BytesIO
+with open("/some/where/image.bmp", "rb") as fp:
+    b = fp.read()
+print(is_image_complete(b))
+```
+
+
+### JPG specific
+
+```python
+from image_complete.jpg import is_jpg_complete, is_jpg
+
+f = "/some/where/hello_world.jpg"
+if is_jpg(f):
+    print(is_jpg_complete(f))
+else:
+    print("Not a JPG!")
+```
+
+
+### Lenient mode (i.e., tolerating trailing junk data)
+
+```python
+from image_complete.auto import is_image_complete
+
+# using file names
+print(is_image_complete("/some/where/hello_world.jpg", strict=False, check_size=100))
+print(is_image_complete("/some/where/image.png", strict=False, check_size=100))
+
+# using bytes or BytesIO
+with open("/some/where/image.bmp", "rb") as fp:
+    b = fp.read()
+print(is_image_complete(b, strict=False))
+```
diff --git a/setup.py b/setup.py
@@ -32,7 +32,7 @@ def _read(f):
     packages=[
         "image_complete",
     ],
-    version="0.0.4",
+    version="0.0.5",
     author='Peter Reutemann',
     author_email='fracpete@waikato.ac.nz',
 )
diff --git a/src/image_complete/auto.py b/src/image_complete/auto.py
@@ -1,19 +1,24 @@
 from io import BytesIO
 
+from image_complete.base import DEFAULT_CHECK_SIZE
 from image_complete.bmp import is_bmp_complete, is_bmp
 from image_complete.gif import is_gif_complete, is_gif
 from image_complete.jpg import is_jpg_complete, is_jpg
 from image_complete.png import is_png_complete, is_png
 from image_complete.webp import is_webp_complete, is_webp
 
 
-def is_image_complete(img):
+def is_image_complete(img, strict=True, check_size=DEFAULT_CHECK_SIZE):
     """
     Checks whether the image is complete. Auto-detects the type based on extension.
     If the type is not supported, it will throw an exception.
 
     :param img: the absolute path to the image or a bytes/BytesIO object
     :type img: str or bytes or BytesIO
+    :param strict: if True then no junk data after actual data is allowed
+    :type strict: bool
+    :param check_size: the number of bytes from the end of the file to look for EOF marker (only used by: gif, jpg, png)
+    :type check_size: int
     :return: True if complete
     :rtype: bool
     """
@@ -22,30 +27,30 @@ def is_image_complete(img):
 
     if isinstance(img, str):
         name = img.lower()
-        if name.endswith(".gif"):
-            return is_gif_complete(img)
+        if name.endswith(".bmp"):
+            return is_bmp_complete(img, strict=strict)
+        elif name.endswith(".gif"):
+            return is_gif_complete(img, strict=strict, check_size=check_size)
         elif name.endswith(".jpg") or name.endswith(".jpeg"):
-            return is_jpg_complete(img)
+            return is_jpg_complete(img, strict=strict, check_size=check_size)
         elif name.endswith(".png"):
-            return is_png_complete(img)
-        elif name.endswith(".bmp"):
-            return is_bmp_complete(img)
+            return is_png_complete(img, strict=strict, check_size=check_size)
         elif name.endswith(".webp"):
-            return is_webp_complete(img)
+            return is_webp_complete(img, strict=strict)
         else:
             raise Exception("Unsupported file type: " + img)
 
     elif isinstance(img, BytesIO):
         if is_bmp(img):
-            return is_bmp_complete(img)
+            return is_bmp_complete(img, strict=strict)
         elif is_gif(img):
-            return is_gif_complete(img)
+            return is_gif_complete(img, strict=strict, check_size=check_size)
         elif is_jpg(img):
-            return is_jpg_complete(img)
+            return is_jpg_complete(img, strict=strict, check_size=check_size)
         elif is_png(img):
-            return is_png_complete(img)
+            return is_png_complete(img, strict=strict, check_size=check_size)
         elif is_webp(img):
-            return is_webp_complete(img)
+            return is_webp_complete(img, strict=strict)
         else:
             raise Exception("Failed to determine file type!")
     else:

diff --git a/src/image_complete/base.py b/src/image_complete/base.py
@@ -3,6 +3,10 @@
 from io import BytesIO
 
 
+DEFAULT_CHECK_SIZE = 100
+""" the buffer size used to look for the EOF marker. """
+
+
 def load(img):
     """
     Loads the data and returns it as BytesIO object and the associated length.

diff --git a/src/image_complete/bmp.py b/src/image_complete/bmp.py
@@ -24,14 +24,16 @@ def is_bmp(img):
         return False
 
 
-def is_bmp_complete(img):
+def is_bmp_complete(img, strict=True):
     """
     Checks whether the BMP image is complete.
 
     https://en.wikipedia.org/wiki/BMP_file_format#File_structure
 
     :param img: the absolute path to the BMP image or a bytes/BytesIO object
     :type img: str or bytes or BytesIO
+    :param strict: if True then no junk data after actual data is allowed
+    :type strict: bool
     :return: True if complete
     :rtype: bool
     """
@@ -43,7 +45,10 @@ def is_bmp_complete(img):
             data.seek(2, 0)
             data = data.read(4)
             blen = struct.unpack('I', data)
-            return blen[0] == data_len
+            if strict:
+                return blen[0] == data_len
+            else:
+                return blen[0] <= data_len
         else:
             return False
     except:

diff --git a/src/image_complete/gif.py b/src/image_complete/gif.py
@@ -1,5 +1,5 @@
 from io import BytesIO
-from .base import load
+from .base import load, DEFAULT_CHECK_SIZE
 
 
 def is_gif(img):
@@ -22,14 +22,18 @@ def is_gif(img):
         return False
 
 
-def is_gif_complete(img):
+def is_gif_complete(img, strict=True, check_size=DEFAULT_CHECK_SIZE):
     """
     Checks whether the GIF image is complete.
 
     https://en.wikipedia.org/wiki/GIF#File_format
 
     :param img: the absolute path to the GIF image or a bytes/BytesIO object
     :type img: str or bytes or BytesIO
+    :param strict: if True then no junk data after actual data is allowed
+    :type strict: bool
+    :param check_size: the number of bytes from the end of the file to look for EOF marker
+    :type check_size: int
     :return: True if complete
     :rtype: bool
     """
@@ -38,9 +42,19 @@ def is_gif_complete(img):
         if data is None:
             return False
         if data_len > 1:
-            data.seek(data_len - 1, 0)
-            marker = data.read(1)
-            return marker[0] == 59
+            if strict:
+                data.seek(data_len - 1, 0)
+                marker = data.read(1)
+                return marker[0] == 59
+            else:
+                if check_size > data_len:
+                    check_size = data_len
+                data.seek(data_len - check_size, 0)
+                buffer = data.read(check_size)
+                for b in buffer:
+                    if b == 59:
+                        return True
+                return False
         else:
             return False
     except:

diff --git a/src/image_complete/jpg.py b/src/image_complete/jpg.py
@@ -1,5 +1,5 @@
 from io import BytesIO
-from .base import load
+from .base import load, DEFAULT_CHECK_SIZE
 
 
 def is_jpg(img):
@@ -22,14 +22,18 @@ def is_jpg(img):
         return False
 
 
-def is_jpg_complete(img):
+def is_jpg_complete(img, strict=True, check_size=DEFAULT_CHECK_SIZE):
     """
     Checks whether the JPG image is complete.
 
     http://en.wikipedia.org/wiki/JPEG#Syntax_and_structure
 
     :param img: the absolute path to the JPG image or a bytes/BytesIO object
     :type img: str or bytes or BytesIO
+    :param strict: if True then no junk data after actual data is allowed
+    :type strict: bool
+    :param check_size: the number of bytes from the end of the file to look for EOF marker
+    :type check_size: int
     :return: True if complete
     :rtype: bool
     """
@@ -38,9 +42,19 @@ def is_jpg_complete(img):
         if data is None:
             return False
         if data_len > 2:
-            data.seek(data_len - 2, 0)
-            marker = data.read(2)
-            return (marker[0] == 0xFF) and (marker[1] == 0xD9)
+            if strict:
+                data.seek(data_len - 2, 0)
+                marker = data.read(2)
+                return (marker[0] == 0xFF) and (marker[1] == 0xD9)
+            else:
+                if check_size > data_len:
+                    check_size = data_len
+                data.seek(data_len - check_size, 0)
+                buffer = data.read(check_size)
+                for i in range(len(buffer) - 1):
+                    if (buffer[i] == 0xFF) and (buffer[i+1] == 0xD9):
+                        return True
+                return False
         else:
             return False
     except:

diff --git a/src/image_complete/png.py b/src/image_complete/png.py
@@ -1,5 +1,5 @@
 from io import BytesIO
-from .base import load
+from .base import load, DEFAULT_CHECK_SIZE
 
 
 def is_png(img):
@@ -22,7 +22,7 @@ def is_png(img):
         return False
 
 
-def is_png_complete(img):
+def is_png_complete(img, strict=True, check_size=DEFAULT_CHECK_SIZE):
     """
     Checks whether the PNG image is complete.
 
@@ -32,6 +32,10 @@ def is_png_complete(img):
 
     :param img: the absolute path to the BMP image or a bytes/BytesIO object
     :type img: str or bytes or BytesIO
+    :param strict: if True then no junk data after actual data is allowed
+    :type strict: bool
+    :param check_size: the number of bytes from the end of the file to look for EOF marker
+    :type check_size: int
     :return: True if complete
     :rtype: bool
     """
@@ -40,9 +44,19 @@ def is_png_complete(img):
         if data is None:
             return False
         if data_len > 8:
-            data.seek(data_len - 8, 0)
-            marker = data.read(8)
-            return (marker[0] == 73) and (marker[1] == 69) and (marker[2] == 78) and (marker[3] == 68)
+            if strict:
+                data.seek(data_len - 8, 0)
+                marker = data.read(8)
+                return (marker[0] == 73) and (marker[1] == 69) and (marker[2] == 78) and (marker[3] == 68)
+            else:
+                if check_size > data_len:
+                    check_size = data_len
+                data.seek(data_len - check_size, 0)
+                buffer = data.read(check_size)
+                for i in range(len(buffer) - 8):
+                    if (buffer[i] == 73) and (buffer[i+1] == 69) and (buffer[i+2] == 78) and (buffer[i+3] == 68):
+                        return True
+                return False
         else:
             return False
     except:

diff --git a/src/image_complete/webp.py b/src/image_complete/webp.py
@@ -24,14 +24,16 @@ def is_webp(img):
         return False
 
 
-def is_webp_complete(img):
+def is_webp_complete(img, strict=True):
     """
     Checks whether the WebP image is complete.
 
     https://developers.google.com/speed/webp/docs/riff_container
 
     :param img: the absolute path to the BMP image or a bytes/BytesIO object
     :type img: str or bytes or BytesIO
+    :param strict: if True then no junk data after actual data is allowed
+    :type strict: bool
     :return: True if complete
     :rtype: bool
     """
@@ -50,7 +52,10 @@ def is_webp_complete(img):
             data.seek(4, 0)
             data = data.read(4)
             d_len = struct.unpack('I', data)
-            return d_len[0] == data_len - 8  # RIFF/4 + DATALEN/4 = 8
+            if strict:
+                return d_len[0] == data_len - 8  # RIFF/4 + DATALEN/4 = 8
+            else:
+                return d_len[0] <= data_len - 8  # RIFF/4 + DATALEN/4 = 8
         else:
             return False
     except: