From 50a223b44b27a526cffd6192b25a54e759b99b7b Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Thu, 2 Jul 2020 17:42:41 +0200 Subject: [PATCH 1/5] [WIP] Unify ops Grayscale and RandomGrayscale --- test/test_functional_tensor.py | 13 +++++ test/test_transforms_tensor.py | 64 +++++++++++++-------- torchvision/transforms/functional.py | 32 +++++------ torchvision/transforms/functional_pil.py | 37 ++++++++++++ torchvision/transforms/functional_tensor.py | 45 +++++++++++++++ torchvision/transforms/transforms.py | 30 ++++++---- 6 files changed, 169 insertions(+), 52 deletions(-) diff --git a/test/test_functional_tensor.py b/test/test_functional_tensor.py index 0c2194e0f7b..540ffe87725 100644 --- a/test/test_functional_tensor.py +++ b/test/test_functional_tensor.py @@ -278,6 +278,19 @@ def test_pad(self): pad_tensor_script = script_fn(tensor, script_pad, **kwargs) self.assertTrue(pad_tensor.equal(pad_tensor_script), msg="{}, {}".format(pad, kwargs)) + # def test_to_grayscale(self): + # script_to_grayscale = torch.jit.script(F_t.to_grayscale) + # img_tensor = torch.randint(0, 255, (4, 3, 16, 16), dtype=torch.uint8) + # img_tensor_clone = img_tensor.clone() + # grayscale_tensor = F_t.rgb_to_grayscale(img_tensor).to(int) + # grayscale_pil_img = torch.tensor(np.array(F.to_grayscale(F.to_pil_image(img_tensor)))).to(int) + # max_diff = (grayscale_tensor - grayscale_pil_img).abs().max() + # self.assertLess(max_diff, 1.0001) + # self.assertTrue(torch.equal(img_tensor, img_tensor_clone)) + # # scriptable function test + # grayscale_script = script_rgb_to_grayscale(img_tensor).to(int) + # self.assertTrue(torch.equal(grayscale_script, grayscale_tensor)) + if __name__ == '__main__': unittest.main() diff --git a/test/test_transforms_tensor.py b/test/test_transforms_tensor.py index 6a8d9930754..c28c84ff88f 100644 --- a/test/test_transforms_tensor.py +++ b/test/test_transforms_tensor.py @@ -15,10 +15,13 @@ def _create_data(self, height=3, width=3, channels=3): return tensor, pil_img def compareTensorToPIL(self, tensor, pil_image): - pil_tensor = torch.as_tensor(np.array(pil_image).transpose((2, 0, 1))) + pil_tensor = np.array(pil_image) + if pil_tensor.ndim == 2: + pil_tensor = pil_tensor[:, :, None] + pil_tensor = torch.as_tensor(pil_tensor.transpose((2, 0, 1))) self.assertTrue(tensor.equal(pil_tensor)) - def _test_functional_geom_op(self, func, fn_kwargs): + def _test_functional_op(self, func, fn_kwargs): if fn_kwargs is None: fn_kwargs = {} tensor, pil_img = self._create_data(height=10, width=10) @@ -26,7 +29,7 @@ def _test_functional_geom_op(self, func, fn_kwargs): transformed_pil_img = getattr(F, func)(pil_img, **fn_kwargs) self.compareTensorToPIL(transformed_tensor, transformed_pil_img) - def _test_class_geom_op(self, method, meth_kwargs=None): + def _test_class_op(self, method, meth_kwargs=None): if meth_kwargs is None: meth_kwargs = {} @@ -46,15 +49,15 @@ def _test_class_geom_op(self, method, meth_kwargs=None): transformed_tensor_script = scripted_fn(tensor) self.assertTrue(transformed_tensor.equal(transformed_tensor_script)) - def _test_geom_op(self, func, method, fn_kwargs=None, meth_kwargs=None): - self._test_functional_geom_op(func, fn_kwargs) - self._test_class_geom_op(method, meth_kwargs) + def _test_op(self, func, method, fn_kwargs=None, meth_kwargs=None): + self._test_functional_op(func, fn_kwargs) + self._test_class_op(method, meth_kwargs) def test_random_horizontal_flip(self): - self._test_geom_op('hflip', 'RandomHorizontalFlip') + self._test_op('hflip', 'RandomHorizontalFlip') def 
test_random_vertical_flip(self): - self._test_geom_op('vflip', 'RandomVerticalFlip') + self._test_op('vflip', 'RandomVerticalFlip') def test_adjustments(self): fns = ['adjust_brightness', 'adjust_contrast', 'adjust_saturation'] @@ -85,22 +88,22 @@ def test_adjustments(self): def test_pad(self): # Test functional.pad (PIL and Tensor) with padding as single int - self._test_functional_geom_op( + self._test_functional_op( "pad", fn_kwargs={"padding": 2, "fill": 0, "padding_mode": "constant"} ) # Test functional.pad and transforms.Pad with padding as [int, ] fn_kwargs = meth_kwargs = {"padding": [2, ], "fill": 0, "padding_mode": "constant"} - self._test_geom_op( + self._test_op( "pad", "Pad", fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs ) # Test functional.pad and transforms.Pad with padding as list fn_kwargs = meth_kwargs = {"padding": [4, 4], "fill": 0, "padding_mode": "constant"} - self._test_geom_op( + self._test_op( "pad", "Pad", fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs ) # Test functional.pad and transforms.Pad with padding as tuple fn_kwargs = meth_kwargs = {"padding": (2, 2, 2, 2), "fill": 127, "padding_mode": "constant"} - self._test_geom_op( + self._test_op( "pad", "Pad", fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs ) @@ -108,7 +111,7 @@ def test_crop(self): fn_kwargs = {"top": 2, "left": 3, "height": 4, "width": 5} # Test transforms.RandomCrop with size and padding as tuple meth_kwargs = {"size": (4, 5), "padding": (4, 4), "pad_if_needed": True, } - self._test_geom_op( + self._test_op( 'crop', 'RandomCrop', fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs ) @@ -125,17 +128,17 @@ def test_crop(self): for padding_config in padding_configs: config = dict(padding_config) config["size"] = size - self._test_class_geom_op("RandomCrop", config) + self._test_class_op("RandomCrop", config) def test_center_crop(self): fn_kwargs = {"output_size": (4, 5)} meth_kwargs = {"size": (4, 5), } - self._test_geom_op( + self._test_op( "center_crop", "CenterCrop", fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs ) fn_kwargs = {"output_size": (5,)} meth_kwargs = {"size": (5, )} - self._test_geom_op( + self._test_op( "center_crop", "CenterCrop", fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs ) tensor = torch.randint(0, 255, (3, 10, 10), dtype=torch.uint8) @@ -154,7 +157,7 @@ def test_center_crop(self): scripted_fn = torch.jit.script(f) scripted_fn(tensor) - def _test_geom_op_list_output(self, func, method, out_length, fn_kwargs=None, meth_kwargs=None): + def _test_op_list_output(self, func, method, out_length, fn_kwargs=None, meth_kwargs=None): if fn_kwargs is None: fn_kwargs = {} if meth_kwargs is None: @@ -183,40 +186,51 @@ def _test_geom_op_list_output(self, func, method, out_length, fn_kwargs=None, me def test_five_crop(self): fn_kwargs = meth_kwargs = {"size": (5,)} - self._test_geom_op_list_output( + self._test_op_list_output( "five_crop", "FiveCrop", out_length=5, fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs ) fn_kwargs = meth_kwargs = {"size": [5, ]} - self._test_geom_op_list_output( + self._test_op_list_output( "five_crop", "FiveCrop", out_length=5, fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs ) fn_kwargs = meth_kwargs = {"size": (4, 5)} - self._test_geom_op_list_output( + self._test_op_list_output( "five_crop", "FiveCrop", out_length=5, fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs ) fn_kwargs = meth_kwargs = {"size": [4, 5]} - self._test_geom_op_list_output( + self._test_op_list_output( "five_crop", "FiveCrop", out_length=5, fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs ) def test_ten_crop(self): 
fn_kwargs = meth_kwargs = {"size": (5,)} - self._test_geom_op_list_output( + self._test_op_list_output( "ten_crop", "TenCrop", out_length=10, fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs ) fn_kwargs = meth_kwargs = {"size": [5, ]} - self._test_geom_op_list_output( + self._test_op_list_output( "ten_crop", "TenCrop", out_length=10, fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs ) fn_kwargs = meth_kwargs = {"size": (4, 5)} - self._test_geom_op_list_output( + self._test_op_list_output( "ten_crop", "TenCrop", out_length=10, fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs ) fn_kwargs = meth_kwargs = {"size": [4, 5]} - self._test_geom_op_list_output( + self._test_op_list_output( "ten_crop", "TenCrop", out_length=10, fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs ) + def test_to_grayscale(self): + + fn_kwargs = meth_kwargs = {"num_output_channels": 1} + self._test_op("to_grayscale", "Grayscale", fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs) + + fn_kwargs = meth_kwargs = {"num_output_channels": 3} + self._test_op("to_grayscale", "Grayscale", fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs) + + meth_kwargs = {} + self._test_class_op("RandomGrayscale", meth_kwargs=meth_kwargs) + if __name__ == '__main__': unittest.main() diff --git a/torchvision/transforms/functional.py b/torchvision/transforms/functional.py index 81a601a8e20..af236ab87be 100644 --- a/torchvision/transforms/functional.py +++ b/torchvision/transforms/functional.py @@ -33,6 +33,13 @@ def _get_image_size(img: Tensor) -> List[int]: return F_pil._get_image_size(img) +def _get_image_num_channels(img: Tensor) -> int: + if isinstance(img, torch.Tensor): + return F_t._get_image_num_channels(img) + + return F_pil._get_image_num_channels(img) + + @torch.jit.unused def _is_numpy(img: Any) -> bool: return isinstance(img, np.ndarray) @@ -922,32 +929,25 @@ def affine(img, angle, translate, scale, shear, resample=0, fillcolor=None): return img.transform(output_size, Image.AFFINE, matrix, resample, **kwargs) -def to_grayscale(img, num_output_channels=1): +def to_grayscale(img: Tensor, num_output_channels: int = 1) -> Tensor: """Convert image to grayscale version of image. + The image can be a PIL Image or a Tensor, in which case it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions Args: - img (PIL Image): Image to be converted to grayscale. + img (PIL Image or Tensor): Image to be converted to grayscale. + num_output_channels (int): number of channels of the output image. Value can be 1 or 3. Default, 1. Returns: - PIL Image: Grayscale version of the image. + PIL Image or Tensor: Grayscale version of the image. if num_output_channels = 1 : returned image is single channel if num_output_channels = 3 : returned image is 3 channel with r = g = b """ - if not F_pil._is_pil_image(img): - raise TypeError('img should be PIL Image. 
Got {}'.format(type(img))) - - if num_output_channels == 1: - img = img.convert('L') - elif num_output_channels == 3: - img = img.convert('L') - np_img = np.array(img, dtype=np.uint8) - np_img = np.dstack([np_img, np_img, np_img]) - img = Image.fromarray(np_img, 'RGB') - else: - raise ValueError('num_output_channels should be either 1 or 3') + if not isinstance(img, torch.Tensor): + return F_pil.to_grayscale(img, num_output_channels) - return img + return F_t.to_grayscale(img, num_output_channels) def erase(img, i, j, h, w, v, inplace=False): diff --git a/torchvision/transforms/functional_pil.py b/torchvision/transforms/functional_pil.py index f1bcda113aa..941a4449c13 100644 --- a/torchvision/transforms/functional_pil.py +++ b/torchvision/transforms/functional_pil.py @@ -25,6 +25,13 @@ def _get_image_size(img: Any) -> List[int]: raise TypeError("Unexpected type {}".format(type(img))) +@torch.jit.unused +def _get_image_num_channels(img: Any) -> int: + if _is_pil_image(img): + return 1 if img.mode == 'L' else 3 + raise TypeError("Unexpected type {}".format(type(img))) + + @torch.jit.unused def hflip(img): """Horizontally flip the given PIL Image. @@ -286,3 +293,33 @@ def crop(img: Image.Image, top: int, left: int, height: int, width: int) -> Imag raise TypeError('img should be PIL Image. Got {}'.format(type(img))) return img.crop((left, top, left + width, top + height)) + + +@torch.jit.unused +def to_grayscale(img: Image.Image, num_output_channels: int = 1) -> Image.Image: + """Convert image to grayscale version of image. + + Args: + img (PIL Image): Image to be converted to grayscale. + num_output_channels (int): number of channels of the output image. Value can be 1 or 3. Default, 1. + + Returns: + PIL Image: Grayscale version of the image. + if num_output_channels = 1 : returned image is single channel + + if num_output_channels = 3 : returned image is 3 channel with r = g = b + """ + if not _is_pil_image(img): + raise TypeError('img should be PIL Image. Got {}'.format(type(img))) + + if num_output_channels == 1: + img = img.convert('L') + elif num_output_channels == 3: + img = img.convert('L') + np_img = np.array(img, dtype=np.uint8) + np_img = np.dstack([np_img, np_img, np_img]) + img = Image.fromarray(np_img, 'RGB') + else: + raise ValueError('num_output_channels should be either 1 or 3') + + return img diff --git a/torchvision/transforms/functional_tensor.py b/torchvision/transforms/functional_tensor.py index d4a8f340997..7b0ef8f79b2 100644 --- a/torchvision/transforms/functional_tensor.py +++ b/torchvision/transforms/functional_tensor.py @@ -13,6 +13,15 @@ def _get_image_size(img: Tensor) -> List[int]: raise TypeError("Unexpected type {}".format(type(img))) +def _get_image_num_channels(img: Tensor) -> int: + if img.ndim == 2: + return 1 + elif img.ndim > 2: + return img.shape[-3] + + raise TypeError("Unexpected type {}".format(type(img))) + + def vflip(img: Tensor) -> Tensor: """Vertically flip the given the Image Tensor. @@ -447,3 +456,39 @@ def pad(img: Tensor, padding: List[int], fill: int = 0, padding_mode: str = "con img = img.to(out_dtype) return img + + +def to_grayscale(img: Tensor, num_output_channels: int = 1) -> Tensor: + """Convert image to grayscale version of image. + + Args: + img (Tensor): Image to be converted to grayscale. We assume (..., 3, H, W) layout. + num_output_channels (int): number of channels of the output image. Value can be 1 or 3. Default, 1. + + Returns: + Tensor: Grayscale version of the image. 
+ if num_output_channels = 1 : returned image is single channel + + if num_output_channels = 3 : returned image is 3 channel with r = g = b + """ + if img.ndim < 3: + raise TypeError("Input image tensor should have at least 3 dimensions, but found {}".format(img.ndim)) + c = img.shape[-3] + if c != 3: + raise TypeError("Input image tensor should 3 channels, but found {}".format(c)) + + if num_output_channels not in (1, 3): + raise ValueError('num_output_channels should be either 1 or 3') + + # PIL grayscale L mode is L = R * 299/1000 + G * 587/1000 + B * 114/1000 + r = img[..., 0, :, :] + g = img[..., 1, :, :] + b = img[..., 2, :, :] + l_img = (0.299 * r + 0.587 * g + 0.114 * b + 0.5).to(img.dtype) + + if num_output_channels == 3: + l_img = torch.stack([l_img, l_img, l_img], dim=-3) + else: + l_img = l_img.unsqueeze(dim=-3) + + return l_img diff --git a/torchvision/transforms/transforms.py b/torchvision/transforms/transforms.py index a5214ed3174..d33bed02eb9 100644 --- a/torchvision/transforms/transforms.py +++ b/torchvision/transforms/transforms.py @@ -1279,8 +1279,11 @@ def __repr__(self): return s.format(name=self.__class__.__name__, **d) -class Grayscale(object): +class Grayscale(torch.nn.Module): """Convert image to grayscale. + The image can be a PIL Image or a Tensor, in which case it is expected + to have [..., 3, H, W] shape, where ... means an arbitrary number of leading + dimensions Args: num_output_channels (int): (1 or 3) number of channels desired for output image @@ -1293,15 +1296,16 @@ class Grayscale(object): """ def __init__(self, num_output_channels=1): + super().__init__() self.num_output_channels = num_output_channels - def __call__(self, img): + def forward(self, img: Tensor) -> Tensor: """ Args: - img (PIL Image): Image to be converted to grayscale. + img (PIL Image or Tensor): Image to be converted to grayscale. Returns: - PIL Image: Randomly grayscaled image. + PIL Image or Tensor: Grayscaled image. """ return F.to_grayscale(img, num_output_channels=self.num_output_channels) @@ -1309,14 +1313,17 @@ def __repr__(self): return self.__class__.__name__ + '(num_output_channels={0})'.format(self.num_output_channels) -class RandomGrayscale(object): +class RandomGrayscale(torch.nn.Module): """Randomly convert image to grayscale with a probability of p (default 0.1). + The image can be a PIL Image or a Tensor, in which case it is expected + to have [..., 3, H, W] shape, where ... means an arbitrary number of leading + dimensions Args: p (float): probability that image should be converted to grayscale. Returns: - PIL Image: Grayscale version of the input image with probability p and unchanged + PIL Image or Tensor: Grayscale version of the input image with probability p and unchanged with probability (1-p). - If input image is 1 channel: grayscale version is 1 channel - If input image is 3 channel: grayscale version is 3 channel with r == g == b @@ -1324,18 +1331,19 @@ class RandomGrayscale(object): """ def __init__(self, p=0.1): + super().__init__() self.p = p - def __call__(self, img): + def forward(self, img: Tensor) -> Tensor: """ Args: - img (PIL Image): Image to be converted to grayscale. + img (PIL Image or Tensor): Image to be converted to grayscale. Returns: - PIL Image: Randomly grayscaled image. + PIL Image or Tensor: Randomly grayscaled image. 
""" - num_output_channels = 1 if img.mode == 'L' else 3 - if random.random() < self.p: + num_output_channels = F._get_image_num_channels(img) + if torch.rand(1) < self.p: return F.to_grayscale(img, num_output_channels=num_output_channels) return img From 50da7be70743876e2cd7935c3b5ec8ebcf5b3a52 Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Fri, 14 Aug 2020 10:04:42 +0200 Subject: [PATCH 2/5] Unified inputs for grayscale op and transforms - deprecated F.to_grayscale in favor of F.rgb_to_grayscale --- test/test_functional_tensor.py | 32 ++++++---- test/test_transforms_tensor.py | 4 +- torchvision/transforms/functional.py | 35 ++++++++-- torchvision/transforms/functional_pil.py | 22 ++++++- torchvision/transforms/functional_tensor.py | 71 +++++++++------------ torchvision/transforms/transforms.py | 4 +- 6 files changed, 105 insertions(+), 63 deletions(-) diff --git a/test/test_functional_tensor.py b/test/test_functional_tensor.py index aab9d3d9b02..f2a3f3d7683 100644 --- a/test/test_functional_tensor.py +++ b/test/test_functional_tensor.py @@ -1,5 +1,4 @@ import unittest -import random import colorsys import math @@ -23,7 +22,10 @@ def _create_data(self, height=3, width=3, channels=3): return tensor, pil_img def compareTensorToPIL(self, tensor, pil_image, msg=None): - pil_tensor = torch.as_tensor(np.array(pil_image).transpose((2, 0, 1))) + np_pil_image = np.array(pil_image) + if np_pil_image.ndim == 2: + np_pil_image = np_pil_image[:, :, None] + pil_tensor = torch.as_tensor(np_pil_image.transpose((2, 0, 1))) if msg is None: msg = "tensor:\n{} \ndid not equal PIL tensor:\n{}".format(tensor, pil_tensor) self.assertTrue(tensor.equal(pil_tensor), msg) @@ -187,17 +189,21 @@ def test_adjustments(self): scripted_fn(img) def test_rgb_to_grayscale(self): - script_rgb_to_grayscale = torch.jit.script(F_t.rgb_to_grayscale) - img_tensor = torch.randint(0, 255, (3, 16, 16), dtype=torch.uint8) - img_tensor_clone = img_tensor.clone() - grayscale_tensor = F_t.rgb_to_grayscale(img_tensor).to(int) - grayscale_pil_img = torch.tensor(np.array(F.to_grayscale(F.to_pil_image(img_tensor)))).to(int) - max_diff = (grayscale_tensor - grayscale_pil_img).abs().max() - self.assertLess(max_diff, 1.0001) - self.assertTrue(torch.equal(img_tensor, img_tensor_clone)) - # scriptable function test - grayscale_script = script_rgb_to_grayscale(img_tensor).to(int) - self.assertTrue(torch.equal(grayscale_script, grayscale_tensor)) + script_rgb_to_grayscale = torch.jit.script(F.rgb_to_grayscale) + + img_tensor, pil_img = self._create_data(32, 34) + + for num_output_channels in (3, 1): + gray_pil_image = F.rgb_to_grayscale(pil_img, num_output_channels=num_output_channels) + gray_tensor = F.rgb_to_grayscale(img_tensor, num_output_channels=num_output_channels) + + if num_output_channels == 1: + print(gray_tensor.shape) + + self.compareTensorToPIL(gray_tensor, gray_pil_image) + + s_gray_tensor = script_rgb_to_grayscale(img_tensor, num_output_channels=num_output_channels) + self.assertTrue(s_gray_tensor.equal(gray_tensor)) def test_center_crop(self): script_center_crop = torch.jit.script(F.center_crop) diff --git a/test/test_transforms_tensor.py b/test/test_transforms_tensor.py index de73bb54079..aefa962d3f7 100644 --- a/test/test_transforms_tensor.py +++ b/test/test_transforms_tensor.py @@ -324,10 +324,10 @@ def test_random_perspective(self): def test_to_grayscale(self): fn_kwargs = meth_kwargs = {"num_output_channels": 1} - self._test_op("to_grayscale", "Grayscale", fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs) + 
self._test_op("rgb_to_grayscale", "Grayscale", fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs) fn_kwargs = meth_kwargs = {"num_output_channels": 3} - self._test_op("to_grayscale", "Grayscale", fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs) + self._test_op("rgb_to_grayscale", "Grayscale", fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs) meth_kwargs = {} self._test_class_op("RandomGrayscale", meth_kwargs=meth_kwargs) diff --git a/torchvision/transforms/functional.py b/torchvision/transforms/functional.py index 7baba2f9692..5765a8b81e8 100644 --- a/torchvision/transforms/functional.py +++ b/torchvision/transforms/functional.py @@ -959,12 +959,39 @@ def affine( def to_grayscale(img: Tensor, num_output_channels: int = 1) -> Tensor: - """Convert image to grayscale version of image. + """DEPRECATED. Convert RGB image to grayscale version of image. + The image can be a PIL Image or a Tensor, in which case it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions + + .. warning:: + + This method is deprecated and will be removed in future releases. + Please, use ``F.rgb_to_grayscale`` instead. + + + Args: + img (PIL Image or Tensor): RGB Image to be converted to grayscale. + num_output_channels (int): number of channels of the output image. Value can be 1 or 3. Default, 1. + + Returns: + PIL Image or Tensor: Grayscale version of the image. + if num_output_channels = 1 : returned image is single channel + + if num_output_channels = 3 : returned image is 3 channel with r = g = b + """ + warnings.warn("The use of the F.to_grayscale transform is deprecated, " + + "please use F.rgb_to_grayscale instead.") + + return rgb_to_grayscale(img, num_output_channels) + + +def rgb_to_grayscale(img: Tensor, num_output_channels: int = 1) -> Tensor: + """Convert RGB image to grayscale version of image. The image can be a PIL Image or a Tensor, in which case it is expected to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions Args: - img (PIL Image or Tensor): Image to be converted to grayscale. + img (PIL Image or Tensor): RGB Image to be converted to grayscale. num_output_channels (int): number of channels of the output image. Value can be 1 or 3. Default, 1. Returns: @@ -974,9 +1001,9 @@ def to_grayscale(img: Tensor, num_output_channels: int = 1) -> Tensor: if num_output_channels = 3 : returned image is 3 channel with r = g = b """ if not isinstance(img, torch.Tensor): - return F_pil.to_grayscale(img, num_output_channels) + return F_pil.rgb_to_grayscale(img, num_output_channels) - return F_t.to_grayscale(img, num_output_channels) + return F_t.rgb_to_grayscale(img, num_output_channels) def erase(img: Tensor, i: int, j: int, h: int, w: int, v: Tensor, inplace: bool = False) -> Tensor: diff --git a/torchvision/transforms/functional_pil.py b/torchvision/transforms/functional_pil.py index f6ba88030b0..0dc55ebdc8f 100644 --- a/torchvision/transforms/functional_pil.py +++ b/torchvision/transforms/functional_pil.py @@ -1,4 +1,5 @@ import numbers +import warnings from typing import Any, List, Sequence import numpy as np @@ -491,12 +492,31 @@ def perspective(img, perspective_coeffs, interpolation=Image.BICUBIC, fill=None) @torch.jit.unused def to_grayscale(img, num_output_channels): - """Convert image to grayscale version of image. + """DEPRECATED. Convert RGB image to grayscale version of image. Args: img (PIL Image): Image to be converted to grayscale. num_output_channels (int): number of channels of the output image. Value can be 1 or 3. Default, 1. 
+ Returns: + PIL Image: Grayscale version of the image. + if num_output_channels = 1 : returned image is single channel + + if num_output_channels = 3 : returned image is 3 channel with r = g = b + """ + warnings.warn("The use of the F_pil.to_grayscale transform is deprecated, " + + "please use F.rgb_to_grayscale instead.") + return rgb_to_grayscale(img, num_output_channels) + + +@torch.jit.unused +def rgb_to_grayscale(img, num_output_channels): + """Convert RGB image to grayscale version of image. + + Args: + img (PIL Image): RGB Image to be converted to grayscale. + num_output_channels (int): number of channels of the output image. Value can be 1 or 3. Default, 1. + Returns: PIL Image: Grayscale version of the image. if num_output_channels = 1 : returned image is single channel diff --git a/torchvision/transforms/functional_tensor.py b/torchvision/transforms/functional_tensor.py index c89e9b36f45..5ee3fb3cd89 100644 --- a/torchvision/transforms/functional_tensor.py +++ b/torchvision/transforms/functional_tensor.py @@ -76,22 +76,47 @@ def crop(img: Tensor, top: int, left: int, height: int, width: int) -> Tensor: return img[..., top:top + height, left:left + width] -def rgb_to_grayscale(img: Tensor) -> Tensor: +def rgb_to_grayscale(img: Tensor, num_output_channels: int = 1) -> Tensor: """Convert the given RGB Image Tensor to Grayscale. For RGB to Grayscale conversion, ITU-R 601-2 luma transform is performed which is L = R * 0.2989 + G * 0.5870 + B * 0.1140 Args: img (Tensor): Image to be converted to Grayscale in the form [C, H, W]. + num_output_channels (int): number of channels of the output image. Value can be 1 or 3. Default, 1. Returns: - Tensor: Grayscale image. + Tensor: Grayscale version of the image. + if num_output_channels = 1 : returned image is single channel + + if num_output_channels = 3 : returned image is 3 channel with r = g = b """ - if img.shape[0] != 3: - raise TypeError('Input Image does not contain 3 Channels') + if img.ndim < 3: + raise TypeError("Input image tensor should have at least 3 dimensions, but found {}".format(img.ndim)) + c = img.shape[-3] + if c != 3: + raise TypeError("Input image tensor should 3 channels, but found {}".format(c)) - return (0.2989 * img[0] + 0.5870 * img[1] + 0.1140 * img[2]).to(img.dtype) + if num_output_channels not in (1, 3): + raise ValueError('num_output_channels should be either 1 or 3') + + r = img[..., 0, :, :].float() + g = img[..., 1, :, :].float() + b = img[..., 2, :, :].float() + # According to PIL docs: PIL grayscale L mode is L = R * 299/1000 + G * 587/1000 + B * 114/1000 + # but implementation is slightly different: + # https://github.com/python-pillow/Pillow/blob/4634eafe3c695a014267eefdce830b4a825beed7/ + # src/libImaging/Convert.c#L47 + # ((rgb)[0]*19595 + (rgb)[1]*38470 + (rgb)[2]*7471 + 0x8000) >> 16 + l_img = torch.floor((19595 * r + 38470 * g + 7471 * b + 2 ** 15) / 2 ** 16).to(img.dtype) + + if num_output_channels == 3: + l_img = torch.stack([l_img, l_img, l_img], dim=-3) + else: + l_img = l_img.unsqueeze(dim=-3) + + return l_img def adjust_brightness(img: Tensor, brightness_factor: float) -> Tensor: @@ -893,39 +918,3 @@ def perspective( mode = _interpolation_modes[interpolation] return _apply_grid_transform(img, grid, mode) - - -def to_grayscale(img: Tensor, num_output_channels: int = 1) -> Tensor: - """Convert image to grayscale version of image. - - Args: - img (Tensor): Image to be converted to grayscale. We assume (..., 3, H, W) layout. - num_output_channels (int): number of channels of the output image. 
Value can be 1 or 3. Default, 1. - - Returns: - Tensor: Grayscale version of the image. - if num_output_channels = 1 : returned image is single channel - - if num_output_channels = 3 : returned image is 3 channel with r = g = b - """ - if img.ndim < 3: - raise TypeError("Input image tensor should have at least 3 dimensions, but found {}".format(img.ndim)) - c = img.shape[-3] - if c != 3: - raise TypeError("Input image tensor should 3 channels, but found {}".format(c)) - - if num_output_channels not in (1, 3): - raise ValueError('num_output_channels should be either 1 or 3') - - # PIL grayscale L mode is L = R * 299/1000 + G * 587/1000 + B * 114/1000 - r = img[..., 0, :, :] - g = img[..., 1, :, :] - b = img[..., 2, :, :] - l_img = (0.299 * r + 0.587 * g + 0.114 * b + 0.5).to(img.dtype) - - if num_output_channels == 3: - l_img = torch.stack([l_img, l_img, l_img], dim=-3) - else: - l_img = l_img.unsqueeze(dim=-3) - - return l_img diff --git a/torchvision/transforms/transforms.py b/torchvision/transforms/transforms.py index a5c0c6daa50..b995101c3c7 100644 --- a/torchvision/transforms/transforms.py +++ b/torchvision/transforms/transforms.py @@ -1382,7 +1382,7 @@ def forward(self, img: Tensor) -> Tensor: Returns: PIL Image or Tensor: Grayscaled image. """ - return F.to_grayscale(img, num_output_channels=self.num_output_channels) + return F.rgb_to_grayscale(img, num_output_channels=self.num_output_channels) def __repr__(self): return self.__class__.__name__ + '(num_output_channels={0})'.format(self.num_output_channels) @@ -1419,7 +1419,7 @@ def forward(self, img: Tensor) -> Tensor: """ num_output_channels = F._get_image_num_channels(img) if torch.rand(1) < self.p: - return F.to_grayscale(img, num_output_channels=num_output_channels) + return F.rgb_to_grayscale(img, num_output_channels=num_output_channels) return img def __repr__(self): From 1078e1dbf5c0c21a391e8b493e69818c88dd804c Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Fri, 14 Aug 2020 18:39:20 +0200 Subject: [PATCH 3/5] Fixes bug with fp input --- torchvision/transforms/functional_tensor.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/torchvision/transforms/functional_tensor.py b/torchvision/transforms/functional_tensor.py index 5ee3fb3cd89..53e66c23591 100644 --- a/torchvision/transforms/functional_tensor.py +++ b/torchvision/transforms/functional_tensor.py @@ -104,12 +104,16 @@ def rgb_to_grayscale(img: Tensor, num_output_channels: int = 1) -> Tensor: r = img[..., 0, :, :].float() g = img[..., 1, :, :].float() b = img[..., 2, :, :].float() - # According to PIL docs: PIL grayscale L mode is L = R * 299/1000 + G * 587/1000 + B * 114/1000 - # but implementation is slightly different: - # https://github.com/python-pillow/Pillow/blob/4634eafe3c695a014267eefdce830b4a825beed7/ - # src/libImaging/Convert.c#L47 - # ((rgb)[0]*19595 + (rgb)[1]*38470 + (rgb)[2]*7471 + 0x8000) >> 16 - l_img = torch.floor((19595 * r + 38470 * g + 7471 * b + 2 ** 15) / 2 ** 16).to(img.dtype) + if not img.is_floating_point(): + # According to PIL docs: PIL grayscale L mode is L = R * 299/1000 + G * 587/1000 + B * 114/1000 + # but implementation is slightly different: + # https://github.com/python-pillow/Pillow/blob/4634eafe3c695a014267eefdce830b4a825beed7/ + # src/libImaging/Convert.c#L47 + # ((rgb)[0]*19595 + (rgb)[1]*38470 + (rgb)[2]*7471 + 0x8000) >> 16 + # l_img = ((19595 * r + 38470 * g + 7471 * b + 2 ** 15) / 2 ** 16).to(img.dtype) + l_img = torch.floor((19595 * r + 38470 * g + 7471 * b + 2 ** 15) / 2 ** 16).to(img.dtype) 
+ else: + l_img = (0.299 * r + 0.587 * g + 0.114 * b).to(img.dtype) if num_output_channels == 3: l_img = torch.stack([l_img, l_img, l_img], dim=-3) @@ -407,8 +411,8 @@ def ten_crop(img: Tensor, size: BroadcastingList2[int], vertical_flip: bool = Fa def _blend(img1: Tensor, img2: Tensor, ratio: float) -> Tensor: - bound = 1 if img1.dtype in [torch.half, torch.float32, torch.float64] else 255 - return (ratio * img1 + (1 - ratio) * img2).clamp(0, bound).to(img1.dtype) + bound = 1.0 if img1.is_floating_point() else 255.0 + return (ratio * img1 + (1.0 - ratio) * img2).clamp(0, bound).to(img1.dtype) def _rgb2hsv(img): From f228dc88535e91bf9b67c57bc461f5be7ef2a770 Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Tue, 25 Aug 2020 22:39:05 +0200 Subject: [PATCH 4/5] [WIP] Updated code according to review --- test/test_functional_tensor.py | 26 ++++++++++++------ test/test_transforms_tensor.py | 8 +++--- torchvision/transforms/functional.py | 29 +++++++++------------ torchvision/transforms/functional_pil.py | 21 +-------------- torchvision/transforms/functional_tensor.py | 22 +++++----------- 5 files changed, 42 insertions(+), 64 deletions(-) diff --git a/test/test_functional_tensor.py b/test/test_functional_tensor.py index 10b0571af94..8c7c7c74313 100644 --- a/test/test_functional_tensor.py +++ b/test/test_functional_tensor.py @@ -30,12 +30,15 @@ def compareTensorToPIL(self, tensor, pil_image, msg=None): msg = "tensor:\n{} \ndid not equal PIL tensor:\n{}".format(tensor, pil_tensor) self.assertTrue(tensor.cpu().equal(pil_tensor), msg) - def approxEqualTensorToPIL(self, tensor, pil_image, tol=1e-5, msg=None): - pil_tensor = torch.as_tensor(np.array(pil_image).transpose((2, 0, 1))).to(tensor) - mae = torch.abs(tensor - pil_tensor).mean().item() + def approxEqualTensorToPIL(self, tensor, pil_image, tol=1e-5, msg=None, method="mean"): + np_pil_image = np.array(pil_image) + if np_pil_image.ndim == 2: + np_pil_image = np_pil_image[:, :, None] + pil_tensor = torch.as_tensor(np_pil_image.transpose((2, 0, 1))).to(tensor) + err = getattr(torch, method)(torch.abs(tensor - pil_tensor)).item() self.assertTrue( - mae < tol, - msg="{}: mae={}, tol={}: \n{}\nvs\n{}".format(msg, mae, tol, tensor[0, :10, :10], pil_tensor[0, :10, :10]) + err < tol, + msg="{}: err={}, tol={}: \n{}\nvs\n{}".format(msg, err, tol, tensor[0, :10, :10], pil_tensor[0, :10, :10]) ) def _test_vflip(self, device): @@ -216,10 +219,10 @@ def test_adjustments(self): def test_adjustments_cuda(self): self._test_adjustments("cuda") - def test_rgb_to_grayscale(self): + def _test_rgb_to_grayscale(self, device): script_rgb_to_grayscale = torch.jit.script(F.rgb_to_grayscale) - img_tensor, pil_img = self._create_data(32, 34) + img_tensor, pil_img = self._create_data(32, 34, device=device) for num_output_channels in (3, 1): gray_pil_image = F.rgb_to_grayscale(pil_img, num_output_channels=num_output_channels) @@ -228,11 +231,18 @@ def test_rgb_to_grayscale(self): if num_output_channels == 1: print(gray_tensor.shape) - self.compareTensorToPIL(gray_tensor, gray_pil_image) + self.approxEqualTensorToPIL(gray_tensor.float(), gray_pil_image, tol=1.0 + 1e-10, method="max") s_gray_tensor = script_rgb_to_grayscale(img_tensor, num_output_channels=num_output_channels) self.assertTrue(s_gray_tensor.equal(gray_tensor)) + def test_rgb_to_grayscale(self): + self._test_rgb_to_grayscale("cpu") + + @unittest.skipIf(not torch.cuda.is_available(), reason="Skip if no CUDA device") + def test_rgb_to_grayscale_cuda(self): + self._test_rgb_to_grayscale("cuda") + def 
_test_center_crop(self, device): script_center_crop = torch.jit.script(F.center_crop) diff --git a/test/test_transforms_tensor.py b/test/test_transforms_tensor.py index aefa962d3f7..dfda584390b 100644 --- a/test/test_transforms_tensor.py +++ b/test/test_transforms_tensor.py @@ -323,11 +323,11 @@ def test_random_perspective(self): def test_to_grayscale(self): - fn_kwargs = meth_kwargs = {"num_output_channels": 1} - self._test_op("rgb_to_grayscale", "Grayscale", fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs) + meth_kwargs = {"num_output_channels": 1} + self._test_class_op("Grayscale", meth_kwargs=meth_kwargs) - fn_kwargs = meth_kwargs = {"num_output_channels": 3} - self._test_op("rgb_to_grayscale", "Grayscale", fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs) + meth_kwargs = {"num_output_channels": 3} + self._test_class_op("Grayscale", meth_kwargs=meth_kwargs) meth_kwargs = {} self._test_class_op("RandomGrayscale", meth_kwargs=meth_kwargs) diff --git a/torchvision/transforms/functional.py b/torchvision/transforms/functional.py index 6bc04eb2ce6..4a36e0b05e6 100644 --- a/torchvision/transforms/functional.py +++ b/torchvision/transforms/functional.py @@ -958,31 +958,24 @@ def affine( return F_t.affine(img, matrix=matrix, resample=resample, fillcolor=fillcolor) -def to_grayscale(img: Tensor, num_output_channels: int = 1) -> Tensor: - """DEPRECATED. Convert RGB image to grayscale version of image. - The image can be a PIL Image or a Tensor, in which case it is expected - to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions - - .. warning:: - - This method is deprecated and will be removed in future releases. - Please, use ``F.rgb_to_grayscale`` instead. - +@torch.jit.unused +def to_grayscale(img, num_output_channels=1): + """Convert PIL image of any mode (RGB, HSV, LAB, etc) to grayscale version of image. Args: - img (PIL Image or Tensor): RGB Image to be converted to grayscale. + img (PIL Image): PIL Image to be converted to grayscale. num_output_channels (int): number of channels of the output image. Value can be 1 or 3. Default, 1. Returns: - PIL Image or Tensor: Grayscale version of the image. + PIL Image: Grayscale version of the image. if num_output_channels = 1 : returned image is single channel if num_output_channels = 3 : returned image is 3 channel with r = g = b """ - warnings.warn("The use of the F.to_grayscale transform is deprecated, " + - "please use F.rgb_to_grayscale instead.") + if isinstance(img, Image.Image): + return F_pil.to_grayscale(img, num_output_channels) - return rgb_to_grayscale(img, num_output_channels) + raise TypeError("Input should be PIL Image") def rgb_to_grayscale(img: Tensor, num_output_channels: int = 1) -> Tensor: @@ -990,6 +983,10 @@ def rgb_to_grayscale(img: Tensor, num_output_channels: int = 1) -> Tensor: The image can be a PIL Image or a Tensor, in which case it is expected to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions + Note: + Please, note that this method supports only RGB images as input. For inputs in other color spaces, + please, consider using meth:`~torchvision.transforms.functional.to_grayscale` with PIL Image. + Args: img (PIL Image or Tensor): RGB Image to be converted to grayscale. num_output_channels (int): number of channels of the output image. Value can be 1 or 3. Default, 1. 
@@ -1001,7 +998,7 @@ def rgb_to_grayscale(img: Tensor, num_output_channels: int = 1) -> Tensor: if num_output_channels = 3 : returned image is 3 channel with r = g = b """ if not isinstance(img, torch.Tensor): - return F_pil.rgb_to_grayscale(img, num_output_channels) + return F_pil.to_grayscale(img, num_output_channels) return F_t.rgb_to_grayscale(img, num_output_channels) diff --git a/torchvision/transforms/functional_pil.py b/torchvision/transforms/functional_pil.py index 0dc55ebdc8f..0f5166b3e01 100644 --- a/torchvision/transforms/functional_pil.py +++ b/torchvision/transforms/functional_pil.py @@ -492,31 +492,12 @@ def perspective(img, perspective_coeffs, interpolation=Image.BICUBIC, fill=None) @torch.jit.unused def to_grayscale(img, num_output_channels): - """DEPRECATED. Convert RGB image to grayscale version of image. + """Convert PIL image of any mode (RGB, HSV, LAB, etc) to grayscale version of image. Args: img (PIL Image): Image to be converted to grayscale. num_output_channels (int): number of channels of the output image. Value can be 1 or 3. Default, 1. - Returns: - PIL Image: Grayscale version of the image. - if num_output_channels = 1 : returned image is single channel - - if num_output_channels = 3 : returned image is 3 channel with r = g = b - """ - warnings.warn("The use of the F_pil.to_grayscale transform is deprecated, " + - "please use F.rgb_to_grayscale instead.") - return rgb_to_grayscale(img, num_output_channels) - - -@torch.jit.unused -def rgb_to_grayscale(img, num_output_channels): - """Convert RGB image to grayscale version of image. - - Args: - img (PIL Image): RGB Image to be converted to grayscale. - num_output_channels (int): number of channels of the output image. Value can be 1 or 3. Default, 1. - Returns: PIL Image: Grayscale version of the image. 
if num_output_channels = 1 : returned image is single channel diff --git a/torchvision/transforms/functional_tensor.py b/torchvision/transforms/functional_tensor.py index 080b7162320..6b581abd8d9 100644 --- a/torchvision/transforms/functional_tensor.py +++ b/torchvision/transforms/functional_tensor.py @@ -101,24 +101,14 @@ def rgb_to_grayscale(img: Tensor, num_output_channels: int = 1) -> Tensor: if num_output_channels not in (1, 3): raise ValueError('num_output_channels should be either 1 or 3') - r = img[..., 0, :, :].float() - g = img[..., 1, :, :].float() - b = img[..., 2, :, :].float() - if not img.is_floating_point(): - # According to PIL docs: PIL grayscale L mode is L = R * 299/1000 + G * 587/1000 + B * 114/1000 - # but implementation is slightly different: - # https://github.com/python-pillow/Pillow/blob/4634eafe3c695a014267eefdce830b4a825beed7/ - # src/libImaging/Convert.c#L47 - # ((rgb)[0]*19595 + (rgb)[1]*38470 + (rgb)[2]*7471 + 0x8000) >> 16 - # l_img = ((19595 * r + 38470 * g + 7471 * b + 2 ** 15) / 2 ** 16).to(img.dtype) - l_img = torch.floor((19595 * r + 38470 * g + 7471 * b + 2 ** 15) / 2 ** 16).to(img.dtype) - else: - l_img = (0.299 * r + 0.587 * g + 0.114 * b).to(img.dtype) + r, g, b = img.unbind(dim=-3) + # This implementation closely follows the TF one: + # https://github.com/tensorflow/tensorflow/blob/v2.3.0/tensorflow/python/ops/image_ops_impl.py#L2105-L2138 + l_img = (0.2989 * r + 0.587 * g + 0.114 * b).to(img.dtype) + l_img = l_img.unsqueeze(dim=-3) if num_output_channels == 3: - l_img = torch.stack([l_img, l_img, l_img], dim=-3) - else: - l_img = l_img.unsqueeze(dim=-3) + return l_img.expand(img.shape) return l_img From c9a76fa42384d26013f4dd540d93d8df67a49707 Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Wed, 26 Aug 2020 22:55:59 +0200 Subject: [PATCH 5/5] Removed unused import --- torchvision/transforms/functional_pil.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchvision/transforms/functional_pil.py b/torchvision/transforms/functional_pil.py index 0f5166b3e01..ba620ab9d9c 100644 --- a/torchvision/transforms/functional_pil.py +++ b/torchvision/transforms/functional_pil.py @@ -1,5 +1,4 @@ import numbers -import warnings from typing import Any, List, Sequence import numpy as np
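
A minimal usage sketch of the unified grayscale API these commits converge on. It is illustrative only and not part of the diffs above; it assumes a torchvision build that already includes these changes, and the concrete image sizes and values are placeholders.

import torch
from PIL import Image
from torchvision import transforms
from torchvision.transforms import functional as F

img_tensor = torch.randint(0, 256, (3, 32, 32), dtype=torch.uint8)  # CHW uint8 image
pil_img = Image.new("RGB", (32, 32))

# F.rgb_to_grayscale accepts both PIL Images and tensors after these patches
# (tensors may carry extra leading batch dimensions); num_output_channels is 1 or 3.
gray_t = F.rgb_to_grayscale(img_tensor, num_output_channels=1)  # tensor of shape (1, 32, 32)
gray_p = F.rgb_to_grayscale(pil_img, num_output_channels=3)     # PIL Image with r == g == b

# Grayscale and RandomGrayscale are torch.nn.Module subclasses here,
# so they work on tensor inputs and can be scripted.
op = transforms.Grayscale(num_output_channels=3)
scripted_op = torch.jit.script(op)
assert scripted_op(img_tensor).shape == (3, 32, 32)

# RandomGrayscale keeps the input's channel count and draws its coin flip with torch.rand.
random_gray = transforms.RandomGrayscale(p=0.5)
out = random_gray(img_tensor)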