diff --git a/test/prototype/test_bitpacking.py b/test/prototype/test_bitpacking.py
new file mode 100644
index 000000000..c1b60e07f
--- /dev/null
+++ b/test/prototype/test_bitpacking.py
@@ -0,0 +1,70 @@
+import torch
+from torchao.prototype.common.bitpacking import pack, unpack
+import pytest
+from torch.utils._triton import has_triton
+from torchao.quantization.utils import TORCH_VERSION_AFTER_2_4
+
+if not TORCH_VERSION_AFTER_2_4:
+    pytest.skip("Unsupported PyTorch version", allow_module_level=True)
+
+def test_uint4_to_uint8_CPU():
+    test_tensor = torch.randint(0, 15, (4, 4), dtype=torch.uint8)
+    packed = pack(test_tensor, 8, 4, device='cpu')
+    unpacked = unpack(packed, 4, device='cpu')
+    unpadded = unpacked[:test_tensor.shape[0], ...]
+    assert(unpadded.allclose(test_tensor))
+
+def test_uint3_to_int16_col_wise_cpu():
+    test_tensor = torch.randint(0, 7, (8, 5), dtype=torch.int16)
+    packed = pack(test_tensor, 16, 3, False, device='cpu')
+    unpacked = unpack(packed, 3, False, device='cpu')
+    unpadded = unpacked[:test_tensor.shape[0], ...]
+    assert(unpadded.allclose(test_tensor))
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_uint4_to_uint8():
+    test_tensor = torch.randint(0, 15, (4, 4), dtype=torch.uint8).cuda()
+    packed = pack(test_tensor, 8, 4)
+    unpacked = unpack(packed, 4)
+    unpadded = unpacked[:test_tensor.shape[0], ...]
+    assert(unpadded.allclose(test_tensor))
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
+def test_uint4_to_uint8_compile():
+    torch._dynamo.config.specialize_int = True
+    pack_compiled = torch.compile(pack, fullgraph=True)
+    unpack_compiled = torch.compile(unpack, fullgraph=True)
+    test_tensor = torch.randint(0, 15, (3, 4), dtype=torch.uint8).cuda()
+    packed = pack_compiled(test_tensor, 8, 4)
+    unpacked = unpack_compiled(packed, 4)
+    unpadded = unpacked[:test_tensor.shape[0], ...]
+    assert(unpadded.allclose(test_tensor))
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_uint3_to_int16():
+    test_tensor = torch.randint(0, 7, (5, 8), dtype=torch.int16).cuda()
+    packed = pack(test_tensor, 16, 3)
+    unpacked = unpack(packed, 3)
+    unpadded = unpacked[:test_tensor.shape[0], ...]
+    assert(unpadded.allclose(test_tensor))
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
+def test_uint2_to_uint8_col_wise_compile():
+    torch._dynamo.config.specialize_int = True
+    pack_compiled = torch.compile(pack, fullgraph=True)
+    unpack_compiled = torch.compile(unpack, fullgraph=True)
+    test_tensor = torch.randint(0, 3, (8, 8), dtype=torch.uint8).cuda()
+    packed = pack_compiled(test_tensor, 8, 2, False)
+    unpacked = unpack_compiled(packed, 2, False)
+    unpadded = unpacked[:test_tensor.shape[0], ...]
+    assert(unpadded.allclose(test_tensor))
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_uint3_to_int16_col_wise():
+    test_tensor = torch.randint(0, 7, (8, 5), dtype=torch.int16).cuda()
+    packed = pack(test_tensor, 16, 3, False)
+    unpacked = unpack(packed, 3, False)
+    unpadded = unpacked[:test_tensor.shape[0], ...]
+    assert(unpadded.allclose(test_tensor))
\ No newline at end of file
diff --git a/torchao/prototype/common/bitpacking.py b/torchao/prototype/common/bitpacking.py
new file mode 100644
index 000000000..35e471c34
--- /dev/null
+++ b/torchao/prototype/common/bitpacking.py
@@ -0,0 +1,101 @@
+import torch
+from functools import reduce
+
+
+
+def unpack(data, data_size, by_rows=True, device="cuda"):
+    """
+    Unpacks small dtype elements from a larger dtype.
+
+    Inputs:
+    data: torch.Tensor - a tensor of packed elements of a small dtype within a larger dtype.
+    data_size: int - the size of the small dtype in bits.
+
+    optional:
+    by_rows: bool - specifies whether to unpack...
+        by rows: tensor(n, m) -> tensor(n*scale, m)
+        or by columns: tensor(n, m) -> tensor(n, m*scale)
+
+        Defaults to rows because quantization is typically done by rows,
+        but choose the version that matches how you quantize, as this improves memory accesses/performance.
+
+    Returns: torch.Tensor - a tensor of the unpacked elements.
+    """
+    if by_rows:
+        return _unpack_by_rows(data, data_size, device)
+    else:
+        return _unpack_by_cols(data, data_size)
+
+def pack(data, container_size, data_size, by_rows=True, device="cuda"):
+    """
+    Packs small dtype elements into a larger dtype.
+    Pads rows (or columns, when by_rows is False) to be divisible by the scale.
+
+    Inputs:
+    data: torch.Tensor - a tensor of unpacked elements of a small dtype.
+    container_size: int - the size of the large dtype in bits.
+    data_size: int - the size of the small dtype in bits.
+
+    optional:
+    by_rows: bool - specifies whether to pack values...
+        by rows: tensor(n, m) -> tensor(n//scale, m)
+        or by columns: tensor(n, m) -> tensor(n, m//scale)
+
+        Defaults to rows because quantization is typically done by rows,
+        but choose the version that matches how you quantize, as this improves memory accesses/performance.
+
+    Returns: torch.Tensor - a tensor of packed elements.
+ """ + if by_rows: + return _pack_by_rows(data, container_size, data_size, device) + else: + return _pack_by_cols(data, container_size, data_size, device) + +def _unpack_by_rows(data, data_size, device) -> torch.Tensor: + shape = data.shape + scale = data.element_size() * 8 // data_size + + unpacked_data = torch.zeros((shape[0]*scale, *shape[1:]), dtype=data.dtype).to(device) + nbits = (1 << data_size) - 1 # mask for the last dtype_size bits + for i in range(scale): + shift_amt = data.element_size() * 8 - data_size * (i + 1) # how much to shift to get the ith uint + unpacked_data[i::scale] = ((data >> shift_amt) & (nbits)) + return unpacked_data + +def _unpack_by_cols(data, data_size) -> torch.Tensor: + shape = data.shape + scale = data.element_size() * 8 // data_size + unpacked_data = [] + nbits = (1 << data_size) - 1 # mask for the last dtype_size bits + for i in range(scale): + shift_amt = data.element_size() * 8 - data_size * (i + 1) # how much to shift to get the ith uint + unpacked_data.append(((data >> shift_amt) & (nbits)).to(data.dtype)) + return torch.stack(unpacked_data,dim=-1).view(*shape[:-1],shape[-1]*scale) # stack the unpacked data and reshape to the original shape + +def _pack_by_rows(data, container_size, data_size, device) -> torch.Tensor: + + scale = container_size // data_size + assert scale > 1, f"container_size ({container_size}) is not larger than data_size ({data_size})" + assert data.shape[0] >= scale, f"not enough values to pack, data.shape[0] ({data.shape[0]}) < scale ({scale})" + # pad the data to be divisible by scale + if data.shape[0] % scale != 0: + padding = torch.zeros((scale - data.shape[0] % scale, *data.shape[1:],), dtype=data.dtype).to(device) + data = torch.cat([data, padding], dim=0).cuda() + + shape = data.shape + ret = reduce(lambda x,y: x|y,[data[i::scale, ...] << container_size-data_size*(i+1) for i in range(scale)]) + return ret.view(shape[0] // scale, *shape[1:]).to(device) + +def _pack_by_cols(data, container_size, data_size, device) -> torch.Tensor: + scale = container_size // data_size + assert scale > 1, f"container_size ({container_size}) not double the capacity ofdata_size ({data_size})" + # pad the data to be divisible by scale + if data.shape[-1] % scale != 0: + padding = torch.zeros((*data.shape[:-1], scale - data.shape[-1] % scale), dtype=data.dtype).to(device) + data = torch.cat([data, padding], dim=-1).cuda() + + shape = data.shape + data = data.contiguous().view(-1) + #shift the data to the different indexes within the larger dtype and then union them together + ret = reduce(lambda x,y: x|y,[data[i::scale] << container_size-data_size*(i+1) for i in range(scale)]) + return ret.view(*shape[:-1],shape[-1] // scale).to(device) \ No newline at end of file