pola-rs · ritchie46 · Feb 4, 2024 · Feb 3, 2024 · Feb 3, 2024 · Feb 3, 2024
@@ -74,6 +74,10 @@ avro = ["arrow/io_avro", "arrow/io_avro_compression"]
 csv = ["atoi_simd", "polars-core/rows", "itoa", "ryu", "fast-float", "simdutf8"]
 decompress = ["flate2/rust_backend", "zstd"]
 decompress-fast = ["flate2/zlib-ng", "zstd"]
+dtype-u8 = ["polars-core/dtype-u8"]
+dtype-u16 = ["polars-core/dtype-u16"]
+dtype-i8 = ["polars-core/dtype-i8"]
+dtype-i16 = ["polars-core/dtype-i16"]
 dtype-categorical = ["polars-core/dtype-categorical"]
 dtype-date = ["polars-core/dtype-date", "polars-time/dtype-date"]
 object = []

@@ -29,6 +29,20 @@ impl PrimitiveParser for Float64Type {
     }
 }
 
+#[cfg(feature = "dtype-u8")]
+impl PrimitiveParser for UInt8Type {
+    /// Parses `bytes` as a `u8` via `atoi_simd::parse_skipped`; a parse error yields `None`.
+    #[inline]
+    fn parse(bytes: &[u8]) -> Option<u8> {
+        let parsed: Result<u8, _> = atoi_simd::parse_skipped(bytes);
+        parsed.ok()
+    }
+}
+#[cfg(feature = "dtype-u16")]
+impl PrimitiveParser for UInt16Type {
+    /// Parses `bytes` as a `u16` via `atoi_simd::parse_skipped`; a parse error yields `None`.
+    #[inline]
+    fn parse(bytes: &[u8]) -> Option<u16> {
+        let parsed: Result<u16, _> = atoi_simd::parse_skipped(bytes);
+        parsed.ok()
+    }
+}
 impl PrimitiveParser for UInt32Type {
     #[inline]
     fn parse(bytes: &[u8]) -> Option<u32> {
@@ -41,6 +55,20 @@ impl PrimitiveParser for UInt64Type {
         atoi_simd::parse_skipped(bytes).ok()
     }
 }
+#[cfg(feature = "dtype-i8")]
+impl PrimitiveParser for Int8Type {
+    /// Parses `bytes` as an `i8` via `atoi_simd::parse_skipped`; a parse error yields `None`.
+    #[inline]
+    fn parse(bytes: &[u8]) -> Option<i8> {
+        let parsed: Result<i8, _> = atoi_simd::parse_skipped(bytes);
+        parsed.ok()
+    }
+}
+#[cfg(feature = "dtype-i16")]
+impl PrimitiveParser for Int16Type {
+    /// Parses `bytes` as an `i16` via `atoi_simd::parse_skipped`; a parse error yields `None`.
+    #[inline]
+    fn parse(bytes: &[u8]) -> Option<i16> {
+        let parsed: Result<i16, _> = atoi_simd::parse_skipped(bytes);
+        parsed.ok()
+    }
+}
 impl PrimitiveParser for Int32Type {
     #[inline]
     fn parse(bytes: &[u8]) -> Option<i32> {
@@ -457,8 +485,16 @@ pub(crate) fn init_buffers(
             let (name, dtype) = schema.get_at_index(i).unwrap();
             let builder = match dtype {
                 &DataType::Boolean => Buffer::Boolean(BooleanChunkedBuilder::new(name, capacity)),
+                #[cfg(feature = "dtype-i8")]
+                &DataType::Int8 => Buffer::Int8(PrimitiveChunkedBuilder::new(name, capacity)),
+                #[cfg(feature = "dtype-i16")]
+                &DataType::Int16 => Buffer::Int16(PrimitiveChunkedBuilder::new(name, capacity)),
                 &DataType::Int32 => Buffer::Int32(PrimitiveChunkedBuilder::new(name, capacity)),
                 &DataType::Int64 => Buffer::Int64(PrimitiveChunkedBuilder::new(name, capacity)),
+                #[cfg(feature = "dtype-u8")]
+                &DataType::UInt8 => Buffer::UInt8(PrimitiveChunkedBuilder::new(name, capacity)),
+                #[cfg(feature = "dtype-u16")]
+                &DataType::UInt16 => Buffer::UInt16(PrimitiveChunkedBuilder::new(name, capacity)),
                 &DataType::UInt32 => Buffer::UInt32(PrimitiveChunkedBuilder::new(name, capacity)),
                 &DataType::UInt64 => Buffer::UInt64(PrimitiveChunkedBuilder::new(name, capacity)),
                 &DataType::Float32 => Buffer::Float32(PrimitiveChunkedBuilder::new(name, capacity)),
@@ -491,8 +527,16 @@ pub(crate) fn init_buffers(
 #[allow(clippy::large_enum_variant)]
 pub(crate) enum Buffer {
     Boolean(BooleanChunkedBuilder),
+    #[cfg(feature = "dtype-i8")]
+    Int8(PrimitiveChunkedBuilder<Int8Type>),
+    #[cfg(feature = "dtype-i16")]
+    Int16(PrimitiveChunkedBuilder<Int16Type>),
     Int32(PrimitiveChunkedBuilder<Int32Type>),
     Int64(PrimitiveChunkedBuilder<Int64Type>),
+    #[cfg(feature = "dtype-u8")]
+    UInt8(PrimitiveChunkedBuilder<UInt8Type>),
+    #[cfg(feature = "dtype-u16")]
+    UInt16(PrimitiveChunkedBuilder<UInt16Type>),
     UInt32(PrimitiveChunkedBuilder<UInt32Type>),
     UInt64(PrimitiveChunkedBuilder<UInt64Type>),
     Float32(PrimitiveChunkedBuilder<Float32Type>),
@@ -515,8 +559,16 @@ impl Buffer {
     pub(crate) fn into_series(self) -> PolarsResult<Series> {
         let s = match self {
             Buffer::Boolean(v) => v.finish().into_series(),
+            #[cfg(feature = "dtype-i8")]
+            Buffer::Int8(v) => v.finish().into_series(),
+            #[cfg(feature = "dtype-i16")]
+            Buffer::Int16(v) => v.finish().into_series(),
             Buffer::Int32(v) => v.finish().into_series(),
             Buffer::Int64(v) => v.finish().into_series(),
+            #[cfg(feature = "dtype-u8")]
+            Buffer::UInt8(v) => v.finish().into_series(),
+            #[cfg(feature = "dtype-u16")]
+            Buffer::UInt16(v) => v.finish().into_series(),
             Buffer::UInt32(v) => v.finish().into_series(),
             Buffer::UInt64(v) => v.finish().into_series(),
             Buffer::Float32(v) => v.finish().into_series(),
@@ -562,8 +614,16 @@ impl Buffer {
     pub(crate) fn add_null(&mut self, valid: bool) {
         match self {
             Buffer::Boolean(v) => v.append_null(),
+            #[cfg(feature = "dtype-i8")]
+            Buffer::Int8(v) => v.append_null(),
+            #[cfg(feature = "dtype-i16")]
+            Buffer::Int16(v) => v.append_null(),
             Buffer::Int32(v) => v.append_null(),
             Buffer::Int64(v) => v.append_null(),
+            #[cfg(feature = "dtype-u8")]
+            Buffer::UInt8(v) => v.append_null(),
+            #[cfg(feature = "dtype-u16")]
+            Buffer::UInt16(v) => v.append_null(),
             Buffer::UInt32(v) => v.append_null(),
             Buffer::UInt64(v) => v.append_null(),
             Buffer::Float32(v) => v.append_null(),
@@ -596,8 +656,16 @@ impl Buffer {
     pub(crate) fn dtype(&self) -> DataType {
         match self {
             Buffer::Boolean(_) => DataType::Boolean,
+            #[cfg(feature = "dtype-i8")]
+            Buffer::Int8(_) => DataType::Int8,
+            #[cfg(feature = "dtype-i16")]
+            Buffer::Int16(_) => DataType::Int16,
             Buffer::Int32(_) => DataType::Int32,
             Buffer::Int64(_) => DataType::Int64,
+            #[cfg(feature = "dtype-u8")]
+            Buffer::UInt8(_) => DataType::UInt8,
+            #[cfg(feature = "dtype-u16")]
+            Buffer::UInt16(_) => DataType::UInt16,
             Buffer::UInt32(_) => DataType::UInt32,
             Buffer::UInt64(_) => DataType::UInt64,
             Buffer::Float32(_) => DataType::Float32,
@@ -639,6 +707,24 @@ impl Buffer {
                 missing_is_null,
                 None,
             ),
+            #[cfg(feature = "dtype-i8")]
+            Int8(buf) => <PrimitiveChunkedBuilder<Int8Type> as ParsedBuffer>::parse_bytes(
+                buf,
+                bytes,
+                ignore_errors,
+                needs_escaping,
+                missing_is_null,
+                None,
+            ),
+            #[cfg(feature = "dtype-i16")]
+            Int16(buf) => <PrimitiveChunkedBuilder<Int16Type> as ParsedBuffer>::parse_bytes(
+                buf,
+                bytes,
+                ignore_errors,
+                needs_escaping,
+                missing_is_null,
+                None,
+            ),
             Int32(buf) => <PrimitiveChunkedBuilder<Int32Type> as ParsedBuffer>::parse_bytes(
                 buf,
                 bytes,
@@ -655,7 +741,17 @@ impl Buffer {
                 missing_is_null,
                 None,
             ),
-            UInt64(buf) => <PrimitiveChunkedBuilder<UInt64Type> as ParsedBuffer>::parse_bytes(
+            #[cfg(feature = "dtype-u8")]
+            UInt8(buf) => <PrimitiveChunkedBuilder<UInt8Type> as ParsedBuffer>::parse_bytes(
+                buf,
+                bytes,
+                ignore_errors,
+                needs_escaping,
+                missing_is_null,
+                None,
+            ),
+            #[cfg(feature = "dtype-u16")]
+            UInt16(buf) => <PrimitiveChunkedBuilder<UInt16Type> as ParsedBuffer>::parse_bytes(
                 buf,
                 bytes,
                 ignore_errors,
@@ -671,6 +767,14 @@ impl Buffer {
                 missing_is_null,
                 None,
             ),
+            UInt64(buf) => <PrimitiveChunkedBuilder<UInt64Type> as ParsedBuffer>::parse_bytes(
+                buf,
+                bytes,
+                ignore_errors,
+                needs_escaping,
+                missing_is_null,
+                None,
+            ),
             Float32(buf) => <PrimitiveChunkedBuilder<Float32Type> as ParsedBuffer>::parse_bytes(
                 buf,
                 bytes,

@@ -435,6 +435,7 @@ impl<'a, R: MmapBytesReader + 'a> CsvReader<'a, R> {
         let mut _has_categorical = false;
         let mut _err: Option<PolarsError> = None;
 
+        #[allow(unused_mut)]
         let schema = overwriting_schema
             .iter_fields()
             .filter_map(|mut fld| {
@@ -445,12 +446,6 @@ impl<'a, R: MmapBytesReader + 'a> CsvReader<'a, R> {
                         // let inference decide the column type
                         None
                     },
-                    Int8 | Int16 | UInt8 | UInt16 => {
-                        // We have not compiled these buffers, so we cast them later.
-                        to_cast.push(fld.clone());
-                        fld.coerce(DataType::Int32);
-                        Some(fld)
-                    },
                     #[cfg(feature = "dtype-categorical")]
                     Categorical(_, _) => {
                         _has_categorical = true;

@@ -275,11 +275,13 @@ dtype-array = [
 ]
 dtype-i8 = [
   "polars-core/dtype-i8",
+  "polars-io/dtype-i8",
   "polars-lazy?/dtype-i8",
   "polars-ops/dtype-i8",
 ]
 dtype-i16 = [
   "polars-core/dtype-i16",
+  "polars-io/dtype-i16",
   "polars-lazy?/dtype-i16",
   "polars-ops/dtype-i16",
 ]
@@ -292,11 +294,13 @@ dtype-decimal = [
 ]
 dtype-u8 = [
   "polars-core/dtype-u8",
+  "polars-io/dtype-u8",
   "polars-lazy?/dtype-u8",
   "polars-ops/dtype-u8",
 ]
 dtype-u16 = [
   "polars-core/dtype-u16",
+  "polars-io/dtype-u16",
   "polars-lazy?/dtype-u16",
   "polars-ops/dtype-u16",
 ]

@@ -243,6 +243,49 @@ def test_csv_missing_utf8_is_empty_string() -> None:
     ]
 
 
+def test_csv_int_types() -> None:
+    f = io.StringIO(
+        "u8,i8,u16,i16,u32,i32,u64,i64\n"
+        "0,0,0,0,0,0,0,0\n"
+        "0,-128,0,-32768,0,-2147483648,0,-9223372036854775808\n"
+        "255,127,65535,32767,4294967295,2147483647,18446744073709551615,9223372036854775807\n"
+        "01,01,01,01,01,01,01,01\n"
+        "01,-01,01,-01,01,-01,01,-01\n"
+    )
+    df = pl.read_csv(
+        f,
+        schema={
+            "u8": pl.UInt8,
+            "i8": pl.Int8,
+            "u16": pl.UInt16,
+            "i16": pl.Int16,
+            "u32": pl.UInt32,
+            "i32": pl.Int32,
+            "u64": pl.UInt64,
+            "i64": pl.Int64,
+        },
+    )
+
+    assert_frame_equal(
+        df,
+        pl.DataFrame(
+            {
+                "u8": pl.Series([0, 0, 255, 1, 1], dtype=pl.UInt8),
+                "i8": pl.Series([0, -128, 127, 1, -1], dtype=pl.Int8),
+                "u16": pl.Series([0, 0, 65535, 1, 1], dtype=pl.UInt16),
+                "i16": pl.Series([0, -32768, 32767, 1, -1], dtype=pl.Int16),
+                "u32": pl.Series([0, 0, 4294967295, 1, 1], dtype=pl.UInt32),
+                "i32": pl.Series([0, -2147483648, 2147483647, 1, -1], dtype=pl.Int32),
+                "u64": pl.Series([0, 0, 18446744073709551615, 1, 1], dtype=pl.UInt64),
+                "i64": pl.Series(
+                    [0, -9223372036854775808, 9223372036854775807, 1, -1],
+                    dtype=pl.Int64,
+                ),
+            }
+        ),
+    )
+
+
 def test_csv_float_parsing() -> None:
     lines_with_floats = [
         "123.86,+123.86,-123.86\n",