Merge remote-tracking branch 'origin/fsspec-urlsplit' into fsspec-urlsplit

* origin/fsspec-urlsplit:
  feat: add support for current RNTuple files (#962)
  chore: update pre-commit hooks (#993)
  chore: add types to most of the `uproot.source` module (#996)
  fix: make hist import optional in test_0965 (#994)
  docs: add GaetanLepage as a contributor for test (#995)
scikit-hep · Oct 19, 2023 · c075234 · c075234
2 parents cc30432 + 118713e
commit c075234
Show file tree

Hide file tree

Showing 18 changed files with 352 additions and 115 deletions.
diff --git a/.all-contributorsrc b/.all-contributorsrc
@@ -506,6 +506,15 @@
       "contributions": [
         "doc"
       ]
+    },
+    {
+      "login": "GaetanLepage",
+      "name": "Gaétan Lepage",
+      "avatar_url": "https://avatars.githubusercontent.com/u/33058747?v=4",
+      "profile": "http://glepage.com",
+      "contributions": [
+        "test"
+      ]
     }
   ],
   "contributorsPerLine": 7,

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -30,7 +30,7 @@ repos:
 
 
 - repo: https://github.com/asottile/pyupgrade
-  rev: v3.13.0
+  rev: v3.15.0
   hooks:
   - id: pyupgrade
     args: ["--py38-plus"]
diff --git a/README.md b/README.md
@@ -178,6 +178,7 @@ Thanks especially to the gracious help of Uproot contributors (including the [or
       <td align="center" valign="top" width="14.28%"><a href="https://github.com/ioanaif"><img src="https://avatars.githubusercontent.com/u/9751871?v=4?s=100" width="100px;" alt="ioanaif"/><br /><sub><b>ioanaif</b></sub></a><br /><a href="https://github.com/scikit-hep/uproot5/commits?author=ioanaif" title="Code">💻</a></td>
       <td align="center" valign="top" width="14.28%"><a href="https://github.com/natsukium"><img src="https://avatars.githubusercontent.com/u/25083790?v=4?s=100" width="100px;" alt="OTABI Tomoya"/><br /><sub><b>OTABI Tomoya</b></sub></a><br /><a href="https://github.com/scikit-hep/uproot5/commits?author=natsukium" title="Tests">⚠️</a></td>
       <td align="center" valign="top" width="14.28%"><a href="https://github.com/JostMigenda"><img src="https://avatars.githubusercontent.com/u/16189747?v=4?s=100" width="100px;" alt="Jost Migenda"/><br /><sub><b>Jost Migenda</b></sub></a><br /><a href="https://github.com/scikit-hep/uproot5/commits?author=JostMigenda" title="Documentation">📖</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="http://glepage.com"><img src="https://avatars.githubusercontent.com/u/33058747?v=4?s=100" width="100px;" alt="Gaétan Lepage"/><br /><sub><b>Gaétan Lepage</b></sub></a><br /><a href="https://github.com/scikit-hep/uproot5/commits?author=GaetanLepage" title="Tests">⚠️</a></td>
     </tr>
   </tbody>
 </table>

diff --git a/src/uproot/const.py b/src/uproot/const.py
@@ -129,18 +129,25 @@
     7: "float64",
     8: "float32",
     9: "float16",
-    10: "int64",
-    11: "int32",
-    12: "int16",
-    13: "int8",
-    14: "uint32",  # SplitIndex64 delta encoding
-    15: "uint64",  # SplitIndex32 delta encoding
+    10: "uint64",
+    11: "uint32",
+    12: "uint16",
+    13: "uint8",
+    14: "uint64",  # SplitIndex64 delta encoding
+    15: "uint32",  # SplitIndex32 delta encoding
     16: "float64",  # split
     17: "float32",  # split
     18: "float16",  # split
-    19: "int64",  # split
-    20: "int32",  # split
-    21: "int16",  # split
+    19: "uint64",  # split
+    20: "uint32",  # split
+    21: "uint16",  # split
+    22: "int64",
+    23: "int32",
+    24: "int16",
+    25: "int8",
+    26: "int64",  # split + zigzag encoding
+    27: "int32",  # split + zigzag encoding
+    28: "int16",  # split + zigzag encoding
 }
 rntuple_col_num_to_size_dict = {
     1: 64,
@@ -156,14 +163,21 @@
     11: 32,
     12: 16,
     13: 8,
-    14: 32,  # SplitIndex64 delta encoding
-    15: 64,  # SplitIndex32 delta encoding
+    14: 64,  # SplitIndex64 delta encoding
+    15: 32,  # SplitIndex32 delta encoding
     16: 64,  # split
     17: 32,  # split
     18: 16,  # split
     19: 64,  # split
     20: 32,  # split
     21: 16,  # split
+    22: 64,
+    23: 32,
+    24: 16,
+    25: 8,
+    26: 64,  # split + zigzag encoding
+    27: 32,  # split + zigzag encoding
+    28: 16,  # split + zigzag encoding
 }
 
 rntuple_col_type_to_num_dict = {
@@ -176,10 +190,10 @@
     "real64": 7,
     "real32": 8,
     "real16": 9,
-    "int64": 10,
-    "int32": 11,
-    "int16": 12,
-    "int8": 13,
+    "uint64": 10,
+    "uint32": 11,
+    "uint16": 12,
+    "uint8": 13,
     "splitindex64": 14,
     "splitindex32": 15,
     "splitreal64": 16,
@@ -188,6 +202,13 @@
     "splitin64": 19,
     "splitint32": 20,
     "splitint16": 21,
+    "int64": 22,
+    "int32": 23,
+    "int16": 24,
+    "int8": 25,
+    "splitzigzagint64": 26,
+    "splitzigzagint32": 27,
+    "splitzigzagint16": 28,
 }
 
 rntuple_role_leaf = 0

diff --git a/src/uproot/models/RNTuple.py b/src/uproot/models/RNTuple.py
@@ -28,6 +28,10 @@
 _rntuple_cluster_summary_format = struct.Struct("<QQ")
 
 
+def from_zigzag(n):  # decode zigzag-encoded integer(s); read_col_page passes a whole numpy array
+    return n >> 1 ^ -(n & 1)  # parses as (n >> 1) ^ (-(n & 1)): 0, 1, 2, 3, 4 -> 0, -1, 1, -2, 2
+
+
 def _envelop_header(chunk, cursor, context):
     env_version, min_version = cursor.fields(
         chunk, uproot.const._rntuple_frame_format, context
@@ -326,7 +330,7 @@ def to_akform(self):
         form = ak.forms.RecordForm(recordlist, topnames, form_key="toplevel")
         return form
 
-    def read_pagedesc(self, destination, desc, dtype_str, dtype):
+    def read_pagedesc(self, destination, desc, dtype_str, dtype, nbits, split):
         loc = desc.locator
         context = {}
         # bool in RNTuple is always stored as bits
@@ -339,6 +343,44 @@ def read_pagedesc(self, destination, desc, dtype_str, dtype):
         content = cursor.array(
             decomp_chunk, num_elements_toread, dtype, context, move=False
         )
+
+        if split:
+            content = content.view(numpy.uint8)
+
+            if nbits == 16:
+                # AAAAABBBBB needs to become
+                # ABABABABAB
+                res = numpy.empty(len(content), numpy.uint8)
+                res[0::2] = content[len(res) * 0 // 2 : len(res) * 1 // 2]
+                res[1::2] = content[len(res) * 1 // 2 : len(res) * 2 // 2]
+                res = res.view(numpy.uint16)
+
+            elif nbits == 32:
+                # AAAAABBBBBCCCCCDDDDD needs to become
+                # ABCDABCDABCDABCDABCD
+                res = numpy.empty(len(content), numpy.uint8)
+                res[0::4] = content[len(res) * 0 // 4 : len(res) * 1 // 4]
+                res[1::4] = content[len(res) * 1 // 4 : len(res) * 2 // 4]
+                res[2::4] = content[len(res) * 2 // 4 : len(res) * 3 // 4]
+                res[3::4] = content[len(res) * 3 // 4 : len(res) * 4 // 4]
+                res = res.view(numpy.uint32)
+
+            elif nbits == 64:
+                # AAAAABBBBBCCCCCDDDDDEEEEEFFFFFGGGGGHHHHH needs to become
+                # ABCDEFGHABCDEFGHABCDEFGHABCDEFGHABCDEFGH
+                res = numpy.empty(len(content), numpy.uint8)
+                res[0::8] = content[len(res) * 0 // 8 : len(res) * 1 // 8]
+                res[1::8] = content[len(res) * 1 // 8 : len(res) * 2 // 8]
+                res[2::8] = content[len(res) * 2 // 8 : len(res) * 3 // 8]
+                res[3::8] = content[len(res) * 3 // 8 : len(res) * 4 // 8]
+                res[4::8] = content[len(res) * 4 // 8 : len(res) * 5 // 8]
+                res[5::8] = content[len(res) * 5 // 8 : len(res) * 6 // 8]
+                res[6::8] = content[len(res) * 6 // 8 : len(res) * 7 // 8]
+                res[7::8] = content[len(res) * 7 // 8 : len(res) * 8 // 8]
+                res = res.view(numpy.uint64)
+
+            content = res
+
         if isbit:
             content = (
                 numpy.unpackbits(content.view(dtype=numpy.uint8))
@@ -368,14 +410,24 @@ def read_col_page(self, ncol, cluster_i):
         total_len = numpy.sum([desc.num_elements for desc in pagelist])
         res = numpy.empty(total_len, dtype)
         tracker = 0
+        split = 14 <= dtype_byte <= 21 or 26 <= dtype_byte <= 28
+        nbits = uproot.const.rntuple_col_num_to_size_dict[dtype_byte]
         for page_desc in pagelist:
             n_elements = page_desc.num_elements
             tracker_end = tracker + n_elements
-            self.read_pagedesc(res[tracker:tracker_end], page_desc, dtype_str, dtype)
+            self.read_pagedesc(
+                res[tracker:tracker_end], page_desc, dtype_str, dtype, nbits, split
+            )
             tracker = tracker_end
 
         if dtype_byte <= uproot.const.rntuple_col_type_to_num_dict["index32"]:
             res = numpy.insert(res, 0, 0)  # for offsets
+        zigzag = 26 <= dtype_byte <= 28  # column types 26-28: split + zigzag encoded signed ints
+        delta = 14 <= dtype_byte <= 15  # column types 14-15: SplitIndex64/32, delta encoded
+        if zigzag:
+            res = from_zigzag(res)
+        elif delta:
+            res = numpy.cumsum(res)  # cumsum returns a new array; keep it, or the deltas are returned undecoded
         return res
 
     def arrays(
@@ -645,6 +697,15 @@ def read(self, chunk, cursor, context):
 
         return out
 
+    def read_extension_header(self, out, chunk, cursor, context):  # fill an existing `out` in place
+        out.field_records = self.list_field_record_frames.read(chunk, cursor, context)
+        out.column_records = self.list_column_record_frames.read(chunk, cursor, context)
+        out.alias_columns = self.list_alias_column_frames.read(chunk, cursor, context)
+        out.extra_type_infos = self.list_extra_type_info_reader.read(
+            chunk, cursor, context  # same chunk/cursor/context as the three reads above
+        )
+        return out  # the same object that was passed in, now carrying the four lists
+
 
 class ColumnGroupRecordReader:
     def read(self, chunk, cursor, context):
@@ -672,9 +733,29 @@ def read(self, chunk, cursor, context):
         return out
 
 
+class RNTupleSchemaExtension:  # reader: one size field, then four record lists, into a MetaData
+    def read(self, chunk, cursor, context):
+        out = MetaData(type(self).__name__)  # labelled with this reader's class name
+        out.size = cursor.field(chunk, struct.Struct("<I"), context)  # little-endian unsigned 32-bit
+        out.field_records = ListFrameReader(
+            RecordFrameReader(FieldRecordReader())
+        ).read(chunk, cursor, context)
+        out.column_records = ListFrameReader(
+            RecordFrameReader(ColumnRecordReader())
+        ).read(chunk, cursor, context)
+        out.alias_records = ListFrameReader(  # NOTE(review): read_extension_header stores this as alias_columns; confirm
+            RecordFrameReader(AliasColumnReader())
+        ).read(chunk, cursor, context)
+        out.extra_type_info = ListFrameReader(  # NOTE(review): read_extension_header uses extra_type_infos; confirm
+            RecordFrameReader(ExtraTypeInfoReader())
+        ).read(chunk, cursor, context)
+        return out
+
+
 class FooterReader:
     def __init__(self):
-        self.extension_header_links = ListFrameReader(EnvLinkReader())
+        self.extension_header_links = RNTupleSchemaExtension()
+        # self.extension_header_links = ListFrameReader(EnvLinkReader())
         self.column_group_record_frames = ListFrameReader(
             RecordFrameReader(ColumnGroupRecordReader())
         )
@@ -691,8 +772,8 @@ def read(self, chunk, cursor, context):
         out.env_header = _envelop_header(chunk, cursor, context)
         out.feature_flag = cursor.field(chunk, _rntuple_feature_flag_format, context)
         out.header_crc32 = cursor.field(chunk, struct.Struct("<I"), context)
-
         out.extension_links = self.extension_header_links.read(chunk, cursor, context)
+
         out.col_group_records = self.column_group_record_frames.read(
             chunk, cursor, context
         )