Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/fsspec-urlsplit' into fsspec-url…
Browse files Browse the repository at this point in the history
…split

* origin/fsspec-urlsplit:
  feat: add support for current RNTuple files (#962)
  chore: update pre-commit hooks (#993)
  chore: add types to most of the `uproot.source` module (#996)
  fix: make hist import optional in test_0965 (#994)
  docs: add GaetanLepage as a contributor for test (#995)
  • Loading branch information
lobis committed Oct 19, 2023
2 parents cc30432 + 118713e commit c075234
Show file tree
Hide file tree
Showing 18 changed files with 352 additions and 115 deletions.
9 changes: 9 additions & 0 deletions .all-contributorsrc
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,15 @@
"contributions": [
"doc"
]
},
{
"login": "GaetanLepage",
"name": "Gaétan Lepage",
"avatar_url": "https://avatars.githubusercontent.com/u/33058747?v=4",
"profile": "http://glepage.com",
"contributions": [
"test"
]
}
],
"contributorsPerLine": 7,
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ repos:


- repo: https://github.com/asottile/pyupgrade
rev: v3.13.0
rev: v3.15.0
hooks:
- id: pyupgrade
args: ["--py38-plus"]
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ Thanks especially to the gracious help of Uproot contributors (including the [or
<td align="center" valign="top" width="14.28%"><a href="https://github.com/ioanaif"><img src="https://avatars.githubusercontent.com/u/9751871?v=4?s=100" width="100px;" alt="ioanaif"/><br /><sub><b>ioanaif</b></sub></a><br /><a href="https://github.com/scikit-hep/uproot5/commits?author=ioanaif" title="Code">💻</a></td>
<td align="center" valign="top" width="14.28%"><a href="https://github.com/natsukium"><img src="https://avatars.githubusercontent.com/u/25083790?v=4?s=100" width="100px;" alt="OTABI Tomoya"/><br /><sub><b>OTABI Tomoya</b></sub></a><br /><a href="https://github.com/scikit-hep/uproot5/commits?author=natsukium" title="Tests">⚠️</a></td>
<td align="center" valign="top" width="14.28%"><a href="https://github.com/JostMigenda"><img src="https://avatars.githubusercontent.com/u/16189747?v=4?s=100" width="100px;" alt="Jost Migenda"/><br /><sub><b>Jost Migenda</b></sub></a><br /><a href="https://github.com/scikit-hep/uproot5/commits?author=JostMigenda" title="Documentation">📖</a></td>
<td align="center" valign="top" width="14.28%"><a href="http://glepage.com"><img src="https://avatars.githubusercontent.com/u/33058747?v=4?s=100" width="100px;" alt="Gaétan Lepage"/><br /><sub><b>Gaétan Lepage</b></sub></a><br /><a href="https://github.com/scikit-hep/uproot5/commits?author=GaetanLepage" title="Tests">⚠️</a></td>
</tr>
</tbody>
</table>
Expand Down
51 changes: 36 additions & 15 deletions src/uproot/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,18 +129,25 @@
7: "float64",
8: "float32",
9: "float16",
10: "int64",
11: "int32",
12: "int16",
13: "int8",
14: "uint32", # SplitIndex64 delta encoding
15: "uint64", # SplitIndex32 delta encoding
10: "uint64",
11: "uint32",
12: "uint16",
13: "uint8",
14: "uint64", # SplitIndex64 delta encoding
15: "uint32", # SplitIndex32 delta encoding
16: "float64", # split
17: "float32", # split
18: "float16", # split
19: "int64", # split
20: "int32", # split
21: "int16", # split
19: "uint64", # split
20: "uint32", # split
21: "uint16", # split
22: "int64",
23: "int32",
24: "int16",
25: "int8",
26: "int64", # split + zigzag encoding
27: "int32", # split + zigzag encoding
28: "int16", # split + zigzag encoding
}
rntuple_col_num_to_size_dict = {
1: 64,
Expand All @@ -156,14 +163,21 @@
11: 32,
12: 16,
13: 8,
14: 32, # SplitIndex64 delta encoding
15: 64, # SplitIndex32 delta encoding
14: 64, # SplitIndex64 delta encoding
15: 32, # SplitIndex32 delta encoding
16: 64, # split
17: 32, # split
18: 16, # split
19: 64, # split
20: 32, # split
21: 16, # split
22: 64,
23: 32,
24: 16,
25: 8,
26: 64, # split + zigzag encoding
27: 32, # split + zigzag encoding
28: 16, # split + zigzag encoding
}

rntuple_col_type_to_num_dict = {
Expand All @@ -176,10 +190,10 @@
"real64": 7,
"real32": 8,
"real16": 9,
"int64": 10,
"int32": 11,
"int16": 12,
"int8": 13,
"uint64": 10,
"uint32": 11,
"uint16": 12,
"uint8": 13,
"splitindex64": 14,
"splitindex32": 15,
"splitreal64": 16,
Expand All @@ -188,6 +202,13 @@
"splitin64": 19,
"splitint32": 20,
"splitint16": 21,
"int64": 22,
"int32": 23,
"int16": 24,
"int8": 25,
"splitzigzagint64": 26,
"splitzigzagint32": 27,
"splitzigzagint16": 28,
}

rntuple_role_leaf = 0
Expand Down
89 changes: 85 additions & 4 deletions src/uproot/models/RNTuple.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@
_rntuple_cluster_summary_format = struct.Struct("<QQ")


def from_zigzag(n):
    """
    Decode zigzag-encoded integer(s) back into signed integers.

    Zigzag encoding stores 0, -1, 1, -2, 2, ... as 0, 1, 2, 3, 4, ...:
    the lowest bit carries the sign and the remaining bits the magnitude.

    Args:
        n: A non-negative Python int, or a NumPy integer scalar/array
            holding the encoded value(s).

    Returns:
        The decoded value(s): ``(n >> 1) ^ -(n & 1)`` with a logical
        (zero-filling) right shift.
    """
    dtype = getattr(n, "dtype", None)
    halved = n >> 1
    if getattr(dtype, "kind", None) == "i":
        # On signed NumPy integers ``>>`` is an arithmetic shift: an encoded
        # value whose top bit is set would keep that bit after shifting and
        # decode to the wrong number. Clearing the sign bit afterwards turns
        # the arithmetic shift into the logical shift that zigzag requires.
        halved = halved & dtype.type((1 << (8 * dtype.itemsize - 1)) - 1)
    # -(n & 1) is 0 for even (non-negative) and all-ones for odd (negative)
    # encodings, so the XOR either keeps or bitwise-inverts the magnitude.
    return halved ^ -(n & 1)


def _envelop_header(chunk, cursor, context):
env_version, min_version = cursor.fields(
chunk, uproot.const._rntuple_frame_format, context
Expand Down Expand Up @@ -326,7 +330,7 @@ def to_akform(self):
form = ak.forms.RecordForm(recordlist, topnames, form_key="toplevel")
return form

def read_pagedesc(self, destination, desc, dtype_str, dtype):
def read_pagedesc(self, destination, desc, dtype_str, dtype, nbits, split):
loc = desc.locator
context = {}
# bool in RNTuple is always stored as bits
Expand All @@ -339,6 +343,44 @@ def read_pagedesc(self, destination, desc, dtype_str, dtype):
content = cursor.array(
decomp_chunk, num_elements_toread, dtype, context, move=False
)

if split:
content = content.view(numpy.uint8)

if nbits == 16:
# AAAAABBBBB needs to become
# ABABABABAB
res = numpy.empty(len(content), numpy.uint8)
res[0::2] = content[len(res) * 0 // 2 : len(res) * 1 // 2]
res[1::2] = content[len(res) * 1 // 2 : len(res) * 2 // 2]
res = res.view(numpy.uint16)

elif nbits == 32:
# AAAAABBBBBCCCCCDDDDD needs to become
# ABCDABCDABCDABCDABCD
res = numpy.empty(len(content), numpy.uint8)
res[0::4] = content[len(res) * 0 // 4 : len(res) * 1 // 4]
res[1::4] = content[len(res) * 1 // 4 : len(res) * 2 // 4]
res[2::4] = content[len(res) * 2 // 4 : len(res) * 3 // 4]
res[3::4] = content[len(res) * 3 // 4 : len(res) * 4 // 4]
res = res.view(numpy.uint32)

elif nbits == 64:
# AAAAABBBBBCCCCCDDDDDEEEEEFFFFFGGGGGHHHHH needs to become
# ABCDEFGHABCDEFGHABCDEFGHABCDEFGHABCDEFGH
res = numpy.empty(len(content), numpy.uint8)
res[0::8] = content[len(res) * 0 // 8 : len(res) * 1 // 8]
res[1::8] = content[len(res) * 1 // 8 : len(res) * 2 // 8]
res[2::8] = content[len(res) * 2 // 8 : len(res) * 3 // 8]
res[3::8] = content[len(res) * 3 // 8 : len(res) * 4 // 8]
res[4::8] = content[len(res) * 4 // 8 : len(res) * 5 // 8]
res[5::8] = content[len(res) * 5 // 8 : len(res) * 6 // 8]
res[6::8] = content[len(res) * 6 // 8 : len(res) * 7 // 8]
res[7::8] = content[len(res) * 7 // 8 : len(res) * 8 // 8]
res = res.view(numpy.uint64)

content = res

if isbit:
content = (
numpy.unpackbits(content.view(dtype=numpy.uint8))
Expand Down Expand Up @@ -368,14 +410,24 @@ def read_col_page(self, ncol, cluster_i):
total_len = numpy.sum([desc.num_elements for desc in pagelist])
res = numpy.empty(total_len, dtype)
tracker = 0
split = 14 <= dtype_byte <= 21 or 26 <= dtype_byte <= 28
nbits = uproot.const.rntuple_col_num_to_size_dict[dtype_byte]
for page_desc in pagelist:
n_elements = page_desc.num_elements
tracker_end = tracker + n_elements
self.read_pagedesc(res[tracker:tracker_end], page_desc, dtype_str, dtype)
self.read_pagedesc(
res[tracker:tracker_end], page_desc, dtype_str, dtype, nbits, split
)
tracker = tracker_end

if dtype_byte <= uproot.const.rntuple_col_type_to_num_dict["index32"]:
res = numpy.insert(res, 0, 0) # for offsets
zigzag = 26 <= dtype_byte <= 28
delta = 14 <= dtype_byte <= 15
if zigzag:
res = from_zigzag(res)
elif delta:
numpy.cumsum(res)
return res

def arrays(
Expand Down Expand Up @@ -645,6 +697,15 @@ def read(self, chunk, cursor, context):

return out

def read_extension_header(self, out, chunk, cursor, context):
    """
    Fill ``out`` with the results of this object's four frame readers.

    The readers are invoked one after another, each with the same
    ``chunk``, ``cursor`` and ``context``, in the order: field records,
    column records, alias columns, extra type infos.

    Args:
        out: Object that receives the ``field_records``,
            ``column_records``, ``alias_columns`` and ``extra_type_infos``
            attributes.
        chunk: Passed through unchanged to every reader.
        cursor: Passed through unchanged to every reader.
        context: Passed through unchanged to every reader.

    Returns:
        The same ``out`` object, after the four attributes are set.
    """
    # (attribute set on ``out``, name of the reader attribute on ``self``);
    # the reader is looked up lazily so the call order matches a plain
    # sequence of assignments.
    plan = (
        ("field_records", "list_field_record_frames"),
        ("column_records", "list_column_record_frames"),
        ("alias_columns", "list_alias_column_frames"),
        ("extra_type_infos", "list_extra_type_info_reader"),
    )
    for target, reader_name in plan:
        reader = getattr(self, reader_name)
        setattr(out, target, reader.read(chunk, cursor, context))
    return out


class ColumnGroupRecordReader:
def read(self, chunk, cursor, context):
Expand Down Expand Up @@ -672,9 +733,29 @@ def read(self, chunk, cursor, context):
return out


class RNTupleSchemaExtension:
    """
    Reader that collects a size field followed by four record lists
    (fields, columns, alias columns, extra type info) into a ``MetaData``.
    """

    def read(self, chunk, cursor, context):
        """
        Read the size and the four record lists from ``chunk`` at ``cursor``.

        Args:
            chunk: Data source handed to ``cursor.field`` and to every
                list reader.
            cursor: Position tracker; it is used for the size field and
                then passed to each list reader in turn.
            context: Passed through unchanged to every read call.

        Returns:
            A ``MetaData`` named after this class with the attributes
            ``size``, ``field_records``, ``column_records``,
            ``alias_records`` and ``extra_type_info``.
        """
        result = MetaData(type(self).__name__)

        # Leading little-endian unsigned 32-bit value.
        # NOTE(review): presumably the byte size of the extension; confirm
        # against the RNTuple specification.
        size_format = struct.Struct("<I")
        result.size = cursor.field(chunk, size_format, context)

        # Each list is read with a freshly built reader, in on-disk order.
        field_frames = ListFrameReader(RecordFrameReader(FieldRecordReader()))
        result.field_records = field_frames.read(chunk, cursor, context)

        column_frames = ListFrameReader(RecordFrameReader(ColumnRecordReader()))
        result.column_records = column_frames.read(chunk, cursor, context)

        alias_frames = ListFrameReader(RecordFrameReader(AliasColumnReader()))
        result.alias_records = alias_frames.read(chunk, cursor, context)

        type_info_frames = ListFrameReader(RecordFrameReader(ExtraTypeInfoReader()))
        result.extra_type_info = type_info_frames.read(chunk, cursor, context)

        return result


class FooterReader:
def __init__(self):
self.extension_header_links = ListFrameReader(EnvLinkReader())
self.extension_header_links = RNTupleSchemaExtension()
# self.extension_header_links = ListFrameReader(EnvLinkReader())
self.column_group_record_frames = ListFrameReader(
RecordFrameReader(ColumnGroupRecordReader())
)
Expand All @@ -691,8 +772,8 @@ def read(self, chunk, cursor, context):
out.env_header = _envelop_header(chunk, cursor, context)
out.feature_flag = cursor.field(chunk, _rntuple_feature_flag_format, context)
out.header_crc32 = cursor.field(chunk, struct.Struct("<I"), context)

out.extension_links = self.extension_header_links.read(chunk, cursor, context)

out.col_group_records = self.column_group_record_frames.read(
chunk, cursor, context
)
Expand Down
Loading

0 comments on commit c075234

Please sign in to comment.