Skip to content

Commit

Permalink
GH-134: Add Validity functions (#135)
Browse files Browse the repository at this point in the history
Adding `is_null`, `is_not_null`, and `is_nan` functions and tests to the
Compute implementations for Go.

This also removes the `-ffast-math` option from the SIMD optimized
compiler Makefile so that our comparisons properly handle `NaN` values
(`-ffast-math` assumes that no values are `NaN` and removes the checks).
`is_nan` can be easily implemented for float and double values by
dispatching to `A != A`.

Tests are also added for the new functions.

Ultimately the driving force behind this is as these are necessary for
implementing scanning reads in iceberg-go

---------

Co-authored-by: Sutou Kouhei <kou@clear-code.com>
  • Loading branch information
zeroshade and kou authored Sep 25, 2024
1 parent 14844ae commit 6166f84
Show file tree
Hide file tree
Showing 21 changed files with 58,373 additions and 52,334 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@ jobs:
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
with:
python-version: 3.12
- name: Setup Go
uses: actions/setup-go@0a12ed9d6a96ab950c8f026ed9f722fe0da7ef32 # v5.0.2
with:
go-version: '1.23'
cache: true
cache-dependency-path: go.sum
- name: Install pre-commit
run: |
python -m pip install pre-commit
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

repos:
- repo: https://github.com/golangci/golangci-lint
rev: v1.60.3
rev: v1.61.0
hooks:
# no built-in support for multiple go.mod
# https://github.com/golangci/golangci-lint/issues/828
Expand Down
34 changes: 34 additions & 0 deletions arrow/array/struct.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,40 @@ func NewStructArray(cols []arrow.Array, names []string) (*Struct, error) {
return NewStructArrayWithNulls(cols, names, nil, 0, 0)
}

// NewStructArrayWithFields builds a new Struct Array using the passed columns
// and provided fields. As opposed to NewStructArray, this allows you to provide
// the full fields to utilize for the struct column instead of just the names.
func NewStructArrayWithFields(cols []arrow.Array, fields []arrow.Field) (*Struct, error) {
if len(cols) != len(fields) {
return nil, fmt.Errorf("%w: mismatching number of fields and child arrays", arrow.ErrInvalid)
}
if len(cols) == 0 {
return nil, fmt.Errorf("%w: can't infer struct array length with 0 child arrays", arrow.ErrInvalid)
}

length := cols[0].Len()
children := make([]arrow.ArrayData, len(cols))
for i, c := range cols {
if length != c.Len() {
return nil, fmt.Errorf("%w: mismatching child array lengths", arrow.ErrInvalid)
}
if !arrow.TypeEqual(fields[i].Type, c.DataType()) {
return nil, fmt.Errorf("%w: mismatching data type for child #%d, field says '%s', got '%s'",
arrow.ErrInvalid, i, fields[i].Type, c.DataType())
}
if !fields[i].Nullable && c.NullN() > 0 {
return nil, fmt.Errorf("%w: field says not-nullable, child #%d has nulls",
arrow.ErrInvalid, i)
}

children[i] = c.Data()
}

data := NewData(arrow.StructOf(fields...), length, []*memory.Buffer{nil}, children, 0, 0)
defer data.Release()
return NewStructData(data), nil
}

// NewStructArrayWithNulls is like NewStructArray as a convenience function,
// but also takes in a null bitmap, the number of nulls, and an optional offset
// to use for creating the Struct Array.
Expand Down
30 changes: 26 additions & 4 deletions arrow/compute/exprs/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ const (
SubstraitArithmeticFuncsURI = SubstraitDefaultURIPrefix + "functions_arithmetic.yaml"
// URI for official Substrait Comparison funcs extensions
SubstraitComparisonFuncsURI = SubstraitDefaultURIPrefix + "functions_comparison.yaml"
SubstraitBooleanFuncsURI = SubstraitDefaultURIPrefix + "functions_boolean.yaml"

TimestampTzTimezone = "UTC"
)
Expand Down Expand Up @@ -93,7 +94,7 @@ func init() {
}
}

for _, fn := range []string{"equal", "not_equal", "lt", "lte", "gt", "gte"} {
for _, fn := range []string{"equal", "not_equal", "lt", "lte", "gt", "gte", "is_null", "is_not_null", "is_nan"} {
err := DefaultExtensionIDRegistry.AddSubstraitScalarToArrow(
extensions.ID{URI: SubstraitComparisonFuncsURI, Name: fn},
simpleMapSubstraitToArrowFunc)
Expand All @@ -102,13 +103,30 @@ func init() {
}
}

for _, fn := range []string{"equal", "not_equal", "less", "less_equal", "greater", "greater_equal"} {
for _, fn := range []string{"equal", "not_equal", "less", "less_equal", "greater", "greater_equal", "is_null", "is_not_null", "is_nan"} {
err := DefaultExtensionIDRegistry.AddArrowToSubstrait(fn,
simpleMapArrowToSubstraitFunc(SubstraitComparisonFuncsURI))
if err != nil {
panic(err)
}
}

for _, fn := range []string{"and", "or"} {
err := DefaultExtensionIDRegistry.AddSubstraitScalarToArrow(
extensions.ID{URI: SubstraitBooleanFuncsURI, Name: fn},
simpleMapSubstraitToArrowFunc)
if err != nil {
panic(err)
}
}

for _, fn := range []string{"and_kleene", "or_kleene"} {
err := DefaultExtensionIDRegistry.AddArrowToSubstrait(fn,
simpleMapArrowToSubstraitFunc(SubstraitBooleanFuncsURI))
if err != nil {
panic(err)
}
}
}

type overflowBehavior string
Expand Down Expand Up @@ -168,13 +186,17 @@ var substraitToArrowFuncMap = map[string]string{
"gt": "greater",
"lte": "less_equal",
"gte": "greater_equal",
"or": "or_kleene",
"and": "and_kleene",
}

var arrowToSubstraitFuncMap = map[string]string{
"less": "lt",
"greater": "gt",
"less_equal": "lte",
"greater_equal": "gte",
"and_kleene": "and",
"or_kleene": "or",
}

func simpleMapSubstraitToArrowFunc(sf *expr.ScalarFunction) (fname string, opts compute.FunctionOptions, err error) {
Expand Down Expand Up @@ -553,9 +575,9 @@ func ToSubstraitType(dt arrow.DataType, nullable bool, ext ExtensionIDSet) (type
return &types.Float32Type{Nullability: nullability}, nil
case arrow.FLOAT64:
return &types.Float64Type{Nullability: nullability}, nil
case arrow.STRING:
case arrow.STRING, arrow.LARGE_STRING:
return &types.StringType{Nullability: nullability}, nil
case arrow.BINARY:
case arrow.BINARY, arrow.LARGE_BINARY:
return &types.BinaryType{Nullability: nullability}, nil
case arrow.DATE32:
return &types.DateType{Nullability: nullability}, nil
Expand Down
5 changes: 3 additions & 2 deletions arrow/compute/internal/kernels/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,15 @@ C2GOASM=c2goasm
CC=clang-11
CXX=clang++-11
C_FLAGS=-target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=5000 \
-fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib -I../../../../internal/utils/_lib
-fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -fno-jump-tables \
-fno-math-errno -funsafe-math-optimizations -fno-rounding-math -fno-trapping-math -I_lib -I../../../../internal/utils/_lib
ASM_FLAGS_AVX2=-mavx2 -mfma
ASM_FLAGS_SSE4=-msse4
ASM_FLAGS_BMI2=-mbmi2
ASM_FLAGS_POPCNT=-mpopcnt

C_FLAGS_NEON=-O3 -fvectorize -mllvm -force-vector-width=16 -fno-asynchronous-unwind-tables -mno-red-zone -mstackrealign -fno-exceptions \
-fno-rtti -fno-builtin -ffast-math -fno-jump-tables -I_lib -I../../../../internal/utils/_lib
-fno-rtti -fno-builtin -fno-math-errno -funsafe-math-optimizations -fno-rounding-math -fno-trapping-math -fno-jump-tables -I_lib -I../../../../internal/utils/_lib

GO_SOURCES := $(shell find . -path ./_lib -prune -o -name '*.go' -not -name '*_test.go')
ALL_SOURCES := $(shell find . -path ./_lib -prune -o -name '*.go' -name '*.s' -not -name '*_test.go')
Expand Down
Loading

0 comments on commit 6166f84

Please sign in to comment.