-
Notifications
You must be signed in to change notification settings - Fork 532
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add string interning to SyncIterator
#3411
Merged
Merged
Changes from 5 commits
Commits
Show all changes
14 commits
Select commit
Hold shift + click to select a range
a1bb25a
Intern pq.Value
mapno 5ce91ac
Some improvements
mapno 07777cd
No noticeable improvement with pool
mapno c64e8e8
Add tests
mapno 9b046ed
Don't intern UUID columns
mapno 159363b
fmt
mapno 68ba876
Add interning to older formats
mapno 1d1811a
Add simple benchmark
mapno 321b3b9
Please linter
mapno 678eced
Some improvements
mapno 4cf23bf
Add comments
mapno 3982f61
chlog
mapno 54ff38e
Merge remote-tracking branch 'origin/main' into string-interning
mapno 531a7b7
Add high cardinality attr to benchmark
mapno File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
package intern | ||
|
||
import ( | ||
"sync" | ||
"unsafe" | ||
|
||
pq "github.com/parquet-go/parquet-go" | ||
) | ||
|
||
type Interner struct { | ||
mtx sync.RWMutex | ||
m map[string][]byte | ||
} | ||
|
||
func New() *Interner { | ||
return NewWithSize(0) | ||
} | ||
|
||
func NewWithSize(size int) *Interner { | ||
return &Interner{m: make(map[string][]byte, size)} | ||
} | ||
|
||
func (i *Interner) UnsafeClone(v *pq.Value) pq.Value { | ||
switch v.Kind() { | ||
case pq.ByteArray, pq.FixedLenByteArray: | ||
// Look away, this is unsafe. | ||
a := *(*pqValue)(unsafe.Pointer(v)) | ||
a.ptr = addressOfBytes(i.internBytes(a.byteArray())) | ||
return *(*pq.Value)(unsafe.Pointer(&a)) | ||
default: | ||
return *v | ||
} | ||
} | ||
|
||
func (i *Interner) internBytes(b []byte) []byte { | ||
s := bytesToString(b) | ||
|
||
i.mtx.RLock() | ||
if x, ok := i.m[s]; ok { | ||
i.mtx.RUnlock() | ||
return x | ||
} | ||
i.mtx.RUnlock() | ||
|
||
i.mtx.Lock() | ||
defer i.mtx.Unlock() | ||
|
||
clone := make([]byte, len(b)) | ||
copy(clone, b) | ||
i.m[s] = clone | ||
return clone | ||
} | ||
|
||
func (i *Interner) Close() { | ||
i.mtx.Lock() | ||
clear(i.m) // clear the map | ||
i.m = nil | ||
i.mtx.Unlock() | ||
} | ||
|
||
// bytesToString converts a byte slice to a string. | ||
func bytesToString(b []byte) string { | ||
return unsafe.String(unsafe.SliceData(b), len(b)) | ||
} | ||
|
||
//go:linkname addressOfBytes github.com/parquet-go/parquet-go/internal/unsafecast.AddressOfBytes | ||
func addressOfBytes(data []byte) *byte | ||
|
||
//go:linkname bytes github.com/parquet-go/parquet-go/internal/unsafecast.Bytes | ||
func bytes(data *byte, size int) []byte | ||
|
||
// pqValue is a slimmer version of parquet-go's pq.Value. | ||
type pqValue struct { | ||
// data | ||
ptr *byte | ||
u64 uint64 | ||
// type | ||
kind int8 // XOR(Kind) so the zero-value is <null> | ||
} | ||
|
||
func (v *pqValue) byteArray() []byte { | ||
return bytes(v.ptr, int(v.u64)) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
package intern | ||
|
||
import ( | ||
"testing" | ||
"unsafe" | ||
|
||
pq "github.com/parquet-go/parquet-go" | ||
) | ||
|
||
func TestInterner_internBytes(t *testing.T) { | ||
i := New() | ||
defer i.Close() | ||
|
||
words := []string{"hello", "world", "hello", "world", "hello", "world"} | ||
for _, w := range words { | ||
_ = i.internBytes([]byte(w)) | ||
} | ||
if len(i.m) != 2 { | ||
// Values are interned, so there should be only 2 unique words | ||
t.Errorf("expected 2, got %d", len(i.m)) | ||
} | ||
if i.internBytes([]byte("hello"))[0] != i.internBytes([]byte("hello"))[0] { | ||
// Values are interned, so the memory address should be the same | ||
t.Error("expected same memory address") | ||
} | ||
} | ||
|
||
func TestInterner_UnsafeClone(t *testing.T) { | ||
i := New() | ||
defer i.Close() | ||
|
||
value1 := pq.ByteArrayValue([]byte("foo")) | ||
value2 := pq.ByteArrayValue([]byte("foo")) | ||
|
||
clone1 := i.UnsafeClone(&value1) | ||
clone2 := i.UnsafeClone(&value2) | ||
|
||
if clone1.ByteArray()[0] != clone2.ByteArray()[0] { | ||
// Values are interned, so the memory address should be the same | ||
t.Error("expected same memory address") | ||
} | ||
|
||
if value1.ByteArray()[0] != value2.ByteArray()[0] { | ||
// Mutates the original value, so the memory address should be different as well | ||
t.Error("expected same memory address") | ||
} | ||
} | ||
|
||
func Test_pqValue(t *testing.T) { | ||
// Test that conversion from pq.Value to pqValue and back to pq.Value | ||
// does not change the value. | ||
value := pq.ByteArrayValue([]byte("foo")) | ||
pqValue := *(*pqValue)(unsafe.Pointer(&value)) | ||
back := *(*pq.Value)(unsafe.Pointer(&pqValue)) | ||
|
||
if value.Kind() != back.Kind() { | ||
t.Error("expected same kind") | ||
} | ||
|
||
if string(value.ByteArray()) != string(back.ByteArray()) { | ||
t.Error("expected same value") | ||
} | ||
|
||
if value.String() != back.String() { | ||
t.Error("expected same value") | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
does this incur an extra value copy?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, but we pass
pq.Value
as values (🥁) inEntries
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Aren't we cloning values unnecessarily? For any non-byte array/fixed len byte array by cloning here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, but values are used everywhere. Changing this to pointers is a big change in the codebase, that I'd prefer to keep out from this PR.
FWIW,
v.Clone()
is/was creating a copy ofpq.Value
too. This is unchanged in this PR.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should the line be
v.Clone()
to make it consistent with the way it used to be? i'm honestly not sure.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For all other types (int, float, etc) the struct doesn't contain external pointers so a copy of the contents is fine. That's what Clone() does: https://github.com/parquet-go/parquet-go/blob/main/value.go#L792 Avoiding it is fine because it would incur another type check unnecessarily.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fine with this leaving as is in the PR, but it does feel safer to call
.Clone()
in case that internal behavior changes.