Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implemented SparseIntSet #533

Merged
merged 25 commits into from
Oct 4, 2019
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
9efb335
implemented SparseIntSet
louisponet Sep 28, 2019
c38959c
added tests for coveralls, removed unnecessary added to README
louisponet Sep 28, 2019
4fb0485
implemented comments, not immutable yet
louisponet Sep 29, 2019
4050d46
implemented SparseIntSet
louisponet Sep 28, 2019
070b4f1
added tests for coveralls, removed unnecessary added to README
louisponet Sep 28, 2019
e228fad
implemented comments, not immutable yet
louisponet Sep 29, 2019
61bac31
Merge remote-tracking branch 'louisponet/SparseIntSet' into SparseIntSet
louisponet Sep 29, 2019
6f8555e
added SparseIntSet benchmarks
louisponet Sep 29, 2019
8254a23
code cleanup, assure! comment, removed current_id
louisponet Sep 29, 2019
23ddc6c
fixed test
louisponet Sep 29, 2019
f248297
made SparseIntSet immutable
louisponet Sep 29, 2019
88fa2d8
added less worst case bench
louisponet Sep 30, 2019
77963a8
Added auto cleanup! on vanilla pop!, dirty_pop! is without cleanup.
louisponet Sep 30, 2019
aec600d
Apply suggestions from code review
louisponet Oct 1, 2019
e9377eb
mutable + cleanup!
louisponet Oct 1, 2019
43e436b
only do cleanup when there is actually a zero counter
louisponet Oct 1, 2019
cabcc66
changed to use NULL_INT_PAGE, simplified cleanup! and push!!
louisponet Oct 1, 2019
b91b8fb
docs
louisponet Oct 1, 2019
523da53
code cleanup
louisponet Oct 1, 2019
29aaefa
Apply suggestions from code review
louisponet Oct 2, 2019
9770b81
corrected copy, in, code cleanup, removed complement
louisponet Oct 2, 2019
37e8af7
cleaned up imports
louisponet Oct 3, 2019
3b55ec9
immutable zip iterator, semver bump, removed entity_id
louisponet Oct 3, 2019
74ecdbf
Update Project.toml
louisponet Oct 4, 2019
e002e38
length better length in iterator
louisponet Oct 4, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ This package implements a variety of data structures, including
- DataStructures.IntSet
- Priority Queue
- Fenwick Tree
- SparseIntSet

Resources
---------
Expand Down
69 changes: 69 additions & 0 deletions benchmark/benchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,72 @@ SUITE[["heap","mutable", "min", "push"]] =
@benchmarkable push_heap(h, $xs) setup=(h=MutableBinaryMinHeap{Float64}())
SUITE[["heap","mutable", "min", "pop"]] =
@benchmarkable pop_heap(h) setup=(h=MutableBinaryMinHeap{Float64}($xs))

# Benchmark group collecting every SparseIntSet benchmark defined below.
SUITE["SparseIntSet"] = BenchmarkGroup()

# Seeds the RNG and draws two vectors of 1000 random ids in 1:30000.
# NOTE(review): this parenthesised block is evaluated once, right here, so
# `ids1`/`ids2` become globals and `rand_setup` is bound to the block's value.
# Presumably `setup=rand_setup` below then only sees the symbol rather than
# these statements -- confirm against the BenchmarkTools manual.
rand_setup = (
Random.seed!(1234);
ids1 = rand(1:30000, 1000);
ids2 = rand(1:30000, 1000);
)

# Benchmark kernel: build a fresh SparseIntSet and insert every id of `ids1`.
# The set is discarded; the function returns `nothing`.
function create_fill_packed(ids1)
    set = SparseIntSet()
    foreach(id -> push!(set, id), ids1)
    return nothing
end

# Cost of creating a set and pushing all ids of `ids1` into it.
SUITE["SparseIntSet"]["create_fill"] =
@benchmarkable create_fill_packed(ids1) setup=rand_setup

# Membership test on an empty set (the id is absent).
SUITE["SparseIntSet"]["in while not in"] =
@benchmarkable in(23, y) evals=1000 setup=(y = SparseIntSet();)
# Membership test for an id that was pushed during setup.
SUITE["SparseIntSet"]["in while in"] =
@benchmarkable in(5199, y) evals=1000 setup=(y=SparseIntSet(); push!(y, 5199))

# Benchmark kernel: remove the id 5199 from `y` and insert it again.
# Returns whatever `push!` returns.
function pop_push(y)
    key = 5199
    pop!(y, key)
    return push!(y, key)
end

SUITE["SparseIntSet"]["pop push"] = @benchmarkable pop_push(y) setup=(y=SparseIntSet(); push!(y, 5199))

# Benchmark kernel: iterate `x` once and return the left-to-right sum of its
# elements, starting from the integer 0.
function iterate_one_bench(x)
    return foldl(+, x; init=0)
end
# Benchmark kernel: iterate `zip(x, y)` and return the sum of `ix + iy` over
# every produced pair, starting from the integer 0.
function iterate_two_bench(x,y)
    pair_sums = (ix + iy for (ix, iy) in zip(x, y))
    return foldl(+, pair_sums; init=0)
end
# Benchmark kernel: like `iterate_two_bench`, but the zipped iteration is asked
# to skip every id contained in `z` through the `exclude` keyword.
function iterate_two_exclude_one_bench(x,y,z)
    zipped = zip(x, y, exclude=(z,))
    pair_sums = (ix + iy for (ix, iy) in zipped)
    return foldl(+, pair_sums; init=0)
end

# Seeds the RNG and builds three sets, each from 1000 random ids in 1:30000.
# NOTE(review): as with `rand_setup`, this block runs once here and defines
# `x`, `y`, `z` as globals; presumably `setup=x_y_z_setup` only passes the
# symbol to the macro -- confirm against the BenchmarkTools manual.
x_y_z_setup = (
Random.seed!(1234);
x = SparseIntSet(rand(1:30000, 1000));
y = SparseIntSet(rand(1:30000, 1000));
z = SparseIntSet(rand(1:30000, 1000));
)

# Plain iteration over a single set.
SUITE["SparseIntSet"]["iterate one"] =
@benchmarkable iterate_one_bench(x) setup=x_y_z_setup

# Zipped iteration over two sets.
SUITE["SparseIntSet"]["iterate two"] =
@benchmarkable iterate_two_bench(x,y) setup=x_y_z_setup

# Zipped iteration over two sets while excluding the ids of a third.
SUITE["SparseIntSet"]["iterate two exclude one"] =
@benchmarkable iterate_two_exclude_one_bench(x,y,z) setup=x_y_z_setup

1 change: 1 addition & 0 deletions docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,6 @@ Pages = [
"mutable_linked_list.md"
"intset.md",
"sorted_containers.md",
"sparse_int_set.md"
]
```
9 changes: 9 additions & 0 deletions docs/src/sparse_int_set.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# DataStructures.SparseIntSet

Implementation of a __Sparse Integer Set__, for background see [Sparse Sets](https://www.computist.xyz/2018/06/sparse-sets.html).
Only positive non-zero `Int`s are allowed inside the set.
The idea is to have one **packed** `Vector` storing all the `Int`s contained in the set as to allow for fast iteration, and a sparse, paged **reverse** `Vector` with the position of a particular `Int` inside the **packed** `Vector`. This allows for very fast iteration, insertion and deletion of indices.
Most behavior is similar to a normal `IntSet`, however `collect`, `first` and `last` are with respect to the **packed** vector, in which the ordering is not guaranteed.
The **reverse** `Vector` is paged, meaning that it is a `Vector{Vector{Int}}` where each of the `Vector{Int}`s has the length of one memory page of `Int`s. Every time an index is added that is not yet in the range of the already present pages, a new page will be created and added to the **reverse** `Vector`, allowing for dynamic growth.
If all the indices on a particular page are deleted from the set, it will not automatically get cleaned up for performance reasons. The `cleanup!` method is provided to facilitate the reclaiming of memory of a page when all its corresponding indices have been deleted from the set.
louisponet marked this conversation as resolved.
Show resolved Hide resolved
The `complement` of a `SparseIntSet` is defined with respect to the pages that are in use.
2 changes: 2 additions & 0 deletions src/DataStructures.jl
Original file line number Diff line number Diff line change
Expand Up @@ -104,4 +104,6 @@ module DataStructures
export PriorityQueue, peek

include("priorityqueue.jl")
include("sparse_int_set.jl")
export SparseIntSet, cleanup!
end
262 changes: 262 additions & 0 deletions src/sparse_int_set.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,262 @@
import Base: @propagate_inbounds, zip
louisponet marked this conversation as resolved.
Show resolved Hide resolved

const INT_PER_PAGE = div(ccall(:jl_getpagesize, Clong, ()), sizeof(Int))

#TODO: Batch creation and allocation
louisponet marked this conversation as resolved.
Show resolved Hide resolved
struct SparseIntSet
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I suggested making this immutable, I was wrong.
Looking at how much more complex the code had to become, that was a mistake on my part.
you can probably just revert that commit

packed ::Vector{Int}
reverse::Vector{Vector{Int}}
end

# Construct an empty set: no packed ids and no reverse pages.
SparseIntSet() = SparseIntSet(Int[], Vector{Int}[])

# Construct a set by adding every element of `indices` through `union!`.
SparseIntSet(indices) = union!(SparseIntSet(), indices)

# The element type of a SparseIntSet is `Int`.
eltype(::Type{SparseIntSet}) = Int

# Return a new, empty set; the argument is used for dispatch only.
empty(::SparseIntSet) = SparseIntSet()
louisponet marked this conversation as resolved.
Show resolved Hide resolved

# Remove every element from `s` in place and return `s`.
#
# The packed vector is emptied and every allocated reverse page is reset to
# zeros; the pages themselves are kept so they can be reused by later pushes.
# Page slots may be unassigned (`assure!` grows `s.reverse` with `resize!`
# before pushing a page), so slots are checked with `isassigned` instead of
# iterating the vector directly, which would throw an `UndefRefError`.
function empty!(s::SparseIntSet)
    empty!(s.packed)
    for pageid in eachindex(s.reverse)
        if isassigned(s.reverse, pageid)
            fill!(s.reverse[pageid], 0)
        end
    end
    return s
end

# A set is empty exactly when its packed vector holds no ids.
isempty(s::SparseIntSet) = isempty(s.packed)

# Return an independent copy of `s`, built by `copy!` into a fresh set.
copy(s::SparseIntSet) = copy!(SparseIntSet(), s)

# Make `to` an independent copy of `from` and return `to`.
#
# The packed ids are copied element-wise and every assigned reverse page of
# `from` is duplicated, so later mutation of one set does not affect the other.
#
# `to.reverse` is emptied before being resized. Without that step, a page that
# `to` already held at an index where `from` has no page would be left in
# place, and `in` would keep reporting ids that are not in `to.packed`.
function copy!(to::SparseIntSet, from::SparseIntSet)
    # Copying a set onto itself is a no-op; emptying the pages would lose data.
    to === from && return to

    resize!(to.packed, length(from.packed))
    copyto!(to.packed, from.packed)

    # Drop every old page so that slots not overwritten below are unassigned.
    empty!(to.reverse)
    resize!(to.reverse, length(from.reverse))
    for pageid in eachindex(from.reverse)
        if isassigned(from.reverse, pageid)
            to.reverse[pageid] = copy(from.reverse[pageid])
        end
    end
    return to
end

# Map the id `i` to `(pageid, offset)`: the 1-based index of its reverse page
# and the 1-based slot inside that page. The set argument is not consulted.
# The offset is obtained by masking with `INT_PER_PAGE - 1`.
function pageid_offset(s::SparseIntSet, i)
    zero_based = i - 1
    offset = (zero_based & (INT_PER_PAGE - 1)) + 1
    return div(zero_based, INT_PER_PAGE) + 1, offset
end

# Return `true` when the id `i` is contained in `s`.
#
# Only positive ids can be stored, so non-positive values are rejected up
# front. Without this guard, `pageid_offset` maps `0` (and small negative
# values) onto slots of the first page, e.g. `0` lands on the slot of
# `INT_PER_PAGE`, and membership would be reported for the wrong id.
function in(i, s::SparseIntSet)
    i > 0 || return false
    pageid, offset = pageid_offset(s, i)
    # `isassigned` covers both "page index out of range" and "page never created".
    isassigned(s.reverse, pageid) || return false
    return @inbounds s.reverse[pageid][offset] != 0
end

# Number of ids in the set, i.e. the length of the packed vector.
length(s::SparseIntSet) = length(s.packed)

# This makes sure that when adding (pushing) an Int,
# its respective page is allocated and put at the index of the reverse such that
# pageid_offset works as intended.
# Pages will be allocated only once, when pushing an Int that belongs to them.
# Other not used pages (created during resize!) will be undefs until one Int belonging to them gets added.
function assure!(s::SparseIntSet, pageid)
oxinabox marked this conversation as resolved.
Show resolved Hide resolved
if pageid > length(s.reverse)
resize!(s.reverse, pageid - 1)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably better not to do that? push! will resize for you.
And it has heuristics to better handle the resizing

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure I understand what you mean, do you mean to move assure! into push! I guess that's ok since I don't think it's used anywhere else

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no, I mean why are we doing resize! then push! ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Before this was to generate the undefs but now indeed i'm just using push! after sizehint!, I don't know if that's optimal

Copy link
Member

@oxinabox oxinabox Oct 2, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it is, basically,
Marginally better is resize! + assignment to index
but you miss out on the heuristics of how push! will do extra resizing under the hood when it has to grow (I think sizehint! actually might also block those since it will grow early)
and that extra resizing under the hood gives a speedup between separate calls

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

did a quick benchmark with doing resize! followed by assignment to index, difference is basically swallowed by noise on the benchmark it seems.

p = zeros(Int, INT_PER_PAGE)
push!(s.reverse, p)
return p, true
elseif !isassigned(s.reverse, pageid)
p = zeros(Int, INT_PER_PAGE)
@inbounds s.reverse[pageid] = p
return p, true
end
return @inbounds s.reverse[pageid], false
end

# Insert the positive id `i` into `s` and return `s`. Inserting an id that is
# already present leaves the set unchanged.
#
# Throws a `DomainError` carrying the offending value when `i <= 0`.
function push!(s::SparseIntSet, i::Integer)
    # DomainError takes the offending value first and the message second.
    i <= 0 && throw(DomainError(i, "Only positive Ints allowed."))
    pageid, offset = pageid_offset(s, i)
    page, newly_created = assure!(s, pageid)
    if newly_created || page[offset] == 0
        # The reverse slot stores the position the id will have in `packed`.
        @inbounds page[offset] = length(s) + 1
        push!(s.packed, i)
    end
    return s
end
# Insert several ids one after another; returns `s`.
push!(s::SparseIntSet, is::Integer...) = (for i in is; push!(s, i); end; return s)

# Remove and return the id stored last in the packed vector.
# Throws an `ArgumentError` when the set is empty.
function pop!(s::SparseIntSet)
    isempty(s) && throw(ArgumentError("Cannot pop an empty set."))
    removed = pop!(s.packed)
    pageid, offset = pageid_offset(s, removed)
    # Clear the reverse slot so the id is no longer reported as a member.
    @inbounds s.reverse[pageid][offset] = 0
    return removed
end

# Remove the id `id` from `s` and return it.
#
# The hole left in the packed vector is filled with the last packed id, whose
# reverse slot is updated accordingly, and the packed vector is shortened.
#
# Throws an `ArgumentError` when `id` is not positive. Zero is rejected too:
# `pageid_offset` would map it onto the slot of `INT_PER_PAGE` and the wrong
# element could be removed. Throws a `BoundsError` when `id` is not in the set
# (this check sits in a `@boundscheck` block).
function pop!(s::SparseIntSet, id::Integer)
    id <= 0 && throw(ArgumentError("Int to pop needs to be positive."))

    @boundscheck if !in(id, s)
        throw(BoundsError(s, id))
    end
    @inbounds begin
        packed_endid = s.packed[end]
        from_page, from_offset = pageid_offset(s, id)
        to_page, to_offset = pageid_offset(s, packed_endid)

        # Move the last packed id into the slot of the removed id ...
        packed_id = s.reverse[from_page][from_offset]
        s.packed[packed_id] = packed_endid
        s.reverse[to_page][to_offset] = packed_id
        # ... and clear the removed id last, so `id == packed_endid` also works.
        s.reverse[from_page][from_offset] = 0
        pop!(s.packed)
    end
    return id
end

# Remove `id` from `s` and return it, or return `default` when `id` is not in
# the set.
#
# Throws an `ArgumentError` for negative ids. Zero can never be a member, so it
# yields `default` without consulting the reverse pages; a lookup of `0` would
# hit the slot of `INT_PER_PAGE` and could remove that element instead.
function pop!(s::SparseIntSet, id::Integer, default)
    id < 0 && throw(ArgumentError("Int to pop needs to be positive."))
    return (id > 0 && in(id, s)) ? (@inbounds pop!(s, id)) : default
end
# Remove and return `first(s)`.
# NOTE(review): `first` is not defined in the visible part of this file;
# presumably it falls back on `iterate` below -- confirm.
popfirst!(s::SparseIntSet) = pop!(s, first(s))

# Iterating a set iterates its packed vector.
iterate(set::SparseIntSet, args...) = iterate(set.packed, args...)

# Last id of the packed vector; throws an `ArgumentError` on an empty set.
last(s::SparseIntSet) = isempty(s) ? throw(ArgumentError("Empty set has no last element.")) : last(s.packed)

# Non-mutating union: copy `s`, then add every element of `ns`.
union(s::SparseIntSet, ns) = union!(copy(s), ns)
# Add every element of `ns` to `s` with `push!` and return `s`.
union!(s::SparseIntSet, ns) = (for n in ns; push!(s, n); end; s)
louisponet marked this conversation as resolved.
Show resolved Hide resolved

# Intersection of a single set is a copy of that set.
intersect(s1::SparseIntSet) = copy(s1)
# Several arguments: intersect the remaining collections first, then intersect
# `s1` with that result.
intersect(s1::SparseIntSet, ss...) = intersect(s1, intersect(ss...))
# Return a new set holding every element of `ns` that is also in `s1`.
# Elements are inserted in the iteration order of `ns`.
function intersect(s1::SparseIntSet, ns)
    common = SparseIntSet()
    for candidate in ns
        if candidate in s1
            push!(common, candidate)
        end
    end
    return common
end

# In-place intersection with several collections: intersect the remaining
# collections first, then intersect `s1` with that result in place.
intersect!(s1::SparseIntSet, ss...) = intersect!(s1, intersect(ss...))

#Is there a more performant way to do this?
# In-place intersection: computes the intersection as a new set and copies it
# back into `s1` with `copy!`.
intersect!(s1::SparseIntSet, ns) = copy!(s1, intersect(s1, ns))

# Non-mutating difference: copy `s`, then remove every element of `ns`.
setdiff(s::SparseIntSet, ns) = setdiff!(copy(s), ns)
# Remove every element of `ns` from `s`; elements not in `s` are ignored via
# the `nothing` default of `pop!`. Returns `s`.
setdiff!(s::SparseIntSet, ns) = (for n in ns; pop!(s, n, nothing); end; s)
louisponet marked this conversation as resolved.
Show resolved Hide resolved

# Two sets are equal when they have the same number of ids and every id of
# `s2` is also contained in `s1`. The order of the packed vectors is ignored.
function ==(s1::SparseIntSet, s2::SparseIntSet)
    if length(s1) != length(s2)
        return false
    end
    for x in s2
        x in s1 || return false
    end
    return true
end

# `a` is a subset of `b` when every id of `a` is contained in `b`.
# This checks membership directly instead of building `intersect(a, b)` and
# comparing it with `a`, which gives the same answer but allocates a whole set.
issubset(a::SparseIntSet, b::SparseIntSet) = all(x -> x in b, a)

# Return a new set holding every id that is NOT in `a`, restricted to the id
# range covered by `a.reverse` (ids `1:length(a.reverse) * INT_PER_PAGE`).
complement(a::SparseIntSet) = complement!(SparseIntSet(), a)

# Store the complement of `a` (see `complement`) in `b` and return `b`.
# Any previous content of `b` is discarded.
function complement!(b::SparseIntSet, a::SparseIntSet)
    # `b` is rebuilt from scratch, so work on a snapshot when both are the same set.
    b === a && (a = copy(a))
    empty!(b.packed)
    empty!(b.reverse)
    for i in eachindex(a.reverse)
        if !isassigned(a.reverse, i)
            # No id of this page is in `a`: add the whole page in one go.
            first_pos = length(b.packed) + 1
            new_ids = ((i - 1) * INT_PER_PAGE + 1):(i * INT_PER_PAGE)
            append!(b.packed, new_ids)
            # Only ever grow `b.reverse`; shrinking would drop pages added earlier.
            length(b.reverse) < i && resize!(b.reverse, i)
            # A reverse page stores positions inside `packed`, not the ids.
            b.reverse[i] = collect(first_pos:(first_pos + INT_PER_PAGE - 1))
        else
            for offset in 1:INT_PER_PAGE
                if a.reverse[i][offset] == 0
                    push!(b, INT_PER_PAGE * (i - 1) + offset)
                end
            end
        end
    end
    return b
end
#Can this be optimized?
# Replace `a` by its own complement, in place.
complement!(a::SparseIntSet) = copy!(a, complement(a))

# Strict subset: `a` is a subset of `b` and the two sets are not equal.
<(a::SparseIntSet, b::SparseIntSet) = (a<=b) && !isequal(a,b)
louisponet marked this conversation as resolved.
Show resolved Hide resolved
# `a <= b` is the subset relation.
<=(a::SparseIntSet, b::SparseIntSet) = issubset(a, b)

# Return the position of the id `i` inside `s.packed`, or 0 when the page that
# would hold `i` does not exist. A zero slot on an existing page is returned
# as-is, so 0 also means "not in the set" in that case.
function findfirst_packed_id(i, s::SparseIntSet)
    pageid, offset = pageid_offset(s, i)
    isassigned(s.reverse, pageid) || return 0
    return @inbounds s.reverse[pageid][offset]
end

# Release reverse pages that no longer hold any id and return `s`.
#
# A page counts as used when its slot is assigned and it contains at least one
# non-zero entry. Pages after the last used one are dropped, unused pages in
# between become unassigned slots, and with no used page at all `s.reverse`
# ends up empty.
function cleanup!(s::SparseIntSet)
    pages = s.reverse
    inuse = Bool[isassigned(pages, i) && any(!iszero, pages[i]) for i in eachindex(pages)]
    last_page_id = findlast(inuse)
    if last_page_id === nothing
        empty!(pages)
        return s
    end

    # Park the surviving pages, then rebuild `pages` so every other slot is unassigned.
    kept = Vector{Vector{Int}}(undef, last_page_id)
    for i in 1:last_page_id
        if inuse[i]
            kept[i] = pages[i]
        end
    end
    empty!(pages)
    resize!(pages, last_page_id)
    for i in eachindex(kept)
        if isassigned(kept, i)
            pages[i] = kept[i]
        end
    end
    return s
end

# Return the ids of the set as a new `Vector{Int}`, in packed order.
# A copy is returned on purpose: handing out `s.packed` itself would let
# callers mutate the set's internal storage behind its back.
collect(s::SparseIntSet) = copy(s.packed)
louisponet marked this conversation as resolved.
Show resolved Hide resolved

# Iterator over the ids shared by several SparseIntSets.
#
# Fields:
# - `current_id`: id produced by the most recent successful `iterate` call
#   (starts at zero).
# - `valid_sets`: tuple of the sets being zipped.
# - `shortest_set`: the set of `valid_sets` with the fewest ids; iteration
#   walks its packed vector.
# - `excluded_sets`: tuple of sets whose ids are skipped.
mutable struct ZippedSparseIntSetIterator{VT,IT}
    current_id::Int
    valid_sets::VT
    shortest_set::SparseIntSet
    excluded_sets::IT
    function ZippedSparseIntSetIterator(valid_sets::SparseIntSet...;exclude::NTuple{N, SparseIntSet}=()) where{N}
        set_lengths = map(length, valid_sets)
        shortest = valid_sets[argmin(set_lengths)]
        start_id = zero(eltype(shortest))
        return new{typeof(valid_sets), typeof(exclude)}(start_id, valid_sets, shortest, exclude)
    end
end

# Zipping SparseIntSets builds a ZippedSparseIntSetIterator; keyword arguments
# (such as `exclude`) are forwarded to its constructor.
zip(s::SparseIntSet...;kwargs...) = ZippedSparseIntSetIterator(s...;kwargs...)

# Length of the shortest zipped set. `iterate` skips ids that are missing from
# another set or excluded, so this is an upper bound on the items produced.
@inline length(it::ZippedSparseIntSetIterator) = length(it.shortest_set)

# Fast path: with an empty tuple of excluded sets nothing is ever excluded.
in_excluded(id, it::ZippedSparseIntSetIterator{VT,Tuple{}}) where {VT} = false
louisponet marked this conversation as resolved.
Show resolved Hide resolved

# Return `true` when `id` is contained in at least one of `it.excluded_sets`.
function in_excluded(id, it)
    return any(excluded -> id in excluded, it.excluded_sets)
end

# For iteration state `state`, return the id at that position of the shortest
# set's packed vector together with a tuple holding, for each zipped set, the
# position of that id in the set's packed vector (0 when it is absent).
@inline function id_tids(it, state)
    id = it.shortest_set.packed[state]
    tids = map(set -> findfirst_packed_id(id, set), it.valid_sets)
    return id, tids
end

# Advance the zipped iteration.
#
# Starting at `state`, walk the shortest set's packed vector until an id is
# found that is present in every zipped set and in none of the excluded sets.
# Returns `(tids, next_state)`, where `tids` holds the packed position of that
# id in each zipped set, or `nothing` once the shortest set is exhausted.
# The id found is also recorded in `it.current_id`.
@propagate_inbounds function iterate(it::ZippedSparseIntSetIterator, state=1)
    while state <= length(it)
        id, tids = id_tids(it, state)
        if all(tid -> tid != 0, tids) && !in_excluded(id, it)
            it.current_id = id
            return tids, state + 1
        end
        state += 1
    end
    return nothing
end
3 changes: 2 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import DataStructures: IntSet
@test [] == detect_ambiguities(Base, Core, DataStructures)

tests = ["int_set",
"sparse_int_set",
"deque",
"circ_deque",
"sorted_containers",
Expand All @@ -28,7 +29,7 @@ tests = ["int_set",
"sorting",
"priority_queue",
"fenwick",
"robin_dict"
"robin_dict",
]

if length(ARGS) > 0
Expand Down
Loading