Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add IntSet to DataStructures #114

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ This package implements a variety of data structures, including
* Trie
* Linked List
* Sorted Dict, Sorted Multi-Dict and Sorted Set
* DataStructures.IntSet

------
Deque
Expand Down Expand Up @@ -363,6 +364,21 @@ A list of sequentially linked nodes. This allows efficient insertion of nodes to
julia> for i in l5; print(i); end
246

---------------------
DataStructures.IntSet
---------------------

``DataStructures.IntSet`` is a drop-in replacement for the Base ``IntSet`` type. It
efficiently stores dense collections of small non-negative ``Int``\ s as a sorted
set. The constructor ``IntSet([itr])`` constructs a sorted set of the integers
generated by the given iterable object, or an empty set if no argument is
given. If the set will be sparse (for example holding a few very large
integers), use ``Set`` or ``SortedSet`` instead.

A complement IntSet may be constructed with ``complement`` or ``complement!``. The
complement of an empty ``IntSet`` contains ``typemax(Int)`` elements from 0 to
``typemax(Int)-1``.

----------------------------------------
Overview of Sorted Containers
----------------------------------------
Expand Down
3 changes: 2 additions & 1 deletion src/DataStructures.jl
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ module DataStructures
include("defaultdict.jl")
include("trie.jl")

include("intset.jl")

include("list.jl")
include("balancedTree.jl")
include("tokens.jl")
Expand All @@ -81,7 +83,6 @@ module DataStructures

export status
export deref_key, deref_value, deref, advance, regress


@deprecate stack Stack
@deprecate queue Queue
Expand Down
253 changes: 253 additions & 0 deletions src/intset.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@
# This file was a part of Julia. License is MIT: http://julialang.org/license

import Base: similar, copy, copy!, eltype, push!, pop!, delete!, shift!,
empty!, isempty, union, union!, intersect, intersect!,
setdiff, setdiff!, symdiff, symdiff!, in, start, next, done,
last, length, show, hash, issubset, ==, <=, <, unsafe_getindex,
unsafe_setindex!, findnextnot, first
# These names will be removed from Base in the future:
import Base: complement, complement!

# A set of non-negative integers stored as a bit vector.
#   bits    -- bit n+1 records the integer n
#   inverse -- when true, membership is flipped: n belongs to the set exactly
#              when bit n+1 is *not* set (bits past the end count as unset)
type IntSet
    bits::BitVector
    inverse::Bool
    # Empty, non-inverted set with 256 zeroed bits
    IntSet() = new(falses(256), false)
end

# Build a set holding every integer produced by the iterable `itr`
function IntSet(itr)
    s = IntSet()
    union!(s, itr)
end

# A fresh, empty IntSet; the contents of `s` are ignored
function similar(s::IntSet)
    IntSet()
end

# Independent copy of `s1`
function copy(s1::IntSet)
    dup = IntSet()
    copy!(dup, s1)
end

# Overwrite `to` with the bits and inversion flag of `from`; returns `to`
function copy!(to::IntSet, from::IntSet)
    nbits = length(from.bits)
    resize!(to.bits, nbits)
    copy!(to.bits, from.bits)
    to.inverse = from.inverse
    return to
end

# Elements are always reported as `Int`
function eltype(s::IntSet)
    Int
end
# Make room for the integers 0:n without changing the contents of `s`.
# The bit vector is only ever grown: resizing it to a shorter length would
# throw away stored bits and thereby change which integers are members, and a
# negative `n` would request a negative length. Returns `s`.
function sizehint(s::IntSet, n::Integer)
    wanted = n + 1
    if wanted > length(s.bits)
        _resize0!(s.bits, wanted)
    end
    return s
end

# only required on 0.3:
# First element in iteration order; throws ArgumentError for an empty set
function first(itr::IntSet)
    st = start(itr)
    if done(itr, st)
        throw(ArgumentError("collection must be non-empty"))
    end
    (elem, nxt) = next(itr, st)
    elem
end

# Internal: store `b` as the raw bit for integer n (bit index n+1) and return
# `s`. The bit vector is grown on demand. There is no lower-bound check here,
# so callers are responsible for passing n >= 0.
@inline function _setint!(s::IntSet, n::Integer, b::Bool)
    idx = n + 1
    if idx > length(s.bits)
        # clearing a bit that lies past the stored bits changes nothing
        b || return s
        # grow to about 1.5x the needed index; the sum may wrap negative,
        # in which case saturate at typemax(Int)
        grown = idx + (idx >> 1)
        _resize0!(s.bits, grown < 0 ? typemax(Int) : grown)
    end
    unsafe_setindex!(s.bits, b, idx) # Use @inbounds once available
    return s
end

# Internal: resize the bit vector `b` to `newlen` and zero any newly added
# elements (resize! alone leaves them uninitialised; this helper becomes
# unnecessary if that behavior changes). Returns `b`.
@inline function _resize0!(b::BitVector, newlen::Integer)
    oldlen = length(b)
    resize!(b, newlen)
    if newlen > oldlen
        unsafe_setindex!(b, false, oldlen+1:newlen) # resize! gives dirty memory
    end
    return b
end

# Internal: force the bit array `b` to have exactly `newlen` elements.
# When `b` is shortened the removed tail is returned; otherwise (grown with
# zeroed bits, or already the right length) an empty bitvector is returned.
function _matchlength!(b::BitArray, newlen::Integer)
    oldlen = length(b)
    if oldlen > newlen
        return splice!(b, newlen+1:oldlen)
    elseif oldlen < newlen
        _resize0!(b, newlen)
    end
    BitVector(0)
end

# Message used whenever an element falls outside the representable range
const _intset_bounds_err_msg = "elements of IntSet must be between 0 and typemax(Int)-1"

# Add `n` to the set; throws ArgumentError unless 0 <= n < typemax(Int).
# For a complement set "adding" means clearing the bit, hence `!s.inverse`.
function push!(s::IntSet, n::Integer)
    if n < 0 || n >= typemax(Int)
        throw(ArgumentError(_intset_bounds_err_msg))
    end
    _setint!(s, n, !s.inverse)
end

# Add several integers one after another; returns `s`
function push!(s::IntSet, ns::Integer...)
    for n in ns
        push!(s, n)
    end
    s
end

# Remove and return the last element. Refused for complement sets.
function pop!(s::IntSet)
    if s.inverse
        throw(ArgumentError("cannot pop the last element of complement IntSet"))
    end
    pop!(s, last(s))
end

# Remove and return `n`; throws KeyError when `n` is not a member and
# ArgumentError when `n` is outside 0:typemax(Int)-1.
function pop!(s::IntSet, n::Integer)
    (n < 0 || n >= typemax(Int)) && throw(ArgumentError(_intset_bounds_err_msg))
    if !(n in s)
        throw(KeyError(n))
    end
    _delete!(s, n)
    return n
end

# Remove and return `n`, or return `default` when `n` is not a member
function pop!(s::IntSet, n::Integer, default)
    (n < 0 || n >= typemax(Int)) && throw(ArgumentError(_intset_bounds_err_msg))
    if n in s
        _delete!(s, n)
        return n
    end
    return default
end

# Remove and return `n`, or return the result of `f()` when `n` is not a member
function pop!(f::Function, s::IntSet, n::Integer)
    (n < 0 || n >= typemax(Int)) && throw(ArgumentError(_intset_bounds_err_msg))
    if n in s
        _delete!(s, n)
        return n
    end
    return f()
end
# Internal: remove `n` without any range check (callers must ensure
# 0 <= n < typemax(Int)). For a complement set removal means setting the bit.
_delete!(s::IntSet, n::Integer) = _setint!(s, n, s.inverse)

# Remove `n` from the set if present; returns `s`.
# Integers outside 0:typemax(Int)-1 can never be members, so they are a
# no-op. The upper bound matters: for n == typemax(Int) the index n+1 wraps
# around to typemin(Int) and would reach the unchecked bit write in _setint!.
function delete!(s::IntSet, n::Integer)
    0 <= n < typemax(Int) || return s
    _delete!(s, n)
end

# Remove and return the first element in iteration order
shift!(s::IntSet) = pop!(s, first(s))

# Clear every bit and drop the inversion flag; returns `s`
function empty!(s::IntSet)
    fill!(s.bits, false)
    s.inverse = false
    return s
end

# A plain set is empty when no bit is set. A complement set is empty only
# when the bit vector spans typemax(Int) bits and all of them are set.
function isempty(s::IntSet)
    if s.inverse
        return length(s.bits) == typemax(Int) && all(s.bits)
    end
    return !any(s.bits)
end

# Mathematical set functions: union!, intersect!, setdiff!, symdiff!
# When applied to two intsets, these all have a similar form:
# - Reshape s1 to match s2, occasionally grabbing the bits that were removed
# - Use map to apply some bitwise operation across the entire bitvector
# - These operations use functors to work on the bitvector chunks, so are
# very efficient... but a little untraditional. E.g., (p > q) => (p & ~q)
# - If needed, append the removed bits back to s1 or invert the array

# Non-mutating union with any iterable of integers
function union(s::IntSet, ns)
    union!(copy(s), ns)
end

# Add every element of the iterable `ns` to `s`; returns `s`
function union!(s::IntSet, ns)
    for n in ns
        push!(s, n)
    end
    s
end

# In-place union of two IntSets, done directly on the bit vectors.
# With b1/b2 the raw bits and ~ denoting complement:
#   plain ∪ plain   -> b1 | b2
#   compl ∪ plain   -> ~(b1 & ~b2)   (bits: b1 > b2, still a complement)
#   plain ∪ compl   -> ~(~b1 & b2)   (bits: b1 < b2, becomes a complement)
#   compl ∪ compl   -> ~(b1 & b2)
function union!(s1::IntSet, s2::IntSet)
    n2 = length(s2.bits)
    if s2.inverse
        # every result bit past n2 is clear, so s1 is simply cut/grown to n2
        _resize0!(s1.bits, n2)
        if s1.inverse
            map!(&, s1.bits, s1.bits, s2.bits)
        else
            map!(<, s1.bits, s1.bits, s2.bits)
            s1.inverse = true
        end
    else
        # bits of s1 past n2 are unaffected: set them aside and re-attach
        tail = _matchlength!(s1.bits, n2)
        if s1.inverse
            map!(>, s1.bits, s1.bits, s2.bits)
        else
            map!(|, s1.bits, s1.bits, s2.bits)
        end
        append!(s1.bits, tail)
    end
    s1
end

# Intersection of a single set is just a copy of it
function intersect(s1::IntSet)
    copy(s1)
end

# Variadic form: fold the trailing arguments first, then intersect with `s1`
function intersect(s1::IntSet, ss...)
    rest = intersect(ss...)
    intersect(s1, rest)
end

# Intersection with an arbitrary iterable: keep the elements of `ns` that
# are members of `s1`, collected into a new IntSet
function intersect(s1::IntSet, ns)
    kept = IntSet()
    for n in ns
        if n in s1
            push!(kept, n)
        end
    end
    kept
end

# Two IntSets: copy the first and intersect in place
function intersect(s1::IntSet, s2::IntSet)
    intersect!(copy(s1), s2)
end
# In-place intersection of two IntSets, done directly on the bit vectors.
# With b1/b2 the raw bits and ~ denoting complement:
#   plain ∩ plain   -> b1 & b2
#   compl ∩ plain   -> ~b1 & b2      (bits: b1 < b2, becomes a plain set)
#   plain ∩ compl   -> b1 & ~b2      (bits: b1 > b2)
#   compl ∩ compl   -> ~(b1 | b2)
function intersect!(s1::IntSet, s2::IntSet)
    n2 = length(s2.bits)
    if s2.inverse
        # bits of s1 past n2 are unaffected: set them aside and re-attach
        tail = _matchlength!(s1.bits, n2)
        if s1.inverse
            map!(|, s1.bits, s1.bits, s2.bits)
        else
            map!(>, s1.bits, s1.bits, s2.bits)
        end
        append!(s1.bits, tail)
    else
        # every result bit past n2 is clear, so s1 is simply cut/grown to n2
        _resize0!(s1.bits, n2)
        if s1.inverse
            map!(<, s1.bits, s1.bits, s2.bits)
            s1.inverse = false
        else
            map!(&, s1.bits, s1.bits, s2.bits)
        end
    end
    s1
end

# Non-mutating set difference with any iterable of integers
setdiff(s::IntSet, ns) = setdiff!(copy(s), ns)

# Remove every element of the iterable `ns` from `s`; returns `s`.
# Values outside 0:typemax(Int)-1 can never be members and are skipped.
# The check is required, not cosmetic: _delete! performs no range check, so a
# negative value (or typemax(Int), whose index n+1 wraps around) would reach
# an unchecked bit write at an index below 1.
function setdiff!(s::IntSet, ns)
    for n in ns
        if 0 <= n < typemax(Int)
            _delete!(s, n)
        end
    end
    s
end
# In-place difference s1 \ s2 of two IntSets, done on the bit vectors.
# With b1/b2 the raw bits and ~ denoting complement:
#   plain \ plain   -> b1 & ~b2      (bits: b1 > b2)
#   compl \ plain   -> ~(b1 | b2)
#   plain \ compl   -> b1 & b2
#   compl \ compl   -> ~b1 & b2      (bits: b1 < b2, becomes a plain set)
function setdiff!(s1::IntSet, s2::IntSet)
    n2 = length(s2.bits)
    if s2.inverse
        # every result bit past n2 is clear, so s1 is simply cut/grown to n2
        _resize0!(s1.bits, n2)
        if s1.inverse
            map!(<, s1.bits, s1.bits, s2.bits)
            s1.inverse = false
        else
            map!(&, s1.bits, s1.bits, s2.bits)
        end
    else
        # bits of s1 past n2 are unaffected: set them aside and re-attach
        tail = _matchlength!(s1.bits, n2)
        if s1.inverse
            map!(|, s1.bits, s1.bits, s2.bits)
        else
            map!(>, s1.bits, s1.bits, s2.bits)
        end
        append!(s1.bits, tail)
    end
    s1
end

# Non-mutating symmetric difference with any iterable of integers
function symdiff(s::IntSet, ns)
    symdiff!(copy(s), ns)
end

# Toggle membership of every element of the iterable `ns`; returns `s`
function symdiff!(s::IntSet, ns)
    for n in ns
        symdiff!(s, n)
    end
    s
end

# Toggle membership of the single integer `n`; throws ArgumentError unless
# 0 <= n < typemax(Int)
function symdiff!(s::IntSet, n::Integer)
    if n < 0 || n >= typemax(Int)
        throw(ArgumentError(_intset_bounds_err_msg))
    end
    # raw bit to store: `s.inverse` removes n, `!s.inverse` adds it
    newbit = (n in s) $ !s.inverse
    _setint!(s, n, newbit)
    return s
end

# In-place symmetric difference of two IntSets: xor the overlapping bits,
# keep the part of s1 beyond s2 as is, and flip the inversion flag of s1
# when s2 is a complement set
function symdiff!(s1::IntSet, s2::IntSet)
    tail = _matchlength!(s1.bits, length(s2.bits))
    map!($, s1.bits, s1.bits, s2.bits)
    append!(s1.bits, tail)
    if s2.inverse
        s1.inverse = !s1.inverse
    end
    s1
end

# Membership test. Inside the stored range the answer is the raw bit flipped
# by the inversion flag. Outside it, integers that are negative or beyond
# typemax(Int)-1 are never members; everything else is a member exactly when
# the set is a complement.
function in(n::Integer, s::IntSet)
    idx = n + 1
    if 1 <= idx <= length(s.bits)
        return unsafe_getindex(s.bits, idx) != s.inverse
    end
    outside = (idx <= 0) | (idx > typemax(Int))
    return ifelse(outside, false, s.inverse)
end

# Iteration state is the bit index (element + 1) of the upcoming element, so
# that `done` does not have to search again; a state <= 0 means exhausted.
function start(s::IntSet)
    (elem, state) = next(s, 0)
    state
end

# Return (element for state `i`, following state). With `invert=true` the
# search runs over the integers that are *not* members instead.
function next(s::IntSet, i, invert=false)
    # i+1 could rollover causing a BoundsError in findnext/findnextnot
    atmax = i == typemax(Int)
    if s.inverse $ invert
        found = atmax ? 0 : findnextnot(s.bits, i+1)
        if found == 0
            # Extend indices beyond the length of the bits since it is inverted
            found = max(i, length(s.bits)) + 1
        end
    else
        found = atmax ? 0 : findnext(s.bits, i+1)
    end
    return (i-1, found)
end

function done(s::IntSet, i)
    return i <= 0
end

# Nextnot iterates through elements *not* in the set
function nextnot(s::IntSet, i)
    next(s, i, true)
end

# Largest element of the set; throws ArgumentError when there is none.
# For a complement set whose bit vector is shorter than typemax(Int), the
# answer is typemax(Int)-1 because every integer past the stored bits is a
# member.
# NOTE(review): `findprevnot` is not among the names imported from Base at
# the top of this file -- confirm it is actually in scope here.
function last(s::IntSet)
    nbits = length(s.bits)
    idx = if !s.inverse
        findprev(s.bits, nbits)
    elseif nbits < typemax(Int)
        typemax(Int)
    else
        findprevnot(s.bits, nbits)
    end
    idx == 0 && throw(ArgumentError("collection must be non-empty"))
    return idx - 1
end

# Number of elements: the count of set bits, or typemax(Int) minus that
# count for a complement set
function length(s::IntSet)
    nset = sum(s.bits)
    return ifelse(s.inverse, typemax(Int) - nset, nset)
end

# Complement as a new set; `s` is left untouched
function complement(s::IntSet)
    complement!(copy(s))
end

# Complement in place by flipping the inversion flag; returns `s`
function complement!(s::IntSet)
    s.inverse = !s.inverse
    return s
end

# Print as `IntSet([a, b, ...])`.
# For a complement set the listing is cut short: once n > 2 and no excluded
# integer is found from n-3 onward, ", ..., " and typemax(Int)-1 are printed
# in place of the remaining elements.
function show(io::IO, s::IntSet)
    print(io, "IntSet([")
    isfirst = true
    for n in s
        only_members_left = s.inverse && n > 2 && done(s, nextnot(s, n-3)[2])
        if only_members_left
            print(io, ", ..., ", typemax(Int)-1)
            break
        end
        isfirst || print(io, ", ")
        print(io, n)
        isfirst = false
    end
    print(io, "])")
end

# Equality of two IntSets, avoiding bit-by-bit loops where possible.
# Trailing unset bits are not significant, so the bit vectors may differ in
# length and still describe the same set.
function ==(s1::IntSet, s2::IntSet)
    n1 = length(s1.bits)
    n2 = length(s2.bits)
    # Swap so s1 is always equal-length or longer
    if n1 < n2
        return ==(s2, s1)
    end

    if s1.inverse != s2.inverse
        # one complement, one not. Could feasibly be true on 32 bit machines
        # Only if all non-overlapping bits are set and overlaps are inverted
        return n1 == typemax(Int) &&
               map!(!, unsafe_getindex(s1.bits, 1:n2)) == s2.bits &&
               (n1 == n2 || all(unsafe_getindex(s1.bits, n2+1:n1)))
    end

    # Same orientation and same length: plain bitarray comparison
    n1 == n2 && return s1.bits == s2.bits
    # Otherwise the last set bit must coincide, after which only the first
    # n2 bits need comparing
    return findprev(s1.bits, n1) == findprev(s2.bits, n2) &&
           unsafe_getindex(s1.bits, 1:n2) == s2.bits
end

# Seed mixed into every IntSet hash; width depends on the platform's UInt
const hashis_seed = UInt === UInt64 ? 0x88989f1fc7dea67d : 0xc7dea67d

# Hash of an IntSet. Only the bits up to the last set bit take part, so that
# extra trailing unset bits do not change the result.
function hash(s::IntSet, h::UInt)
    lastset = findprev(s.bits, length(s.bits))
    bitshash = hash(unsafe_getindex(s.bits, 1:lastset), h)
    return bitshash $ hash(s.inverse) $ hashis_seed
end

# `a` is a subset of `b` when intersecting with `b` leaves `a` unchanged
function issubset(a::IntSet, b::IntSet)
    common = intersect(a, b)
    return isequal(a, common)
end

# Strict subset: a subset that is not equal
function <(a::IntSet, b::IntSet)
    return a <= b && !isequal(a, b)
end

# Subset-or-equal
function <=(a::IntSet, b::IntSet)
    return issubset(a, b)
end
3 changes: 2 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
tests = ["deque",
tests = ["intset",
"deque",
"sortedcontainers",
"stack_and_queue",
"accumulator",
Expand Down
Loading