From e925b267fa19418b7d91aba67e21dc2cdc078da7 Mon Sep 17 00:00:00 2001 From: Mauro Werder Date: Fri, 20 Jun 2014 16:56:26 +0100 Subject: [PATCH 1/5] Refactored dict.jl --- base/dict.jl | 700 ++++++++++++++++++++++++++++----------------- base/precompile.jl | 6 +- 2 files changed, 438 insertions(+), 268 deletions(-) diff --git a/base/dict.jl b/base/dict.jl index a6354d408c7f5..bd264610335f7 100644 --- a/base/dict.jl +++ b/base/dict.jl @@ -1,10 +1,40 @@ # generic operations on associative collections -abstract Associative{K,V} +# generic operations on associative collections +abstract Associative{K,V} # TODO: change to Dictionary + +## Interface +# +# Implemented by Associative: +#* haskey +#* copy +#* merge(!) +#* filter(!) +#* == +#* convert +#* similar +#* push! +#* getindex +#* in +# +# To implement by the types: +# get(!) +# getkey +# pop! +# keys +# values +# empty! +# length +# isempty, eltype +# start, next, done +# sizehint +# +## Deprecated: +# delete! const secret_table_token = :__c782dbf1cf4d6a2e5e3865d7e95634f2e09b5902__ -haskey(d::Associative, k) = in(k,keys(d)) +haskey(d::Associative, k) = in(k, keys(d)) function in(p::(Any,Any), a::Associative) v = get(a,p[1],secret_table_token) @@ -221,9 +251,45 @@ function ==(l::Associative, r::Associative) true end -# some support functions +# conversion between Dict types +function convert{K,V}(T::Type{Associative{K,V}},d::Associative) + h = T{K,V}() + for (k,v) in d + ck = convert(K,k) + if !haskey(h,ck) + h[ck] = convert(V,v) + else + error("key collision during dictionary conversion") + end + end + return h +end +convert{K,V}(T::Type{Associative{K,V}},d::Associative{K,V}) = d -_tablesz(x::Integer) = x < 16 ? 
16 : one(x)<<((sizeof(x)<<3)-leading_zeros(x-1)) +# serialisation +function serialize(s, t::Associative) + serialize_type(s, typeof(t)) + write(s, int32(length(t))) + for (k,v) in t + serialize(s, k) + serialize(s, v) + end +end + +function deserialize{K,V}(s, T::Type{Associative{K,V}}) + n = read(s, Int32) + t = T(); sizehint(t, n) + for i = 1:n + k = deserialize(s) + v = deserialize(s) + t[k] = v + end + return t +end + +similar(d::Associative) = typeof(d)() + +# some support functions function getindex(t::Associative, key) v = get(t, key, secret_table_token) @@ -240,7 +306,9 @@ setindex!(t::Associative, v, k1, k2, ks...) = setindex!(t, v, tuple(k1,k2,ks...) push!(t::Associative, key, v) = setindex!(t, v, key) -# hashing objects by identity + +###### +# Old ObjectIdDict type ObjectIdDict <: Associative{Any,Any} ht::Array{Any,1} @@ -263,8 +331,6 @@ type ObjectIdDict <: Associative{Any,Any} end end -similar(d::ObjectIdDict) = ObjectIdDict() - function setindex!(t::ObjectIdDict, v::ANY, k::ANY) t.ht = ccall(:jl_eqtable_put, Array{Any,1}, (Any, Any, Any), t.ht, k, v) return t @@ -304,125 +370,120 @@ end copy(o::ObjectIdDict) = ObjectIdDict(o) -# dict - -type Dict{K,V} <: Associative{K,V} - slots::Array{Uint8,1} - keys::Array{K,1} - vals::Array{V,1} - ndel::Int - count::Int - deleter::Function - - function Dict() - n = 16 - new(zeros(Uint8,n), Array(K,n), Array(V,n), 0, 0, identity) - end - function Dict(ks, vs) - # TODO: eventually replace with a call to Dict(zip(ks,vs)) - n = min(length(ks), length(vs)) - h = Dict{K,V}() - for i=1:n - h[ks[i]] = vs[i] - end - return h - end - function Dict(kv) - h = Dict{K,V}() - for (k,v) in kv - h[k] = v - end - return h - end -end -Dict() = Dict{Any,Any}() - -Dict{K,V}(ks::AbstractArray{K}, vs::AbstractArray{V}) = Dict{K,V}(ks,vs) -Dict(ks, vs) = Dict{Any,Any}(ks, vs) - -# conversion between Dict types -function convert{K,V}(::Type{Dict{K,V}},d::Dict) - h = Dict{K,V}() - for (k,v) in d - ck = convert(K,k) - if !haskey(h,ck) - 
h[ck] = convert(V,v) - else - error("key collision during dictionary conversion") - end +########### +# Hash-table based dictionaries +abstract HashDictionary{K,V} <: Associative{K,V} +# it is assumed that the concrete types are constructed with @makeHashDictionary +# or have the same internal structure. + +# constants +const ISEMPTY = 0x0 +const ISFILLED = 0x1 +const ISMISSING = 0x2 + +## helper types +immutable Unordered end +typealias Ordered Int +# This slows it down by a factor of 4 compared to the typealias! +# immutable Ordered +# a::Int +# end +# getindex(ord::Vector{Ordered}, i::Integer) = invoke(getindex, (Vector, Int), ord, int(i)).a +# setindex!(ord::Vector{Ordered}, val::Integer, i::Integer) = Core.arrayset(ord, Ordered(val), int(i)) +# push!(ord::Vector{Ordered}, val::Integer) = invoke(push!, (Vector, Ordered), ord, Ordered(val)) + +## Common internal and external interface of HashDictionary which may +## need re-definition for special types of HashDictionary's. +numslots(h::HashDictionary) = length(h.slots) + +# Transforms a key into an index. sz has to be a power of 2. 
+hashindex(::HashDictionary, key, sz) = (int(hash(key)) & (sz-1)) + 1 + +isslotempty(h::HashDictionary, i::Int) = h.slots[i] == ISEMPTY +isslotfilled(h::HashDictionary, i::Int) = h.slots[i] == ISFILLED +isslotmissing(h::HashDictionary, i::Int) = h.slots[i] == ISMISSING + +# These functions to access the h.keys and h.vals array, in case a transformation +# of the key is needed before setting/getting it: + +# transforms key at index ind: +gkey(h::HashDictionary, ind) = h.keys[ind] +# transform key back before setting it: +skey!(h::HashDictionary, key, ind) = (h.keys[ind] = key) +skey!(::HashDictionary, ar::Vector, key, ind) = (ar[ind] = key) +# transforms val at index ind: +gval(h::HashDictionary, ind) = h.vals[ind] +# transform val back before setting it: +sval!(h::HashDictionary, val, ind) = (h.vals[ind] = val) +sval!(::HashDictionary, ar::Vector, val, ind) = (ar[ind] = val) + +# key checking & converting as it comes in +function keyconvert{K,V}(h::HashDictionary{K,V}, key0) + key = convert(K,key0) + if !isequal(key, key0) + error(key0, " is not a valid key for type ", K) end - return h + key end -convert{K,V}(::Type{Dict{K,V}},d::Dict{K,V}) = d - -# syntax entry points -Dict{K,V}(ks::(K...), vs::(V...)) = Dict{K ,V }(ks, vs) -Dict{K }(ks::(K...), vs::Tuple ) = Dict{K ,Any}(ks, vs) -Dict{V }(ks::Tuple , vs::(V...)) = Dict{Any,V }(ks, vs) -Dict{K,V}(kv::AbstractArray{(K,V)}) = Dict{K,V}(kv) -Dict{K,V}(kv::Associative{K,V}) = Dict{K,V}(kv) +# Entries which should be purged during calls of rehash and +# ht_keyindex. For instance for weak-key dicts the reference may have +# been gc-ed. 
+topurge(::HashDictionary, key) = false -similar{K,V}(d::Dict{K,V}) = (K=>V)[] +isordered(h::HashDictionary) = eltype(h.idxs)==Ordered -function serialize(s, t::Dict) - serialize_type(s, typeof(t)) - write(s, int32(length(t))) - for (k,v) in t - serialize(s, k) - serialize(s, v) - end -end - -function deserialize{K,V}(s, T::Type{Dict{K,V}}) - n = read(s, Int32) - t = T(); sizehint(t, n) - for i = 1:n - k = deserialize(s) - v = deserialize(s) - t[k] = v - end - return t -end - -hashindex(key, sz) = (int(hash(key)) & (sz-1)) + 1 - -isslotempty(h::Dict, i::Int) = h.slots[i] == 0x0 -isslotfilled(h::Dict, i::Int) = h.slots[i] == 0x1 -isslotmissing(h::Dict, i::Int) = h.slots[i] == 0x2 +# new table size +_tablesz(x::Integer) = x < 16 ? 16 : one(x)<<((sizeof(x)<<3)-leading_zeros(x-1)) -function rehash{K,V}(h::Dict{K,V}, newsz) - olds = h.slots - oldk = h.keys - oldv = h.vals - sz = length(olds) +function rehash{K,V}(h::HashDictionary{K,V}, newsz) + sz = numslots(h) newsz = _tablesz(newsz) if h.count == 0 resize!(h.slots, newsz) - fill!(h.slots, 0) + fill!(h.slots, ISEMPTY) resize!(h.keys, newsz) resize!(h.vals, newsz) + resize!(h.idxs, newsz) + resize!(h.order, 0) + sizehint(h.order, newsz) # TODO: profile whether this makes it better. h.ndel = 0 return h end + ordered = isordered(h) + if ordered + _compact_order!(h) + end - slots = zeros(Uint8,newsz) - keys = Array(K, newsz) - vals = Array(V, newsz) + slots = zeros(Uint8,newsz) # zero==ISEMPTY + keys = Array(eltype(h.keys), newsz) + vals = Array(eltype(h.vals), newsz) + idxs = Array(eltype(h.idxs), newsz) + order = Array(eltype(h.order), h.count) + sizehint(order, newsz) # TODO: profile whether this makes it better. 
count0 = h.count count = 0 for i = 1:sz - if olds[i] == 0x1 - k = oldk[i] - v = oldv[i] - index = hashindex(k, newsz) + if h.slots[i] == ISFILLED + k = gkey(h,i) + if topurge(h,k) + continue + end + v = gval(h,i) + index = hashindex(h, k, newsz) while slots[index] != 0 + # adds one to index, wrapping around at newsz index = (index & (newsz-1)) + 1 end - slots[index] = 0x1 - keys[index] = k - vals[index] = v + slots[index] = ISFILLED + skey!(h, keys, k, index) + sval!(h, vals, v, index) + if ordered + idx = h.idxs[i] + idxs[index] = idx + order[idx] = index + end count += 1 if h.count != count0 @@ -431,54 +492,82 @@ function rehash{K,V}(h::Dict{K,V}, newsz) end end end - h.slots = slots h.keys = keys h.vals = vals + h.idxs = idxs + h.order = order h.count = count h.ndel = 0 - return h end -function sizehint(d::Dict, newsz) - oldsz = length(d.slots) +# this is only used for ordered dicts +function _compact_order!(h::HashDictionary) + if h.count == length(h.order) + return + end + + i = 1 + while h.order[i] > 0; i += 1; end + + j = i+1 + while h.order[j] == 0; j += 1; end + + for k = j:length(h.order) + idx = h.order[k] + if idx > 0 + h.order[i] = idx + h.idxs[idx] = i + i += 1 + end + end + + resize!(h.order, h.count) + nothing +end + +function sizehint(h::HashDictionary, newsz) + oldsz = numslots(h) if newsz <= oldsz # todo: shrink # be careful: rehash() assumes everything fits. it was only designed # for growing. 
- return d + return h end # grow at least 25% newsz = max(newsz, (oldsz*5)>>2) - rehash(d, newsz) + rehash(h, newsz) end -function empty!{K,V}(h::Dict{K,V}) - fill!(h.slots, 0x0) - sz = length(h.slots) - h.keys = Array(K, sz) - h.vals = Array(V, sz) +function empty!{K,V}(h::HashDictionary{K,V}) + fill!(h.slots, ISEMPTY) + sz = numslots(h) + h.keys = Array(eltype(h.keys), sz) + h.vals = Array(eltype(h.vals), sz) + h.idxs = Array(eltype(h.idxs), sz) + h.order = Array(eltype(h.idxs), 0) h.ndel = 0 h.count = 0 return h end + # get the index where a key is stored, or -1 if not present -function ht_keyindex{K,V}(h::Dict{K,V}, key) - sz = length(h.keys) +function ht_keyindex{K,V}(h::HashDictionary{K,V}, key) + sz = numslots(h) iter = 0 maxprobe = max(16, sz>>6) - index = hashindex(key, sz) - keys = h.keys + index = hashindex(h, key, sz) while true if isslotempty(h,index) break end - if !isslotmissing(h,index) && isequal(key,keys[index]) + if !isslotmissing(h,index) && isequal(key, gkey(h, index)) return index end + topurge(h,key) && _delete!(h, index) index = (index & (sz-1)) + 1 iter+=1 @@ -488,16 +577,15 @@ function ht_keyindex{K,V}(h::Dict{K,V}, key) return -1 end -# get the index where a key is stored, or -pos if not present -# and the key would be inserted at pos +# Get the index where a key is stored, or -pos if not present +# and the key would be inserted at pos. # This version is for use by setindex! and get! -function ht_keyindex2{K,V}(h::Dict{K,V}, key) - sz = length(h.keys) +function ht_keyindex!{K,V}(h::HashDictionary{K,V}, key) + sz = numslots(h) iter = 0 maxprobe = max(16, sz>>6) - index = hashindex(key, sz) + index = hashindex(h, key, sz) avail = 0 - keys = h.keys while true if isslotempty(h,index) @@ -511,9 +599,10 @@ function ht_keyindex2{K,V}(h::Dict{K,V}, key) # in case "key" already exists in a later collided slot. 
avail = -index end - elseif isequal(key, keys[index]) + elseif isequal(key, gkey(h, index)) return index end + topurge(h,key) && _delete!(h, index) index = (index & (sz-1)) + 1 iter+=1 @@ -522,235 +611,316 @@ function ht_keyindex2{K,V}(h::Dict{K,V}, key) avail < 0 && return avail + # No slot available, rehash and try again: rehash(h, h.count > 64000 ? sz*2 : sz*4) + return ht_keyindex!(h, key) +end + +function _setindex!(h::HashDictionary, val, key, index) + if index>0 + skey!(h, key, index) + sval!(h, val, index) + else # occupy new slot + index = - index + h.slots[index] = ISFILLED + skey!(h, key, index) + sval!(h, val, index) + if isordered(h) + push!(h.order, index) + h.idxs[index] = length(h.order) + end + h.count += 1 - return ht_keyindex2(h, key) -end - -function _setindex!(h::Dict, v, key, index) - h.slots[index] = 0x1 - h.keys[index] = key - h.vals[index] = v - h.count += 1 - - sz = length(h.keys) - # Rehash now if necessary - if h.ndel >= ((3*sz)>>2) || h.count*3 > sz*2 - # > 3/4 deleted or > 2/3 full - rehash(h, h.count > 64000 ? h.count*2 : h.count*4) + sz = numslots(h) + # Rehash now if necessary + if h.ndel >= ((3*sz)>>2) || h.count*3 > sz*2 + # > 3/4 deleted or > 2/3 full + rehash(h, h.count > 64000 ? 
h.count*2 : h.count*4) + end end end -function setindex!{K,V}(h::Dict{K,V}, v0, key0) - key = convert(K,key0) - if !isequal(key,key0) - error(key0, " is not a valid key for type ", K) - end +function setindex!{K,V}(h::HashDictionary{K,V}, v0, key) + key = keyconvert(h, key) v = convert(V, v0) - index = ht_keyindex2(h, key) - - if index > 0 - h.keys[index] = key - h.vals[index] = v - else - _setindex!(h, v, key, -index) - end - + index = ht_keyindex!(h, key) + _setindex!(h, v, key, index) return h end -function get!{K,V}(h::Dict{K,V}, key0, default) - key = convert(K,key0) - if !isequal(key,key0) - error(key0, " is not a valid key for type ", K) - end +function get!{K,V}(h::HashDictionary{K,V}, key, default) + key = keyconvert(h, key) + index = ht_keyindex!(h, key) - index = ht_keyindex2(h, key) - - index > 0 && return h.vals[index] + index > 0 && return gval(h, index) v = convert(V, default) - _setindex!(h, v, key, -index) + _setindex!(h, v, key, index) return v end -function get!{K,V}(default::Function, h::Dict{K,V}, key0) - key = convert(K,key0) - if !isequal(key,key0) - error(key0, " is not a valid key for type ", K) - end +function get!{K,V}(default::Function, h::HashDictionary{K,V}, key) + key = keyconvert(h, key) + index = ht_keyindex!(h, key) - index = ht_keyindex2(h, key) - - index > 0 && return h.vals[index] + index > 0 && return gval(h, index) v = convert(V, default()) - _setindex!(h, v, key, -index) + _setindex!(h, v, key, index) return v end # NOTE: this macro is specific to Dict, not Associative, and should # therefore not be exported as-is: it's for internal use only. 
-macro get!(h, key0, default) +macro get!(h, key, default) quote - K, V = eltype($(esc(h))) - key = convert(K, $(esc(key0))) - isequal(key, $(esc(key0))) || error($(esc(key0)), " is not a valid key for type ", K) - idx = ht_keyindex2($(esc(h)), key) - if idx < 0 - idx = -idx + key = keyconvert($(esc(h)), $(esc(key))) + index = ht_keyindex!($(esc(h)), key) + if index < 0 + K, V = eltype($(esc(h))) v = convert(V, $(esc(default))) - _setindex!($(esc(h)), v, key, idx) + _setindex!($(esc(h)), v, key, index) else - @inbounds v = $(esc(h)).vals[idx] + @inbounds v = gval($(esc(h)),index) end v end end - -function getindex{K,V}(h::Dict{K,V}, key) +function getindex{K,V}(h::HashDictionary{K,V}, key) index = ht_keyindex(h, key) - return (index<0) ? throw(KeyError(key)) : h.vals[index]::V + return (index<0) ? throw(KeyError(key)) : gval(h, index)::V end -function get{K,V}(h::Dict{K,V}, key, deflt) +function get{K,V}(h::HashDictionary{K,V}, key, deflt) index = ht_keyindex(h, key) - return (index<0) ? deflt : h.vals[index]::V + return (index<0) ? deflt : gval(h, index)::V end -function get{K,V}(deflt::Function, h::Dict{K,V}, key) +function get{K,V}(deflt::Function, h::HashDictionary{K,V}, key) index = ht_keyindex(h, key) - return (index<0) ? deflt() : h.vals[index]::V + return (index<0) ? deflt() : gval(h, index)::V end -haskey(h::Dict, key) = (ht_keyindex(h, key) >= 0) -in{T<:Dict}(key, v::KeyIterator{T}) = (ht_keyindex(v.dict, key) >= 0) +haskey(h::HashDictionary, key) = (ht_keyindex(h, key) >= 0) +in{T<:HashDictionary}(key, v::KeyIterator{T}) = haskey(v.dict, key) -function getkey{K,V}(h::Dict{K,V}, key, deflt) +function getkey{K,V}(h::HashDictionary{K,V}, key, deflt) index = ht_keyindex(h, key) - return (index<0) ? deflt : h.keys[index]::K + return (index<0) ? 
deflt : gkey(h, index)::K end -function _pop!(h::Dict, index) - val = h.vals[index] +function _pop!(h::HashDictionary, index) + val = gval(h, index) _delete!(h, index) return val end -function pop!(h::Dict, key) +function pop!(h::HashDictionary, key) index = ht_keyindex(h, key) index > 0 ? _pop!(h, index) : throw(KeyError(key)) end -function pop!(h::Dict, key, default) +function pop!(h::HashDictionary, key, default) index = ht_keyindex(h, key) index > 0 ? _pop!(h, index) : default end -function _delete!(h::Dict, index) - h.slots[index] = 0x2 - ccall(:jl_arrayunset, Void, (Any, Uint), h.keys, index-1) +function _delete!(h::HashDictionary, index) + h.slots[index] = ISMISSING + ccall(:jl_arrayunset, Void, (Any, Uint), h.keys, index-1) # don't use gkey here! ccall(:jl_arrayunset, Void, (Any, Uint), h.vals, index-1) + if isordered(h) + h.order[h.idxs[index]] = 0 + end h.ndel += 1 h.count -= 1 h end -function delete!(h::Dict, key) +function delete!(h::HashDictionary, key) index = ht_keyindex(h, key) if index > 0; _delete!(h, index); end h end -function skip_deleted(h::Dict, i) - L = length(h.slots) +function skip_deleted(h::HashDictionary, i) + L = numslots(h) while i<=L && !isslotfilled(h,i) i += 1 end return i end -start(t::Dict) = skip_deleted(t, 1) -done(t::Dict, i) = done(t.vals, i) -next(t::Dict, i) = ((t.keys[i],t.vals[i]), skip_deleted(t,i+1)) +start(h::HashDictionary) = skip_deleted(h, 1) +done(h::HashDictionary, i) = done(h.vals, i) +next(h::HashDictionary, i) = ((gkey(h, i), gval(h, i)), skip_deleted(h, i+1)) -isempty(t::Dict) = (t.count == 0) -length(t::Dict) = t.count +isempty(h::HashDictionary) = (h.count == 0) +length(h::HashDictionary) = h.count -next{T<:Dict}(v::KeyIterator{T}, i) = (v.dict.keys[i], skip_deleted(v.dict,i+1)) -next{T<:Dict}(v::ValueIterator{T}, i) = (v.dict.vals[i], skip_deleted(v.dict,i+1)) +next{T<:HashDictionary}(v::KeyIterator{T}, i) = (gkey(v.dict, i), skip_deleted(v.dict,i+1)) +next{T<:HashDictionary}(v::ValueIterator{T}, i) = 
(gval(v.dict, i), skip_deleted(v.dict,i+1)) -# weak key dictionaries +## macro to make a subtype of a HashDictionary: +macro makeHashDictionary(TName, K, KK, V, VV, Order) + # assert(isa(order, Union(Ordered, Unordered))) + # if isa(order, Ordered) + # error("Ordered dict not implemented.") + # end + esc( # Escaping everything to make methods/variable names not mangeled. + # Not sure this macro could safely be used outside this modules. + quote + type $TName{$K,$V} <: HashDictionary{$K,$V} + slots::Array{Uint8,1} # flag on status of storage slot + keys::Array{$KK,1} # skey! maps K->KK + vals::Array{$VV,1} # gkey! maps KK->K + idxs::Array{$Order,1} # order of keys + order::Array{$Order,1}# order + ndel::Int # number of deleted items + count::Int # + + function $TName() + n = 16 + ord = Array($Order,0) + sizehint(ord, n) # TODO: profile whether this makes it better. + new(zeros(Uint8,n), Array($KK,n), Array($VV,n), Array($Order,n), ord, 0, 0) + end + function $TName(ks, vs) + # TODO: eventually replace with a call to $TName(zip(ks,vs)) + n = min(length(ks), length(vs)) + h = $TName{$K,$V}() + for i=1:n + h[ks[i]] = vs[i] + end + return h + end + function $TName(kv) + h = $TName{$K,$V}() + for (k,v) in kv + h[k] = v + end + return h + end + end -function weak_key_delete!(t::Dict, k) + $TName() = $TName{Any,Any}() + + # TODO: + # - this does not add finalizers to weak-key dicts! + # - if V/K and VV/KK are not convertable this will also fail. 
+ $TName{$K,$V}(ks::AbstractArray{$K}, vs::AbstractArray{$V}) = $TName{$K,$V}(ks,vs) + $TName(ks, vs) = $TName{Any,Any}(ks, vs) + + # syntax entry points + $TName{$K,$V}(ks::($K...), vs::($V...)) = $TName{$K ,$V }(ks, vs) + $TName{$K }(ks::($K...), vs::Tuple ) = $TName{$K ,Any}(ks, vs) + $TName{$V }(ks::Tuple , vs::($V...)) = $TName{Any,$V }(ks, vs) + + $TName{$K,$V}(kv::AbstractArray{($K,$V)}) = $TName{$K,$V}(kv) + $TName{$K,$V}(kv::Associative{$K,$V}) = $TName{$K,$V}(kv) + end + ) +end + +## The standard Dict +@makeHashDictionary(Dict, K, K, V, V, Unordered) +## Ordered Dict +@makeHashDictionary(OrderedDict, K, K, V, V, Ordered) +## ObjectIdDict +@makeHashDictionary(ObjectIdDict2, K, K, V, V, Unordered) +## WeakKeyDict +@makeHashDictionary(WeakKeyDict, K, WeakRef, V, V, Unordered) +## WeakObjectIdDict +@makeHashDictionary(WeakObjectIdDict, K, WeakRef, V, V, Unordered) + +# Update some methods for them +## ObjectID +typealias OIdDicts{K,V} Union(ObjectIdDict2{K,V}, WeakObjectIdDict{K,V}) +hashindex(::OIdDicts, key, sz) = (int(object_id(key)) & (sz-1)) + 1 # object_id is a hash already + +## Weak keys +# TODO: Constructors working on arrays will not add finalizers! +typealias WeakDicts{K,V} Union(WeakKeyDict{K,V}, WeakObjectIdDict{K,V}) + +# transforms key at index ind: +gkey(h::WeakDicts, ind) = h.keys[ind].value +# transform key back before setting it: +_skey_weak(key) = key==nothing ? 
throw(KeyError("'nothing' is not allowed as a weak-key")) : key +skey!(h::WeakDicts, key, ind) = (h.keys[ind] = WeakRef(_skey_weak(key))) +skey!(::WeakDicts, ar::Vector, key, ind) = (ar[ind] = WeakRef(_skey_weak(key))) + +# finalizer for mutables: +function weak_key_delete!(t::WeakDicts, k) # when a weak key is finalized, remove from dictionary if it is still there - wk = getkey(t, k, secret_table_token) - if !is(wk,secret_table_token) && is(wk.value, k) + wk = getkey(t, k, secret_table_token) # getkey returns the WeakRef.value + if !is(wk,secret_table_token) && is(wk, k) delete!(t, k) end end -function add_weak_key(t::Dict, k, v) - if is(t.deleter, identity) - t.deleter = x->weak_key_delete!(t, x) +# purge entries. For instance for weak-key dicts the reference may +# have been gc-ed. +topurge(::WeakDicts, key) = key==nothing + +function _setindex!(h::WeakDicts, val, key, index) + # add a finalizer + if ~isimmutable(key) + deleter(x) = weak_key_delete!(h, x) + finalizer(key, deleter) + end + + # as in original method + if index>0 + skey!(h, key, index) + sval!(h, val, index) + else # occupy new slot + index = - index + h.slots[index] = ISFILLED + skey!(h, key, index) + sval!(h, val, index) + h.count += 1 + + sz = numslots(h) + # Rehash now if necessary + if h.ndel >= ((3*sz)>>2) || h.count*3 > sz*2 + # > 3/4 deleted or > 2/3 full + rehash(h, h.count > 64000 ? h.count*2 : h.count*4) + end end - t[WeakRef(k)] = v - # TODO: it might be better to avoid the finalizer, allow - # wiped WeakRefs to remain in the table, and delete them as - # they are discovered by getindex and setindex!. 
- finalizer(k, t.deleter) - return t end -function weak_value_delete!(t::Dict, k, v) - # when a weak value is finalized, remove from dictionary if it is still there - wv = get(t, k, secret_table_token) - if !is(wv,secret_table_token) && is(wv.value, v) - delete!(t, k) +# add purging +function skip_deleted(h::WeakDicts, i) + L = numslots(h) + while i<=L + if isslotfilled(h,i) + if topurge(h, gkey(h, i)) + _delete!(h, i) + else + break + end + end + i += 1 end + return i end -function add_weak_value(t::Dict, k, v) - t[k] = WeakRef(v) - finalizer(v, x->weak_value_delete!(t, k, x)) - return t -end - -type WeakKeyDict{K,V} <: Associative{K,V} - ht::Dict{Any,V} - - WeakKeyDict() = new((Any=>V)[]) -end -WeakKeyDict() = WeakKeyDict{Any,Any}() - -setindex!{K}(wkh::WeakKeyDict{K}, v, key) = add_weak_key(wkh.ht, convert(K,key), v) +## Ordered Dicts +typealias OrderedDicts{K,V} Union(OrderedDict{K,V}) -function getkey{K}(wkh::WeakKeyDict{K}, kk, deflt) - k = getkey(wkh.ht, kk, secret_table_token) - if is(k, secret_table_token) - return deflt +function skip_deleted(h::OrderedDicts, i) + L = length(h.order) + while i<=L && h.order[i] == 0 + i += 1 end - return k.value::K + return i end -get{K}(wkh::WeakKeyDict{K}, key, def) = get(wkh.ht, key, def) -get{K}(def::Function, wkh::WeakKeyDict{K}, key) = get(def, wkh.ht, key) -get!{K}(wkh::WeakKeyDict{K}, key, def) = get!(wkh.ht, key, def) -get!{K}(def::Function, wkh::WeakKeyDict{K}, key) = get!(def, wkh.ht, key) -pop!{K}(wkh::WeakKeyDict{K}, key) = pop!(wkh.ht, key) -pop!{K}(wkh::WeakKeyDict{K}, key, def) = pop!(wkh.ht, key, def) -delete!{K}(wkh::WeakKeyDict{K}, key) = delete!(wkh.ht, key) -empty!(wkh::WeakKeyDict) = (empty!(wkh.ht); wkh) -haskey{K}(wkh::WeakKeyDict{K}, key) = haskey(wkh.ht, key) -getindex{K}(wkh::WeakKeyDict{K}, key) = getindex(wkh.ht, key) -isempty(wkh::WeakKeyDict) = isempty(wkh.ht) +done(h::OrderedDicts, i) = done(h.order, i) +next(h::OrderedDicts, i) = ((gkey(h, h.order[i]), gval(h, h.order[i])), 
skip_deleted(h,i+1)) + +next{T<:OrderedDicts}(v::KeyIterator{T}, i) = (gkey(v.dict, v.dict.order[i]), skip_deleted(v.dict,i+1)) +next{T<:OrderedDicts}(v::ValueIterator{T}, i) = (gval(v.dict, v.dict.order[i]), skip_deleted(v.dict,i+1)) -start(t::WeakKeyDict) = start(t.ht) -done(t::WeakKeyDict, i) = done(t.ht, i) -function next{K}(t::WeakKeyDict{K}, i) - kv, i = next(t.ht, i) - ((kv[1].value::K,kv[2]), i) -end -length(t::WeakKeyDict) = length(t.ht) diff --git a/base/precompile.jl b/base/precompile.jl index 0993b331b8f47..a80be64503d21 100644 --- a/base/precompile.jl +++ b/base/precompile.jl @@ -210,7 +210,7 @@ precompile(search, (IOBuffer, Uint8)) precompile(read, (IOBuffer, Type{Char})) precompile(read, (IOBuffer, Type{Uint8})) precompile(LineEdit.write_prompt, (Terminals.TTYTerminal, LineEdit.PromptState, ASCIIString)) -precompile(ht_keyindex2, (Dict{Uint8, Any}, Uint8)) +precompile(ht_keyindex!, (Dict{Uint8, Any}, Uint8)) precompile(rehash, (Dict{Uint8, Any}, Int)) precompile(setindex!, (Dict{Uint8, Any}, LineEdit.Prompt, Uint8)) precompile(_setindex!, (Dict{Uint8, Any}, LineEdit.Prompt, Uint8, Int)) @@ -230,11 +230,11 @@ precompile(done, (Array{LineEdit.TextInterface, 1}, Int)) precompile(next, (Array{LineEdit.TextInterface, 1}, Int)) precompile(LineEdit.init_state, (Terminals.TTYTerminal, LineEdit.Prompt)) precompile(setindex!, (Dict{Any, Any}, LineEdit.PromptState, LineEdit.Prompt)) -precompile(ht_keyindex2, (Dict{Any, Any}, LineEdit.Prompt)) +precompile(ht_keyindex!, (Dict{Any, Any}, LineEdit.Prompt)) precompile(_setindex!, (Dict{Any, Any}, LineEdit.PromptState, LineEdit.Prompt, Int)) precompile(LineEdit.init_state, (Terminals.TTYTerminal, LineEdit.HistoryPrompt)) precompile(setindex!, (Dict{Any, Any}, LineEdit.SearchState, LineEdit.HistoryPrompt)) -precompile(ht_keyindex2, (Dict{Any, Any}, LineEdit.HistoryPrompt)) +precompile(ht_keyindex!, (Dict{Any, Any}, LineEdit.HistoryPrompt)) precompile(_setindex!, (Dict{Any, Any}, LineEdit.SearchState, 
LineEdit.HistoryPrompt, Int)) precompile(LineEdit.activate, (LineEdit.Prompt, LineEdit.MIState)) precompile(isequal, (LineEdit.Prompt, LineEdit.Prompt)) From e73799de47184c10be133df7bc86575cbe206273 Mon Sep 17 00:00:00 2001 From: Mauro Werder Date: Mon, 23 Jun 2014 12:38:40 +0100 Subject: [PATCH 2/5] Added preformance tests. --- base/dict.jl | 32 ++-- test/perf/dicts/perf.jl | 350 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 365 insertions(+), 17 deletions(-) create mode 100644 test/perf/dicts/perf.jl diff --git a/base/dict.jl b/base/dict.jl index bd264610335f7..d89c92730181d 100644 --- a/base/dict.jl +++ b/base/dict.jl @@ -1,5 +1,3 @@ -# generic operations on associative collections - # generic operations on associative collections abstract Associative{K,V} # TODO: change to Dictionary @@ -17,7 +15,7 @@ abstract Associative{K,V} # TODO: change to Dictionary #* getindex #* in # -# To implement by the types: +# To implement by the specific types: # get(!) # getkey # pop! @@ -377,9 +375,9 @@ abstract HashDictionary{K,V} <: Associative{K,V} # or have the same internal structure. # constants -const ISEMPTY = 0x0 -const ISFILLED = 0x1 -const ISMISSING = 0x2 +const EMPTY = 0x0 +const FILLED = 0x1 +const MISSING = 0x2 ## helper types immutable Unordered end @@ -399,9 +397,9 @@ numslots(h::HashDictionary) = length(h.slots) # Transforms a key into an index. sz has to be a power of 2. 
hashindex(::HashDictionary, key, sz) = (int(hash(key)) & (sz-1)) + 1 -isslotempty(h::HashDictionary, i::Int) = h.slots[i] == ISEMPTY -isslotfilled(h::HashDictionary, i::Int) = h.slots[i] == ISFILLED -isslotmissing(h::HashDictionary, i::Int) = h.slots[i] == ISMISSING +isslotempty(h::HashDictionary, i::Int) = h.slots[i] == EMPTY +isslotfilled(h::HashDictionary, i::Int) = h.slots[i] == FILLED +isslotmissing(h::HashDictionary, i::Int) = h.slots[i] == MISSING # These functions to access the h.keys and h.vals array, in case a transformation # of the key is needed before setting/getting it: @@ -441,7 +439,7 @@ function rehash{K,V}(h::HashDictionary{K,V}, newsz) newsz = _tablesz(newsz) if h.count == 0 resize!(h.slots, newsz) - fill!(h.slots, ISEMPTY) + fill!(h.slots, EMPTY) resize!(h.keys, newsz) resize!(h.vals, newsz) resize!(h.idxs, newsz) @@ -455,7 +453,7 @@ function rehash{K,V}(h::HashDictionary{K,V}, newsz) _compact_order!(h) end - slots = zeros(Uint8,newsz) # zero==ISEMPTY + slots = zeros(Uint8,newsz) # zero==EMPTY keys = Array(eltype(h.keys), newsz) vals = Array(eltype(h.vals), newsz) idxs = Array(eltype(h.idxs), newsz) @@ -465,7 +463,7 @@ function rehash{K,V}(h::HashDictionary{K,V}, newsz) count = 0 for i = 1:sz - if h.slots[i] == ISFILLED + if h.slots[i] == FILLED k = gkey(h,i) if topurge(h,k) continue @@ -476,7 +474,7 @@ function rehash{K,V}(h::HashDictionary{K,V}, newsz) # adds one to index, wrapping around at newsz index = (index & (newsz-1)) + 1 end - slots[index] = ISFILLED + slots[index] = FILLED skey!(h, keys, k, index) sval!(h, vals, v, index) if ordered @@ -541,7 +539,7 @@ function sizehint(h::HashDictionary, newsz) end function empty!{K,V}(h::HashDictionary{K,V}) - fill!(h.slots, ISEMPTY) + fill!(h.slots, EMPTY) sz = numslots(h) h.keys = Array(eltype(h.keys), sz) h.vals = Array(eltype(h.vals), sz) @@ -622,7 +620,7 @@ function _setindex!(h::HashDictionary, val, key, index) sval!(h, val, index) else # occupy new slot index = - index - h.slots[index] = 
ISFILLED + h.slots[index] = FILLED skey!(h, key, index) sval!(h, val, index) if isordered(h) @@ -728,7 +726,7 @@ function pop!(h::HashDictionary, key, default) end function _delete!(h::HashDictionary, index) - h.slots[index] = ISMISSING + h.slots[index] = MISSING ccall(:jl_arrayunset, Void, (Any, Uint), h.keys, index-1) # don't use gkey here! ccall(:jl_arrayunset, Void, (Any, Uint), h.vals, index-1) if isordered(h) @@ -877,7 +875,7 @@ function _setindex!(h::WeakDicts, val, key, index) sval!(h, val, index) else # occupy new slot index = - index - h.slots[index] = ISFILLED + h.slots[index] = FILLED skey!(h, key, index) sval!(h, val, index) h.count += 1 diff --git a/test/perf/dicts/perf.jl b/test/perf/dicts/perf.jl new file mode 100644 index 0000000000000..89ff5073f1f2c --- /dev/null +++ b/test/perf/dicts/perf.jl @@ -0,0 +1,350 @@ +## Sparse matrix performance +include("../perfutil.jl") + +macro output_timings(t,name,desc,group) + quote + # If we weren't given anything for the test group, infer off of file path! + test_group = length($group) == 0 ? basename(dirname(Base.source_path())) : $group[1] + if codespeed + submit_to_codespeed( $t, $name, $desc, "seconds", test_group ) + elseif print_output + @printf "%-6.2f, %-6.2f, %-6.2f, %-6.2f, %s\n" minimum($t) maximum($t) mean($t) std($t) $name + end + gc() + end +end + +macro timeit(ex,name,desc,group...) 
+ quote + t = zeros(ntrials) + for i=0:ntrials + e = 1000*(@elapsed $(esc(ex))) + if i > 0 + # warm up on first iteration + t[i] = e + end + end + @output_timings t $name $desc $group + end +end + +macro gc_disable(ex) + quote + gc_disable() + $ex + gc_enable() + end +end + +# TODO: update +DictsToTest = [Dict, ObjectIdDict] #, WeakKeyDict, Base.ObjectIdDict2, Base.WeakObjectIdDict, Base.OrderedDict] +srand(1) +obidtest = true # if set to false test for ObjectIdDict will error + +function dict_unittests(DictToTest) + # dict unittests from ../../collections.jl with bits commented which do not pass in v0.2.1 + + gc_disable() # needed to work with Weak-dicts + + # dict + h = DictToTest() + for i=1:10000 + h[i] = i+1 + end + for i=1:10000 + (h[i] == i+1) + end + for i=1:2:10000 + delete!(h, i) + end + for i=1:2:10000 + h[i] = i+1 + end + for i=1:10000 + (h[i] == i+1) + end + for i=1:10000 + delete!(h, i) + end + isempty(h) + h[77] = 100 + h[77]==100 + for i=1:10000 + h[i] = i+1 + end + for i=1:2:10000 + delete!(h, i) + end + for i=10001:20000 + h[i] = i+1 + end + for i=2:2:10000 + h[i]==i+1 + end + for i=10000:20000 + h[i]==i+1 + end + h = {"a" => 3} + h["a"] == 3 + + let + z = DictToTest() + get_KeyError = false + try + z["a"] + catch _e123_ + get_KeyError = isa(_e123_,KeyError) + end + get_KeyError + end + + _d = {"a"=>0} + isa([k for k in filter(x->length(x)==1, collect(keys(_d)))], Vector{Any}) + + # issue #1821 + if !(obidtest) + let + d = DictToTest{UTF8String, Vector{Int}}() + d["a"] = [1, 2] + d["b"] = [1] + isa(repr(d), String) # check that printable without error + end + end + + # issue #2344 + let + local bar + bestkey(d, key) = key + bestkey{K<:String,V}(d::Associative{K,V}, key) = string(key) + bar(x) = bestkey(x, :y) + bar([:x => [1,2,5]]) == :y + bar(["x" => [1,2,5]]) == "y" + end + + + isequal(DictToTest(), DictToTest()) + isequal({1 => 1}, {1 => 1}) + !isequal({1 => 1}, {}) + !isequal({1 => 1}, {1 => 2}) + !isequal({1 => 1}, {2 => 1}) + + # Generate 
some data to populate dicts to be compared + data_in = [ (rand(1:1000), randstring(2)) for _ in 1:1001 ] + + # Populate the first dict + if obidtest + d1 = DictToTest() + else + d1 = DictToTest{Int, String}() + end + for (k,v) in data_in + d1[k] = v + end + data_in = collect(d1) + # shuffle the data + for i in 1:length(data_in) + j = rand(1:length(data_in)) + data_in[i], data_in[j] = data_in[j], data_in[i] + end + # Inserting data in different (shuffled) order should result in + # equivalent dict. + if obidtest + d2 = DictToTest() + else + d2 = DictToTest{Int, String}() + end + for (k,v) in data_in + d2[k] = v + end + + isequal(d1, d2) + d3 = copy(d2) + d4 = copy(d2) + # Removing an item gives different dict + delete!(d1, data_in[rand(1:length(data_in))][1]) + !isequal(d1, d2) + # Changing a value gives different dict + d3[data_in[rand(1:length(data_in))][1]] = randstring(3) + !isequal(d1, d3) + # Adding a pair gives different dict + d4[1001] = randstring(3) + !isequal(d1, d4) + + if !(obidtest) + isequal(DictToTest(), sizehint(DictToTest(),96)) + end + + # Here is what currently happens when dictionaries of different types + # are compared. This is not necessarily desirable. These tests are + # descriptive rather than prescriptive. + !isequal({1 => 2}, {"dog" => "bone"}) + if !(obidtest) + isequal(DictToTest{Int, Int}(), DictToTest{String, String}()) + end + + # get! 
(get with default values assigned to the given location) + + # let f(x) = x^2, + # d = {8=>19}, + # def = {} + + # # get!(d, 8, 5) == 19 + # # get!(d, 19, 2) == 2 + + # get!(d, 42) do # d is updated with f(2) + # f(2) + # end == 4 + + # get!(d, 42) do # d is not updated + # f(200) + # end == 4 + + # get(d, 13) do # d is not updated + # f(4) + # end == 16 + + # d == {8=>19, 19=>2, 42=>4} + # end + + # # show + # for d in (["\n" => "\n", "1" => "\n", "\n" => "2"], + # [string(i) => i for i = 1:30], + # [reshape(1:i^2,i,i) => reshape(1:i^2,i,i) for i = 1:24], + # [utf8(Char['α':'α'+i]) => utf8(Char['α':'α'+i]) for i = (1:10)*10]) + # for cols in (12, 40, 80), rows in (2, 10, 24) + # # Ensure output is limited as requested + # s = IOBuffer() + # Base.showdict(s, d, limit=true, sz=(rows, cols)) + # out = split(takebuf_string(s),'\n') + # for line in out[2:end] + # strwidth(line) <= cols + # end + # length(out) <= rows + + # for f in (keys, values) + # s = IOBuffer() + # Base.showkv(s, f(d), limit=true, sz=(rows, cols)) + # out = split(takebuf_string(s),'\n') + # for line in out[2:end] + # strwidth(line) <= cols + # end + # length(out) <= rows + # end + # end + # # Simply ensure these do not throw errors + # Base.showdict(IOBuffer(), d, limit=false) + # !isempty(summary(d)) + # !isempty(summary(keys(d))) + # !isempty(summary(values(d))) + # end + + + # issue #2540 + d = {x => 1 + for x in ['a', 'b', 'c']} + d == {'a'=>1, 'b'=>1, 'c'=> 1} + + # issue #2629 + d = (String => String)[ a => "foo" for a in ["a","b","c"]] + d == ["a"=>"foo","b"=>"foo","c"=>"foo"] + + # # issue #5886 + # d5886 = DictToTest() + # for k5886 in 1:11 + # d5886[k5886] = 1 + # end + # for k5886 in keys(d5886) + # # undefined ref if not fixed + # d5886[k5886] += 1 + # end + + # ############# end of dict tests ############# + + gc_enable() +end + + +# Performance tests adapted from Kevin Squire: +# https://gist.github.com/kmsquire/5147894 +n = 10^5 +strs = [randstring(10) for i = 1:n] +nums = rand(Int, 
n) + +randp = randperm(n) +randvec = rand(1:n, 10^5) + +# performance tests +function dict_insertion_test(d::Associative) + #empty!(d) + for i = randp + d[strs[i]] = nums[i] + end + d +end + +function dict_deletion_test(d::Associative) + #dict_insertion_test(d) + for i in randvec + pop!(d, strs[i], 0) + end + d +end + +function dict_ins_del_test(d::Associative) + #dict_insertion_test(d) + for i in randvec + randbool()? pop!(d, strs[i], 0) : (d[strs[i]] = nums[i]) + end + d +end + +function dict_iterator(d::Associative) + #dict_insertion_test(d) + acc = 0 + for (k,v) in d + acc += length(k) + v + end + acc +end + +## runners +function test_insert(T::Type) + d = (T==ObjectIdDict) ? T() : T{ASCIIString,Int}() + t = 0.0 + @gc_disable @timeit dict_insertion_test(d) "$T\_ins" "$T: insertion tests" +end + +function test_delelete(T::Type) + d = (T==ObjectIdDict) ? T() : T{ASCIIString,Int}() + t = 0.0 + dict_insertion_test(d) # fill d + @gc_disable @timeit dict_deletion_test(d) "$T\_del" "$T: deletion tests" +end + +function test_insert_delete(T::Type) + d = (T==ObjectIdDict) ? T() : T{ASCIIString,Int}() + t = 0.0 + dict_insertion_test(d) # fill d + @gc_disable @timeit dict_ins_del_test(d) "$T\_ins_del" "$T: insertion and deletion tests" +end + +function test_iterations(T::Type) + d = (T==ObjectIdDict) ? 
T() : T{ASCIIString,Int}() + t = 0.0 + dict_insertion_test(d) # fill d + @gc_disable @timeit dict_iterator(d) "$T\_iter" "$T: iteration tests" +end + +function run(T) + for test in [test_insert, test_delelete, test_insert_delete, test_iterations] + times = test(T) + end +end + +# TODO: remove +@printf "min , max , mean , std , name\n" +for DictToTest in DictsToTest + println(" ") + @timeit dict_unittests(DictToTest) "$DictToTest\_unitt" "$DictToTest: dict-unit tests" + run(DictToTest) +end From eeb1821517b991a21eac7cd4819c0b3072e7b920 Mon Sep 17 00:00:00 2001 From: Mauro Werder Date: Mon, 23 Jun 2014 15:41:55 +0100 Subject: [PATCH 3/5] Fixed invariance problem in deserialize and convert. However, not sure how to do convert in general. --- base/dict.jl | 31 ++++++++++++++++++++++++++----- test/perf/dicts/perf.jl | 3 ++- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/base/dict.jl b/base/dict.jl index d89c92730181d..e6a8415c0029b 100644 --- a/base/dict.jl +++ b/base/dict.jl @@ -249,9 +249,28 @@ function ==(l::Associative, r::Associative) true end -# conversion between Dict types -function convert{K,V}(T::Type{Associative{K,V}},d::Associative) - h = T{K,V}() +function convert{A<:Associative}(T::Type{A}, d::Associative) + if typeof(d)==T + return d + end + if T.abstract + throw(TypeError) + + # TODO: what to do about something like: + # convert(Associative{Int, Float64}, SomeDict{Int, Int}()) + # + # Probably the right thing would be to return a SomeDict{Int, + # Float64}, following array's lead. But SomeDict could only + # have one, zero, or more type parameters... + # + # # TODO: but how to do it cleanly? + # K = isa(T.parameters[1], TypeVar) ? typeof(d).parameters[1] : T.parameters[1] + # V = isa(T.parameters[2], TypeVar) ? 
typeof(d).parameters[2] : T.parameters[2] + # h = eval(typeof(d).env.name){K,V}() + else + h = T() + end + (K,V) = eltype(h) for (k,v) in d ck = convert(K,k) if !haskey(h,ck) @@ -262,7 +281,7 @@ function convert{K,V}(T::Type{Associative{K,V}},d::Associative) end return h end -convert{K,V}(T::Type{Associative{K,V}},d::Associative{K,V}) = d +convert{K,V}(T::Type{Associative{K,V}}, d::Associative{K,V}) = d # serialisation function serialize(s, t::Associative) @@ -274,7 +293,7 @@ function serialize(s, t::Associative) end end -function deserialize{K,V}(s, T::Type{Associative{K,V}}) +function deserialize{A<:Associative}(s, T::Type{A}) n = read(s, Int32) t = T(); sizehint(t, n) for i = 1:n @@ -374,6 +393,8 @@ abstract HashDictionary{K,V} <: Associative{K,V} # it is assumed that the concrete types are constructed with @makeHashDictionary # or have the same internal structure. +convert{K,V}(T::Type{HashDictionary{K,V}}, d::HashDictionary{K,V}) = d + # constants const EMPTY = 0x0 const FILLED = 0x1 diff --git a/test/perf/dicts/perf.jl b/test/perf/dicts/perf.jl index 89ff5073f1f2c..e7a17a7b8be0b 100644 --- a/test/perf/dicts/perf.jl +++ b/test/perf/dicts/perf.jl @@ -1,10 +1,11 @@ ## Sparse matrix performance include("../perfutil.jl") +# TODO: remove the two macros macro output_timings(t,name,desc,group) quote # If we weren't given anything for the test group, infer off of file path! - test_group = length($group) == 0 ? basename(dirname(Base.source_path())) : $group[1] + # test_group = length($group) == 0 ? basename(dirname(Base.source_path())) : $group[1] if codespeed submit_to_codespeed( $t, $name, $desc, "seconds", test_group ) elseif print_output From ede4b2e463237f32ca639173611a0459e0ea9702 Mon Sep 17 00:00:00 2001 From: Mauro Werder Date: Mon, 23 Jun 2014 21:05:01 +0100 Subject: [PATCH 4/5] Updated dict/perf.jl, needs some deleting before merge. 
--- base/dict.jl | 99 ++++++++++++++++++++++------------------- deps/libuv | 2 +- deps/openlibm | 2 +- test/perf/dicts/perf.jl | 72 +++++++++++++++--------------- 4 files changed, 91 insertions(+), 84 deletions(-) diff --git a/base/dict.jl b/base/dict.jl index e6a8415c0029b..2568afeceb4ba 100644 --- a/base/dict.jl +++ b/base/dict.jl @@ -413,11 +413,24 @@ typealias Ordered Int ## Common internal and external interface of HashDictionary which may ## need re-definition for special types of HashDictionary's. + +## Internals: numslots(h::HashDictionary) = length(h.slots) # Transforms a key into an index. sz has to be a power of 2. hashindex(::HashDictionary, key, sz) = (int(hash(key)) & (sz-1)) + 1 +# Key checking & converting as a key is stored (setindex!, get!, +# etc. but not get). +# Note this conversion has to match with the hash-function in +# hashindex, i.e. the converted key needs the same hashindex as the +# unconverted one. +function keyconvert{K,V}(h::HashDictionary{K,V}, key0) + key = convert(K, key0) + !isequal(key, key0) ? error(key0, " is not a valid key for type ", K) : key +end + + isslotempty(h::HashDictionary, i::Int) = h.slots[i] == EMPTY isslotfilled(h::HashDictionary, i::Int) = h.slots[i] == FILLED isslotmissing(h::HashDictionary, i::Int) = h.slots[i] == MISSING @@ -436,15 +449,6 @@ gval(h::HashDictionary, ind) = h.vals[ind] sval!(h::HashDictionary, val, ind) = (h.vals[ind] = val) sval!(::HashDictionary, ar::Vector, val, ind) = (ar[ind] = val) -# key checking & converting as it comes in -function keyconvert{K,V}(h::HashDictionary{K,V}, key0) - key = convert(K,key0) - if !isequal(key, key0) - error(key0, " is not a valid key for type ", K) - end - key -end - # Entries which should be purged during calls of rehash and # ht_keyindex. For instance for weak-key dicts the reference may have # been gc-ed. 
@@ -483,6 +487,7 @@ function rehash{K,V}(h::HashDictionary{K,V}, newsz) count0 = h.count count = 0 + # TODO: @inbounds for i = 1:sz if h.slots[i] == FILLED k = gkey(h,i) @@ -532,7 +537,7 @@ function _compact_order!(h::HashDictionary) j = i+1 while h.order[j] == 0; j += 1; end - + # TODO: @inbounds for k = j:length(h.order) idx = h.order[k] if idx > 0 @@ -546,39 +551,13 @@ function _compact_order!(h::HashDictionary) nothing end -function sizehint(h::HashDictionary, newsz) - oldsz = numslots(h) - if newsz <= oldsz - # todo: shrink - # be careful: rehash() assumes everything fits. it was only designed - # for growing. - return h - end - # grow at least 25% - newsz = max(newsz, (oldsz*5)>>2) - rehash(h, newsz) -end - -function empty!{K,V}(h::HashDictionary{K,V}) - fill!(h.slots, EMPTY) - sz = numslots(h) - h.keys = Array(eltype(h.keys), sz) - h.vals = Array(eltype(h.vals), sz) - h.idxs = Array(eltype(h.idxs), sz) - h.order = Array(eltype(h.idxs), 0) - h.ndel = 0 - h.count = 0 - return h -end - - # get the index where a key is stored, or -1 if not present function ht_keyindex{K,V}(h::HashDictionary{K,V}, key) sz = numslots(h) iter = 0 maxprobe = max(16, sz>>6) index = hashindex(h, key, sz) - + # TODO: @inbounds while true if isslotempty(h,index) break @@ -605,13 +584,12 @@ function ht_keyindex!{K,V}(h::HashDictionary{K,V}, key) maxprobe = max(16, sz>>6) index = hashindex(h, key, sz) avail = 0 - + # TODO: @inbounds while true if isslotempty(h,index) avail < 0 && return avail return -index end - if isslotmissing(h,index) if avail == 0 # found an available slot, but need to keep scanning @@ -659,6 +637,8 @@ function _setindex!(h::HashDictionary, val, key, index) end end +## External interface methods + function setindex!{K,V}(h::HashDictionary{K,V}, v0, key) key = keyconvert(h, key) v = convert(V, v0) @@ -701,7 +681,7 @@ macro get!(h, key, default) v = convert(V, $(esc(default))) _setindex!($(esc(h)), v, key, index) else - @inbounds v = gval($(esc(h)),index) + v = 
gval($(esc(h)),index) end v end @@ -722,12 +702,12 @@ function get{K,V}(deflt::Function, h::HashDictionary{K,V}, key) return (index<0) ? deflt() : gval(h, index)::V end -haskey(h::HashDictionary, key) = (ht_keyindex(h, key) >= 0) +haskey(h::HashDictionary, key) = (ht_keyindex(h, key) > 0) in{T<:HashDictionary}(key, v::KeyIterator{T}) = haskey(v.dict, key) function getkey{K,V}(h::HashDictionary{K,V}, key, deflt) index = ht_keyindex(h, key) - return (index<0) ? deflt : gkey(h, index)::K + return (index<0) ? deflt : gkey(h, index) end function _pop!(h::HashDictionary, index) @@ -748,7 +728,7 @@ end function _delete!(h::HashDictionary, index) h.slots[index] = MISSING - ccall(:jl_arrayunset, Void, (Any, Uint), h.keys, index-1) # don't use gkey here! + ccall(:jl_arrayunset, Void, (Any, Uint), h.keys, index-1) ccall(:jl_arrayunset, Void, (Any, Uint), h.vals, index-1) if isordered(h) h.order[h.idxs[index]] = 0 @@ -776,11 +756,36 @@ start(h::HashDictionary) = skip_deleted(h, 1) done(h::HashDictionary, i) = done(h.vals, i) next(h::HashDictionary, i) = ((gkey(h, i), gval(h, i)), skip_deleted(h, i+1)) +next{T<:HashDictionary}(v::KeyIterator{T}, i) = (gkey(v.dict, i), skip_deleted(v.dict,i+1)) +next{T<:HashDictionary}(v::ValueIterator{T}, i) = (gval(v.dict, i), skip_deleted(v.dict,i+1)) + isempty(h::HashDictionary) = (h.count == 0) length(h::HashDictionary) = h.count -next{T<:HashDictionary}(v::KeyIterator{T}, i) = (gkey(v.dict, i), skip_deleted(v.dict,i+1)) -next{T<:HashDictionary}(v::ValueIterator{T}, i) = (gval(v.dict, i), skip_deleted(v.dict,i+1)) +function sizehint(h::HashDictionary, newsz) + oldsz = numslots(h) + if newsz <= oldsz + # todo: shrink + # be careful: rehash() assumes everything fits. it was only designed + # for growing. 
+ return h + end + # grow at least 25% + newsz = max(newsz, (oldsz*5)>>2) + rehash(h, newsz) +end + +function empty!{K,V}(h::HashDictionary{K,V}) + fill!(h.slots, EMPTY) + sz = numslots(h) + h.keys = Array(eltype(h.keys), sz) + h.vals = Array(eltype(h.vals), sz) + h.idxs = Array(eltype(h.idxs), sz) + h.order = Array(eltype(h.idxs), 0) + h.ndel = 0 + h.count = 0 + return h +end ## macro to make a subtype of a HashDictionary: macro makeHashDictionary(TName, K, KK, V, VV, Order) @@ -858,11 +863,15 @@ end ## ObjectID typealias OIdDicts{K,V} Union(ObjectIdDict2{K,V}, WeakObjectIdDict{K,V}) hashindex(::OIdDicts, key, sz) = (int(object_id(key)) & (sz-1)) + 1 # object_id is a hash already +function keyconvert{K,V}(h::OIdDicts{K,V}, key0) # no conversion as that can create a new object. + !isa(key0, K) ? error(key0, " is not a valid Object-Id-Dict key for type ", K) : key0 +end ## Weak keys # TODO: Constructors working on arrays will not add finalizers! typealias WeakDicts{K,V} Union(WeakKeyDict{K,V}, WeakObjectIdDict{K,V}) +# TODO: add @inbounds # transforms key at index ind: gkey(h::WeakDicts, ind) = h.keys[ind].value # transform key back before setting it: diff --git a/deps/libuv b/deps/libuv index efba94c6ff406..5d608abc3c2e9 160000 --- a/deps/libuv +++ b/deps/libuv @@ -1 +1 @@ -Subproject commit efba94c6ff40699887ff86c87f5a01460d7f2abb +Subproject commit 5d608abc3c2e9dc37da04030a0e07ba0af5ae57d diff --git a/deps/openlibm b/deps/openlibm index f418860958bba..0905558734018 160000 --- a/deps/openlibm +++ b/deps/openlibm @@ -1 +1 @@ -Subproject commit f418860958bbaecd2d7cb72e90d203c1306cd914 +Subproject commit 0905558734018c3e7d98bc532f5029a320be9699 diff --git a/test/perf/dicts/perf.jl b/test/perf/dicts/perf.jl index e7a17a7b8be0b..37e7a9380fc5f 100644 --- a/test/perf/dicts/perf.jl +++ b/test/perf/dicts/perf.jl @@ -1,7 +1,8 @@ ## Sparse matrix performance include("../perfutil.jl") +ntrials = 5 -# TODO: remove the two macros +# TODO: remove the two macros macro 
output_timings(t,name,desc,group) quote # If we weren't given anything for the test group, infer off of file path! @@ -28,6 +29,7 @@ macro timeit(ex,name,desc,group...) @output_timings t $name $desc $group end end +# TODO: end remove macro gc_disable(ex) quote @@ -37,16 +39,16 @@ macro gc_disable(ex) end end -# TODO: update +# TODO: update: DictsToTest = [Dict, ObjectIdDict] #, WeakKeyDict, Base.ObjectIdDict2, Base.WeakObjectIdDict, Base.OrderedDict] srand(1) obidtest = true # if set to false test for ObjectIdDict will error function dict_unittests(DictToTest) - # dict unittests from ../../collections.jl with bits commented which do not pass in v0.2.1 + # dict unittests from ../../collections.jl with bits commented + # which do not pass in v0.2.1 gc_disable() # needed to work with Weak-dicts - # dict h = DictToTest() for i=1:10000 @@ -270,37 +272,34 @@ end n = 10^5 strs = [randstring(10) for i = 1:n] nums = rand(Int, n) +nums2 = rand(Int, n) randp = randperm(n) randvec = rand(1:n, 10^5) # performance tests -function dict_insertion_test(d::Associative) - #empty!(d) +function dict_insertion_test(d::Associative, keys) for i = randp - d[strs[i]] = nums[i] + d[keys[i]] = nums[i] end d end -function dict_deletion_test(d::Associative) - #dict_insertion_test(d) +function dict_deletion_test(d::Associative, keys) for i in randvec - pop!(d, strs[i], 0) + pop!(d, keys[i], 0) end d end -function dict_ins_del_test(d::Associative) - #dict_insertion_test(d) +function dict_ins_del_test(d::Associative, keys) for i in randvec - randbool()? pop!(d, strs[i], 0) : (d[strs[i]] = nums[i]) + randbool()? pop!(d, keys[i], 0) : (d[keys[i]] = nums[i]) end d end function dict_iterator(d::Associative) - #dict_insertion_test(d) acc = 0 for (k,v) in d acc += length(k) + v @@ -309,43 +308,42 @@ function dict_iterator(d::Associative) end ## runners -function test_insert(T::Type) - d = (T==ObjectIdDict) ? 
T() : T{ASCIIString,Int}() - t = 0.0 - @gc_disable @timeit dict_insertion_test(d) "$T\_ins" "$T: insertion tests" +function test_insert(T::Type, keys) + d = (T==ObjectIdDict) ? T() : T{eltype(keys),Int}() + @gc_disable @timeit dict_insertion_test(d, keys) "$T\_ins" "$T: insertion tests" end -function test_delelete(T::Type) - d = (T==ObjectIdDict) ? T() : T{ASCIIString,Int}() - t = 0.0 - dict_insertion_test(d) # fill d - @gc_disable @timeit dict_deletion_test(d) "$T\_del" "$T: deletion tests" +function test_delete(T::Type, keys) + d = (T==ObjectIdDict) ? T() : T{eltype(keys),Int}() + dict_insertion_test(d, keys) # fill d + @gc_disable @timeit dict_deletion_test(d,keys) "$T\_del" "$T: deletion tests" end -function test_insert_delete(T::Type) - d = (T==ObjectIdDict) ? T() : T{ASCIIString,Int}() - t = 0.0 - dict_insertion_test(d) # fill d - @gc_disable @timeit dict_ins_del_test(d) "$T\_ins_del" "$T: insertion and deletion tests" +function test_insert_delete(T::Type, keys) + d = (T==ObjectIdDict) ? T() : T{eltype(keys),Int}() + dict_insertion_test(d, keys) # fill d + @gc_disable @timeit dict_ins_del_test(d,keys) "$T\_ins_del" "$T: insertion and deletion tests" end -function test_iterations(T::Type) - d = (T==ObjectIdDict) ? T() : T{ASCIIString,Int}() - t = 0.0 - dict_insertion_test(d) # fill d +function test_iterations(T::Type, keys) + d = (T==ObjectIdDict) ? 
T() : T{eltype(keys),Int}() + dict_insertion_test(d, keys) # fill d @gc_disable @timeit dict_iterator(d) "$T\_iter" "$T: iteration tests" end function run(T) - for test in [test_insert, test_delelete, test_insert_delete, test_iterations] - times = test(T) + for (keys,desc) in {(strs,"str"), (nums2,"Int")} + println(desc) # TODO remove + for test in [test_insert, test_delete, test_insert_delete, test_iterations] + times = test(T, keys) + end end end -# TODO: remove -@printf "min , max , mean , std , name\n" +@printf "min , max , mean , std , name\n" # TODO: remove for DictToTest in DictsToTest - println(" ") + println(" ") # TODO: remove @timeit dict_unittests(DictToTest) "$DictToTest\_unitt" "$DictToTest: dict-unit tests" run(DictToTest) end + From 48867cec10d295e5388822996e171919a9c7f9e1 Mon Sep 17 00:00:00 2001 From: Mauro Werder Date: Tue, 24 Jun 2014 18:08:05 +0100 Subject: [PATCH 5/5] Now dispatching on isequal as well. And fixed a bug in ObjectIdDict2. --- base/dict.jl | 9 ++++++--- test/perf/dicts/perf.jl | 5 ++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/base/dict.jl b/base/dict.jl index 2568afeceb4ba..b52337f3b7519 100644 --- a/base/dict.jl +++ b/base/dict.jl @@ -419,6 +419,8 @@ numslots(h::HashDictionary) = length(h.slots) # Transforms a key into an index. sz has to be a power of 2. hashindex(::HashDictionary, key, sz) = (int(hash(key)) & (sz-1)) + 1 +# Equality test to use +isequalkey(::HashDictionary, key1, key2) = isequal(key1, key2) # Key checking & converting as a key is stored (setindex!, get!, # etc. but not get). @@ -427,7 +429,7 @@ hashindex(::HashDictionary, key, sz) = (int(hash(key)) & (sz-1)) + 1 # unconverted one. function keyconvert{K,V}(h::HashDictionary{K,V}, key0) key = convert(K, key0) - !isequal(key, key0) ? error(key0, " is not a valid key for type ", K) : key + !isequalkey(h, key, key0) ? 
error(key0, " is not a valid key for type ", K) : key end @@ -562,7 +564,7 @@ function ht_keyindex{K,V}(h::HashDictionary{K,V}, key) if isslotempty(h,index) break end - if !isslotmissing(h,index) && isequal(key, gkey(h, index)) + if !isslotmissing(h,index) && isequalkey(h, key, gkey(h, index)) return index end topurge(h,key) && _delete!(h, index) @@ -596,7 +598,7 @@ function ht_keyindex!{K,V}(h::HashDictionary{K,V}, key) # in case "key" already exists in a later collided slot. avail = -index end - elseif isequal(key, gkey(h, index)) + elseif isequalkey(h, key, gkey(h, index)) return index end topurge(h,key) && _delete!(h, index) @@ -866,6 +868,7 @@ hashindex(::OIdDicts, key, sz) = (int(object_id(key)) & (sz-1)) + 1 # object_id function keyconvert{K,V}(h::OIdDicts{K,V}, key0) # no conversion as that can create a new object. !isa(key0, K) ? error(key0, " is not a valid Object-Id-Dict key for type ", K) : key0 end +isequalkey(::OIdDicts, key1, key2) = key1===key2 ## Weak keys # TODO: Constructors working on arrays will not add finalizers! diff --git a/test/perf/dicts/perf.jl b/test/perf/dicts/perf.jl index 37e7a9380fc5f..58df9752fe6d4 100644 --- a/test/perf/dicts/perf.jl +++ b/test/perf/dicts/perf.jl @@ -40,7 +40,10 @@ macro gc_disable(ex) end # TODO: update: -DictsToTest = [Dict, ObjectIdDict] #, WeakKeyDict, Base.ObjectIdDict2, Base.WeakObjectIdDict, Base.OrderedDict] +DictsToTest = [Dict, ObjectIdDict] +if isdefined(Base, :ObjectIdDict2) + append!(DictsToTest, [WeakKeyDict, Base.ObjectIdDict2, Base.WeakObjectIdDict, Base.OrderedDict]) +end srand(1) obidtest = true # if set to false test for ObjectIdDict will error