Skip to content

Commit

Permalink
Remove strong ordering of annotation ranges
Browse files Browse the repository at this point in the history
After a long chat with Lilith Halfner, we've come to the conclusion that
the range-based ordering applied to annotations, while nice for making
some otherwise O(n) code O(log n) and O(n^2) code O(n), is actually
assuming too much about how annotations are used and interact with each
other. Removing all assumptions about ordering, and giving annotation
order primacy seems like the most sensible thing to do, even if it makes
a few bits of the code less algorithmically "nice".

As a consequence, we also get rid of annotatedstring_optimize!. Specific
producers/consumers of annotated text will know what assumptions can be
made to compress/optimise the annotations used, and are thus best suited
to do so themselves. The one exception to this is probably when writing
to an AnnotatedIOBuffer, here adding a specific optimisation to
_insert_annotations! probably makes sense, and will be explored soon.
  • Loading branch information
tecosaur authored and pull[bot] committed May 29, 2024
1 parent e8bbe47 commit 08d6ae1
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 81 deletions.
82 changes: 19 additions & 63 deletions base/strings/annotated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -255,56 +255,26 @@ annotatedstring(c::AnnotatedChar) =

AnnotatedString(s::SubString{<:AnnotatedString}) = annotatedstring(s)

"""
annotatedstring_optimize!(str::AnnotatedString)
Merge contiguous identical annotations in `str`.
"""
function annotatedstring_optimize!(s::AnnotatedString)
last_seen = Dict{Pair{Symbol, Any}, Int}()
i = 1
while i <= length(s.annotations)
region, keyval = s.annotations[i]
prev = get(last_seen, keyval, 0)
if prev > 0
lregion, _ = s.annotations[prev]
if last(lregion) + 1 == first(region)
s.annotations[prev] =
setindex(s.annotations[prev],
first(lregion):last(region),
1)
deleteat!(s.annotations, i)
else
delete!(last_seen, keyval)
end
else
last_seen[keyval] = i
i += 1
end
end
s
end

function repeat(str::AnnotatedString, r::Integer)
r == 0 && return one(AnnotatedString)
r == 1 && return str
unannot = repeat(str.string, r)
annotations = Vector{Tuple{UnitRange{Int}, Pair{Symbol, Any}}}()
len = ncodeunits(str)
fullregion = firstindex(str):lastindex(str)
for (region, annot) in str.annotations
if region == fullregion
push!(annotations, (firstindex(unannot):lastindex(unannot), annot))
if allequal(first, str.annotations) && first(first(str.annotations)) == fullregion
newfullregion = firstindex(unannot):lastindex(unannot)
for (_, annot) in str.annotations
push!(annotations, (newfullregion, annot))
end
end
for offset in 0:len:(r-1)*len
for (region, annot) in str.annotations
if region != fullregion
else
for offset in 0:len:(r-1)*len
for (region, annot) in str.annotations
push!(annotations, (region .+ offset, annot))
end
end
end
AnnotatedString(unannot, annotations) |> annotatedstring_optimize!
AnnotatedString(unannot, annotations)
end

repeat(str::SubString{<:AnnotatedString}, r::Integer) =
Expand Down Expand Up @@ -335,14 +305,9 @@ reverse(s::SubString{<:AnnotatedString}) = reverse(AnnotatedString(s))
function _annotate!(annlist::Vector{Tuple{UnitRange{Int}, Pair{Symbol, Any}}}, range::UnitRange{Int}, @nospecialize(labelval::Pair{Symbol, <:Any}))
label, val = labelval
if val === nothing
indices = searchsorted(annlist, (range,), by=first)
labelindex = filter(i -> first(annlist[i][2]) === label, indices)
for index in Iterators.reverse(labelindex)
deleteat!(annlist, index)
end
deleteat!(annlist, findall(ann -> ann[1] == range && first(ann[2]) === label, annlist))
else
sortedindex = searchsortedlast(annlist, (range,), by=first) + 1
insert!(annlist, sortedindex, (range, Pair{Symbol, Any}(label, val)))
push!(annlist, (range, Pair{Symbol, Any}(label, val)))
end
end

Expand Down Expand Up @@ -521,7 +486,7 @@ end
function _clear_annotations_in_region!(annotations::Vector{Tuple{UnitRange{Int}, Pair{Symbol, Any}}}, span::UnitRange{Int})
# Clear out any overlapping pre-existing annotations.
filter!(((region, _),) -> first(region) < first(span) || last(region) > last(span), annotations)
extras = Tuple{UnitRange{Int}, Pair{Symbol, Any}}[]
extras = Tuple{Int, Tuple{UnitRange{Int}, Pair{Symbol, Any}}}[]
for i in eachindex(annotations)
region, annot = annotations[i]
# Test for partial overlap
Expand All @@ -532,30 +497,21 @@ function _clear_annotations_in_region!(annotations::Vector{Tuple{UnitRange{Int},
# If `span` fits exactly within `region`, then we've only copied over
# the beginning overhang, but also need to conserve the end overhang.
if first(region) < first(span) && last(span) < last(region)
push!(extras, (last(span)+1:last(region), annot))
push!(extras, (i, (last(span)+1:last(region), annot)))
end
end
# Insert any extra entries in the appropriate position
for entry in extras
sortedindex = searchsortedlast(annotations, (first(entry),), by=first) + 1
insert!(annotations, sortedindex, entry)
end
end
# Insert any extra entries in the appropriate position
for (offset, (i, entry)) in enumerate(extras)
insert!(annotations, i + offset, entry)
end
annotations
end

function _insert_annotations!(io::AnnotatedIOBuffer, annotations::Vector{Tuple{UnitRange{Int}, Pair{Symbol, Any}}}, offset::Int = position(io))
if !eof(io)
for (region, annot) in annotations
region = first(region)+offset:last(region)+offset
sortedindex = searchsortedlast(io.annotations, (region,), by=first) + 1
insert!(io.annotations, sortedindex, (region, annot))
end
else
for (region, annot) in annotations
region = first(region)+offset:last(region)+offset
push!(io.annotations, (region, annot))
end
for (region, annot) in annotations
region = first(region)+offset:last(region)+offset
push!(io.annotations, (region, annot))
end
end

Expand Down
29 changes: 11 additions & 18 deletions test/strings/annotated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@
@test Base.AnnotatedString(str[3:4]) ==
Base.AnnotatedString("me", [(1:2, :thing => 0x01), (1:2, :all => 0x03)])
@test Base.AnnotatedString(str[3:6]) ==
Base.AnnotatedString("me s", [(1:2, :thing => 0x01), (1:4, :all => 0x03), (4:4, :other => 0x02)])
@test str == Base.AnnotatedString("some string", [(1:4, :thing => 0x01), (1:11, :all => 0x03), (6:11, :other => 0x02)])
Base.AnnotatedString("me s", [(1:2, :thing => 0x01), (4:4, :other => 0x02), (1:4, :all => 0x03)])
@test str == Base.AnnotatedString("some string", [(1:4, :thing => 0x01), (6:11, :other => 0x02), (1:11, :all => 0x03)])
@test str != Base.AnnotatedString("some string")
@test str != Base.AnnotatedString("some string", [(1:1, :thing => 0x01), (6:6, :other => 0x02), (11:11, :all => 0x03)])
@test str != Base.AnnotatedString("some string", [(1:1, :thing => 0x01), (1:11, :all => 0x03), (6:6, :other => 0x02)])
@test str != Base.AnnotatedString("some string", [(1:4, :thing => 0x11), (1:11, :all => 0x13), (6:11, :other => 0x12)])
@test str != Base.AnnotatedString("some thingg", [(1:4, :thing => 0x01), (1:11, :all => 0x03), (6:11, :other => 0x02)])
@test Base.AnnotatedString([Base.AnnotatedChar('a', [:a => 1]), Base.AnnotatedChar('b', [:b => 2])]) ==
Expand All @@ -51,15 +51,8 @@
# @test collect(Base.eachstyle(str)) ==
# [("some", [:thing => 0x01, :all => 0x03]),
# (" string", [:all => 0x03, :other => 0x02])]
@test ==(Base.annotatedstring_optimize!(
Base.AnnotatedString("abc", [(1:1, :val => 1),
(2:2, :val => 2),
(2:2, :val => 1),
(3:3, :val => 2)])),
Base.AnnotatedString("abc", [(1:2, :val => 1),
(2:3, :val => 2)]))
@test chopprefix(sprint(show, str), "Base.") ==
"AnnotatedString{String}(\"some string\", [(1:4, :thing => 0x01), (1:11, :all => 0x03), (6:11, :other => 0x02)])"
"AnnotatedString{String}(\"some string\", [(1:4, :thing => 0x01), (6:11, :other => 0x02), (1:11, :all => 0x03)])"
@test eval(Meta.parse(repr(str))) == str
@test sprint(show, MIME("text/plain"), str) == "\"some string\""
end
Expand Down Expand Up @@ -149,8 +142,8 @@ end
# Check `annotate!`, including region sorting
@test truncate(aio, 0).io.size == 0
@test write(aio, "hello world") == ncodeunits("hello world")
@test Base.annotate!(aio, 7:11, :tag => 2) === aio
@test Base.annotate!(aio, 1:5, :tag => 1) === aio
@test Base.annotate!(aio, 7:11, :tag => 2) === aio
@test Base.annotations(aio) == [(1:5, :tag => 1), (7:11, :tag => 2)]
# Reading
@test read(seekstart(deepcopy(aio.io)), String) == "hello world"
Expand Down Expand Up @@ -178,24 +171,24 @@ end
@test Base.annotations(aio) == [(1:5, :tag => 1), (7:11, :tag => 2)] # Should be unchanged
@test write(seek(aio, 0), Base.AnnotatedString("hey-o", [(1:5, :hey => 'o')])) == 5
@test read(seekstart(aio), String) == "hey-o alice"
@test Base.annotations(aio) == [(1:5, :hey => 'o'), (7:11, :tag => 2)] # First annotation should have been entirely replaced
@test Base.annotations(aio) == [(7:11, :tag => 2), (1:5, :hey => 'o')] # First annotation should have been entirely replaced
@test write(seek(aio, 7), Base.AnnotatedString("bbi", [(1:3, :hey => 'a')])) == 3 # a[lic => bbi]e ('alice' => 'abbie')
@test read(seekstart(aio), String) == "hey-o abbie"
@test Base.annotations(aio) == [(1:5, :hey => 'o'), (7:7, :tag => 2), (8:10, :hey => 'a'), (11:11, :tag => 2)]
@test Base.annotations(aio) == [(7:7, :tag => 2), (11:11, :tag => 2), (1:5, :hey => 'o'), (8:10, :hey => 'a')]
@test write(seek(aio, 0), Base.AnnotatedString("ab")) == 2 # Check first annotation's region is adjusted correctly
@test read(seekstart(aio), String) == "aby-o abbie"
@test Base.annotations(aio) == [(3:5, :hey => 'o'), (7:7, :tag => 2), (8:10, :hey => 'a'), (11:11, :tag => 2)]
@test Base.annotations(aio) == [(7:7, :tag => 2), (11:11, :tag => 2), (3:5, :hey => 'o'), (8:10, :hey => 'a')]
@test write(seek(aio, 3), Base.AnnotatedString("ss")) == 2
@test read(seekstart(aio), String) == "abyss abbie"
@test Base.annotations(aio) == [(3:3, :hey => 'o'), (7:7, :tag => 2), (8:10, :hey => 'a'), (11:11, :tag => 2)]
@test Base.annotations(aio) == [(7:7, :tag => 2), (11:11, :tag => 2), (3:3, :hey => 'o'), (8:10, :hey => 'a')]
# Writing one buffer to another
newaio = Base.AnnotatedIOBuffer()
@test write(newaio, seekstart(aio)) == 11
@test read(seekstart(newaio), String) == "abyss abbie"
@test Base.annotations(newaio) == Base.annotations(aio)
@test write(seek(newaio, 5), seek(aio, 5)) == 6
@test Base.annotations(newaio) == Base.annotations(aio)
@test sort(Base.annotations(newaio)) == sort(Base.annotations(aio))
@test write(newaio, seek(aio, 5)) == 6
@test read(seekstart(newaio), String) == "abyss abbie abbie"
@test Base.annotations(newaio) == vcat(Base.annotations(aio), [(13:13, :tag => 2), (14:16, :hey => 'a'), (17:17, :tag => 2)])
@test sort(Base.annotations(newaio)) == sort(vcat(Base.annotations(aio), [(13:13, :tag => 2), (14:16, :hey => 'a'), (17:17, :tag => 2)]))
end

0 comments on commit 08d6ae1

Please sign in to comment.