From 49143c55931327b9801a096d6de3db8cef6cf58e Mon Sep 17 00:00:00 2001 From: Curtis Vogt Date: Wed, 8 Mar 2017 10:29:42 -0600 Subject: [PATCH 01/10] Refactor date parsing to be fast and extensible --- base/dates/io.jl | 46 ++++++---- base/dates/parse.jl | 216 +++++++++++++++++++++++++++++--------------- test/dates/io.jl | 45 +++++---- 3 files changed, 201 insertions(+), 106 deletions(-) diff --git a/base/dates/io.jl b/base/dates/io.jl index 778e7e99641bca..bcfb9a3201936e 100644 --- a/base/dates/io.jl +++ b/base/dates/io.jl @@ -175,17 +175,17 @@ Delim(d::Char) = Delim{Char, 1}(d) Delim(d::String) = Delim{String, length(d)}(d) @inline function tryparsenext{N}(d::Delim{Char, N}, str, i::Int, len) - R = Nullable{Int64} + R = Nullable{Bool} for j=1:N i > len && return (R(), i) c, i = next(str, i) c != d.d && return (R(), i) end - return R(0), i + return R(true), i end @inline function tryparsenext{N}(d::Delim{String, N}, str, i::Int, len) - R = Nullable{Int64} + R = Nullable{Bool} i1 = i i2 = start(d.d) for j = 1:N @@ -198,7 +198,7 @@ end return R(), i1 end end - return R(0), i1 + return R(true), i1 end @inline function format(io, d::Delim, dt, locale) @@ -206,7 +206,7 @@ end end function _show_content{N}(io::IO, d::Delim{Char, N}) - if d.d in keys(SLOT_RULE) + if d.d in keys(FORMAT_SPECIFIERS) for i = 1:N write(io, '\\', d.d) end @@ -219,7 +219,7 @@ end function _show_content(io::IO, d::Delim) for c in d.d - if c in keys(SLOT_RULE) + if c in keys(FORMAT_SPECIFIERS) write(io, '\\') end write(io, c) @@ -236,8 +236,9 @@ end abstract type DayOfWeekToken end # special addition to Period types -# mapping format specifiers to period types -const SLOT_RULE = Dict{Char, Type}( +# Map format specifiers to, typically period, types. +# Note that Julia packages like TimeZones.jl can add additional specifiers. 
+const FORMAT_SPECIFIERS = Dict{Char, Type}( 'y' => Year, 'Y' => Year, 'm' => Month, @@ -252,13 +253,21 @@ const SLOT_RULE = Dict{Char, Type}( 's' => Millisecond, ) -slot_order(::Type{Date}) = (Year, Month, Day) -slot_order(::Type{DateTime}) = (Year, Month, Day, Hour, Minute, Second, Millisecond) - -slot_defaults(::Type{Date}) = map(Int64, (1, 1, 1)) -slot_defaults(::Type{DateTime}) = map(Int64, (1, 1, 1, 0, 0, 0, 0)) +const FORMAT_DEFAULTS = Dict{Type, Any}( + Year => Int64(1), + Month => Int64(1), + DayOfWeekToken => Int64(0), + Day => Int64(1), + Hour => Int64(0), + Minute => Int64(0), + Second => Int64(0), + Millisecond => Int64(0), +) -slot_types{T<:TimeType}(::Type{T}) = typeof(slot_defaults(T)) +const FORMAT_TRANSLATIONS = Dict{Type{<:TimeType}, Tuple}( + Date => (Year, Month, Day), + DateTime => (Year, Month, Day, Hour, Minute, Second, Millisecond), +) """ DateFormat(format::AbstractString, locale="english") -> DateFormat @@ -300,13 +309,13 @@ function DateFormat(f::AbstractString, locale::DateLocale=ENGLISH) prev = () prev_offset = 1 - letters = String(collect(keys(Base.Dates.SLOT_RULE))) + letters = String(collect(keys(FORMAT_SPECIFIERS))) for m in eachmatch(Regex("(? DateTime diff --git a/base/dates/parse.jl b/base/dates/parse.jl index e638319f10b09f..4d7973d0d57923 100644 --- a/base/dates/parse.jl +++ b/base/dates/parse.jl @@ -1,97 +1,132 @@ ### Parsing utilities -@generated function tryparse_internal{T<:TimeType, S, F}(::Type{T}, str::AbstractString, df::DateFormat{S, F}, raise::Bool=false) - token_types = Type[dp <: DatePart ? 
SLOT_RULE[first(dp.parameters)] : Void for dp in F.parameters] - N = length(F.parameters) - - types = slot_order(T) - num_types = length(types) - order = Vector{Int}(num_types) - for i = 1:num_types - order[i] = findfirst(token_types, types[i]) +function directives{S,F}(::Type{DateFormat{S,F}}) + tokens = F.parameters + di = 1 + directive_index = zeros(Int, length(tokens)) + directive_letters = sizehint!(Char[], length(tokens)) + for (i, token) in enumerate(tokens) + if token <: DatePart + directive_index[i] = di + + letter = first(token.parameters) + push!(directive_letters, letter) + + di += 1 + end end + return tokens, directive_index, directive_letters +end + +genvar(t::DataType) = Symbol(lowercase(string(t.name.name))) - field_defaults = slot_defaults(T) - field_order = tuple(order...) - tuple_type = slot_types(T) - # `slot_order`, `slot_defaults`, and `slot_types` return tuples of the same length - assert(num_types == length(field_order) == length(field_defaults)) +@generated function tryparse_core(str::AbstractString, df::DateFormat, raise::Bool=false) + token_types, directive_index, directive_letters = directives(df) + + directive_types = Type[FORMAT_SPECIFIERS[letter] for letter in directive_letters] + directive_names = Symbol[genvar(t) for t in directive_types] + directive_defaults = Tuple(FORMAT_DEFAULTS[t] for t in directive_types) + R = typeof(directive_defaults) + + # Pre-assign output variables to default values. Allows us to use `@goto done` without + # worrying about unassigned variables. 
+ assign_defaults = Expr[ + quote + $name = $default + end + for (name, default) in zip(directive_names, directive_defaults) + ] + + parsers = Expr[ + begin + di = directive_index[i] + if di != 0 + name = directive_names[di] + nullable = Symbol(:nullable_, name) + quote + pos > len && @goto done + $nullable, next_pos = tryparsenext(tokens[$i], str, pos, len, locale) + isnull($nullable) && @goto error + $name = unsafe_get($nullable) + pos = next_pos + directive_idx += 1 + token_idx += 1 + end + else + quote + pos > len && @goto done + nullable_delim, next_pos = tryparsenext(tokens[$i], str, pos, len, locale) + isnull(nullable_delim) && @goto error + pos = next_pos + token_idx += 1 + end + end + end + for i in 1:length(token_types) + ] quote - R = Nullable{$tuple_type} - t = df.tokens - l = df.locale + tokens = df.tokens + locale::DateLocale = df.locale pos, len = start(str), endof(str) + directive_idx = 0 + token_idx = 1 + + $(assign_defaults...) + $(parsers...) - err_idx = 1 - Base.@nexprs $N i->val_i = 0 - Base.@nexprs $N i->(begin - pos > len && @goto done - nv, next_pos = tryparsenext(t[i], str, pos, len, l) - isnull(nv) && @goto error - val_i, pos = unsafe_get(nv), next_pos - err_idx += 1 - end) - pos <= len && @goto error + pos > len || @goto error @label done - parts = Base.@ntuple $N val - return R(reorder_args(parts, $field_order, $field_defaults, err_idx)::$tuple_type) + return Nullable{$R}($(Expr(:tuple, directive_names...))), directive_idx @label error # Note: Keeping exception generation in separate function helps with performance - raise && throw(gen_exception(t, err_idx, pos)) - return R() + if raise + if token_idx > length(tokens) + throw(ArgumentError("Found extra characters at the end of date time string")) + else + throw(ArgumentError("Unable to parse date time. 
Expected token $(tokens[token_idx]) at char $pos")) + end + end + return Nullable{$R}(), 0 end end -function gen_exception(tokens, err_idx, pos) - if err_idx > length(tokens) - ArgumentError("Found extra characters at the end of date time string") - else - ArgumentError("Unable to parse date time. Expected token $(tokens[err_idx]) at char $pos") - end -end -# reorder_args(val, idx, default, default_from) -# -# reorder elements of `val` tuple according to `idx` tuple. Use `default[i]` -# when `idx[i] == 0` or i >= default_from -# -# returns a tuple `xs` of the same length as `idx` where `xs[i]` is -# `val[idx[i]]` if `idx[i]` is non zero, `default[i]` if `idx[i]` is zero. -# -# `xs[i]` is `default[i]` for all i >= `default_from`. -# -# -function reorder_args{N}(val::Tuple, idx::NTuple{N}, default::Tuple, default_from::Integer) - ntuple(Val{N}) do i - if idx[i] == 0 || idx[i] >= default_from - default[i] - else - val[idx[i]] - end - end -end +@generated function tryparse_internal{T<:TimeType}( + ::Type{T}, str::AbstractString, df::DateFormat, raise::Bool=false, +) + token_types, directive_index, directive_letters = directives(df) -function Base.tryparse{T<:TimeType}(::Type{T}, str::AbstractString, df::DateFormat) - nt = tryparse_internal(T, str, df, false) - if isnull(nt) - return Nullable{T}() - else - return Nullable{T}(T(unsafe_get(nt)...)) - end -end + directive_types = Type[FORMAT_SPECIFIERS[letter] for letter in directive_letters] + directive_names = Symbol[genvar(t) for t in directive_types] -default_format(::Type{Date}) = ISODateFormat -default_format(::Type{DateTime}) = ISODateTimeFormat + output_types = FORMAT_TRANSLATIONS[T] + output_names = Symbol[genvar(t) for t in output_types] + output_defaults = Tuple(FORMAT_DEFAULTS[t] for t in output_types) + R = typeof(output_defaults) -function Base.parse{T<:TimeType}(::Type{T}, - str::AbstractString, - df::DateFormat=default_format(T)) - nt = tryparse_internal(T, str, df, true) - T(unsafe_get(nt)...) 
+ # Pre-assign output variables to default values. Ensures that all output variables are + # assigned as the format directives may not include all of the required variables. + assign_defaults = Expr[ + quote + $name = $default + end + for (name, default) in zip(output_names, output_defaults) + ] + + # Unpacks the tuple into various directive variables. + directive_tuple = Expr(:tuple, directive_names...) + + quote + values, index = tryparse_core(str, df, raise) + isnull(values) && return Nullable{$R}() + $(assign_defaults...) + $directive_tuple = unsafe_get(values) + Nullable{$R}($(Expr(:tuple, output_names...))) + end end @inline function tryparsenext_base10(str::AbstractString, i::Int, len::Int, min_width::Int=1, max_width::Int=0) @@ -200,3 +235,38 @@ function Base.parse(::Type{DateTime}, s::AbstractString, df::typeof(ISODateTimeF @label error throw(ArgumentError("Invalid DateTime string")) end + +function Base.parse{T<:TimeType}( + ::Type{T}, str::AbstractString, df::DateFormat=default_format(T), +) + nt = tryparse_internal(T, str, df, true) + T(unsafe_get(nt)...) 
+end + +function Base.tryparse{T<:TimeType}( + ::Type{T}, str::AbstractString, df::DateFormat=default_format(T), +) + nt = tryparse_internal(T, str, df, false) + if isnull(nt) + Nullable{T}() + else + Nullable{T}(T(unsafe_get(nt)...)) + end +end + +@generated function Base.parse(::Type{Vector}, str::AbstractString, df::DateFormat) + token_types, directive_index, directive_letters = directives(df) + directive_types = Type[FORMAT_SPECIFIERS[letter] for letter in directive_letters] + + quote + nt, num_parsed = tryparse_core(str, df, true) + t = unsafe_get(nt) + directive_types = $(Expr(:tuple, directive_types...)) + result = Vector{Any}(num_parsed) + for (i, typ) in enumerate(directive_types) + i > num_parsed && break + result[i] = typ(t[i]) # Constructing types takes most of the time + end + return result + end +end diff --git a/test/dates/io.jl b/test/dates/io.jl index 7a3790478c2fc5..a660a08178361f 100644 --- a/test/dates/io.jl +++ b/test/dates/io.jl @@ -76,18 +76,18 @@ b2 = "96/Feb/1" b3 = "96/2/15" @test_throws ArgumentError Dates.DateTime(b3, f) try - Dates.tryparse_internal(DateTime, "2012/02/20T09:09:31.25i90", dateformat"yyyy/mm/ddTHH:MM:SS.s", true) + Dates.tryparse_internal(DateTime, "2012/2/20T9:9:31.25i90", dateformat"yyyy/mm/ddTHH:MM:SS.s", true) @test false catch err @test isa(err, ArgumentError) @test err.msg == "Found extra characters at the end of date time string" end try - Dates.tryparse_internal(DateTime, "2012/02/20T09:09:3i90", dateformat"yyyy/mm/ddTHH:MM:SS.s", true) + Dates.tryparse_internal(DateTime, "2012/2/20T9:9:3i90", dateformat"yyyy/mm/ddTHH:MM:SS.s", true) @test false catch err @test isa(err, ArgumentError) - @test err.msg == "Unable to parse date time. Expected token Delim(.) at char 19" + @test err.msg == "Unable to parse date time. Expected token Delim(.) 
at char 16" end f = "yy:dd:mm" @@ -375,29 +375,42 @@ let f = "YY" end # Issue: https://github.com/quinnj/TimeZones.jl/issues/19 -let ds = "2015-07-24T05:38:19.591Z", - dt = Dates.DateTime(2015, 7, 24, 5, 38, 19, 591), +let + const Zulu = String - format = "yyyy-mm-ddTHH:MM:SS.sssZ", + function Dates.tryparsenext(d::Dates.DatePart{'Z'}, str, i, len) + Dates.tryparsenext_word(str, i, len, Dates.min_width(d), Dates.max_width(d)) + end + + ds = "2015-07-24T05:38:19.591Z" + dt = Dates.DateTime(2015, 7, 24, 5, 38, 19, 591) + parsed = Any[ + Dates.Year(2015), Dates.Month(7), Dates.Day(24), + Dates.Hour(5), Dates.Minute(38), Dates.Second(19), Dates.Millisecond(591) + ] + + format = "yyyy-mm-ddTHH:MM:SS.sssZ" escaped_format = "yyyy-mm-dd\\THH:MM:SS.sss\\Z" - # Typically 'Z' isn't treated as a slot so it doesn't have to be escaped - @test DateTime(ds, format) == dt - @test DateTime(ds, escaped_format) == dt + # Typically 'Z' isn't treated as a specifier so it doesn't have to be escaped + @test parse(Vector, ds, Dates.DateFormat(format)) == parsed + @test parse(Vector, ds, Dates.DateFormat(escaped_format)) == parsed try - # Make 'Z' into a slot - Dates.SLOT_RULE['Z'] = Dates.TimeZone + # Make 'Z' into a specifier + Dates.FORMAT_SPECIFIERS['Z'] = Zulu + Dates.FORMAT_DEFAULTS[Zulu] = "" - @test_throws MethodError DateTime(ds, format) - @test DateTime(ds, escaped_format) == dt + @test parse(Vector, ds, Dates.DateFormat(format)) == [parsed; Zulu("Z")] + @test parse(Vector, ds, Dates.DateFormat(escaped_format)) == parsed finally - delete!(Dates.SLOT_RULE, 'Z') + delete!(Dates.FORMAT_SPECIFIERS, 'Z') + delete!(Dates.FORMAT_DEFAULTS, Zulu) end # Ensure that the default behaviour has been restored - @test DateTime(ds, format) == dt - @test DateTime(ds, escaped_format) == dt + @test parse(Vector, ds, Dates.DateFormat(format)) == parsed + @test parse(Vector, ds, Dates.DateFormat(escaped_format)) == parsed end # Issue 10817 From eac850fc9c1a01433e60cd5bf2126ddab253638f Mon Sep 17 
00:00:00 2001 From: Curtis Vogt Date: Wed, 8 Mar 2017 14:59:20 -0600 Subject: [PATCH 02/10] Deprecate parse(::AbstractString, ::DateFormat) --- base/deprecated.jl | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/base/deprecated.jl b/base/deprecated.jl index 9ecfd1843f90f9..3bba48e9755f91 100644 --- a/base/deprecated.jl +++ b/base/deprecated.jl @@ -1278,6 +1278,18 @@ end @deprecate_binding LinearSlow IndexCartesian false @deprecate_binding linearindexing IndexStyle false +# #20876 +@eval Base.Dates begin + function Base.Dates.parse(x::AbstractString, df::DateFormat) + Base.depwarn(string( + "`Dates.parse(x::AbstractString, df::DateFormat)` is deprecated, use ", + "`sort!(filter!(el -> isa(el, Dates.Period), parse(Vector, x, df), rev=true, lt=Dates.periodisless)` " + " instead.", :parse) + # sort!([el for el in parse(Vector, x, df) if isa(el, Period)], rev=true, lt=periodisless) + sort!(filter!(el -> isa(el, Period), parse(Vector, x, df)), rev=true, lt=periodisless) + end +end + # END 0.6 deprecations # BEGIN 1.0 deprecations From c5bd3c3fc7dd6a52311d7940c11e45753206ffe7 Mon Sep 17 00:00:00 2001 From: Curtis Vogt Date: Wed, 8 Mar 2017 15:55:43 -0600 Subject: [PATCH 03/10] fixup --- base/deprecated.jl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/base/deprecated.jl b/base/deprecated.jl index 3bba48e9755f91..0ed1656303f0f0 100644 --- a/base/deprecated.jl +++ b/base/deprecated.jl @@ -1283,9 +1283,8 @@ end function Base.Dates.parse(x::AbstractString, df::DateFormat) Base.depwarn(string( "`Dates.parse(x::AbstractString, df::DateFormat)` is deprecated, use ", - "`sort!(filter!(el -> isa(el, Dates.Period), parse(Vector, x, df), rev=true, lt=Dates.periodisless)` " - " instead.", :parse) - # sort!([el for el in parse(Vector, x, df) if isa(el, Period)], rev=true, lt=periodisless) + "`sort!(filter!(el -> isa(el, Dates.Period), parse(Vector, x, df), rev=true, lt=Dates.periodisless)` ", + " instead."), :parse) sort!(filter!(el -> isa(el, 
Period), parse(Vector, x, df)), rev=true, lt=periodisless) end end From ad19d232116b6c3ab21fa2dde3a49540d7572d2f Mon Sep 17 00:00:00 2001 From: Curtis Vogt Date: Fri, 10 Mar 2017 13:05:48 -0600 Subject: [PATCH 04/10] Switch to datatype_name --- base/dates/parse.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/dates/parse.jl b/base/dates/parse.jl index 4d7973d0d57923..e6f27d978f5036 100644 --- a/base/dates/parse.jl +++ b/base/dates/parse.jl @@ -18,7 +18,7 @@ function directives{S,F}(::Type{DateFormat{S,F}}) return tokens, directive_index, directive_letters end -genvar(t::DataType) = Symbol(lowercase(string(t.name.name))) +genvar(t::DataType) = Symbol(lowercase(string(Base.datatype_name(t)))) @generated function tryparse_core(str::AbstractString, df::DateFormat, raise::Bool=false) From d833ec43ddbf20b395e295d5c2aa2946538595cc Mon Sep 17 00:00:00 2001 From: Curtis Vogt Date: Fri, 10 Mar 2017 12:23:52 -0600 Subject: [PATCH 05/10] Move towards consistent terminology --- base/dates/io.jl | 25 ++++++---- base/dates/parse.jl | 119 ++++++++++++++++++++++---------------------- test/dates/io.jl | 10 ++-- 3 files changed, 80 insertions(+), 74 deletions(-) diff --git a/base/dates/io.jl b/base/dates/io.jl index bcfb9a3201936e..29170630395e13 100644 --- a/base/dates/io.jl +++ b/base/dates/io.jl @@ -206,7 +206,7 @@ end end function _show_content{N}(io::IO, d::Delim{Char, N}) - if d.d in keys(FORMAT_SPECIFIERS) + if d.d in keys(CONVERSION_SPECIFIERS) for i = 1:N write(io, '\\', d.d) end @@ -219,7 +219,7 @@ end function _show_content(io::IO, d::Delim) for c in d.d - if c in keys(FORMAT_SPECIFIERS) + if c in keys(CONVERSION_SPECIFIERS) write(io, '\\') end write(io, c) @@ -236,9 +236,9 @@ end abstract type DayOfWeekToken end # special addition to Period types -# Map format specifiers to, typically period, types. -# Note that Julia packages like TimeZones.jl can add additional specifiers. 
-const FORMAT_SPECIFIERS = Dict{Char, Type}( +# Map conversion specifiers or character codes to tokens. +# Note: Allow addition of new character codes added by packages +const CONVERSION_SPECIFIERS = Dict{Char, Type}( 'y' => Year, 'Y' => Year, 'm' => Month, @@ -253,7 +253,10 @@ const FORMAT_SPECIFIERS = Dict{Char, Type}( 's' => Millisecond, ) -const FORMAT_DEFAULTS = Dict{Type, Any}( +# Default values are needed when a conversion specifier is used in a DateFormat for parsing +# and we have reached the end of the input string. +# Note: Allow `Any` value as a default to support extensibility +const CONVERSION_DEFAULTS = Dict{Type, Any}( Year => Int64(1), Month => Int64(1), DayOfWeekToken => Int64(0), @@ -264,7 +267,9 @@ const FORMAT_DEFAULTS = Dict{Type, Any}( Millisecond => Int64(0), ) -const FORMAT_TRANSLATIONS = Dict{Type{<:TimeType}, Tuple}( +# Specifies the required fields in order to parse a TimeType +# Note: Allows for addition of new TimeTypes +const CONVERSION_TRANSLATIONS = Dict{Type{<:TimeType}, Tuple}( Date => (Year, Month, Day), DateTime => (Year, Month, Day, Hour, Minute, Second, Millisecond), ) @@ -309,13 +314,13 @@ function DateFormat(f::AbstractString, locale::DateLocale=ENGLISH) prev = () prev_offset = 1 - letters = String(collect(keys(FORMAT_SPECIFIERS))) + letters = String(collect(keys(CONVERSION_SPECIFIERS))) for m in eachmatch(Regex("(? 
len && @goto done - $nullable, next_pos = tryparsenext(tokens[$i], str, pos, len, locale) + $nullable, next_pos = tryparsenext(directives[$i], str, pos, len, locale) isnull($nullable) && @goto error $name = unsafe_get($nullable) pos = next_pos - directive_idx += 1 - token_idx += 1 + num_parsed += 1 + directive_index += 1 end else quote pos > len && @goto done - nullable_delim, next_pos = tryparsenext(tokens[$i], str, pos, len, locale) + nullable_delim, next_pos = tryparsenext(directives[$i], str, pos, len, locale) isnull(nullable_delim) && @goto error pos = next_pos - token_idx += 1 + directive_index += 1 end end end - for i in 1:length(token_types) + for i in 1:length(directives) ] quote - tokens = df.tokens + directives = df.tokens locale::DateLocale = df.locale pos, len = start(str), endof(str) - directive_idx = 0 - token_idx = 1 + num_parsed = 0 + directive_index = 1 $(assign_defaults...) $(parsers...) @@ -79,15 +77,16 @@ genvar(t::DataType) = Symbol(lowercase(string(Base.datatype_name(t)))) pos > len || @goto error @label done - return Nullable{$R}($(Expr(:tuple, directive_names...))), directive_idx + return Nullable{$R}($(Expr(:tuple, value_names...))), num_parsed @label error # Note: Keeping exception generation in separate function helps with performance if raise - if token_idx > length(tokens) + if directive_index > length(directives) throw(ArgumentError("Found extra characters at the end of date time string")) else - throw(ArgumentError("Unable to parse date time. Expected token $(tokens[token_idx]) at char $pos")) + d = directives[directive_index] + throw(ArgumentError("Unable to parse date time. 
Expected directive $d at char $pos")) end end return Nullable{$R}(), 0 @@ -98,18 +97,19 @@ end @generated function tryparse_internal{T<:TimeType}( ::Type{T}, str::AbstractString, df::DateFormat, raise::Bool=false, ) - token_types, directive_index, directive_letters = directives(df) + letters = character_codes(df) - directive_types = Type[FORMAT_SPECIFIERS[letter] for letter in directive_letters] - directive_names = Symbol[genvar(t) for t in directive_types] + tokens = Type[CONVERSION_SPECIFIERS[letter] for letter in letters] + value_names = Symbol[genvar(t) for t in tokens] - output_types = FORMAT_TRANSLATIONS[T] + output_types = CONVERSION_TRANSLATIONS[T] output_names = Symbol[genvar(t) for t in output_types] - output_defaults = Tuple(FORMAT_DEFAULTS[t] for t in output_types) + output_defaults = Tuple(CONVERSION_DEFAULTS[t] for t in output_types) R = typeof(output_defaults) - # Pre-assign output variables to default values. Ensures that all output variables are - # assigned as the format directives may not include all of the required variables. + # Pre-assign output variables to defaults. Ensures that all output variables are + # assigned as the tuple returned from `tryparse_core` may not include all of the + # required variables. assign_defaults = Expr[ quote $name = $default @@ -117,14 +117,14 @@ end for (name, default) in zip(output_names, output_defaults) ] - # Unpacks the tuple into various directive variables. - directive_tuple = Expr(:tuple, directive_names...) + # Unpacks the tuple returned by `tryparse_core` into separate variables. + value_tuple = Expr(:tuple, value_names...) quote - values, index = tryparse_core(str, df, raise) + values, num_parsed = tryparse_core(str, df, raise) isnull(values) && return Nullable{$R}() $(assign_defaults...) 
- $directive_tuple = unsafe_get(values) + $value_tuple = unsafe_get(values) Nullable{$R}($(Expr(:tuple, output_names...))) end end @@ -239,31 +239,32 @@ end function Base.parse{T<:TimeType}( ::Type{T}, str::AbstractString, df::DateFormat=default_format(T), ) - nt = tryparse_internal(T, str, df, true) - T(unsafe_get(nt)...) + values = tryparse_internal(T, str, df, true) + T(unsafe_get(values)...) end function Base.tryparse{T<:TimeType}( ::Type{T}, str::AbstractString, df::DateFormat=default_format(T), ) - nt = tryparse_internal(T, str, df, false) - if isnull(nt) + values = tryparse_internal(T, str, df, false) + if isnull(values) Nullable{T}() else - Nullable{T}(T(unsafe_get(nt)...)) + Nullable{T}(T(unsafe_get(values)...)) end end @generated function Base.parse(::Type{Vector}, str::AbstractString, df::DateFormat) - token_types, directive_index, directive_letters = directives(df) - directive_types = Type[FORMAT_SPECIFIERS[letter] for letter in directive_letters] + letters = character_codes(df) + + tokens = Type[CONVERSION_SPECIFIERS[letter] for letter in letters] quote - nt, num_parsed = tryparse_core(str, df, true) - t = unsafe_get(nt) - directive_types = $(Expr(:tuple, directive_types...)) + values, num_parsed = tryparse_core(str, df, true) + t = unsafe_get(values) + types = $(Expr(:tuple, tokens...)) result = Vector{Any}(num_parsed) - for (i, typ) in enumerate(directive_types) + for (i, typ) in enumerate(types) i > num_parsed && break result[i] = typ(t[i]) # Constructing types takes most of the time end diff --git a/test/dates/io.jl b/test/dates/io.jl index a660a08178361f..f4403d186fc10e 100644 --- a/test/dates/io.jl +++ b/test/dates/io.jl @@ -87,7 +87,7 @@ try @test false catch err @test isa(err, ArgumentError) - @test err.msg == "Unable to parse date time. Expected token Delim(.) at char 16" + @test err.msg == "Unable to parse date time. Expected directive Delim(.) 
at char 16" end f = "yy:dd:mm" @@ -398,14 +398,14 @@ let try # Make 'Z' into a specifier - Dates.FORMAT_SPECIFIERS['Z'] = Zulu - Dates.FORMAT_DEFAULTS[Zulu] = "" + Dates.CONVERSION_SPECIFIERS['Z'] = Zulu + Dates.CONVERSION_DEFAULTS[Zulu] = "" @test parse(Vector, ds, Dates.DateFormat(format)) == [parsed; Zulu("Z")] @test parse(Vector, ds, Dates.DateFormat(escaped_format)) == parsed finally - delete!(Dates.FORMAT_SPECIFIERS, 'Z') - delete!(Dates.FORMAT_DEFAULTS, Zulu) + delete!(Dates.CONVERSION_SPECIFIERS, 'Z') + delete!(Dates.CONVERSION_DEFAULTS, Zulu) end # Ensure that the default behaviour has been restored From f85877eb1b1b2cfd240b77019a6759292516c2ba Mon Sep 17 00:00:00 2001 From: Curtis Vogt Date: Fri, 10 Mar 2017 11:12:09 -0600 Subject: [PATCH 06/10] Rename parse(::Vector, ...) to parse_components --- base/dates/parse.jl | 3 +-- base/deprecated.jl | 4 ++-- test/dates/io.jl | 25 ++++++++++++++----------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/base/dates/parse.jl b/base/dates/parse.jl index 181daea4c99ff1..7329fc9428d3d8 100644 --- a/base/dates/parse.jl +++ b/base/dates/parse.jl @@ -254,9 +254,8 @@ function Base.tryparse{T<:TimeType}( end end -@generated function Base.parse(::Type{Vector}, str::AbstractString, df::DateFormat) +@generated function parse_components(str::AbstractString, df::DateFormat) letters = character_codes(df) - tokens = Type[CONVERSION_SPECIFIERS[letter] for letter in letters] quote diff --git a/base/deprecated.jl b/base/deprecated.jl index 0ed1656303f0f0..c53ae967e6006c 100644 --- a/base/deprecated.jl +++ b/base/deprecated.jl @@ -1283,9 +1283,9 @@ end function Base.Dates.parse(x::AbstractString, df::DateFormat) Base.depwarn(string( "`Dates.parse(x::AbstractString, df::DateFormat)` is deprecated, use ", - "`sort!(filter!(el -> isa(el, Dates.Period), parse(Vector, x, df), rev=true, lt=Dates.periodisless)` ", + "`sort!(filter!(el -> isa(el, Dates.Period), Dates.parse_components(x, df), rev=true, lt=Dates.periodisless)` 
", " instead."), :parse) - sort!(filter!(el -> isa(el, Period), parse(Vector, x, df)), rev=true, lt=periodisless) + sort!(filter!(el -> isa(el, Period), parse_components(x, df)), rev=true, lt=periodisless) end end diff --git a/test/dates/io.jl b/test/dates/io.jl index f4403d186fc10e..69d4a7ae30aff2 100644 --- a/test/dates/io.jl +++ b/test/dates/io.jl @@ -29,10 +29,13 @@ # DateTime parsing # Useful reference for different locales: http://library.princeton.edu/departments/tsd/katmandu/reference/months.html -let str = "1996/02/15 24:00", format = "yyyy/mm/dd HH:MM" - expected = (1996, 2, 15, 24, 0, 0, 0) - @test get(Dates.tryparse_internal(DateTime, str, Dates.DateFormat(format))) == expected - @test_throws ArgumentError Dates.DateTime(str, Dates.DateFormat(format)) +# Allow parsing of strings which are not representable as a TimeType +let str = "02/15/1996 24:00", df = Dates.DateFormat("mm/dd/yyyy HH:MM") + parsed = Any[ + Dates.Month(2), Dates.Day(15), Dates.Year(1996), Dates.Hour(24), Dates.Minute(0) + ] + @test Dates.parse_components(str, df) == parsed + @test_throws ArgumentError Dates.parse(DateTime, str, df) end # DateFormat printing @@ -382,7 +385,7 @@ let Dates.tryparsenext_word(str, i, len, Dates.min_width(d), Dates.max_width(d)) end - ds = "2015-07-24T05:38:19.591Z" + str = "2015-07-24T05:38:19.591Z" dt = Dates.DateTime(2015, 7, 24, 5, 38, 19, 591) parsed = Any[ Dates.Year(2015), Dates.Month(7), Dates.Day(24), @@ -393,24 +396,24 @@ let escaped_format = "yyyy-mm-dd\\THH:MM:SS.sss\\Z" # Typically 'Z' isn't treated as a specifier so it doesn't have to be escaped - @test parse(Vector, ds, Dates.DateFormat(format)) == parsed - @test parse(Vector, ds, Dates.DateFormat(escaped_format)) == parsed + @test Dates.parse_components(str, Dates.DateFormat(format)) == parsed + @test Dates.parse_components(str, Dates.DateFormat(escaped_format)) == parsed try # Make 'Z' into a specifier Dates.CONVERSION_SPECIFIERS['Z'] = Zulu Dates.CONVERSION_DEFAULTS[Zulu] = "" - @test 
parse(Vector, ds, Dates.DateFormat(format)) == [parsed; Zulu("Z")] - @test parse(Vector, ds, Dates.DateFormat(escaped_format)) == parsed + @test Dates.parse_components(str, Dates.DateFormat(format)) == [parsed; Zulu("Z")] + @test Dates.parse_components(str, Dates.DateFormat(escaped_format)) == parsed finally delete!(Dates.CONVERSION_SPECIFIERS, 'Z') delete!(Dates.CONVERSION_DEFAULTS, Zulu) end # Ensure that the default behaviour has been restored - @test parse(Vector, ds, Dates.DateFormat(format)) == parsed - @test parse(Vector, ds, Dates.DateFormat(escaped_format)) == parsed + @test Dates.parse_components(str, Dates.DateFormat(format)) == parsed + @test Dates.parse_components(str, Dates.DateFormat(escaped_format)) == parsed end # Issue 10817 From 529affee47c0508c3a83f7c35696ae2c5271f77f Mon Sep 17 00:00:00 2001 From: Curtis Vogt Date: Fri, 10 Mar 2017 10:18:01 -0600 Subject: [PATCH 07/10] Internal parse funcs now take and return position --- base/dates/parse.jl | 35 ++++++++++++++++++++--------------- test/dates/io.jl | 4 ++-- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/base/dates/parse.jl b/base/dates/parse.jl index 7329fc9428d3d8..5834a309c109f6 100644 --- a/base/dates/parse.jl +++ b/base/dates/parse.jl @@ -17,7 +17,9 @@ end genvar(t::DataType) = Symbol(lowercase(string(Base.datatype_name(t)))) -@generated function tryparse_core(str::AbstractString, df::DateFormat, raise::Bool=false) +@generated function tryparsenext_core( + str::AbstractString, pos::Int, len::Int, df::DateFormat, raise::Bool=false, +) directives = _directives(df) letters = character_codes(directives) @@ -67,7 +69,7 @@ genvar(t::DataType) = Symbol(lowercase(string(Base.datatype_name(t)))) quote directives = df.tokens locale::DateLocale = df.locale - pos, len = start(str), endof(str) + num_parsed = 0 directive_index = 1 @@ -77,7 +79,7 @@ genvar(t::DataType) = Symbol(lowercase(string(Base.datatype_name(t)))) pos > len || @goto error @label done - return 
Nullable{$R}($(Expr(:tuple, value_names...))), num_parsed + return Nullable{$R}($(Expr(:tuple, value_names...))), pos, num_parsed @label error # Note: Keeping exception generation in separate function helps with performance @@ -89,13 +91,13 @@ genvar(t::DataType) = Symbol(lowercase(string(Base.datatype_name(t)))) throw(ArgumentError("Unable to parse date time. Expected directive $d at char $pos")) end end - return Nullable{$R}(), 0 + return Nullable{$R}(), pos, 0 end end -@generated function tryparse_internal{T<:TimeType}( - ::Type{T}, str::AbstractString, df::DateFormat, raise::Bool=false, +@generated function tryparsenext_internal{T<:TimeType}( + ::Type{T}, str::AbstractString, pos::Int, len::Int, df::DateFormat, raise::Bool=false, ) letters = character_codes(df) @@ -108,8 +110,8 @@ end R = typeof(output_defaults) # Pre-assign output variables to defaults. Ensures that all output variables are - # assigned as the tuple returned from `tryparse_core` may not include all of the - # required variables. + # assigned as the value tuple returned from `tryparsenext_core` may not include all + # of the required variables. assign_defaults = Expr[ quote $name = $default @@ -117,15 +119,15 @@ end for (name, default) in zip(output_names, output_defaults) ] - # Unpacks the tuple returned by `tryparse_core` into separate variables. + # Unpacks the value tuple returned by `tryparsenext_core` into separate variables. value_tuple = Expr(:tuple, value_names...) quote - values, num_parsed = tryparse_core(str, df, raise) - isnull(values) && return Nullable{$R}() + values, pos, num_parsed = tryparsenext_core(str, pos, len, df, raise) + isnull(values) && return Nullable{$R}(), pos $(assign_defaults...) 
$value_tuple = unsafe_get(values) - Nullable{$R}($(Expr(:tuple, output_names...))) + return Nullable{$R}($(Expr(:tuple, output_names...))), pos end end @@ -239,14 +241,16 @@ end function Base.parse{T<:TimeType}( ::Type{T}, str::AbstractString, df::DateFormat=default_format(T), ) - values = tryparse_internal(T, str, df, true) + pos, len = start(str), endof(str) + values, pos = tryparsenext_internal(T, str, pos, len, df, true) T(unsafe_get(values)...) end function Base.tryparse{T<:TimeType}( ::Type{T}, str::AbstractString, df::DateFormat=default_format(T), ) - values = tryparse_internal(T, str, df, false) + pos, len = start(str), endof(str) + values, pos = tryparsenext_internal(T, str, pos, len, df, false) if isnull(values) Nullable{T}() else @@ -259,7 +263,8 @@ end tokens = Type[CONVERSION_SPECIFIERS[letter] for letter in letters] quote - values, num_parsed = tryparse_core(str, df, true) + pos, len = start(str), endof(str) + values, pos, num_parsed = tryparsenext_core(str, pos, len, df, true) t = unsafe_get(values) types = $(Expr(:tuple, tokens...)) result = Vector{Any}(num_parsed) diff --git a/test/dates/io.jl b/test/dates/io.jl index 69d4a7ae30aff2..8564265d880cfe 100644 --- a/test/dates/io.jl +++ b/test/dates/io.jl @@ -79,14 +79,14 @@ b2 = "96/Feb/1" b3 = "96/2/15" @test_throws ArgumentError Dates.DateTime(b3, f) try - Dates.tryparse_internal(DateTime, "2012/2/20T9:9:31.25i90", dateformat"yyyy/mm/ddTHH:MM:SS.s", true) + Dates.parse(DateTime, "2012/2/20T9:9:31.25i90", dateformat"yyyy/mm/ddTHH:MM:SS.s") @test false catch err @test isa(err, ArgumentError) @test err.msg == "Found extra characters at the end of date time string" end try - Dates.tryparse_internal(DateTime, "2012/2/20T9:9:3i90", dateformat"yyyy/mm/ddTHH:MM:SS.s", true) + Dates.parse(DateTime, "2012/2/20T9:9:3i90", dateformat"yyyy/mm/ddTHH:MM:SS.s") @test false catch err @test isa(err, ArgumentError) From c16d62990ab1a829cce5fc19625bc637b523c67e Mon Sep 17 00:00:00 2001 From: Curtis Vogt Date: Fri, 10 
Mar 2017 15:45:42 -0600 Subject: [PATCH 08/10] Documentation for internal functions --- base/dates/parse.jl | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/base/dates/parse.jl b/base/dates/parse.jl index 5834a309c109f6..50d98a15aa4e1d 100644 --- a/base/dates/parse.jl +++ b/base/dates/parse.jl @@ -16,7 +16,21 @@ end genvar(t::DataType) = Symbol(lowercase(string(Base.datatype_name(t)))) - +""" + tryparsenext_core(str::AbstractString, pos::Int, len::Int, df::DateFormat) + +Parses the string according to the directives within the DateFormat. Parsing will start at +character index `pos` and will stop when all directives are used or we have parsed up to the +end of the string (`len`). + +Returns a 3-element tuple `(values, pos, num_parsed)`: +* `values::Nullable{Tuple}`: A tuple which contains a values for each `DatePart` within the + `DateFormat` in the order in which the occur. If the string ends before we finish parsing + all the directives the missing values will be filled in with default values. +* `pos::Int`: The character index at which parsing stopped. +* `num_parsed::Int`: The number of values which were parsed and stored within `values`. + Useful for distinguishing parsed values from default values. +""" @generated function tryparsenext_core( str::AbstractString, pos::Int, len::Int, df::DateFormat, raise::Bool=false, ) @@ -95,7 +109,18 @@ genvar(t::DataType) = Symbol(lowercase(string(Base.datatype_name(t)))) end end +""" + tryparsenext_internal(::Type{<:TimeType}, str::AbstractString, pos::Int, len::Int, df::DateFormat) + +Parses the string according to the directives within the DateFormat. The specified TimeType +type determines the type of and order of tokens returned. If the given DateFormat or string +does not provide a required token a default value will be used. 
+Returns a 2-element tuple `(values, pos)`: +* `values::Nullable{Tuple}`: A tuple which contains a values for each token as specified by + the passed in type. +* `pos::Int`: The character index at which parsing stopped. +""" @generated function tryparsenext_internal{T<:TimeType}( ::Type{T}, str::AbstractString, pos::Int, len::Int, df::DateFormat, raise::Bool=false, ) @@ -104,9 +129,9 @@ end tokens = Type[CONVERSION_SPECIFIERS[letter] for letter in letters] value_names = Symbol[genvar(t) for t in tokens] - output_types = CONVERSION_TRANSLATIONS[T] - output_names = Symbol[genvar(t) for t in output_types] - output_defaults = Tuple(CONVERSION_DEFAULTS[t] for t in output_types) + output_tokens = CONVERSION_TRANSLATIONS[T] + output_names = Symbol[genvar(t) for t in output_tokens] + output_defaults = Tuple(CONVERSION_DEFAULTS[t] for t in output_tokens) R = typeof(output_defaults) # Pre-assign output variables to defaults. Ensures that all output variables are @@ -258,6 +283,14 @@ function Base.tryparse{T<:TimeType}( end end +""" + parse_components(str::AbstractString, df::DateFormat) -> Array{Any} + +Parse the string into its components according to the directives in the DateFormat. +Each component will be a distinct type, typically a subtype of Period. The order of the +components will match the order of the `DatePart` directives within the DateFormat. The +number of components may be fewer than the total number of `DatePart` directives.
+""" @generated function parse_components(str::AbstractString, df::DateFormat) letters = character_codes(df) tokens = Type[CONVERSION_SPECIFIERS[letter] for letter in letters] From 2b787f22d85fcbf77188e8df3ec9bb23109a293b Mon Sep 17 00:00:00 2001 From: Curtis Vogt Date: Mon, 13 Mar 2017 09:51:39 -0500 Subject: [PATCH 09/10] Corrections to documentation --- base/dates/parse.jl | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/base/dates/parse.jl b/base/dates/parse.jl index 50d98a15aa4e1d..bf1bce69ceecce 100644 --- a/base/dates/parse.jl +++ b/base/dates/parse.jl @@ -17,15 +17,16 @@ end genvar(t::DataType) = Symbol(lowercase(string(Base.datatype_name(t)))) """ - tryparsenext_core(str::AbstractString, pos::Int, len::Int, df::DateFormat) + tryparsenext_core(str::AbstractString, pos::Int, len::Int, df::DateFormat, raise=false) Parses the string according to the directives within the DateFormat. Parsing will start at -character index `pos` and will stop when all directives are used or we have parsed up to the -end of the string (`len`). +character index `pos` and will stop when all directives are used or we have parsed up to, +the end of the string, `len`. If the provided string cannot be parsed an exception will be +thrown only if `raise` is true. Returns a 3-element tuple `(values, pos, num_parsed)`: -* `values::Nullable{Tuple}`: A tuple which contains a values for each `DatePart` within the - `DateFormat` in the order in which the occur. If the string ends before we finish parsing +* `values::Nullable{Tuple}`: A tuple which contains a value for each `DatePart` within the + `DateFormat` in the order in which they occur. If the string ends before we finish parsing all the directives the missing values will be filled in with default values. * `pos::Int`: The character index at which parsing stopped. * `num_parsed::Int`: The number of values which were parsed and stored within `values`. 
@@ -110,14 +111,14 @@ Returns a 3-element tuple `(values, pos, num_parsed)`: end """ - tryparsenext_internal(::Type{<:TimeType}, str::AbstractString, pos::Int, len::Int, df::DateFormat) + tryparsenext_internal(::Type{<:TimeType}, str, pos, len, df::DateFormat, raise=false) Parses the string according to the directives within the DateFormat. The specified TimeType type determines the type of and order of tokens returned. If the given DateFormat or string does not provide a required token a default value will be used. Returns a 2-element tuple `(values, pos)`: -* `values::Nullable{Tuple}`: A tuple which contains a values for each token as specified by +* `values::Nullable{Tuple}`: A tuple which contains a value for each token as specified by the passed in type. * `pos::Int`: The character index at which parsing stopped. """ From b0f6c75592db28ea4609378607daa4a9292d954f Mon Sep 17 00:00:00 2001 From: Curtis Vogt Date: Mon, 13 Mar 2017 10:32:08 -0500 Subject: [PATCH 10/10] Corrections from review --- base/dates/parse.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/base/dates/parse.jl b/base/dates/parse.jl index bf1bce69ceecce..ac3b0533e08654 100644 --- a/base/dates/parse.jl +++ b/base/dates/parse.jl @@ -20,7 +20,7 @@ genvar(t::DataType) = Symbol(lowercase(string(Base.datatype_name(t)))) tryparsenext_core(str::AbstractString, pos::Int, len::Int, df::DateFormat, raise=false) Parses the string according to the directives within the DateFormat. Parsing will start at -character index `pos` and will stop when all directives are used or we have parsed up to, +character index `pos` and will stop when all directives are used or we have parsed up to the end of the string, `len`. If the provided string cannot be parsed an exception will be thrown only if `raise` is true. @@ -115,7 +115,8 @@ end Parses the string according to the directives within the DateFormat. The specified TimeType type determines the type of and order of tokens returned. 
If the given DateFormat or string -does not provide a required token a default value will be used. +does not provide a required token a default value will be used. If the provided string +cannot be parsed an exception will be thrown only if `raise` is true. Returns a 2-element tuple `(values, pos)`: * `values::Nullable{Tuple}`: A tuple which contains a value for each token as specified by