-
-
Notifications
You must be signed in to change notification settings - Fork 5.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Extensible and fast date parsing #20952
Changes from 2 commits
49143c5
eac850f
c5bd3c3
ad19d23
d833ec4
f85877e
529affe
c16d629
2b787f2
b0f6c75
17d8ecc
510dd02
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -175,17 +175,17 @@ Delim(d::Char) = Delim{Char, 1}(d) | |
Delim(d::String) = Delim{String, length(d)}(d) | ||
|
||
@inline function tryparsenext{N}(d::Delim{Char, N}, str, i::Int, len) | ||
R = Nullable{Int64} | ||
R = Nullable{Bool} | ||
for j=1:N | ||
i > len && return (R(), i) | ||
c, i = next(str, i) | ||
c != d.d && return (R(), i) | ||
end | ||
return R(0), i | ||
return R(true), i | ||
end | ||
|
||
@inline function tryparsenext{N}(d::Delim{String, N}, str, i::Int, len) | ||
R = Nullable{Int64} | ||
R = Nullable{Bool} | ||
i1 = i | ||
i2 = start(d.d) | ||
for j = 1:N | ||
|
@@ -198,15 +198,15 @@ end | |
return R(), i1 | ||
end | ||
end | ||
return R(0), i1 | ||
return R(true), i1 | ||
end | ||
|
||
@inline function format(io, d::Delim, dt, locale) | ||
write(io, d.d) | ||
end | ||
|
||
function _show_content{N}(io::IO, d::Delim{Char, N}) | ||
if d.d in keys(SLOT_RULE) | ||
if d.d in keys(FORMAT_SPECIFIERS) | ||
for i = 1:N | ||
write(io, '\\', d.d) | ||
end | ||
|
@@ -219,7 +219,7 @@ end | |
|
||
function _show_content(io::IO, d::Delim) | ||
for c in d.d | ||
if c in keys(SLOT_RULE) | ||
if c in keys(FORMAT_SPECIFIERS) | ||
write(io, '\\') | ||
end | ||
write(io, c) | ||
|
@@ -236,8 +236,9 @@ end | |
|
||
abstract type DayOfWeekToken end # special addition to Period types | ||
|
||
# mapping format specifiers to period types | ||
const SLOT_RULE = Dict{Char, Type}( | ||
# Map format specifiers to, typically period, types. | ||
# Note that Julia packages like TimeZones.jl can add additional specifiers. | ||
const FORMAT_SPECIFIERS = Dict{Char, Type}( | ||
'y' => Year, | ||
'Y' => Year, | ||
'm' => Month, | ||
|
@@ -252,13 +253,21 @@ const SLOT_RULE = Dict{Char, Type}( | |
's' => Millisecond, | ||
) | ||
|
||
slot_order(::Type{Date}) = (Year, Month, Day) | ||
slot_order(::Type{DateTime}) = (Year, Month, Day, Hour, Minute, Second, Millisecond) | ||
|
||
slot_defaults(::Type{Date}) = map(Int64, (1, 1, 1)) | ||
slot_defaults(::Type{DateTime}) = map(Int64, (1, 1, 1, 0, 0, 0, 0)) | ||
const FORMAT_DEFAULTS = Dict{Type, Any}( | ||
Year => Int64(1), | ||
Month => Int64(1), | ||
DayOfWeekToken => Int64(0), | ||
Day => Int64(1), | ||
Hour => Int64(0), | ||
Minute => Int64(0), | ||
Second => Int64(0), | ||
Millisecond => Int64(0), | ||
) | ||
|
||
slot_types{T<:TimeType}(::Type{T}) = typeof(slot_defaults(T)) | ||
const FORMAT_TRANSLATIONS = Dict{Type{<:TimeType}, Tuple}( | ||
Date => (Year, Month, Day), | ||
DateTime => (Year, Month, Day, Hour, Minute, Second, Millisecond), | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we make all these ImmutableDict? They're small enough that may be a performance and memory improvement (however inconsequential overall), and would help enforce that the data feeding into a generated function can't be modified (esp. if that code wants to eventually be precompiled) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @vtjnash switching these to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Originally these were defined through methods (which I prefer): slot_order(::Type{DateTime}) = (Year, Month, Day, Hour, Minute, Second, Millisecond) Unfortunately this cannot be used here since generated functions cannot call methods defined after the generated function definition. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That is correct. A generated function cannot be relied upon to observe any external mutation. |
||
|
||
""" | ||
DateFormat(format::AbstractString, locale="english") -> DateFormat | ||
|
@@ -300,13 +309,13 @@ function DateFormat(f::AbstractString, locale::DateLocale=ENGLISH) | |
prev = () | ||
prev_offset = 1 | ||
|
||
letters = String(collect(keys(Base.Dates.SLOT_RULE))) | ||
letters = String(collect(keys(FORMAT_SPECIFIERS))) | ||
for m in eachmatch(Regex("(?<!\\\\)([\\Q$letters\\E])\\1*"), f) | ||
tran = replace(f[prev_offset:m.offset - 1], r"\\(.)", s"\1") | ||
|
||
if !isempty(prev) | ||
letter, width = prev | ||
typ = SLOT_RULE[letter] | ||
typ = FORMAT_SPECIFIERS[letter] | ||
|
||
push!(tokens, DatePart{letter}(width, isempty(tran))) | ||
end | ||
|
@@ -326,7 +335,7 @@ function DateFormat(f::AbstractString, locale::DateLocale=ENGLISH) | |
|
||
if !isempty(prev) | ||
letter, width = prev | ||
typ = SLOT_RULE[letter] | ||
typ = FORMAT_SPECIFIERS[letter] | ||
|
||
push!(tokens, DatePart{letter}(width, false)) | ||
end | ||
|
@@ -368,6 +377,9 @@ const ISODateTimeFormat = DateFormat("yyyy-mm-dd\\THH:MM:SS.s") | |
const ISODateFormat = DateFormat("yyyy-mm-dd") | ||
const RFC1123Format = DateFormat("e, dd u yyyy HH:MM:SS") | ||
|
||
default_format(::Type{DateTime}) = ISODateTimeFormat | ||
default_format(::Type{Date}) = ISODateFormat | ||
|
||
### API | ||
""" | ||
DateTime(dt::AbstractString, format::AbstractString; locale="english") -> DateTime | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,97 +1,132 @@ | ||
### Parsing utilities | ||
|
||
@generated function tryparse_internal{T<:TimeType, S, F}(::Type{T}, str::AbstractString, df::DateFormat{S, F}, raise::Bool=false) | ||
token_types = Type[dp <: DatePart ? SLOT_RULE[first(dp.parameters)] : Void for dp in F.parameters] | ||
N = length(F.parameters) | ||
|
||
types = slot_order(T) | ||
num_types = length(types) | ||
order = Vector{Int}(num_types) | ||
for i = 1:num_types | ||
order[i] = findfirst(token_types, types[i]) | ||
function directives{S,F}(::Type{DateFormat{S,F}}) | ||
tokens = F.parameters | ||
di = 1 | ||
directive_index = zeros(Int, length(tokens)) | ||
directive_letters = sizehint!(Char[], length(tokens)) | ||
for (i, token) in enumerate(tokens) | ||
if token <: DatePart | ||
directive_index[i] = di | ||
|
||
letter = first(token.parameters) | ||
push!(directive_letters, letter) | ||
|
||
di += 1 | ||
end | ||
end | ||
return tokens, directive_index, directive_letters | ||
end | ||
|
||
genvar(t::DataType) = Symbol(lowercase(string(t.name.name))) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hopefully this is ok or the right way to do this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm sure this is going to generate lots of discussion. Basically I need to map the specifier types to a variable name. Using separate variable names allows the generated code to avoid type instability problems (occurs once extended). Additionally, I can't get away with using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is there a reflection function for this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can use |
||
|
||
field_defaults = slot_defaults(T) | ||
field_order = tuple(order...) | ||
tuple_type = slot_types(T) | ||
|
||
# `slot_order`, `slot_defaults`, and `slot_types` return tuples of the same length | ||
assert(num_types == length(field_order) == length(field_defaults)) | ||
@generated function tryparse_core(str::AbstractString, df::DateFormat, raise::Bool=false) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It will be great if this function can take a start index and return an end position... I've had to hack around this to get this. It's very useful when you're parsing dates and some more stuff after / a list of dates There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this can be supported. |
||
token_types, directive_index, directive_letters = directives(df) | ||
|
||
directive_types = Type[FORMAT_SPECIFIERS[letter] for letter in directive_letters] | ||
directive_names = Symbol[genvar(t) for t in directive_types] | ||
directive_defaults = Tuple(FORMAT_DEFAULTS[t] for t in directive_types) | ||
R = typeof(directive_defaults) | ||
|
||
# Pre-assign output variables to default values. Allows us to use `@goto done` without | ||
# worrying about unassigned variables. | ||
assign_defaults = Expr[ | ||
quote | ||
$name = $default | ||
end | ||
for (name, default) in zip(directive_names, directive_defaults) | ||
] | ||
|
||
parsers = Expr[ | ||
begin | ||
di = directive_index[i] | ||
if di != 0 | ||
name = directive_names[di] | ||
nullable = Symbol(:nullable_, name) | ||
quote | ||
pos > len && @goto done | ||
$nullable, next_pos = tryparsenext(tokens[$i], str, pos, len, locale) | ||
isnull($nullable) && @goto error | ||
$name = unsafe_get($nullable) | ||
pos = next_pos | ||
directive_idx += 1 | ||
token_idx += 1 | ||
end | ||
else | ||
quote | ||
pos > len && @goto done | ||
nullable_delim, next_pos = tryparsenext(tokens[$i], str, pos, len, locale) | ||
isnull(nullable_delim) && @goto error | ||
pos = next_pos | ||
token_idx += 1 | ||
end | ||
end | ||
end | ||
for i in 1:length(token_types) | ||
] | ||
|
||
quote | ||
R = Nullable{$tuple_type} | ||
t = df.tokens | ||
l = df.locale | ||
tokens = df.tokens | ||
locale::DateLocale = df.locale | ||
pos, len = start(str), endof(str) | ||
directive_idx = 0 | ||
token_idx = 1 | ||
|
||
$(assign_defaults...) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Cool that you got rid of this! :) |
||
$(parsers...) | ||
|
||
err_idx = 1 | ||
Base.@nexprs $N i->val_i = 0 | ||
Base.@nexprs $N i->(begin | ||
pos > len && @goto done | ||
nv, next_pos = tryparsenext(t[i], str, pos, len, l) | ||
isnull(nv) && @goto error | ||
val_i, pos = unsafe_get(nv), next_pos | ||
err_idx += 1 | ||
end) | ||
pos <= len && @goto error | ||
pos > len || @goto error | ||
|
||
@label done | ||
parts = Base.@ntuple $N val | ||
return R(reorder_args(parts, $field_order, $field_defaults, err_idx)::$tuple_type) | ||
return Nullable{$R}($(Expr(:tuple, directive_names...))), directive_idx | ||
|
||
@label error | ||
# Note: Keeping exception generation in separate function helps with performance | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so is this comment no longer applicable? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tested this again and it no longer seems to be an issue. I originally found this problem. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this comment no longer accurate? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Correct. I'll make sure to remove that. |
||
raise && throw(gen_exception(t, err_idx, pos)) | ||
return R() | ||
if raise | ||
if token_idx > length(tokens) | ||
throw(ArgumentError("Found extra characters at the end of date time string")) | ||
else | ||
throw(ArgumentError("Unable to parse date time. Expected token $(tokens[token_idx]) at char $pos")) | ||
end | ||
end | ||
return Nullable{$R}(), 0 | ||
end | ||
end | ||
|
||
function gen_exception(tokens, err_idx, pos) | ||
if err_idx > length(tokens) | ||
ArgumentError("Found extra characters at the end of date time string") | ||
else | ||
ArgumentError("Unable to parse date time. Expected token $(tokens[err_idx]) at char $pos") | ||
end | ||
end | ||
|
||
# reorder_args(val, idx, default, default_from) | ||
# | ||
# reorder elements of `val` tuple according to `idx` tuple. Use `default[i]` | ||
# when `idx[i] == 0` or i >= default_from | ||
# | ||
# returns a tuple `xs` of the same length as `idx` where `xs[i]` is | ||
# `val[idx[i]]` if `idx[i]` is non zero, `default[i]` if `idx[i]` is zero. | ||
# | ||
# `xs[i]` is `default[i]` for all i >= `default_from`. | ||
# | ||
# | ||
function reorder_args{N}(val::Tuple, idx::NTuple{N}, default::Tuple, default_from::Integer) | ||
ntuple(Val{N}) do i | ||
if idx[i] == 0 || idx[i] >= default_from | ||
default[i] | ||
else | ||
val[idx[i]] | ||
end | ||
end | ||
end | ||
@generated function tryparse_internal{T<:TimeType}( | ||
::Type{T}, str::AbstractString, df::DateFormat, raise::Bool=false, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And/ or this method should also take a position and return an end position.... |
||
) | ||
token_types, directive_index, directive_letters = directives(df) | ||
|
||
function Base.tryparse{T<:TimeType}(::Type{T}, str::AbstractString, df::DateFormat) | ||
nt = tryparse_internal(T, str, df, false) | ||
if isnull(nt) | ||
return Nullable{T}() | ||
else | ||
return Nullable{T}(T(unsafe_get(nt)...)) | ||
end | ||
end | ||
directive_types = Type[FORMAT_SPECIFIERS[letter] for letter in directive_letters] | ||
directive_names = Symbol[genvar(t) for t in directive_types] | ||
|
||
default_format(::Type{Date}) = ISODateFormat | ||
default_format(::Type{DateTime}) = ISODateTimeFormat | ||
output_types = FORMAT_TRANSLATIONS[T] | ||
output_names = Symbol[genvar(t) for t in output_types] | ||
output_defaults = Tuple(FORMAT_DEFAULTS[t] for t in output_types) | ||
R = typeof(output_defaults) | ||
|
||
function Base.parse{T<:TimeType}(::Type{T}, | ||
str::AbstractString, | ||
df::DateFormat=default_format(T)) | ||
nt = tryparse_internal(T, str, df, true) | ||
T(unsafe_get(nt)...) | ||
# Pre-assign output variables to default values. Ensures that all output variables are | ||
# assigned as the format directives may not include all of the required variables. | ||
assign_defaults = Expr[ | ||
quote | ||
$name = $default | ||
end | ||
for (name, default) in zip(output_names, output_defaults) | ||
] | ||
|
||
# Unpacks the tuple into various directive variables. | ||
directive_tuple = Expr(:tuple, directive_names...) | ||
|
||
quote | ||
values, index = tryparse_core(str, df, raise) | ||
isnull(values) && return Nullable{$R}() | ||
$(assign_defaults...) | ||
$directive_tuple = unsafe_get(values) | ||
Nullable{$R}($(Expr(:tuple, output_names...))) | ||
end | ||
end | ||
|
||
@inline function tryparsenext_base10(str::AbstractString, i::Int, len::Int, min_width::Int=1, max_width::Int=0) | ||
|
@@ -200,3 +235,38 @@ function Base.parse(::Type{DateTime}, s::AbstractString, df::typeof(ISODateTimeF | |
@label error | ||
throw(ArgumentError("Invalid DateTime string")) | ||
end | ||
|
||
function Base.parse{T<:TimeType}( | ||
::Type{T}, str::AbstractString, df::DateFormat=default_format(T), | ||
) | ||
nt = tryparse_internal(T, str, df, true) | ||
T(unsafe_get(nt)...) | ||
end | ||
|
||
function Base.tryparse{T<:TimeType}( | ||
::Type{T}, str::AbstractString, df::DateFormat=default_format(T), | ||
) | ||
nt = tryparse_internal(T, str, df, false) | ||
if isnull(nt) | ||
Nullable{T}() | ||
else | ||
Nullable{T}(T(unsafe_get(nt)...)) | ||
end | ||
end | ||
|
||
@generated function Base.parse(::Type{Vector}, str::AbstractString, df::DateFormat) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is a bit of an odd method, weren't you going to deprecate it? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added a deprecation for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree that the method signature is a bit odd. Maybe a different function name like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if it doesn't have to be an externally visible method of parse, then yeah a local name would raise fewer eyebrows (does the deprecation need to be eval'ed?) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The original There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should consider having "Parser types" (kinda like the token types in this module) and using their objects of those types as the first argument to parse rather than have it always be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But of course that's not in the scope of this changeset. |
||
token_types, directive_index, directive_letters = directives(df) | ||
directive_types = Type[FORMAT_SPECIFIERS[letter] for letter in directive_letters] | ||
|
||
quote | ||
nt, num_parsed = tryparse_core(str, df, true) | ||
t = unsafe_get(nt) | ||
directive_types = $(Expr(:tuple, directive_types...)) | ||
result = Vector{Any}(num_parsed) | ||
for (i, typ) in enumerate(directive_types) | ||
i > num_parsed && break | ||
result[i] = typ(t[i]) # Constructing types takes most of the time | ||
end | ||
return result | ||
end | ||
end |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The
Any
here allows TimeZones.jl to add and return a default timezone I presume?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Any
is used for extensibility. Specifically for TimeZones.jl I'm using an empty string as the default. If/when this is converted into a ZonedDateTime an exception is raised telling the user that the timezone given is invalid.