Skip to content

Commit

Permalink
Merge pull request #165 from mochi/codepoint-to-bytes-surrogate
Browse files Browse the repository at this point in the history
Support parsing UTF-16 surrogate pairs in mochiweb_html #164
  • Loading branch information
etrepum committed Feb 9, 2016
2 parents bd6ae7c + 5a70cda commit d024b4a
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 13 deletions.
5 changes: 4 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
Version 2.13.0 released XXXX-XX-XX
Version 2.13.0 released 2016-02-08

* Support parsing of UTF-16 surrogate pairs encoded as character
references in mochiweb_html
https://github.com/mochi/mochiweb/issues/164
* Avoid swallowing messages that are not related to the socket
during request parsing
https://github.com/mochi/mochiweb/pull/161
Expand Down
45 changes: 33 additions & 12 deletions src/mochiweb_html.erl
Original file line number Diff line number Diff line change
Expand Up @@ -639,13 +639,42 @@ find_gt(Bin, S=#decoder{offset=O}, HasSlash) ->

%% @doc Tokenize one character reference and return `{{data, Utf8, false}, S1}'.
%%
%% The reference text is scanned by tokenize_charref_raw/3 starting at the
%% decoder's current offset. Its decoded value selects the result:
%%
%%   * an integer in 16#D800..16#DFFF is one half of a UTF-16 surrogate pair,
%%     so the following reference is consumed as the other half;
%%   * any other integer is a single code point, encoded to UTF-8;
%%   * a list is a sequence of code points, encoded to UTF-8;
%%   * `undefined' (reference not recognized) is treated as invalid.
%%
%% Every invalid case throws `invalid_charref', which is caught here: the
%% token becomes a literal `&' and the ORIGINAL decoder state `S' is
%% returned, so nothing scanned during the failed attempt is consumed.
tokenize_charref(Bin, S=#decoder{offset=O}) ->
    try
        case tokenize_charref_raw(Bin, S, O) of
            {C1, S1} when is_integer(C1), C1 >= 16#D800, C1 =< 16#DFFF ->
                %% Surrogate pair
                tokeninize_charref_surrogate_pair(Bin, S1, C1);
            {Unichar, S1} when is_integer(Unichar) ->
                {{data, mochiutf8:codepoint_to_bytes(Unichar), false},
                 S1};
            {Unichars, S1} when is_list(Unichars) ->
                {{data, unicode:characters_to_binary(Unichars), false},
                 S1};
            {undefined, _} ->
                %% Without this clause an unknown reference raised an
                %% uncaught case_clause error instead of degrading to "&".
                throw(invalid_charref)
        end
    catch
        throw:invalid_charref ->
            {{data, <<"&">>, false}, S}
    end.

tokenize_charref(Bin, S=#decoder{offset=O}, Start) ->
%% @doc Consume the second half of a UTF-16 surrogate pair written as two
%% consecutive character references and emit the combined character as UTF-8.
%%
%% `C1' is the already-decoded first reference. `S' is the decoder state
%% after that reference; the byte at its offset must be the `&' that opens
%% the second reference, which is then scanned with tokenize_charref_raw/3.
%%
%% Returns `{{data, Utf8, false}, S1}' for a well-formed pair. Throws
%% `invalid_charref' when no reference follows, when the second reference
%% does not decode to an integer, or when the two values are not a high
%% surrogate (16#D800..16#DBFF) followed by a low surrogate
%% (16#DC00..16#DFFF).
%%
%% NOTE: the function name is misspelled ("tokeninize"); it is kept as is
%% because the caller refers to it by this name.
tokeninize_charref_surrogate_pair(Bin, S=#decoder{offset=O}, C1) ->
    case Bin of
        <<_:O/binary, $&, _/binary>> ->
            case tokenize_charref_raw(Bin, ?INC_COL(S), O + 1) of
                %% is_integer/1 matters: in Erlang term order atoms and
                %% lists compare greater than any number, so a bare `>='
                %% test would also accept `undefined' or a code point list.
                {C2, S1} when is_integer(C2),
                              C1 >= 16#D800, C1 =< 16#DBFF,
                              C2 >= 16#DC00, C2 =< 16#DFFF ->
                    {{data,
                      unicode:characters_to_binary(
                        <<C1:16, C2:16>>,
                        utf16,
                        utf8),
                      false},
                     S1};
                _ ->
                    throw(invalid_charref)
            end;
        _ ->
            throw(invalid_charref)
    end.

tokenize_charref_raw(Bin, S=#decoder{offset=O}, Start) ->
case Bin of
<<_:O/binary>> ->
throw(invalid_charref);
Expand All @@ -658,17 +687,9 @@ tokenize_charref(Bin, S=#decoder{offset=O}, Start) ->
<<_:O/binary, $;, _/binary>> ->
Len = O - Start,
<<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
Data = case mochiweb_charref:charref(Raw) of
undefined ->
throw(invalid_charref);
Unichar when is_integer(Unichar) ->
mochiutf8:codepoint_to_bytes(Unichar);
Unichars when is_list(Unichars) ->
unicode:characters_to_binary(Unichars)
end,
{{data, Data, false}, ?INC_COL(S)};
{mochiweb_charref:charref(Raw), ?INC_COL(S)};
_ ->
tokenize_charref(Bin, ?INC_COL(S), Start)
tokenize_charref_raw(Bin, ?INC_COL(S), Start)
end.

tokenize_doctype(Bin, S) ->
Expand Down
6 changes: 6 additions & 0 deletions test/mochiweb_html_tests.erl
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,12 @@ tokens_test() ->
mochiweb_html:tokens(<<"not html < at all">>)),
ok.

%% Regression test: two numeric character references that together form a
%% UTF-16 surrogate pair must be tokenized as a single UTF-8 character.
surrogate_test() ->
    %% https://github.com/mochi/mochiweb/issues/164
    %% 55357 = 16#D83D (high surrogate), 56842 = 16#DE0A (low surrogate);
    %% together they denote U+1F60A, whose UTF-8 form is F0 9F 98 8A.
    Expected = [{data,<<240,159,152,138>>,false}],
    ?assertEqual(
       Expected,
       mochiweb_html:tokens(<<"&#55357;&#56842;">>)).

parse_test() ->
D0 = <<"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">
<html>
Expand Down

0 comments on commit d024b4a

Please sign in to comment.