Skip to content

Commit

Permalink
fix(utilities): Fix UTF-16 encode/decode utility functions
Browse files Browse the repository at this point in the history
Make sure they always use a BOM for UTF-16 (required per Adobe PDF
spec), and add a test for them.

Fix related issue found by @Omikhleia, #1280, by taking most of their
recommendations regarding a possible patch, but actually editing the C
code rather than using `pdf.parse`.

(Tested with luacheck 0.23.0 on LuaJIT 2.0.5 (≈5.1))
  • Loading branch information
ctrlcctrlv authored and alerque committed Dec 16, 2021
1 parent 3ad14d5 commit 7180081
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 37 deletions.
76 changes: 40 additions & 36 deletions core/utilities.lua
Original file line number Diff line number Diff line change
Expand Up @@ -431,68 +431,72 @@ utilities.firstChar = function (str)
return chars[1]
end

local byte, floor, reverse = string.byte, math.floor, string.reverse

--- Return the first UTF-8 character found at or after a byte offset.
-- The pattern is not anchored, so the search moves forward from `index`
-- until it meets a byte that can start a character.
-- @tparam string str text to search
-- @tparam number index 1-based byte offset at which the search starts
-- @treturn string|nil a lead byte followed by any continuation bytes
--   (\128-\191), or nil when nothing matches
utilities.utf8charat = function (str, index)
  local tail = str:sub(index)
  local char = tail:match("([%z\1-\127\194-\244][\128-\191]*)")
  return char
end

utilities.utf8_to_utf16be_hexencoded = function (str)
local ustr = string.format("%04x", 0xfeff) -- BOM
for _, uchr in luautf8.codes(str) do
if (uchr < 0x10000) then
ustr = ustr..string.format("%04x", uchr)
else -- Surrogate pair
local sur_hi = (uchr - 0x10000) / 0x400 + 0xd800
local sur_lo = (uchr - 0x10000) % 0x400 + 0xdc00
ustr = ustr..string.format("%04x%04x", sur_hi, sur_lo)
end
end
return ustr
--- Return the two-byte UTF-16 byte order mark for an endianness.
-- The bytes are spelled with decimal escapes: "\x.." hex escapes are only
-- understood by LuaJIT and Lua 5.2+, while PUC-Rio Lua 5.1 reads "\x" as a
-- plain "x" and would yield a six-character string instead of the BOM.
-- @tparam string endianness either "be" or "le"
-- @treturn string bytes 0xFE 0xFF for "be", bytes 0xFF 0xFE for "le"
-- Any other value is reported through SU.error.
local utf16bom = function(endianness)
  if endianness == "be" then
    return "\254\255"
  elseif endianness == "le" then
    return "\255\254"
  end
  return SU.error("Unrecognized endianness")
end

utilities.utf8_to_utf16be = function (str)
--- Encode a string as lowercase hexadecimal, two digits per input byte.
-- @tparam string str bytes to encode (may contain NUL bytes)
-- @treturn string hex text of length 2 * #str; "" for an empty input
utilities.hexencoded = function (str)
  local hex = {}
  for i = 1, #str do
    -- Read byte i with string.byte(str, i). Indexing the string as str[i]
    -- yields nil in stock Lua (numeric keys are looked up in the string
    -- library table), so it only works when the string metatable has been
    -- patched to support positional indexing.
    hex[i] = string.format("%02x", byte(str, i))
  end
  return table.concat(hex)
end

utilities.utf8_to_utf16le = function (str)
--- Decode hexadecimal text back into the bytes it represents.
-- Every two characters of the input are parsed as one base-16 number and
-- turned into a single byte.
-- @tparam string str hex text; its length must be even
-- @treturn string decoded bytes
-- An odd-length input is reported through SU.error.
utilities.hexdecoded = function (str)
  if #str % 2 == 1 then SU.error("Cannot decode hex string with odd len") end
  local bytes = {}
  for pos = 1, #str, 2 do
    local pair = string.sub(str, pos, pos + 1)
    bytes[#bytes + 1] = string.format("%c", tonumber(pair, 16))
  end
  return table.concat(bytes)
end

utilities.utf16le_to_utf8 = function (str)
local ustr = ""
for uchr in utilities.utf16codes(str, "le") do
ustr = ustr..luautf8.char(uchr)
--- Encode a code point of 0x10000 or above as a UTF-16 surrogate pair.
-- @tparam number uchr code point (expected to be >= 0x10000)
-- @tparam string endianness "le" for little-endian; anything else gives
--   big-endian byte order
-- @treturn string four bytes: high surrogate followed by low surrogate
local uchr_to_surrogate_pair = function(uchr, endianness)
  local offset = uchr - 0x10000
  -- Upper bits of the offset go into the high surrogate, the low ten bits
  -- into the low surrogate.
  local hi = floor(offset / 0x400) + 0xd800
  local lo = offset % 0x400 + 0xdc00
  local s_hi = string.format("%c%c", floor(hi / 256), hi % 256)
  local s_lo = string.format("%c%c", floor(lo / 256), lo % 256)
  if endianness == "le" then
    -- Swap the bytes inside each 16-bit unit; the unit order is unchanged.
    return reverse(s_hi) .. reverse(s_lo)
  end
  return s_hi .. s_lo
end

--- Encode a code point below 0x10000 as one 16-bit UTF-16 code unit.
-- @tparam number uchr code point (expected to be < 0x10000)
-- @tparam string endianness "le" for little-endian; anything else gives
--   big-endian byte order
-- @treturn string two bytes
local uchr_to_utf16_double_byte = function(uchr, endianness)
  local msb, lsb = floor(uchr / 256), uchr % 256
  local unit = string.format("%c%c", msb, lsb)
  if endianness == "le" then
    return reverse(unit)
  end
  return unit
end

--- Convert UTF-8 text to UTF-16 in the requested byte order.
-- The result always begins with the byte order mark for that endianness.
-- @tparam string str UTF-8 input
-- @tparam string endianness "be" or "le"
-- @treturn string BOM followed by the UTF-16 encoded code points
local utf8_to_utf16 = function(str, endianness)
  local ustr = utf16bom(endianness)
  for _, uchr in luautf8.codes(str) do
    local unit
    if uchr < 0x10000 then
      unit = uchr_to_utf16_double_byte(uchr, endianness)
    else
      -- Code points that do not fit in 16 bits need a surrogate pair.
      unit = uchr_to_surrogate_pair(uchr, endianness)
    end
    ustr = ustr .. unit
  end
  return ustr
end

utilities.utf16be_to_utf8 = function (str)
--- Convert UTF-8 text to big-endian UTF-16.
utilities.utf8_to_utf16be = function (str)
  return utf8_to_utf16(str, "be")
end

--- Convert UTF-8 text to little-endian UTF-16.
utilities.utf8_to_utf16le = function (str)
  return utf8_to_utf16(str, "le")
end

--- Convert UTF-8 text to big-endian UTF-16, then hex-encode the bytes.
utilities.utf8_to_utf16be_hexencoded = function (str)
  local utf16 = utilities.utf8_to_utf16be(str)
  return utilities.hexencoded(utf16)
end

--- Convert UTF-8 text to little-endian UTF-16, then hex-encode the bytes.
utilities.utf8_to_utf16le_hexencoded = function (str)
  local utf16 = utilities.utf8_to_utf16le(str)
  return utilities.hexencoded(utf16)
end

--- Convert UTF-16 bytes in the given byte order to UTF-8.
-- A byte order mark matching `endianness` at the very start of the input
-- is dropped before decoding.
-- @tparam string str UTF-16 encoded bytes
-- @tparam string endianness "be" or "le"
-- @treturn string UTF-8 text
local utf16_to_utf8 = function (str, endianness)
  local bom = utf16bom(endianness)
  if str:find(bom) == 1 then
    -- Skip the two BOM bytes.
    str = str:sub(3)
  end
  local chars = {}
  for uchr in utilities.utf16codes(str, endianness) do
    chars[#chars + 1] = luautf8.char(uchr)
  end
  return table.concat(chars)
end

--- Convert big-endian UTF-16 bytes to UTF-8.
utilities.utf16be_to_utf8 = function (str)
  return utf16_to_utf8(str, "be")
end

--- Convert little-endian UTF-16 bytes to UTF-8.
utilities.utf16le_to_utf8 = function (str)
  return utf16_to_utf8(str, "le")
end

local icu = require("justenoughicu")

local icuFormat = function (num, format)
Expand Down
2 changes: 2 additions & 0 deletions packages/pdf.lua
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ end)
SILE.registerCommand("pdf:metadata", function (options, _)
local key = SU.required(options, "key", "pdf:metadata")
local val = SU.required(options, "val", "pdf:metadata")
-- see comment in pdf:bookmark
val = SU.utf8_to_utf16be(val)
SILE.typesetter:pushHbox({
value = nil,
height = SILE.measurement(0),
Expand Down
3 changes: 2 additions & 1 deletion src/justenoughlibtexpdf.c
Original file line number Diff line number Diff line change
Expand Up @@ -320,12 +320,13 @@ int pdf_end_annotation(lua_State *L) {
/*
 * Add a key/value entry to the dictionary p->info.
 *
 * Lua arguments: (1) key string, (2) value string. The value is stored with
 * its full byte length, so strings containing NUL bytes (such as UTF-16
 * text) are not cut short the way a strlen()-based length would cut them.
 *
 * Returns 0: no values are pushed back onto the Lua stack.
 * NOTE(review): assumes this function is registered as a lua_CFunction,
 * whose return value is the number of results -- confirm against the
 * module's function table.
 */
int pdf_metadata(lua_State *L) {
  size_t len = 0;
  const char* key = luaL_checkstring(L, 1);
  /* luaL_checklstring reports the string length directly and is available
   * in every Lua version, unlike lua_rawlen (Lua 5.2+). */
  const char* val = luaL_checklstring(L, 2, &len);
  ASSERT(p);
  ASSERT(key);
  ASSERT(val);
  texpdf_add_dict(p->info,
                  texpdf_new_name(key),
                  texpdf_new_string(val, len));
  return 0;
}
/* Images */

Expand Down

0 comments on commit 7180081

Please sign in to comment.