Skip to content

Commit

Permalink
fix(packages): Correct handling of ampersands and tildes in bibtex
Browse files Browse the repository at this point in the history
Accept `\&` for compatibility with legacy BibTeX, but do not
mandate it to be escaped for compatibility with other engines.
Support unescaped `~` as a non-breaking space for compatibility with
TeX; this is often found in existing bibliography files.
Support `\~` to render a tilde.
XML-escape the input so it can safely be wrapped in a `<sile>`
construct.

Closes sile-typesetter#2050

Closes sile-typesetter#1860 (replaced by this implementation)
  • Loading branch information
Omikhleia authored and Didier Willis committed Jun 15, 2024
1 parent e3ac5ce commit cf4f446
Showing 1 changed file with 30 additions and 5 deletions.
35 changes: 30 additions & 5 deletions packages/bibtex/init.lua
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,27 @@ local epnf = require("epnf")

local Bibliography

local nbsp = luautf8.char(0x00A0)

-- Replacements for the two tilde forms, keyed by the exact matched text.
local tildeReplacements = {
   ["\\~"] = "~", -- backslash-escaped tilde is a literal tilde
   ["~"] = nbsp, -- standalone tilde is a non-breaking space
}

--- Sanitize a raw BibTeX field value so it can be wrapped in `<sile>` tags.
-- Applies, in order:
--   1. TeX tilde handling (`\~` becomes `~`, bare `~` becomes U+00A0),
--   2. removal of all remaining backslashes (so `\&` becomes `&`, etc.),
--   3. XML-escaping of `&`, `<` and `>`.
-- @tparam string str Raw field value.
-- @treturn string Sanitized value.
local function sanitize (str)
   local s = str
      -- TeX special characters:
      -- Match an optional backslash followed by a tilde, so that every tilde
      -- is classified only by whether a backslash immediately precedes it.
      -- (Capturing "any previous character" instead would let a tilde be
      -- consumed as the "previous character" of the next one, turning
      -- consecutive tildes such as "~~" into a literal tilde.)
      :gsub("\\?~", tildeReplacements)
      -- Other backslash-escaped characters are skipped
      -- TODO FIXME:
      -- This is OK for \", \& etc. which we want to unescape,
      -- BUT what should we do with other TeX-like commands?
      :gsub("\\", "")
      -- We will wrap the content in <sile> tags so we need to XML-escape
      -- the input. The ampersand must be escaped first, so that the
      -- entities produced for < and > are not escaped a second time.
      :gsub("&", "&amp;")
      :gsub("<", "&lt;")
      :gsub(">", "&gt;")
   -- Only the first gsub result (the string) is kept; the count is dropped.
   return s
end

-- luacheck: push ignore
-- stylua: ignore start
---@diagnostic disable: undefined-global, unused-local, lowercase-global
Expand All @@ -18,11 +39,14 @@ local bibtexparser = epnf.define(function (_ENV)
local quoted = C( P'"' * C(((1 - S'"\r\n\f\\') + (P'\\' * 1)) ^ 0) * '"' ) / function (...) local t={...}; return t[2] end
local _ = WS^0
local sep = S",;" * _
local myID = C(identifier + P(1)) / function (t) return strings[t] or t end
local myTag = C(identifier + P(1)) / function (t) return t:lower() end
local pieces = balanced + quoted + myID
local value = Ct(pieces * (WS * P("#") * WS * pieces)^0) / function (t) return table.concat(t) end
local pair = Cg(myTag * _ * "=" * _ * C(value)) * _ * sep^-1 / function (...) local t= {...}; return t[1], t[#t] end
local myID = C(identifier)
local myStrID = myID / function (t) return strings[t] or t end
local myTag = C(identifier) / function (t) return t:lower() end
local pieces = balanced + quoted + myStrID
local value = Ct(pieces * (WS * P("#") * WS * pieces)^0)
/ function (t) return table.concat(t) end / sanitize
local pair = myTag * _ * "=" * _ * value * _ * sep^-1
/ function (...) local t= {...}; return t[1], t[#t] end
local list = Cf(Ct("") * pair^0, rawset)
local skippedType = Cmt(R("az", "AZ")^1, function(_, _, tag)
-- ignore both @comment and @preamble
Expand Down Expand Up @@ -301,6 +325,7 @@ If no such abbreviation is found, the value is considered to be a string literal
String values are assumed to be in the UTF-8 encoding, and shall not contain (La)TeX commands.
Special character sequences from TeX (such as \code{`} assumed to be an opening quote) are not supported.
There are exceptions to this rule. Notably, the \code{~} character can be used to represent a non-breaking space (when not backslash-escaped), and the \code{\\&} sequence is accepted (though this implementation does not mandate escaping ampersands).
Values can also be composed by concatenating strings, using the \code{#} character.
Expand Down

0 comments on commit cf4f446

Please sign in to comment.