Skip to content

Commit

Permalink
Parse base64-encoded data URIs more efficiently
Browse files Browse the repository at this point in the history
(in some places)

Very long data: URIs in source documents are causing outsized memory
usage due to various parsing inefficiencies, for instance in
Network.URI, TagSoup, and T.P.R.Markdown.source. See e.g. jgm#10075.

This change improves the situation in a couple places we can control
relatively easily by using an attoparsec text-specialized parser to
consume base64-encoded strings. Attoparsec's takeWhile + inClass
functions are designed to chew through long strings like this without
doing unnecessary allocation, and the improvements in peak heap
allocation are significant.

One of the observations here is that if you parse something as a valid
data: uri it shouldn't need any further escaping so we can short-circuit
various processing steps that may unpack/iterate over the chars in the
URI.
  • Loading branch information
silby committed Dec 4, 2024
1 parent c34edf6 commit 3f0bb9d
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 13 deletions.
1 change: 1 addition & 0 deletions pandoc.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,7 @@ library
Text.Pandoc.Class.Sandbox,
Text.Pandoc.Filter.Environment,
Text.Pandoc.Filter.JSON,
Text.Pandoc.Parsing.Base64,
Text.Pandoc.Parsing.Capabilities,
Text.Pandoc.Parsing.Citations,
Text.Pandoc.Parsing.General,
Expand Down
4 changes: 3 additions & 1 deletion src/Text/Pandoc/Parsing.hs
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,8 @@ module Text.Pandoc.Parsing ( module Text.Pandoc.Sources,
Column,
ParseError,
errorMessages,
messageString
messageString,
parseBase64String
)
where

Expand Down Expand Up @@ -341,3 +342,4 @@ import Text.Pandoc.Parsing.State
SubstTable )
import Text.Pandoc.Parsing.Future
( askF, asksF, returnF, runF, Future(..) )
import Text.Pandoc.Parsing.Base64 (parseBase64String)
39 changes: 39 additions & 0 deletions src/Text/Pandoc/Parsing/Base64.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{- |
Module : Text.Pandoc.Parsing.Base64
Copyright : © 2024 Evan Silberman
License : GPL-2.0-or-later
Maintainer : John MacFarlane <jgm@berkeley.edu>
Parse large base64 strings efficiently within Pandoc's
normal parsing environment
-}

module Text.Pandoc.Parsing.Base64
( parseBase64String )

where

import Data.Text as T
import Data.Attoparsec.Text as A
import Text.Parsec (ParsecT, getInput, setInput, incSourceColumn)
import Text.Pandoc.Sources
import Control.Monad (mzero)

parseBase64String :: Monad m => ParsecT Sources u m Text
parseBase64String = do
Sources ((pos, txt):rest) <- getInput
let r = A.parse pBase64 txt
case r of
Done remaining consumed -> do
let pos' = incSourceColumn pos (T.length consumed)
setInput $ Sources ((pos', remaining):rest)
return consumed
_ -> mzero

pBase64 :: A.Parser Text
pBase64 = do
most <- A.takeWhile1 (A.inClass "A-Za-z0-9+/")
rest <- A.takeWhile (== '=')
return $ most <> rest
14 changes: 8 additions & 6 deletions src/Text/Pandoc/Readers/HTML.hs
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ import Text.Pandoc.Parsing hiding ((<|>))
import Text.Pandoc.Shared (
addMetaField, extractSpaces, htmlSpanLikeElements, renderTags',
safeRead, tshow, formatCode)
import Text.Pandoc.URI (escapeURI)
import Text.Pandoc.URI (escapeURI, isBase64DataURI)
import Text.Pandoc.Walk
import Text.TeXMath (readMathML, writeTeX)
import qualified Data.Sequence as Seq
Expand Down Expand Up @@ -1219,8 +1219,10 @@ htmlTag f = try $ do

-- | Adjusts a url according to the document's base URL.
canonicalizeUrl :: PandocMonad m => Text -> TagParser m Text
canonicalizeUrl url = do
mbBaseHref <- baseHref <$> getState
return $ case (parseURIReference (T.unpack url), mbBaseHref) of
(Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs)
_ -> url
canonicalizeUrl url
| isBase64DataURI url = return url
| otherwise = do
mbBaseHref <- baseHref <$> getState
return $ case (parseURIReference (T.unpack url), mbBaseHref) of
(Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs)
_ -> url
26 changes: 24 additions & 2 deletions src/Text/Pandoc/Readers/Markdown.hs
Original file line number Diff line number Diff line change
Expand Up @@ -1822,13 +1822,35 @@ parenthesizedChars = do
result <- charsInBalanced '(' ')' litChar
return $ "(" <> result <> ")"

pBase64DataURI :: PandocMonad m => MarkdownParser m Text
pBase64DataURI = mconcat <$> sequence
[ (textStr "data:")
, (T.singleton <$> (letter <|> digit))
, restrictedName
, (T.singleton <$> char '/')
, restrictedName
, textStr ";" <* trace "cool"
, (mconcat <$> many mediaParam)
, textStr "base64," <* trace "fine"
, parseBase64String
]
where
restrictedName = manyChar (alphaNum <|> oneOf "!#$&^_.+-")
mediaParam = mconcat <$> sequence
[ notFollowedBy (textStr "base64,") *> mempty -- XXX ???
, restrictedName
, textStr "="
, manyChar (noneOf ";")
, textStr ";"
]

-- source for a link, with optional title
source :: PandocMonad m => MarkdownParser m (Text, Text)
source = do
char '('
skipSpaces
let urlChunk =
try parenthesizedChars
let urlChunk = try pBase64DataURI
<|> try parenthesizedChars
<|> (notFollowedBy (oneOf " )") >> litChar)
<|> try (many1Char spaceChar <* notFollowedBy (oneOf "\"')"))
let sourceURL = T.unwords . T.words . T.concat <$> many urlChunk
Expand Down
40 changes: 36 additions & 4 deletions src/Text/Pandoc/URI.hs
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,26 @@ module Text.Pandoc.URI ( urlEncode
, isURI
, schemes
, uriPathToPath
, isBase64DataURI
) where
import qualified Network.HTTP.Types as HTTP
import qualified Text.Pandoc.UTF8 as UTF8
import qualified Data.Text as T
import qualified Data.Attoparsec.Text as A
import qualified Data.Set as Set
import Data.Char (isSpace, isAscii)
import Data.Either (isRight)
import Control.Applicative ((<|>))
import Network.URI (URI (uriScheme), parseURI, escapeURIString)

urlEncode :: T.Text -> T.Text
urlEncode = UTF8.toText . HTTP.urlEncode True . UTF8.fromText

-- | Escape whitespace and some punctuation characters in URI.
escapeURI :: T.Text -> T.Text
escapeURI = T.pack . escapeURIString (not . needsEscaping) . T.unpack
escapeURI t
| isBase64DataURI t = t
| otherwise = (T.pack . escapeURIString (not . needsEscaping) . T.unpack) t
where needsEscaping c = isSpace c || T.any (== c) "<>|\"{}[]^`"

--
Expand Down Expand Up @@ -87,12 +93,38 @@ schemes = Set.fromList
, "doi", "gemini", "isbn", "javascript", "pmid"
]

isBase64DataURI :: T.Text -> Bool
isBase64DataURI =
isRight . A.parseOnly pBase64DataURI

pBase64DataURI :: A.Parser T.Text
pBase64DataURI = mconcat <$> sequence
[ (A.string "data:")
, (T.singleton <$> (A.letter <|> A.digit))
, restrictedName
, (T.singleton <$> A.char '/')
, restrictedName
, (mconcat <$> A.many' mediaParam)
, A.string ";base64,"
, A.takeWhile1 (A.inClass "A-Za-z0-9+/")
, A.takeWhile (== '=')
]
where
restrictedName = A.takeWhile1 (A.inClass "0-9a-zA-Z!#$&^_.+-")
mediaParam = mconcat <$> sequence
[ (T.singleton <$> A.char ';')
, restrictedName
, (T.singleton <$> A.char '=')
, A.takeWhile (/= ';')
]

-- | Check if the string is a valid URL with a IANA or frequently used but
-- unofficial scheme (see @schemes@).
isURI :: T.Text -> Bool
isURI =
-- we URI-escape non-ASCII characters because otherwise parseURI will choke:
maybe False hasKnownScheme . parseURI . escapeURIString isAscii . T.unpack
isURI t =
isBase64DataURI t ||
-- we URI-escape non-ASCII characters because otherwise parseURI will choke:
(maybe False hasKnownScheme . parseURI . escapeURIString isAscii . T.unpack) t
where
hasKnownScheme = (`Set.member` schemes) . T.toLower .
T.filter (/= ':') . T.pack . uriScheme
Expand Down

0 comments on commit 3f0bb9d

Please sign in to comment.