From 3f0bb9d6a60d8225c21712ffacdc0c39bc3b11ed Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 1 Dec 2024 14:42:55 -0800 Subject: [PATCH] Parse base64-encoded data URIs more efficiently (in some places) Very long data: URIs in source documents are causing outsized memory usage due to various parsing inefficiencies, for instance in Network.URI, TagSoup, and T.P.R.Markdown.source. See e.g. #10075. This change improves the situation in a couple places we can control relatively easily by using an attoparsec text-specialized parser to consume base64-encoded strings. Attoparsec's takeWhile + inClass functions are designed to chew through long strings like this without doing unnecessary allocation, and the improvements in peak heap allocation are significant. One of the observations here is that if you parse something as a valid data: uri it shouldn't need any further escaping so we can short-circuit various processing steps that may unpack/iterate over the chars in the URI. --- pandoc.cabal | 1 + src/Text/Pandoc/Parsing.hs | 4 ++- src/Text/Pandoc/Parsing/Base64.hs | 39 ++++++++++++++++++++++++++++ src/Text/Pandoc/Readers/HTML.hs | 14 +++++----- src/Text/Pandoc/Readers/Markdown.hs | 26 +++++++++++++++++-- src/Text/Pandoc/URI.hs | 40 ++++++++++++++++++++++++++--- 6 files changed, 111 insertions(+), 13 deletions(-) create mode 100644 src/Text/Pandoc/Parsing/Base64.hs diff --git a/pandoc.cabal b/pandoc.cabal index 32ebcd43a6aa..812f620281e8 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -679,6 +679,7 @@ library Text.Pandoc.Class.Sandbox, Text.Pandoc.Filter.Environment, Text.Pandoc.Filter.JSON, + Text.Pandoc.Parsing.Base64, Text.Pandoc.Parsing.Capabilities, Text.Pandoc.Parsing.Citations, Text.Pandoc.Parsing.General, diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 67fcc363dfc4..548c28425869 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -172,7 +172,8 @@ module Text.Pandoc.Parsing ( module Text.Pandoc.Sources, Column, ParseError, errorMessages, - messageString + messageString, + parseBase64String ) where @@ -341,3 +342,4 @@ import Text.Pandoc.Parsing.State SubstTable ) import Text.Pandoc.Parsing.Future ( askF, asksF, returnF, runF, Future(..) ) +import Text.Pandoc.Parsing.Base64 (parseBase64String) diff --git a/src/Text/Pandoc/Parsing/Base64.hs b/src/Text/Pandoc/Parsing/Base64.hs new file mode 100644 index 000000000000..4330a83d14de --- /dev/null +++ b/src/Text/Pandoc/Parsing/Base64.hs @@ -0,0 +1,39 @@ +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings #-} +{- | +Module : Text.Pandoc.Parsing.Base64 +Copyright : © 2024 Evan Silberman +License : GPL-2.0-or-later +Maintainer : John MacFarlane + +Parse large base64 strings efficiently within Pandoc's +normal parsing environment +-} + +module Text.Pandoc.Parsing.Base64 + ( parseBase64String ) + +where + +import Data.Text as T +import Data.Attoparsec.Text as A +import Text.Parsec (ParsecT, getInput, setInput, incSourceColumn) +import Text.Pandoc.Sources +import Control.Monad (mzero) + +parseBase64String :: Monad m => ParsecT Sources u m Text +parseBase64String = do + Sources ((pos, txt):rest) <- getInput + let r = A.parse pBase64 txt + case r of + Done remaining consumed -> do + let pos' = incSourceColumn pos (T.length consumed) + setInput $ Sources ((pos', remaining):rest) + return consumed + _ -> mzero + +pBase64 :: A.Parser Text +pBase64 = do + most <- A.takeWhile1 (A.inClass "A-Za-z0-9+/") + rest <- A.takeWhile (== '=') + return $ most <> rest diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 9704b047e0f1..ff6bbd881dd8 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -66,7 +66,7 @@ import Text.Pandoc.Parsing hiding ((<|>)) import Text.Pandoc.Shared ( addMetaField, extractSpaces, htmlSpanLikeElements, renderTags', safeRead, tshow, formatCode) -import Text.Pandoc.URI (escapeURI) +import Text.Pandoc.URI (escapeURI, isBase64DataURI) import Text.Pandoc.Walk import Text.TeXMath (readMathML, writeTeX) import qualified Data.Sequence as Seq @@ -1219,8 +1219,10 @@ htmlTag f = try $ do -- | Adjusts a url according to the document's base URL. canonicalizeUrl :: PandocMonad m => Text -> TagParser m Text -canonicalizeUrl url = do - mbBaseHref <- baseHref <$> getState - return $ case (parseURIReference (T.unpack url), mbBaseHref) of - (Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs) - _ -> url +canonicalizeUrl url + | isBase64DataURI url = return url + | otherwise = do + mbBaseHref <- baseHref <$> getState + return $ case (parseURIReference (T.unpack url), mbBaseHref) of + (Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs) + _ -> url diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 51782f00015f..b01c937f7c92 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -1822,13 +1822,35 @@ parenthesizedChars = do result <- charsInBalanced '(' ')' litChar return $ "(" <> result <> ")" +pBase64DataURI :: PandocMonad m => MarkdownParser m Text +pBase64DataURI = mconcat <$> sequence + [ (textStr "data:") + , (T.singleton <$> (letter <|> digit)) + , restrictedName + , (T.singleton <$> char '/') + , restrictedName + , textStr ";" <* trace "cool" + , (mconcat <$> many mediaParam) + , textStr "base64," <* trace "fine" + , parseBase64String + ] + where + restrictedName = manyChar (alphaNum <|> oneOf "!#$&^_.+-") + mediaParam = mconcat <$> sequence + [ notFollowedBy (textStr "base64,") *> mempty -- XXX ??? + , restrictedName + , textStr "=" + , manyChar (noneOf ";") + , textStr ";" + ] + -- source for a link, with optional title source :: PandocMonad m => MarkdownParser m (Text, Text) source = do char '(' skipSpaces - let urlChunk = - try parenthesizedChars + let urlChunk = try pBase64DataURI + <|> try parenthesizedChars <|> (notFollowedBy (oneOf " )") >> litChar) <|> try (many1Char spaceChar <* notFollowedBy (oneOf "\"')")) let sourceURL = T.unwords . T.words . T.concat <$> many urlChunk diff --git a/src/Text/Pandoc/URI.hs b/src/Text/Pandoc/URI.hs index a0b47d259cd5..5052b4c6c514 100644 --- a/src/Text/Pandoc/URI.hs +++ b/src/Text/Pandoc/URI.hs @@ -15,12 +15,16 @@ module Text.Pandoc.URI ( urlEncode , isURI , schemes , uriPathToPath + , isBase64DataURI ) where import qualified Network.HTTP.Types as HTTP import qualified Text.Pandoc.UTF8 as UTF8 import qualified Data.Text as T +import qualified Data.Attoparsec.Text as A import qualified Data.Set as Set import Data.Char (isSpace, isAscii) +import Data.Either (isRight) +import Control.Applicative ((<|>)) import Network.URI (URI (uriScheme), parseURI, escapeURIString) urlEncode :: T.Text -> T.Text @@ -28,7 +32,9 @@ urlEncode = UTF8.toText . HTTP.urlEncode True . UTF8.fromText -- | Escape whitespace and some punctuation characters in URI. escapeURI :: T.Text -> T.Text -escapeURI = T.pack . escapeURIString (not . needsEscaping) . T.unpack +escapeURI t + | isBase64DataURI t = t + | otherwise = (T.pack . escapeURIString (not . needsEscaping) . T.unpack) t where needsEscaping c = isSpace c || T.any (== c) "<>|\"{}[]^`" -- @@ -87,12 +93,38 @@ schemes = Set.fromList , "doi", "gemini", "isbn", "javascript", "pmid" ] +isBase64DataURI :: T.Text -> Bool +isBase64DataURI = + isRight . A.parseOnly pBase64DataURI + +pBase64DataURI :: A.Parser T.Text +pBase64DataURI = mconcat <$> sequence + [ (A.string "data:") + , (T.singleton <$> (A.letter <|> A.digit)) + , restrictedName + , (T.singleton <$> A.char '/') + , restrictedName + , (mconcat <$> A.many' mediaParam) + , A.string ";base64," + , A.takeWhile1 (A.inClass "A-Za-z0-9+/") + , A.takeWhile (== '=') + ] + where + restrictedName = A.takeWhile1 (A.inClass "0-9a-zA-Z!#$&^_.+-") + mediaParam = mconcat <$> sequence + [ (T.singleton <$> A.char ';') + , restrictedName + , (T.singleton <$> A.char '=') + , A.takeWhile (/= ';') + ] + -- | Check if the string is a valid URL with a IANA or frequently used but -- unofficial scheme (see @schemes@). isURI :: T.Text -> Bool -isURI = - -- we URI-escape non-ASCII characters because otherwise parseURI will choke: - maybe False hasKnownScheme . parseURI . escapeURIString isAscii . T.unpack +isURI t = + isBase64DataURI t || + -- we URI-escape non-ASCII characters because otherwise parseURI will choke: + (maybe False hasKnownScheme . parseURI . escapeURIString isAscii . T.unpack) t where hasKnownScheme = (`Set.member` schemes) . T.toLower . T.filter (/= ':') . T.pack . uriScheme