Skip to content

Commit 3da4d41

Browse files
jgm and silby committed
Markdown reader: more efficient base64 data URI parsing.
This patch borrows some code from @silby's PR #10434 and should be regarded as co-authored. This is a lighter-weight patch that only touches the Markdown reader. The basic idea is to speed up parsing of base64 data URIs by parsing them with a special path. This should improve the problem noted at #10075. Benchmarks (optimized compilation), converting the large test.md from #10075 (7.6 MB embedded image) from markdown to json — before: 6182 GCs, 1578M in use, 5.471 MUT, 1.473 GC; after: 951 GCs, 80M in use, 0.247 MUT, 0.035 GC. For now we leave #10075 open to investigate improvements in HTML rendering with these large data URIs. Co-authored-by: Evan Silberman <[email protected]>
1 parent ca4ad3b commit 3da4d41

File tree

1 file changed

+36
-1
lines changed

1 file changed

+36
-1
lines changed

src/Text/Pandoc/Readers/Markdown.hs

+36-1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import Data.List (transpose, elemIndex, sortOn, foldl')
2828
import qualified Data.Map as M
2929
import Data.Maybe
3030
import qualified Data.Set as Set
31+
import qualified Data.Attoparsec.Text as A
3132
import Data.Text (Text)
3233
import qualified Data.Text as T
3334
import qualified Data.ByteString as BS
@@ -1834,12 +1835,46 @@ source = do
18341835
let sourceURL = T.unwords . T.words . T.concat <$> many urlChunk
18351836
let betweenAngles = try $
18361837
char '<' >> mconcat <$> (manyTill litChar (char '>'))
1837-
src <- try betweenAngles <|> sourceURL
1838+
src <- try betweenAngles <|> try pBase64DataURI <|> sourceURL
18381839
tit <- option "" $ try $ spnl >> linkTitle
18391840
skipSpaces
18401841
char ')'
18411842
return (escapeURI $ trimr src, tit)
18421843

1844+
-- | Parse a base64-encoded @data:@ URI and return its full text.
--
-- The accepted shape is @data:@, a media type (an alphanumeric character
-- followed by restricted-name characters, a @/@, and a run of
-- restricted-name characters), a @;@, zero or more @name=value;@
-- parameters, the literal @base64,@, and finally the payload as parsed by
-- 'pBase64Data'.
pBase64DataURI :: PandocMonad m => ParsecT Sources s m Text
pBase64DataURI = do
  scheme    <- textStr "data:"
  typeHead  <- T.singleton <$> alphaNum
  typeTail  <- restrictedName
  slash     <- T.singleton <$> char '/'
  subtype   <- restrictedName
  semicolon <- textStr ";"
  params    <- T.concat <$> many (try mediaParam)
  marker    <- textStr "base64,"
  payload   <- pBase64Data
  return $ T.concat
    [ scheme
    , typeHead
    , typeTail
    , slash
    , subtype
    , semicolon
    , params
    , marker
    , payload
    ]
 where
  -- A (possibly empty) run of characters allowed in a media type name.
  restrictedName = manyChar (satisfy (A.inClass "A-Za-z0-9!#$&^_.+-"))
  -- A single @name=value;@ parameter; the value runs up to the next @;@.
  mediaParam = do
    key        <- restrictedName
    equals     <- textStr "="
    value      <- manyChar (noneOf ";")
    terminator <- textStr ";"
    return (key <> equals <> value <> terminator)
1864+
1865+
-- | Parse the payload of a base64 data URI: a nonempty run of characters
-- from the base64 alphabet (@A-Za-z0-9+/@) followed by any @=@ padding.
--
-- Instead of consuming the payload one character at a time, the payload is
-- sliced off the current input chunk with 'T.span' and the remainder is
-- installed with 'setInput'.  Fails without consuming input when the input
-- is exhausted or the next character is not a base64 character.
--
-- Only the portion of the payload that lies in the current chunk is taken.
--
-- NOTE(review): the parser's source position is advanced only by the single
-- 'satisfy' below, not by the rest of the payload; confirm whether column
-- tracking after a data URI matters to callers.
pBase64Data :: PandocMonad m => ParsecT Sources s m Text
pBase64Data = do
  Sources inps <- getInput
  -- Drop chunks that are already exhausted so that the chunk sliced below
  -- is the one 'satisfy' reads its character from.  With an empty leading
  -- chunk the slice would otherwise be empty and 'setInput' would rewind
  -- over the character that was just consumed.  (Assumes the 'Sources'
  -- stream skips empty chunks when reading a character.)
  case dropWhile (T.null . snd) inps of
    [] -> mzero
    (origin, chunk) : rest -> do
      -- parse one character or parsec won't know we have consumed input
      _ <- satisfy isBase64Char
      let (payload, afterPayload) = T.span isBase64Char chunk
      let (padding, remaining) = T.span (== '=') afterPayload
      setInput $ Sources ((origin, remaining) : rest)
      return (payload <> padding)
 where
  isBase64Char = A.inClass "A-Za-z0-9+/"
1877+
18431878
-- | Parse a link title: tries 'quotedTitle' with a double quote first and
-- falls back to 'quotedTitle' with a single quote.
linkTitle :: PandocMonad m => MarkdownParser m Text
linkTitle = doubleQuoted <|> singleQuoted
 where
  doubleQuoted = quotedTitle '"'
  singleQuoted = quotedTitle '\''
18451880

0 commit comments

Comments
 (0)