Skip to content

Commit

Permalink
Implement a custom parser for <ac:*/> tags
Browse files Browse the repository at this point in the history
This replaces the workaround of substituting a magic string for the colons
in <ac:*/> tags with a custom parser that parses these tags as
ast.KindRawHTML.

The custom parser is a stripped down version of goldmark's rawHTMLParser.
  • Loading branch information
bernd committed Mar 31, 2023
1 parent e7e61ba commit 6e4a912
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 17 deletions.
112 changes: 112 additions & 0 deletions pkg/mark/ac_tag_parser.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
package mark

import (
"bytes"
"github.com/yuin/goldmark/ast"
"github.com/yuin/goldmark/parser"
"github.com/yuin/goldmark/text"
"github.com/yuin/goldmark/util"
"regexp"
)

// NewACTagParser creates the inline parser responsible for Confluence
// specific <ac:* /> tags. The tags are parsed as ast.KindRawHTML nodes so that
// they are not escaped at render time.
//
// Register the returned parser with a higher priority than goldmark's
// linkParser; otherwise the linkParser would parse the <ac:* /> tags first.
func NewACTagParser() parser.InlineParser {
	return new(acTagParser)
}

// acTagParser is a stripped down version of goldmark's rawHTMLParser that
// carries no state of its own.
// See: https://github.com/yuin/goldmark/blob/master/parser/raw_html.go
type acTagParser struct{}

// Compile-time check that *acTagParser implements parser.InlineParser.
var _ parser.InlineParser = (*acTagParser)(nil)

// Trigger reports the bytes this parser reacts to: only the opening angle
// bracket '<'.
func (s *acTagParser) Trigger() []byte {
	return []byte("<")
}

// Parse inspects the bytes following the first byte of the current line and
// dispatches to the matching sub-parser:
//
//   - an alphanumeric byte: opening tag, matched against openTagRegexp
//   - '/' followed by an alphanumeric byte: closing tag, matched against
//     closeTagRegexp
//   - '!' followed by an upper-case ASCII letter: declaration, consumed up to
//     closeDecl
//   - the openCDATA prefix: CDATA section, consumed up to closeCDATA
//
// It returns nil when none of these forms apply. The first byte itself is not
// checked here; presumably it is always the '<' announced by Trigger.
func (s *acTagParser) Parse(_ ast.Node, block text.Reader, pc parser.Context) ast.Node {
	line, _ := block.PeekLine()
	switch {
	case len(line) > 1 && util.IsAlphaNumeric(line[1]):
		return s.parseMultiLineRegexp(openTagRegexp, block, pc)
	case len(line) > 2 && line[1] == '/' && util.IsAlphaNumeric(line[2]):
		return s.parseMultiLineRegexp(closeTagRegexp, block, pc)
	case len(line) > 2 && line[1] == '!' && line[2] >= 'A' && line[2] <= 'Z':
		return s.parseUntil(block, closeDecl, pc)
	case bytes.HasPrefix(line, openCDATA):
		return s.parseUntil(block, closeCDATA, pc)
	default:
		return nil
	}
}

// Pattern fragments shared by the tag regular expressions.
var (
	// tagnamePattern captures the tag name that follows the "ac:" prefix: an
	// ASCII letter followed by letters, digits or hyphens.
	tagnamePattern = `([A-Za-z][A-Za-z0-9-]*)`

	// attributePattern matches a single attribute: leading whitespace (which
	// may include line breaks), a name, and an optional unquoted,
	// single-quoted or double-quoted value.
	attributePattern = `(?:[\r\n \t]+[a-zA-Z_:][a-zA-Z0-9:._-]*(?:[\r\n \t]*=[\r\n \t]*(?:[^\"'=<>` + "`" + `\x00-\x20]+|'[^']*'|"[^"]*"))?)`
)

// Only match <ac:*/> tags. Both expressions are anchored to the start of the
// input.
var (
	openTagRegexp  = regexp.MustCompile(`^<ac:` + tagnamePattern + attributePattern + `*[ \t]*/?>`)
	closeTagRegexp = regexp.MustCompile(`^</ac:` + tagnamePattern + `\s*>`)
)

// Delimiters for CDATA sections and declarations.
var (
	openCDATA  = []byte("<![CDATA[")
	closeCDATA = []byte("]]>")
	closeDecl  = []byte(">")
)

// parseUntil collects input from the reader's current position up to and
// including the first occurrence of closer, scanning line by line, and returns
// it as a raw HTML node.
//
// If the reader runs out of lines before closer is found, the reader is reset
// to the position it had on entry and nil is returned.
func (s *acTagParser) parseUntil(block text.Reader, closer []byte, _ parser.Context) ast.Node {
	startLine, startSegment := block.Position()
	raw := ast.NewRawHTML()
	for line, segment := block.PeekLine(); line != nil; line, segment = block.PeekLine() {
		pos := bytes.Index(line, closer)
		if pos < 0 {
			// No closer on this line: take the whole line and keep scanning.
			raw.Segments.Append(segment)
			block.AdvanceLine()
			continue
		}
		// Closer found: keep everything up to and including it, then stop.
		consumed := pos + len(closer)
		raw.Segments.Append(segment.WithStop(segment.Start + consumed))
		block.Advance(consumed)
		return raw
	}
	// Closer never appeared; undo all advances made while scanning.
	block.SetPosition(startLine, startSegment)
	return nil
}

// parseMultiLineRegexp matches reg against the reader and, on success, returns
// a raw HTML node holding one segment per line covered by the match. It
// returns nil when reg does not match.
//
// The reader position after the match marks where the match ends. The reader
// is then rewound to the starting position and walked forward line by line so
// that each line's portion of the match is recorded as its own segment.
func (s *acTagParser) parseMultiLineRegexp(reg *regexp.Regexp, block text.Reader, _ parser.Context) ast.Node {
	firstLine, firstSegment := block.Position()
	if !block.Match(reg) {
		return nil
	}
	lastLine, lastSegment := block.Position()
	block.SetPosition(firstLine, firstSegment)

	raw := ast.NewRawHTML()
	for {
		line, segment := block.PeekLine()
		if line == nil {
			break
		}
		current, _ := block.Position()
		onLastLine := current == lastLine

		// Clip the segment to the match on its first and last lines.
		from, to := segment.Start, segment.Stop
		if current == firstLine {
			from = firstSegment.Start
		}
		if onLastLine {
			to = lastSegment.Start
		}
		raw.Segments.Append(text.NewSegment(from, to))

		if onLastLine {
			block.Advance(to - from)
			break
		}
		block.AdvanceLine()
	}
	return raw
}
22 changes: 7 additions & 15 deletions pkg/mark/markdown.go
Original file line number Diff line number Diff line change
Expand Up @@ -430,22 +430,9 @@ func (r *ConfluenceRenderer) renderCodeBlock(writer util.BufWriter, source []byt
return ast.WalkContinue, nil
}

// compileMarkdown will replace tags like <ac:rich-tech-body> with escaped
// equivalent, because goldmark markdown parser replaces that tags with
// <a href="ac:rich-text-body">ac:rich-text-body</a> because of the autolink
// rule.
func CompileMarkdown(markdown []byte, stdlib *stdlib.Lib) string {
log.Tracef(nil, "rendering markdown:\n%s", string(markdown))

colon := []byte("---bf-COLON---")

tags := regexp.MustCompile(`</?ac:[^>]+>`)

for _, match := range tags.FindAll(markdown, -1) {
// Replace the colon in all "<ac:*>" tags with the colon bytes to avoid having Goldmark escape the HTML output.
markdown = bytes.ReplaceAll(markdown, match, bytes.ReplaceAll(match, []byte(":"), colon))
}

converter := goldmark.New(
goldmark.WithExtensions(
extension.GFM,
Expand All @@ -461,6 +448,12 @@ func CompileMarkdown(markdown []byte, stdlib *stdlib.Lib) string {
html.WithUnsafe(),
))

converter.Parser().AddOptions(parser.WithInlineParsers(
// Must be registered with a higher priority than goldmark's linkParser to make sure goldmark doesn't parse
// the <ac:*/> tags.
util.Prioritized(NewACTagParser(), 199),
))

converter.Renderer().AddOptions(renderer.WithNodeRenderers(
util.Prioritized(NewConfluenceRenderer(stdlib), 100),
))
Expand All @@ -472,8 +465,7 @@ func CompileMarkdown(markdown []byte, stdlib *stdlib.Lib) string {
panic(err)
}

// Restore all the colons we previously replaced.
html := bytes.ReplaceAll(buf.Bytes(), colon, []byte(":"))
html := buf.Bytes()

log.Tracef(nil, "rendered markdown to html:\n%s", string(html))

Expand Down
4 changes: 2 additions & 2 deletions pkg/mark/testdata/macro-include.html
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<p><foo>bar</foo></p>
<ac:structured-macro ac:name="info">
<p><ac:structured-macro ac:name="info">
<ac:parameter ac:name="icon">true</ac:parameter>
<ac:parameter ac:name="title">Attention</ac:parameter>
<ac:rich-text-body>This is an info!</ac:rich-text-body>
</ac:structured-macro>
</ac:structured-macro></p>

0 comments on commit 6e4a912

Please sign in to comment.