Skip to content

Commit

Permalink
shrink public API surface in laika.markdown.*
Browse files Browse the repository at this point in the history
  • Loading branch information
jenshalm committed Jul 14, 2023
1 parent a5b143d commit 76ed269
Show file tree
Hide file tree
Showing 11 changed files with 162 additions and 171 deletions.
60 changes: 29 additions & 31 deletions core/shared/src/main/scala/laika/markdown/BlockParsers.scala
Original file line number Diff line number Diff line change
Expand Up @@ -24,40 +24,39 @@ import laika.parse.builders._
import laika.parse.implicits._
import laika.parse.text.{ PrefixedParser, WhitespacePreprocessor }

/** Provides all block parsers for Markdown text except for for lists which
* are factored out into a separate parser object and those blocks dealing
* with verbatim HTML markup which this library treats as an optional
* feature that has to be explicitly mixed in.
/** Provides all block parsers for Markdown text except for lists
* which are factored out into a separate parser object
* and those blocks dealing with verbatim HTML markup
* which this library treats as an optional feature that has to be explicitly mixed in.
*
* Block parsers are only concerned with splitting the document into
* (potentially nested) blocks. They are used in the first phase of parsing,
* while delegating to inline parsers for the 2nd phase.
* Block parsers are only concerned with splitting the document into (potentially nested) blocks.
* They are used in the first phase of parsing, while delegating to inline parsers for the 2nd phase.
*
* @author Jens Halm
* @author Jens Halm
*/
object BlockParsers {
private[laika] object BlockParsers {

/** Parses a single tab or space character.
*/
val tabOrSpace: Parser[Unit] = oneOf(' ', '\t').void

/** Parses up to 3 space characters. In Markdown an indentation
* of up to 3 spaces is optional and does not have any influence
* on the parsing logic.
/** Parses up to 3 space characters.
* In Markdown an indentation of up to 3 spaces is optional
* and does not have any influence on the parsing logic.
*/
val insignificantSpaces: Parser[Unit] = anyOf(' ').max(3).void

/** Parses the decoration (underline) of a setext header.
*/
val setextDecoration: Parser[String] = (someOf('=') | someOf('-')) <~ wsEol
private val setextDecoration: Parser[String] = (someOf('=') | someOf('-')) <~ wsEol

/** Parses a single Markdown block. In contrast to the generic block parser of the
* generic block parsers this method also consumes and ignores up to three optional space
* characters at the start of each line.
/** Parses a single Markdown block. In contrast to the generic `block` parser,
* this method also consumes and ignores up to three optional space
* characters at the start of each line.
*
* @param firstLinePrefix parser that recognizes the start of the first line of this block
* @param linePrefix parser that recognizes the start of subsequent lines that still belong to the same block
* @param nextBlockPrefix parser that recognizes whether a line after one or more blank lines still belongs to the same block
* @param firstLinePrefix parser that recognizes the start of the first line of this block
* @param linePrefix parser that recognizes the start of subsequent lines that still belong to the same block
* @param nextBlockPrefix parser that recognizes whether a line after one or more blank lines still belongs to the same block
*/
def mdBlock(
firstLinePrefix: Parser[Any],
Expand All @@ -67,15 +66,14 @@ object BlockParsers {
block(firstLinePrefix, insignificantSpaces ~ linePrefix, nextBlockPrefix)
}

/** Parses a single Markdown block. In contrast to the `mdBlock` parser
* this method also verifies that the second line is not a setext header
* decoration.
/** Parses a single Markdown block. In contrast to the `mdBlock` parser
* this method also verifies that the second line is not a setext header decoration.
*
* @param firstLinePrefix parser that recognizes the start of the first line of this block
* @param linePrefix parser that recognizes the start of subsequent lines that still belong to the same block
* @param nextBlockPrefix parser that recognizes whether a line after one or more blank lines still belongs to the same block
* @param firstLinePrefix parser that recognizes the start of the first line of this block
* @param linePrefix parser that recognizes the start of subsequent lines that still belong to the same block
* @param nextBlockPrefix parser that recognizes whether a line after one or more blank lines still belongs to the same block
*/
def decoratedBlock(
private def decoratedBlock(
firstLinePrefix: Parser[Any],
linePrefix: Parser[Any],
nextBlockPrefix: Parser[Any]
Expand Down Expand Up @@ -159,9 +157,9 @@ object BlockParsers {
}.rootOnly

/** Parses an ATX header, a line that starts with 1 to 6 `'#'` characters,
* with the number of hash characters corresponding to the level of the header.
* Markdown also allows to decorate the line with trailing `'#'` characters which
* this parser will remove.
* with the number of hash characters corresponding to the level of the header.
* Markdown also allows decorating the line with trailing `'#'` characters, which
* this parser will remove.
*/
val atxHeader: BlockParserBuilder = BlockParserBuilder.recursive { recParsers =>
def stripDecoration(text: String) = text.trim.reverse.dropWhile(_ == '#').reverse.trim
Expand All @@ -173,7 +171,7 @@ object BlockParsers {
}

/** Parses a horizontal rule, a line only decorated with three or more `'*'`, `'-'` or `'_'`
* characters with optional spaces between them
* characters with optional spaces between them.
*/
val rules: BlockParserBuilder = BlockParserBuilder.standalone {
val decoChar = oneOf('*', '-', '_')
Expand All @@ -193,7 +191,7 @@ object BlockParsers {
}

/** Parses a quoted block, a paragraph starting with a `'>'` character,
* with subsequent lines optionally starting with a `'>'`, too.
* with subsequent lines optionally starting with a `'>'`, too.
*/
val quotedBlock: BlockParserBuilder = BlockParserBuilder.recursive { recParsers =>
PrefixedParser('>') {
Expand Down
71 changes: 37 additions & 34 deletions core/shared/src/main/scala/laika/markdown/HTMLParsers.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,128 +17,131 @@
package laika.markdown

import cats.data.NonEmptySet
import laika.ast._
import laika.ast.*
import laika.bundle.{ BlockParserBuilder, SpanParserBuilder }
import laika.markdown.ast._
import laika.markdown.ast.*
import laika.parse.Parser
import laika.parse.markup.InlineParsers.spans
import laika.parse.markup.RecursiveSpanParsers
import laika.parse.text.{ CharGroup, DelimitedText, PrefixedParser }
import laika.parse.builders._
import laika.parse.implicits._
import laika.parse.builders.*
import laika.parse.implicits.*

/** Parses verbatim HTML elements which may interleave with standard Markdown markup.
* Extends the Markdown block and inline parsers, overriding several of their
* parsers to add the HTML functionality.
* Extends the Markdown block and inline parsers,
* overriding several of their parsers to add the HTML functionality.
*
* @author Jens Halm
* @author Jens Halm
*/
object HTMLParsers {
private[laika] object HTMLParsers {

private val htmlWSChars = NonEmptySet.of(' ', '\t', '\f', '\n', '\r')

private val htmlAttrEndChars = NonEmptySet.of('"', '\'', '<', '=', '/', '>') ++ htmlWSChars

/** Parses and consumes optional whitespace, always succeeds.
*/
val htmlWS: Parser[String] = anyOf(htmlWSChars)
private val htmlWS: Parser[String] = anyOf(htmlWSChars)

val htmlHexReference: Parser[String] = {
private val htmlHexReference: Parser[String] = {
val hexNumber = someOf(CharGroup.hexDigit)

((oneOf('x') | oneOf('X')) ~ hexNumber).source
}

val htmlDecReference: Parser[String] = someOf(CharGroup.digit)
private val htmlDecReference: Parser[String] = someOf(CharGroup.digit)

val htmlNamedReference: Parser[String] = someOf(CharGroup.alphaNum)
private val htmlNamedReference: Parser[String] = someOf(CharGroup.alphaNum)

/** Parses a numeric character reference (decimal or hexadecimal) without the leading `'&'`.
*/
val htmlNumericReference: Parser[String] = ("#" ~ (htmlHexReference | htmlDecReference)).source
private val htmlNumericReference: Parser[String] =
("#" ~ (htmlHexReference | htmlDecReference)).source

/** Parses a numeric or named character reference, including the leading `'&'` and the trailing `';'`.
*/
val htmlCharReference: PrefixedParser[HTMLCharacterReference] =
private val htmlCharReference: PrefixedParser[HTMLCharacterReference] =
("&" ~> (htmlNumericReference | htmlNamedReference) <~ ";").source.map(
HTMLCharacterReference(_)
)

val htmlAttributeName: Parser[String] = someNot(htmlAttrEndChars)
private val htmlAttributeName: Parser[String] = someNot(htmlAttrEndChars)

private def asTextContainers(spans: List[Span]): List[TextContainer] = spans.collect {
case tc: TextContainer => tc
}

val htmlUnquotedAttributeValue: Parser[(List[TextContainer], Option[Char])] =
private val htmlUnquotedAttributeValue: Parser[(List[TextContainer], Option[Char])] =
spans(delimitedBy(htmlAttrEndChars).keepDelimiter).embed(htmlCharReference)
.map { spans => (asTextContainers(spans), None) }

/** Parses an attribute value enclosed by the specified character.
*/
def htmlQuotedAttributeValue(c: String): Parser[(List[TextContainer], Option[Char])] =
private def htmlQuotedAttributeValue(c: String): Parser[(List[TextContainer], Option[Char])] =
c ~> spans(delimitedBy(c)).embed(htmlCharReference)
.map { spans => (asTextContainers(spans), Some(c.head)) }

/** Parses quoted and unquoted attribute values.
*/
val htmlAttributeValue: Parser[(List[TextContainer], Option[Char])] =
private val htmlAttributeValue: Parser[(List[TextContainer], Option[Char])] =
htmlQuotedAttributeValue("\"") |
htmlQuotedAttributeValue("'") |
htmlUnquotedAttributeValue

/** Parses a single attribute, consisting of the name and (optional) equals sign
* and value.
*/
val htmlAttribute: Parser[HTMLAttribute] =
private val htmlAttribute: Parser[HTMLAttribute] =
(htmlAttributeName <~ htmlWS) ~ opt("=" ~> htmlAttributeValue <~ htmlWS) ^^ {
case name ~ Some((value, quotedWith)) => HTMLAttribute(name, value, quotedWith)
case name ~ None => HTMLAttribute(name, Nil, None)
}

val htmlTagName: Parser[String] =
private val htmlTagName: Parser[String] =
(someOf(CharGroup.alpha) ~ anyNot(htmlWSChars.add('/').add('>'))).source

/** Parses an HTML tag without the enclosing `'<'` and `'>'` characters.
*/
val htmlTagContent: Parser[String ~ List[HTMLAttribute]] =
private val htmlTagContent: Parser[String ~ List[HTMLAttribute]] =
htmlTagName ~ (htmlWS ~> htmlAttribute.rep <~ htmlWS)

/** Parses an HTML end tag without the leading `'<'`.
*/
val htmlEndTag: Parser[HTMLEndTag] = "/" ~> htmlTagName <~ htmlWS <~ ">" ^^ { HTMLEndTag(_) }
private val htmlEndTag: Parser[HTMLEndTag] = "/" ~> htmlTagName <~ htmlWS <~ ">" ^^ {
HTMLEndTag(_)
}

/** Parses an HTML end tag if it matches the specified tag name.
*/
def htmlEndTag(tagName: String): DelimitedText = delimitedBy(
private def htmlEndTag(tagName: String): DelimitedText = delimitedBy(
("</" ~ tagName ~ htmlWS ~ ">").as("")
)

/** Parses an HTML comment without the leading `'<'`.
*/
val htmlComment: Parser[HTMLComment] = "!--" ~> delimitedBy("-->").map(HTMLComment(_))
private val htmlComment: Parser[HTMLComment] = "!--" ~> delimitedBy("-->").map(HTMLComment(_))

/** Parses an HTML script element without the leading `'<'`.
*/
val htmlScriptElement: Parser[HTMLScriptElement] =
private val htmlScriptElement: Parser[HTMLScriptElement] =
(("script" ~> (htmlWS ~> htmlAttribute.rep <~ htmlWS) <~ ">") ~ delimitedBy("</script>"))
.mapN(HTMLScriptElement(_, _))

/** Parses an empty HTML element without the leading `'<'`.
* Only recognizes empty tags explicitly closed.
*/
val htmlEmptyElement: Parser[HTMLEmptyElement] =
private val htmlEmptyElement: Parser[HTMLEmptyElement] =
(htmlTagContent <~ "/>").mapN(HTMLEmptyElement(_, _))

/** Parses an HTML start tag without the leading `'<'`.
* Does not match empty elements that are explicitly closed with `/>`.
*/
val htmlStartTag: Parser[HTMLStartTag] = (htmlTagContent <~ ">").mapN(HTMLStartTag(_, _))
private val htmlStartTag: Parser[HTMLStartTag] = (htmlTagContent <~ ">").mapN(HTMLStartTag(_, _))

/** Parses an HTML element without the leading `'<'`, but including
* all the nested HTML and Text elements.
*/
lazy val htmlElement: Parser[HTMLElement] = htmlStartTag >> { tag =>
private lazy val htmlElement: Parser[HTMLElement] = htmlStartTag >> { tag =>
spans(htmlEndTag(tag.name)).embedAll(htmlBlockParsers).map { spans =>
HTMLElement(tag, spans)
}
Expand All @@ -147,7 +150,7 @@ object HTMLParsers {
/** Parses an HTML element without the leading `'<'`, but including
* all the nested HTML and Text elements, as well as any nested Markdown spans.
*/
def htmlElementWithNestedMarkdown(recParsers: RecursiveSpanParsers): Parser[HTMLElement] =
private def htmlElementWithNestedMarkdown(recParsers: RecursiveSpanParsers): Parser[HTMLElement] =
htmlStartTag >> { tag =>
recParsers.recursiveSpans(htmlEndTag(tag.name)).map { spans =>
HTMLElement(tag, spans)
Expand All @@ -168,15 +171,15 @@ object HTMLParsers {

/** Parses any of the HTML span elements supported by this object, but no standard markdown inside HTML elements.
*/
lazy val htmlSpanInsideBlock: PrefixedParser[HTMLSpan] =
private lazy val htmlSpanInsideBlock: PrefixedParser[HTMLSpan] =
"<" ~> (htmlComment | htmlScriptElement | htmlEmptyElement | htmlElement | htmlEndTag | htmlStartTag)

/** Elements that the HTML specification does not define as "Phrasing Content".
* These elements can serve as the root of a Block instance in the Document model.
* For an HTML renderer this means that it can avoid wrapping these blocks
* inside p tags as it would do with a normal paragraph.
*/
val htmlBlockElements: Set[String] = Set(
private val htmlBlockElements: Set[String] = Set(
"body",
"style",
"blockquote",
Expand Down Expand Up @@ -229,7 +232,7 @@ object HTMLParsers {
/** Parses the start tag of an HTML block, only matches when the tag name is an
* actual block-level HTML tag.
*/
val htmlBlockStart: Parser[HTMLStartTag] = ("<" ~> htmlStartTag).collect {
private val htmlBlockStart: Parser[HTMLStartTag] = ("<" ~> htmlStartTag).collect {
case t @ HTMLStartTag(name, _, _) if htmlBlockElements.contains(name) => t
}

Expand All @@ -239,13 +242,13 @@ object HTMLParsers {
/** Parses a full HTML block, with the root element being a block-level HTML element
* and without parsing any standard Markdown markup.
*/
lazy val htmlBlock: Parser[HTMLBlock] = htmlBlockStart >> { tag =>
private lazy val htmlBlock: Parser[HTMLBlock] = htmlBlockStart >> { tag =>
spans(htmlEndTag(tag.name)).embedAll(htmlBlockParsers) <~ wsEol ^^ { spans =>
HTMLBlock(HTMLElement(tag, spans))
}
}

lazy val htmlBlockElement: Parser[Block] =
private lazy val htmlBlockElement: Parser[Block] =
"<" ~> (htmlComment | htmlEmptyElement | htmlStartTag) <~ wsEol ~ blankLine

lazy val htmlBlockFragment: BlockParserBuilder =
Expand Down
34 changes: 15 additions & 19 deletions core/shared/src/main/scala/laika/markdown/InlineParsers.scala
Original file line number Diff line number Diff line change
Expand Up @@ -16,35 +16,33 @@

package laika.markdown

import laika.ast._
import laika.ast.*
import laika.bundle.SpanParserBuilder
import laika.parse.{ LineSource, Parser, SourceFragment }
import laika.parse.markup.InlineParsers.text
import laika.parse.markup.RecursiveSpanParsers
import laika.parse.builders._
import laika.parse.implicits._
import laika.parse.builders.*
import laika.parse.implicits.*
import laika.parse.text.{ DelimitedText, PrefixedParser }

import scala.util.Try

/** Provides all inline parsers for Markdown text except for those dealing
* with verbatim HTML markup which this library treats as an optional
* feature that has to be explicitly mixed in.
/** Provides all inline parsers for Markdown text except for those dealing with verbatim HTML markup
* which this library treats as an optional feature that has to be explicitly mixed in.
*
* Inline parsers deal with markup within a block of text, such as a
* link or emphasized text. They are used in the second phase of parsing,
* after the block parsers have cut the document into a (potentially nested)
* block structure.
* Inline parsers deal with markup within a block of text, such as a link or emphasized text.
* They are used in the second phase of parsing,
* after the block parsers have cut the document into a (potentially nested) block structure.
*
* @author Jens Halm
* @author Jens Halm
*/
object InlineParsers {
private[laika] object InlineParsers {

/** Parses a single escaped character, only recognizing the characters the Markdown syntax document
* specifies as escapable.
* The `|` has been added to that list to support escaping in tables in the GitHub Flavor syntax.
/** Parses a single escaped character, only recognizing the characters the Markdown syntax document
* specifies as escapable.
* The `|` has been added to that list to support escaping in tables in the GitHub Flavor syntax.
*
* Note: escaping > is not mandated by the official syntax description, but by the official test suite.
* Note: escaping > is not mandated by the official syntax description, but by the official test suite.
*/
val escapedChar: Parser[String] =
oneOf('\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!', '>', '|')
Expand Down Expand Up @@ -88,7 +86,7 @@ object InlineParsers {
/** Parses a span enclosed by a single occurrence of the specified character.
* Recursively parses nested spans, too.
*/
def enclosedBySingleChar(
private def enclosedBySingleChar(
c: Char
)(implicit recParsers: RecursiveSpanParsers): PrefixedParser[List[Span]] = {
val start = delimiter(c).nextNot(' ', '\n', c)
Expand Down Expand Up @@ -118,8 +116,6 @@ object InlineParsers {

private def normalizeId(id: String): String = id.toLowerCase.replaceAll("[\n ]+", " ")

type RecParser = String => List[Span]

/** Parses a link, including nested spans in the link text.
* Recognizes both an inline link `[text](url)` and a link reference `[text][id]`.
*/
Expand Down
Loading

0 comments on commit 76ed269

Please sign in to comment.