Skip to content

Commit

Permalink
[#93] Add support for basic html tags
Browse files Browse the repository at this point in the history
Problem: HTML tags can be used in a markdown file. We should add
support for basic HTML tags embedded in markdown file.

Solution: Add support for image tags (<img src="link">), anchor
hyperlinks (<a href="link">Text</a>) and anchor target
locations (<a name="loc"> or <a id="loc">).
  • Loading branch information
Sereja313 committed Dec 23, 2022
1 parent 16041d0 commit 2430b54
Show file tree
Hide file tree
Showing 5 changed files with 194 additions and 37 deletions.
3 changes: 3 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ Unreleased
* [#231](https://github.com/serokell/xrefcheck/pull/231)
+ Anchor analysis takes now into account the appropriate case-sensitivity depending on
the configured Markdown flavour.
* [#259](https://github.com/serokell/xrefcheck/pull/259)
+ Add support for image tags `<img src="link">`, anchor hyperlinks `<a href="link">Text</a>`
and anchor target locations `<a name="loc">` or `<a id="loc">`.

0.2.2
==========
Expand Down
133 changes: 98 additions & 35 deletions src/Xrefcheck/Scanners/Markdown.hs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import Universum
import CMarkGFM
(Node (..), NodeType (..), PosInfo (..), commonmarkToNode, extAutolink, optFootnotes)
import Control.Lens (_Just, makeLenses, makeLensesFor, (.=))
import Control.Monad.Trans.RWS.CPS qualified as RWS
import Control.Monad.Trans.Writer.CPS (Writer, runWriter, tell)
import Data.Aeson (FromJSON (..), genericParseJSON)
import Data.ByteString.Lazy qualified as BSL
Expand Down Expand Up @@ -195,6 +196,12 @@ removeIgnored = withIgnoreMode . cataNodeWithParentNodeInfo remove
(IMSLink _, IMAGE {}) -> do
ssIgnore .= Nothing
return defNode
(IMSLink _, HTML_INLINE text) | isLink text -> do
ssIgnore .= Nothing
pure defNode
(IMSLink _, HTML_BLOCK text) | isLink text -> do
ssIgnore .= Nothing
pure defNode
(IMSLink ignoreLinkState, _) -> do
when (ignoreLinkState == ExpectingLinkInSubnodes) $
ssIgnore . _Just . ignoreMode .= IMSLink ParentExpectsLink
Expand Down Expand Up @@ -264,57 +271,60 @@ removeIgnored = withIgnoreMode . cataNodeWithParentNodeInfo remove
pure node
(node, _) -> pure node

findAttributes :: [Text] -> [Attribute Text] -> Maybe Text
findAttributes (map T.toLower -> attrs) =
fmap snd . find (\(attr, _) -> T.toLower attr `elem` attrs)

isLink :: Text -> Bool
isLink (parseTags -> tags) = case safeHead tags of
Just (TagOpen tag attrs) ->
T.toLower tag == "a" && isJust (findAttributes ["href"] attrs)
|| T.toLower tag == "img" && isJust (findAttributes ["src"] attrs)
_ -> False


-- | Custom `foldMap` for source tree.
foldNode :: (Monoid a, Monad m) => (Node -> m a) -> Node -> m a
foldNode action node@(Node _ _ subs) = do
a <- action node
b <- concatForM subs (foldNode action)
return (a <> b)

type ExtractorM a = ReaderT MarkdownConfig (Writer [ScanError 'Parse]) a
type ExtractorM a = RWS.RWS MarkdownConfig [ScanError 'Parse] (Maybe Reference) a

-- | Extract information from source tree.
nodeExtractInfo :: Node -> ExtractorM FileInfo
nodeExtractInfo input@(Node _ _ nSubs) = do
if checkIgnoreAllFile nSubs
then return (diffToFileInfo mempty)
else diffToFileInfo <$> (foldNode extractor =<< lift (removeIgnored input))
else diffToFileInfo <$> (foldNode extractor =<< (RWS.writer . runWriter $ removeIgnored input))

where
extractor :: Node -> ExtractorM FileInfoDiff
extractor node@(Node pos ty _) =
case ty of
HTML_BLOCK _ -> do
return mempty
extractor node@(Node pos ty _) = do
reference' <- RWS.get
-- If current state is not `Nothing`, try extracting associated text
let fileInfoDiff = case (reference', ty) of
(Just ref, TEXT text) ->
mempty & fidReferences .~ DList.singleton ref {rName = text}
(Just ref, _) -> mempty & fidReferences .~ DList.singleton ref
_ -> mempty
RWS.put Nothing
fmap (fileInfoDiff <>) case ty of
HTML_BLOCK text | isLink text -> extractHtmlLink text

HTML_BLOCK text -> extractAnchor text

HEADING lvl -> do
flavor <- asks mcFlavor
flavor <- RWS.asks mcFlavor
let aType = HeaderAnchor lvl
let aName = headerToAnchor flavor $ nodeExtractText node
let aPos = toPosition pos
return $ FileInfoDiff DList.empty $ DList.singleton $ Anchor {aType, aName, aPos}

HTML_INLINE text -> do
let
mName = do
tag <- safeHead $ parseTags text
attributes <- case tag of
TagOpen a attrs
| T.toLower a == "a" -> Just attrs
_ -> Nothing
(_, name) <- find (\(field, _) -> T.toLower field `elem` ["name", "id"]) attributes
pure name
HTML_INLINE text | isLink text -> extractHtmlLink text

case mName of
Just aName -> do
let aType = HandAnchor
aPos = toPosition pos
return $ FileInfoDiff
mempty
(pure $ Anchor {aType, aName, aPos})

Nothing -> do
return mempty
HTML_INLINE text -> extractAnchor text

LINK url _ -> extractLink url

Expand All @@ -328,17 +338,71 @@ nodeExtractInfo input@(Node _ _ nSubs) = do
rPos = toPosition pos
link = if null url then rName else url

let (rLink, rAnchor) = case T.splitOn "#" link of
[t] -> (t, Nothing)
t : ts -> (t, Just $ T.intercalate "#" ts)
[] -> error "impossible"
let (rLink, rAnchor) = splitLink link

let rInfo = referenceInfo rLink

return $ FileInfoDiff
(DList.singleton $ Reference {rName, rPos, rLink, rAnchor, rInfo})
DList.empty

extractAnchor :: Text -> ExtractorM FileInfoDiff
extractAnchor text = do
let mName = do
tag <- safeHead $ parseTags text
attributes <- case tag of
TagOpen a attrs | T.toLower a == "a" -> Just attrs
_ -> Nothing
(_, name) <- find (\(field, _) -> T.toLower field `elem` ["name", "id"]) attributes
pure name

case mName of
Just aName -> do
let aType = HandAnchor
aPos = toPosition pos
return $ FileInfoDiff
mempty
(pure $ Anchor {aType, aName, aPos})

Nothing -> do
return mempty

extractHtmlReference :: [Attribute Text] -> Maybe PosInfo -> DList.DList Reference
extractHtmlReference attrs tagPos = fromMaybe mempty do
link <- findAttributes ["href"] attrs
let (rLink, rAnchor) = splitLink link
pure . DList.singleton $ Reference "" rLink rAnchor (toPosition tagPos) (referenceInfo rLink)

splitLink :: Text -> (Text, Maybe Text)
splitLink link = case T.splitOn "#" link of
[t] -> (t, Nothing)
t : ts -> (t, Just $ T.intercalate "#" ts)
[] -> error "impossible"

extractHtmlImage :: [Attribute Text] -> Maybe PosInfo -> DList.DList Reference
extractHtmlImage attrs tagPos = fromMaybe mempty do
link <- findAttributes ["src"] attrs
pure . DList.singleton $ Reference "" link Nothing (toPosition tagPos) (referenceInfo link)

extractHtmlLink :: Text -> ExtractorM FileInfoDiff
extractHtmlLink text =
case safeHead $ parseTags text of
Just (TagOpen tag attrs) | T.toLower tag == "img" ->
pure $ mempty & fidReferences .~ extractHtmlImage attrs pos
Just (TagOpen tag attrs) | T.toLower tag == "a" -> do
let reference = extractHtmlReference attrs pos
case DList.toList reference of
[ref] -> do
-- The `cmark-gfm` package parses the link tag as three separate nodes:
-- `HTML_INLINE` with an opening tag, a `TEXT` with text in between,
-- and `HTML_INLINE` with a closing tag. So we keep the extracted link in a state and
-- try to get associated text in the next node.
RWS.put $ Just ref
pure mempty
_ -> pure mempty
_ -> pure mempty


-- | Check if there is `ignore all` at the beginning of the file,
-- ignoring preceding comments if there are any.
checkIgnoreAllFile :: [Node] -> Bool
Expand Down Expand Up @@ -406,11 +470,10 @@ textToMode _ = NotAnAnnotation

parseFileInfo :: MarkdownConfig -> LT.Text -> (FileInfo, [ScanError 'Parse])
parseFileInfo config input
= runWriter
$ flip runReaderT config
$ nodeExtractInfo
= RWS.evalRWS
(nodeExtractInfo
$ commonmarkToNode [optFootnotes] [extAutolink]
$ toStrict input
$ toStrict input) config Nothing

markdownScanner :: MarkdownConfig -> ScanAction
markdownScanner config canonicalFile =
Expand Down
24 changes: 24 additions & 0 deletions tests/golden/check-html/check-html.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,32 @@

## <a name='one'> <a name=two> <a NAME="three"> <a name="four"></a> <a NAME=five > Title1

<a name=six>

text <a id=seven> text

[One](#one)
[Two](#two)
[Three](#three)
[Four](#four)
[Five](#five)
[Six](#six)
[Seven](#seven)

<img src="https://user-images.githubusercontent.com/5394217/70820564-06b06e00-1dea-11ea-9680-27f661ca2a58.png" alt="Output sample" width="600"/>

text <img src="https://user-images.githubusercontent.com/5394217/70820564-06b06e00-1dea-11ea-9680-27f661ca2a58.png" alt="Output sample" width="600"/> text

<a href=https://serokell.io/>serokell</a>

text <a href=https://serokell.io/>serokell</a> text

<a href=#six>Six</a>

text <a href=#seven>Seven</a> text

<!-- xrefcheck: ignore link -->
<a href=https://serokell.io/404>serokell404</a>

<!-- xrefcheck: ignore link -->
text <a href=https://serokell.io/404>serokell404</a> text
4 changes: 2 additions & 2 deletions tests/golden/check-html/check.html.bats
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ load '../helpers'


@test "All HTML anchors should be valid" {
run xrefcheck
to_temp xrefcheck -v

assert_output --partial "All repository links are valid."
assert_diff expected.gold
}
67 changes: 67 additions & 0 deletions tests/golden/check-html/expected.gold
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
=== Repository data ===

check-html.md:
- references:
- reference (file-local) at src:13:1-11:
- text: "One"
- link: -
- anchor: one
- reference (file-local) at src:14:1-11:
- text: "Two"
- link: -
- anchor: two
- reference (file-local) at src:15:1-15:
- text: "Three"
- link: -
- anchor: three
- reference (file-local) at src:16:1-13:
- text: "Four"
- link: -
- anchor: four
- reference (file-local) at src:17:1-13:
- text: "Five"
- link: -
- anchor: five
- reference (file-local) at src:18:1-11:
- text: "Six"
- link: -
- anchor: six
- reference (file-local) at src:19:1-15:
- text: "Seven"
- link: -
- anchor: seven
- reference (external) at src:21:1-144:
- text: ""
- link: https://user-images.githubusercontent.com/5394217/70820564-06b06e00-1dea-11ea-9680-27f661ca2a58.png
- anchor: -
- reference (external) at src:23:6-149:
- text: ""
- link: https://user-images.githubusercontent.com/5394217/70820564-06b06e00-1dea-11ea-9680-27f661ca2a58.png
- anchor: -
- reference (external) at src:25:1-29:
- text: "serokell"
- link: https://serokell.io/
- anchor: -
- reference (external) at src:27:6-34:
- text: "serokell"
- link: https://serokell.io/
- anchor: -
- reference (file-local) at src:29:1-13:
- text: "Six"
- link: -
- anchor: six
- reference (file-local) at src:31:6-20:
- text: "Seven"
- link: -
- anchor: seven
- anchors:
- title1 (header II) at src:7:1-96
- one (hand made) at src:7:4-17
- two (hand made) at src:7:19-30
- three (hand made) at src:7:32-47
- four (hand made) at src:7:49-63
- five (hand made) at src:7:69-88
- six (hand made) at src:9:1-12
- seven (hand made) at src:11:6-17

All repository links are valid.

0 comments on commit 2430b54

Please sign in to comment.