[#93] Add support for basic html tags

Problem: HTML tags can be used in a markdown file. We should add support for basic HTML tags embedded in markdown file. Solution: Add support for image tags (<img src="link">), anchor hyperlinks (<a href="link">Text</a>) and anchor target locations (<a name="loc"> or <a id="loc">).
serokell · Dec 23, 2022 · 2430b54 · 2430b54
1 parent 16041d0
commit 2430b54
Show file tree

Hide file tree

Showing 5 changed files with 194 additions and 37 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -41,6 +41,9 @@ Unreleased
 * [#231](https://github.com/serokell/xrefcheck/pull/231)
   + Anchor analysis takes now into account the appropriate case-sensitivity depending on
   the configured Markdown flavour.
+* [#259](https://github.com/serokell/xrefcheck/pull/259)
+  + Add support for image tags `<img src="link">`, anchor hyperlinks `<a href="link">Text</a>`
+  and anchor target locations `<a name="loc">` or `<a id="loc">`.
 
 0.2.2
 ==========

diff --git a/src/Xrefcheck/Scanners/Markdown.hs b/src/Xrefcheck/Scanners/Markdown.hs
@@ -22,6 +22,7 @@ import Universum
 import CMarkGFM
   (Node (..), NodeType (..), PosInfo (..), commonmarkToNode, extAutolink, optFootnotes)
 import Control.Lens (_Just, makeLenses, makeLensesFor, (.=))
+import Control.Monad.Trans.RWS.CPS qualified as RWS
 import Control.Monad.Trans.Writer.CPS (Writer, runWriter, tell)
 import Data.Aeson (FromJSON (..), genericParseJSON)
 import Data.ByteString.Lazy qualified as BSL
@@ -195,6 +196,12 @@ removeIgnored = withIgnoreMode . cataNodeWithParentNodeInfo remove
             (IMSLink _, IMAGE {})        -> do
               ssIgnore .= Nothing
               return defNode
+            (IMSLink _, HTML_INLINE text) | isLink text -> do
+              ssIgnore .= Nothing
+              pure defNode
+            (IMSLink _, HTML_BLOCK text) | isLink text -> do
+              ssIgnore .= Nothing
+              pure defNode
             (IMSLink ignoreLinkState, _) -> do
               when (ignoreLinkState == ExpectingLinkInSubnodes) $
                 ssIgnore . _Just . ignoreMode .=  IMSLink ParentExpectsLink
@@ -264,57 +271,60 @@ removeIgnored = withIgnoreMode . cataNodeWithParentNodeInfo remove
             pure node
       (node, _) -> pure node
 
+findAttributes :: [Text] -> [Attribute Text] -> Maybe Text
+findAttributes (map T.toLower -> attrs) =
+  fmap snd . find (\(attr, _) -> T.toLower attr `elem` attrs)
+
+isLink :: Text -> Bool
+isLink (parseTags -> tags) = case safeHead tags of
+  Just (TagOpen tag attrs) ->
+      T.toLower tag == "a" && isJust (findAttributes ["href"] attrs)
+      || T.toLower tag == "img" && isJust (findAttributes ["src"] attrs)
+  _ -> False
+
+
 -- | Custom `foldMap` for source tree.
 foldNode :: (Monoid a, Monad m) => (Node -> m a) -> Node -> m a
 foldNode action node@(Node _ _ subs) = do
   a <- action node
   b <- concatForM subs (foldNode action)
   return (a <> b)
 
-type ExtractorM a = ReaderT MarkdownConfig (Writer [ScanError 'Parse]) a
+type ExtractorM a = RWS.RWS MarkdownConfig [ScanError 'Parse] (Maybe Reference) a
 
 -- | Extract information from source tree.
 nodeExtractInfo :: Node -> ExtractorM FileInfo
 nodeExtractInfo input@(Node _ _ nSubs) = do
   if checkIgnoreAllFile nSubs
   then return (diffToFileInfo mempty)
-  else diffToFileInfo <$> (foldNode extractor =<< lift (removeIgnored input))
+  else diffToFileInfo <$> (foldNode extractor =<< (RWS.writer . runWriter $ removeIgnored input))
 
   where
     extractor :: Node -> ExtractorM FileInfoDiff
-    extractor node@(Node pos ty _) =
-      case ty of
-        HTML_BLOCK _ -> do
-          return mempty
+    extractor node@(Node pos ty _) = do
+      reference' <- RWS.get
+      -- If current state is not `Nothing`, try extracting associated text
+      let fileInfoDiff = case (reference', ty) of
+             (Just ref, TEXT text) ->
+               mempty & fidReferences .~ DList.singleton ref {rName = text}
+             (Just ref, _) -> mempty & fidReferences .~ DList.singleton ref
+             _ -> mempty
+      RWS.put Nothing
+      fmap (fileInfoDiff <>) case ty of
+        HTML_BLOCK text | isLink text -> extractHtmlLink text
+
+        HTML_BLOCK text -> extractAnchor text
 
         HEADING lvl -> do
-          flavor <- asks mcFlavor
+          flavor <- RWS.asks mcFlavor
           let aType = HeaderAnchor lvl
           let aName = headerToAnchor flavor $ nodeExtractText node
           let aPos  = toPosition pos
           return $ FileInfoDiff DList.empty $ DList.singleton $ Anchor {aType, aName, aPos}
 
-        HTML_INLINE text -> do
-          let
-            mName = do
-              tag <- safeHead $ parseTags text
-              attributes <- case tag of
-                TagOpen a attrs
-                  | T.toLower a == "a" -> Just attrs
-                _ -> Nothing
-              (_, name) <- find (\(field, _) -> T.toLower field `elem` ["name", "id"]) attributes
-              pure name
+        HTML_INLINE text | isLink text -> extractHtmlLink text
 
-          case mName of
-            Just aName -> do
-              let aType = HandAnchor
-                  aPos  = toPosition pos
-              return $ FileInfoDiff
-                mempty
-                (pure $ Anchor {aType, aName, aPos})
-
-            Nothing -> do
-              return mempty
+        HTML_INLINE text -> extractAnchor text
 
         LINK url _ -> extractLink url
 
@@ -328,17 +338,71 @@ nodeExtractInfo input@(Node _ _ nSubs) = do
               rPos = toPosition pos
               link = if null url then rName else url
 
-          let (rLink, rAnchor) = case T.splitOn "#" link of
-                [t] -> (t, Nothing)
-                t : ts -> (t, Just $ T.intercalate "#" ts)
-                [] -> error "impossible"
+          let (rLink, rAnchor) = splitLink link
 
           let rInfo = referenceInfo rLink
 
           return $ FileInfoDiff
             (DList.singleton $ Reference {rName, rPos, rLink, rAnchor, rInfo})
             DList.empty
 
+        extractAnchor :: Text -> ExtractorM FileInfoDiff
+        extractAnchor text = do
+          let mName = do
+                tag <- safeHead $ parseTags text
+                attributes <- case tag of
+                  TagOpen a attrs | T.toLower a == "a" -> Just attrs
+                  _ -> Nothing
+                (_, name) <- find (\(field, _) -> T.toLower field `elem` ["name", "id"]) attributes
+                pure name
+
+          case mName of
+            Just aName -> do
+              let aType = HandAnchor
+                  aPos  = toPosition pos
+              return $ FileInfoDiff
+                mempty
+                (pure $ Anchor {aType, aName, aPos})
+
+            Nothing -> do
+              return mempty
+
+        extractHtmlReference ::  [Attribute Text] -> Maybe PosInfo -> DList.DList Reference
+        extractHtmlReference attrs tagPos = fromMaybe mempty do
+          link <- findAttributes ["href"] attrs
+          let (rLink, rAnchor) = splitLink link
+          pure . DList.singleton $ Reference "" rLink rAnchor (toPosition tagPos) (referenceInfo rLink)
+
+        splitLink :: Text -> (Text, Maybe Text)
+        splitLink link = case T.splitOn "#" link of
+          [t]    -> (t, Nothing)
+          t : ts -> (t, Just $ T.intercalate "#" ts)
+          []     -> error "impossible"
+
+        extractHtmlImage :: [Attribute Text] -> Maybe PosInfo -> DList.DList Reference
+        extractHtmlImage attrs tagPos = fromMaybe mempty do
+          link <- findAttributes ["src"] attrs
+          pure . DList.singleton $ Reference "" link Nothing (toPosition tagPos) (referenceInfo link)
+
+        extractHtmlLink :: Text -> ExtractorM FileInfoDiff
+        extractHtmlLink text =
+          case safeHead $ parseTags text of
+            Just (TagOpen tag attrs) | T.toLower tag == "img" ->
+              pure $ mempty & fidReferences .~ extractHtmlImage attrs pos
+            Just (TagOpen tag attrs) | T.toLower tag == "a" -> do
+              let  reference = extractHtmlReference attrs pos
+              case DList.toList reference of
+                [ref] -> do
+                  -- The `cmark-gfm` package parses the link tag as three separate nodes:
+                  -- `HTML_INLINE` with an opening tag, a `TEXT` with text in between,
+                  -- and `HTML_INLINE` with a closing tag. So we keep the extracted link in a state and
+                  -- try to get associated text in the next node.
+                  RWS.put $ Just ref
+                  pure mempty
+                _ -> pure mempty
+            _ -> pure mempty
+
+
 -- | Check if there is `ignore all` at the beginning of the file,
 -- ignoring preceding comments if there are any.
 checkIgnoreAllFile :: [Node] -> Bool
@@ -406,11 +470,10 @@ textToMode _         = NotAnAnnotation
 
 parseFileInfo :: MarkdownConfig -> LT.Text -> (FileInfo, [ScanError 'Parse])
 parseFileInfo config input
-  = runWriter
-  $ flip runReaderT config
-  $ nodeExtractInfo
+  = RWS.evalRWS
+  (nodeExtractInfo
   $ commonmarkToNode [optFootnotes] [extAutolink]
-  $ toStrict input
+  $ toStrict input) config Nothing
 
 markdownScanner :: MarkdownConfig -> ScanAction
 markdownScanner config canonicalFile =

diff --git a/tests/golden/check-html/check-html.md b/tests/golden/check-html/check-html.md
@@ -6,8 +6,32 @@
 
 ## <a name='one'> <a name=two> <a NAME="three"> <a name="four"></a> <a     NAME=five   >  Title1
 
+<a name=six>
+
+text <a id=seven> text
+
 [One](#one)
 [Two](#two)
 [Three](#three)
 [Four](#four)
 [Five](#five)
+[Six](#six)
+[Seven](#seven)
+
+<img src="https://user-images.githubusercontent.com/5394217/70820564-06b06e00-1dea-11ea-9680-27f661ca2a58.png" alt="Output sample" width="600"/>
+
+text <img src="https://user-images.githubusercontent.com/5394217/70820564-06b06e00-1dea-11ea-9680-27f661ca2a58.png" alt="Output sample" width="600"/> text
+
+<a href=https://serokell.io/>serokell</a>
+
+text <a href=https://serokell.io/>serokell</a> text
+
+<a href=#six>Six</a>
+
+text <a href=#seven>Seven</a> text
+
+<!-- xrefcheck: ignore link -->
+<a href=https://serokell.io/404>serokell404</a>
+
+<!-- xrefcheck: ignore link -->
+text <a href=https://serokell.io/404>serokell404</a> text
diff --git a/tests/golden/check-html/check.html.bats b/tests/golden/check-html/check.html.bats
@@ -11,7 +11,7 @@ load '../helpers'
 
 
 @test "All HTML anchors should be valid" {
-  run xrefcheck
+  to_temp xrefcheck -v
 
-  assert_output --partial "All repository links are valid."
+  assert_diff expected.gold
 }
diff --git a/tests/golden/check-html/expected.gold b/tests/golden/check-html/expected.gold
@@ -0,0 +1,67 @@
+=== Repository data ===
+
+  check-html.md:
+    - references:
+        - reference (file-local) at src:13:1-11:
+            - text: "One"
+            - link: -
+            - anchor: one
+        - reference (file-local) at src:14:1-11:
+            - text: "Two"
+            - link: -
+            - anchor: two
+        - reference (file-local) at src:15:1-15:
+            - text: "Three"
+            - link: -
+            - anchor: three
+        - reference (file-local) at src:16:1-13:
+            - text: "Four"
+            - link: -
+            - anchor: four
+        - reference (file-local) at src:17:1-13:
+            - text: "Five"
+            - link: -
+            - anchor: five
+        - reference (file-local) at src:18:1-11:
+            - text: "Six"
+            - link: -
+            - anchor: six
+        - reference (file-local) at src:19:1-15:
+            - text: "Seven"
+            - link: -
+            - anchor: seven
+        - reference (external) at src:21:1-144:
+            - text: ""
+            - link: https://user-images.githubusercontent.com/5394217/70820564-06b06e00-1dea-11ea-9680-27f661ca2a58.png
+            - anchor: -
+        - reference (external) at src:23:6-149:
+            - text: ""
+            - link: https://user-images.githubusercontent.com/5394217/70820564-06b06e00-1dea-11ea-9680-27f661ca2a58.png
+            - anchor: -
+        - reference (external) at src:25:1-29:
+            - text: "serokell"
+            - link: https://serokell.io/
+            - anchor: -
+        - reference (external) at src:27:6-34:
+            - text: "serokell"
+            - link: https://serokell.io/
+            - anchor: -
+        - reference (file-local) at src:29:1-13:
+            - text: "Six"
+            - link: -
+            - anchor: six
+        - reference (file-local) at src:31:6-20:
+            - text: "Seven"
+            - link: -
+            - anchor: seven
+    - anchors:
+        - title1 (header II) at src:7:1-96
+        - one (hand made) at src:7:4-17
+        - two (hand made) at src:7:19-30
+        - three (hand made) at src:7:32-47
+        - four (hand made) at src:7:49-63
+        - five (hand made) at src:7:69-88
+        - six (hand made) at src:9:1-12
+        - seven (hand made) at src:11:6-17
+
+All repository links are valid.