From 9c2ac77619b026959435d5b85aeedef523718364 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Enr=C3=ADquez?= Date: Thu, 26 Jan 2023 11:35:57 +0100 Subject: [PATCH] [#270] Handle relative redirects Problem: Currently, Xrefcheck can follow redirects with an absolute location link, but it cannot handle relative ones. Solution: After parsing the location link, obtain the corresponding absolute link by using the original request one. --- CHANGES.md | 2 + src/Xrefcheck/Data/URI.hs | 74 ++++++++++++++ src/Xrefcheck/Verify.hs | 96 ++++++------------- tests/Test/Xrefcheck/RedirectChainSpec.hs | 54 +++++++++-- tests/Test/Xrefcheck/URIParsingSpec.hs | 16 ++-- tests/Test/Xrefcheck/UtilRequests.hs | 8 +- .../check-autolinks/check-autolinks.bats | 2 +- 7 files changed, 168 insertions(+), 84 deletions(-) create mode 100644 src/Xrefcheck/Data/URI.hs diff --git a/CHANGES.md b/CHANGES.md index 0c41e2ea..481adbca 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -52,6 +52,8 @@ Unreleased * [#268](https://github.com/serokell/xrefcheck/pull/268) + Added CLI option `--color` that enables ANSI colors in output. + Changed the output coloring defaults to show colors when `CI` env variable is `true`. +* [#271](https://github.com/serokell/xrefcheck/pull/271) + + Now Xrefcheck is able to follow relative redirects. 0.2.2 ========== diff --git a/src/Xrefcheck/Data/URI.hs b/src/Xrefcheck/Data/URI.hs new file mode 100644 index 00000000..14a3c55a --- /dev/null +++ b/src/Xrefcheck/Data/URI.hs @@ -0,0 +1,74 @@ +{- SPDX-FileCopyrightText: 2023 Serokell + - + - SPDX-License-Identifier: MPL-2.0 + -} + +{-# LANGUAGE ExistentialQuantification #-} + +module Xrefcheck.Data.URI + ( UriParseError (..) + , parseUri + ) where + +import Universum + +import Control.Exception.Safe (handleJust) +import Control.Monad.Except (throwError) +import Text.URI (ParseExceptionBs, URI, mkURIBs) +import URI.ByteString qualified as URIBS + +data UriParseError + = UPEInvalid URIBS.URIParseError + | UPEConversion ParseExceptionBs + deriving stock (Show, Eq) + +data AnyURIRef = forall a. AnyURIRef (URIBS.URIRef a) + +serializeAnyURIRef :: AnyURIRef -> ByteString +serializeAnyURIRef (AnyURIRef uri) = URIBS.serializeURIRef' uri + +-- | Parse URI according to RFC 3986 extended by allowing non-encoded +-- `[` and `]` in query string. +-- +-- The first parameter indicates whether the parsing should admit relative +-- URIs or not. +parseUri :: Bool -> Text -> ExceptT UriParseError IO URI +parseUri canBeRelative link = do + -- There exist two main standards of URL parsing: RFC 3986 and the Web + -- Hypertext Application Technology Working Group's URL standard. Ideally, + -- we want to be able to parse the URLs in accordance with the latter + -- standard, because it provides a much less ambiguous set of rules for + -- percent-encoding special characters, and is essentially a living + -- standard that gets updated constantly. + -- + -- We have chosen the 'uri-bytestring' library for URI parsing because + -- of the 'laxURIParseOptions' parsing configuration. 'mkURI' from + -- the 'modern-uri' library parses URIs in accordance with RFC 3986 and does + -- not provide a means of parsing customization, which contrasts with + -- 'parseURI' that accepts a 'URIParserOptions'. One of the predefined + -- configurations of this type is 'strictURIParserOptions', which follows + -- RFC 3986, and the other -- 'laxURIParseOptions' -- allows brackets + -- in the queries, which draws us closer to the WHATWG URL standard. + -- + -- The 'modern-uri' package can parse an URI deciding if it is absolute or + -- relative depending on the success or failure of the scheme parsing. By + -- contrast, in 'uri-bytestring' it has to be decided beforehand, resulting in + -- different URI types. + uri <- case URIBS.parseURI URIBS.laxURIParserOptions (encodeUtf8 link) of + Left (URIBS.MalformedScheme _) | canBeRelative -> + URIBS.parseRelativeRef URIBS.laxURIParserOptions (encodeUtf8 link) + & either (throwError . UPEInvalid) (pure . AnyURIRef) + Left err -> throwError $ UPEInvalid err + Right uri -> pure $ AnyURIRef uri + + -- We stick to our infrastructure by continuing to operate on the datatypes + -- from 'modern-uri', which are used in the 'req' library. First we + -- serialize our URI parsed with 'parseURI' so it becomes a 'ByteString' + -- with all the necessary special characters *percent-encoded*, and then + -- call 'mkURIBs'. + mkURIBs (serializeAnyURIRef uri) + -- Ideally, this exception should never be thrown, as the URI + -- already *percent-encoded* with 'parseURI' from 'uri-bytestring' + -- and 'mkURIBs' is only used to convert to 'URI' type from + -- 'modern-uri' package. + & handleJust fromException (throwError . UPEConversion) diff --git a/src/Xrefcheck/Verify.hs b/src/Xrefcheck/Verify.hs index dc281082..7468ae12 100644 --- a/src/Xrefcheck/Verify.hs +++ b/src/Xrefcheck/Verify.hs @@ -25,9 +25,6 @@ module Xrefcheck.Verify , verifyRepo , verifyReference , checkExternalResource - - -- * URI parsing - , parseUri , reportVerifyErrs ) where @@ -35,7 +32,7 @@ import Universum import Control.Concurrent.Async (Async, async, cancel, poll, wait, withAsync) import Control.Exception (AsyncException (..), throwIO) -import Control.Exception.Safe (handleAsync, handleJust) +import Control.Exception.Safe (handleAsync) import Control.Monad.Except (MonadError (..)) import Data.Bits (toIntegralSized) import Data.ByteString qualified as BS @@ -64,12 +61,13 @@ import Network.HTTP.Types.Header (hRetryAfter) import Network.HTTP.Types.Status (Status, statusCode, statusMessage) import Text.Interpolation.Nyan import Text.ParserCombinators.ReadPrec qualified as ReadPrec (lift) -import Text.URI (Authority (..), ParseExceptionBs, URI (..), mkURIBs, unRText) +import Text.URI (Authority (..), URI (..), relativeTo, render, unRText) import Time (RatioNat, Second, Time (..), ms, sec, threadDelay, timeout, (+:+), (-:-)) -import URI.ByteString qualified as URIBS +import Control.Monad.Trans.Except (withExceptT) import Xrefcheck.Config import Xrefcheck.Core +import Xrefcheck.Data.URI import Xrefcheck.Orphans () import Xrefcheck.Progress import Xrefcheck.Scan @@ -129,8 +127,7 @@ data VerifyError | LinkTargetNotAddedToGit RelPosixLink | AnchorDoesNotExist Text [Anchor] | AmbiguousAnchorRef RelPosixLink Text (NonEmpty Anchor) - | ExternalResourceInvalidUri URIBS.URIParseError - | ExternalResourceUriConversionError ParseExceptionBs + | ExternalResourceUriParseError UriParseError | ExternalResourceInvalidUrl (Maybe Text) | ExternalResourceUnknownProtocol | ExternalHttpResourceUnavailable Status @@ -196,12 +193,12 @@ instance Given ColorMode => Buildable VerifyError where can change silently while the document containing it evolves. |] - ExternalResourceInvalidUri err -> + ExternalResourceUriParseError (UPEInvalid err) -> [int|| Invalid URI (#{err}) |] - ExternalResourceUriConversionError err -> + ExternalResourceUriParseError (UPEConversion err) -> [int|| Invalid URI #{interpolateIndentF 4 . build $ displayException err} @@ -657,41 +654,6 @@ verifyReference . aName in throwError $ AnchorDoesNotExist anchor similarAnchors --- | Parse URI according to RFC 3986 extended by allowing non-encoded --- `[` and `]` in query string. -parseUri :: Text -> ExceptT VerifyError IO URI -parseUri link = do - -- There exist two main standards of URL parsing: RFC 3986 and the Web - -- Hypertext Application Technology Working Group's URL standard. Ideally, - -- we want to be able to parse the URLs in accordance with the latter - -- standard, because it provides a much less ambiguous set of rules for - -- percent-encoding special characters, and is essentially a living - -- standard that gets updated constantly. - -- - -- We have chosen the 'uri-bytestring' library for URI parsing because - -- of the 'laxURIParseOptions' parsing configuration. 'mkURI' from - -- the 'modern-uri' library parses URIs in accordance with RFC 3986 and does - -- not provide a means of parsing customization, which contrasts with - -- 'parseURI' that accepts a 'URIParserOptions'. One of the predefined - -- configurations of this type is 'strictURIParserOptions', which follows - -- RFC 3986, and the other -- 'laxURIParseOptions' -- allows brackets - -- in the queries, which draws us closer to the WHATWG URL standard. - uri' <- URIBS.parseURI URIBS.laxURIParserOptions (encodeUtf8 link) - & either (throwError . ExternalResourceInvalidUri) pure - - -- We stick to our infrastructure by continuing to operate on the datatypes - -- from `modern-uri`, which are used in the 'req' library. First we - -- serialize our URI parsed with 'parseURI' so it becomes a 'ByteString' - -- with all the necessary special characters *percent-encoded*, and then - -- call 'mkURIBs'. - mkURIBs (URIBS.serializeURIRef' uri') - -- Ideally, this exception should never be thrown, as the URI - -- already *percent-encoded* with 'parseURI' from 'uri-bytestring' - -- and 'mkURIBs' is only used to convert to 'URI' type from - -- 'modern-uri' package. - & handleJust (fromException @ParseExceptionBs) - (throwError . ExternalResourceUriConversionError) - checkExternalResource :: RedirectChain -> Config -> Text -> ExceptT VerifyError IO () checkExternalResource followed config@Config{..} link | isIgnored = pass @@ -700,7 +662,7 @@ checkExternalResource followed config@Config{..} link | ncMaxRedirectFollows >= 0 && totalFollowed followed > ncMaxRedirectFollows = throwError $ RedirectChainLimit $ followed `pushRequest` (RedirectChainLink link) | otherwise = do - uri <- parseUri link + uri <- ExternalResourceUriParseError `withExceptT` parseUri False link case toString <$> uriScheme uri of Just "http" -> checkHttp uri Just "https" -> checkHttp uri @@ -745,10 +707,8 @@ checkExternalResource followed config@Config{..} link let maxTime = Time @Second $ unTime ncExternalRefCheckTimeout * timeoutFrac - reqRes <- catch (liftIO (timeout maxTime $ reqLink $> RRDone)) $ \httpErr -> - case interpretErrors uri httpErr of - Left err -> throwError err - Right res -> pure $ Just res + reqRes <- catch (liftIO (timeout maxTime $ reqLink $> RRDone)) $ + (Just <$>) <$> interpretErrors uri case reqRes of Nothing -> throwError $ ExternalHttpTimeout $ extractHost uri @@ -767,7 +727,7 @@ checkExternalResource followed config@Config{..} link [ if ncIgnoreAuthFailures -- unauthorized access then flip elem [403, 401] else const False - , (405 ==) -- method mismatch + , (405 ==) -- method mismatch ] interpretErrors uri = \case @@ -777,20 +737,26 @@ checkExternalResource followed config@Config{..} link HttpExceptionRequest _ exc -> case exc of StatusCodeException resp _ | isRedirectCode code -> case redirectLocation of - Nothing -> Left $ RedirectMissingLocation $ followed `pushRequest` (RedirectChainLink link) - Just nextLink -> case redirectRule link nextLink code ncExternalRefRedirects of - Nothing -> Right RRDone - Just RedirectRule{..} -> - case rrOutcome of - RROValid -> Right RRDone - RROInvalid -> Left $ RedirectRuleError - (followed `pushRequest` (RedirectChainLink link) `pushRequest` (RedirectChainLink nextLink)) - rrOn - RROFollow -> Right $ RRFollow nextLink - | isAllowedErrorCode code -> Right RRDone + Nothing -> throwError $ RedirectMissingLocation $ followed `pushRequest` RedirectChainLink link + Just nextLink -> do + nextUri <- ExternalResourceUriParseError `withExceptT` parseUri True nextLink + nextLinkAbsolute <- case relativeTo nextUri uri of + -- This should not happen because uri has been parsed with `parseUri False` + Nothing -> error "Not an absolute URL exception" + Just absoluteTarget -> pure $ render absoluteTarget + case redirectRule link nextLinkAbsolute code ncExternalRefRedirects of + Nothing -> pure RRDone + Just RedirectRule{..} -> + case rrOutcome of + RROValid -> pure RRDone + RROInvalid -> throwError $ RedirectRuleError + (followed `pushRequest` RedirectChainLink link `pushRequest` RedirectChainLink nextLinkAbsolute) + rrOn + RROFollow -> pure $ RRFollow nextLinkAbsolute + | isAllowedErrorCode code -> pure RRDone | otherwise -> case statusCode (responseStatus resp) of - 429 -> Left $ ExternalHttpTooManyRequests (retryAfterInfo resp) (extractHost uri) - _ -> Left . ExternalHttpResourceUnavailable $ responseStatus resp + 429 -> throwError $ ExternalHttpTooManyRequests (retryAfterInfo resp) (extractHost uri) + _ -> throwError $ ExternalHttpResourceUnavailable $ responseStatus resp where code :: Int code = statusCode $ responseStatus resp @@ -799,7 +765,7 @@ checkExternalResource followed config@Config{..} link redirectLocation = fmap decodeUtf8 . lookup "Location" $ responseHeaders resp - other -> Left . ExternalResourceSomeError $ show other + other -> throwError $ ExternalResourceSomeError $ show other where retryAfterInfo :: Response a -> Maybe RetryAfter retryAfterInfo = readMaybe . decodeUtf8 <=< L.lookup hRetryAfter . responseHeaders diff --git a/tests/Test/Xrefcheck/RedirectChainSpec.hs b/tests/Test/Xrefcheck/RedirectChainSpec.hs index e52f3e41..4b7a4752 100644 --- a/tests/Test/Xrefcheck/RedirectChainSpec.hs +++ b/tests/Test/Xrefcheck/RedirectChainSpec.hs @@ -40,6 +40,35 @@ test_redirectRequests = testGroup "Redirect chain tests" (link "/cycle1") progress (VerifyResult [RedirectChainCycle $ chain ["/cycle1", "/cycle2", "/cycle3", "/cycle4", "/cycle2"]]) + , testGroup "Relative redirect" + [ testCase "Host" $ do + setRef <- newIORef mempty + checkLinkAndProgressWithServer + (configMod 1) + setRef + mockRedirect + (link "/relative/host") + progress + (VerifyResult [RedirectChainLimit $ chain ["/relative/host", "/cycle2", "/cycle3"]]) + , testCase "Path" $ do + setRef <- newIORef mempty + checkLinkAndProgressWithServer + (configMod 1) + setRef + mockRedirect + (link "/relative/path") + progress + (VerifyResult [RedirectChainLimit $ chain ["/relative/path", "/relative/host", "/cycle2"]]) + ] + , testCase "Other host redirect" $ withServer otherMockRedirect $ do + setRef <- newIORef mempty + checkLinkAndProgressWithServer + (configMod 1) + setRef + mockRedirect + "http://127.0.0.1:5001/other/host" + progress + (VerifyResult [RedirectChainLimit $ fromList ["http://127.0.0.1:5001/other/host", link "/relative/host", link "/cycle2"]]) , testGroup "Limit" [ testCase "Takes effect" $ do setRef <- newIORef mempty @@ -89,19 +118,28 @@ test_redirectRequests = testGroup "Redirect chain tests" redirectRoute name to = route name $ pure $ toResponse ( "" :: Text , mkStatus 301 "Permanent redirect" - , M.fromList [(CI.map (decodeUtf8 @Text) hLocation, fmap link $ maybeToList to)] + , M.fromList [(CI.map (decodeUtf8 @Text) hLocation, maybeToList to)] ) mockRedirect :: IO () - mockRedirect = + mockRedirect = do run 5000 do -- A set of redirect routes that correspond to a broken chain. - redirectRoute "/broken1" $ Just "/broken2" - redirectRoute "/broken2" $ Just "/broken3" + redirectRoute "/broken1" $ Just $ link "/broken2" + redirectRoute "/broken2" $ Just $ link "/broken3" redirectRoute "/broken3" Nothing -- A set of redirect routes that correspond to a cycle. - redirectRoute "/cycle1" $ Just "/cycle2" - redirectRoute "/cycle2" $ Just "/cycle3" - redirectRoute "/cycle3" $ Just "/cycle4" - redirectRoute "/cycle4" $ Just "/cycle2" + redirectRoute "/cycle1" $ Just $ link "/cycle2" + redirectRoute "/cycle2" $ Just $ link "/cycle3" + redirectRoute "/cycle3" $ Just $ link "/cycle4" + redirectRoute "/cycle4" $ Just $ link "/cycle2" + + -- Relative redirects. + redirectRoute "/relative/host" $ Just "/cycle2" + redirectRoute "/relative/path" $ Just "host" + + -- To other host + otherMockRedirect :: IO () + otherMockRedirect = + run 5001 $ redirectRoute "/other/host" $ Just $ link "/relative/host" diff --git a/tests/Test/Xrefcheck/URIParsingSpec.hs b/tests/Test/Xrefcheck/URIParsingSpec.hs index c7ac2a7b..e1ee23f7 100644 --- a/tests/Test/Xrefcheck/URIParsingSpec.hs +++ b/tests/Test/Xrefcheck/URIParsingSpec.hs @@ -11,9 +11,9 @@ import Test.Tasty (TestTree, testGroup) import Test.Tasty.HUnit (testCase, (@?=)) import Text.URI (URI) import Text.URI.QQ (uri) -import URI.ByteString (SchemaError (..), URIParseError (..)) +import URI.ByteString qualified as URIBS -import Xrefcheck.Verify (VerifyError (..), parseUri) +import Xrefcheck.Data.URI (UriParseError (..), parseUri) test_uri :: [TestTree] test_uri = @@ -36,20 +36,20 @@ test_uri = , testGroup "URI parsing should be unsuccessful" [ testCase "With the special characters anywhere else" do parseUri' "https://exam/?q=a&p=b#fra{g}ment" >>= - (@?= Left (ExternalResourceInvalidUri MalformedPath)) + (@?= Left (UPEInvalid URIBS.MalformedPath)) parseUri' "https://example.com/pa[t]h/to[/]smth?q=a&p=b" >>= - (@?= Left (ExternalResourceInvalidUri MalformedPath)) + (@?= Left (UPEInvalid URIBS.MalformedPath)) , testCase "With malformed scheme" do parseUri' "https//example.com/" >>= - (@?= Left (ExternalResourceInvalidUri $ MalformedScheme MissingColon)) + (@?= Left (UPEInvalid $ URIBS.MalformedScheme URIBS.MissingColon)) , testCase "With malformed fragment" do parseUri' "https://example.com/?q=a&p=b#fra{g}ment" >>= - (@?= Left (ExternalResourceInvalidUri MalformedFragment)) + (@?= Left (UPEInvalid URIBS.MalformedFragment)) ] ] where - parseUri' :: Text -> IO $ Either VerifyError URI - parseUri' = runExceptT . parseUri + parseUri' :: Text -> IO $ Either UriParseError URI + parseUri' = runExceptT . parseUri False diff --git a/tests/Test/Xrefcheck/UtilRequests.hs b/tests/Test/Xrefcheck/UtilRequests.hs index e37d9042..2587b9ff 100644 --- a/tests/Test/Xrefcheck/UtilRequests.hs +++ b/tests/Test/Xrefcheck/UtilRequests.hs @@ -11,6 +11,7 @@ module Test.Xrefcheck.UtilRequests , checkLinkAndProgressWithServerDefault , verifyLinkDefault , verifyReferenceWithProgressDefault + , withServer , VerifyLinkTestEntry (..) ) where @@ -31,13 +32,16 @@ import Xrefcheck.System import Xrefcheck.Util import Xrefcheck.Verify +withServer :: IO () -> IO () -> IO () +withServer mock = E.bracket (forkIO mock) killThread . const + checkMultipleLinksWithServer :: IO () -> IORef (S.Set DomainName) -> [VerifyLinkTestEntry] -> IO () checkMultipleLinksWithServer mock setRef entries = - E.bracket (forkIO mock) killThread $ \_ -> do + withServer mock $ do forM_ entries $ \VerifyLinkTestEntry {..} -> checkLinkAndProgress vlteConfigModifier @@ -55,7 +59,7 @@ checkLinkAndProgressWithServer -> VerifyResult VerifyError -> IO () checkLinkAndProgressWithServer configModifier setRef mock link progress vrExpectation = - E.bracket (forkIO mock) killThread $ \_ -> do + withServer mock $ checkLinkAndProgress configModifier setRef link progress vrExpectation checkLinkAndProgress diff --git a/tests/golden/check-autolinks/check-autolinks.bats b/tests/golden/check-autolinks/check-autolinks.bats index 8b27f2a9..94c486ca 100644 --- a/tests/golden/check-autolinks/check-autolinks.bats +++ b/tests/golden/check-autolinks/check-autolinks.bats @@ -34,7 +34,7 @@ assert_diff - < https://commonmark.org/ + -> https://commonmark.org ^-- stopped before this one Invalid references dumped, 1 in total.