Skip to content

Commit

Permalink
[#270] Handle relative redirects
Browse files Browse the repository at this point in the history
Problem: Currently, Xrefcheck can follow redirects with an absolute
location link, but it cannot handle relative ones.

Solution: After parsing the location link, obtain the corresponding
absolute link by using the original request one.
  • Loading branch information
aeqz committed Jan 26, 2023
1 parent 747c884 commit ac02933
Show file tree
Hide file tree
Showing 7 changed files with 168 additions and 84 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ Unreleased
* [#268](https://github.com/serokell/xrefcheck/pull/268)
+ Added CLI option `--color` that enables ANSI colors in output.
+ Changed the output coloring defaults to show colors when `CI` env variable is `true`.
* [#271](https://github.com/serokell/xrefcheck/pull/271)
+ Now Xrefcheck is able to follow relative redirects.

0.2.2
==========
Expand Down
74 changes: 74 additions & 0 deletions src/Xrefcheck/Data/URI.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
{- SPDX-FileCopyrightText: 2023 Serokell <https://serokell.io>
-
- SPDX-License-Identifier: MPL-2.0
-}

{-# LANGUAGE ExistentialQuantification #-}

module Xrefcheck.Data.URI
( UriParseError (..)
, parseUri
) where

import Universum

import Control.Exception.Safe (handleJust)
import Control.Monad.Except (throwError)
import Text.URI (ParseExceptionBs, URI, mkURIBs)
import URI.ByteString qualified as URIBS

data UriParseError
= UPEInvalid URIBS.URIParseError
| UPEConversion ParseExceptionBs
deriving stock (Show, Eq)

data AnyURIRef = forall a. AnyURIRef (URIBS.URIRef a)

serializeAnyURIRef :: AnyURIRef -> ByteString
serializeAnyURIRef (AnyURIRef uri) = URIBS.serializeURIRef' uri

-- | Parse URI according to RFC 3986 extended by allowing non-encoded
-- `[` and `]` in query string.
--
-- The first parameter indicates whether the parsing should admit relative
-- URIs or not.
parseUri :: Bool -> Text -> ExceptT UriParseError IO URI
parseUri canBeRelative link = do
-- There exist two main standards of URL parsing: RFC 3986 and the Web
-- Hypertext Application Technology Working Group's URL standard. Ideally,
-- we want to be able to parse the URLs in accordance with the latter
-- standard, because it provides a much less ambiguous set of rules for
-- percent-encoding special characters, and is essentially a living
-- standard that gets updated constantly.
--
-- We have chosen the 'uri-bytestring' library for URI parsing because
-- of the 'laxURIParseOptions' parsing configuration. 'mkURI' from
-- the 'modern-uri' library parses URIs in accordance with RFC 3986 and does
-- not provide a means of parsing customization, which contrasts with
-- 'parseURI' that accepts a 'URIParserOptions'. One of the predefined
-- configurations of this type is 'strictURIParserOptions', which follows
-- RFC 3986, and the other -- 'laxURIParseOptions' -- allows brackets
-- in the queries, which draws us closer to the WHATWG URL standard.
--
-- The 'modern-uri' package can parse an URI deciding if it is absolute or
-- relative depending on the success or failure of the scheme parsing. By
-- contrast, in 'uri-bytestring' it has to be decided beforehand, resulting in
-- different URI types.
uri <- case URIBS.parseURI URIBS.laxURIParserOptions (encodeUtf8 link) of
Left (URIBS.MalformedScheme _) | canBeRelative ->
URIBS.parseRelativeRef URIBS.laxURIParserOptions (encodeUtf8 link)
& either (throwError . UPEInvalid) (pure . AnyURIRef)
Left err -> throwError $ UPEInvalid err
Right uri -> pure $ AnyURIRef uri

-- We stick to our infrastructure by continuing to operate on the datatypes
-- from 'modern-uri', which are used in the 'req' library. First we
-- serialize our URI parsed with 'parseURI' so it becomes a 'ByteString'
-- with all the necessary special characters *percent-encoded*, and then
-- call 'mkURIBs'.
mkURIBs (serializeAnyURIRef uri)
-- Ideally, this exception should never be thrown, as the URI
-- already *percent-encoded* with 'parseURI' from 'uri-bytestring'
-- and 'mkURIBs' is only used to convert to 'URI' type from
-- 'modern-uri' package.
& handleJust fromException (throwError . UPEConversion)
96 changes: 31 additions & 65 deletions src/Xrefcheck/Verify.hs
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,14 @@ module Xrefcheck.Verify
, verifyRepo
, verifyReference
, checkExternalResource

-- * URI parsing
, parseUri
, reportVerifyErrs
) where

import Universum

import Control.Concurrent.Async (Async, async, cancel, poll, wait, withAsync)
import Control.Exception (AsyncException (..), throwIO)
import Control.Exception.Safe (handleAsync, handleJust)
import Control.Exception.Safe (handleAsync)
import Control.Monad.Except (MonadError (..))
import Data.Bits (toIntegralSized)
import Data.ByteString qualified as BS
Expand Down Expand Up @@ -66,12 +63,13 @@ import System.FilePath (isPathSeparator)
import System.FilePath.Posix ((</>))
import Text.Interpolation.Nyan
import Text.ParserCombinators.ReadPrec qualified as ReadPrec (lift)
import Text.URI (Authority (..), ParseExceptionBs, URI (..), mkURIBs, unRText)
import Text.URI (Authority (..), URI (..), relativeTo, render, unRText)
import Time (RatioNat, Second, Time (..), ms, sec, threadDelay, timeout, (+:+), (-:-))
import URI.ByteString qualified as URIBS

import Control.Monad.Trans.Except (withExceptT)
import Xrefcheck.Config
import Xrefcheck.Core
import Xrefcheck.Data.URI
import Xrefcheck.Orphans ()
import Xrefcheck.Progress
import Xrefcheck.Scan
Expand Down Expand Up @@ -131,8 +129,7 @@ data VerifyError
| LinkTargetNotAddedToGit FilePath
| AnchorDoesNotExist Text [Anchor]
| AmbiguousAnchorRef FilePath Text (NonEmpty Anchor)
| ExternalResourceInvalidUri URIBS.URIParseError
| ExternalResourceUriConversionError ParseExceptionBs
| ExternalResourceUriParseError UriParseError
| ExternalResourceInvalidUrl (Maybe Text)
| ExternalResourceUnknownProtocol
| ExternalHttpResourceUnavailable Status
Expand Down Expand Up @@ -194,12 +191,12 @@ instance Given ColorMode => Buildable VerifyError where
can change silently while the document containing it evolves.
|]

ExternalResourceInvalidUri err ->
ExternalResourceUriParseError (UPEInvalid err) ->
[int||
Invalid URI (#{err})
|]

ExternalResourceUriConversionError err ->
ExternalResourceUriParseError (UPEConversion err) ->
[int||
Invalid URI
#{interpolateIndentF 4 . build $ displayException err}
Expand Down Expand Up @@ -662,41 +659,6 @@ verifyReference
. aName
in throwError $ AnchorDoesNotExist anchor similarAnchors

-- | Parse URI according to RFC 3986 extended by allowing non-encoded
-- `[` and `]` in query string.
parseUri :: Text -> ExceptT VerifyError IO URI
parseUri link = do
-- There exist two main standards of URL parsing: RFC 3986 and the Web
-- Hypertext Application Technology Working Group's URL standard. Ideally,
-- we want to be able to parse the URLs in accordance with the latter
-- standard, because it provides a much less ambiguous set of rules for
-- percent-encoding special characters, and is essentially a living
-- standard that gets updated constantly.
--
-- We have chosen the 'uri-bytestring' library for URI parsing because
-- of the 'laxURIParseOptions' parsing configuration. 'mkURI' from
-- the 'modern-uri' library parses URIs in accordance with RFC 3986 and does
-- not provide a means of parsing customization, which contrasts with
-- 'parseURI' that accepts a 'URIParserOptions'. One of the predefined
-- configurations of this type is 'strictURIParserOptions', which follows
-- RFC 3986, and the other -- 'laxURIParseOptions' -- allows brackets
-- in the queries, which draws us closer to the WHATWG URL standard.
uri' <- URIBS.parseURI URIBS.laxURIParserOptions (encodeUtf8 link)
& either (throwError . ExternalResourceInvalidUri) pure

-- We stick to our infrastructure by continuing to operate on the datatypes
-- from `modern-uri`, which are used in the 'req' library. First we
-- serialize our URI parsed with 'parseURI' so it becomes a 'ByteString'
-- with all the necessary special characters *percent-encoded*, and then
-- call 'mkURIBs'.
mkURIBs (URIBS.serializeURIRef' uri')
-- Ideally, this exception should never be thrown, as the URI
-- already *percent-encoded* with 'parseURI' from 'uri-bytestring'
-- and 'mkURIBs' is only used to convert to 'URI' type from
-- 'modern-uri' package.
& handleJust (fromException @ParseExceptionBs)
(throwError . ExternalResourceUriConversionError)

checkExternalResource :: RedirectChain -> Config -> Text -> ExceptT VerifyError IO ()
checkExternalResource followed config@Config{..} link
| isIgnored = pass
Expand All @@ -705,7 +667,7 @@ checkExternalResource followed config@Config{..} link
| ncMaxRedirectFollows >= 0 && totalFollowed followed > ncMaxRedirectFollows =
throwError $ RedirectChainLimit $ followed `pushRequest` (RedirectChainLink link)
| otherwise = do
uri <- parseUri link
uri <- ExternalResourceUriParseError `withExceptT` parseUri False link
case toString <$> uriScheme uri of
Just "http" -> checkHttp uri
Just "https" -> checkHttp uri
Expand Down Expand Up @@ -750,10 +712,8 @@ checkExternalResource followed config@Config{..} link

let maxTime = Time @Second $ unTime ncExternalRefCheckTimeout * timeoutFrac

reqRes <- catch (liftIO (timeout maxTime $ reqLink $> RRDone)) $ \httpErr ->
case interpretErrors uri httpErr of
Left err -> throwError err
Right res -> pure $ Just res
reqRes <- catch (liftIO (timeout maxTime $ reqLink $> RRDone)) $
(Just <$>) <$> interpretErrors uri

case reqRes of
Nothing -> throwError $ ExternalHttpTimeout $ extractHost uri
Expand All @@ -772,7 +732,7 @@ checkExternalResource followed config@Config{..} link
[ if ncIgnoreAuthFailures -- unauthorized access
then flip elem [403, 401]
else const False
, (405 ==) -- method mismatch
, (405 ==) -- method mismatch
]

interpretErrors uri = \case
Expand All @@ -782,20 +742,26 @@ checkExternalResource followed config@Config{..} link
HttpExceptionRequest _ exc -> case exc of
StatusCodeException resp _
| isRedirectCode code -> case redirectLocation of
Nothing -> Left $ RedirectMissingLocation $ followed `pushRequest` (RedirectChainLink link)
Just nextLink -> case redirectRule link nextLink code ncExternalRefRedirects of
Nothing -> Right RRDone
Just RedirectRule{..} ->
case rrOutcome of
RROValid -> Right RRDone
RROInvalid -> Left $ RedirectRuleError
(followed `pushRequest` (RedirectChainLink link) `pushRequest` (RedirectChainLink nextLink))
rrOn
RROFollow -> Right $ RRFollow nextLink
| isAllowedErrorCode code -> Right RRDone
Nothing -> throwError $ RedirectMissingLocation $ followed `pushRequest` RedirectChainLink link
Just nextLink -> do
nextUri <- ExternalResourceUriParseError `withExceptT` parseUri True nextLink
nextLinkAbsolute <- case relativeTo nextUri uri of
-- This should not happen because uri has been parsed with `parseUri False`
Nothing -> error "Not an absolute URL exception"
Just absoluteTarget -> pure $ render absoluteTarget
case redirectRule link nextLinkAbsolute code ncExternalRefRedirects of
Nothing -> pure RRDone
Just RedirectRule{..} ->
case rrOutcome of
RROValid -> pure RRDone
RROInvalid -> throwError $ RedirectRuleError
(followed `pushRequest` RedirectChainLink link `pushRequest` RedirectChainLink nextLinkAbsolute)
rrOn
RROFollow -> pure $ RRFollow nextLinkAbsolute
| isAllowedErrorCode code -> pure RRDone
| otherwise -> case statusCode (responseStatus resp) of
429 -> Left $ ExternalHttpTooManyRequests (retryAfterInfo resp) (extractHost uri)
_ -> Left . ExternalHttpResourceUnavailable $ responseStatus resp
429 -> throwError $ ExternalHttpTooManyRequests (retryAfterInfo resp) (extractHost uri)
_ -> throwError $ ExternalHttpResourceUnavailable $ responseStatus resp
where
code :: Int
code = statusCode $ responseStatus resp
Expand All @@ -804,7 +770,7 @@ checkExternalResource followed config@Config{..} link
redirectLocation = fmap decodeUtf8
. lookup "Location"
$ responseHeaders resp
other -> Left . ExternalResourceSomeError $ show other
other -> throwError $ ExternalResourceSomeError $ show other
where
retryAfterInfo :: Response a -> Maybe RetryAfter
retryAfterInfo = readMaybe . decodeUtf8 <=< L.lookup hRetryAfter . responseHeaders
Expand Down
54 changes: 46 additions & 8 deletions tests/Test/Xrefcheck/RedirectChainSpec.hs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,35 @@ test_redirectRequests = testGroup "Redirect chain tests"
(link "/cycle1")
progress
(VerifyResult [RedirectChainCycle $ chain ["/cycle1", "/cycle2", "/cycle3", "/cycle4", "/cycle2"]])
, testGroup "Relative redirect"
[ testCase "Host" $ do
setRef <- newIORef mempty
checkLinkAndProgressWithServer
(configMod 1)
setRef
mockRedirect
(link "/relative/host")
progress
(VerifyResult [RedirectChainLimit $ chain ["/relative/host", "/cycle2", "/cycle3"]])
, testCase "Path" $ do
setRef <- newIORef mempty
checkLinkAndProgressWithServer
(configMod 1)
setRef
mockRedirect
(link "/relative/path")
progress
(VerifyResult [RedirectChainLimit $ chain ["/relative/path", "/relative/host", "/cycle2"]])
]
, testCase "Other host redirect" $ withServer otherMockRedirect $ do
setRef <- newIORef mempty
checkLinkAndProgressWithServer
(configMod 1)
setRef
mockRedirect
"http://127.0.0.1:5001/other/host"
progress
(VerifyResult [RedirectChainLimit $ fromList ["http://127.0.0.1:5001/other/host", link "/relative/host", link "/cycle2"]])
, testGroup "Limit"
[ testCase "Takes effect" $ do
setRef <- newIORef mempty
Expand Down Expand Up @@ -89,19 +118,28 @@ test_redirectRequests = testGroup "Redirect chain tests"
redirectRoute name to = route name $ pure $ toResponse
( "" :: Text
, mkStatus 301 "Permanent redirect"
, M.fromList [(CI.map (decodeUtf8 @Text) hLocation, fmap link $ maybeToList to)]
, M.fromList [(CI.map (decodeUtf8 @Text) hLocation, maybeToList to)]
)

mockRedirect :: IO ()
mockRedirect =
mockRedirect = do
run 5000 do
-- A set of redirect routes that correspond to a broken chain.
redirectRoute "/broken1" $ Just "/broken2"
redirectRoute "/broken2" $ Just "/broken3"
redirectRoute "/broken1" $ Just $ link "/broken2"
redirectRoute "/broken2" $ Just $ link "/broken3"
redirectRoute "/broken3" Nothing

-- A set of redirect routes that correspond to a cycle.
redirectRoute "/cycle1" $ Just "/cycle2"
redirectRoute "/cycle2" $ Just "/cycle3"
redirectRoute "/cycle3" $ Just "/cycle4"
redirectRoute "/cycle4" $ Just "/cycle2"
redirectRoute "/cycle1" $ Just $ link "/cycle2"
redirectRoute "/cycle2" $ Just $ link "/cycle3"
redirectRoute "/cycle3" $ Just $ link "/cycle4"
redirectRoute "/cycle4" $ Just $ link "/cycle2"

-- Relative redirects.
redirectRoute "/relative/host" $ Just "/cycle2"
redirectRoute "/relative/path" $ Just "host"

-- To other host
otherMockRedirect :: IO ()
otherMockRedirect =
run 5001 $ redirectRoute "/other/host" $ Just $ link "/relative/host"
16 changes: 8 additions & 8 deletions tests/Test/Xrefcheck/URIParsingSpec.hs
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ import Test.Tasty (TestTree, testGroup)
import Test.Tasty.HUnit (testCase, (@?=))
import Text.URI (URI)
import Text.URI.QQ (uri)
import URI.ByteString (SchemaError (..), URIParseError (..))
import URI.ByteString qualified as URIBS

import Xrefcheck.Verify (VerifyError (..), parseUri)
import Xrefcheck.Data.URI (UriParseError (..), parseUri)

test_uri :: [TestTree]
test_uri =
Expand All @@ -36,20 +36,20 @@ test_uri =
, testGroup "URI parsing should be unsuccessful"
[ testCase "With the special characters anywhere else" do
parseUri' "https://exa<mple.co>m/?q=a&p=b#fra{g}ment" >>=
(@?= Left (ExternalResourceInvalidUri MalformedPath))
(@?= Left (UPEInvalid URIBS.MalformedPath))

parseUri' "https://example.com/pa[t]h/to[/]smth?q=a&p=b" >>=
(@?= Left (ExternalResourceInvalidUri MalformedPath))
(@?= Left (UPEInvalid URIBS.MalformedPath))

, testCase "With malformed scheme" do
parseUri' "https//example.com/" >>=
(@?= Left (ExternalResourceInvalidUri $ MalformedScheme MissingColon))
(@?= Left (UPEInvalid $ URIBS.MalformedScheme URIBS.MissingColon))

, testCase "With malformed fragment" do
parseUri' "https://example.com/?q=a&p=b#fra{g}ment" >>=
(@?= Left (ExternalResourceInvalidUri MalformedFragment))
(@?= Left (UPEInvalid URIBS.MalformedFragment))
]
]
where
parseUri' :: Text -> IO $ Either VerifyError URI
parseUri' = runExceptT . parseUri
parseUri' :: Text -> IO $ Either UriParseError URI
parseUri' = runExceptT . parseUri False
Loading

0 comments on commit ac02933

Please sign in to comment.