From 0b4ce991a17e5777ee86cb7dcec6b2b070cfd04d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Enr=C3=ADquez?= Date: Mon, 19 Dec 2022 17:50:07 +0100 Subject: [PATCH 1/2] [#25] Redirect links with configuration rules Problem: We previously changed the default behaviour of Xrefcheck when following link redirects, but did not provide a way to configure it. Solution: We are adding a new field in the configuration file to allow writing a list of redirect rules that will be applied to links that match them. --- CHANGES.md | 28 +-- README.md | 32 ++- ftp-tests/Test/Xrefcheck/FtpLinks.hs | 12 +- src/Xrefcheck/Config.hs | 17 +- src/Xrefcheck/Config/Default.hs | 73 +++++-- src/Xrefcheck/Data/Redirect.hs | 170 ++++++++++++++++ src/Xrefcheck/Scan.hs | 2 + src/Xrefcheck/Util.hs | 11 + src/Xrefcheck/Verify.hs | 121 +++++++---- tests/Test/Xrefcheck/ConfigSpec.hs | 4 +- tests/Test/Xrefcheck/RedirectChainSpec.hs | 113 +++++++++++ tests/Test/Xrefcheck/RedirectConfigSpec.hs | 190 ++++++++++++++++++ ...RequestsSpec.hs => RedirectDefaultSpec.hs} | 19 +- tests/configs/github-config.yaml | 52 +++-- .../check-autolinks/check-autolinks.bats | 8 +- .../golden/check-redirect-parse/bad-code.yaml | 13 ++ tests/golden/check-redirect-parse/bad-on.yaml | 13 ++ .../check-redirect-parse/bad-outcome.yaml | 11 + .../golden/check-redirect-parse/bad-rule.yaml | 10 + .../check-redirect-parse/bad-rules.yaml | 10 + tests/golden/check-redirect-parse/bad-to.yaml | 12 ++ .../check-redirect-parse.bats | 93 +++++++++ .../check-redirect-parse/full-rule.yaml | 13 ++ .../check-redirect-parse/no-outcome.yaml | 14 ++ .../golden/check-redirect-parse/no-rules.yaml | 10 + .../check-redirect-parse/only-outcome-on.yaml | 12 ++ .../check-redirect-parse/only-outcome-to.yaml | 12 ++ .../check-redirect-parse/only-outcome.yaml | 11 + 28 files changed, 978 insertions(+), 108 deletions(-) create mode 100644 src/Xrefcheck/Data/Redirect.hs create mode 100644 tests/Test/Xrefcheck/RedirectChainSpec.hs create mode 100644 tests/Test/Xrefcheck/RedirectConfigSpec.hs rename tests/Test/Xrefcheck/{RedirectRequestsSpec.hs => RedirectDefaultSpec.hs} (83%) create mode 100644 tests/golden/check-redirect-parse/bad-code.yaml create mode 100644 tests/golden/check-redirect-parse/bad-on.yaml create mode 100644 tests/golden/check-redirect-parse/bad-outcome.yaml create mode 100644 tests/golden/check-redirect-parse/bad-rule.yaml create mode 100644 tests/golden/check-redirect-parse/bad-rules.yaml create mode 100644 tests/golden/check-redirect-parse/bad-to.yaml create mode 100644 tests/golden/check-redirect-parse/check-redirect-parse.bats create mode 100644 tests/golden/check-redirect-parse/full-rule.yaml create mode 100644 tests/golden/check-redirect-parse/no-outcome.yaml create mode 100644 tests/golden/check-redirect-parse/no-rules.yaml create mode 100644 tests/golden/check-redirect-parse/only-outcome-on.yaml create mode 100644 tests/golden/check-redirect-parse/only-outcome-to.yaml create mode 100644 tests/golden/check-redirect-parse/only-outcome.yaml diff --git a/CHANGES.md b/CHANGES.md index de5363f1..2cc88d2c 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -9,8 +9,8 @@ Unreleased * [#176](https://github.com/serokell/xrefcheck/pull/176) + Enabled `autolink` extension for `cmark-gfm`, so now we're finding strings - like `www.google.com` or `https://google.com`, treating them as links - and checking. + like `www.google.com` or `https://google.com`, treating them as links + and checking. * [#175](https://github.com/serokell/xrefcheck/pull/175) + Reorganize top-level config keys. * [#178](https://github.com/serokell/xrefcheck/pull/178) @@ -19,13 +19,13 @@ Unreleased + Add support for image links. * [#199](https://github.com/serokell/xrefcheck/pull/199) + Now annotation - `` instead of `` - should be used to disable checking for links in file, so it's clearer that - file itself is not ignored (and links can target it). + `` instead of `` + should be used to disable checking for links in file, so it's clearer that + file itself is not ignored (and links can target it). * [#215](https://github.com/serokell/xrefcheck/pull/215) + Now we notify user when there are scannable files that were not added to Git - yet. Also added CLI option `--include-untracked` to scan such files and treat - as existing. + yet. Also added CLI option `--include-untracked` to scan such files and treat + as existing. * [#191](https://github.com/serokell/xrefcheck/pull/191) + Now we consider slash `/` (and only it) as path separator in local links for all OS, so xrefcheck's report is OS-independent @@ -40,10 +40,14 @@ Unreleased redirect responses (i.e. 301 and 308) and passes for temporary ones (i.e. 302, 303, 307). * [#231](https://github.com/serokell/xrefcheck/pull/231) + Anchor analysis takes now into account the appropriate case-sensitivity depending on - the configured Markdown flavour. + the configured Markdown flavour. * [#254](https://github.com/serokell/xrefcheck/pull/254) + Now the `dump-config` command does not overwrite a file unless explicitly told with a `--force` flag. Also, a `--stdout` flag allows to print the config to stdout instead. + the configured Markdown flavour. +* [#250](https://github.com/serokell/xrefcheck/pull/250) + + Now the redirect behavior for external references can be modified via rules in the + configuration file with the `externalRefRedirects` parameter. 0.2.2 ========== @@ -95,7 +99,7 @@ Unreleased + Make possible to specify whether ignore localhost links, use `check-localhost` CLA argument (by default localhost links will not be checked). + Make possible to ignore auth failures (assume 'protected' links - valid), use `ignoreAuthFailures` parameter of config. + valid), use `ignoreAuthFailures` parameter of config. * [#66](https://github.com/serokell/xrefcheck/pull/66) + Added support for ftp links. * [#74](https://github.com/serokell/xrefcheck/pull/83) @@ -144,10 +148,10 @@ Unreleased + Switch to lts-17.3. * [#53](https://github.com/serokell/xrefcheck/pull/53) + Make possible to include a regular expression in - `ignoreRefs` parameter of config to ignore external - references. + `ignoreRefs` parameter of config to ignore external + references. + Add support of right in-place ignoring annotations - such as `ignore file`, `ignore paragraph` and `ignore link`. + such as `ignore file`, `ignore paragraph` and `ignore link`. 0.1.2 ======= diff --git a/README.md b/README.md index 73d76c56..42bde3b0 100644 --- a/README.md +++ b/README.md @@ -141,8 +141,36 @@ There are several ways to fix this: * This behavior can be disabled by setting `ignoreAuthFailures: false` in the config file. 1. How does `xrefcheck` handle redirects? - * Permanent redirects (i.e. 301 and 308) are reported as errors. - * Temporary redirects (i.e. 302, 303 and 307) are assumed to be valid. + * The rules from the default configuration are as follows: + * Permanent redirects (i.e. 301 and 308) are reported as errors. + * Temporary redirects (i.e. 302, 303 and 307) are assumed to be valid. + * Redirect rules can be specified with the `externalRefRedirects` parameter within `networking`, which accepts an array of + rules with keys `from`, `to`, `on` and `outcome`. The rule applied is the first one that matches with + the `from`, `to` and `on` fields, if any, where + * `from` is a regular expression, as in `ignoreExternalRefsTo`, for the source link in a single redirection step. Its absence means that + every link matches. + * `to` is a regular expression for the target link in a single redirection step. Its absence also means that every link matches. + * `on` accepts `temporary`, `permanent` or a specific redirect HTTP code. Its absence also means that + every response code matches. + * The `outcome` parameter accepts `valid`, `invalid` or `follow`. The last one follows the redirect by applying the + same configuration rules. + + For example, this configuration forbids 307 redirects to a specific domain and makes redirections from HTTP to HTTPS to be followed: + + ``` + externalRefRedirects: + - to: "https?://forbidden.com.*" + on: 307 + outcome: invalid + - from: "^http://.*" + to: "^https://.*" + outcome: follow + ``` + + The first one applies if both of them match. + + * The number of redirects allowed in a single redirect chain is limited and can be configured with the + `maxRedirectFollows` parameter, also within `networking`. A number smaller than 0 disables the limit. 1. How does `xrefcheck` handle localhost links? * By default, `xrefcheck` will ignore links to localhost. diff --git a/ftp-tests/Test/Xrefcheck/FtpLinks.hs b/ftp-tests/Test/Xrefcheck/FtpLinks.hs index c2d9e407..b063125f 100644 --- a/ftp-tests/Test/Xrefcheck/FtpLinks.hs +++ b/ftp-tests/Test/Xrefcheck/FtpLinks.hs @@ -15,7 +15,7 @@ import Test.Tasty (TestTree, askOption, testGroup) import Test.Tasty.HUnit (assertBool, assertFailure, testCase, (@?=)) import Test.Tasty.Options as Tasty (IsOption (..), OptionDescription (Option), safeRead) -import Xrefcheck.Config (Config, cExclusionsL, defConfig) +import Xrefcheck.Config import Xrefcheck.Core (Flavor (GitHub)) import Xrefcheck.Scan (ecIgnoreExternalRefsToL) import Xrefcheck.Verify (VerifyError (..), checkExternalResource) @@ -48,27 +48,27 @@ test_FtpLinks = askOption $ \(FtpHostOpt host) -> do testGroup "Ftp links handler" [ testCase "handles correct link to file" $ do let link = host <> "/pub/file_exists.txt" - result <- runExceptT $ checkExternalResource config link + result <- runExceptT $ checkExternalResource emptyChain config link result @?= Right () , testCase "handles empty link (host only)" $ do let link = host - result <- runExceptT $ checkExternalResource config link + result <- runExceptT $ checkExternalResource emptyChain config link result @?= Right () , testCase "handles correct link to non empty directory" $ do let link = host <> "/pub/" - result <- runExceptT $ checkExternalResource config link + result <- runExceptT $ checkExternalResource emptyChain config link result @?= Right () , testCase "handles correct link to empty directory" $ do let link = host <> "/empty/" - result <- runExceptT $ checkExternalResource config link + result <- runExceptT $ checkExternalResource emptyChain config link result @?= Right () , testCase "throws exception when file not found" $ do let link = host <> "/pub/file_does_not_exists.txt" - result <- runExceptT $ checkExternalResource config link + result <- runExceptT $ checkExternalResource emptyChain config link case result of Right () -> assertFailure "No exception was raised, FtpEntryDoesNotExist expected" diff --git a/src/Xrefcheck/Config.hs b/src/Xrefcheck/Config.hs index b5a832ce..263b4306 100644 --- a/src/Xrefcheck/Config.hs +++ b/src/Xrefcheck/Config.hs @@ -7,6 +7,7 @@ module Xrefcheck.Config ( module Xrefcheck.Config + , module Xrefcheck.Data.Redirect , defConfigText ) where @@ -15,11 +16,12 @@ import Universum import Control.Lens (makeLensesWith) import Data.Aeson (genericParseJSON) import Data.Yaml (FromJSON (..), decodeEither', prettyPrintParseException, withText) - +import Text.Regex.TDFA.Text () import Time (KnownRatName, Second, Time (..), unitsP) import Xrefcheck.Config.Default import Xrefcheck.Core +import Xrefcheck.Data.Redirect import Xrefcheck.Scan import Xrefcheck.Scanners.Markdown import Xrefcheck.Util (Field, aesonConfigOption, postfixFields) @@ -78,8 +80,16 @@ data NetworkingConfig' f = NetworkingConfig -- this `maxTimeoutRetries` option limits only the number of retries -- caused by timeouts, and `maxRetries` limits the number of retries -- caused both by 429s and timeouts. + , ncMaxRedirectFollows :: Field f Int + -- ^ Maximum number of links that can be followed in a single redirect + -- chain. + , ncExternalRefRedirects :: Field f RedirectConfig + -- ^ Rules to override the redirect behavior for external references. } deriving stock (Generic) +-- | A list of custom redirect rules. +type RedirectConfig = [RedirectRule] + -- | Type alias for ScannersConfig' with all required fields. type ScannersConfig = ScannersConfig' Identity @@ -118,7 +128,8 @@ overrideConfig config defScanners = cScanners $ defConfig flavor defExclusions = cExclusions $ defConfig flavor - defNetworking = cNetworking $ defConfig flavor + defNetworking = cNetworking (defConfig flavor) + & ncExternalRefRedirectsL .~ [] overrideExclusions exclusionConfig = ExclusionConfig @@ -138,6 +149,8 @@ overrideConfig config , ncDefaultRetryAfter = overrideField ncDefaultRetryAfter , ncMaxRetries = overrideField ncMaxRetries , ncMaxTimeoutRetries = overrideField ncMaxTimeoutRetries + , ncMaxRedirectFollows = overrideField ncMaxRedirectFollows + , ncExternalRefRedirects = overrideField ncExternalRefRedirects } where overrideField :: (forall f. NetworkingConfig' f -> Field f a) -> a diff --git a/src/Xrefcheck/Config/Default.hs b/src/Xrefcheck/Config/Default.hs index 99ca96d6..efdb31f9 100644 --- a/src/Xrefcheck/Config/Default.hs +++ b/src/Xrefcheck/Config/Default.hs @@ -7,6 +7,7 @@ module Xrefcheck.Config.Default where import Universum +import Fmt (Builder) import Text.Interpolation.Nyan import Xrefcheck.Core @@ -65,22 +66,45 @@ networking: # On other errors xrefcheck fails immediately, without retrying. maxRetries: 3 -# Querying a given domain that ever returned 429 before, -# this defines how many timeouts are allowed during retries. -# -# For such domains, timeouts likely mean hitting the rate limiter, -# and so xrefcheck considers timeouts in the same way as 429 errors. -# -# For other domains, a timeout results in a respective error, no retry -# attempts will be performed. Use `externalRefCheckTimeout` option -# to increase the time after which timeout is declared. -# -# This option is similar to `maxRetries`, the difference is that -# this `maxTimeoutRetries` option limits only the number of retries -# caused by timeouts, and `maxRetries` limits the number of retries -# caused both by 429s and timeouts. + # Querying a given domain that ever returned 429 before, + # this defines how many timeouts are allowed during retries. + # + # For such domains, timeouts likely mean hitting the rate limiter, + # and so xrefcheck considers timeouts in the same way as 429 errors. + # + # For other domains, a timeout results in a respective error, no retry + # attempts will be performed. Use `externalRefCheckTimeout` option + # to increase the time after which timeout is declared. + # + # This option is similar to `maxRetries`, the difference is that + # this `maxTimeoutRetries` option limits only the number of retries + # caused by timeouts, and `maxRetries` limits the number of retries + # caused both by 429s and timeouts. maxTimeoutRetries: 1 + # Maximum number of links that can be followed in a single redirect + # chain. + # + # The link is considered as invalid if the limit is exceeded. + maxRedirectFollows: 10 + + # Rules to override the redirect behavior for external references that + # match, where + # - 'from' is a regular expression for the source link in a single + # redirection step. Its absence means that every link matches. + # - 'to' is a regular expression for the target link in a single + # redirection step. Its absence also means that every link matches. + # - 'on' accepts 'temporary', 'permanent' or a specific redirect HTTP code. + # Its absence also means that every response code matches. + # - 'outcome' accepts 'valid', 'invalid' or 'follow'. The last one follows + # the redirect by applying the same configuration rules so, for instance, + # exclusion rules would also apply to the following links. + # + # The first one that matches is applied, and the link is considered + # as valid if none of them does match. + externalRefRedirects: +#{interpolateIndentF 4 externalRefRedirects} + # Parameters of scanners for various file types. scanners: # On 'anchor not found' error, how much similar anchors should be displayed as @@ -95,7 +119,7 @@ scanners: |] where ignoreLocalRefsFrom :: NonEmpty Text - ignoreLocalRefsFrom = fromList $ case flavor of + ignoreLocalRefsFrom = fromList $ case flavor of GitHub -> [ ".github/pull_request_template.md" , ".github/issue_template.md" @@ -108,7 +132,7 @@ scanners: ] ignoreLocalRefsTo :: NonEmpty Text - ignoreLocalRefsTo = fromList $ case flavor of + ignoreLocalRefsTo = fromList $ case flavor of GitHub -> [ "../../../issues" , "../../../issues/*" @@ -121,3 +145,20 @@ scanners: , "../../merge_requests" , "../../merge_requests/*" ] + + externalRefRedirects :: Builder + externalRefRedirects = case flavor of + GitHub -> + [int|| + - on: permanent + outcome: invalid|] + GitLab -> + [int|| + - on: permanent + outcome: invalid + # GitLab redirects non-existing files to the repository's main page + # with a 302 code instead of answering with a 404 response. + - from: https?://gitlab.com/.*/-/blob/.* + to: https?://gitlab.com/.* + on: 302 + outcome: invalid|] diff --git a/src/Xrefcheck/Data/Redirect.hs b/src/Xrefcheck/Data/Redirect.hs new file mode 100644 index 00000000..c0366130 --- /dev/null +++ b/src/Xrefcheck/Data/Redirect.hs @@ -0,0 +1,170 @@ +{- SPDX-FileCopyrightText: 2022 Serokell + - + - SPDX-License-Identifier: MPL-2.0 + -} + +{-# OPTIONS_GHC -fno-warn-orphans #-} + +module Xrefcheck.Data.Redirect + ( RedirectChain + , RedirectChainLink (..) + , emptyChain + , pushRequest + , hasRequest + , totalFollowed + + , RedirectRule (..) + , RedirectRuleOn (..) + , RedirectRuleOutcome (..) + , redirectRule + + , isPermanentRedirectCode + , isRedirectCode + , isTemporaryRedirectCode + ) where + +import Universum + +import Data.Aeson (genericParseJSON) +import Data.Yaml (FromJSON (..), withText) +import Fmt (Buildable (..)) +import Text.Regex.TDFA.Text (Regex) + +import Data.Sequence ((|>)) +import Xrefcheck.Scan () +import Xrefcheck.Util + +-- | A custom redirect rule. +data RedirectRule = RedirectRule + { rrFrom :: Maybe Regex + -- ^ Redirect source links that match to apply the rule. + -- + -- 'Nothing' matches any link. + , rrTo :: Maybe Regex + -- ^ Redirect target links that match to apply the rule. + -- + -- 'Nothing' matches any link. + , rrOn :: Maybe RedirectRuleOn + -- ^ HTTP code selector to apply the rule. + -- + -- 'Nothing' matches any code. + , rrOutcome :: RedirectRuleOutcome + -- ^ What to do when an HTTP response matches the rule. + } deriving stock (Generic) + +-- | Rule selector depending on the response HTTP code. +data RedirectRuleOn + = RROCode Int + -- ^ An exact HTTP code + | RROPermanent + -- ^ Any HTTP code considered as permanent according to 'isPermanentRedirectCode' + | RROTemporary + -- ^ Any HTTP code considered as permanent according to 'isTemporaryRedirectCode' + deriving stock (Show, Eq) + +-- | What to do when receiving a redirect HTTP response. +data RedirectRuleOutcome + = RROValid + -- ^ Consider it as valid + | RROInvalid + -- ^ Consider it as invalid + | RROFollow + -- ^ Try again by following the redirect + deriving stock (Show, Eq) + +-- | Links in a redirection chain. +newtype RedirectChain = RedirectChain + { unRedirectChain :: Seq RedirectChainLink + } deriving newtype (Show, Eq) + +-- | A single link in a redirection chain. +newtype RedirectChainLink = RedirectChainLink + { unRedirectChainLink :: Text + } deriving newtype (Show, Eq) + +instance FromList RedirectChain where + type ListElement RedirectChain = Text + fromList = RedirectChain . fromList . fmap RedirectChainLink + +emptyChain :: RedirectChain +emptyChain = RedirectChain mempty + +pushRequest :: RedirectChain -> RedirectChainLink -> RedirectChain +pushRequest (RedirectChain chain) = RedirectChain . (chain |>) + +hasRequest :: RedirectChain -> RedirectChainLink -> Bool +hasRequest (RedirectChain chain) = (`elem` chain) + +totalFollowed :: RedirectChain -> Int +totalFollowed = length . unRedirectChain + +instance Buildable RedirectChain where + build (RedirectChain linksStack) = build chainText + where + link (True, RedirectChainLink l) = "-| " <> l + link (False, RedirectChainLink l) = "-> " <> l + + chainText = mconcat + $ intersperse "\n" + $ fmap link + $ zip (True : repeat False) + $ toList linksStack + +-- | Redirect rule to apply to a link when it has been responded with a given +-- HTTP code. +redirectRule :: Text -> Text -> Int -> [RedirectRule] -> Maybe RedirectRule +redirectRule source target code rules = + find (matchRule source target code) rules + +-- | Check if a 'RedirectRule' matches a given link and HTTP code. +matchRule :: Text -> Text -> Int -> RedirectRule -> Bool +matchRule source target code RedirectRule{..} = and + [ matchCode + , matchLink source rrFrom + , matchLink target rrTo + ] + where + matchCode = case rrOn of + Nothing -> True + Just RROPermanent -> isPermanentRedirectCode code + Just RROTemporary -> isTemporaryRedirectCode code + Just (RROCode other) -> code == other + + matchLink link = \case + Nothing -> True + Just regex -> doesMatchAnyRegex link [regex] + +isRedirectCode :: Int -> Bool +isRedirectCode code = code >= 300 && code < 400 + +isTemporaryRedirectCode :: Int -> Bool +isTemporaryRedirectCode = flip elem [302, 303, 307] + +isPermanentRedirectCode :: Int -> Bool +isPermanentRedirectCode = flip elem [301, 308] + +instance FromJSON (RedirectRule) where + parseJSON = genericParseJSON aesonConfigOption + +instance FromJSON (RedirectRuleOutcome) where + parseJSON = withText "Redirect rule outcome" $ + \case + "valid" -> pure RROValid + "invalid" -> pure RROInvalid + "follow" -> pure RROFollow + _ -> fail "expected (valid|invalid|follow)" + +instance FromJSON (RedirectRuleOn) where + parseJSON v = code v + <|> text v + <|> fail "expected a redirect (3XX) HTTP code or (permanent|temporary)" + where + code cv = do + i <- parseJSON cv + guard $ isRedirectCode i + pure $ RROCode i + text = withText "Redirect rule on" $ + \case + "permanent" -> pure RROPermanent + "temporary" -> pure RROTemporary + _ -> mzero diff --git a/src/Xrefcheck/Scan.hs b/src/Xrefcheck/Scan.hs index 252b9a61..f3928b78 100644 --- a/src/Xrefcheck/Scan.hs +++ b/src/Xrefcheck/Scan.hs @@ -23,6 +23,8 @@ module Xrefcheck.Scan , mkGatherScanError , scanRepo , specificFormatsSupport + , defaultCompOption + , defaultExecOption , ecIgnoreL , ecIgnoreLocalRefsToL , ecIgnoreRefsFromL diff --git a/src/Xrefcheck/Util.hs b/src/Xrefcheck/Util.hs index 4038f792..64b7e2ab 100644 --- a/src/Xrefcheck/Util.hs +++ b/src/Xrefcheck/Util.hs @@ -10,6 +10,7 @@ module Xrefcheck.Util , (-:) , aesonConfigOption , composeFuncList + , doesMatchAnyRegex , posixTimeToTimeSecond , utcTimeToTimeSecond , unlessFunc @@ -29,6 +30,7 @@ import Data.Time (UTCTime) import Data.Time.Clock (nominalDiffTimeToSeconds) import Data.Time.Clock.POSIX (POSIXTime, utcTimeToPOSIXSeconds) import Fmt (Builder) +import Text.Regex.TDFA.Text (Regex, regexec) import Time (Second, Time (..), sec) import Xrefcheck.Util.Colorize @@ -72,3 +74,12 @@ whenFunc False _ = id unlessFunc :: Bool -> (a -> a) -> (a -> a) unlessFunc = whenFunc . not + +doesMatchAnyRegex :: Text -> ([Regex] -> Bool) +doesMatchAnyRegex src = any $ \regex -> + case regexec regex src of + Right res -> case res of + Just (before, match, after, _) -> + null before && null after && not (null match) + Nothing -> False + Left _ -> False diff --git a/src/Xrefcheck/Verify.hs b/src/Xrefcheck/Verify.hs index 3b62bfd7..55e67a61 100644 --- a/src/Xrefcheck/Verify.hs +++ b/src/Xrefcheck/Verify.hs @@ -49,7 +49,7 @@ import Data.Text.Metrics (damerauLevenshteinNorm) import Data.Time (UTCTime, defaultTimeLocale, formatTime, readPTime, rfc822DateFormat) import Data.Time.Clock.POSIX (getPOSIXTime) import Data.Traversable (for) -import Fmt (Buildable (..), fmt, maybeF, nameF) +import Fmt (Buildable (..), Builder, fmt, maybeF, nameF) import GHC.Exts qualified as Exts import GHC.Read (Read (readPrec)) import Network.FTP.Client @@ -66,7 +66,6 @@ import System.FilePath (isPathSeparator) import System.FilePath.Posix (()) import Text.Interpolation.Nyan import Text.ParserCombinators.ReadPrec qualified as ReadPrec (lift) -import Text.Regex.TDFA.Text (Regex, regexec) import Text.URI (Authority (..), ParseExceptionBs, URI (..), mkURIBs, unRText) import Time (RatioNat, Second, Time (..), ms, sec, threadDelay, timeout, (+:+), (-:-)) import URI.ByteString qualified as URIBS @@ -143,9 +142,16 @@ data VerifyError | ExternalFtpException FTPException | FtpEntryDoesNotExist FilePath | ExternalResourceSomeError Text - | PermanentRedirectError Text (Maybe Text) + | RedirectChainCycle RedirectChain + | RedirectMissingLocation RedirectChain + | RedirectChainLimit RedirectChain + | RedirectRuleError RedirectChain (Maybe RedirectRuleOn) deriving stock (Show, Eq) +data ResponseResult + = RRDone + | RRFollow Text + instance Given ColorMode => Buildable VerifyError where build = \case LocalFileDoesNotExist file -> @@ -160,7 +166,6 @@ instance Given ColorMode => Buildable VerifyError where #{file} |] - LinkTargetNotAddedToGit file -> [int|| Link target is not tracked by Git: @@ -252,20 +257,43 @@ instance Given ColorMode => Buildable VerifyError where #{err} |] - PermanentRedirectError url Nothing -> + RedirectChainCycle chain -> [int|| - Permanent redirect found: - #{url} + Cycle found in the following redirect chain: + #{interpolateIndentF 2 $ attachToRedirectChain chain "here"} |] - PermanentRedirectError url (Just redirectedUrl) -> + RedirectMissingLocation chain -> [int|| - Permanent redirect found. Perhaps you want to replace the link: - #{url} - by: - #{redirectedUrl} + Missing location header in the following redirect chain: + #{interpolateIndentF 2 $ attachToRedirectChain chain "no location header"} |] + RedirectChainLimit chain -> + [int|| + The follow redirects limit has been reached in the following redirect chain: + #{interpolateIndentF 2 $ attachToRedirectChain chain "stopped before this one"} + |] + + RedirectRuleError chain mOn -> + [int|| + #{redirect} found: + #{interpolateIndentF 2 $ attachToRedirectChain chain "stopped before this one"} + |] + where + redirect :: Text + redirect = case mOn of + Nothing -> "Redirect" + Just RROPermanent -> "Permanent redirect" + Just RROTemporary -> "Temporary redirect" + Just (RROCode code) -> show code <> " redirect" + +attachToRedirectChain :: RedirectChain -> Text -> Builder +attachToRedirectChain chain attached + = build chain <> build attachedText + where + attachedText = "\n ^-- " <> attached + data RetryCounter = RetryCounter { rcTotalRetries :: Int , rcTimeoutRetries :: Int @@ -280,7 +308,6 @@ incTotalCounter rc = rc {rcTotalRetries = rcTotalRetries rc + 1} incTimeoutCounter :: RetryCounter -> RetryCounter incTimeoutCounter rc = rc {rcTimeoutRetries = rcTimeoutRetries rc + 1} - reportVerifyErrs :: Given ColorMode => NonEmpty (WithReferenceLoc VerifyError) -> IO () reportVerifyErrs errs = fmt @@ -468,7 +495,7 @@ verifyReference let shownFilepath = dropWhile isPathSeparator (toString rLink) canonicalPath <- liftIO $ riRoot checkExternalResource config rLink + RIExternal -> checkExternalResource emptyChain config rLink RIOtherProtocol -> pass else pass where @@ -688,9 +715,13 @@ parseUri link = do & handleJust (fromException @ParseExceptionBs) (throwError . ExternalResourceUriConversionError) -checkExternalResource :: Config -> Text -> ExceptT VerifyError IO () -checkExternalResource Config{..} link +checkExternalResource :: RedirectChain -> Config -> Text -> ExceptT VerifyError IO () +checkExternalResource followed config@Config{..} link | isIgnored = pass + | followed `hasRequest` (RedirectChainLink link) = + throwError $ RedirectChainCycle $ followed `pushRequest` (RedirectChainLink link) + | ncMaxRedirectFollows >= 0 && totalFollowed followed > ncMaxRedirectFollows = + throwError $ RedirectChainLimit $ followed `pushRequest` (RedirectChainLink link) | otherwise = do uri <- parseUri link case toString <$> uriScheme uri of @@ -705,15 +736,6 @@ checkExternalResource Config{..} link isIgnored = doesMatchAnyRegex link ecIgnoreExternalRefsTo - doesMatchAnyRegex :: Text -> ([Regex] -> Bool) - doesMatchAnyRegex src = any $ \regex -> - case regexec regex src of - Right res -> case res of - Just (before, match, after, _) -> - null before && null after && not (null match) - Nothing -> False - Left _ -> False - checkHttp :: URI -> ExceptT VerifyError IO () checkHttp uri = makeHttpRequest uri HEAD 0.3 `catchError` \case e@(ExternalHttpTooManyRequests _ _) -> throwError e @@ -735,6 +757,7 @@ checkExternalResource Config{..} link -- so just in case we throw exception here Nothing -> throwError $ ExternalResourceInvalidUrl Nothing Just u -> pure u + let reqLink = case parsedUrl of Left (url, option) -> runReq httpConfig $ @@ -745,20 +768,21 @@ checkExternalResource Config{..} link let maxTime = Time @Second $ unTime ncExternalRefCheckTimeout * timeoutFrac - mres <- liftIO (timeout maxTime $ void reqLink) `catch` - (either throwError (\() -> return (Just ())) . interpretErrors uri) - maybe (throwError $ ExternalHttpTimeout $ extractHost uri) pure mres + reqRes <- catch (liftIO (timeout maxTime $ reqLink $> RRDone)) $ \httpErr -> + case interpretErrors uri httpErr of + Left err -> throwError err + Right res -> pure $ Just res + + case reqRes of + Nothing -> throwError $ ExternalHttpTimeout $ extractHost uri + Just RRDone -> pass + Just (RRFollow nextLink) -> + checkExternalResource (followed `pushRequest` (RedirectChainLink link)) config nextLink extractHost :: URI -> Maybe DomainName extractHost = either (const Nothing) (Just . DomainName . unRText . authHost) . uriAuthority - isTemporaryRedirectCode :: Int -> Bool - isTemporaryRedirectCode = flip elem [302, 303, 307] - - isPermanentRedirectCode :: Int -> Bool - isPermanentRedirectCode = flip elem [301, 308] - isAllowedErrorCode :: Int -> Bool isAllowedErrorCode = or . sequence -- We have to stay conservative - if some URL can be accessed under @@ -775,18 +799,29 @@ checkExternalResource Config{..} link InvalidUrlException{} -> error "External link URL invalid exception" HttpExceptionRequest _ exc -> case exc of StatusCodeException resp _ - | isPermanentRedirectCode code -> Left - . PermanentRedirectError link - . fmap decodeUtf8 - . lookup "Location" - $ responseHeaders resp - | isTemporaryRedirectCode code -> Right () - | isAllowedErrorCode code -> Right () + | isRedirectCode code -> case redirectLocation of + Nothing -> Left $ RedirectMissingLocation $ followed `pushRequest` (RedirectChainLink link) + Just nextLink -> case redirectRule link nextLink code ncExternalRefRedirects of + Nothing -> Right RRDone + Just RedirectRule{..} -> + case rrOutcome of + RROValid -> Right RRDone + RROInvalid -> Left $ RedirectRuleError + (followed `pushRequest` (RedirectChainLink link) `pushRequest` (RedirectChainLink nextLink)) + rrOn + RROFollow -> Right $ RRFollow nextLink + | isAllowedErrorCode code -> Right RRDone | otherwise -> case statusCode (responseStatus resp) of - 429 -> Left $ ExternalHttpTooManyRequests (retryAfterInfo resp) (extractHost uri) - _ -> Left . ExternalHttpResourceUnavailable $ responseStatus resp + 429 -> Left $ ExternalHttpTooManyRequests (retryAfterInfo resp) (extractHost uri) + _ -> Left . ExternalHttpResourceUnavailable $ responseStatus resp where + code :: Int code = statusCode $ responseStatus resp + + redirectLocation :: Maybe Text + redirectLocation = fmap decodeUtf8 + . lookup "Location" + $ responseHeaders resp other -> Left . ExternalResourceSomeError $ show other where retryAfterInfo :: Response a -> Maybe RetryAfter diff --git a/tests/Test/Xrefcheck/ConfigSpec.hs b/tests/Test/Xrefcheck/ConfigSpec.hs index 2a3881ed..7ec9125d 100644 --- a/tests/Test/Xrefcheck/ConfigSpec.hs +++ b/tests/Test/Xrefcheck/ConfigSpec.hs @@ -18,7 +18,6 @@ import Test.Tasty.HUnit (assertFailure, testCase, (@?=)) import Test.Tasty.QuickCheck (ioProperty, testProperty) import Xrefcheck.Config - (Config, cExclusionsL, cNetworkingL, defConfig, defConfigText, ncIgnoreAuthFailuresL) import Xrefcheck.Core (Flavor (GitHub), allFlavors) import Xrefcheck.Scan (ecIgnoreExternalRefsToL) import Xrefcheck.Verify (VerifyError (..), checkExternalResource) @@ -31,7 +30,6 @@ test_config = testProperty (show flavor) $ ioProperty $ evaluateWHNF_ @_ @Config (defConfig flavor) | flavor <- allFlavors] - , testGroup "Filled default config matches the expected format" -- The config we match against can be regenerated with -- stack exec xrefcheck -- dump-config -t GitHub -o tests/configs/github-config.yaml --force @@ -84,5 +82,5 @@ test_config = where checkLinkWithServer config link expectation = E.bracket (forkIO mockServer) killThread $ \_ -> do - result <- runExceptT $ checkExternalResource config link + result <- runExceptT $ checkExternalResource emptyChain config link result @?= expectation diff --git a/tests/Test/Xrefcheck/RedirectChainSpec.hs b/tests/Test/Xrefcheck/RedirectChainSpec.hs new file mode 100644 index 00000000..a8784625 --- /dev/null +++ b/tests/Test/Xrefcheck/RedirectChainSpec.hs @@ -0,0 +1,113 @@ +{- SPDX-FileCopyrightText: 2022 Serokell + - + - SPDX-License-Identifier: MPL-2.0 + -} + +module Test.Xrefcheck.RedirectChainSpec where + +import Universum + +import Data.CaseInsensitive qualified as CI +import Data.Map qualified as M +import Network.HTTP.Types (mkStatus) +import Network.HTTP.Types.Header (hLocation) +import Test.Tasty (TestTree, testGroup) +import Test.Tasty.HUnit (testCase) +import Web.Firefly (App, ToResponse (toResponse), route, run) + +import Test.Xrefcheck.UtilRequests +import Xrefcheck.Config +import Xrefcheck.Progress +import Xrefcheck.Verify + +test_redirectRequests :: TestTree +test_redirectRequests = testGroup "Redirect chain tests" + [ testCase "Missing location" $ do + setRef <- newIORef mempty + checkLinkAndProgressWithServer + (configMod 5) + setRef + mockRedirect + (link "/broken1") + progress + (VerifyResult [RedirectMissingLocation $ chain [ "/broken1", "/broken2", "/broken3"]]) + , testCase "Cycle" $ do + setRef <- newIORef mempty + checkLinkAndProgressWithServer + (configMod 5) + setRef + mockRedirect + (link "/cycle1") + progress + (VerifyResult [RedirectChainCycle $ chain ["/cycle1", "/cycle2", "/cycle3", "/cycle4", "/cycle2"]]) + , testGroup "Limit" + [ testCase "Takes effect" $ do + setRef <- newIORef mempty + checkLinkAndProgressWithServer + (configMod 2) + setRef + mockRedirect + (link "/cycle1") + progress + (VerifyResult [RedirectChainLimit $ chain ["/cycle1", "/cycle2", "/cycle3", "/cycle4"]]) + , testCase "No redirects allowed" $ do + setRef <- newIORef mempty + checkLinkAndProgressWithServer + (configMod 0) + setRef + mockRedirect + (link "/cycle1") + progress + (VerifyResult [RedirectChainLimit $ chain ["/cycle1", "/cycle2"]]) + , testCase "Negative" $ do + setRef <- newIORef mempty + checkLinkAndProgressWithServer + (configMod (-1)) + setRef + mockRedirect + (link "/cycle1") + progress + (VerifyResult [RedirectChainCycle $ chain ["/cycle1", "/cycle2", "/cycle3", "/cycle4", "/cycle2"]]) + ] + ] + where + link :: Text -> Text + link = ("http://127.0.0.1:5000" <>) + + chain :: [Text] -> RedirectChain + chain = fromList . fmap link + + progress :: Progress Int + progress = Progress + { pTotal = 1 + , pCurrent = 1 + , pErrorsUnfixable = 1 + , pErrorsFixable = 0 + , pTaskTimestamp = Nothing + } + + configMod :: Int -> Config -> Config + configMod limit config = config + & cNetworkingL . ncExternalRefRedirectsL .~ [RedirectRule Nothing Nothing Nothing RROFollow] + & cNetworkingL . ncMaxRedirectFollowsL .~ limit + + redirectRoute :: Text -> Maybe Text -> App () + redirectRoute name to = route name $ pure $ toResponse + ( "" :: Text + , mkStatus 301 "Permanent redirect" + , M.fromList [(CI.map (decodeUtf8 @Text) hLocation, fmap link $ maybeToList to)] + ) + + mockRedirect :: IO () + mockRedirect = + run 5000 do + -- A set of redirect routes that correspond to a broken chain. + redirectRoute "/broken1" $ Just "/broken2" + redirectRoute "/broken2" $ Just "/broken3" + redirectRoute "/broken3" Nothing + + -- A set of redirect routes that correspond to a cycle. + redirectRoute "/cycle1" $ Just "/cycle2" + redirectRoute "/cycle2" $ Just "/cycle3" + redirectRoute "/cycle3" $ Just "/cycle4" + redirectRoute "/cycle4" $ Just "/cycle2" diff --git a/tests/Test/Xrefcheck/RedirectConfigSpec.hs b/tests/Test/Xrefcheck/RedirectConfigSpec.hs new file mode 100644 index 00000000..25043f24 --- /dev/null +++ b/tests/Test/Xrefcheck/RedirectConfigSpec.hs @@ -0,0 +1,190 @@ +{- SPDX-FileCopyrightText: 2022 Serokell + - + - SPDX-License-Identifier: MPL-2.0 + -} + +module Test.Xrefcheck.RedirectConfigSpec where + +import Universum + +import Data.CaseInsensitive qualified as CI +import Data.Map qualified as M +import Network.HTTP.Types (mkStatus) +import Network.HTTP.Types.Header (hLocation) +import Test.Tasty (TestTree, testGroup) +import Test.Tasty.HUnit (testCase) +import Text.Regex.TDFA.Text qualified as R +import Web.Firefly (App, Status, ToResponse (toResponse), route, run) + +import Test.Xrefcheck.UtilRequests +import Xrefcheck.Config +import Xrefcheck.Progress +import Xrefcheck.Scan +import Xrefcheck.Verify + +test_redirectRequests :: TestTree +test_redirectRequests = testGroup "Redirect config tests" + [ testGroup "Match" + [ testGroup "By \"on\"" + [ testCase "Do match" $ do + setRef <- newIORef mempty + checkLinkAndProgressWithServer + (configMod [RedirectRule Nothing Nothing (Just RROTemporary) RROInvalid] []) + setRef + mockRedirect + (link "/temporary-redirect") + (progress 1) + (VerifyResult [RedirectRuleError (chain ["/temporary-redirect", "/ok"]) (Just RROTemporary)]) + , testCase "Do not match" $ do + setRef <- newIORef mempty + checkLinkAndProgressWithServer + (configMod [RedirectRule Nothing Nothing (Just RROPermanent) RROInvalid] []) + setRef + mockRedirect + (link "/temporary-redirect") + (progress 0) + (VerifyResult []) + ] + , testGroup "By \"to\"" + [ testCase "Do match" $ do + setRef <- newIORef mempty + checkLinkAndProgressWithServer + (configMod [RedirectRule Nothing (regex "^.*/ok$") Nothing RROValid] []) + setRef + mockRedirect + (link "/permanent-redirect") + (progress 0) + (VerifyResult []) + , testCase "Do not match" $ do + setRef <- newIORef mempty + checkLinkAndProgressWithServer + (configMod [RedirectRule Nothing (regex "^.*/no-ok$") (Just RROPermanent) RROValid] []) + setRef + mockRedirect + (link "/permanent-redirect") + (progress 1) + (VerifyResult [RedirectRuleError (chain ["/permanent-redirect", "/ok"]) (Just RROPermanent)]) + ] + , testGroup "By \"from\"" + [ testCase "Do match" $ do + setRef <- newIORef mempty + checkLinkAndProgressWithServer + (configMod [RedirectRule (regex "^.*/permanent-.*$") Nothing Nothing RROValid] []) + setRef + mockRedirect + (link "/permanent-redirect") + (progress 0) + (VerifyResult []) + , testCase "Do not match" $ do + setRef <- newIORef mempty + checkLinkAndProgressWithServer + (configMod [RedirectRule (regex "^.*/temporary-.*$") Nothing (Just RROPermanent) RROValid] []) + setRef + mockRedirect + (link "/permanent-redirect") + (progress 1) + (VerifyResult [RedirectRuleError (chain ["/permanent-redirect", "/ok"]) (Just RROPermanent)]) + ] + , testGroup "By \"from\", \"to\" and \"on\"" + [ testCase "Do match" $ do + setRef <- newIORef mempty + checkLinkAndProgressWithServer + (configMod [RedirectRule (regex "^.*/follow[0-9]$") (regex "^.*/ok$") (Just (RROCode 307)) RROInvalid] []) + setRef + mockRedirect + (link "/follow3") + (progress 1) + (VerifyResult [RedirectRuleError (chain ["/follow3", "/ok"]) (Just (RROCode 307))]) + , testCase "Do not match" $ do + setRef <- newIORef mempty + checkLinkAndProgressWithServer + (configMod [RedirectRule (regex "^.*/follow[0-9]$") (regex "^.*/ok$") (Just (RROCode 307)) RROInvalid] []) + setRef + mockRedirect + (link "/follow2") + (progress 0) + (VerifyResult []) + ] + , testCase "By any" $ do + setRef <- newIORef mempty + checkLinkAndProgressWithServer + (configMod [RedirectRule Nothing Nothing Nothing RROValid] []) + setRef + mockRedirect + (link "/follow1") + (progress 0) + (VerifyResult []) + ] + , testGroup "Chain" + [ testCase "End valid" $ do + setRef <- newIORef mempty + checkLinkAndProgressWithServer + (configMod [RedirectRule Nothing Nothing Nothing RROFollow] []) + setRef + mockRedirect + (link "/follow1") + (progress 0) + (VerifyResult []) + , testCase "End invalid" $ do + setRef <- newIORef mempty + checkLinkAndProgressWithServer + (configMod [RedirectRule Nothing Nothing (Just (RROCode 307)) RROInvalid, RedirectRule Nothing Nothing Nothing RROFollow] []) + setRef + mockRedirect + (link "/follow1") + (progress 1) + (VerifyResult [RedirectRuleError (chain ["/follow1", "/follow2", "/follow3", "/ok"]) (Just (RROCode 307))]) + , testCase "Mixed with ignore" $ do + setRef <- newIORef mempty + checkLinkAndProgressWithServer + (configMod [RedirectRule Nothing Nothing (Just (RROCode 307)) RROInvalid, RedirectRule Nothing Nothing Nothing RROFollow] (maybeToList (regex "^.*/follow3$"))) + setRef + mockRedirect + (link "/follow1") + (progress 0) + (VerifyResult []) + ] + ] + where + link :: Text -> Text + link = ("http://127.0.0.1:5000" <>) + + chain :: [Text] -> RedirectChain + chain = fromList . fmap link + + regex :: Text -> Maybe R.Regex + regex = rightToMaybe . R.compile defaultCompOption defaultExecOption + + status :: Int -> Status + status code = mkStatus code "Redirect" + + configMod :: [RedirectRule] -> [R.Regex] -> Config -> Config + configMod rules exclussions config = config + & cNetworkingL . ncExternalRefRedirectsL %~ (rules <>) + & cExclusionsL . ecIgnoreExternalRefsToL .~ exclussions + + redirectRoute :: Text -> Int -> Maybe Text -> App () + redirectRoute name code to = route name $ pure $ toResponse + ( "" :: Text + , status code + , M.fromList [(CI.map (decodeUtf8 @Text) hLocation, fmap link $ maybeToList to)] + ) + + progress :: Int -> Progress Int + progress errors = Progress + { pTotal = 1 + , pCurrent = 1 + , pErrorsUnfixable = errors + , pErrorsFixable = 0 + , pTaskTimestamp = Nothing + } + + mockRedirect :: IO () + mockRedirect = + run 5000 do + route "/ok" $ pure $ toResponse ("Ok" :: Text) + redirectRoute "/permanent-redirect" 301 $ Just "/ok" + redirectRoute "/temporary-redirect" 302 $ Just "/ok" + redirectRoute "/follow1" 301 $ Just "/follow2" + redirectRoute "/follow2" 302 $ Just "/follow3" + redirectRoute "/follow3" 307 $ Just "/ok" diff --git a/tests/Test/Xrefcheck/RedirectRequestsSpec.hs b/tests/Test/Xrefcheck/RedirectDefaultSpec.hs similarity index 83% rename from tests/Test/Xrefcheck/RedirectRequestsSpec.hs rename to tests/Test/Xrefcheck/RedirectDefaultSpec.hs index be762a72..2b9bd660 100644 --- a/tests/Test/Xrefcheck/RedirectRequestsSpec.hs +++ b/tests/Test/Xrefcheck/RedirectDefaultSpec.hs @@ -3,7 +3,7 @@ - SPDX-License-Identifier: MPL-2.0 -} -module Test.Xrefcheck.RedirectRequestsSpec where +module Test.Xrefcheck.RedirectDefaultSpec where import Universum @@ -17,11 +17,12 @@ import Test.Tasty.HUnit (Assertion, testCase) import Web.Firefly (ToResponse (toResponse), route, run) import Test.Xrefcheck.UtilRequests +import Xrefcheck.Config import Xrefcheck.Progress import Xrefcheck.Verify test_redirectRequests :: TestTree -test_redirectRequests = testGroup "Redirect response tests" +test_redirectRequests = testGroup "Redirect response defaults" [ testGroup "Temporary" $ temporaryRedirectTests <$> [302, 303, 307] , testGroup "Permanent" $ permanentRedirectTests <$> [301, 308] ] @@ -37,14 +38,20 @@ test_redirectRequests = testGroup "Redirect response tests" redirectTests (show statusCode <> " passes by default") (mkStatus statusCode "Temporary redirect") - (const Nothing) + (\case + Nothing -> Just $ RedirectMissingLocation $ fromList [url] + Just _ -> Nothing + ) permanentRedirectTests :: Int -> TestTree permanentRedirectTests statusCode = redirectTests (show statusCode <> " fails by default") (mkStatus statusCode "Permanent redirect") - (Just . PermanentRedirectError url) + (\case + Nothing -> Just $ RedirectMissingLocation $ fromList [url] + Just loc -> Just $ RedirectRuleError (fromList [url, loc]) (Just RROPermanent) + ) redirectTests :: TestName -> Status -> (Maybe Text -> Maybe VerifyError) -> TestTree redirectTests name expectedStatus expectedError = @@ -74,9 +81,9 @@ test_redirectRequests = testGroup "Redirect response tests" (VerifyResult $ maybeToList expectedError) mockRedirect :: Maybe Text -> Status -> IO () - mockRedirect expectedLocation expectedSocation = + mockRedirect expectedLocation expectedStatus = run 5000 $ route "/redirect" $ pure $ toResponse ( "" :: Text - , expectedSocation + , expectedStatus , M.fromList [(CI.map (decodeUtf8 @Text) hLocation, maybeToList expectedLocation)] ) diff --git a/tests/configs/github-config.yaml b/tests/configs/github-config.yaml index edd8eb09..dcd09bdb 100644 --- a/tests/configs/github-config.yaml +++ b/tests/configs/github-config.yaml @@ -54,22 +54,46 @@ networking: # On other errors xrefcheck fails immediately, without retrying. maxRetries: 3 -# Querying a given domain that ever returned 429 before, -# this defines how many timeouts are allowed during retries. -# -# For such domains, timeouts likely mean hitting the rate limiter, -# and so xrefcheck considers timeouts in the same way as 429 errors. -# -# For other domains, a timeout results in a respective error, no retry -# attempts will be performed. Use `externalRefCheckTimeout` option -# to increase the time after which timeout is declared. -# -# This option is similar to `maxRetries`, the difference is that -# this `maxTimeoutRetries` option limits only the number of retries -# caused by timeouts, and `maxRetries` limits the number of retries -# caused both by 429s and timeouts. + # Querying a given domain that ever returned 429 before, + # this defines how many timeouts are allowed during retries. + # + # For such domains, timeouts likely mean hitting the rate limiter, + # and so xrefcheck considers timeouts in the same way as 429 errors. + # + # For other domains, a timeout results in a respective error, no retry + # attempts will be performed. Use `externalRefCheckTimeout` option + # to increase the time after which timeout is declared. + # + # This option is similar to `maxRetries`, the difference is that + # this `maxTimeoutRetries` option limits only the number of retries + # caused by timeouts, and `maxRetries` limits the number of retries + # caused both by 429s and timeouts. maxTimeoutRetries: 1 + # Maximum number of links that can be followed in a single redirect + # chain. + # + # The link is considered as invalid if the limit is exceeded. + maxRedirectFollows: 10 + + # Rules to override the redirect behavior for external references that + # match, where + # - 'from' is a regular expression for the source link in a single + # redirection step. Its absence means that every link matches. + # - 'to' is a regular expression for the target link in a single + # redirection step. Its absence also means that every link matches. + # - 'on' accepts 'temporary', 'permanent' or a specific redirect HTTP code. + # Its absence also means that every response code matches. + # - 'outcome' accepts 'valid', 'invalid' or 'follow'. The last one follows + # the redirect by applying the same configuration rules so, for instance, + # exclusion rules would also apply to the following links. + # + # The first one that matches is applied, and the link is considered + # as valid if none of them does match. + externalRefRedirects: + - on: permanent + outcome: invalid + # Parameters of scanners for various file types. scanners: # On 'anchor not found' error, how much similar anchors should be displayed as diff --git a/tests/golden/check-autolinks/check-autolinks.bats b/tests/golden/check-autolinks/check-autolinks.bats index 4d44a888..48a7f9b6 100644 --- a/tests/golden/check-autolinks/check-autolinks.bats +++ b/tests/golden/check-autolinks/check-autolinks.bats @@ -35,10 +35,10 @@ assert_diff - < https://commonmark.org/ + ^-- stopped before this one Invalid references dumped, 1 in total. EOF diff --git a/tests/golden/check-redirect-parse/bad-code.yaml b/tests/golden/check-redirect-parse/bad-code.yaml new file mode 100644 index 00000000..b9233a8f --- /dev/null +++ b/tests/golden/check-redirect-parse/bad-code.yaml @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: 2022 Serokell +# +# SPDX-License-Identifier: Unlicense + +scanners: + markdown: + flavor: GitHub + +networking: + externalRefRedirects: + - outcome: valid + to: ^.*$ + on: 404 diff --git a/tests/golden/check-redirect-parse/bad-on.yaml b/tests/golden/check-redirect-parse/bad-on.yaml new file mode 100644 index 00000000..1c3d85bf --- /dev/null +++ b/tests/golden/check-redirect-parse/bad-on.yaml @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: 2022 Serokell +# +# SPDX-License-Identifier: Unlicense + +scanners: + markdown: + flavor: GitHub + +networking: + externalRefRedirects: + - outcome: valid + to: ^.*$ + on: premanent diff --git a/tests/golden/check-redirect-parse/bad-outcome.yaml b/tests/golden/check-redirect-parse/bad-outcome.yaml new file mode 100644 index 00000000..03c293fb --- /dev/null +++ b/tests/golden/check-redirect-parse/bad-outcome.yaml @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: 2022 Serokell +# +# SPDX-License-Identifier: Unlicense + +scanners: + markdown: + flavor: GitHub + +networking: + externalRefRedirects: + - outcome: flow diff --git a/tests/golden/check-redirect-parse/bad-rule.yaml b/tests/golden/check-redirect-parse/bad-rule.yaml new file mode 100644 index 00000000..954554aa --- /dev/null +++ b/tests/golden/check-redirect-parse/bad-rule.yaml @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: 2022 Serokell +# +# SPDX-License-Identifier: Unlicense + +scanners: + markdown: + flavor: GitHub + +networking: + externalRefRedirects: [Bad] diff --git a/tests/golden/check-redirect-parse/bad-rules.yaml b/tests/golden/check-redirect-parse/bad-rules.yaml new file mode 100644 index 00000000..a7e094b4 --- /dev/null +++ b/tests/golden/check-redirect-parse/bad-rules.yaml @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: 2022 Serokell +# +# SPDX-License-Identifier: Unlicense + +scanners: + markdown: + flavor: GitHub + +networking: + externalRefRedirects: Bad diff --git a/tests/golden/check-redirect-parse/bad-to.yaml b/tests/golden/check-redirect-parse/bad-to.yaml new file mode 100644 index 00000000..a6d89a44 --- /dev/null +++ b/tests/golden/check-redirect-parse/bad-to.yaml @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: 2022 Serokell +# +# SPDX-License-Identifier: Unlicense + +scanners: + markdown: + flavor: GitHub + +networking: + externalRefRedirects: + - outcome: valid + to: 42 diff --git a/tests/golden/check-redirect-parse/check-redirect-parse.bats b/tests/golden/check-redirect-parse/check-redirect-parse.bats new file mode 100644 index 00000000..10bb13e4 --- /dev/null +++ b/tests/golden/check-redirect-parse/check-redirect-parse.bats @@ -0,0 +1,93 @@ +#!/usr/bin/env bats + +# SPDX-FileCopyrightText: 2022 Serokell +# +# SPDX-License-Identifier: MPL-2.0 + +load '../helpers/bats-support/load' +load '../helpers/bats-assert/load' +load '../helpers/bats-file/load' +load '../helpers' + + +@test "No redirect rules" { + to_temp xrefcheck -c no-rules.yaml + +assert_diff - < +# +# SPDX-License-Identifier: Unlicense + +scanners: + markdown: + flavor: GitHub + +networking: + externalRefRedirects: + - outcome: valid + to: ^https://.*$ + on: permanent diff --git a/tests/golden/check-redirect-parse/no-outcome.yaml b/tests/golden/check-redirect-parse/no-outcome.yaml new file mode 100644 index 00000000..ff5eab86 --- /dev/null +++ b/tests/golden/check-redirect-parse/no-outcome.yaml @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: 2022 Serokell +# +# SPDX-License-Identifier: Unlicense + +scanners: + markdown: + flavor: GitHub + +networking: + externalRefRedirects: + - outcome: valid + on: temporary + - to: ^https://.*$ + on: temporary diff --git a/tests/golden/check-redirect-parse/no-rules.yaml b/tests/golden/check-redirect-parse/no-rules.yaml new file mode 100644 index 00000000..84652cfa --- /dev/null +++ b/tests/golden/check-redirect-parse/no-rules.yaml @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: 2022 Serokell +# +# SPDX-License-Identifier: Unlicense + +scanners: + markdown: + flavor: GitHub + +networking: + externalRefRedirects: [] diff --git a/tests/golden/check-redirect-parse/only-outcome-on.yaml b/tests/golden/check-redirect-parse/only-outcome-on.yaml new file mode 100644 index 00000000..08a28de4 --- /dev/null +++ b/tests/golden/check-redirect-parse/only-outcome-on.yaml @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: 2022 Serokell +# +# SPDX-License-Identifier: Unlicense + +scanners: + markdown: + flavor: GitHub + +networking: + externalRefRedirects: + - outcome: invalid + on: 302 diff --git a/tests/golden/check-redirect-parse/only-outcome-to.yaml b/tests/golden/check-redirect-parse/only-outcome-to.yaml new file mode 100644 index 00000000..bcb1dc6a --- /dev/null +++ b/tests/golden/check-redirect-parse/only-outcome-to.yaml @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: 2022 Serokell +# +# SPDX-License-Identifier: Unlicense + +scanners: + markdown: + flavor: GitHub + +networking: + externalRefRedirects: + - outcome: valid + to: ^https://.*$ diff --git a/tests/golden/check-redirect-parse/only-outcome.yaml b/tests/golden/check-redirect-parse/only-outcome.yaml new file mode 100644 index 00000000..9931fe0f --- /dev/null +++ b/tests/golden/check-redirect-parse/only-outcome.yaml @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: 2022 Serokell +# +# SPDX-License-Identifier: Unlicense + +scanners: + markdown: + flavor: GitHub + +networking: + externalRefRedirects: + - outcome: follow From 05fe537ae1d7d04c247a03707dc7b63d1ab89a66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Enr=C3=ADquez?= Date: Wed, 28 Dec 2022 17:37:58 +0100 Subject: [PATCH 2/2] [Chore] Simplify regexp usages --- README.md | 4 ++-- src/Xrefcheck/Config/Default.hs | 2 +- tests/Test/Xrefcheck/RedirectConfigSpec.hs | 14 +++++++------- tests/configs/github-config.yaml | 2 +- .../config-check-disabled.yaml | 2 +- tests/golden/check-redirect-parse/bad-code.yaml | 1 - tests/golden/check-redirect-parse/bad-on.yaml | 1 - tests/golden/check-redirect-parse/full-rule.yaml | 2 +- tests/golden/check-redirect-parse/no-outcome.yaml | 2 +- .../check-redirect-parse/only-outcome-to.yaml | 2 +- 10 files changed, 15 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 42bde3b0..3239f056 100644 --- a/README.md +++ b/README.md @@ -162,8 +162,8 @@ There are several ways to fix this: - to: "https?://forbidden.com.*" on: 307 outcome: invalid - - from: "^http://.*" - to: "^https://.*" + - from: "http://.*" + to: "https://.*" outcome: follow ``` diff --git a/src/Xrefcheck/Config/Default.hs b/src/Xrefcheck/Config/Default.hs index efdb31f9..86eec9e2 100644 --- a/src/Xrefcheck/Config/Default.hs +++ b/src/Xrefcheck/Config/Default.hs @@ -37,7 +37,7 @@ exclusions: # List of POSIX extended regular expressions. ignoreExternalRefsTo: # Ignore localhost links by default - - ^(https?|ftps?)://(localhost|127\\.0\\.0\\.1).* + - (https?|ftps?)://(localhost|127\\.0\\.0\\.1).* # Networking parameters. networking: diff --git a/tests/Test/Xrefcheck/RedirectConfigSpec.hs b/tests/Test/Xrefcheck/RedirectConfigSpec.hs index 25043f24..70fa2f31 100644 --- a/tests/Test/Xrefcheck/RedirectConfigSpec.hs +++ b/tests/Test/Xrefcheck/RedirectConfigSpec.hs @@ -49,7 +49,7 @@ test_redirectRequests = testGroup "Redirect config tests" [ testCase "Do match" $ do setRef <- newIORef mempty checkLinkAndProgressWithServer - (configMod [RedirectRule Nothing (regex "^.*/ok$") Nothing RROValid] []) + (configMod [RedirectRule Nothing (regex ".*/ok") Nothing RROValid] []) setRef mockRedirect (link "/permanent-redirect") @@ -58,7 +58,7 @@ test_redirectRequests = testGroup "Redirect config tests" , testCase "Do not match" $ do setRef <- newIORef mempty checkLinkAndProgressWithServer - (configMod [RedirectRule Nothing (regex "^.*/no-ok$") (Just RROPermanent) RROValid] []) + (configMod [RedirectRule Nothing (regex ".*/no-ok") (Just RROPermanent) RROValid] []) setRef mockRedirect (link "/permanent-redirect") @@ -69,7 +69,7 @@ test_redirectRequests = testGroup "Redirect config tests" [ testCase "Do match" $ do setRef <- newIORef mempty checkLinkAndProgressWithServer - (configMod [RedirectRule (regex "^.*/permanent-.*$") Nothing Nothing RROValid] []) + (configMod [RedirectRule (regex ".*/permanent-.*") Nothing Nothing RROValid] []) setRef mockRedirect (link "/permanent-redirect") @@ -78,7 +78,7 @@ test_redirectRequests = testGroup "Redirect config tests" , testCase "Do not match" $ do setRef <- newIORef mempty checkLinkAndProgressWithServer - (configMod [RedirectRule (regex "^.*/temporary-.*$") Nothing (Just RROPermanent) RROValid] []) + (configMod [RedirectRule (regex ".*/temporary-.*") Nothing (Just RROPermanent) RROValid] []) setRef mockRedirect (link "/permanent-redirect") @@ -89,7 +89,7 @@ test_redirectRequests = testGroup "Redirect config tests" [ testCase "Do match" $ do setRef <- newIORef mempty checkLinkAndProgressWithServer - (configMod [RedirectRule (regex "^.*/follow[0-9]$") (regex "^.*/ok$") (Just (RROCode 307)) RROInvalid] []) + (configMod [RedirectRule (regex ".*/follow[0-9]") (regex "^.*/ok$") (Just (RROCode 307)) RROInvalid] []) setRef mockRedirect (link "/follow3") @@ -98,7 +98,7 @@ test_redirectRequests = testGroup "Redirect config tests" , testCase "Do not match" $ do setRef <- newIORef mempty checkLinkAndProgressWithServer - (configMod [RedirectRule (regex "^.*/follow[0-9]$") (regex "^.*/ok$") (Just (RROCode 307)) RROInvalid] []) + (configMod [RedirectRule (regex ".*/follow[0-9]") (regex "^.*/ok$") (Just (RROCode 307)) RROInvalid] []) setRef mockRedirect (link "/follow2") @@ -137,7 +137,7 @@ test_redirectRequests = testGroup "Redirect config tests" , testCase "Mixed with ignore" $ do setRef <- newIORef mempty checkLinkAndProgressWithServer - (configMod [RedirectRule Nothing Nothing (Just (RROCode 307)) RROInvalid, RedirectRule Nothing Nothing Nothing RROFollow] (maybeToList (regex "^.*/follow3$"))) + (configMod [RedirectRule Nothing Nothing (Just (RROCode 307)) RROInvalid, RedirectRule Nothing Nothing Nothing RROFollow] (maybeToList (regex ".*/follow3"))) setRef mockRedirect (link "/follow1") diff --git a/tests/configs/github-config.yaml b/tests/configs/github-config.yaml index dcd09bdb..adf55523 100644 --- a/tests/configs/github-config.yaml +++ b/tests/configs/github-config.yaml @@ -25,7 +25,7 @@ exclusions: # List of POSIX extended regular expressions. ignoreExternalRefsTo: # Ignore localhost links by default - - ^(https?|ftps?)://(localhost|127\.0\.0\.1).* + - (https?|ftps?)://(localhost|127\.0\.0\.1).* # Networking parameters. networking: diff --git a/tests/golden/check-ignoreExternalRefsTo/config-check-disabled.yaml b/tests/golden/check-ignoreExternalRefsTo/config-check-disabled.yaml index df6ea981..351ac3d0 100644 --- a/tests/golden/check-ignoreExternalRefsTo/config-check-disabled.yaml +++ b/tests/golden/check-ignoreExternalRefsTo/config-check-disabled.yaml @@ -4,7 +4,7 @@ exclusions: ignoreExternalRefsTo: - - ^(https?|ftps?)://(localhost|127\.0\.0\.1).* + - (https?|ftps?)://(localhost|127\.0\.0\.1).* scanners: markdown: diff --git a/tests/golden/check-redirect-parse/bad-code.yaml b/tests/golden/check-redirect-parse/bad-code.yaml index b9233a8f..715e8dd9 100644 --- a/tests/golden/check-redirect-parse/bad-code.yaml +++ b/tests/golden/check-redirect-parse/bad-code.yaml @@ -9,5 +9,4 @@ scanners: networking: externalRefRedirects: - outcome: valid - to: ^.*$ on: 404 diff --git a/tests/golden/check-redirect-parse/bad-on.yaml b/tests/golden/check-redirect-parse/bad-on.yaml index 1c3d85bf..bd56376d 100644 --- a/tests/golden/check-redirect-parse/bad-on.yaml +++ b/tests/golden/check-redirect-parse/bad-on.yaml @@ -9,5 +9,4 @@ scanners: networking: externalRefRedirects: - outcome: valid - to: ^.*$ on: premanent diff --git a/tests/golden/check-redirect-parse/full-rule.yaml b/tests/golden/check-redirect-parse/full-rule.yaml index a0f7154d..69ff096a 100644 --- a/tests/golden/check-redirect-parse/full-rule.yaml +++ b/tests/golden/check-redirect-parse/full-rule.yaml @@ -9,5 +9,5 @@ scanners: networking: externalRefRedirects: - outcome: valid - to: ^https://.*$ + to: https://.* on: permanent diff --git a/tests/golden/check-redirect-parse/no-outcome.yaml b/tests/golden/check-redirect-parse/no-outcome.yaml index ff5eab86..9cf856c5 100644 --- a/tests/golden/check-redirect-parse/no-outcome.yaml +++ b/tests/golden/check-redirect-parse/no-outcome.yaml @@ -10,5 +10,5 @@ networking: externalRefRedirects: - outcome: valid on: temporary - - to: ^https://.*$ + - to: https://.* on: temporary diff --git a/tests/golden/check-redirect-parse/only-outcome-to.yaml b/tests/golden/check-redirect-parse/only-outcome-to.yaml index bcb1dc6a..d5ef9d75 100644 --- a/tests/golden/check-redirect-parse/only-outcome-to.yaml +++ b/tests/golden/check-redirect-parse/only-outcome-to.yaml @@ -9,4 +9,4 @@ scanners: networking: externalRefRedirects: - outcome: valid - to: ^https://.*$ + to: https://.*