From 0b0c96a2336eb9f117b0084fdcbe87d93b2cdbc4 Mon Sep 17 00:00:00 2001 From: Jacob Sikorski Date: Thu, 16 Jun 2022 17:15:08 -0400 Subject: [PATCH] Fix #5373: Add regex debouncing --- .../DebouncingResourceDownloader.swift | 309 ++++++++++++------ Tests/ClientTests/Resources/debouncing.json | 11 + .../DebouncingResourceDownloaderTests.swift | 32 ++ 3 files changed, 261 insertions(+), 91 deletions(-) diff --git a/Client/WebFilters/DebouncingResourceDownloader.swift b/Client/WebFilters/DebouncingResourceDownloader.swift index 18822e640e8..0af86f66cfd 100644 --- a/Client/WebFilters/DebouncingResourceDownloader.swift +++ b/Client/WebFilters/DebouncingResourceDownloader.swift @@ -12,24 +12,35 @@ private let log = Logger.browserLogger /// An helper class that manages the debouncing list and offers convenience methods public class DebouncingResourceDownloader { + public enum RuleError: Error { + case unsupportedAction(String) + case unsupportedPref(String) + } + /// Object representing an item in the debouncing.json file found here: /// https://github.com/brave/adblock-lists/blob/master/brave-lists/debounce.json - struct MatcherRule: Decodable { + public struct MatcherRule: Decodable { private enum CodingKeys: String, CodingKey { - case include - case exclude - case action - case param + case include, exclude, action, param, pref, prependScheme } + /// Defines actions that should be performed on the matching url + /// - Note: Some rules can be paired while others cannot. enum Action: String { + /// Requres the extracted query param to be base64 decoded + /// The extracted url is provided as a query param and the param name is defined in the `param` field case base64 + /// Defines that the url should be redirected to the extracted url + /// The extracted url is provided as a query param and the param name is defined in the `param` field + /// unless `regex-path` is used case redirect + /// Defines that the url should be extracted by a regex pattern. + /// The regex is available in the `param` field. case regexPath = "regex-path" } - enum RuleError: Error { - case unsupportedAction(String) + enum Preferance: String { + case deAmpEnabled = "brave.de_amp.enabled" } /// A set of patterns that are include in this rule @@ -38,8 +49,12 @@ public class DebouncingResourceDownloader { let exclude: Set? /// The query param that contains the redirect `URL` to be extractted from let param: String + /// The scheme to prepend if the scheme is not available + let prependScheme: String? /// The actions in a raw format. Need to be broken down to a set of `Action` entries private let action: String + /// A list of comma separated values representing preferences that need to be checked before applying this rule + private let pref: String? /// Actions in a strictly typed format and split up into an array /// - Note: Unrecognized actions will be filtered out @@ -54,6 +69,22 @@ public class DebouncingResourceDownloader { } })) } + + /// Actions in a strictly typed format and split up into an array + /// - Note: Unrecognized actions will be filtered out + func makePreferences() throws -> Set { + guard let prefStrings = pref?.split(separator: ",") else { + return [] + } + + return Set(try prefStrings.map({ + if let action = Preferance(rawValue: String($0)) { + return action + } else { + throw RuleError.unsupportedPref(String($0)) + } + })) + } /// Determines if this url is not excluded in the exclude list func isExcluded(url: URL) -> Bool { @@ -72,6 +103,97 @@ public class DebouncingResourceDownloader { return !isExcluded(url: url) && isIncluded(url: url) } } + + /// Defines redirect actions that should be performed on the matching url. + /// These rules contain additional information needed to successfully extract the redirect URL. + /// + /// - Note: Unlike `MatcherRule.Action`, only one action can be applied on a single url + enum RedirectAction { + case regexParam(regex: String) + case queryParam(name: String) + } + + struct RedirectRule { + let action: RedirectAction + let shouldBase64Decode: Bool + let preferences: Set + let prependScheme: String? + + func extractRedirectURL(from url: URL) throws -> URL? { + guard let value = try extractDecodedValue(from: url) else { return nil } + + if let prependScheme = prependScheme { + return makeValidURL(string: value, prependScheme: prependScheme) + } else { + return makeValidURL(string: value) + } + } + + private func extractDecodedValue(from url: URL) throws -> String? { + guard let value = try extractRawValue(from: url) else { return nil } + + if shouldBase64Decode { + // We need to base64 decode the url + guard let data = Data(base64Encoded: value) else { + return nil + } + + return String(data: data, encoding: .utf8) + } else { + return value + } + } + + private func extractRawValue(from url: URL) throws -> String? { + guard let components = URLComponents(url: url, resolvingAgainstBaseURL: false) else { return nil } + + switch action { + case .regexParam(let pattern): + let path = components.path + let regex = try NSRegularExpression(pattern: pattern) + let range = NSRange(location: 0, length: path.count) + let matches = regex.matches(in: path, range: range) + + // This is a requirement that only 1 match is found + guard matches.count == 1, let match = matches.first else { return nil } + let matchRange = match.range(at: 1) + + if let swiftRange = Range(matchRange, in: path) { + let result = String(path[swiftRange]) + return result + } else { + return nil + } + case .queryParam(let param): + guard let queryItems = components.queryItems else { return nil } + return queryItems.first(where: { $0.name == param })?.value + } + } + + func makeValidURL(string: String) -> URL? { + guard + let urlComponents = URLComponents(string: string), + urlComponents.scheme != nil, + urlComponents.host != nil, + let url = urlComponents.url, + url.isWebPage() + else { return nil } + + return url + } + + func makeValidURL(string: String, prependScheme: String) -> URL? { + if let url = makeValidURL(string: string) { + return url + } else if let url = makeValidURL(string: [prependScheme, string].joined()) { + return url + } else if let url = makeValidURL(string: [prependScheme, string].joined(separator: "://")) { + return url + } else { + return nil + } + } + } /// A class that, given a set of rules, helps in returning a redirect URL struct Matcher { @@ -101,28 +223,33 @@ public class DebouncingResourceDownloader { var etldToRelevantPatterns: [String: [String]] = [:] for pattern in rule.include { - let urlPattern = URLPattern(validSchemes: .all) - let parseResult = urlPattern.parse(pattern: pattern) - - switch parseResult { - case .success: - if urlPattern.isMatchingAllURLs { - // We put this query param and rule to our A bucket - // This seems to be an empty list for now - queryToRule[rule.param] = rule - } else if let etld1 = urlPattern.baseDomain { - // We put this etld and rule to our B bucket - var relevantPatterns = etldToRelevantPatterns[etld1] ?? [] - relevantPatterns.append(pattern) - etldToRelevantPatterns[etld1] = relevantPatterns - } else { - // Everything else goes to our C bucket - // For the time being this set only encompases patterns without a host - otherRules.append(rule) + do { + let urlPattern = URLPattern(validSchemes: .all) + let parseResult = urlPattern.parse(pattern: pattern) + let actions = try rule.makeActions() + + switch parseResult { + case .success: + if urlPattern.isMatchingAllURLs && !actions.contains(.regexPath) { + // We put this query param and rule to our A bucket + // This seems to be an empty list for now + queryToRule[rule.param] = rule + } else if let etld1 = urlPattern.baseDomain { + // We put this etld and rule to our B bucket + var relevantPatterns = etldToRelevantPatterns[etld1] ?? [] + relevantPatterns.append(pattern) + etldToRelevantPatterns[etld1] = relevantPatterns + } else { + // Everything else goes to our C bucket + // For the time being this set only encompases patterns without a host + otherRules.append(rule) + } + default: + log.error("Cannot parse pattern `\(pattern)`. Got status `\(parseResult)`") + continue } - default: - log.error("Cannot parse pattern `\(pattern)`. Got status `\(parseResult)`") - continue + } catch { + } } @@ -167,12 +294,18 @@ public class DebouncingResourceDownloader { /// 4. Apply all the rules from bucket C func redirectURLOnce(from url: URL) -> URL? { do { - guard let pair = try matchingQueryItemAndActions(for: url) else { + guard let rule = try matchingRedirectRule(for: url) else { return nil } - return redirectURL(queryItem: pair.queryItem, actions: pair.actions) - } catch MatcherRule.RuleError.unsupportedAction(let rawRule) { + // We want to ensure the extracted url isn't something like "https://foo" + // So we check for the presence of an `eTLD+1` + if let url = try rule.extractRedirectURL(from: url), url.baseDomain != nil { + return url + } else { + return nil + } + } catch RuleError.unsupportedAction(let rawRule) { // We may add new rules in the future. If thats the case, this will throw but we don't want // an error presented or anything to happen because ios is not ready to handle this new rule // But since it's impossible to discriminate a programmer mistake from a new rule added @@ -180,90 +313,73 @@ public class DebouncingResourceDownloader { log.error("Unsupported rule used for debouncing: `\(rawRule)`") return nil } catch { + log.error(error) assertionFailure(error.localizedDescription) return nil } } /// Attempts to find a valid query param and a set of actions for the given URL - /// - Throws: May throw `DebouncingResourceDownloader.MatcherRule.RuleError` - private func matchingQueryItemAndActions(for url: URL) throws -> (queryItem: URLQueryItem, actions: Set)? { - // Extract the redirect URL - guard let components = URLComponents(url: url, resolvingAgainstBaseURL: false) else { return nil } - guard let queryItems = components.queryItems else { return nil } - guard let etld1 = url.baseDomain else { return nil } - - if let entries = etldToRule[etld1] { + /// - Throws: May throw `DebouncingResourceDownloader.RuleError` + private func matchingRedirectRule(for url: URL) throws -> RedirectRule? { + guard let rule = matchingRule(for: url) else { + return nil + } + + let actions = try rule.makeActions() + let preferences = try rule.makePreferences() + + if actions.contains(.regexPath) { + return RedirectRule( + action: RedirectAction.regexParam(regex: rule.param), + shouldBase64Decode: actions.contains(.base64), + preferences: preferences, + prependScheme: rule.prependScheme + ) + } else { + return RedirectRule( + action: RedirectAction.queryParam(name: rule.param), + shouldBase64Decode: actions.contains(.base64), + preferences: preferences, + prependScheme: rule.prependScheme + ) + } + } + + /// Attempts to find a valid rule for the given URL + private func matchingRule(for url: URL) -> MatcherRule? { + if let etld1 = url.baseDomain, let entries = etldToRule[etld1] { guard let entry = entries.first(where: { url.matches(any: $0.relevantPatterns) }), - !entry.rule.isExcluded(url: url), - let queryItem = queryItems.first(where: { $0.name == entry.rule.param }) + !entry.rule.isExcluded(url: url) else { return nil } - return (queryItem, try entry.rule.makeActions()) - } else if let pair = try matchingQueryItemsAndActions(for: url, queryItems: queryItems) { - return (pair.queryItem, pair.actions) - } else if let rule = otherRules.first(where: { $0.handles(url: url) }) { - guard let queryItem = queryItems.first(where: { $0.name == rule.param }) else { - return nil - } - - return (queryItem, try rule.makeActions()) + return entry.rule } else { - return nil + return matchingCachedQueryParamRule(for: url) + ?? otherRules.first(where: { $0.handles(url: url) }) } } /// Attempt to extract the query value and actions from the list of queryItems using the `queryToRule` (all urls) map. /// - /// - Parameter queryItems: The query items to extract the value and actions from - /// - Returns: A raw query item and actions to perform on that query item - /// - Throws: May throw `DebouncingResourceDownloader.MatcherRule.RuleError` - private func matchingQueryItemsAndActions( - for url: URL, - queryItems: [URLQueryItem] - ) throws -> (queryItem: URLQueryItem, actions: Set)? { + /// - Parameter url: The url to extract the query params from + /// - Returns: A rule for the given url + private func matchingCachedQueryParamRule(for url: URL) -> MatcherRule? { + // Extract the redirect URL + guard let components = URLComponents(url: url, resolvingAgainstBaseURL: false) else { return nil } + guard let queryItems = components.queryItems else { return nil } + for queryItem in queryItems { - guard - let rule = queryToRule[queryItem.name], - !rule.isExcluded(url: url) - else { - continue + if let rule = queryToRule[queryItem.name], !rule.isExcluded(url: url) { + return rule } - - return (queryItem, try rule.makeActions()) } return nil } - - /// It attempts to extract a redirect `URL` from the given query item using the given actions - /// - /// It returns nil if it cannot find a valid URL within the query item - private func redirectURL(queryItem: URLQueryItem, actions: Set) -> URL? { - guard let queryValue = queryItem.value else { return nil } - - // For now we only support redirecting so it makes sense to check this right away - // No point in trying to decode anything if we don't have this action - guard actions.contains(.redirect) else { - return nil - } - - if actions.contains(.base64) { - // We need to base64 decode the url - guard let data = Data(base64Encoded: queryValue), - let decodedString = String(data: data, encoding: .utf8) - else { - return nil - } - - return URL(string: decodedString) - } else { - return URL(string: queryValue) - } - } } /// The base s3 environment url that hosts the debouncing (and other) files. @@ -460,3 +576,14 @@ private extension URLPattern { return result.isEmpty ? nil : result } } + +extension Result: Decodable where Success: Decodable, Failure == Error { + public init(from decoder: Decoder) throws { + do { + let decodable = try Success(from: decoder) + self = .success(decodable) + } catch { + self = .failure(error) + } + } +} diff --git a/Tests/ClientTests/Resources/debouncing.json b/Tests/ClientTests/Resources/debouncing.json index 40cdb6774da..684edaa5428 100644 --- a/Tests/ClientTests/Resources/debouncing.json +++ b/Tests/ClientTests/Resources/debouncing.json @@ -192,5 +192,16 @@ ], "action": "base64,redirect", "param": "url" + }, + { + "include": [ + "*://brave.com/*" + ], + "pref": "brave.de_amp.enabled", + "exclude": [ + ], + "prepend_scheme": "https", + "action": "regex-path", + "param": "^/(.*)$" } ] diff --git a/Tests/ClientTests/Web Filters/DebouncingResourceDownloaderTests.swift b/Tests/ClientTests/Web Filters/DebouncingResourceDownloaderTests.swift index ef26e9abe15..5478a2f010d 100644 --- a/Tests/ClientTests/Web Filters/DebouncingResourceDownloaderTests.swift +++ b/Tests/ClientTests/Web Filters/DebouncingResourceDownloaderTests.swift @@ -245,6 +245,38 @@ class DebouncingResourceDownloaderTests: XCTestCase { XCTAssertEqual(returnedURL1, landingURL) XCTAssertNil(returnedURL2) } + + /// Test a match-all rule + func testRegexRuleWithValidURL() { + // Given + // A cosntructed url + let baseURL = URL(string: "https://brave.com/https://braveattentiontoken.com")! + let landingURL = URL(string: "https://braveattentiontoken.com")! + + // When + // A redirect url + let returnedURL1 = downloader.redirectURL(for: baseURL) + + // Then + XCTAssertEqual(returnedURL1, landingURL) + } + + /// Test a match-all rule + func testRegexRuleWithInvalidURL() { + // Given + // A cosntructed url + let baseURL = URL(string: "https://brave.com/xyz")! + let landingURL = URL(string: "https://xyz")! + + // When + // A redirect url + let returnedURL1 = downloader.redirectURL(for: baseURL) + + // Then + // TODO: @JS Clarfy this. Because `https://xyz` is technically a valid url + // but according to specs it shouldn't not be one. But why? + XCTAssertEqual(returnedURL1, landingURL) + } func testRealisticExamples() { // Given