Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch List implementation to use Trie-based lookup #134

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
8,061 changes: 8,061 additions & 0 deletions data/rules-ascii.txt

Large diffs are not rendered by default.

8,061 changes: 8,061 additions & 0 deletions data/rules-unicode.txt

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions lib/public_suffix.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
require_relative "public_suffix/errors"
require_relative "public_suffix/rule"
require_relative "public_suffix/list"
require_relative "public_suffix/trie"

# PublicSuffix is a Ruby domain name parser based on the Public Suffix List.
#
Expand Down
19 changes: 17 additions & 2 deletions lib/public_suffix/list.rb
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ def self.parse(input, private_domains: true)
# @yieldparam [PublicSuffix::List] self The newly created instance.
def initialize
  # Two parallel indexes are maintained: the Hash keyed by rule value and the
  # Trie used for label-by-label lookups.
  @rules = {}
  @trie = PublicSuffix::Trie.new
  return unless block_given?

  # Let the caller populate the freshly created list.
  yield self
end

Expand Down Expand Up @@ -137,7 +138,9 @@ def each(&block)
# @param rule [PublicSuffix::Rule::*] the rule to add to the list
# @return [self]
def add(rule)
  # Every rule is registered twice: once in the Hash index and once in the
  # Trie, so both lookup strategies see the same set of rules.
  record = rule_to_entry(rule)
  key = rule.value

  @rules[key] = record
  @trie.insert(key, type: record.type, private: record.private)

  self
end
alias << add
Expand Down Expand Up @@ -170,14 +173,26 @@ def clear
# @param default [PublicSuffix::Rule::*] the default rule to return in case no rule matches
# @return [PublicSuffix::Rule::*]
def find(name, default: default_rule, **options)
  # The WHAT environment variable selects the lookup strategy at call time:
  # the exact value "hash" uses the Hash-based search, anything else (including
  # an unset variable) uses the Trie-based search.
  return find_hash(name, default: default, **options) if ENV["WHAT"] == "hash"

  find_trie(name, default: default, **options)
end

# Finds the rule for +name+ by scanning the candidates returned by #select.
#
# While folding over the candidates, an exception rule encountered as the
# right-hand element is returned immediately; otherwise the longer of the two
# rules is kept (the later one wins on equal length). Falls back to +default+
# when #select yields no candidate.
def find_hash(name, default: default_rule, **options)
  winner = select(name, **options).reduce do |best, candidate|
    return candidate if candidate.instance_of?(Rule::Exception)

    if best.length > candidate.length
      best
    else
      candidate
    end
  end

  winner || default
end

# Selects all the rules matching given hostame.
def find_trie(name, default: default_rule, ignore_private: false)
  # Delegate the lookup to the Trie; a falsy result means that no rule
  # matched, in which case the default rule is returned instead.
  match = @trie.longest_prefix(name, ignore_private: ignore_private)
  match || default
end

# Selects all the rules matching given hostname.
#
# If `ignore_private` is set to true, the algorithm will skip the rules that are flagged as
# private domain. Note that the rules will still be part of the loop.
Expand Down
97 changes: 97 additions & 0 deletions lib/public_suffix/trie.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
module PublicSuffix

  # Implements a Trie data structure used to store the List.
  #
  # A rule value is split on DOT and stored one label per node, starting from
  # the rightmost label, so rules sharing trailing labels share a path.
  class Trie

    # @return [PublicSuffix::Trie::Node] the node every lookup starts from
    attr_reader :root

    def initialize
      @root = self.class::Node.new
    end

    # Inserts a rule into the trie, creating the intermediate nodes as needed.
    #
    # @param word [String] the rule value (e.g. "co.uk")
    # @param type [Class] the rule class; {#longest_prefix} instantiates it
    # @param private [Boolean] whether the rule is flagged as private
    # @return [PublicSuffix::Trie::Node] the node that terminates the rule
    def insert(word, type:, private:)
      node = word.split(DOT).reverse_each.inject(@root) do |parent, token|
        parent.put(token)
      end
      node.end!(type: type, private: private)
      # Always return the node: the result of #end! is the private flag, which
      # is false for every non-private rule.
      node
    end

    # Walks the trie following the labels of +word+ from right to left and
    # builds the rule for the deepest terminal node found along the way.
    #
    # A terminal node whose type is Rule::Exception takes precedence over any
    # other terminal node on the path, including deeper ones.
    #
    # @param word [#to_s] the name to look up
    # @param ignore_private [Boolean] when truthy, terminal nodes flagged as
    #   private are skipped (the walk still continues through them)
    # @return [Object, nil] a new instance of the matching node's type, built
    #   with the matched labels as value; nil when no rule matches
    def longest_prefix(word, ignore_private: false)
      tokens = word.to_s.split(DOT).reverse
      node = @root
      deepest = nil
      exception = nil

      tokens.each_with_index do |token, index|
        node = node.get(token)
        break if node.nil?
        next unless node.end?
        next if ignore_private && node.private

        deepest = [node, index + 1]
        exception = deepest if node.type == Rule::Exception
      end

      match, depth = exception || deepest
      return nil if match.nil?

      # The labels were consumed right-to-left: restore the original order.
      value = tokens.first(depth).reverse.join(DOT)
      match.type.new(value: value, private: match.private)
    end


    # Node is a node of the Trie and contains references to all the children nodes.
    #
    # A node marked as "end" represents the final part of a rule. It contains the rule information
    # such as the rule type and whether it belongs to PRIVATE.
    class Node
      # @return [Hash, nil] the children keyed by label; nil until the first #put
      attr_accessor :children
      # @return [Class, nil] the rule type; nil unless the node terminates a rule
      attr_accessor :type
      # @return [Boolean, nil] the private flag recorded by #end!
      attr_accessor :private

      def initialize
        @children = nil
        @type = nil
        @private = nil
      end

      # @param key [String] the label to look for
      # @return [Boolean] whether a child exists for the label
      def contains?(key)
        !get(key).nil?
      end

      # Returns the child for the label, creating it when missing.
      #
      # @param key [String]
      # @return [Node]
      def put(key)
        @children ||= {}
        @children[index(key)] ||= self.class.new
      end

      # @param key [String]
      # @return [Node, nil] the child for the label, or nil when there is none
      def get(key)
        return nil if @children.nil?

        @children[index(key)]
      end

      # @return [Boolean] whether this node terminates a rule
      def end?
        !@type.nil?
      end

      # Marks this node as the end of a rule.
      #
      # @param type [Class] the rule type
      # @param private [Boolean] whether the rule is flagged as private
      def end!(type:, private:)
        @type = type
        @private = private
      end

      private

      # Maps a label to the key used in the children Hash (currently the label itself).
      def index(key)
        key
      end
    end

  end
end
3 changes: 2 additions & 1 deletion test/profilers/list_profsize.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@

# Report the size of the default list as a whole, then separately the size of
# its @rules and @trie instance variables (read via instance_variable_get).
prof = ObjectBinsize.new
prof.report(PublicSuffix::List.default, label: "PublicSuffix::List size")
prof.report(PublicSuffix::List.default.instance_variable_get(:@rules), label: "Size of rules")
prof.report(PublicSuffix::List.default.instance_variable_get(:@rules), label: "Size of @rules")
prof.report(PublicSuffix::List.default.instance_variable_get(:@trie), label: "Size of @trie")
15 changes: 15 additions & 0 deletions test/profilers/tries_profiler.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
$LOAD_PATH.unshift File.expand_path("../../lib", __dir__)

require "memory_profiler"
require "public_suffix"
require "public_suffix/trie"

# Profiles the memory allocated while building a Trie from the default list.
list = PublicSuffix::List.default
rules = list.instance_variable_get(:@rules)
puts "#{list.size} rules:"

report = MemoryProfiler.report do
  # Stored in an instance variable so the trie is still referenced once the
  # block has returned.
  @trie = PublicSuffix::Trie.new
  # Hash#keys ignores a block, so iterate the pairs explicitly. Trie#insert
  # reverses the labels itself and requires the type/private keywords, which
  # are taken from the stored entry exactly as List#add does.
  rules.each do |word, entry|
    @trie.insert(word, type: entry.type, private: entry.private)
  end
end

report.pretty_print
15 changes: 15 additions & 0 deletions test/profilers/tries_prosize.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
$LOAD_PATH.unshift File.expand_path("../../lib", __dir__)

require_relative "object_binsize"
require "public_suffix"
require "public_suffix/trie"

# Compares the size of the Hash index with the size of an equivalent Trie.
list = PublicSuffix::List.default
rules = list.instance_variable_get(:@rules)

@trie = PublicSuffix::Trie.new
# Trie#insert reverses the labels itself and requires the type/private
# keywords, which are taken from the stored entry exactly as List#add does.
rules.each do |word, entry|
  @trie.insert(word, type: entry.type, private: entry.private)
end

prof = ObjectBinsize.new
prof.report(rules, label: "@rules")
prof.report(@trie, label: "@trie")