From 8754ed45d2a5dcf3a7783056d16f5003bced2bce Mon Sep 17 00:00:00 2001 From: ChrisBAshton Date: Fri, 25 Nov 2016 12:58:10 +0000 Subject: [PATCH 1/3] started adding 'spider' command --- lib/wraith/cli.rb | 9 ++ lib/wraith/helpers/custom_exceptions.rb | 6 + lib/wraith/spider.rb | 197 +++++++++++++----------- lib/wraith/validate.rb | 10 ++ lib/wraith/wraith.rb | 12 +- spec/configs/test_config--spider.yaml | 46 ++++++ spec/validate_spec.rb | 23 ++- 7 files changed, 209 insertions(+), 94 deletions(-) create mode 100644 spec/configs/test_config--spider.yaml diff --git a/lib/wraith/cli.rb b/lib/wraith/cli.rb index 3d9333ed..97a0a02a 100644 --- a/lib/wraith/cli.rb +++ b/lib/wraith/cli.rb @@ -57,6 +57,15 @@ def setup end end + desc "spider [config_name]", "crawls a site for paths and stores them to YML file" + def spider(config) + within_acceptable_limits do + logger.info Wraith::Validate.new(config).validate("spider") + spider = Wraith::Spider.new(config) + spider.crawl + end + end + desc "reset_shots [config_name]", "removes all the files in the shots folder" def reset_shots(config_name) within_acceptable_limits do diff --git a/lib/wraith/helpers/custom_exceptions.rb b/lib/wraith/helpers/custom_exceptions.rb index 52b6d5db..0206aeb4 100644 --- a/lib/wraith/helpers/custom_exceptions.rb +++ b/lib/wraith/helpers/custom_exceptions.rb @@ -10,6 +10,12 @@ class MissingRequiredPropertyError < CustomError class ConfigFileDoesNotExistError < CustomError end +class PropertyOutOfContextError < CustomError +end + +class InvalidYamlError < CustomError +end + class MissingImageError < CustomError def initialize(msg = false) default_msg = "Something went wrong! It looks like you're missing some images. Check your output directory and make sure that each path has four files for every screen size (data.txt, diff, base, latest). If in doubt, delete your output directory and run Wraith again." diff --git a/lib/wraith/spider.rb b/lib/wraith/spider.rb index 76317c3f..b2044411 100644 --- a/lib/wraith/spider.rb +++ b/lib/wraith/spider.rb @@ -4,105 +4,120 @@ require "nokogiri" require "uri" -class Wraith::Spidering - include Logging - attr_reader :wraith +class Wraith::Spider def initialize(config) @wraith = Wraith::Wraith.new(config) end - def check_for_paths - if wraith.paths.nil? - unless wraith.sitemap.nil? - logger.info "no paths defined in config, loading paths from sitemap" - spider = Wraith::Sitemap.new(wraith) - else - logger.info "no paths defined in config, crawling from site root" - spider = Wraith::Crawler.new(wraith) - end - spider.determine_paths - end - end -end - -class Wraith::Spider - attr_reader :wraith - - def initialize(wraith) - @wraith = wraith - @paths = {} - end - - def determine_paths - spider - write_file - end - - private - - def write_file - File.open(wraith.spider_file, "w+") { |file| file.write(@paths) } - end - - def add_path(path) - @paths[path == "/" ? "home" : path.gsub("/", "__").chomp("__").downcase] = path.downcase + def crawl + throw 'test' end - def spider - end end -class Wraith::Crawler < Wraith::Spider - include Logging - EXT = %w(flv swf png jpg gif asx zip rar tar 7z \ - gz jar js css dtd xsd ico raw mp3 mp4 \ - wav wmv ape aac ac3 wma aiff mpg mpeg \ - avi mov ogg mkv mka asx asf mp2 m1v \ - m3u f4v pdf doc xls ppt pps bin exe rss xml) - def spider - if File.exist?(wraith.spider_file) && modified_since(wraith.spider_file, wraith.spider_days[0]) - logger.info "using existing spider file" - @paths = eval(File.read(wraith.spider_file)) - else - logger.info "creating new spider file" - Anemone.crawl(wraith.base_domain) do |anemone| - anemone.skip_links_like(/\.(#{EXT.join('|')})$/) - # Add user specified skips - anemone.skip_links_like(wraith.spider_skips) - anemone.on_every_page { |page| add_path(page.url.path) } - end - end - end - - def modified_since(file, since) - (Time.now - File.ctime(file)) / (24 * 3600) < since - end -end - -class Wraith::Sitemap < Wraith::Spider - include Logging - - def spider - unless wraith.sitemap.nil? - logger.info "reading sitemap.xml from #{wraith.sitemap}" - if wraith.sitemap =~ URI.regexp - sitemap = Nokogiri::XML(open(wraith.sitemap)) - else - sitemap = Nokogiri::XML(File.open(wraith.sitemap)) - end - sitemap.css("loc").each do |loc| - path = loc.content - # Allow use of either domain in the sitemap.xml - wraith.domains.each do |_k, v| - path.sub!(v, "") - end - if wraith.spider_skips.nil? || wraith.spider_skips.none? { |regex| regex.match(path) } - add_path(path) - end - end - end - end -end +# +# class Wraith::Spidering +# include Logging +# attr_reader :wraith +# +# def initialize(config) +# @wraith = Wraith::Wraith.new(config) +# end +# +# def check_for_paths +# if wraith.paths.nil? +# unless wraith.sitemap.nil? +# logger.info "no paths defined in config, loading paths from sitemap" +# spider = Wraith::Sitemap.new(wraith) +# else +# logger.info "no paths defined in config, crawling from site root" +# spider = Wraith::Crawler.new(wraith) +# end +# spider.determine_paths +# end +# end +# end +# +# class Wraith::Spider +# attr_reader :wraith +# +# def initialize(wraith) +# @wraith = wraith +# @paths = {} +# end +# +# def determine_paths +# spider +# write_file +# end +# +# private +# +# def write_file +# File.open(wraith.spider_file, "w+") { |file| file.write(@paths) } +# end +# +# def add_path(path) +# @paths[path == "/" ? "home" : path.gsub("/", "__").chomp("__").downcase] = path.downcase +# end +# +# def spider +# end +# end +# +# class Wraith::Crawler < Wraith::Spider +# include Logging +# +# EXT = %w(flv swf png jpg gif asx zip rar tar 7z \ +# gz jar js css dtd xsd ico raw mp3 mp4 \ +# wav wmv ape aac ac3 wma aiff mpg mpeg \ +# avi mov ogg mkv mka asx asf mp2 m1v \ +# m3u f4v pdf doc xls ppt pps bin exe rss xml) +# +# def spider +# if File.exist?(wraith.spider_file) && modified_since(wraith.spider_file, wraith.spider_days[0]) +# logger.info "using existing spider file" +# @paths = eval(File.read(wraith.spider_file)) +# else +# logger.info "creating new spider file" +# Anemone.crawl(wraith.base_domain) do |anemone| +# anemone.skip_links_like(/\.(#{EXT.join('|')})$/) +# # Add user specified skips +# anemone.skip_links_like(wraith.spider_skips) +# anemone.on_every_page { |page| add_path(page.url.path) } +# end +# end +# end +# +# def modified_since(file, since) +# (Time.now - File.ctime(file)) / (24 * 3600) < since +# end +# end +# +# class Wraith::Sitemap < Wraith::Spider +# include Logging +# +# def spider +# unless wraith.sitemap.nil? +# logger.info "reading sitemap.xml from #{wraith.sitemap}" +# if wraith.sitemap =~ URI.regexp +# sitemap = Nokogiri::XML(open(wraith.sitemap)) +# else +# sitemap = Nokogiri::XML(File.open(wraith.sitemap)) +# end +# sitemap.css("loc").each do |loc| +# path = loc.content +# # Allow use of either domain in the sitemap.xml +# wraith.domains.each do |_k, v| +# path.sub!(v, "") +# end +# if wraith.spider_skips.nil? || wraith.spider_skips.none? { |regex| regex.match(path) } +# add_path(path) +# end +# end +# end +# end +# end diff --git a/lib/wraith/validate.rb b/lib/wraith/validate.rb index e448deda..75fc53a0 100644 --- a/lib/wraith/validate.rb +++ b/lib/wraith/validate.rb @@ -37,6 +37,8 @@ def validate_mode_properties(mode) when "latest" validate_history_mode validate_base_shots_exist + when "spider" + validate_spider_mode else logger.warn "Wraith doesn't know how to validate mode '#{mode}'. Continuing..." end @@ -56,6 +58,14 @@ def validate_history_mode fail InvalidDomainsError, "History mode requires exactly one domain. #{docs_prompt}" if wraith.domains.length != 1 end + def validate_spider_mode + fail MissingRequiredPropertyError, "You must specify an `imports` YML"\ + " before running `wraith spider`. #{docs_prompt}" unless wraith.imports + + fail PropertyOutOfContextError, "Tried running `wraith spider` but you have already"\ + " specified paths in your YML. #{docs_prompt}" if wraith.paths + end + def validate_base_shots_exist unless File.directory?(wraith.history_dir) logger.error "You need to run `wraith history` at least once before you can run `wraith latest`!" diff --git a/lib/wraith/wraith.rb b/lib/wraith/wraith.rb index a689dbf3..b85e9a06 100644 --- a/lib/wraith/wraith.rb +++ b/lib/wraith/wraith.rb @@ -22,8 +22,12 @@ def open_config_file(config_name) possible_filenames.each do |filepath| if File.exist?(filepath) - config = File.open filepath - return YAML.load config + config = YAML.load_file(filepath) + if config + return config + else + fail InvalidYamlError, "could not parse \"#{config_name}\" as YAML" + end end end fail ConfigFileDoesNotExistError, "unable to find config \"#{config_name}\"" @@ -166,6 +170,10 @@ def phantomjs_options @config["phantomjs_options"] end + def imports + @config['imports'] || nil + end + def verbose # @TODO - also add a `--verbose` CLI flag which overrides whatever you have set in the config @config["verbose"] || false diff --git a/spec/configs/test_config--spider.yaml b/spec/configs/test_config--spider.yaml new file mode 100644 index 00000000..bedb4983 --- /dev/null +++ b/spec/configs/test_config--spider.yaml @@ -0,0 +1,46 @@ +imports: "spider_paths.yaml" + +#Headless browser option +browser: "phantomjs" + +# Type the name of the directory that shots will be stored in +directory: "shots" + +# Add only 2 domains, key will act as a label +domains: + english: "http://www.live.bbc.co.uk/news" + russian: "http://www.live.bbc.co.uk/russian" + +#Type screen widths below, here are a couple of examples +screen_widths: + - 320 + - 600 + - 768x1500 # you can also specify the height, as we've done here + - 1024 + - 1280 + +#Amount of fuzz ImageMagick will use +fuzz: "20%" + +#Set the filename of the spider file to use, if not specified it will fallback to spider.txt +spider_file: bbc_co_uk_spider.txt + +#Set the number of days to keep the site spider file +pider_days: + - 10 + +#A list of URLs to skip when spidering. Ruby regular expressions can be +#used, if prefixed with !ruby/regexp as defined in the YAML Cookbook +#http://www.yaml.org/YAML_for_ruby.html#regexps +# +# spider_skips: +# - /foo/bar.html # Matches /foo/bar.html explcitly +# - !ruby/regexp /^\/baz\// # Matches any URLs that start with /baz + +#Choose how results are displayed, by default alphanumeric. Different screen widths are always grouped. +#alphanumeric - all paths (with, and without, a difference) are shown, sorted by path +#diffs_first - all paths (with, and without, a difference) are shown, sorted by difference size (largest first) +#diffs_only - only paths with a difference are shown, sorted by difference size (largest first) +mode: diffs_first + +threshold: 15 diff --git a/spec/validate_spec.rb b/spec/validate_spec.rb index 150cf6fd..025446e9 100644 --- a/spec/validate_spec.rb +++ b/spec/validate_spec.rb @@ -53,7 +53,7 @@ ') Wraith::Validate.new(config, true).validate("capture") end - + it "should fail if no directory is specified" do config["domains"] = YAML.load(' test: http://something.bbc.com @@ -88,4 +88,25 @@ Wraith::Validate.new(history_conf, true).validate("history") end end + + describe "validations specific to spider mode" do + let(:spider_conf) do + config.merge(YAML.load(' + imports: "spider_paths.yml" + ')) + end + + it "should complain if imports is empty" do + spider_conf['imports'] = nil + expect { Wraith::Validate.new(spider_conf, true).validate("spider") }.to raise_error PropertyOutOfContextError + end + + it "should complain if paths is set" do + spider_conf.merge(YAML.load(' + paths: + home: / + ')) + expect { Wraith::Validate.new(spider_conf, true).validate("spider") }.to raise_error PropertyOutOfContextError + end + end end From b48e987414fc3622e257285ff219fa4356155461 Mon Sep 17 00:00:00 2001 From: ChrisBAshton Date: Fri, 25 Nov 2016 13:43:39 +0000 Subject: [PATCH 2/3] got new spider mode working --- lib/wraith/cli.rb | 24 +---- lib/wraith/helpers/utilities.rb | 7 ++ lib/wraith/spider.rb | 145 +++++++------------------- lib/wraith/wraith.rb | 2 +- spec/configs/test_config--spider.yaml | 12 +-- spec/validate_spec.rb | 17 ++- wraith.gemspec | 1 - 7 files changed, 60 insertions(+), 148 deletions(-) diff --git a/lib/wraith/cli.rb b/lib/wraith/cli.rb index 97a0a02a..1bebe4b2 100644 --- a/lib/wraith/cli.rb +++ b/lib/wraith/cli.rb @@ -22,26 +22,6 @@ def self.source_root File.expand_path("../../../", __FILE__) end - # define internal methods which user should not be able to run directly - no_commands do - def within_acceptable_limits - yield - rescue CustomError => e - logger.error e.message - # other errors, such as SystemError, will not be caught nicely and will give a stack trace (which we'd need) - end - - def check_for_paths(config_name) - spider = Wraith::Spidering.new(config_name) - spider.check_for_paths - end - - def copy_old_shots(config_name) - create = Wraith::FolderManager.new(config_name) - create.copy_old_shots - end - end - desc "validate [config_name]", "checks your configuration and validates that all required properties exist" def validate(config_name) within_acceptable_limits do @@ -140,7 +120,6 @@ def capture(config, multi = false) within_acceptable_limits do logger.info Wraith::Validate.new(config).validate("capture") reset_shots(config) - check_for_paths(config) setup_folders(config) save_images(config) crop_images(config) @@ -165,10 +144,9 @@ def history(config) within_acceptable_limits do logger.info Wraith::Validate.new(config).validate("history") reset_shots(config) - check_for_paths(config) setup_folders(config) save_images(config) - copy_old_shots(config) + Wraith::FolderManager.new(config).copy_old_shots end end diff --git a/lib/wraith/helpers/utilities.rb b/lib/wraith/helpers/utilities.rb index e98e1aa0..f4ff6c6c 100644 --- a/lib/wraith/helpers/utilities.rb +++ b/lib/wraith/helpers/utilities.rb @@ -1,5 +1,12 @@ require "wraith/helpers/custom_exceptions" +def within_acceptable_limits + yield +rescue CustomError => e + logger.error e.message + # other errors, such as SystemError, will not be caught nicely and will give a stack trace (which we'd need) +end + def convert_to_absolute(filepath) if !filepath "false" diff --git a/lib/wraith/spider.rb b/lib/wraith/spider.rb index b2044411..68f0b56b 100644 --- a/lib/wraith/spider.rb +++ b/lib/wraith/spider.rb @@ -1,123 +1,52 @@ require "wraith" require "wraith/helpers/logger" +require "yaml" require "anemone" -require "nokogiri" require "uri" class Wraith::Spider + include Logging + + EXT = %w(flv swf png jpg gif asx zip rar tar 7z \ + gz jar js css dtd xsd ico raw mp3 mp4 m4a \ + wav wmv ape aac ac3 wma aiff mpg mpeg \ + avi mov ogg mkv mka asx asf mp2 m1v \ + m3u f4v pdf doc xls ppt pps bin exe rss xml) + + attr_reader :wraith def initialize(config) @wraith = Wraith::Wraith.new(config) + @paths = {} end def crawl - throw 'test' - end - -end + logger.info "Crawling #{wraith.base_domain}" + Anemone.crawl(wraith.base_domain) do |anemone| + anemone.skip_links_like(/\.(#{EXT.join('|')})$/) + # Add user specified skips + anemone.skip_links_like(wraith.spider_skips) + anemone.on_every_page do |page| + logger.info " #{page.url.path}" + add_path(page.url.path) + end + end + logger.info "Crawl complete." + write_file + end + def add_path(path) + @paths[path == "/" ? "home" : path.gsub("/", "__").chomp("__").downcase] = path.downcase + end -# -# class Wraith::Spidering -# include Logging -# attr_reader :wraith -# -# def initialize(config) -# @wraith = Wraith::Wraith.new(config) -# end -# -# def check_for_paths -# if wraith.paths.nil? -# unless wraith.sitemap.nil? -# logger.info "no paths defined in config, loading paths from sitemap" -# spider = Wraith::Sitemap.new(wraith) -# else -# logger.info "no paths defined in config, crawling from site root" -# spider = Wraith::Crawler.new(wraith) -# end -# spider.determine_paths -# end -# end -# end -# -# class Wraith::Spider -# attr_reader :wraith -# -# def initialize(wraith) -# @wraith = wraith -# @paths = {} -# end -# -# def determine_paths -# spider -# write_file -# end -# -# private -# -# def write_file -# File.open(wraith.spider_file, "w+") { |file| file.write(@paths) } -# end -# -# def add_path(path) -# @paths[path == "/" ? "home" : path.gsub("/", "__").chomp("__").downcase] = path.downcase -# end -# -# def spider -# end -# end -# -# class Wraith::Crawler < Wraith::Spider -# include Logging -# -# EXT = %w(flv swf png jpg gif asx zip rar tar 7z \ -# gz jar js css dtd xsd ico raw mp3 mp4 \ -# wav wmv ape aac ac3 wma aiff mpg mpeg \ -# avi mov ogg mkv mka asx asf mp2 m1v \ -# m3u f4v pdf doc xls ppt pps bin exe rss xml) -# -# def spider -# if File.exist?(wraith.spider_file) && modified_since(wraith.spider_file, wraith.spider_days[0]) -# logger.info "using existing spider file" -# @paths = eval(File.read(wraith.spider_file)) -# else -# logger.info "creating new spider file" -# Anemone.crawl(wraith.base_domain) do |anemone| -# anemone.skip_links_like(/\.(#{EXT.join('|')})$/) -# # Add user specified skips -# anemone.skip_links_like(wraith.spider_skips) -# anemone.on_every_page { |page| add_path(page.url.path) } -# end -# end -# end -# -# def modified_since(file, since) -# (Time.now - File.ctime(file)) / (24 * 3600) < since -# end -# end -# -# class Wraith::Sitemap < Wraith::Spider -# include Logging -# -# def spider -# unless wraith.sitemap.nil? -# logger.info "reading sitemap.xml from #{wraith.sitemap}" -# if wraith.sitemap =~ URI.regexp -# sitemap = Nokogiri::XML(open(wraith.sitemap)) -# else -# sitemap = Nokogiri::XML(File.open(wraith.sitemap)) -# end -# sitemap.css("loc").each do |loc| -# path = loc.content -# # Allow use of either domain in the sitemap.xml -# wraith.domains.each do |_k, v| -# path.sub!(v, "") -# end -# if wraith.spider_skips.nil? || wraith.spider_skips.none? { |regex| regex.match(path) } -# add_path(path) -# end -# end -# end -# end -# end + def write_file + logger.info "Writing to YML file:" + config = {} + config['paths'] = @paths + File.open(wraith.imports, "w+") do |file| + file.write(config.to_yaml) + logger.info "Spider paths written to #{wraith.imports}" + end + end +end diff --git a/lib/wraith/wraith.rb b/lib/wraith/wraith.rb index b85e9a06..86c7977d 100644 --- a/lib/wraith/wraith.rb +++ b/lib/wraith/wraith.rb @@ -171,7 +171,7 @@ def phantomjs_options end def imports - @config['imports'] || nil + @config['imports'] || false end def verbose diff --git a/spec/configs/test_config--spider.yaml b/spec/configs/test_config--spider.yaml index bedb4983..ea99392b 100644 --- a/spec/configs/test_config--spider.yaml +++ b/spec/configs/test_config--spider.yaml @@ -8,8 +8,8 @@ directory: "shots" # Add only 2 domains, key will act as a label domains: - english: "http://www.live.bbc.co.uk/news" - russian: "http://www.live.bbc.co.uk/russian" + personal: "http://ashton.codes" + business: "http://webdapper.com" #Type screen widths below, here are a couple of examples screen_widths: @@ -22,17 +22,9 @@ screen_widths: #Amount of fuzz ImageMagick will use fuzz: "20%" -#Set the filename of the spider file to use, if not specified it will fallback to spider.txt -spider_file: bbc_co_uk_spider.txt - -#Set the number of days to keep the site spider file -pider_days: - - 10 - #A list of URLs to skip when spidering. Ruby regular expressions can be #used, if prefixed with !ruby/regexp as defined in the YAML Cookbook #http://www.yaml.org/YAML_for_ruby.html#regexps -# # spider_skips: # - /foo/bar.html # Matches /foo/bar.html explcitly # - !ruby/regexp /^\/baz\// # Matches any URLs that start with /baz diff --git a/spec/validate_spec.rb b/spec/validate_spec.rb index 025446e9..78e389bb 100644 --- a/spec/validate_spec.rb +++ b/spec/validate_spec.rb @@ -91,18 +91,25 @@ describe "validations specific to spider mode" do let(:spider_conf) do - config.merge(YAML.load(' + YAML.load(' + domains: + test: http://www.bbc.com + + browser: "casperjs" + + directory: some/dir + imports: "spider_paths.yml" - ')) + ') end it "should complain if imports is empty" do - spider_conf['imports'] = nil - expect { Wraith::Validate.new(spider_conf, true).validate("spider") }.to raise_error PropertyOutOfContextError + spider_conf.delete 'imports' + expect { Wraith::Validate.new(spider_conf, true).validate("spider") }.to raise_error MissingRequiredPropertyError end it "should complain if paths is set" do - spider_conf.merge(YAML.load(' + spider_conf.merge!(YAML.load(' paths: home: / ')) diff --git a/wraith.gemspec b/wraith.gemspec index 072c302c..2476304c 100644 --- a/wraith.gemspec +++ b/wraith.gemspec @@ -26,7 +26,6 @@ Gem::Specification.new do |spec| spec.add_runtime_dependency 'image_size' spec.add_runtime_dependency 'anemone' spec.add_runtime_dependency 'robotex' - spec.add_runtime_dependency 'nokogiri', '~> 1.6.7' spec.add_runtime_dependency 'log4r' spec.add_runtime_dependency 'thor' spec.add_runtime_dependency 'parallel' From e792e0f7c7992bd84ae1fd898e66085176fe0e52 Mon Sep 17 00:00:00 2001 From: ChrisBAshton Date: Fri, 25 Nov 2016 14:28:19 +0000 Subject: [PATCH 3/3] save spider paths file relative to main config file --- lib/wraith/spider.rb | 4 +-- lib/wraith/wraith.rb | 8 +++-- spec/configs/spider_paths.yaml | 66 ++++++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 4 deletions(-) create mode 100644 spec/configs/spider_paths.yaml diff --git a/lib/wraith/spider.rb b/lib/wraith/spider.rb index 68f0b56b..a0fb16db 100644 --- a/lib/wraith/spider.rb +++ b/lib/wraith/spider.rb @@ -41,10 +41,10 @@ def add_path(path) end def write_file - logger.info "Writing to YML file:" + logger.info "Writing to YML file..." config = {} config['paths'] = @paths - File.open(wraith.imports, "w+") do |file| + File.open("#{wraith.config_dir}/#{wraith.imports}", "w+") do |file| file.write(config.to_yaml) logger.info "Spider paths written to #{wraith.imports}" end diff --git a/lib/wraith/wraith.rb b/lib/wraith/wraith.rb index 8b4ede1c..47615afd 100644 --- a/lib/wraith/wraith.rb +++ b/lib/wraith/wraith.rb @@ -35,7 +35,7 @@ def determine_config_path(config_name) possible_filenames.each do |filepath| if File.exist?(filepath) - @config_dir = absolute_path_of_dir(convert_to_absolute filepath) + @calculated_config_dir = absolute_path_of_dir(convert_to_absolute filepath) return convert_to_absolute filepath end end @@ -43,8 +43,12 @@ def determine_config_path(config_name) fail ConfigFileDoesNotExistError, "unable to find config \"#{config_name}\"" end + def config_dir + @calculated_config_dir + end + def apply_imported_config(config_to_import, config) - path_to_config = "#{@config_dir}/#{config_to_import}" + path_to_config = "#{config_dir}/#{config_to_import}" if File.exist?(path_to_config) yaml = YAML.load_file path_to_config return yaml.merge(config) diff --git a/spec/configs/spider_paths.yaml b/spec/configs/spider_paths.yaml new file mode 100644 index 00000000..65f514b3 --- /dev/null +++ b/spec/configs/spider_paths.yaml @@ -0,0 +1,66 @@ +--- +paths: + home: / + __about: /about + __skills: /skills/ + __cv: /cv + __blog__category__dissertation: /blog/category/dissertation/ + __blog__category__reports: /blog/category/reports/ + __blog__category__portfolio: /blog/category/portfolio/ + __blog__chorister: /blog/chorister/ + __blog__category__updates: /blog/category/updates/ + __contact: /contact + __terminal: /terminal/ + __blog__dissertation: /blog/dissertation/ + __blog__discipline-good-enough: /blog/discipline-good-enough/ + __blog__it-is-done: /blog/it-is-done/ + __blog__composer: /blog/composer/ + __blog__mid-project-demonstration: /blog/mid-project-demonstration/ + __blog__completing-the-core: /blog/completing-the-core/ + __blog__commercial-viability: /blog/commercial-viability/ + __blog__cooking-on-gas: /blog/cooking-on-gas/ + __blog__choosing-bdd-framework: /blog/choosing-bdd-framework/ + __blog__reading: /blog/reading/ + __blog__following-plan: /blog/following-plan/ + __blog__outline-project-specification: /blog/outline-project-specification/ + __blog__php-frameworks: /blog/php-frameworks/ + __blog__industrial-year-report: /blog/industrial-year-report/ + __blog__category__dissertation__page__2: /blog/category/dissertation/page/2/ + __blog__exploring-suitability-viola-jones-framework-counting-people: /blog/exploring-suitability-viola-jones-framework-counting-people/ + __blog__smartresolution: /blog/smartresolution/ + __blog__commonwealth-games-quiz: /blog/commonwealth-games-quiz/ + __blog__magazine-parentchild-themes: /blog/magazine-parentchild-themes/ + __blog__reff: /blog/reff/ + __blog__voicecouncil-magazine: /blog/voicecouncil-magazine/ + __blog__hover-bike: /blog/hover-bike/ + __blog__3d-solar-system: /blog/3d-solar-system/ + __blog__how-stressed-are-you: /blog/how-stressed-are-you/ + __blog__studentmunch-com: /blog/studentmunch-com/ + __blog__alice-holmes-anthropomorphism: /blog/alice-holmes-anthropomorphism/ + __blog__nhs-winter: /blog/nhs-winter/ + __blog__abermads-co-uk: /blog/abermads-co-uk/ + __blog__tube-spotted: /blog/tube-spotted/ + __blog__radioactive-evolution: /blog/radioactive-evolution/ + __blog__when-do-i-need-a-non-javascript-solution: /blog/when-do-i-need-a-non-javascript-solution/ + __blog__use-requirejs-wordpress-plugins-jquery-ui: /blog/use-requirejs-wordpress-plugins-jquery-ui/ + __blog__high-impact-minimal-effort-cross-browser-testing: /blog/high-impact-minimal-effort-cross-browser-testing/ + __blog__five-line-rule: /blog/five-line-rule/ + __blog__the-pessimistic-ultimatum: /blog/the-pessimistic-ultimatum/ + __blog__how-to-differentiate-between-links: /blog/how-to-differentiate-between-links/ + __blog__stupidly-simple-programming-introduction: /blog/stupidly-simple-programming-introduction/ + __blog__zaatari-refugee-camp-rebuilding-lives-in-the-desert: /blog/zaatari-refugee-camp-rebuilding-lives-in-the-desert/ + __blog__category__updates__page__2: /blog/category/updates/page/2/ + __blog__requirements-features: /blog/requirements-features/ + __blog__building-shoulders-giants: /blog/building-shoulders-giants/ + __blog__clarifying-requirements: /blog/clarifying-requirements/ + __blog__orthodontic-refactoring: /blog/orthodontic-refactoring/ + __blog__localised-css: /blog/localised-css/ + __blog__project-clarified: /blog/project-clarified/ + __blog__identifying-competitors: /blog/identifying-competitors/ + __blog__and-so-it-begins: /blog/and-so-it-begins/ + __blog__does-drm-promote-piracy: /blog/does-drm-promote-piracy/ + __blog__what-is-the-surprise-app-for-blackberry: /blog/what-is-the-surprise-app-for-blackberry/ + __blog__if-humans-were-computer-components-an-analogy: /blog/if-humans-were-computer-components-an-analogy/ + __blog__redesigning-ip-datagrams-for-a-faster-internet: /blog/redesigning-ip-datagrams-for-a-faster-internet/ + __blog__trinary-is-it-the-future: /blog/trinary-is-it-the-future/ + __blog__the-demise-of-blackberry: /blog/the-demise-of-blackberry/