Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added wraith spider [config_name] command #488

Merged
merged 4 commits into from
Nov 25, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 10 additions & 23 deletions lib/wraith/cli.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,26 +22,6 @@ def self.source_root
File.expand_path("../../../", __FILE__)
end

# define internal methods which user should not be able to run directly
no_commands do
# Runs the given block and reports any CustomError it raises through the
# logger instead of letting it propagate.
#
# Exceptions outside the CustomError hierarchy are deliberately not rescued,
# so they still surface with a full stack trace for debugging.
#
# Returns the block's value, or the result of `logger.error` when a
# CustomError was rescued.
def within_acceptable_limits
  begin
    yield
  rescue CustomError => failure
    logger.error(failure.message)
  end
end

# Builds a Wraith::Spidering for the named config and asks it to check for
# paths, returning whatever that call returns.
def check_for_paths(config_name)
  Wraith::Spidering.new(config_name).check_for_paths
end

# Builds a Wraith::FolderManager for the named config and delegates to its
# copy_old_shots, returning whatever that call returns.
def copy_old_shots(config_name)
  Wraith::FolderManager.new(config_name).copy_old_shots
end
end

desc "validate [config_name]", "checks your configuration and validates that all required properties exist"
def validate(config_name)
within_acceptable_limits do
Expand All @@ -57,6 +37,15 @@ def setup
end
end

desc "spider [config_name]", "crawls a site for paths and stores them to YML file"
# CLI command: validates the config in "spider" mode, logs the validation
# result, then crawls the site with Wraith::Spider.
#
# Everything runs inside within_acceptable_limits, so a CustomError raised
# by validation or crawling is logged rather than propagated.
def spider(config)
  within_acceptable_limits do
    validation_result = Wraith::Validate.new(config).validate("spider")
    logger.info validation_result
    Wraith::Spider.new(config).crawl
  end
end

desc "reset_shots [config_name]", "removes all the files in the shots folder"
def reset_shots(config_name)
within_acceptable_limits do
Expand Down Expand Up @@ -131,7 +120,6 @@ def capture(config, multi = false)
within_acceptable_limits do
logger.info Wraith::Validate.new(config).validate("capture")
reset_shots(config)
check_for_paths(config)
setup_folders(config)
save_images(config)
crop_images(config)
Expand All @@ -156,10 +144,9 @@ def history(config)
within_acceptable_limits do
logger.info Wraith::Validate.new(config).validate("history")
reset_shots(config)
check_for_paths(config)
setup_folders(config)
save_images(config)
copy_old_shots(config)
Wraith::FolderManager.new(config).copy_old_shots
end
end

Expand Down
6 changes: 6 additions & 0 deletions lib/wraith/helpers/custom_exceptions.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@ class MissingRequiredPropertyError < CustomError
class ConfigFileDoesNotExistError < CustomError
end

# Raised when a config property is present in a mode where it must not be,
# e.g. `paths` already being set when running `wraith spider`.
class PropertyOutOfContextError < CustomError; end

# Raised when a config file was found but loading it as YAML produced no
# usable configuration.
class InvalidYamlError < CustomError; end

class MissingImageError < CustomError
def initialize(msg = false)
default_msg = "Something went wrong! It looks like you're missing some images. Check your output directory and make sure that each path has four files for every screen size (data.txt, diff, base, latest). If in doubt, delete your output directory and run Wraith again."
Expand Down
7 changes: 7 additions & 0 deletions lib/wraith/helpers/utilities.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
require "wraith/helpers/custom_exceptions"

# Yields to the caller's block, turning any CustomError into a logged error
# message rather than an uncaught exception.
#
# Only CustomError (and its subclasses) is handled here; every other
# exception is left to propagate so its stack trace remains available.
#
# Returns the block's result on success, otherwise the result of
# `logger.error`.
def within_acceptable_limits
  begin
    yield
  rescue CustomError => known_error
    logger.error(known_error.message)
  end
end

def absolute_path_of_dir(filepath)
path_parts = filepath.split('/')
path_to_dir = path_parts.first path_parts.size - 1
Expand Down
110 changes: 27 additions & 83 deletions lib/wraith/spider.rb
Original file line number Diff line number Diff line change
@@ -1,108 +1,52 @@
require "wraith"
require "wraith/helpers/logger"
require "yaml"
require "anemone"
require "nokogiri"
require "uri"

class Wraith::Spidering
class Wraith::Spider
include Logging

# File extensions the crawler must not follow; joined into the regular
# expression passed to `skip_links_like`.
#
# No trailing backslashes are used: inside %w a backslash before a newline
# escapes the newline, so it becomes part of an array element (e.g. "\ngz")
# instead of acting as a line continuation. %w literals may simply span
# several lines. The list is frozen because it is a shared constant.
EXT = %w[
  flv swf png jpg gif asx zip rar tar 7z
  gz jar js css dtd xsd ico raw mp3 mp4 m4a
  wav wmv ape aac ac3 wma aiff mpg mpeg
  avi mov ogg mkv mka asf mp2 m1v
  m3u f4v pdf doc xls ppt pps bin exe rss xml
].freeze

attr_reader :wraith

# Sets up the spider for the given config: wraps it in a Wraith::Wraith
# (exposed through the `wraith` reader) and starts with an empty hash of
# discovered paths, which add_path fills in as label => path.
def initialize(config)
@wraith = Wraith::Wraith.new(config)
@paths = {}
end

def check_for_paths
if wraith.paths.nil?
unless wraith.sitemap.nil?
logger.info "no paths defined in config, loading paths from sitemap"
spider = Wraith::Sitemap.new(wraith)
else
logger.info "no paths defined in config, crawling from site root"
spider = Wraith::Crawler.new(wraith)
def crawl
logger.info "Crawling #{wraith.base_domain}"
Anemone.crawl(wraith.base_domain) do |anemone|
anemone.skip_links_like(/\.(#{EXT.join('|')})$/)
# Add user specified skips
anemone.skip_links_like(wraith.spider_skips)
anemone.on_every_page do |page|
logger.info " #{page.url.path}"
add_path(page.url.path)
end
spider.determine_paths
end
end
end

class Wraith::Spider
attr_reader :wraith

# Stores the supplied wraith object (exposed through the `wraith` reader)
# and starts with an empty hash of discovered paths, which add_path fills
# in as label => path.
def initialize(wraith)
@wraith = wraith
@paths = {}
end

def determine_paths
spider
logger.info "Crawl complete."
write_file
end

private

# Overwrites the spider file named by `wraith.spider_file` with the string
# form of the collected paths hash. Returns the value of the write call.
def write_file
  File.open(wraith.spider_file, "w+") do |spider_file|
    spider_file.write(@paths)
  end
end

# Records a crawled path in @paths under a generated label.
#
# The site root "/" is labelled "home"; any other path is labelled by
# replacing each "/" with "__", dropping one trailing "__" and lower-casing
# the result. The stored value is the lower-cased path, which is also the
# return value.
def add_path(path)
  label = if path == "/"
            "home"
          else
            path.gsub("/", "__").chomp("__").downcase
          end
  @paths[label] = path.downcase
end

# Intentionally empty hook: the Wraith::Crawler and Wraith::Sitemap
# subclasses each provide their own `spider` implementation.
def spider
end
end

class Wraith::Crawler < Wraith::Spider
include Logging

# File extensions the crawler must not follow; joined into the regular
# expression passed to `skip_links_like`.
#
# No trailing backslashes are used: inside %w a backslash before a newline
# escapes the newline, so it becomes part of an array element (e.g. "\ngz")
# instead of acting as a line continuation. %w literals may simply span
# several lines. The list is frozen because it is a shared constant.
EXT = %w[
  flv swf png jpg gif asx zip rar tar 7z
  gz jar js css dtd xsd ico raw mp3 mp4
  wav wmv ape aac ac3 wma aiff mpg mpeg
  avi mov ogg mkv mka asf mp2 m1v
  m3u f4v pdf doc xls ppt pps bin exe rss xml
].freeze

def spider
if File.exist?(wraith.spider_file) && modified_since(wraith.spider_file, wraith.spider_days[0])
logger.info "using existing spider file"
@paths = eval(File.read(wraith.spider_file))
else
logger.info "creating new spider file"
Anemone.crawl(wraith.base_domain) do |anemone|
anemone.skip_links_like(/\.(#{EXT.join('|')})$/)
# Add user specified skips
anemone.skip_links_like(wraith.spider_skips)
anemone.on_every_page { |page| add_path(page.url.path) }
end
end
end

# Tells whether `file` changed less than `since` days ago, measured from its
# ctime to the current time.
def modified_since(file, since)
  seconds_per_day = 24 * 3600
  age_in_days = (Time.now - File.ctime(file)) / seconds_per_day
  age_in_days < since
end
end

class Wraith::Sitemap < Wraith::Spider
include Logging

def spider
unless wraith.sitemap.nil?
logger.info "reading sitemap.xml from #{wraith.sitemap}"
if wraith.sitemap =~ URI.regexp
sitemap = Nokogiri::XML(open(wraith.sitemap))
else
sitemap = Nokogiri::XML(File.open(wraith.sitemap))
end
sitemap.css("loc").each do |loc|
path = loc.content
# Allow use of either domain in the sitemap.xml
wraith.domains.each do |_k, v|
path.sub!(v, "")
end
if wraith.spider_skips.nil? || wraith.spider_skips.none? { |regex| regex.match(path) }
add_path(path)
end
end
# Serialises the collected paths as YAML under a top-level `paths` key and
# writes them to the file named by `wraith.imports` inside
# `wraith.config_dir`, replacing any previous contents. Progress is
# reported through the logger before and after the write.
def write_file
  logger.info "Writing to YML file..."
  destination = "#{wraith.config_dir}/#{wraith.imports}"
  File.open(destination, "w+") do |yml|
    yml.write({ 'paths' => @paths }.to_yaml)
    logger.info "Spider paths written to #{wraith.imports}"
  end
end
end
10 changes: 10 additions & 0 deletions lib/wraith/validate.rb
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ def validate_mode_properties(mode)
when "latest"
validate_history_mode
validate_base_shots_exist
when "spider"
validate_spider_mode
else
logger.warn "Wraith doesn't know how to validate mode '#{mode}'. Continuing..."
end
Expand All @@ -56,6 +58,14 @@ def validate_history_mode
fail InvalidDomainsError, "History mode requires exactly one domain. #{docs_prompt}" if wraith.domains.length != 1
end

# Checks that the config is usable for `wraith spider`.
#
# Raises MissingRequiredPropertyError when no `imports` file is configured,
# and PropertyOutOfContextError when `paths` are already configured.
# Returns nil when both checks pass.
def validate_spider_mode
  unless wraith.imports
    raise MissingRequiredPropertyError, "You must specify an `imports` YML"\
      " before running `wraith spider`. #{docs_prompt}"
  end

  if wraith.paths
    raise PropertyOutOfContextError, "Tried running `wraith spider` but you have already"\
      " specified paths in your YML. #{docs_prompt}"
  end
end

def validate_base_shots_exist
unless File.directory?(wraith.history_dir)
logger.error "You need to run `wraith history` at least once before you can run `wraith latest`!"
Expand Down
19 changes: 16 additions & 3 deletions lib/wraith/wraith.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ def initialize(config, yaml_passed = false)
else
filepath = determine_config_path config
@config = YAML.load_file filepath
if !@config
fail InvalidYamlError, "could not parse \"#{config}\" as YAML"
end
end

if @config['imports']
Expand All @@ -32,22 +35,28 @@ def determine_config_path(config_name)

possible_filenames.each do |filepath|
if File.exist?(filepath)
@config_dir = absolute_path_of_dir(convert_to_absolute filepath)
@calculated_config_dir = absolute_path_of_dir(convert_to_absolute filepath)
return convert_to_absolute filepath
end
end

fail ConfigFileDoesNotExistError, "unable to find config \"#{config_name}\""
end

# Returns the directory of the located config file, as recorded by
# determine_config_path. Presumably nil if determine_config_path has not
# run (e.g. when the YAML was passed in directly) - TODO confirm.
def config_dir
@calculated_config_dir
end

def apply_imported_config(config_to_import, config)
path_to_config = "#{@config_dir}/#{config_to_import}"
path_to_config = "#{config_dir}/#{config_to_import}"
if File.exist?(path_to_config)
yaml = YAML.load_file path_to_config
return yaml.merge(config)
end

fail ConfigFileDoesNotExistError, "unable to find referenced imported config \"#{config_name}\""
# if we got this far, no config could be merged. Return original config.
logger.info "unable to find referenced imported config \"#{config_to_import}\""
config
end

def directory
Expand Down Expand Up @@ -187,6 +196,10 @@ def phantomjs_options
@config["phantomjs_options"]
end

# Returns the configured `imports` value, or false when the key is absent
# or set to a falsy value.
def imports
  imported_file = @config['imports']
  return false unless imported_file

  imported_file
end

def verbose
# @TODO - also add a `--verbose` CLI flag which overrides whatever you have set in the config
@config["verbose"] || false
Expand Down
66 changes: 66 additions & 0 deletions spec/configs/spider_paths.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
---
paths:
home: /
__about: /about
__skills: /skills/
__cv: /cv
__blog__category__dissertation: /blog/category/dissertation/
__blog__category__reports: /blog/category/reports/
__blog__category__portfolio: /blog/category/portfolio/
__blog__chorister: /blog/chorister/
__blog__category__updates: /blog/category/updates/
__contact: /contact
__terminal: /terminal/
__blog__dissertation: /blog/dissertation/
__blog__discipline-good-enough: /blog/discipline-good-enough/
__blog__it-is-done: /blog/it-is-done/
__blog__composer: /blog/composer/
__blog__mid-project-demonstration: /blog/mid-project-demonstration/
__blog__completing-the-core: /blog/completing-the-core/
__blog__commercial-viability: /blog/commercial-viability/
__blog__cooking-on-gas: /blog/cooking-on-gas/
__blog__choosing-bdd-framework: /blog/choosing-bdd-framework/
__blog__reading: /blog/reading/
__blog__following-plan: /blog/following-plan/
__blog__outline-project-specification: /blog/outline-project-specification/
__blog__php-frameworks: /blog/php-frameworks/
__blog__industrial-year-report: /blog/industrial-year-report/
__blog__category__dissertation__page__2: /blog/category/dissertation/page/2/
__blog__exploring-suitability-viola-jones-framework-counting-people: /blog/exploring-suitability-viola-jones-framework-counting-people/
__blog__smartresolution: /blog/smartresolution/
__blog__commonwealth-games-quiz: /blog/commonwealth-games-quiz/
__blog__magazine-parentchild-themes: /blog/magazine-parentchild-themes/
__blog__reff: /blog/reff/
__blog__voicecouncil-magazine: /blog/voicecouncil-magazine/
__blog__hover-bike: /blog/hover-bike/
__blog__3d-solar-system: /blog/3d-solar-system/
__blog__how-stressed-are-you: /blog/how-stressed-are-you/
__blog__studentmunch-com: /blog/studentmunch-com/
__blog__alice-holmes-anthropomorphism: /blog/alice-holmes-anthropomorphism/
__blog__nhs-winter: /blog/nhs-winter/
__blog__abermads-co-uk: /blog/abermads-co-uk/
__blog__tube-spotted: /blog/tube-spotted/
__blog__radioactive-evolution: /blog/radioactive-evolution/
__blog__when-do-i-need-a-non-javascript-solution: /blog/when-do-i-need-a-non-javascript-solution/
__blog__use-requirejs-wordpress-plugins-jquery-ui: /blog/use-requirejs-wordpress-plugins-jquery-ui/
__blog__high-impact-minimal-effort-cross-browser-testing: /blog/high-impact-minimal-effort-cross-browser-testing/
__blog__five-line-rule: /blog/five-line-rule/
__blog__the-pessimistic-ultimatum: /blog/the-pessimistic-ultimatum/
__blog__how-to-differentiate-between-links: /blog/how-to-differentiate-between-links/
__blog__stupidly-simple-programming-introduction: /blog/stupidly-simple-programming-introduction/
__blog__zaatari-refugee-camp-rebuilding-lives-in-the-desert: /blog/zaatari-refugee-camp-rebuilding-lives-in-the-desert/
__blog__category__updates__page__2: /blog/category/updates/page/2/
__blog__requirements-features: /blog/requirements-features/
__blog__building-shoulders-giants: /blog/building-shoulders-giants/
__blog__clarifying-requirements: /blog/clarifying-requirements/
__blog__orthodontic-refactoring: /blog/orthodontic-refactoring/
__blog__localised-css: /blog/localised-css/
__blog__project-clarified: /blog/project-clarified/
__blog__identifying-competitors: /blog/identifying-competitors/
__blog__and-so-it-begins: /blog/and-so-it-begins/
__blog__does-drm-promote-piracy: /blog/does-drm-promote-piracy/
__blog__what-is-the-surprise-app-for-blackberry: /blog/what-is-the-surprise-app-for-blackberry/
__blog__if-humans-were-computer-components-an-analogy: /blog/if-humans-were-computer-components-an-analogy/
__blog__redesigning-ip-datagrams-for-a-faster-internet: /blog/redesigning-ip-datagrams-for-a-faster-internet/
__blog__trinary-is-it-the-future: /blog/trinary-is-it-the-future/
__blog__the-demise-of-blackberry: /blog/the-demise-of-blackberry/
Loading