From 63d4e1539fdd02e128b4ace7d346806d355deaa2 Mon Sep 17 00:00:00 2001 From: aaronp Date: Wed, 5 Dec 2007 04:17:00 +0000 Subject: [PATCH] merging REL-0.9.0 -> trunk 308:HEAD git-svn-id: svn+ssh://rubyforge.org/var/svn/mechanize/trunk@451 f1cf478b-080f-0410-abad-959bfeec9ea8 --- CHANGELOG | 156 ------- CHANGELOG.txt | 297 ++++++++++++ EXAMPLES => EXAMPLES.txt | 0 FAQ.txt | 11 + GUIDE => GUIDE.txt | 8 +- LICENSE => LICENSE.txt | 0 Manifest.txt | 125 ++++++ NOTES => NOTES.txt | 58 +++ README => README.txt | 22 +- Rakefile | 147 +++--- lib/mechanize.rb | 472 +++++++++++++++----- lib/mechanize/cookie.rb | 45 +- lib/mechanize/errors.rb | 12 +- lib/mechanize/form.rb | 100 +++-- lib/mechanize/form_elements.rb | 20 +- lib/mechanize/history.rb | 67 +++ lib/mechanize/hpricot.rb | 12 - lib/mechanize/inspect.rb | 6 + lib/mechanize/list.rb | 14 +- lib/mechanize/mech_version.rb | 5 - lib/mechanize/monkey_patch.rb | 15 + lib/mechanize/net-overrides/net/http.rb | 4 +- lib/mechanize/net-overrides/net/https.rb | 1 + lib/mechanize/net-overrides/net/protocol.rb | 2 +- lib/mechanize/page.rb | 101 +++-- lib/mechanize/page_elements.rb | 35 +- lib/mechanize/parsers/rexml_page.rb | 35 ++ lib/mechanize/pluggable_parsers.rb | 54 ++- lib/mechanize/rexml.rb | 236 ++++++++++ test/README | 7 - test/htdocs/alt_text.html | 1 + test/htdocs/empty_form.html | 6 + test/htdocs/find_link.html | 1 + test/htdocs/form_select_none.html | 1 + test/htdocs/link with space.html | 5 + test/htdocs/relative/tc_relative_links.html | 20 + test/htdocs/tc_base_link.html | 8 + test/htdocs/tc_blank_form.html | 11 + test/htdocs/tc_encoded_links.html | 5 + test/htdocs/tc_follow_meta.html | 8 + test/htdocs/tc_form_action.html | 48 ++ test/htdocs/tc_links.html | 16 + test/htdocs/tc_referer.html | 10 + test/htdocs/tc_relative_links.html | 19 + test/htdocs/unusual______.html | 5 + test/proxy.rb | 30 -- test/server.rb | 42 -- test/tc_authenticate.rb | 15 +- test/tc_blank_form.rb | 23 + test/tc_checkboxes.rb | 2 +- 
test/tc_cookie_class.rb | 89 +++- test/tc_cookie_jar.rb | 55 +++ test/tc_cookies.rb | 6 +- test/tc_encoded_links.rb | 27 ++ test/tc_errors.rb | 8 + test/tc_follow_meta.rb | 32 ++ test/tc_form_action.rb | 52 +++ test/tc_form_as_hash.rb | 69 +++ test/tc_form_button.rb | 36 ++ test/tc_form_no_inputname.rb | 4 +- test/tc_forms.rb | 75 +++- test/tc_history.rb | 149 ++++++ test/tc_html_unscape_forms.rb | 46 ++ test/tc_if_modified_since.rb | 25 ++ test/tc_keep_alive.rb | 38 ++ test/tc_links.rb | 67 ++- test/tc_mech.rb | 57 +++ test/tc_no_attributes.rb | 2 +- test/tc_pluggable_parser.rb | 4 +- test/tc_referer.rb | 46 ++ test/tc_relative_links.rb | 47 ++ test/tc_response_code.rb | 20 + test/tc_save_file.rb | 31 ++ test/tc_subclass.rb | 28 ++ test/tc_upload.rb | 83 ++-- test/tc_watches.rb | 2 +- test/{ts_mech.rb => test_all.rb} | 61 ++- test/test_includes.rb | 116 +++++ test/test_mechanize_file.rb | 52 +++ test/{servlets.rb => test_servlets.rb} | 73 ++- 80 files changed, 3018 insertions(+), 725 deletions(-) delete mode 100644 CHANGELOG create mode 100644 CHANGELOG.txt rename EXAMPLES => EXAMPLES.txt (100%) create mode 100644 FAQ.txt rename GUIDE => GUIDE.txt (96%) rename LICENSE => LICENSE.txt (100%) create mode 100644 Manifest.txt rename NOTES => NOTES.txt (82%) rename README => README.txt (54%) create mode 100644 lib/mechanize/history.rb delete mode 100644 lib/mechanize/hpricot.rb delete mode 100644 lib/mechanize/mech_version.rb create mode 100644 lib/mechanize/monkey_patch.rb create mode 100644 lib/mechanize/parsers/rexml_page.rb create mode 100644 lib/mechanize/rexml.rb delete mode 100644 test/README create mode 100644 test/htdocs/empty_form.html create mode 100644 test/htdocs/link with space.html create mode 100644 test/htdocs/relative/tc_relative_links.html create mode 100644 test/htdocs/tc_base_link.html create mode 100644 test/htdocs/tc_blank_form.html create mode 100644 test/htdocs/tc_encoded_links.html create mode 100644 test/htdocs/tc_follow_meta.html create mode 
100644 test/htdocs/tc_form_action.html create mode 100644 test/htdocs/tc_links.html create mode 100644 test/htdocs/tc_referer.html create mode 100644 test/htdocs/tc_relative_links.html create mode 100644 test/htdocs/unusual______.html delete mode 100644 test/proxy.rb delete mode 100644 test/server.rb create mode 100644 test/tc_blank_form.rb create mode 100644 test/tc_encoded_links.rb create mode 100644 test/tc_follow_meta.rb create mode 100644 test/tc_form_action.rb create mode 100644 test/tc_form_as_hash.rb create mode 100644 test/tc_form_button.rb create mode 100644 test/tc_history.rb create mode 100644 test/tc_html_unscape_forms.rb create mode 100644 test/tc_if_modified_since.rb create mode 100644 test/tc_keep_alive.rb create mode 100644 test/tc_referer.rb create mode 100644 test/tc_relative_links.rb create mode 100644 test/tc_subclass.rb rename test/{ts_mech.rb => test_all.rb} (72%) create mode 100644 test/test_mechanize_file.rb rename test/{servlets.rb => test_servlets.rb} (73%) diff --git a/CHANGELOG b/CHANGELOG deleted file mode 100644 index f97a2fc0..00000000 --- a/CHANGELOG +++ /dev/null @@ -1,156 +0,0 @@ -= Mechanize CHANGELOG - -== 0.6.0 - -* Changed main parser to use hpricot -* Made WWW::Mechanize::Page class searchable like hpricot -* Updated WWW::Mechanize#click to support hpricot links like this: - @agent.click (page/"a").first -* Clicking a Frame is now possible: - @agent.click (page/"frame").first -* Removed deprecated attr_finder -* Removed REXML helper methods since the main parser is now hpricot -* Overhauled cookie parser to use WEBrick::Cookie - -== 0.5.4 - -* Added WWW::Mechanize#trasact for saving history state between in a - transaction. See the EXAMPLES file. Thanks Johan Kiviniemi. -* Added support for gzip compressed pages -* Forms can now be accessed like a hash. 
For example, to set the value - of an input field named 'name' to "Aaron", you can do this: - form['name'] = "Aaron" - Or to get the value of a field named 'name', do this: - puts form['name'] -* File uploads will now read the file specified in FileUpload#file_name -* FileUpload can use an IO object in FileUpload#file_data -* Fixed a bug with saving files on windows -* Fixed a bug with the filename being set in forms - -== 0.5.3 - -* Mechanize#click will now act on the first element of an array. So if an - array of links is passed to WWW::Mechanize#click, the first link is clicked. - That means the syntax for clicking links is shortened and still supports - selecting a link. The following are equivalent: - agent.click page.links.first - agent.click page.links -* Fixed a bug with spaces in href's and get's -* Added a tick, untick, and click method to radio buttons so that - radiobuttons can be "clicked" -* Added a tick, untick, and click method to check boxes so that - checkboxes can be "clicked" -* Options on Select lists can now be "tick"ed, and "untick"ed. -* Fixed a potential bug conflicting with rails. Thanks Eric Kolve -* Updated log4r support for a speed increase. Thanks Yinon Bentor -* Added inspect methods and pretty printing - -== 0.5.2 - -* Fixed a bug with input names that are nil -* Added a warning when using attr_finder because attr_finder will be deprecated - in 0.6.0 in favor of method calls. So this syntax: - @agent.links(:text => 'foo') - should be changed to this: - @agent.links.text('foo') -* Added support for selecting multiple options in select tags that support - multiple options. See WWW::Mechanize::MultiSelectList. -* New select list methods have been added, select_all, select_none. -* Options for select lists can now be "clicked" which toggles their selection, - they can be "selected" and "unselected". See WWW::Mechanize::Option -* Added a method to set multiple fields at the same time, - WWW::Mechanize::Form#set_fields. 
Which can be used like so: - form.set_fields( :foo => 'bar', :name => 'Aaron' ) - -== 0.5.1 - -* Fixed bug with file uploads -* Added performance tweaks to the cookie class - -== 0.5.0 - -* Added pluggable parsers. (Thanks to Eric Kolve for the idea) -* Changed namespace so all classes are under WWW::Mechanize. -* Updating Forms so that fields can be used as accessors (Thanks Gregory Brown) -* Added WWW::Mechanize::File as default object used for unknown content types. -* Added 'save_as' method to Mechanize::File, so any page can be saved. -* Adding 'save_as' and 'load' to CookieJar so that cookies can be saved - between sessions. -* Added WWW::Mechanize::FileSaver pluggable parser to automatically save files. -* Added WWW::Mechanize::Page#title for page titles -* Added OpenSSL certificate support (Thanks Mike Dalessio) -* Removed support for body filters in favor of pluggable parsers. -* Fixed cookie bug adding a '/' when the url is missing one (Thanks Nick Dainty) - -== 0.4.7 - -* Fixed bug with no action in forms. Thanks to Adam Wiggins -* Setting a default user-agent string -* Added house cleaning to the cookie jar so expired cookies don't stick around. -* Added new method WWW::Form#field to find the first field with a given name. - (thanks to Gregory Brown) -* Added WWW::Mechanize#get_file for fetching non text/html files - -== 0.4.6 - -* Added support for proxies -* Added a uri field to WWW::Link -* Added a error class WWW::Mechanize::ContentTypeError -* Added image alt text to link text -* Added an visited? method to WWW::Mechanize -* Added Array#value= which will set the first value to the argument. 
That - allows syntax as such: form.fields.name('q').value = 'xyz' - Before it was like this: form.fields.name('q').first.value = 'xyz' - -== 0.4.5 - -* Added support for multiple values of the same name -* Updated build_query_string to take an array of arrays (Thanks Michal Janeczek) -* Added WWW::Mechanize#body_filter= so that response bodies can be preprocessed -* Added WWW::Page#body_filter= so that response bodies can be preprocessed -* Added support for more date formats in the cookie parser -* Fixed a bug with empty select lists -* Fixing a problem with cookies not handling no spaces after semicolons - -== 0.4.4 - -* Fixed error in method signature, basic_authetication is now basic_auth -* Fixed bug with encoding names in file uploads (Big thanks to Alex Young) -* Added options to the select list - -== 0.4.3 - -* Added syntactic sugar for finding things -* Fixed bug with HttpOnly option in cookies -* Fixed a bug with cookie date parsing -* Defaulted dropdown lists to the first element -* Added unit tests - -== 0.4.2 - -* Added support for iframes -* Made mechanize dependant on ruby-web rather than narf -* Added unit tests -* Fixed a bunch of warnings - -== 0.4.1 - -* Added support for file uploading -* Added support for frames (Thanks Gabriel[mailto:leerbag@googlemail.com]) -* Added more unit tests -* Fixed some bugs - -== 0.4.0 - -* Added more unit tests -* Added a cookie jar with better cookie support, included expiration of cookies - and general cookie security. -* Updated mechanize to use built in net/http if ruby version is new enough. 
-* Added support for meta refresh tags -* Defaulted form actions to 'GET' -* Fixed various bugs -* Added more unit tests -* Added a response code exception -* Thanks to Brian Ellin (brianellin@gmail.com) for: - Added support for CA files, and support for 301 response codes - diff --git a/CHANGELOG.txt b/CHANGELOG.txt new file mode 100644 index 00000000..4f7d7517 --- /dev/null +++ b/CHANGELOG.txt @@ -0,0 +1,297 @@ += Mechanize CHANGELOG + +== 0.6.11 + +* Detecting single quotes in meta redirects. +* Adding pretty inspect for ruby versions > 1.8.4 (Thanks Joel Kociolek) + http://rubyforge.org/tracker/index.php?func=detail&aid=13150&group_id=1453&atid=5709 +* Fixed bug with file name in multipart posts + http://rubyforge.org/tracker/?func=detail&aid=15594&group_id=1453&atid=5709 +* Posting forms relative to the originating page. Thanks Mortee. +* Added a FAQ + http://rubyforge.org/tracker/?func=detail&aid=15772&group_id=1453&atid=5709 + +== 0.6.10 + +* Made digest authentication work with POSTs. +* Made sure page was HTML before following meta refreshes. + http://rubyforge.org/tracker/index.php?func=detail&aid=12260&group_id=1453&atid=5709 +* Made sure that URLS with a host and no path would default to '/' for history + purposes. + http://rubyforge.org/tracker/index.php?func=detail&aid=12368&group_id=1453&atid=5709 +* Avoiding memory leaks with transact. Thanks Tobias Gruetzmacher! + http://rubyforge.org/tracker/index.php?func=detail&aid=12057&group_id=1453&atid=5711 +* Fixing a problem with # signs in the file name. Thanks Tobias Gruetzmacher! + http://rubyforge.org/tracker/index.php?func=detail&aid=12510&group_id=1453&atid=5711 +* Made sure that blank form values are submitted. + http://rubyforge.org/tracker/index.php?func=detail&aid=12505&group_id=1453&atid=5709 +* Mechanize now respects the base tag. Thanks Stephan Dale. + http://rubyforge.org/tracker/index.php?func=detail&aid=12468&group_id=1453&atid=5709 +* Aliasing inspect to pretty_inspect. 
Thanks Eric Promislow. + http://rubyforge.org/pipermail/mechanize-users/2007-July/000157.html + +== 0.6.9 + +* Updating UTF-8 support for urls +* Adding AREA tags to the links list. + http://rubyforge.org/pipermail/mechanize-users/2007-May/000140.html +* WWW::Mechanize#follow_meta_refresh will allow you to automatically follow + meta refresh tags. [#10032] +* Adding x-gzip to accepted content-encoding. Thanks Simon Strandgaard + http://rubyforge.org/tracker/index.php?func=detail&aid=11167&group_id=1453&atid=5711 +* Added Digest Authentication support. Thanks to Ryan Davis and Eric Hodel, + you get a gold star! + +== 0.6.8 + +* Keep alive can be shut off now with WWW::Mechanize#keep_alive +* Conditional requests can be shut off with WWW::Mechanize#conditional_requests +* Monkey patched Net::HTTP#keep_alive? +* [#9877] Moved last request time. Thanks Max Stepanov +* Added WWW::Mechanize::File#save +* Defaulting file name to URI or Content-Disposition +* Updating compatibility with hpricot +* Added more unit tests + +== 0.6.7 + +* Fixed a bug with keep-alive requests +* [#9549] fixed problem with cookie paths + +== 0.6.6 + +* Removing hpricot overrides +* Fixed a bug where alt text can be nil. Thanks Yannick! +* Unparseable expiration dates in cookies are now treated as session cookies +* Caching connections +* Requests now default to keep alive +* [#9434] Fixed bug where html entities weren't decoded +* [#9150] Updated mechanize history to deal with redirects + +== 0.6.5 + +* Copying headers to a hash to prevent memory leaks +* Speeding up page parsing +* Aliased fields to elements +* Adding If-Modified-Since header +* Added delete_field! to form. Thanks to Sava Chankov +* Updated uri escaping to support high order characters. Thanks to Henrik Nyh. +* Better handling relative URIs. Thanks to Henrik Nyh +* Now handles pipes in URLs + http://rubyforge.org/tracker/?func=detail&aid=7140&group_id=1453&atid=5709 +* Now escaping html entities in form fields. 
+ http://rubyforge.org/tracker/?func=detail&aid=7563&group_id=1453&atid=5709 +* Added MSIE 7.0 user agent string + +== 0.6.4 + +* Adding the "redirect_ok" method to Mechanize to stop mechanize from + following redirects. + http://rubyforge.org/tracker/index.php?func=detail&aid=6571&group_id=1453&atid=5712 +* Added protected method Mechanize#set_headers so that subclasses can set + custom headers. + http://rubyforge.org/tracker/?func=detail&aid=7208&group_id=1453&atid=5712 +* Aliased Page#referer to Page#page +* Fixed a bug when clicking relative urls + http://rubyforge.org/pipermail/mechanize-users/2006-November/000035.html +* Fixing a bug when bad version or max age is passed to Cookie::parse + http://rubyforge.org/pipermail/mechanize-users/2006-November/000033.html +* Fixing a bug with response codes. [#6526] +* Fixed bug [#6548]. Input type of 'button' was not being added as a button. +* Fixed bug [#7139]. REXML parser calls hpricot parser by accident + +== 0.6.3 + +* Added keys and values methods to Form +* Added has_value? to Form +* Added a has_field? method to Form +* The add_field! method on Form now creates a field for you on the form. + Thanks to Mat Schaffer for the patch. + http://rubyforge.org/pipermail/mechanize-users/2006-November/000025.html +* Fixed a bug when form actions have html encoded entities in them. + http://rubyforge.org/pipermail/mechanize-users/2006-October/000019.html +* Fixed a bug when links or frame sources have html encoded entities in the + href or src. +* Fixed a bug where '#' symbols are encoded + http://rubyforge.org/forum/message.php?msg_id=14747 + +== 0.6.2 + +* Added a yield to Page#form so that dealing with forms can be more DSL like. +* Added the parsed page to the ResponseCodeError so that the parsed results + can be accessed even in the event of an error. 
+ http://rubyforge.org/pipermail/mechanize-users/2006-September/000007.html +* Updated documentation (Thanks to Paul Smith) + +== 0.6.1 + +* Added a method to Form called "submit". Now forms can be submitted by + calling a method on the form. +* Added a click method to links +* Added an REXML pluggable parser for backwards compatibility. To use it, + just do this: + agent.pluggable_parser.html = WWW::Mechanize::REXMLPage +* Fixed a bug with referrers by adding a page attribute to forms and links. +* Fixed a bug where domain names were case sensitive. + http://tenderlovemaking.com/2006/09/04/road-to-ruby-mechanize-060/#comment-53 +* Fixed a bug with URI escaped links. + http://rubyforge.org/pipermail/mechanize-users/2006-September/000002.html +* Fixed a bug when options in select lists don't have a value. Thanks Dan Higham + [#5837] Code in lib/mechanize/form_elements.rb is incorrect. +* Fixed a bug with loading text in to links. + http://rubyforge.org/pipermail/mechanize-users/2006-September/000000.html + +== 0.6.0 + +* Changed main parser to use hpricot +* Made WWW::Mechanize::Page class searchable like hpricot +* Updated WWW::Mechanize#click to support hpricot links like this: + @agent.click (page/"a").first +* Clicking a Frame is now possible: + @agent.click (page/"frame").first +* Removed deprecated attr_finder +* Removed REXML helper methods since the main parser is now hpricot +* Overhauled cookie parser to use WEBrick::Cookie + +== 0.5.4 + +* Added WWW::Mechanize#transact for saving history state within a + transaction. See the EXAMPLES file. Thanks Johan Kiviniemi. +* Added support for gzip compressed pages +* Forms can now be accessed like a hash. 
For example, to set the value + of an input field named 'name' to "Aaron", you can do this: + form['name'] = "Aaron" + Or to get the value of a field named 'name', do this: + puts form['name'] +* File uploads will now read the file specified in FileUpload#file_name +* FileUpload can use an IO object in FileUpload#file_data +* Fixed a bug with saving files on windows +* Fixed a bug with the filename being set in forms + +== 0.5.3 + +* Mechanize#click will now act on the first element of an array. So if an + array of links is passed to WWW::Mechanize#click, the first link is clicked. + That means the syntax for clicking links is shortened and still supports + selecting a link. The following are equivalent: + agent.click page.links.first + agent.click page.links +* Fixed a bug with spaces in href's and get's +* Added a tick, untick, and click method to radio buttons so that + radiobuttons can be "clicked" +* Added a tick, untick, and click method to check boxes so that + checkboxes can be "clicked" +* Options on Select lists can now be "tick"ed, and "untick"ed. +* Fixed a potential bug conflicting with rails. Thanks Eric Kolve +* Updated log4r support for a speed increase. Thanks Yinon Bentor +* Added inspect methods and pretty printing + +== 0.5.2 + +* Fixed a bug with input names that are nil +* Added a warning when using attr_finder because attr_finder will be deprecated + in 0.6.0 in favor of method calls. So this syntax: + @agent.links(:text => 'foo') + should be changed to this: + @agent.links.text('foo') +* Added support for selecting multiple options in select tags that support + multiple options. See WWW::Mechanize::MultiSelectList. +* New select list methods have been added, select_all, select_none. +* Options for select lists can now be "clicked" which toggles their selection, + they can be "selected" and "unselected". See WWW::Mechanize::Option +* Added a method to set multiple fields at the same time, + WWW::Mechanize::Form#set_fields. 
Which can be used like so: + form.set_fields( :foo => 'bar', :name => 'Aaron' ) + +== 0.5.1 + +* Fixed bug with file uploads +* Added performance tweaks to the cookie class + +== 0.5.0 + +* Added pluggable parsers. (Thanks to Eric Kolve for the idea) +* Changed namespace so all classes are under WWW::Mechanize. +* Updating Forms so that fields can be used as accessors (Thanks Gregory Brown) +* Added WWW::Mechanize::File as default object used for unknown content types. +* Added 'save_as' method to Mechanize::File, so any page can be saved. +* Adding 'save_as' and 'load' to CookieJar so that cookies can be saved + between sessions. +* Added WWW::Mechanize::FileSaver pluggable parser to automatically save files. +* Added WWW::Mechanize::Page#title for page titles +* Added OpenSSL certificate support (Thanks Mike Dalessio) +* Removed support for body filters in favor of pluggable parsers. +* Fixed cookie bug adding a '/' when the url is missing one (Thanks Nick Dainty) + +== 0.4.7 + +* Fixed bug with no action in forms. Thanks to Adam Wiggins +* Setting a default user-agent string +* Added house cleaning to the cookie jar so expired cookies don't stick around. +* Added new method WWW::Form#field to find the first field with a given name. + (thanks to Gregory Brown) +* Added WWW::Mechanize#get_file for fetching non text/html files + +== 0.4.6 + +* Added support for proxies +* Added a uri field to WWW::Link +* Added a error class WWW::Mechanize::ContentTypeError +* Added image alt text to link text +* Added an visited? method to WWW::Mechanize +* Added Array#value= which will set the first value to the argument. 
That + allows syntax as such: form.fields.name('q').value = 'xyz' + Before it was like this: form.fields.name('q').first.value = 'xyz' + +== 0.4.5 + +* Added support for multiple values of the same name +* Updated build_query_string to take an array of arrays (Thanks Michal Janeczek) +* Added WWW::Mechanize#body_filter= so that response bodies can be preprocessed +* Added WWW::Page#body_filter= so that response bodies can be preprocessed +* Added support for more date formats in the cookie parser +* Fixed a bug with empty select lists +* Fixing a problem with cookies not handling no spaces after semicolons + +== 0.4.4 + +* Fixed error in method signature, basic_authetication is now basic_auth +* Fixed bug with encoding names in file uploads (Big thanks to Alex Young) +* Added options to the select list + +== 0.4.3 + +* Added syntactic sugar for finding things +* Fixed bug with HttpOnly option in cookies +* Fixed a bug with cookie date parsing +* Defaulted dropdown lists to the first element +* Added unit tests + +== 0.4.2 + +* Added support for iframes +* Made mechanize dependant on ruby-web rather than narf +* Added unit tests +* Fixed a bunch of warnings + +== 0.4.1 + +* Added support for file uploading +* Added support for frames (Thanks Gabriel[mailto:leerbag@googlemail.com]) +* Added more unit tests +* Fixed some bugs + +== 0.4.0 + +* Added more unit tests +* Added a cookie jar with better cookie support, included expiration of cookies + and general cookie security. +* Updated mechanize to use built in net/http if ruby version is new enough. 
+* Added support for meta refresh tags +* Defaulted form actions to 'GET' +* Fixed various bugs +* Added more unit tests +* Added a response code exception +* Thanks to Brian Ellin (brianellin@gmail.com) for: + Added support for CA files, and support for 301 response codes + diff --git a/EXAMPLES b/EXAMPLES.txt similarity index 100% rename from EXAMPLES rename to EXAMPLES.txt diff --git a/FAQ.txt b/FAQ.txt new file mode 100644 index 00000000..7955aa42 --- /dev/null +++ b/FAQ.txt @@ -0,0 +1,11 @@ +Q: I keep getting an EOFError: + protocol.rb:133:in `sysread': end of file reached (EOFError) + +A: Some people have experienced an EOFError during normal mechanize usage. + Most of the time this occurs because the remote website claims to support + keep alives, but does not implement them correctly. Try turning off + keep alives on your mechanize object: + + mech.keep_alive = false + + diff --git a/GUIDE b/GUIDE.txt similarity index 96% rename from GUIDE rename to GUIDE.txt index c1f8ef6e..02f28d48 100644 --- a/GUIDE +++ b/GUIDE.txt @@ -34,19 +34,19 @@ Now that we've fetched google's homepage, lets try listing all of the links: We can list the links, but Mechanize gives a few shortcuts to help us find a link to click on. Lets say we wanted to click the link whose text is 'News'. Normally, we would have to do this: - page = agent.click page.links.find { |l| l.name == 'News' } + page = agent.click page.links.find { |l| l.text == 'News' } But Mechanize gives us a shortcut. Instead we can say this: - page = agent.click page.links.name('News') + page = agent.click page.links.text('News') That shortcut says "find all links with the name 'News'". You're probably thinking "there could be multiple links with that text!", and you would be correct! If you pass a list of links to the "click" method, Mechanize will click on the first one. 
If you wanted to click on the second news link, you could do this: - agent.click page.links.name('News')[1] + agent.click page.links.text('News')[1] We can even find a link with a certain href like so: page.links.href('/something') Or chain them together to find a link with certain text and certain href: - page.links.name('News').href('/something') + page.links.text('News').href('/something') These shortcuts that mechanize provides are available on any list that you can fetch like frames, iframes, or forms. Now that we know how to find and diff --git a/LICENSE b/LICENSE.txt similarity index 100% rename from LICENSE rename to LICENSE.txt diff --git a/Manifest.txt b/Manifest.txt new file mode 100644 index 00000000..5ca40bf3 --- /dev/null +++ b/Manifest.txt @@ -0,0 +1,125 @@ +CHANGELOG.txt +EXAMPLES.txt +FAQ.txt +GUIDE.txt +LICENSE.txt +Manifest.txt +NOTES.txt +README.txt +Rakefile +eg/flickr_upload.rb +eg/mech-dump.rb +eg/proxy_req.rb +eg/rubyforge.rb +eg/spider.rb +lib/mechanize.rb +lib/mechanize/cookie.rb +lib/mechanize/errors.rb +lib/mechanize/form.rb +lib/mechanize/form_elements.rb +lib/mechanize/history.rb +lib/mechanize/inspect.rb +lib/mechanize/list.rb +lib/mechanize/monkey_patch.rb +lib/mechanize/net-overrides/net/http.rb +lib/mechanize/net-overrides/net/https.rb +lib/mechanize/net-overrides/net/protocol.rb +lib/mechanize/page.rb +lib/mechanize/page_elements.rb +lib/mechanize/parsers/rexml_page.rb +lib/mechanize/pluggable_parsers.rb +lib/mechanize/rexml.rb +setup.rb +test/data/htpasswd +test/data/server.crt +test/data/server.csr +test/data/server.key +test/data/server.pem +test/htdocs/alt_text.html +test/htdocs/bad_form_test.html +test/htdocs/button.jpg +test/htdocs/empty_form.html +test/htdocs/file_upload.html +test/htdocs/find_link.html +test/htdocs/form_multi_select.html +test/htdocs/form_multival.html +test/htdocs/form_no_action.html +test/htdocs/form_no_input_name.html +test/htdocs/form_select.html +test/htdocs/form_select_all.html 
+test/htdocs/form_select_none.html +test/htdocs/form_select_noopts.html +test/htdocs/form_set_fields.html +test/htdocs/form_test.html +test/htdocs/frame_test.html +test/htdocs/google.html +test/htdocs/iframe_test.html +test/htdocs/index.html +test/htdocs/link with space.html +test/htdocs/no_title_test.html +test/htdocs/relative/tc_relative_links.html +test/htdocs/tc_bad_links.html +test/htdocs/tc_base_link.html +test/htdocs/tc_blank_form.html +test/htdocs/tc_checkboxes.html +test/htdocs/tc_encoded_links.html +test/htdocs/tc_follow_meta.html +test/htdocs/tc_form_action.html +test/htdocs/tc_links.html +test/htdocs/tc_no_attributes.html +test/htdocs/tc_pretty_print.html +test/htdocs/tc_radiobuttons.html +test/htdocs/tc_referer.html +test/htdocs/tc_relative_links.html +test/htdocs/tc_textarea.html +test/htdocs/unusual______.html +test/ssl_server.rb +test/tc_authenticate.rb +test/tc_bad_links.rb +test/tc_blank_form.rb +test/tc_checkboxes.rb +test/tc_cookie_class.rb +test/tc_cookie_jar.rb +test/tc_cookies.rb +test/tc_encoded_links.rb +test/tc_errors.rb +test/tc_follow_meta.rb +test/tc_form_action.rb +test/tc_form_as_hash.rb +test/tc_form_button.rb +test/tc_form_no_inputname.rb +test/tc_forms.rb +test/tc_frames.rb +test/tc_gzipping.rb +test/tc_history.rb +test/tc_html_unscape_forms.rb +test/tc_if_modified_since.rb +test/tc_keep_alive.rb +test/tc_links.rb +test/tc_mech.rb +test/tc_multi_select.rb +test/tc_no_attributes.rb +test/tc_page.rb +test/tc_pluggable_parser.rb +test/tc_post_form.rb +test/tc_pretty_print.rb +test/tc_proxy.rb +test/tc_radiobutton.rb +test/tc_referer.rb +test/tc_relative_links.rb +test/tc_response_code.rb +test/tc_save_file.rb +test/tc_select.rb +test/tc_select_all.rb +test/tc_select_none.rb +test/tc_select_noopts.rb +test/tc_set_fields.rb +test/tc_ssl_server.rb +test/tc_subclass.rb +test/tc_textarea.rb +test/tc_upload.rb +test/tc_watches.rb +test/test_all.rb +test/test_includes.rb +test/test_mechanize_file.rb +test/test_servlets.rb diff --git a/NOTES 
b/NOTES.txt similarity index 82% rename from NOTES rename to NOTES.txt index d09d6c12..50938054 100644 --- a/NOTES +++ b/NOTES.txt @@ -1,5 +1,63 @@ = Mechanize Release Notes +== 0.6.4 (Gwendolyn) + +Custom request headers can now be added to Mechanize by subclassing mechanize +and defining the Mechanize#set_headers method. For example: + class A < WWW::Mechanize + def set_headers(uri, request, cur_page) + super(uri, request, cur_page) + request.add_field('Cookie', 'name=Aaron') + request + end + end +The Mechanize#redirect_ok method has been added so that you can keep mechanize +from following redirects. + +== 0.6.3 (Big Man) + +Mechanize 0.6.3 (Big Man) has a few bug fixes and some new features added to +the Form class. The Form class is now more hash like. I've added a +Form#add_field! method that will add a field to your form. Form#[]= will now +add a field if the key doesn't exist. For example, if your form doesn't have +an input field named 'foo', the following 2 lines of code are equivalent, and +will create a field named 'foo': + form['foo'] = 'bar' +or + form.add_field!('foo', 'bar') +To make forms more hashlike, has_value? and has_key? methods have been added. + +== 0.6.2 (Bridget) + +Mechanize 0.6.2 (Bridget) is a fairly small bug fix release. You can now +access the parsed page when a ResponseCodeError is thrown. For example, this +loads a page that doesn't exist, but gives you access to the parsed 404 page: + begin + WWW::Mechanize.new().get('http://google.com/asdfasdfadsf.html') + rescue WWW::Mechanize::ResponseCodeError => ex + puts ex.page + end +Accessing forms is now more DSL like. When manipulating a form, for example, +you can use the following syntax: + page.form('formname') { |form| + form.first_name = "Aaron" + }.submit +Documentation has also been updated thanks to Paul Smith. + +== 0.6.1 (Chuck) + +Mechanize version 0.6.1 (Chuck) is done, and is ready for you to use. This +post "my trip to europe" release includes many bug fixes and a handful of +new features. 
+ +New features include, a submit method on forms, a click method on links, and an +REXML pluggable parser. Now you can submit a form just by calling a method on +the form, rather than passing the form to the submit method on the mech object. +The click method on links lets you click the link by calling a method on the +link rather than passing the link to the click method on the mech object. +Lastly, the REXML pluggable parser lets you use your pre-0.6.0 code with +0.6.1. See the CHANGELOG for more details. + == 0.6.0 (Rufus) WWW::Mechanize 0.6.0 aka Rufus is ready! This hpricot flavored pie has diff --git a/README b/README.txt similarity index 54% rename from README rename to README.txt index 73203f2e..1d39d1c9 100644 --- a/README +++ b/README.txt @@ -1,5 +1,9 @@ = WWW::Mechanize + http://mechanize.rubyforge.org/ + +== DESCRIPTION + The Mechanize library is used for automating interaction with websites. Mechanize automatically stores and sends cookies, follows redirects, can follow links, and submit forms. Form fields can be populated and @@ -16,8 +20,8 @@ Note that the files in the net-overrides/ directory are taken from Ruby 1.9.0. == Examples -If you are just starting, check out the GUIDE[link://files/GUIDE.html]. -Also, check out the EXAMPLES[link://files/EXAMPLES.html] file. +If you are just starting, check out the GUIDE[link://files/GUIDE_txt.html]. +Also, check out the EXAMPLES[link://files/EXAMPLES_txt.html] file. == Authors @@ -25,13 +29,23 @@ Original Code: Copyright (c) 2005 by Michael Neumann (mneumann@ntecs.de) New Code: -Copyright (c) 2006 by Aaron Patterson (aaronp@rubyforge.org) +Copyright (c) 2007 by Aaron Patterson (aaronp@rubyforge.org) This library comes with a shameless plug for employing me (Aaron[http://tenderlovemaking.com/]) programming Ruby, my favorite language! +== Acknowledgments + +This library was heavily influenced by its namesake in the perl world. 
A big +thanks goes to Andy Lester (andy@petdance.com), the author of the original +perl Mechanize which is available here[http://search.cpan.org/~petdance/WWW-Mechanize-1.20/]. Ruby Mechanize would not be around without you! + +Thank you to Michael Neumann for starting the Ruby version. Thanks to everyone +who's helped out in various ways. Finally, thank you to the people using this +library! + == License -This library is distributed under the GPL. Please see the LICENSE file. +This library is distributed under the GPL. Please see the LICENSE[link://files/LICENSE_txt.html] file. diff --git a/Rakefile b/Rakefile index 12764486..b11ca389 100644 --- a/Rakefile +++ b/Rakefile @@ -1,101 +1,60 @@ require 'rubygems' -require 'rake' -require 'rake/testtask' -require 'rake/gempackagetask' -require 'rake/rdoctask' -require 'rake/contrib/sshpublisher' - -def announce(msg='') - STDERR.puts msg -end - -PKG_BUILD = ENV['PKG_BUILD'] ? '.' + ENV['PKG_BUILD'] : '' -PKG_NAME = 'mechanize' -PKG_VERSION = '0.6.0' + PKG_BUILD -PKG_FILES = FileList["{doc,lib,test}/**/*"].exclude("rdoc").to_a - -spec = Gem::Specification.new do |s| - s.name = PKG_NAME - s.version = PKG_VERSION - s.author = "Aaron Patterson" - s.email = "aaronp@rubyforge.org" - s.homepage = "#{PKG_NAME}.rubyforge.org" - s.platform = Gem::Platform::RUBY - s.summary = "Mechanize provides automated web-browsing" - s.files = Dir.glob("{bin,test,lib,doc}/**/*").delete_if {|item| item.include?(".svn") } - s.require_path = "lib" - s.has_rdoc = true - s.extra_rdoc_files = ["README", "EXAMPLES", "CHANGELOG", "LICENSE", "NOTES", - "GUIDE"] - s.rdoc_options << "--main" << 'README' << "--title" << "'WWW::Mechanize RDoc'" - s.rubyforge_project = PKG_NAME - s.add_dependency('hpricot') -end - -Rake::GemPackageTask.new(spec) do |p| - p.gem_spec = spec - p.need_tar = true - p.need_zip = true -end - -Rake::RDocTask.new do |p| - p.main = "README" - p.rdoc_dir = "doc" - p.rdoc_files.include("README", "CHANGELOG", "LICENSE", "EXAMPLES", 
"NOTES", - "GUIDE", "lib/**/*.rb") - p.options << "--main" << 'README' << "--title" << "WWW::Mechanize RDoc" -end - -desc "Publish the API documentation" -task :pubrdoc => [ :rdoc ] do - Rake::SshDirPublisher.new( - "#{ENV['USER']}@rubyforge.org", - "/var/www/gforge-projects/#{PKG_NAME}/", - "doc" ).upload -end - -task :update_version do - announce "Updating Mechanize Version to #{PKG_VERSION}" - File.open("lib/mechanize/mech_version.rb", "w") do |f| - f.puts "module WWW" - f.puts " class Mechanize" - f.puts " Version = '#{PKG_VERSION}'" - f.puts " end" - f.puts "end" +require 'hoe' + +$LOAD_PATH.unshift File.join(File.dirname(__FILE__), "lib") +require 'mechanize' + +class MechHoe < Hoe + def define_tasks + super + + desc "Tag code" + task :tag do |p| + abort "Must supply VERSION=x.y.z" unless ENV['VERSION'] + v = ENV['VERSION'].gsub(/\./, '_') + + rf = RubyForge.new + user = rf.userconfig['username'] + + baseurl = "svn+ssh://#{user}@rubyforge.org//var/svn/#{name}" + sh "svn cp -m 'tagged REL-#{v}' . 
#{ baseurl }/tags/REL-#{ v }" + end + + desc "Branch code" + Rake::Task.define_task("branch") do |p| + abort "Must supply VERSION=x.y.z" unless ENV['VERSION'] + v = ENV['VERSION'].split(/\./)[0..1].join('_') + + rf = RubyForge.new + user = rf.userconfig['username'] + + baseurl = "svn+ssh://#{user}@rubyforge.org/var/svn/#{name}" + sh "svn cp -m'branched #{v}' #{baseurl}/trunk #{baseurl}/branches/RB-#{v}" + end + + desc "Update SSL Certificate" + Rake::Task.define_task('ssl_cert') do |p| + sh "openssl genrsa -des3 -out server.key 1024" + sh "openssl req -new -key server.key -out server.csr" + sh "cp server.key server.key.org" + sh "openssl rsa -in server.key.org -out server.key" + sh "openssl x509 -req -days 365 -in server.csr -signkey server.key -out server.crt" + sh "cp server.key server.pem" + sh "mv server.key server.csr server.crt server.pem test/data/" + sh "rm server.key.org" + end end - sh 'svn commit -m"updating version" lib/mechanize/mech_version.rb' end -desc "Create a new release" -task :release => [ :clobber, :update_version, :package, :tag ] do - announce - announce "**************************************************************" - announce "* Release #{PKG_VERSION} Complete." - announce "* Packages ready to upload." - announce "**************************************************************" - announce +MechHoe.new('mechanize', WWW::Mechanize::VERSION) do |p| + p.rubyforge_name = 'mechanize' + p.author = 'Aaron Patterson' + p.email = 'aaronp@rubyforge.org' + p.summary = "Mechanize provides automated web-browsing" + p.description = p.paragraphs_of('README.txt', 3).join("\n\n") + p.url = p.paragraphs_of('README.txt', 1).first.strip + p.changes = p.paragraphs_of('CHANGELOG.txt', 0..2).join("\n\n") + p.extra_deps = [['hpricot', '>= 0.5.0']] end -desc "Tag code" -Rake::Task.define_task("tag") do |p| - baseurl = "svn+ssh://#{ENV['USER']}@rubyforge.org//var/svn/#{PKG_NAME}" - sh "svn cp -m 'tagged #{ PKG_VERSION }' . 
#{ baseurl }/tags/REL-#{ PKG_VERSION }" -end - -desc "Branch code" -Rake::Task.define_task("branch") do |p| - baseurl = "svn+ssh://#{ENV['USER']}@rubyforge.org/var/svn/#{PKG_NAME}" - sh "svn cp -m 'branched #{ PKG_VERSION }' #{baseurl}/trunk #{ baseurl }/branches/RB-#{ PKG_VERSION }" -end -desc "Update SSL Certificate" -Rake::Task.define_task('ssl_cert') do |p| - sh "openssl genrsa -des3 -out server.key 1024" - sh "openssl req -new -key server.key -out server.csr" - sh "cp server.key server.key.org" - sh "openssl rsa -in server.key.org -out server.key" - sh "openssl x509 -req -days 365 -in server.csr -signkey server.key -out server.crt" - sh "cp server.key server.pem" - sh "mv server.key server.csr server.crt server.pem test/data/" - sh "rm server.key.org" -end diff --git a/lib/mechanize.rb b/lib/mechanize.rb index d258a28b..1df8e87a 100644 --- a/lib/mechanize.rb +++ b/lib/mechanize.rb @@ -11,20 +11,31 @@ unless RUBY_VERSION > "1.8.2" $LOAD_PATH.unshift File.join(File.dirname(__FILE__), "mechanize", "net-overrides") end + require 'net/http' require 'net/https' +# Monkey patch for ruby 1.8.4 +unless RUBY_VERSION > "1.8.4" +module Net # :nodoc: + class HTTPResponse # :nodoc: + CODE_TO_OBJ['500'] = HTTPInternalServerError + end +end +end + require 'uri' require 'webrick/httputils' require 'zlib' require 'stringio' -require 'mechanize/hpricot' -require 'mechanize/mech_version' +require 'digest/md5' +require 'mechanize/monkey_patch' require 'mechanize/cookie' require 'mechanize/errors' require 'mechanize/pluggable_parsers' require 'mechanize/form' require 'mechanize/form_elements' +require 'mechanize/history' require 'mechanize/list' require 'mechanize/page' require 'mechanize/page_elements' @@ -50,20 +61,27 @@ module WWW # search_results = agent.submit(search_form) # puts search_results.body class Mechanize + ## + # The version of Mechanize you are using. 
+ + VERSION = '0.6.11' + + ## + # User Agent aliases AGENT_ALIASES = { 'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)', + 'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)', 'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6', 'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418 (KHTML, like Gecko) Safari/417.9.3', 'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3', 'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401', 'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624', 'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)', - 'Mechanize' => "WWW-Mechanize/#{Version} (http://rubyforge.org/projects/mechanize/)" + 'Mechanize' => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)" } attr_accessor :cookie_jar attr_accessor :log - attr_accessor :max_history attr_accessor :open_timeout, :read_timeout attr_accessor :user_agent attr_accessor :watch_for_set @@ -71,15 +89,24 @@ class Mechanize attr_accessor :key attr_accessor :cert attr_accessor :pass + attr_accessor :redirect_ok + attr_accessor :keep_alive_time + attr_accessor :keep_alive + attr_accessor :conditional_requests + attr_accessor :follow_meta_refresh attr_reader :history attr_reader :pluggable_parser + alias :follow_redirect? 
:redirect_ok + + @@nonce_count = -1 + CNONCE = Digest::MD5.hexdigest("%x" % (Time.now.to_i + rand(65535))) + def initialize # attr_accessors - @cookie_jar = CookieJar.new + @cookie_jar = CookieJar.new @log = nil - @max_history = nil @open_timeout = nil @read_timeout = nil @user_agent = AGENT_ALIASES['Mechanize'] @@ -88,14 +115,17 @@ def initialize @cert = nil # OpenSSL Certificate @key = nil # OpenSSL Private Key @pass = nil # OpenSSL Password + @redirect_ok = true # Should we follow redirects? # attr_readers - @history = [] + @history = WWW::Mechanize::History.new @pluggable_parser = PluggableParser.new - # Basic Auth variables - @user = nil # Basic Auth User - @password = nil # Basic Auth Password + # Auth variables + @user = nil # Auth User + @password = nil # Auth Password + @digest = nil # DigestAuth Digest + @auth_hash = {} # Keep track of urls for sending auth # Proxy settings @proxy_addr = nil @@ -103,9 +133,21 @@ def initialize @proxy_port = nil @proxy_user = nil + @conditional_requests = true + + @follow_meta_refresh = false + + # Connection Cache & Keep alive + @connection_cache = {} + @keep_alive_time = 300 + @keep_alive = true + yield self if block_given? end + def max_history=(length); @history.max_size = length; end + def max_history; @history.max_size; end + # Sets the proxy address, port, user, and password def set_proxy(addr, port, user = nil, pass = nil) @proxy_addr, @proxy_port, @proxy_user, @proxy_pass = addr, port, user, pass @@ -124,18 +166,23 @@ def cookies # Sets the user and password to be used for basic authentication. def basic_auth(user, password) - @user = user - @password = password + auth(user, password) + end + + def auth(user, password) + @user = user + @password = password end # Fetches the URL passed in and returns a page. 
- def get(url) - cur_page = current_page || Page.new( nil, {'content-type'=>'text/html'}) + def get(url, referer=nil, &block) + cur_page = referer || current_page || + Page.new( nil, {'content-type'=>'text/html'}) # fetch the page abs_uri = to_absolute_uri(url, cur_page) request = fetch_request(abs_uri) - page = fetch_page(abs_uri, request, cur_page) + page = fetch_page(abs_uri, request, cur_page, &block) add_to_history(page) page end @@ -149,10 +196,17 @@ def get_file(url) # Clicks the WWW::Mechanize::Link object passed in and returns the # page fetched. def click(link) + referer = + begin + link.page + rescue + nil + end uri = to_absolute_uri( - link.attributes['href'] || link.attributes['src'] || link.href + link.attributes['href'] || link.attributes['src'] || link.href, + referer || current_page() ) - get(uri) + get(uri, referer) end # Equivalent to the browser back button. Returns the most recent page @@ -169,9 +223,8 @@ def back # agent.post('http://example.com/', [ ["foo", "bar"] ]) def post(url, query={}) node = Hpricot::Elem.new(Hpricot::STag.new('form')) - node.attributes = {} - node.attributes['method'] = 'POST' - node.attributes['enctype'] = 'application/x-www-form-urlencoded' + node['method'] = 'POST' + node['enctype'] = 'application/x-www-form-urlencoded' form = Form.new(node) query.each { |k,v| @@ -188,17 +241,12 @@ def post(url, query={}) # agent.submit(page.forms.first, page.forms.first.buttons.first) def submit(form, button=nil) form.add_button_to_query(button) if button - uri = to_absolute_uri(form.action) + uri = to_absolute_uri(form.action, form.page) case form.method.upcase when 'POST' post_form(uri, form) when 'GET' - if uri.query.nil? 
- uri.query = WWW::Mechanize.build_query_string(form.build_query) - else - uri.query = uri.query + "&" + - WWW::Mechanize.build_query_string(form.build_query) - end + uri.query = WWW::Mechanize.build_query_string(form.build_query) get(uri) else raise "unsupported method: #{form.method.upcase}" @@ -212,9 +260,15 @@ def current_page # Returns whether or not a url has been visited def visited?(url) - url = url.uri if url.respond_to? :uri - uri = to_absolute_uri(url).to_s - ! @history.find { |h| h.uri.to_s == uri }.nil? + ! visited_page(url).nil? + end + + # Returns a visited page for the url passed in, otherwise nil + def visited_page(url) + if url.respond_to? :href + url = url.href + end + @history.visited_page(to_absolute_uri(url)) end # Runs given block, then resets the page history as it was before. self is @@ -230,22 +284,136 @@ def transact alias :page :current_page + protected + def set_headers(uri, request, cur_page) + if @keep_alive + request.add_field('Connection', 'keep-alive') + request.add_field('Keep-Alive', keep_alive_time.to_s) + else + request.add_field('Connection', 'close') + end + request.add_field('Accept-Encoding', 'gzip,identity') + request.add_field('Accept-Language', 'en-us,en;q0.5') + request.add_field('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7') + + unless @cookie_jar.empty?(uri) + cookies = @cookie_jar.cookies(uri) + cookie = cookies.length > 0 ? cookies.join("; ") : nil + if log + cookies.each do |c| + log.debug("using cookie: #{c}") + end + end + request.add_field('Cookie', cookie) + end + + # Add Referer header to request + unless cur_page.uri.nil? 
+ request.add_field('Referer', cur_page.uri.to_s) + end + + # Add User-Agent header to request + request.add_field('User-Agent', @user_agent) if @user_agent + + # Add If-Modified-Since if page is in history + if @conditional_requests + if( (page = visited_page(uri)) && page.response['Last-Modified'] ) + request.add_field('If-Modified-Since', page.response['Last-Modified']) + end + end + + if( @auth_hash[uri.host] ) + case @auth_hash[uri.host] + when :basic + request.basic_auth(@user, @password) + when :digest + @digest_response ||= nil + @digest_response = self.gen_auth_header(uri,request,@digest) if @digest + request.add_field('Authorization', @digest_response) if @digest_response + end + end + + request + end + + def gen_auth_header(uri, request, auth_header, is_IIS = false) + @@nonce_count += 1 + + user = @digest_user + password = @digest_password + + auth_header =~ /^(\w+) (.*)/ + + params = {} + $2.gsub(/(\w+)="(.*?)"/) { params[$1] = $2 } + + a_1 = "#{@user}:#{params['realm']}:#{@password}" + a_2 = "#{request.method}:#{uri.path}" + request_digest = '' + request_digest << Digest::MD5.hexdigest(a_1) + request_digest << ':' << params['nonce'] + request_digest << ':' << ('%08x' % @@nonce_count) + request_digest << ':' << CNONCE + request_digest << ':' << params['qop'] + request_digest << ':' << Digest::MD5.hexdigest(a_2) + + header = '' + header << "Digest username=\"#{@user}\", " + header << "realm=\"#{params['realm']}\", " + if is_IIS then + header << "qop=\"#{params['qop']}\", " + else + header << "qop=#{params['qop']}, " + end + header << "uri=\"#{uri.path}\", " + header << "algorithm=MD5, " + header << "nonce=\"#{params['nonce']}\", " + header << "nc=#{'%08x' % @@nonce_count}, " + header << "cnonce=\"#{CNONCE}\", " + header << "response=\"#{Digest::MD5.hexdigest(request_digest)}\"" + + return header + end + private def to_absolute_uri(url, cur_page=current_page()) - url = URI.parse(URI.escape(url.to_s.strip)) unless url.is_a? URI + unless url.is_a? 
URI + url = url.to_s.strip.gsub(/[^#{0.chr}-#{125.chr}]/) { |match| + sprintf('%%%X', match.unpack($KCODE == 'UTF8' ? 'U' : 'c')[0]) + } + + url = URI.parse( + Util.html_unescape( + url.split(/%[0-9A-Fa-f]{2}|#/).zip( + url.scan(/%[0-9A-Fa-f]{2}|#/) + ).map { |x,y| + "#{URI.escape(x)}#{y}" + }.join('') + ) + ) + end + + url.path = '/' if url.path.length == 0 # construct an absolute uri if url.relative? raise 'no history. please specify an absolute URL' unless cur_page.uri + base = cur_page.respond_to?(:bases) ? cur_page.bases.last : nil + url = ((base && base.uri && base.uri.absolute?) ? + base.uri : + cur_page.uri) + url url = cur_page.uri + url + # Strip initial "/.." bits from the path + url.path.sub!(/^(\/\.\.)+(?=\/)/, '') end return url end def post_form(url, form) - cur_page = current_page || Page.new(nil, {'content-type'=>'text/html'}) + cur_page = form.page || current_page || + Page.new( nil, {'content-type'=>'text/html'}) request_data = form.request_data @@ -276,19 +444,32 @@ def fetch_request(uri, type = :get) def fetch_page(uri, request, cur_page=current_page(), request_data=[]) raise "unsupported scheme" unless ['http', 'https'].include?(uri.scheme) - log.info("#{ request.class }: #{ uri.to_s }") if log + log.info("#{ request.class }: #{ request.path }") if log page = nil - http_obj = Net::HTTP.new( uri.host, - uri.port, - @proxy_addr, - @proxy_port, - @proxy_user, - @proxy_pass - ) + cache_obj = (@connection_cache["#{uri.host}:#{uri.port}"] ||= { + :connection => nil, + :keep_alive_options => {}, + }) + http_obj = cache_obj[:connection] + if http_obj.nil? || ! http_obj.started? 
+ http_obj = cache_obj[:connection] = + Net::HTTP.new( uri.host, + uri.port, + @proxy_addr, + @proxy_port, + @proxy_user, + @proxy_pass + ) + cache_obj[:keep_alive_options] = {} + + # Specify timeouts if given + http_obj.open_timeout = @open_timeout if @open_timeout + http_obj.read_timeout = @read_timeout if @read_timeout + end - if uri.scheme == 'https' + if uri.scheme == 'https' && ! http_obj.started? http_obj.use_ssl = true http_obj.verify_mode = OpenSSL::SSL::VERIFY_NONE if @ca_file @@ -301,28 +482,24 @@ def fetch_page(uri, request, cur_page=current_page(), request_data=[]) end end - request.add_field('Accept-Encoding', 'gzip,identity') + # If we're keeping connections alive and the last request time is too + # long ago, stop the connection. Or, if the max requests left is 1, + # reset the connection. + if @keep_alive && http_obj.started? + opts = cache_obj[:keep_alive_options] + if((opts[:timeout] && + Time.now.to_i - cache_obj[:last_request_time] > opts[:timeout].to_i) || + opts[:max] && opts[:max].to_i == 1) - unless @cookie_jar.empty?(uri) - cookies = @cookie_jar.cookies(uri) - cookie = cookies.length > 0 ? cookies.join("; ") : nil - if log - cookies.each do |c| - log.debug("using cookie: #{c}") - end - end - request.add_field('Cookie', cookie) - end + log.debug('Finishing stale connection') if log + http_obj.finish - # Add Referer header to request - unless cur_page.uri.nil? - request.add_field('Referer', cur_page.uri.to_s) + end end - # Add User-Agent header to request - request.add_field('User-Agent', @user_agent) if @user_agent + http_obj.start unless http_obj.started? 
- request.basic_auth(@user, @password) if @user || @password + request = set_headers(uri, request, cur_page) # Log specified headers for the request if log @@ -331,76 +508,115 @@ def fetch_page(uri, request, cur_page=current_page(), request_data=[]) end end - http_obj.start { |http| - # Specify timeouts if given - http.open_timeout = @open_timeout if @open_timeout - http.read_timeout = @read_timeout if @read_timeout - - # Send the request - http.request(request, *request_data) {|response| - if log - response.each_header {|k,v| - log.debug("response-header: #{ k } => #{ v }") - } - end + cache_obj[:last_request_time] = Time.now.to_i - (response.get_fields('Set-Cookie')||[]).each do |cookie| - Cookie::parse(uri, cookie) { |c| - log.debug("saved cookie: #{c}") if log - @cookie_jar.add(uri, c) - } - end + # Send the request + response = http_obj.request(request, *request_data) {|response| + + body = StringIO.new + total = 0 + response.read_body { |part| + total += part.length + body.write(part) + log.debug("Read #{total} bytes") if log + } + body.rewind - response.read_body + response.each_header { |k,v| + log.debug("response-header: #{ k } => #{ v }") + } if log - content_type = nil - unless response['Content-Type'].nil? - data = response['Content-Type'].match(/^([^;]*)/) - content_type = data[1].downcase unless data.nil? - end + content_type = nil + unless response['Content-Type'].nil? + data = response['Content-Type'].match(/^([^;]*)/) + content_type = data[1].downcase unless data.nil? 
+ end - response_body = - if encoding = response['Content-Encoding'] - case encoding.downcase - when 'gzip' - log.debug('gunzip body') if log - Zlib::GzipReader.new(StringIO.new(response.body)).read - else - raise 'Unsupported content encoding' - end + response_body = + if encoding = response['Content-Encoding'] + case encoding.downcase + when 'gzip' + log.debug('gunzip body') if log + Zlib::GzipReader.new(body).read + when 'x-gzip' + body.read else - response.body + raise 'Unsupported content encoding' end + else + body.read + end - # Find our pluggable parser - page = @pluggable_parser.parser(content_type).new( - uri, - response, - response_body, - response.code - ) + # Find our pluggable parser + page = @pluggable_parser.parser(content_type).new( + uri, + response, + response_body, + response.code + ) { |parser| + parser.mech = self if parser.respond_to? :mech= + if parser.respond_to?(:watch_for_set=) && @watch_for_set + parser.watch_for_set = @watch_for_set + end + } - log.info("status: #{ page.code }") if log + } - if page.respond_to? 
:watch_for_set - page.watch_for_set = @watch_for_set - end + # If the server sends back keep alive options, save them + if keep_alive_info = response['keep-alive'] + keep_alive_info.split(/,\s*/).each do |option| + k, v = option.split(/=/) + cache_obj[:keep_alive_options] ||= {} + cache_obj[:keep_alive_options][k.intern] = v + end + end - case page.code - when "200" - return page - when "301", "302" - log.info("follow redirect to: #{ response['Location'] }") if log - abs_uri = to_absolute_uri( - URI.parse( - URI.escape(URI.unescape(response['Location'].to_s))), page) - request = fetch_request(abs_uri) - return fetch_page(abs_uri, request, page) - else - raise ResponseCodeError.new(page.code), "Unhandled response", caller - end + (response.get_fields('Set-Cookie')||[]).each do |cookie| + Cookie::parse(uri, cookie, log) { |c| + log.debug("saved cookie: #{c}") if log + @cookie_jar.add(uri, c) } - } + end + + log.info("status: #{ page.code }") if log + + res_klass = Net::HTTPResponse::CODE_TO_OBJ[page.code.to_s] + + if follow_meta_refresh && page.respond_to?(:meta) && + (redirect = page.meta.first) + return redirect.click + end + + return page if res_klass <= Net::HTTPSuccess + + if res_klass == Net::HTTPNotModified + log.debug("Got cached page") if log + return visited_page(uri) + elsif res_klass <= Net::HTTPRedirection + return page unless follow_redirect? 
+ log.info("follow redirect to: #{ response['Location'] }") if log + from_uri = page.uri + abs_uri = to_absolute_uri(response['Location'].to_s, page) + page = fetch_page(abs_uri, fetch_request(abs_uri), page) + @history.push(page, from_uri) + return page + elsif res_klass <= Net::HTTPUnauthorized + raise ResponseCodeError.new(page) unless @user || @password + raise ResponseCodeError.new(page) if @auth_hash.has_key?(uri.host) + if response['www-authenticate'] =~ /Digest/i + @auth_hash[uri.host] = :digest + @digest = response['www-authenticate'] + else + @auth_hash[uri.host] = :basic + end + return fetch_page( uri, + fetch_request(uri, request.method.downcase.to_sym), + cur_page, + request_data + ) + end + + raise ResponseCodeError.new(page), "Unhandled response", caller end def self.build_query_string(parameters) @@ -416,12 +632,26 @@ def self.build_query_string(parameters) end def add_to_history(page) - @history.push(page) - if @max_history and @history.size > @max_history - # keep only the last @max_history entries - @history = @history[@history.size - @max_history, @max_history] + @history.push(page, to_absolute_uri(page.uri)) + end + + # :stopdoc: + class Util + def self.html_unescape(s) + return s unless s + s.gsub(/&(\w+|#[0-9]+);/) { |match| + number = case match + when /&(\w+);/ + Hpricot::NamedCharacters[$1] + when /&#([0-9]+);/ + $1.to_i + end + + number ? ([number].pack('U') rescue match) : match + } end end + # :startdoc: end end # module WWW diff --git a/lib/mechanize/cookie.rb b/lib/mechanize/cookie.rb index 15996694..639cc5de 100644 --- a/lib/mechanize/cookie.rb +++ b/lib/mechanize/cookie.rb @@ -6,9 +6,8 @@ module WWW class Mechanize # This class is used to represent an HTTP Cookie. 
class Cookie < WEBrick::Cookie - def self.parse(uri, str) - cookies = [] - str.gsub(/(,([^;,]*=)|,$)/) { "\r\n#{$2}" }.split(/\r\n/).each { |c| + def self.parse(uri, str, log = nil) + return str.split(/,(?=[^;,]*=)|,$/).collect { |c| cookie_elem = c.split(/;/) first_elem = cookie_elem.shift first_elem.strip! @@ -24,25 +23,38 @@ def self.parse(uri, str) when "domain" then cookie.domain = value.sub(/^\./, '') when "path" then cookie.path = value when 'expires' - cookie.expires = begin - Time::parse(value) + begin + cookie.expires = Time::parse(value) + rescue + if log + log.warn("Couldn't parse expires: #{value}") + end + end + when "max-age" then + begin + cookie.max_age = Integer(value) rescue - Time.now + log.warn("Couldn't parse max age '#{value}'") if log + cookie.max_age = nil end - when "max-age" then cookie.max_age = Integer(value) when "comment" then cookie.comment = value - when "version" then cookie.version = Integer(value) + when "version" then + begin + cookie.version = Integer(value) + rescue + log.warn("Couldn't parse version '#{value}'") if log + cookie.version = nil + end when "secure" then cookie.secure = true end } - cookie.path ||= uri.path + + cookie.path ||= uri.path.to_s.sub(/[^\/]*$/, '') cookie.secure ||= false cookie.domain ||= uri.host # Move this in to the cookie jar yield cookie if block_given? - cookies << cookie } - return cookies end def to_s @@ -61,12 +73,13 @@ def initialize # Add a cookie to the Jar. def add(uri, cookie) - return unless uri.host =~ /#{cookie.domain}$/ - unless @jar.has_key?(cookie.domain) - @jar[cookie.domain] = Hash.new + return unless uri.host =~ /#{cookie.domain}$/i + normal_domain = cookie.domain.downcase + unless @jar.has_key?(normal_domain) + @jar[normal_domain] = Hash.new end - @jar[cookie.domain][cookie.name] = cookie + @jar[normal_domain][cookie.name] = cookie cleanup() cookie end @@ -77,7 +90,7 @@ def cookies(url) cookies = [] url.path = '/' if url.path.empty? 
@jar.each_key do |domain| - if url.host =~ /#{domain}$/ + if url.host =~ /#{domain}$/i @jar[domain].each_key do |name| if url.path =~ /^#{@jar[domain][name].path}/ if @jar[domain][name].expires.nil? diff --git a/lib/mechanize/errors.rb b/lib/mechanize/errors.rb index 97e12068..8396156a 100644 --- a/lib/mechanize/errors.rb +++ b/lib/mechanize/errors.rb @@ -20,10 +20,18 @@ def initialize(content_type) # Any other response code is up to the user to handle. class ResponseCodeError < RuntimeError attr_reader :response_code + attr_reader :page - def initialize(response_code) - @response_code = response_code + def initialize(page) + @page = page + @response_code = page.code end + + def to_s + "#{response_code} => #{Net::HTTPResponse::CODE_TO_OBJ[response_code]}" + end + + def inspect; to_s; end end end end diff --git a/lib/mechanize/form.rb b/lib/mechanize/form.rb index fff19965..ba19a5ed 100644 --- a/lib/mechanize/form.rb +++ b/lib/mechanize/form.rb @@ -26,15 +26,16 @@ class GlobalForm attr_reader :fields, :buttons, :file_uploads, :radiobuttons, :checkboxes attr_reader :enctype + + alias :elements :fields def initialize(form_node, elements_node) @form_node, @elements_node = form_node, elements_node - @form_node.attributes ||= {} - @method = (@form_node.attributes['method'] || 'GET').upcase - @action = @form_node.attributes['action'] - @name = @form_node.attributes['name'] - @enctype = @form_node.attributes['enctype'] || 'application/x-www-form-urlencoded' + @method = (@form_node['method'] || 'GET').upcase + @action = Util::html_unescape(@form_node['action']) + @name = @form_node['name'] + @enctype = @form_node['enctype'] || 'application/x-www-form-urlencoded' @clicked_buttons = [] parse @@ -47,7 +48,6 @@ def build_query(buttons = []) query = [] fields().each do |f| - next unless f.value query.push(*f.query_value) end @@ -105,6 +105,11 @@ def request_data end end + # Removes all fields with name +field_name+. 
+ def delete_field!(field_name) + @fields.delete_if{ |f| f.name == field_name} + end + private def parse @fields = WWW::Mechanize::List.new @@ -115,41 +120,40 @@ def parse # Find all input tags (@elements_node/'input').each do |node| - node.attributes ||= {} - type = (node.attributes['type'] || 'text').downcase - name = node.attributes['name'] - next if type != 'submit' && name.nil? + type = (node['type'] || 'text').downcase + name = node['name'] + next if name.nil? && !(type == 'submit' || type =='button') case type when 'text', 'password', 'hidden', 'int' - @fields << Field.new(node.attributes['name'], node.attributes['value'] || '') + @fields << Field.new(node['name'], node['value'] || '') when 'radio' - @radiobuttons << RadioButton.new(node.attributes['name'], node.attributes['value'], node.attributes.has_key?('checked'), self) + @radiobuttons << RadioButton.new(node['name'], node['value'], node.has_attribute?('checked'), self) when 'checkbox' - @checkboxes << CheckBox.new(node.attributes['name'], node.attributes['value'], node.attributes.has_key?('checked'), self) + @checkboxes << CheckBox.new(node['name'], node['value'], node.has_attribute?('checked'), self) when 'file' - @file_uploads << FileUpload.new(node.attributes['name'], nil) + @file_uploads << FileUpload.new(node['name'], nil) when 'submit' - @buttons << Button.new(node.attributes['name'], node.attributes['value']) + @buttons << Button.new(node['name'], node['value']) + when 'button' + @buttons << Button.new(node['name'], node['value']) when 'image' - @buttons << ImageButton.new(node.attributes['name'], node.attributes['value']) + @buttons << ImageButton.new(node['name'], node['value']) end end # Find all textarea tags (@elements_node/'textarea').each do |node| - next if node.attributes.nil? - next if node.attributes['name'].nil? - @fields << Field.new(node.attributes['name'], node.all_text) + next if node['name'].nil? 
+ @fields << Field.new(node['name'], node.inner_text) end # Find all select tags (@elements_node/'select').each do |node| - next if node.attributes.nil? - next if node.attributes['name'].nil? - if node.attributes.has_key? 'multiple' - @fields << MultiSelectList.new(node.attributes['name'], node) + next if node['name'].nil? + if node.has_attribute? 'multiple' + @fields << MultiSelectList.new(node['name'], node) else - @fields << SelectList.new(node.attributes['name'], node) + @fields << SelectList.new(node['name'], node) end end end @@ -172,9 +176,10 @@ def param_to_multipart(name, value) end def file_to_multipart(file) + file_name = file.file_name ? ::File.basename(file.file_name) : '' body = "Content-Disposition: form-data; name=\"" + "#{mime_value_quote(file.name)}\"; " + - "filename=\"#{mime_value_quote(file.file_name || '')}\"\r\n" + + "filename=\"#{mime_value_quote(file_name)}\"\r\n" + "Content-Transfer-Encoding: binary\r\n" if file.file_data.nil? and ! file.file_name.nil? @@ -212,17 +217,39 @@ def file_to_multipart(file) # puts form['name'] class Form < GlobalForm attr_reader :node + attr_reader :page - def initialize(node) - @node = node - super(@node, @node) + def initialize(node, mech=nil, page=nil) + super(node, node) + @page = page + @mech = mech + end + + # Returns whether or not the form contains a field with +field_name+ + def has_field?(field_name) + ! fields.find { |f| f.name.eql? field_name }.nil? end - # Fetch the first field whose name is equal to field_name + alias :has_key? :has_field? + + def has_value?(value) + ! fields.find { |f| f.value.eql? value }.nil? + end + + def keys; fields.map { |f| f.name }; end + + def values; fields.map { |f| f.value }; end + + # Fetch the first field whose name is equal to +field_name+ def field(field_name) fields.find { |f| f.name.eql? 
field_name } end + # Add a field with +field_name+ and +value+ + def add_field!(field_name, value = nil) + fields << WWW::Mechanize::Field.new(field_name, value) + end + # This method sets multiple fields on the form. It takes a list of field # name, value pairs. If there is more than one field found with the # same name, this method will set the first one found. If you want to @@ -248,7 +275,8 @@ def set_fields(fields = {}) # Fetch the value set in the input field 'name' # puts form['name'] def [](field_name) - field(field_name).value + f = field(field_name) + f && f.value end # Set the value of the first input field with the name passed in @@ -256,7 +284,12 @@ def [](field_name) # Set the value in the input field 'name' to "Aaron" # form['name'] = 'Aaron' def []=(field_name, value) - field(field_name).value = value + f = field(field_name) + if f.nil? + add_field!(field_name, value) + else + f.value = value + end end # Treat form fields like accessors. @@ -268,6 +301,11 @@ def method_missing(id,*args) end super end + + # Submit this form with the button passed in + def submit(button=nil) + @mech.submit(self, button) + end end end end diff --git a/lib/mechanize/form_elements.rb b/lib/mechanize/form_elements.rb index f119f945..5f8838c4 100644 --- a/lib/mechanize/form_elements.rb +++ b/lib/mechanize/form_elements.rb @@ -10,7 +10,12 @@ class Field attr_accessor :name, :value def initialize(name, value) - @name, @value = name, value + @name = Util.html_unescape(name) + @value = if value.is_a? String + Util.html_unescape(value) + else + value + end end def query_value @@ -22,9 +27,8 @@ def query_value # class, set WWW::FileUpload#file_data= to the data of the file you want # to upload and WWW::FileUpload#mime_type= to the appropriate mime type # of the file. 
- # See the example in EXAMPLES[link://files/EXAMPLES.html] + # See the example in EXAMPLES[link://files/EXAMPLES_txt.html] class FileUpload < Field - attr_accessor :name # Field name attr_accessor :file_name # File name attr_accessor :mime_type # Mime Type (Optional) @@ -32,7 +36,7 @@ class FileUpload < Field alias :file_data= :value= def initialize(name, file_name) - @file_name = file_name + @file_name = Util.html_unescape(file_name) @file_data = nil super(name, @file_data) end @@ -125,7 +129,7 @@ def initialize(name, node) end def query_value - value.collect { |v| [name, v] } + value ? value.collect { |v| [name, v] } : '' end # Select no options @@ -213,9 +217,9 @@ class Option alias :selected? :selected def initialize(node, select_list) - @text = node.all_text - @value = node.attributes['value'] - @selected = node.attributes.has_key?('selected') ? true : false + @text = node.inner_text + @value = Util.html_unescape(node['value']) + @selected = node.has_attribute? 'selected' @select_list = select_list # The select list this option belongs to end diff --git a/lib/mechanize/history.rb b/lib/mechanize/history.rb new file mode 100644 index 00000000..d544e1c0 --- /dev/null +++ b/lib/mechanize/history.rb @@ -0,0 +1,67 @@ +module WWW + class Mechanize + ## + # This class manages history for your mechanize object. + class History < Array + attr_accessor :max_size + + def initialize(max_size = nil) + @max_size = max_size + @history_index = {} + end + + def initialize_copy(orig) + super + @history_index = orig.instance_variable_get(:@history_index).dup + end + + def push(page, uri = nil) + super(page) + @history_index[(uri ? uri : page.uri).to_s] = page + if @max_size && self.length > @max_size + while self.length > @max_size + self.shift + end + end + self + end + alias :<< :push + + def visited?(url) + ! visited_page(url).nil? + end + + def visited_page(url) + @history_index[(url.respond_to?(:uri) ? 
url.uri : url).to_s] + end + + def clear + @history_index.clear + super + end + + def shift + return nil if length == 0 + page = self[0] + self[0] = nil + super + remove_from_index(page) + page + end + + def pop + return nil if length == 0 + page = super + remove_from_index(page) + page + end + + private + def remove_from_index(page) + @history_index.each do |k,v| + @history_index.delete(k) if v == page + end + end + end + end +end diff --git a/lib/mechanize/hpricot.rb b/lib/mechanize/hpricot.rb deleted file mode 100644 index 686cc612..00000000 --- a/lib/mechanize/hpricot.rb +++ /dev/null @@ -1,12 +0,0 @@ -require 'hpricot' -class Hpricot::Elem - def all_text - text = '' - children.each do |child| - if child.respond_to? :content - text << child.content - end - end - text - end -end diff --git a/lib/mechanize/inspect.rb b/lib/mechanize/inspect.rb index f73e6440..fda49999 100644 --- a/lib/mechanize/inspect.rb +++ b/lib/mechanize/inspect.rb @@ -40,6 +40,9 @@ def pretty_print(q) } } end + if RUBY_VERSION > '1.8.4' + alias :inspect :pretty_inspect + end end class Link @@ -49,6 +52,9 @@ def pretty_print(q) q.breakable; q.pp href } end + if RUBY_VERSION > '1.8.4' + alias :inspect :pretty_inspect + end end class Form diff --git a/lib/mechanize/list.rb b/lib/mechanize/list.rb index 669ec19d..ac39e1ae 100644 --- a/lib/mechanize/list.rb +++ b/lib/mechanize/list.rb @@ -54,12 +54,16 @@ def value=(arg) alias :and :with def method_missing(meth_sym, *args) - return first.send(meth_sym) if args.empty? - arg = args.first - if arg.class == Regexp - WWW::Mechanize::List.new(find_all { |e| e.send(meth_sym) =~ arg }) + if length > 0 + return first.send(meth_sym) if args.empty? 
+ arg = args.first + if arg.class == Regexp + WWW::Mechanize::List.new(find_all { |e| e.send(meth_sym) =~ arg }) + else + WWW::Mechanize::List.new(find_all { |e| e.send(meth_sym) == arg }) + end else - WWW::Mechanize::List.new(find_all { |e| e.send(meth_sym) == arg }) + '' end end end diff --git a/lib/mechanize/mech_version.rb b/lib/mechanize/mech_version.rb deleted file mode 100644 index 609e7220..00000000 --- a/lib/mechanize/mech_version.rb +++ /dev/null @@ -1,5 +0,0 @@ -module WWW - class Mechanize - Version = '0.6.0' - end -end diff --git a/lib/mechanize/monkey_patch.rb b/lib/mechanize/monkey_patch.rb new file mode 100644 index 00000000..951f116e --- /dev/null +++ b/lib/mechanize/monkey_patch.rb @@ -0,0 +1,15 @@ +module Net + class HTTP + alias :old_keep_alive? :keep_alive? + def keep_alive?(req, res) + return false if /close/i =~ req['connection'].to_s + return false if @seems_1_0_server + return false if /close/i =~ res['connection'].to_s + return true if /keep-alive/i =~ res['connection'].to_s + return false if /close/i =~ res['proxy-connection'].to_s + return true if /keep-alive/i =~ res['proxy-connection'].to_s + (@curr_http_version == '1.1') + end + end +end + diff --git a/lib/mechanize/net-overrides/net/http.rb b/lib/mechanize/net-overrides/net/http.rb index 03b434c5..6cf6f81c 100644 --- a/lib/mechanize/net-overrides/net/http.rb +++ b/lib/mechanize/net-overrides/net/http.rb @@ -1,4 +1,4 @@ -# +# :enddoc: # = net/http.rb # # Copyright (C) 1999-2005 Yukihiro Matsumoto @@ -1826,7 +1826,7 @@ class HTTPResponse # redefine '416' => HTTPRequestedRangeNotSatisfiable, '417' => HTTPExpectationFailed, - '501' => HTTPInternalServerError, + '500' => HTTPInternalServerError, '501' => HTTPNotImplemented, '502' => HTTPBadGateway, '503' => HTTPServiceUnavailable, diff --git a/lib/mechanize/net-overrides/net/https.rb b/lib/mechanize/net-overrides/net/https.rb index 360d807b..efd40090 100644 --- a/lib/mechanize/net-overrides/net/https.rb +++ 
b/lib/mechanize/net-overrides/net/https.rb @@ -1,3 +1,4 @@ +# :enddoc: =begin = $RCSfile: https.rb,v $ -- SSL/TLS enhancement for Net::HTTP. diff --git a/lib/mechanize/net-overrides/net/protocol.rb b/lib/mechanize/net-overrides/net/protocol.rb index 9e444f9f..bd0d91ff 100644 --- a/lib/mechanize/net-overrides/net/protocol.rb +++ b/lib/mechanize/net-overrides/net/protocol.rb @@ -1,4 +1,4 @@ -# +# :enddoc: # = net/protocol.rb # #-- diff --git a/lib/mechanize/page.rb b/lib/mechanize/page.rb index 734d8748..af3a8f62 100644 --- a/lib/mechanize/page.rb +++ b/lib/mechanize/page.rb @@ -1,5 +1,6 @@ require 'fileutils' require 'hpricot' +require 'forwardable' module WWW class Mechanize @@ -15,23 +16,27 @@ class Mechanize # agent.get('http://google.com/').class #=> WWW::Mechanize::Page # class Page < File - attr_reader :root, :title, :watch_for_set - attr_reader :frames, :iframes, :links, :forms, :meta, :watches + extend Forwardable - def initialize(uri=nil, response=nil, body=nil, code=nil) - super(uri, response, body, code) - @watch_for_set = {} + attr_reader :parser, :title, :watch_for_set + attr_reader :frames, :iframes, :links, :forms, :meta, :watches, :bases + attr_accessor :mech + + alias :root :parser - yield self if block_given? 
+ def initialize(uri=nil, response=nil, body=nil, code=nil, mech=nil) + super(uri, response, body, code) + @watch_for_set ||= {} + @mech ||= mech raise Mechanize::ContentTypeError.new(response['content-type']) unless content_type() =~ /^text\/html/ - parse_html if body && response - end - - # Get the response header - def header - @response + + # construct parser and feed with HTML + if body && response + @parser ||= Hpricot.parse(body) + parse_html + end end # Get the content type @@ -40,84 +45,88 @@ def content_type end # Search through the page like HPricot - def search(*args) - @root.search(*args) - end + def_delegator :@parser, :search, :search + def_delegator :@parser, :/, :/ + def_delegator :@parser, :at, :at - def at(*args) - @root.at(*args) - end - - alias :/ :search - def watch_for_set=(obj) @watch_for_set = obj - parse_html if @body + parse_html if @body && @watch_for_set end + # Find a form with +name+. Form will be yielded if a block is given. def form(name) - forms.name(name).first + f = forms.name(name).first + yield f if block_given?
+ f end private def parse_html - # construct parser and feed with HTML - @root = Hpricot.parse(@body) - @forms = WWW::Mechanize::List.new @links = WWW::Mechanize::List.new @meta = WWW::Mechanize::List.new @frames = WWW::Mechanize::List.new @iframes = WWW::Mechanize::List.new + @bases = WWW::Mechanize::List.new @watches = {} # Set the title - @title = if (@root/'title').text.length > 0 - (@root/'title').text + @title = if (@parser/'title').text.length > 0 + (@parser/'title').text + end + + # Find all 'base' tags + (@parser/'base').each do |node| + @bases << Base.new(node, @mech, self) end # Find all the form tags - (@root/'form').each do |html_form| - form = Form.new(html_form) + (@parser/'form').each do |html_form| + form = Form.new(html_form, @mech, self) form.action ||= @uri @forms << form end # Find all the 'a' tags - (@root/'a').each do |node| - @links << Link.new(node) + (@parser/'a').each do |node| + @links << Link.new(node, @mech, self) + end + + # Find all the 'area' tags + (@parser/'area').each do |node| + @links << Link.new(node, @mech, self) end # Find all 'meta' tags - (@root/'meta').each do |node| - next if node.attributes.nil? - next unless node.attributes.has_key? 'http-equiv' - next unless node.attributes.has_key? 
'content' - equiv = node.attributes['http-equiv'] - content = node.attributes['content'] + (@parser/'meta').each do |node| + next unless node['http-equiv'] + next unless node['content'] + equiv = node['http-equiv'] + content = node['content'] if equiv != nil && equiv.downcase == 'refresh' - if content != nil && content =~ /^\d+\s*;\s*url\s*=\s*(\S+)/i - node.attributes['href'] = $1 - @meta << Meta.new(node) + if content != nil && content =~ /^\d+\s*;\s*url\s*=\s*'?([^\s']+)/i + node['href'] = $1 + @meta << Meta.new(node, @mech, self) end end end # Find all 'frame' tags - (@root/'frame').each do |node| - @frames << Frame.new(node) + (@parser/'frame').each do |node| + @frames << Frame.new(node, @mech, self) end # Find all 'iframe' tags - (@root/'iframe').each do |node| - @iframes << Frame.new(node) + (@parser/'iframe').each do |node| + @iframes << Frame.new(node, @mech, self) end # Find all watch tags unless @watch_for_set.nil? @watch_for_set.each do |key, klass| - (@root/key).each do |node| + (@parser/key).each do |node| @watches[key] ||= [] @watches[key] << (klass ? klass.new(node) : node) end diff --git a/lib/mechanize/page_elements.rb b/lib/mechanize/page_elements.rb index 3b5735df..8698510d 100644 --- a/lib/mechanize/page_elements.rb +++ b/lib/mechanize/page_elements.rb @@ -13,21 +13,23 @@ class Link attr_reader :href attr_reader :text attr_reader :attributes + attr_reader :page alias :to_s :text + alias :referer :page - def initialize(node) - node.attributes ||= {} + def initialize(node, mech, page) @node = node - @href = node.attributes['href'] - @text = node.all_text - @attributes = node.attributes + @href = node['href'] + @text = node.inner_text + @page = page + @mech = mech + @attributes = node # If there is no text, try to find an image and use it's alt text if (@text.nil? || @text.length == 0) && (node/'img').length > 0 @text = '' (node/'img').each do |e| - e.attributes ||= {} - @text << (e.attributes.has_key?('alt') ? 
e.attributes['alt'] : '') + @text << ( e['alt'] || '') end end @@ -36,6 +38,11 @@ def initialize(node) def uri URI.parse(@href) end + + # Click on this link + def click + @mech.click self + end end # This class encapsulates a Meta tag. Mechanize treats meta tags just @@ -53,12 +60,18 @@ class Frame < Link alias :src :href alias :name :text - def initialize(node) - node.attributes ||= {} + def initialize(node, mech, referer) + super(node, mech, referer) @node = node - @text = node.attributes['name'] - @href = node.attributes['src'] + @text = node['name'] + @href = node['src'] end end + + # This class encapsulates a Base tag. Mechanize treats base tags just like + # 'a' tags. Base objects will contain links, but most likely will have + # no text. + class Base < Link + end end end diff --git a/lib/mechanize/parsers/rexml_page.rb b/lib/mechanize/parsers/rexml_page.rb new file mode 100644 index 00000000..c8ab28ae --- /dev/null +++ b/lib/mechanize/parsers/rexml_page.rb @@ -0,0 +1,35 @@ +require 'web/htmltools/xmltree' +require 'mechanize/rexml' + +class WWW::Mechanize::REXMLPage < WWW::Mechanize::Page + def initialize(uri=nil, response=nil, body=nil, code=nil, mech=nil) + @body = body + @watch_for_set = {} + @mech = mech + + # construct parser and feed with HTML + parser = HTMLTree::XMLParser.new + begin + parser.feed(@body) + rescue => ex + if ex.message =~ /attempted adding second root element to document/ and + # Put the whole document inside a single root element, which I + # simply name , just to make the parser happy. It's no + #longer valid HTML, but without a single root element, it's not + # valid HTML as well. + + # TODO: leave a possible doctype definition outside this element. + parser = HTMLTree::XMLParser.new + parser.feed("" + @body + "") + else + raise + end + end + + @root = parser.document + + yield self if block_given? 
+ + super(uri, response, body, code) + end +end diff --git a/lib/mechanize/pluggable_parsers.rb b/lib/mechanize/pluggable_parsers.rb index 0b930934..6709b18a 100644 --- a/lib/mechanize/pluggable_parsers.rb +++ b/lib/mechanize/pluggable_parsers.rb @@ -17,20 +17,57 @@ class Mechanize # agent.get('http://example.com/foo.jpg').class #=> WWW::Mechanize::File # class File - attr_accessor :uri, :response, :body, :code + attr_accessor :uri, :response, :body, :code, :filename + alias :header :response alias :content :body def initialize(uri=nil, response=nil, body=nil, code=nil) - @uri, @response, @body, @code = uri, response, body, code + @uri, @body, @code = uri, body, code + @response = Headers.new + + # Copy the headers in to a hash to prevent memory leaks + if response + response.each { |k,v| + @response[k] = v + } + end + + @filename = 'index.html' + + # Set the filename + if disposition = @response['content-disposition'] + disposition.split(/;\s*/).each do |pair| + k,v = pair.split(/=/, 2) + @filename = v if k.downcase == 'filename' + end + else + if @uri + @filename = @uri.path.split(/\//).last || 'index.html' + @filename << ".html" unless @filename =~ /\./ + end + end + + yield self if block_given? end # Use this method to save the content of this object to filename - def save_as(filename) + def save_as(filename = nil) + if filename.nil? + filename = @filename + number = 1 + while(::File.exists?(filename)) + filename = "#{@filename}.#{number}" + number += 1 + end + end + ::File::open(filename, "wb") { |f| f.write body } end + + alias :save :save_as end # = Synopsis @@ -50,7 +87,7 @@ class FileSaver < File attr_reader :filename def initialize(uri=nil, response=nil, body=nil, code=nil) - @uri, @response, @body, @code = uri, response, body, code + super(uri, response, body, code) path = uri.path.empty? ? 
'index.html' : uri.path.gsub(/^[\/]*/, '') path += 'index.html' if path =~ /\/$/ @@ -154,5 +191,14 @@ def []=(content_type, klass) @parsers[content_type] = klass end end + + class Headers < Hash + def [](key) + super(key.downcase) + end + def []=(key, value) + super(key.downcase, value) + end + end end end diff --git a/lib/mechanize/rexml.rb b/lib/mechanize/rexml.rb new file mode 100644 index 00000000..2b6c4214 --- /dev/null +++ b/lib/mechanize/rexml.rb @@ -0,0 +1,236 @@ +# :enddoc: +# Copyright (c) 2005 by Michael Neumann (mneumann@ntecs.de). +# Released under the same terms of license as Ruby. +# + +require 'rexml/rexml' + +class REXML::Text + def collect_text_recursively + value() + end +end + +class REXML::Comment + def collect_text_recursively + [] + end +end + +module REXML::Node + +# Aliasing functions to get rid of warnings. Remove when support for 1.8.2 +# is dropped. +if RUBY_VERSION > "1.8.2" + alias :old_each_recursive :each_recursive + alias :old_find_first_recursive :find_first_recursive + alias :old_index_in_parent :index_in_parent +end + + def search(arg) + list = WWW::Mechanize::List.new + each_recursive { |n| + list << n if n.name.downcase == arg + } + list + end + + alias :/ :search + + # Visit all subnodes of +self+ recursively + + def each_recursive(&block) # :yields: node + self.elements.each {|node| + block.call(node) + node.each_recursive(&block) + } + end + + # Find (and return) first subnode (recursively) for which the block evaluates + # to true. Returns +nil+ if none was found. + + def find_first_recursive(&block) # :yields: node + each_recursive {|node| + return node if block.call(node) + } + return nil + end + + # Find all subnodes (recursively) for which the block evaluates to true. 
+ + def find_all_recursive(&block) # :yields: node + arr = [] + each_recursive {|node| + arr << node if block.call(node) + } + arr + end + + # Returns the index that +self+ has in its parent's elements array, so that + # the following equation holds true: + # + # node == node.parent.elements[node.index_in_parent] + + def index_in_parent + parent.index(self)+1 + end + + # Recursively collects all text strings starting into an array. + # + # E.g. the method would return [["abc"], "def"] for this node: + # + # abcdef + + def collect_text_recursively + map {|n| n.collect_text_recursively} + end + + # Returns all text of all subnodes (recursively), merged into one string. + # This is equivalent to: + # + # collect_text_recursively.flatten.join("") + + def inner_text + collect_text_recursively.flatten.join("") + end + + alias :text :inner_text + +end + +# +# Starting with +root_node+, we recursively look for a node with the given +# +tag+, the given +attributes+ (a Hash) and whose text equals or matches the +# +text+ string or regular expression. +# +# To find the following node: +# +# text +# +# We use: +# +# find_node(root, 'td', {'class' => 'abc'}, "text") +# +# Returns +nil+ if no matching node was found. + +def find_node(root_node, tag, attributes, text=nil) + root_node.find_first_recursive {|node| + node.name == tag and + attributes.all? {|attr, val| node.attributes[attr] == val} and + (text ? text === node.text : true) + } +end + +# +# Extract specific columns (specified by the position of its corresponding +# header column) from a table. +# +# Given the following table: +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +#
ABC
A.1B.1C.1
A.2B.2C.2
+# +# To extract the first (A) and last (C) column: +# +# extract_from_table(root_node, ["A", "C"]) +# +# And you get this as result: +# +# [ +# ["A.1", "C.1"], +# ["A.2", "C.2"] +# ] +# + +def extract_from_table(root_node, headers, header_tags = %w(td th)) + + # extract and collect all header nodes + + header_nodes = headers.collect { |header| + root_node.find_first_recursive {|node| + header_tags.include?(node.name.downcase) and header === node.inner_text + } + } + + raise "some headers not found" if header_nodes.compact.size < headers.size + + # assert that all headers have the same parent 'header_row', which is the row + # in which the header_nodes are contained. 'table' is the surrounding table tag. + + header_row = header_nodes.first.parent + table = header_row.parent + + raise "different parents" unless header_nodes.all? {|n| n.parent == header_row} + + # we now iterate over all rows in the table that follows the header_row. + # for each row we collect the elements at the same positions as the header_nodes. + # this is what we finally return from the method. + + (header_row.index_in_parent .. table.elements.size).collect do |inx| + row = table.elements[inx] + header_nodes.collect { |n| row.elements[ n.parent.elements.index(n) ].text } + end +end + +# Given a HTML table, this method returns a matrix (2-dim array), with all the +# table-data elements correctly placed in it. +# +# If there's a table data element which uses 'colspan', that node is stored in +# at the current position of the row followed by (colspan-1) nil values. +# +# Example: +# +# +# +# +# +# +# +# +# +#
AB
C
+# +# Result: +# +# [ +# [A, B], +# [C, nil] +# ] +# +# where A, B and C are the corresponding "" nodes. +# + +def table_to_matrix(table_node) + matrix = [] + + # for each row + table_node.elements.each('tr') {|r| + row = [] + r.elements.each {|data| + next unless ['td', 'th'].include?(data.name) + row << data + + # fill with empty elements + colspan = (data.attributes['colspan'] || 1).to_i + (colspan - 1).times { row << nil } + } + matrix << row + } + + return matrix +end diff --git a/test/README b/test/README deleted file mode 100644 index 9325af70..00000000 --- a/test/README +++ /dev/null @@ -1,7 +0,0 @@ -= Mechanize Testing - -To run the tests, execute ts_mech.rb - -ts_mech.rb spawns a thread that has a WEBrick server in it to test against. -The WEBrick server can be found in server.rb. - diff --git a/test/htdocs/alt_text.html b/test/htdocs/alt_text.html index 1b9a066d..1037cb31 100644 --- a/test/htdocs/alt_text.html +++ b/test/htdocs/alt_text.html @@ -3,6 +3,7 @@ alt text + no image diff --git a/test/htdocs/empty_form.html b/test/htdocs/empty_form.html new file mode 100644 index 00000000..4f818b1d --- /dev/null +++ b/test/htdocs/empty_form.html @@ -0,0 +1,6 @@ + + +
+
+ + diff --git a/test/htdocs/find_link.html b/test/htdocs/find_link.html index 324a9b6d..1708e7a3 100644 --- a/test/htdocs/find_link.html +++ b/test/htdocs/find_link.html @@ -3,6 +3,7 @@ + Testing the links diff --git a/test/htdocs/form_select_none.html b/test/htdocs/form_select_none.html index e035ae66..28cde6b2 100644 --- a/test/htdocs/form_select_none.html +++ b/test/htdocs/form_select_none.html @@ -4,6 +4,7 @@ + + + + + + + diff --git a/test/htdocs/tc_encoded_links.html b/test/htdocs/tc_encoded_links.html new file mode 100644 index 00000000..a65430e0 --- /dev/null +++ b/test/htdocs/tc_encoded_links.html @@ -0,0 +1,5 @@ + + + test link + + diff --git a/test/htdocs/tc_follow_meta.html b/test/htdocs/tc_follow_meta.html new file mode 100644 index 00000000..21956255 --- /dev/null +++ b/test/htdocs/tc_follow_meta.html @@ -0,0 +1,8 @@ + + + + + + This page has a meta refresh. + + diff --git a/test/htdocs/tc_form_action.html b/test/htdocs/tc_form_action.html new file mode 100644 index 00000000..e824cdb1 --- /dev/null +++ b/test/htdocs/tc_form_action.html @@ -0,0 +1,48 @@ + + Page Title + +

Post Form 1

+
+ + + + + +
First Name

+ +
+ +

Post Form 2

+
+ + + + + +
First Name

+ +
+ +

Post Form 3

+
+ + + + + +
First Name

+ +
+ +

Post Form 4

+
+ + + + + +
First Name

+ +
+ + diff --git a/test/htdocs/tc_links.html b/test/htdocs/tc_links.html new file mode 100644 index 00000000..75f3e37e --- /dev/null +++ b/test/htdocs/tc_links.html @@ -0,0 +1,16 @@ + + + Bold Dude + Dude + Aaron James Patterson + Aaron Patterson + Ruby Rocks! + + encoded space + not encoded space + + unusual characters + + diff --git a/test/htdocs/tc_referer.html b/test/htdocs/tc_referer.html new file mode 100644 index 00000000..9fb759a9 --- /dev/null +++ b/test/htdocs/tc_referer.html @@ -0,0 +1,10 @@ + + + Referer Servlet +
+
+
+ +
+ + diff --git a/test/htdocs/tc_relative_links.html b/test/htdocs/tc_relative_links.html new file mode 100644 index 00000000..aaeccd71 --- /dev/null +++ b/test/htdocs/tc_relative_links.html @@ -0,0 +1,19 @@ + + + forward + + + + + + + + + + + + diff --git a/test/htdocs/unusual______.html b/test/htdocs/unusual______.html new file mode 100644 index 00000000..d546c758 --- /dev/null +++ b/test/htdocs/unusual______.html @@ -0,0 +1,5 @@ + + + This is a webpage that has a very unusual name. + + diff --git a/test/proxy.rb b/test/proxy.rb deleted file mode 100644 index ff276042..00000000 --- a/test/proxy.rb +++ /dev/null @@ -1,30 +0,0 @@ -# This is a simple proxy that assumes the destination server will -# close the connection after sending data, otherwise it will get blocked -# on reads. - -require 'rubygems' -require 'eventmachine' -require 'socket' - -module HttpProxy - include Socket::Constants - - def receive_data(data) - if data =~ /Host: (.*)$/ - (host, port) = $1.chomp.split(/:/) - port ||= 80 - socket = Socket.new( AF_INET, SOCK_STREAM, 0 ) - puts port.to_i - puts host - sockaddr = Socket.pack_sockaddr_in( port.to_i, host ) - socket.connect(sockaddr) - socket.write(data) - results = socket.read - send_data results - end - end -end - -EventMachine::run { - EventMachine::start_server "127.0.0.1", 2001, HttpProxy -} diff --git a/test/server.rb b/test/server.rb deleted file mode 100644 index 60bd1180..00000000 --- a/test/server.rb +++ /dev/null @@ -1,42 +0,0 @@ -require 'webrick' -require 'base64' -require 'servlets' -require 'logger' - -base_dir = FileTest.exists?(Dir::pwd + '/test') ? 
Dir::pwd + '/test' : Dir::pwd - -s = WEBrick::HTTPServer.new( - :Port => 2000, - :DocumentRoot => base_dir + "/htdocs", - :Logger => Logger.new(nil), - :AccessLog => Logger.new(nil) -) -s.mount("/one_cookie", OneCookieTest) -s.mount("/one_cookie_no_space", OneCookieNoSpacesTest) -s.mount("/many_cookies", ManyCookiesTest) -s.mount("/many_cookies_as_string", ManyCookiesAsStringTest) -s.mount("/send_cookies", SendCookiesTest) -s.mount("/form_post", FormTest) -s.mount("/form post", FormTest) -s.mount("/response_code", ResponseCodeTest) -s.mount("/file_upload", FileUploadTest) -s.mount("/bad_content_type", BadContentTypeTest) -s.mount("/content_type_test", ContentTypeTest) -s.mount("/gzip", GzipServlet) - -htpasswd = WEBrick::HTTPAuth::Htpasswd.new(base_dir + '/data/htpasswd') -auth = WEBrick::HTTPAuth::BasicAuth.new( - :UserDB => htpasswd, - :Realm => 'mechanize', - :Logger => Logger.new(nil), - :AccessLog => Logger.new(nil) -) -s.mount_proc('/htpasswd_auth') { |req, res| - auth.authenticate(req, res) - res.body = "You are authenticated" -} - -trap("INT") { s.stop } - -s.start - diff --git a/test/tc_authenticate.rb b/test/tc_authenticate.rb index 8bc8c4de..033127b6 100644 --- a/test/tc_authenticate.rb +++ b/test/tc_authenticate.rb @@ -13,14 +13,23 @@ def setup end def test_auth_success - @agent.basic_auth('mech', 'password') - page = @agent.get("http://localhost:#{PORT}/htpasswd_auth") + @agent.basic_auth('user', 'pass') + page = @agent.get("http://localhost/basic_auth") assert_equal('You are authenticated', page.body) end + def test_auth_bad_user_pass + @agent.basic_auth('aaron', 'aaron') + begin + page = @agent.get("http://localhost/basic_auth") + rescue WWW::Mechanize::ResponseCodeError => e + assert_equal("401", e.response_code) + end + end + def test_auth_failure begin - page = @agent.get("http://localhost:#{PORT}/htpasswd_auth") + page = @agent.get("http://localhost/basic_auth") rescue WWW::Mechanize::ResponseCodeError => e assert_equal("401", e.response_code) 
end diff --git a/test/tc_blank_form.rb b/test/tc_blank_form.rb new file mode 100644 index 00000000..8a179c22 --- /dev/null +++ b/test/tc_blank_form.rb @@ -0,0 +1,23 @@ +$:.unshift File.join(File.dirname(__FILE__), "..", "lib") + +require 'test/unit' +require 'rubygems' +require 'mechanize' +require 'test_includes' + +class BlankFormTest < Test::Unit::TestCase + include TestMethods + + def setup + @agent = WWW::Mechanize.new + end + + def test_blank_form_query_string + page = @agent.get('http://localhost/tc_blank_form.html') + form = page.forms.first + query = form.build_query + assert(query.length > 0) + assert query.all? { |x| x[1] == '' } + end +end + diff --git a/test/tc_checkboxes.rb b/test/tc_checkboxes.rb index b9781286..76312644 100644 --- a/test/tc_checkboxes.rb +++ b/test/tc_checkboxes.rb @@ -10,7 +10,7 @@ class TestCheckBoxes < Test::Unit::TestCase def setup @agent = WWW::Mechanize.new - @page = @agent.get("http://localhost:#{PORT}/tc_checkboxes.html") + @page = @agent.get('http://localhost/tc_checkboxes.html') end def test_select_one diff --git a/test/tc_cookie_class.rb b/test/tc_cookie_class.rb index 4538fe62..6e514aeb 100644 --- a/test/tc_cookie_class.rb +++ b/test/tc_cookie_class.rb @@ -7,8 +7,8 @@ module Enumerable def combine - masks = inject([[], 1]){|(ar, m), e| [ar< (Time.now - 86400)) - } + silently do + dates.each do |date| + cookie = "PREF=1; expires=#{date}" + WWW::Mechanize::Cookie.parse(url, cookie) { |cookie| + assert_equal(true, cookie.expires.nil?) 
+ } + end end end @@ -104,9 +132,46 @@ def test_parse_valid_cookie end end + def test_parse_valid_cookie_empty_value + url = URI.parse('http://rubyforge.org/') + cookie_params = {} + cookie_params['expires'] = 'expires=Sun, 27-Sep-2037 00:00:00 GMT' + cookie_params['path'] = 'path=/' + cookie_params['domain'] = 'domain=.rubyforge.org' + cookie_params['httponly'] = 'HttpOnly' + cookie_value = '12345%7D=' + + expires = Time.parse('Sun, 27-Sep-2037 00:00:00 GMT') + + cookie_params.keys.combine.each do |c| + cookie_text = "#{cookie_value}; " + c.each_with_index do |key, idx| + if idx == (c.length - 1) + cookie_text << "#{cookie_params[key]}" + else + cookie_text << "#{cookie_params[key]}; " + end + end + cookie = nil + WWW::Mechanize::Cookie.parse(url, cookie_text) { |p_cookie| cookie = p_cookie } + assert_not_nil(cookie) + assert_equal('12345%7D=', cookie.to_s) + assert_equal('', cookie.value) + assert_equal('/', cookie.path) + assert_equal('rubyforge.org', cookie.domain) + + # if expires was set, make sure we parsed it + if c.find { |k| k == 'expires' } + assert_equal(expires, cookie.expires) + else + assert_nil(cookie.expires) + end + end + end + # If no path was given, use the one from the URL def test_cookie_using_url_path - url = URI.parse('http://rubyforge.org/login') + url = URI.parse('http://rubyforge.org/login.php') cookie_params = {} cookie_params['expires'] = 'expires=Sun, 27-Sep-2037 00:00:00 GMT' cookie_params['path'] = 'path=/' @@ -131,7 +196,7 @@ def test_cookie_using_url_path assert_not_nil(cookie) assert_equal('12345%7D=ASDFWEE345%3DASda', cookie.to_s) assert_equal('rubyforge.org', cookie.domain) - assert_equal('/login', cookie.path) + assert_equal('/', cookie.path) # if expires was set, make sure we parsed it if c.find { |k| k == 'expires' } diff --git a/test/tc_cookie_jar.rb b/test/tc_cookie_jar.rb index 9d828342..88bf1a0b 100644 --- a/test/tc_cookie_jar.rb +++ b/test/tc_cookie_jar.rb @@ -15,6 +15,61 @@ def cookie_from_hash(hash) } c end + + def 
test_domain_case + values = { :name => 'Foo', + :value => 'Bar', + :path => '/', + :expires => Time.now + (10 * 86400), + :domain => 'rubyforge.org' + } + url = URI.parse('http://rubyforge.org/') + + jar = WWW::Mechanize::CookieJar.new + assert_equal(0, jar.cookies(url).length) + + # Add one cookie with an expiration date in the future + cookie = cookie_from_hash(values) + jar.add(url, cookie) + assert_equal(1, jar.cookies(url).length) + + jar.add(url, cookie_from_hash( values.merge( :domain => 'RuByForge.Org', + :name => 'aaron' + ) ) ) + + assert_equal(2, jar.cookies(url).length) + + url2 = URI.parse('http://RuByFoRgE.oRg/') + assert_equal(2, jar.cookies(url2).length) + end + + def test_empty_value + values = { :name => 'Foo', + :value => '', + :path => '/', + :expires => Time.now + (10 * 86400), + :domain => 'rubyforge.org' + } + url = URI.parse('http://rubyforge.org/') + + jar = WWW::Mechanize::CookieJar.new + assert_equal(0, jar.cookies(url).length) + + # Add one cookie with an expiration date in the future + cookie = cookie_from_hash(values) + jar.add(url, cookie) + assert_equal(1, jar.cookies(url).length) + + jar.add(url, cookie_from_hash( values.merge( :domain => 'RuByForge.Org', + :name => 'aaron' + ) ) ) + + assert_equal(2, jar.cookies(url).length) + + url2 = URI.parse('http://RuByFoRgE.oRg/') + assert_equal(2, jar.cookies(url2).length) + end + def test_add_future_cookies values = { :name => 'Foo', :value => 'Bar', diff --git a/test/tc_cookies.rb b/test/tc_cookies.rb index ff276184..531ebfcc 100644 --- a/test/tc_cookies.rb +++ b/test/tc_cookies.rb @@ -15,7 +15,7 @@ def setup def test_send_cookies page = @agent.get("http://localhost:#{PORT}/many_cookies") page = @agent.get("http://localhost:#{PORT}/send_cookies") - assert_equal(2, page.links.length) + assert_equal(3, page.links.length) assert_not_nil(page.links.find { |l| l.text == "name:Aaron" }) assert_not_nil(page.links.find { |l| l.text == "no_expires:nope" }) end @@ -57,7 +57,7 @@ def 
test_many_cookies_as_string no_path_cookie = @agent.cookies.find { |k| k.name == "no_path" } assert_not_nil(no_path_cookie, "No path cookie is nil") assert_equal("no_path", no_path_cookie.value) - assert_equal("/many_cookies_as_string", no_path_cookie.path) + assert_equal("/", no_path_cookie.path) assert_equal(true, Time.now < no_path_cookie.expires) end @@ -88,7 +88,7 @@ def test_many_cookies no_path_cookie = @agent.cookies.find { |k| k.name == "no_path" } assert_not_nil(no_path_cookie, "No path cookie is nil") assert_equal("no_path", no_path_cookie.value) - assert_equal("/many_cookies", no_path_cookie.path) + assert_equal("/", no_path_cookie.path) assert_equal(true, Time.now < no_path_cookie.expires) end diff --git a/test/tc_encoded_links.rb b/test/tc_encoded_links.rb new file mode 100644 index 00000000..2d35c7a4 --- /dev/null +++ b/test/tc_encoded_links.rb @@ -0,0 +1,27 @@ +$:.unshift File.join(File.dirname(__FILE__), "..", "lib") + +require 'test/unit' +require 'rubygems' +require 'mechanize' +require 'test_includes' + +class TestEncodedLinks < Test::Unit::TestCase + include TestMethods + + def setup + @agent = WWW::Mechanize.new + @page = @agent.get("http://localhost:#{PORT}/tc_encoded_links.html") + end + + def test_click_link + link = @page.links.first + assert_equal('/form_post?a=b&b=c', link.href) + page = @agent.click(link) + assert_equal("http://localhost:#{PORT}/form_post?a=b&b=c", page.uri.to_s) + end + + def test_hpricot_link + page = @agent.click(@page.search('a').first) + assert_equal("http://localhost:#{PORT}/form_post?a=b&b=c", page.uri.to_s) + end +end diff --git a/test/tc_errors.rb b/test/tc_errors.rb index 83f11ecd..ad5cf8ce 100644 --- a/test/tc_errors.rb +++ b/test/tc_errors.rb @@ -19,6 +19,14 @@ def test_bad_form_method } end + def test_non_exist + begin + page = @agent.get("http://localhost:#{PORT}/bad_form_test.html") + rescue RuntimeError => ex + assert_equal("404", ex.inspect) + end + end + def test_too_many_radio page = 
@agent.get("http://localhost:#{PORT}/form_test.html") form = page.forms.name('post_form1').first diff --git a/test/tc_follow_meta.rb b/test/tc_follow_meta.rb new file mode 100644 index 00000000..f69aabec --- /dev/null +++ b/test/tc_follow_meta.rb @@ -0,0 +1,32 @@ +$:.unshift File.join(File.dirname(__FILE__), "..", "lib") + +require 'test/unit' +require 'rubygems' +require 'mechanize' +require 'test_includes' + +class FollowMetaTest < Test::Unit::TestCase + include TestMethods + + def setup + @agent = WWW::Mechanize.new + end + + def test_follow_meta + page = @agent.get('http://localhost/tc_follow_meta.html') + assert_equal('http://localhost/tc_follow_meta.html', page.uri.to_s) + assert_equal(1, page.meta.length) + + @agent.follow_meta_refresh = true + page = @agent.get('http://localhost/tc_follow_meta.html') + assert_equal('http://localhost/index.html', page.uri.to_s) + assert_equal(3, @agent.history.length) + end + + def test_follow_meta_on_302 + @agent.follow_meta_refresh = true + assert_nothing_raised { + @agent.get("http://localhost/response_code?code=302&ct=test/xml") + } + end +end diff --git a/test/tc_form_action.rb b/test/tc_form_action.rb new file mode 100644 index 00000000..ccd239f6 --- /dev/null +++ b/test/tc_form_action.rb @@ -0,0 +1,52 @@ +$:.unshift File.join(File.dirname(__FILE__), "..", "lib") + +require 'test/unit' +require 'rubygems' +require 'mechanize' +require 'test_includes' +require 'logger' + +class TestFormAction < Test::Unit::TestCase + include TestMethods + + def setup + @agent = WWW::Mechanize.new + @page = @agent.get("http://localhost:#{PORT}/tc_form_action.html") + end + + def test_post_encoded_action + form = @page.form('post_form1') { |f| + f.first_name = "Aaron" + } + assert_equal('/form_post?a=b&b=c', form.action) + page = form.submit + assert_equal("http://localhost:#{PORT}/form_post?a=b&b=c", page.uri.to_s) + end + + def test_get_encoded_action + form = @page.form('post_form2') { |f| + f.first_name = "Aaron" + } + 
assert_equal('/form_post?a=b&b=c', form.action) + page = form.submit + assert_equal("http://localhost:#{PORT}/form_post?first_name=Aaron", page.uri.to_s) + end + + def test_post_nonencoded_action + form = @page.form('post_form3') { |f| + f.first_name = "Aaron" + } + assert_equal('/form_post?a=b&b=c', form.action) + page = form.submit + assert_equal("http://localhost:#{PORT}/form_post?a=b&b=c", page.uri.to_s) + end + + def test_post_pound_sign + form = @page.form('post_form4') { |f| + f.first_name = "Aaron" + } + assert_equal('/form_post#1', form.action) + page = form.submit + assert_equal("http://localhost:#{PORT}/form_post#1", page.uri.to_s) + end +end diff --git a/test/tc_form_as_hash.rb b/test/tc_form_as_hash.rb new file mode 100644 index 00000000..b77b988d --- /dev/null +++ b/test/tc_form_as_hash.rb @@ -0,0 +1,69 @@ +$:.unshift File.join(File.dirname(__FILE__), "..", "lib") + +require 'webrick' +require 'test/unit' +require 'rubygems' +require 'mechanize' +require 'test_includes' + +class TestFormHash < Test::Unit::TestCase + include TestMethods + + def setup + @agent = WWW::Mechanize.new + @page = @agent.get('http://localhost/form_multival.html') + end + + def test_form_hash + form = @page.forms.name('post_form').first + + assert_not_nil(form) + field_length = form.fields.length + assert_nil(form['intarweb']) + form['intarweb'] = 'Aaron' + + assert_not_nil(form['intarweb']) + assert_equal(field_length + 1, form.fields.length) + end + + def test_add_field_via_hash + form = @page.forms.name('post_form').first + + assert_not_nil(form) + field_length = form.fields.length + assert_nil(form['intarweb']) + form['intarweb'] = 'Aaron' + + assert_not_nil(form['intarweb']) + assert_equal(field_length + 1, form.fields.length) + end + + def test_fields_as_hash + form = @page.forms.name('post_form').first + + assert_not_nil(form) + assert_equal(2, form.fields.name('first').length) + + form['first'] = 'Aaron' + assert_equal('Aaron', form['first']) + assert_equal('Aaron', 
form.fields.name('first').first.value) + end + + def test_keys + @page = @agent.get('http://localhost/empty_form.html') + form = @page.forms.first + + assert_not_nil(form) + assert_equal(false, form.has_field?('name')) + assert_equal(false, form.has_value?('Aaron')) + assert_equal(0, form.keys.length) + assert_equal(0, form.values.length) + form['name'] = 'Aaron' + assert_equal(true, form.has_field?('name')) + assert_equal(true, form.has_value?('Aaron')) + assert_equal(1, form.keys.length) + assert_equal(['name'], form.keys) + assert_equal(1, form.values.length) + assert_equal(['Aaron'], form.values) + end +end diff --git a/test/tc_form_button.rb b/test/tc_form_button.rb new file mode 100644 index 00000000..ccd621f5 --- /dev/null +++ b/test/tc_form_button.rb @@ -0,0 +1,36 @@ +$:.unshift File.join(File.dirname(__FILE__), "..", "lib") + +require 'test/unit' +require 'rubygems' +require 'mechanize' +require 'test_includes' + +class TestFormButtons < Test::Unit::TestCase + include TestMethods + + def setup + @agent = WWW::Mechanize.new + end + + def test_submit_button + html = <<-END + +
+ + END + page = WWW::Mechanize::Page.new( nil, html_response, html, 200, @agent ) + assert_equal(1, page.forms.length) + assert_equal(1, page.forms.first.buttons.length) + end + + def test_button_button + html = <<-END + +
+ + END + page = WWW::Mechanize::Page.new( nil, html_response, html, 200, @agent ) + assert_equal(1, page.forms.length) + assert_equal(1, page.forms.first.buttons.length) + end +end diff --git a/test/tc_form_no_inputname.rb b/test/tc_form_no_inputname.rb index 9153508b..392922ae 100644 --- a/test/tc_form_no_inputname.rb +++ b/test/tc_form_no_inputname.rb @@ -10,11 +10,11 @@ class FormNoInputNameTest < Test::Unit::TestCase def setup @agent = WWW::Mechanize.new + @page = @agent.get('http://localhost/form_no_input_name.html') end def test_no_input_name - page = @agent.get("http://localhost:#{PORT}/form_no_input_name.html") - form = page.forms.first + form = @page.forms.first assert_equal(0, form.fields.length) assert_equal(0, form.radiobuttons.length) assert_equal(0, form.checkboxes.length) diff --git a/test/tc_forms.rb b/test/tc_forms.rb index 8c41aab9..319a9604 100644 --- a/test/tc_forms.rb +++ b/test/tc_forms.rb @@ -39,6 +39,26 @@ def test_post_multival assert_not_nil(page.links.text('first:Patterson').first) end + # Test calling submit on the form object + def test_submit_on_form + page = @agent.get("http://localhost:#{PORT}/form_multival.html") + form = page.forms.name('post_form').first + + assert_not_nil(form) + assert_equal(2, form.fields.name('first').length) + + form.fields.name('first')[0].value = 'Aaron' + form.fields.name('first')[1].value = 'Patterson' + + page = form.submit + + assert_not_nil(page) + + assert_equal(2, page.links.length) + assert_not_nil(page.links.text('first:Aaron').first) + assert_not_nil(page.links.text('first:Patterson').first) + end + # Test submitting form with two fields of the same name def test_get_multival page = @agent.get("http://localhost:#{PORT}/form_multival.html") @@ -436,7 +456,7 @@ def test_get_with_param_in_action page = @agent.submit(get_form, get_form.buttons.first) # Check that the submitted fields exist - assert_equal(5, page.links.size, "Not enough links") + assert_equal(3, page.links.size, "Not enough links") 
assert_not_nil( page.links.find { |l| l.text == "likes ham:on" }, "likes ham check box missing" @@ -449,14 +469,6 @@ def test_get_with_param_in_action page.links.find { |l| l.text == "gender:male" }, "gender field missing" ) - assert_not_nil( - page.links.find { |l| l.text == "great day:yes" }, - "great day field missing" - ) - assert_not_nil( - page.links.find { |l| l.text == "one:two" }, - "one field missing" - ) end def test_field_addition @@ -477,15 +489,50 @@ def test_fields_as_accessors assert_equal('Aaron', form.first) end - def test_fields_as_hash + def test_add_field page = @agent.get("http://localhost:#{PORT}/form_multival.html") form = page.forms.name('post_form').first assert_not_nil(form) - assert_equal(2, form.fields.name('first').length) + number_of_fields = form.fields.length + + f = form.add_field!('intarweb') + assert_not_nil(f) + assert_equal(number_of_fields + 1, form.fields.length) + end + + def test_delete_field + page = @agent.get("http://localhost:#{PORT}/form_multival.html") + form = page.forms.name('post_form').first + + assert_not_nil(form) + number_of_fields = form.fields.length + + form.delete_field!('first') + assert_nil(form['first']) + assert_equal(number_of_fields - 2, form.fields.length) + end + + def test_has_field + page = @agent.get("http://localhost:#{PORT}/form_multival.html") + form = page.forms.name('post_form').first + + assert_not_nil(form) + assert_equal(false, form.has_field?('intarweb')) + f = form.add_field!('intarweb') + assert_not_nil(f) + assert_equal(true, form.has_field?('intarweb')) + end + + def test_field_error + @page = @agent.get('http://localhost/empty_form.html') + form = @page.forms.first + assert_raise(NoMethodError) { + form.foo = 'asdfasdf' + } - form['first'] = 'Aaron' - assert_equal('Aaron', form['first']) - assert_equal('Aaron', form.fields.name('first').first.value) + assert_raise(NoMethodError) { + form.foo + } end end diff --git a/test/tc_history.rb b/test/tc_history.rb new file mode 100644 index 
00000000..21ac97da --- /dev/null +++ b/test/tc_history.rb @@ -0,0 +1,149 @@ +$:.unshift File.join(File.dirname(__FILE__), "..", "lib") + +require 'test/unit' +require 'rubygems' +require 'mechanize' +require 'test_includes' + +class TestHistory < Test::Unit::TestCase + include TestMethods + + def setup + @agent = WWW::Mechanize.new + @history = WWW::Mechanize::History.new + end + + def test_push + assert_equal(0, @history.length) + + page = @agent.get("http://localhost/tc_bad_links.html") + x = @history.push(page) + assert_equal(x, @history) + assert_equal(1, @history.length) + assert(@history.visited?(page)) + assert(@history.visited?(page.uri)) + assert(@history.visited?(page.uri.to_s)) + assert_equal(page, @history.visited_page(page)) + assert_equal(page, @history.visited_page(page.uri)) + assert_equal(page, @history.visited_page(page.uri.to_s)) + + @history.push(@agent.get("/tc_bad_links.html")) + assert_equal(2, @history.length) + end + + def test_shift + assert_equal(0, @history.length) + page = @agent.get("http://localhost/tc_bad_links.html") + @history.push(page) + assert_equal(1, @history.length) + + @history.push(@agent.get("/tc_bad_links.html")) + assert_equal(2, @history.length) + + @history.push(@agent.get("/index.html")) + assert_equal(3, @history.length) + + page2 = @history.shift + assert_equal(page, page2) + assert_equal(2, @history.length) + + @history.shift + assert_equal(1, @history.length) + assert_equal(false, @history.visited?(page)) + + @history.shift + assert_equal(0, @history.length) + + assert_nil(@history.shift) + assert_equal(0, @history.length) + end + + def test_pop + assert_equal(0, @history.length) + page = @agent.get("http://localhost/tc_bad_links.html") + @history.push(page) + assert_equal(1, @history.length) + + page2 = @agent.get("/index.html") + @history.push(page2) + assert_equal(2, @history.length) + assert_equal(page2, @history.pop) + assert_equal(1, @history.length) + assert_equal(true, @history.visited?(page)) + 
assert_equal(false, @history.visited?(page2)) + assert_equal(page, @history.pop) + assert_equal(0, @history.length) + assert_equal(false, @history.visited?(page)) + assert_equal(false, @history.visited?(page2)) + assert_nil(@history.pop) + end + + def test_max_size + @history = WWW::Mechanize::History.new(10) + 1.upto(20) do |i| + page = @agent.get('http://localhost/index.html') + @history.push page + assert_equal(true, @history.visited?(page)) + if i < 10 + assert_equal(i, @history.length) + else + assert_equal(10, @history.length) + end + end + + @history.clear + @history.max_size = 5 + 1.upto(20) do |i| + page = @agent.get('http://localhost/index.html') + @history.push page + assert_equal(true, @history.visited?(page)) + if i < 5 + assert_equal(i, @history.length) + else + assert_equal(5, @history.length) + end + end + + @history.max_size = 0 + 1.upto(20) do |i| + page = @agent.get('http://localhost/index.html') + @history.push page + assert_equal(false, @history.visited?(page)) + assert_equal(0, @history.length) + end + end + + def test_no_slash + page = @agent.get('http://localhost') + + node = Struct.new(:href, :inner_text).new('http://localhost/', 'blah') + link = WWW::Mechanize::Link.new(node, nil, nil) + assert(@agent.visited?(link)) + + node = Struct.new(:href, :inner_text).new('http://localhost', 'blah') + link = WWW::Mechanize::Link.new(node, nil, nil) + assert(@agent.visited?(link)) + end + + def test_with_slash + page = @agent.get('http://localhost/') + + node = Struct.new(:href, :inner_text).new('http://localhost/', 'blah') + link = WWW::Mechanize::Link.new(node, nil, nil) + assert(@agent.visited?(link)) + + node = Struct.new(:href, :inner_text).new('http://localhost', 'blah') + link = WWW::Mechanize::Link.new(node, nil, nil) + assert(@agent.visited?(link)) + end + + def test_clear + page = nil + 20.times { @history.push(page = @agent.get('http://localhost/index.html')) } + assert_equal(20, @history.length) + assert_equal(true, 
@history.visited?(page)) + @history.clear + assert_equal(0, @history.length) + assert_equal(false, @history.visited?(page)) + end +end diff --git a/test/tc_html_unscape_forms.rb b/test/tc_html_unscape_forms.rb new file mode 100644 index 00000000..f999b2c9 --- /dev/null +++ b/test/tc_html_unscape_forms.rb @@ -0,0 +1,46 @@ +$:.unshift File.join(File.dirname(__FILE__), "..", "lib") + +require 'test/unit' +require 'rubygems' +require 'mechanize' +require 'test_includes' + +class TestCheckBoxes < Test::Unit::TestCase + include TestMethods + + def test_field + f = WWW::Mechanize::Field.new('a&b', 'a&b') + assert_equal('a&b', f.name) + assert_equal('a&b', f.value) + + f = WWW::Mechanize::Field.new('a&b', 'a&b') + assert_equal('a&b', f.name) + assert_equal('a&b', f.value) + + f = WWW::Mechanize::Field.new('a&b', 'a&b') + assert_equal('a&b', f.name) + assert_equal('a&b', f.value) + end + + def test_file_upload + f = WWW::Mechanize::FileUpload.new('a&b', 'a&b') + assert_equal('a&b', f.name) + assert_equal('a&b', f.file_name) + + f = WWW::Mechanize::FileUpload.new('a&b', 'a&b') + assert_equal('a&b', f.name) + assert_equal('a&b', f.file_name) + end + + def test_image_button + f = WWW::Mechanize::ImageButton.new('a&b', 'a&b') + assert_equal('a&b', f.name) + assert_equal('a&b', f.value) + end + + def test_radio_button + f = WWW::Mechanize::RadioButton.new('a&b', 'a&b', nil, nil) + assert_equal('a&b', f.name) + assert_equal('a&b', f.value) + end +end diff --git a/test/tc_if_modified_since.rb b/test/tc_if_modified_since.rb new file mode 100644 index 00000000..c8e07c33 --- /dev/null +++ b/test/tc_if_modified_since.rb @@ -0,0 +1,25 @@ +$:.unshift File.join(File.dirname(__FILE__), "..", "lib") + +require 'test/unit' +require 'rubygems' +require 'mechanize' +require 'test_includes' + +class TestIfModifiedSince < Test::Unit::TestCase + def setup + @agent = WWW::Mechanize.new + end + + def test_get_twice + assert_equal(0, @agent.history.length) + page = 
@agent.get('http://localhost/if_modified_since') + assert_match(/You did not send/, page.body) + + assert_equal(1, @agent.history.length) + page2 = @agent.get('http://localhost/if_modified_since') + + assert_equal(2, @agent.history.length) + assert_equal(page.object_id, page2.object_id) + assert_match(/You did not send/, page.body) + end +end diff --git a/test/tc_keep_alive.rb b/test/tc_keep_alive.rb new file mode 100644 index 00000000..2f5fa68e --- /dev/null +++ b/test/tc_keep_alive.rb @@ -0,0 +1,38 @@ +$:.unshift File.join(File.dirname(__FILE__), "..", "lib") + +require 'test/unit' +require 'rubygems' +require 'mechanize' +require 'test_includes' + +class TestKeepAlive < Test::Unit::TestCase + include TestMethods + + def setup + @agent = WWW::Mechanize.new + end + + def test_keep_alive + page = @agent.get('http://localhost/http_headers') + headers = {} + page.body.split(/[\r\n]+/).each do |header| + headers.[]=(*header.chomp.split(/\|/)) + end + assert(headers.has_key?('connection')) + assert_equal('keep-alive', headers['connection']) + assert(headers.has_key?('keep-alive')) + assert_equal('300', headers['keep-alive']) + end + + def test_close_connection + @agent.keep_alive = false + page = @agent.get('http://localhost/http_headers') + headers = {} + page.body.split(/[\r\n]+/).each do |header| + headers.[]=(*header.chomp.split(/\|/)) + end + assert(headers.has_key?('connection')) + assert_equal('close', headers['connection']) + assert(!headers.has_key?('keep-alive')) + end +end diff --git a/test/tc_links.rb b/test/tc_links.rb index 3e130595..bce05173 100644 --- a/test/tc_links.rb +++ b/test/tc_links.rb @@ -12,21 +12,30 @@ def setup @agent = WWW::Mechanize.new end + def test_base + page = @agent.get("http://google.com/tc_base_link.html") + page = page.links.first.click + assert @agent.visited?("http://localhost/index.html") + end + def test_find_meta page = @agent.get("http://localhost:#{PORT}/find_link.html") - assert_equal(2, page.meta.length) - 
assert_equal("http://www.drphil.com/", page.meta[0].href.downcase) - assert_equal("http://www.upcase.com/", page.meta[1].href.downcase) + assert_equal(3, page.meta.length) + assert_equal(%w{ + http://www.drphil.com/ + http://www.upcase.com/ + http://tenderlovemaking.com/ }.sort, + page.meta.map { |x| x.href.downcase }.sort) end def test_find_link page = @agent.get("http://localhost:#{PORT}/find_link.html") - assert_equal(15, page.links.length) + assert_equal(18, page.links.length) end def test_alt_text page = @agent.get("http://localhost:#{PORT}/alt_text.html") - assert_equal(4, page.links.length) + assert_equal(5, page.links.length) assert_equal(1, page.meta.length) assert_equal('', page.meta.first.text) @@ -34,6 +43,7 @@ def test_alt_text assert_equal('', page.links.href('no_alt_text.html').first.text) assert_equal('no image', page.links.href('no_image.html').first.text) assert_equal('', page.links.href('no_text.html').first.text) + assert_equal('', page.links.href('nil_alt_text.html').first.text) end def test_click_link @@ -46,4 +56,51 @@ def test_click_link assert_equal("http://localhost:#{PORT}/form_test.html", @agent.history.last.uri.to_s) end + + def test_click_method + page = @agent.get("http://localhost:#{PORT}/frame_test.html") + link = page.links.text("Form Test") + assert_not_nil(link) + assert_equal('Form Test', link.text) + page = link.click + assert_equal("http://localhost:#{PORT}/form_test.html", + @agent.history.last.uri.to_s) + end + + def test_find_bold_link + page = @agent.get("http://localhost:#{PORT}/tc_links.html") + link = page.links.text(/Bold Dude/) + assert_equal(1, link.length) + assert_equal('Bold Dude', link.first.text) + + link = page.links.text('Aaron James Patterson') + assert_equal(1, link.length) + assert_equal('Aaron James Patterson', link.first.text) + + link = page.links.text('Aaron Patterson') + assert_equal(1, link.length) + assert_equal('Aaron Patterson', link.first.text) + + link = page.links.text('Ruby Rocks!') + 
assert_equal(1, link.length) + assert_equal('Ruby Rocks!', link.first.text) + end + + def test_link_with_encoded_space + page = @agent.get("http://localhost:#{PORT}/tc_links.html") + link = page.links.text('encoded space').first + page = @agent.click link + end + + def test_link_with_space + page = @agent.get("http://localhost:#{PORT}/tc_links.html") + link = page.links.text('not encoded space').first + page = @agent.click link + end + + def test_link_with_unusual_characters + page = @agent.get("http://localhost:#{PORT}/tc_links.html") + link = page.links.text('unusual characters').first + assert_nothing_raised { @agent.click link } + end end diff --git a/test/tc_mech.rb b/test/tc_mech.rb index fd2dbaa4..715c7026 100644 --- a/test/tc_mech.rb +++ b/test/tc_mech.rb @@ -13,6 +13,26 @@ def setup @agent = WWW::Mechanize.new end + def test_weird_url + assert_nothing_raised { + @agent.get('http://localhost/?action=bing&bang=boom=1|a=|b=|c=') + } + assert_nothing_raised { + @agent.get('http://localhost/?a=b&b=c&c=d') + } + assert_nothing_raised { + @agent.get("http://localhost/?a=#{[0xd6].pack('U')}") + } + end + + def test_kcode_url + $KCODE = 'u' + page = @agent.get("http://localhost/?a=#{[0xd6].pack('U')}") + assert_not_nil(page) + assert_equal('http://localhost/?a=%D6', page.uri.to_s) + $KCODE = 'NONE' + end + def test_history 0.upto(25) do |i| assert_equal(i, @agent.history.size) @@ -24,6 +44,8 @@ def test_history @agent.history.last.uri.to_s) assert_equal("http://localhost:#{PORT}/", @agent.history[-2].uri.to_s) + assert_equal("http://localhost:#{PORT}/", + @agent.history[-2].uri.to_s) assert_equal(true, @agent.visited?("http://localhost:#{PORT}/")) assert_equal(true, @agent.visited?("/form_test.html")) @@ -32,6 +54,24 @@ def test_history end + def test_visited + @agent.get("http://localhost/content_type_test?ct=application/pdf") + assert_equal(true, + @agent.visited?("http://localhost/content_type_test?ct=application/pdf")) + assert_equal(false, + 
@agent.visited?("http://localhost/content_type_test")) + assert_equal(false, + @agent.visited?("http://localhost/content_type_test?ct=text/html")) + end + + def test_visited_after_redirect + @agent.get("http://localhost/response_code?code=302") + assert_equal("http://localhost/index.html", + @agent.current_page.uri.to_s) + assert_equal(true, + @agent.visited?('http://localhost/response_code?code=302')) + end + def test_max_history @agent.max_history = 10 0.upto(10) do |i| @@ -45,6 +85,23 @@ def test_max_history end end + def test_max_history_order + @agent.max_history = 2 + assert_equal(0, @agent.history.length) + + @agent.get('http://localhost/form_test.html') + assert_equal(1, @agent.history.length) + + @agent.get('http://localhost/empty_form.html') + assert_equal(2, @agent.history.length) + + @agent.get('http://localhost/tc_checkboxes.html') + assert_equal(2, @agent.history.length) + assert_equal('http://localhost/empty_form.html', @agent.history[0].uri.to_s) + assert_equal('http://localhost/tc_checkboxes.html', + @agent.history[1].uri.to_s) + end + def test_back_button 0.upto(5) do |i| assert_equal(i, @agent.history.size) diff --git a/test/tc_no_attributes.rb b/test/tc_no_attributes.rb index 7f79e09b..7589d940 100644 --- a/test/tc_no_attributes.rb +++ b/test/tc_no_attributes.rb @@ -14,7 +14,7 @@ def setup def test_parse_no_attributes assert_nothing_raised do - page = @agent.get("http://localhost:#{PORT}/tc_no_attributes.html") + page = @agent.get('http://localhost/tc_no_attributes.html') end end end diff --git a/test/tc_pluggable_parser.rb b/test/tc_pluggable_parser.rb index fc9b93a0..bd86a10f 100644 --- a/test/tc_pluggable_parser.rb +++ b/test/tc_pluggable_parser.rb @@ -63,7 +63,7 @@ def test_filter @agent.pluggable_parser.html = Filter page = @agent.get("http://localhost:#{PORT}/find_link.html") assert_kind_of(Filter, page) - assert_equal(16, page.links.length) + assert_equal(19, page.links.length) assert_not_nil(page.links.text('Net::DAAP::Client').first) 
assert_equal(1, page.links.text('Net::DAAP::Client').length) end @@ -74,7 +74,7 @@ def test_filter_hash assert_kind_of(Class, @agent.pluggable_parser['text/html']) assert_equal(Filter, @agent.pluggable_parser['text/html']) assert_kind_of(Filter, page) - assert_equal(16, page.links.length) + assert_equal(19, page.links.length) assert_not_nil(page.links.text('Net::DAAP::Client').first) assert_equal(1, page.links.text('Net::DAAP::Client').length) end diff --git a/test/tc_referer.rb b/test/tc_referer.rb new file mode 100644 index 00000000..5270fb99 --- /dev/null +++ b/test/tc_referer.rb @@ -0,0 +1,46 @@ +$:.unshift File.join(File.dirname(__FILE__), "..", "lib") + +require 'test/unit' +require 'rubygems' +require 'mechanize' +require 'test_includes' + +class RefererTest < Test::Unit::TestCase + include TestMethods + + def setup + @agent = WWW::Mechanize.new + end + + def test_no_referer + page = @agent.get("http://localhost:#{PORT}/referer") + assert_equal('', page.body) + end + + def test_send_referer + page = @agent.get("http://localhost:#{PORT}/tc_referer.html") + page = @agent.click page.links.first + assert_equal("http://localhost:#{PORT}/tc_referer.html", page.body) + end + + def test_fetch_two + page1 = @agent.get("http://localhost:#{PORT}/tc_referer.html") + page2 = @agent.get("http://localhost:#{PORT}/tc_pretty_print.html") + page = @agent.click page1.links.first + assert_equal("http://localhost:#{PORT}/tc_referer.html", page.body) + end + + def test_fetch_two_first + page1 = @agent.get("http://localhost:#{PORT}/tc_referer.html") + page2 = @agent.get("http://localhost:#{PORT}/tc_pretty_print.html") + page = @agent.click page1.links + assert_equal("http://localhost:#{PORT}/tc_referer.html", page.body) + end + + def test_post_form + page1 = @agent.get("http://localhost:#{PORT}/tc_referer.html") + page2 = @agent.get("http://localhost:#{PORT}/tc_pretty_print.html") + page = @agent.submit page1.forms.first + assert_equal("http://localhost:#{PORT}/tc_referer.html", 
page.body) + end +end diff --git a/test/tc_relative_links.rb b/test/tc_relative_links.rb new file mode 100644 index 00000000..defb8a6d --- /dev/null +++ b/test/tc_relative_links.rb @@ -0,0 +1,47 @@ +$:.unshift File.join(File.dirname(__FILE__), "..", "lib") + +require 'test/unit' +require 'rubygems' +require 'mechanize' +require 'test_includes' + +class TestRelativeLinks < Test::Unit::TestCase + include TestMethods + + def setup + @agent = WWW::Mechanize.new + end + + def test_dot_dot_slash + @page = @agent.get("http://localhost/relative/tc_relative_links.html") + page = @page.links.first.click + assert_equal('http://localhost/tc_relative_links.html', @agent.current_page.uri.to_s) + end + + def test_too_many_dots + @page = @agent.get("http://localhost/relative/tc_relative_links.html") + page = @page.links.text('too many dots').click + assert_not_nil(page) + assert_equal('http://localhost/tc_relative_links.html', page.uri.to_s) + end + + def test_go_forward + @page = @agent.get("http://localhost/tc_relative_links.html") + @page = @page.links.first.click + assert_equal('http://localhost/relative/tc_relative_links.html', @agent.current_page.uri.to_s) + end + + def test_frame_dot_dot_slash + @page = @agent.get("http://localhost/relative/tc_relative_links.html") + page = @agent.click(@page.frames.text('frame1')) + assert_equal('http://localhost/tc_relative_links.html', @agent.current_page.uri.to_s) + end + + def test_frame_forward_back_forward + @page = @agent.get("http://localhost/tc_relative_links.html") + page1 = @agent.click @page.frames.name('frame1') + assert_equal('http://localhost/relative/tc_relative_links.html', @agent.current_page.uri.to_s) + page2 = @agent.click @page.frames.name('frame2') + assert_equal('http://localhost/relative/tc_relative_links.html', @agent.current_page.uri.to_s) + end +end diff --git a/test/tc_response_code.rb b/test/tc_response_code.rb index 2de7c6dd..2e8533ba 100644 --- a/test/tc_response_code.rb +++ b/test/tc_response_code.rb @@ 
-13,6 +13,10 @@ def setup end def test_redirect + @agent.get("http://localhost:#{PORT}/response_code?code=300") + assert_equal("http://localhost:#{PORT}/index.html", + @agent.current_page.uri.to_s) + @agent.get("http://localhost:#{PORT}/response_code?code=301") assert_equal("http://localhost:#{PORT}/index.html", @agent.current_page.uri.to_s) @@ -20,6 +24,22 @@ def test_redirect @agent.get("http://localhost:#{PORT}/response_code?code=302") assert_equal("http://localhost:#{PORT}/index.html", @agent.current_page.uri.to_s) + + @agent.get("http://localhost:#{PORT}/response_code?code=303") + assert_equal("http://localhost:#{PORT}/index.html", + @agent.current_page.uri.to_s) + + @agent.get("http://localhost:#{PORT}/response_code?code=307") + assert_equal("http://localhost:#{PORT}/index.html", + @agent.current_page.uri.to_s) + end + + def test_do_not_follow_redirect + @agent.redirect_ok = false + + @agent.get("http://localhost:#{PORT}/response_code?code=302") + assert_equal("http://localhost:#{PORT}/response_code?code=302", + @agent.current_page.uri.to_s) end def test_error diff --git a/test/tc_save_file.rb b/test/tc_save_file.rb index 2c1afa1c..c563081a 100644 --- a/test/tc_save_file.rb +++ b/test/tc_save_file.rb @@ -22,4 +22,35 @@ def test_save_file FileUtils.rm("test.html") assert_equal(length.to_i, file_length) end + + def test_save_file_default + page = WWW::Mechanize::File.new( + URI.parse('http://localhost/test.html'), + {}, + "hello" + ) + page.save + assert(File.exists?('test.html')) + page.save + assert(File.exists?('test.html.1')) + page.save + assert(File.exists?('test.html.2')) + FileUtils.rm("test.html") + FileUtils.rm("test.html.1") + FileUtils.rm("test.html.2") + end + + def test_save_file_default_with_dots + page = WWW::Mechanize::File.new( + URI.parse('http://localhost/../test.html'), + {}, + "hello" + ) + page.save + assert(File.exists?('test.html')) + page.save + assert(File.exists?('test.html.1')) + FileUtils.rm("test.html") + 
FileUtils.rm("test.html.1") + end end diff --git a/test/tc_subclass.rb b/test/tc_subclass.rb new file mode 100644 index 00000000..4804845c --- /dev/null +++ b/test/tc_subclass.rb @@ -0,0 +1,28 @@ +$:.unshift File.join(File.dirname(__FILE__), "..", "lib") + +require 'test/unit' +require 'rubygems' +require 'mechanize' +require 'test_includes' + +class MechSubclass < WWW::Mechanize + def set_headers(uri, request, cur_page) + super(uri, request, cur_page) + request.add_field('Cookie', 'name=Aaron') + request + end +end + +class TestSubclass < Test::Unit::TestCase + include TestMethods + + def setup + @agent = MechSubclass.new + end + + def test_send_cookie + page = @agent.get("http://localhost:#{PORT}/send_cookies") + assert_equal(1, page.links.length) + assert_not_nil(page.links.find { |l| l.text == "name:Aaron" }) + end +end diff --git a/test/tc_upload.rb b/test/tc_upload.rb index 9d24977f..3a61f27f 100644 --- a/test/tc_upload.rb +++ b/test/tc_upload.rb @@ -10,99 +10,94 @@ class UploadMechTest < Test::Unit::TestCase def setup @agent = WWW::Mechanize.new + @page = @agent.get("http://localhost:#{PORT}/file_upload.html") end def test_form_enctype - page = @agent.get("http://localhost:#{PORT}/file_upload.html") - assert_equal('multipart/form-data', page.forms[0].enctype) + assert_equal('multipart/form-data', @page.forms[0].enctype) - form = page.forms.first - form.file_uploads.first.file_name = "README" + form = @page.forms.first + form.file_uploads.first.file_name = "#{BASE_DIR}/test_all.rb" form.file_uploads.first.mime_type = "text/plain" form.file_uploads.first.file_data = "Hello World\n\n" - page = @agent.submit(form) + @page = @agent.submit(form) assert_match( - "Content-Disposition: form-data; name=\"userfile1\"; filename=\"README\"", - page.body + "Content-Disposition: form-data; name=\"userfile1\"; filename=\"test_all.rb\"", + @page.body ) assert_match( "Content-Disposition: form-data; name=\"name\"", - page.body + @page.body ) - assert_match('Content-Type: 
text/plain', page.body) - assert_match('Hello World', page.body) - assert_match('foo[aaron]', page.body) + assert_match('Content-Type: text/plain', @page.body) + assert_match('Hello World', @page.body) + assert_match('foo[aaron]', @page.body) end def test_form_multipart - page = @agent.get("http://localhost:#{PORT}/file_upload.html") - assert_equal('multipart/form-data', page.forms[1].enctype) + assert_equal('multipart/form-data', @page.forms[1].enctype) - form = page.forms[1] - form.file_uploads.first.file_name = "README" + form = @page.forms[1] + form.file_uploads.first.file_name = "#{BASE_DIR}/test_all.rb" form.file_uploads.first.mime_type = "text/plain" form.file_uploads.first.file_data = "Hello World\n\n" - page = @agent.submit(form) + @page = @agent.submit(form) assert_match( - "Content-Disposition: form-data; name=\"green[eggs]\"; filename=\"README\"", - page.body + "Content-Disposition: form-data; name=\"green[eggs]\"; filename=\"test_all.rb\"", + @page.body ) end def test_form_read_file - page = @agent.get("http://localhost:#{PORT}/file_upload.html") - assert_equal('multipart/form-data', page.forms[1].enctype) + assert_equal('multipart/form-data', @page.forms[1].enctype) - form = page.forms[1] - form.file_uploads.first.file_name = "README" + form = @page.forms[1] + form.file_uploads.first.file_name = "#{BASE_DIR}/test_all.rb" - page = @agent.submit(form) + @page = @agent.submit(form) - contents = File.open("README", 'rb') { |f| f.read } + contents = File.open("#{BASE_DIR}/test_all.rb", 'rb') { |f| f.read } assert_match( - "Content-Disposition: form-data; name=\"green[eggs]\"; filename=\"README\"", - page.body + "Content-Disposition: form-data; name=\"green[eggs]\"; filename=\"test_all.rb\"", + @page.body ) - assert_match(contents, page.body) + assert_match(contents, @page.body) end def test_form_io_obj - page = @agent.get("http://localhost:#{PORT}/file_upload.html") - assert_equal('multipart/form-data', page.forms[1].enctype) + 
assert_equal('multipart/form-data', @page.forms[1].enctype) - form = page.forms[1] - form.file_uploads.first.file_name = "README" - form.file_uploads.first.file_data = File.open("README", 'rb') + form = @page.forms[1] + form.file_uploads.first.file_name = "#{BASE_DIR}/test_all.rb" + form.file_uploads.first.file_data = File.open("#{BASE_DIR}/test_all.rb", 'rb') - page = @agent.submit(form) + @page = @agent.submit(form) - contents = File.open("README", 'rb') { |f| f.read } + contents = File.open("#{BASE_DIR}/test_all.rb", 'rb') { |f| f.read } assert_match( - "Content-Disposition: form-data; name=\"green[eggs]\"; filename=\"README\"", - page.body + "Content-Disposition: form-data; name=\"green[eggs]\"; filename=\"test_all.rb\"", + @page.body ) - assert_match(contents, page.body) + assert_match(contents, @page.body) end def test_submit_no_file - page = @agent.get("http://localhost:#{PORT}/file_upload.html") - form = page.forms.first + form = @page.forms.first form.fields.name('name').value = 'Aaron' - page = @agent.submit(form) - assert_match('Aaron', page.body) + @page = @agent.submit(form) + assert_match('Aaron', @page.body) assert_match( "Content-Disposition: form-data; name=\"userfile1\"; filename=\"\"", - page.body + @page.body ) end def test_no_value - page = @agent.get("http://localhost:#{PORT}/file_upload.html") - form = page.form('value_test') + form = @page.form('value_test') assert_nil(form.file_uploads.first.value) assert_nil(form.file_uploads.first.file_name) end diff --git a/test/tc_watches.rb b/test/tc_watches.rb index 58a28bbc..3b6d114d 100644 --- a/test/tc_watches.rb +++ b/test/tc_watches.rb @@ -9,7 +9,7 @@ class Area attr_reader :name def initialize(node) - @name = node.attributes['name'] + @name = node['name'] end end diff --git a/test/ts_mech.rb b/test/test_all.rb similarity index 72% rename from test/ts_mech.rb rename to test/test_all.rb index b8b62e0e..2b2396b4 100644 --- a/test/ts_mech.rb +++ b/test/test_all.rb @@ -1,45 +1,44 @@ $:.unshift 
File.join(File.dirname(__FILE__), "..", "lib") $:.unshift File.join(File.dirname(__FILE__), "..", "test") -Thread.new { - require 'server' -} - -#Thread.new { -# require 'ssl_server' -#} - -require 'test/unit' -require 'tc_cookies' -require 'tc_forms' -require 'tc_mech' -require 'tc_links' -require 'tc_response_code' -require 'tc_upload' -require 'tc_forms' -require 'tc_watches' require 'tc_authenticate' +require 'tc_bad_links' +require 'tc_blank_form' +require 'tc_checkboxes' require 'tc_cookie_class' require 'tc_cookie_jar' +require 'tc_cookies' require 'tc_errors' -require 'tc_save_file' -require 'tc_post_form' -require 'tc_pluggable_parser' -require 'tc_page' +require 'tc_follow_meta' +require 'tc_form_action' +require 'tc_form_as_hash' +require 'tc_form_button' require 'tc_form_no_inputname' +require 'tc_forms' +require 'tc_gzipping' +require 'tc_history' +require 'tc_html_unscape_forms' +require 'tc_if_modified_since' +require 'tc_links' +require 'tc_mech' require 'tc_multi_select' +require 'tc_no_attributes' +require 'tc_page' +require 'tc_pluggable_parser' +require 'tc_post_form' +require 'tc_pretty_print' +require 'tc_radiobutton' +require 'tc_referer' +require 'tc_relative_links' +require 'tc_response_code' +require 'tc_save_file' +require 'tc_select' require 'tc_select_all' require 'tc_select_none' -require 'tc_select' require 'tc_select_noopts' require 'tc_set_fields' -require 'tc_bad_links' -require 'tc_radiobutton' -require 'tc_checkboxes' -require 'tc_pretty_print' +require 'tc_subclass' require 'tc_textarea' -require 'tc_no_attributes' -require 'tc_gzipping' -#require 'tc_proxy' -#require 'tc_ssl_server' - +require 'tc_upload' +require 'tc_watches' +require 'test_mechanize_file' diff --git a/test/test_includes.rb b/test/test_includes.rb index 382aaf8b..768256f5 100644 --- a/test/test_includes.rb +++ b/test/test_includes.rb @@ -1,5 +1,121 @@ +require 'net/http' +require 'test_servlets' +require 'webrick/httputils' + +BASE_DIR = File.dirname(__FILE__) 
+ +class Net::HTTP + #def self.new(*args) + # obj = allocate + # return obj + #end + + alias :old_do_start :do_start + + def do_start + @started = true + end + + SERVLETS = { + '/gzip' => GzipServlet, + '/form_post' => FormTest, + '/basic_auth' => BasicAuthServlet, + '/form post' => FormTest, + '/response_code' => ResponseCodeTest, + '/bad_content_type' => BadContentTypeTest, + '/content_type_test' => ContentTypeTest, + '/referer' => RefererServlet, + '/file_upload' => FileUploadTest, + '/one_cookie' => OneCookieTest, + '/one_cookie_no_space' => OneCookieNoSpacesTest, + '/many_cookies' => ManyCookiesTest, + '/many_cookies_as_string' => ManyCookiesAsStringTest, + '/send_cookies' => SendCookiesTest, + '/if_modified_since' => ModifiedSinceServlet, + '/http_headers' => HeaderServlet, + } + + PAGE_CACHE = {} + + alias :old_request :request + + def request(request, *data, &block) + url = URI.parse(request.path) + path = URI.unescape(url.path) + + path = '/index.html' if path == '/' + + res = Response.new + request.query = WEBrick::HTTPUtils.parse_query(url.query) + request.cookies = WEBrick::Cookie.parse(request['Cookie']) + if SERVLETS[path] + if request.method == "POST" + if request['Content-Type'] =~ /^multipart\/form-data/ + request.body = data.first + else + request.query = WEBrick::HTTPUtils.parse_query(data.first) + end + end + SERVLETS[path].new({}).send("do_#{request.method}", request, res) + else + filename = "htdocs#{path.gsub(/[^\/\\.\w_\s]/, '_')}" + unless PAGE_CACHE[filename] + File.open("#{BASE_DIR}/#{filename}", 'rb') { |file| + PAGE_CACHE[filename] = file.read + } + end + res.body = PAGE_CACHE[filename] + end + + res['Content-Type'] ||= 'text/html' + res['Content-Length'] ||= res.body.length.to_s + res.code ||= "200" + + res.cookies.each do |cookie| + res.add_field('Set-Cookie', cookie.to_s) + end + yield res if block_given? 
+ res + end +end + +class Net::HTTPRequest + attr_accessor :query, :body, :cookies, :user +end + +class Response + include Net::HTTPHeader + + attr_reader :code + attr_accessor :body, :query, :cookies + + def code=(c) + @code = c.to_s + end + + alias :status :code + alias :status= :code= + + def initialize + @header = {} + @body = '' + @code = nil + @query = nil + @cookies = [] + end + + def read_body + yield body + end +end + + module TestMethods PORT = 2000 PROXYPORT = 2001 SSLPORT = 2002 + + def html_response + { 'content-type' => 'text/html' } + end end diff --git a/test/test_mechanize_file.rb b/test/test_mechanize_file.rb new file mode 100644 index 00000000..9b0e821e --- /dev/null +++ b/test/test_mechanize_file.rb @@ -0,0 +1,52 @@ +$:.unshift File.join(File.dirname(__FILE__), "..", "lib") + +require 'test/unit' +require 'rubygems' +require 'mechanize' +require 'test_includes' + +class MechanizeFileTest < Test::Unit::TestCase + def test_content_disposition + file = WWW::Mechanize::File.new( + URI.parse('http://localhost/foo'), + { 'content-disposition' => 'attachment; filename=genome.jpeg; modification-date="Wed, 12 Feb 1997 16:29:51 -0500"', } + ) + assert_equal('genome.jpeg', file.filename) + + file = WWW::Mechanize::File.new( + URI.parse('http://localhost/foo'), + { 'content-disposition' => 'filename=genome.jpeg; modification-date="Wed, 12 Feb 1997 16:29:51 -0500"', } + ) + assert_equal('genome.jpeg', file.filename) + + file = WWW::Mechanize::File.new( + URI.parse('http://localhost/foo'), + { 'content-disposition' => 'filename=genome.jpeg', } + ) + assert_equal('genome.jpeg', file.filename) + end + + def test_from_uri + file = WWW::Mechanize::File.new( + URI.parse('http://localhost/foo'), + {} + ) + assert_equal('foo.html', file.filename) + + file = WWW::Mechanize::File.new( + URI.parse('http://localhost/foo.jpg'), + {} + ) + assert_equal('foo.jpg', file.filename) + + file = WWW::Mechanize::File.new( + URI.parse('http://localhost/foo.jpg') + ) + 
assert_equal('foo.jpg', file.filename) + end + + def test_no_uri + file = WWW::Mechanize::File.new() + assert_equal('index.html', file.filename) + end +end diff --git a/test/servlets.rb b/test/test_servlets.rb similarity index 73% rename from test/servlets.rb rename to test/test_servlets.rb index 72847812..1fac8355 100644 --- a/test/servlets.rb +++ b/test/test_servlets.rb @@ -3,11 +3,76 @@ require 'date' require 'zlib' require 'stringio' +require 'base64' + +class BasicAuthServlet < WEBrick::HTTPServlet::AbstractServlet + def do_GET(req,res) + htpd = WEBrick::HTTPAuth::Htpasswd.new('dot.htpasswd') + htpd.set_passwd('Blah', 'user', 'pass') + authenticator = WEBrick::HTTPAuth::BasicAuth.new({ + :UserDB => htpd, + :Realm => 'Blah', + :Logger => Logger.new(nil) + } + ) + begin + authenticator.authenticate(req,res) + res.body = 'You are authenticated' + rescue WEBrick::HTTPStatus::Unauthorized => ex + res.status = 401 + end + FileUtils.rm('dot.htpasswd') + end +end + +class HeaderServlet < WEBrick::HTTPServlet::AbstractServlet + def do_GET(req, res) + res['Content-Type'] = "text/html" + body = '' + req.each_header do |k,v| + body << "#{k}|#{v}\n" + end + res.body = body + end +end + +class RefererServlet < WEBrick::HTTPServlet::AbstractServlet + def do_GET(req, res) + res['Content-Type'] = "text/html" + res.body = req['Referer'] || '' + end + + def do_POST(req, res) + res['Content-Type'] = "text/html" + res.body = req['Referer'] || '' + end +end + +class ModifiedSinceServlet < WEBrick::HTTPServlet::AbstractServlet + def do_GET(req, res) + s_time = 'Fri, 04 May 2001 00:00:38 GMT' + + my_time = Time.parse(s_time) + + if req['If-Modified-Since'] + your_time = Time.parse(req['If-Modified-Since']) + if my_time > your_time + res.body = 'This page was updated since you requested' + else + res.status = 304 + end + else + res.body = 'You did not send an If-Modified-Since header' + end + + res['Last-Modified'] = s_time + end +end class GzipServlet < 
WEBrick::HTTPServlet::AbstractServlet def do_GET(req, res) if req['Accept-Encoding'] =~ /gzip/ - File.open("htdocs/#{req.query['file']}", 'r') do |file| + File.open("#{BASE_DIR}/htdocs/#{req.query['file']}", 'r') do |file| string = "" zipped = StringIO.new string, 'w' gz = Zlib::GzipWriter.new(zipped) @@ -46,13 +111,11 @@ def do_POST(req, res) class ResponseCodeTest < WEBrick::HTTPServlet::AbstractServlet def do_GET(req, res) - res['Content-Type'] = "text/html" + res['Content-Type'] = req.query['ct'] || "text/html" if req.query['code'] code = req.query['code'].to_i case code - when 301 - res['Location'] = "/index.html" - when 302 + when 300, 301, 302, 303, 304, 305, 307 res['Location'] = "/index.html" end res.status = code