Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Option to use Ox as a SAX handler #49

Merged
merged 4 commits into from
Jul 18, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ group :development, :test do
gem 'guard-rspec'
gem 'simplecov', require: false, platforms: :mri
gem 'activerecord', '~> 4.1'
gem 'ox', '>= 2.1.2'
end
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

## Description

A declarative SAX parsing library backed by Nokogiri
A declarative SAX parsing library backed by Nokogiri or Ox

## Usage
```ruby
Expand Down Expand Up @@ -104,4 +104,4 @@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
14 changes: 12 additions & 2 deletions lib/sax-machine.rb
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
require "sax-machine/version"
require "sax-machine/sax_document"
require "sax-machine/sax_configure"
require "sax-machine/sax_handler"
require "sax-machine/sax_config"
require "sax-machine/handlers/sax_abstract_handler"
require "sax-machine/handlers/sax_nokogiri_handler"

module SAXMachine
end
@@handler = :nokogiri

def self.handler
@@handler
end

def self.handler=(handler)
@@handler = handler
end
end
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
require "nokogiri"
require "time"

module SAXMachine
class SAXHandler < Nokogiri::XML::SAX::Document
module SAXAbstractHandler
NO_BUFFER = :no_buffer

class StackNode < Struct.new(:object, :config, :buffer)
Expand All @@ -13,14 +12,14 @@ def initialize(object, config = nil, buffer = NO_BUFFER)
end
end

def initialize(object, on_error = nil, on_warning = nil)
def _initialize(object, on_error = nil, on_warning = nil)
@stack = [ StackNode.new(object) ]
@parsed_configs = {}
@on_error = on_error
@on_warning = on_warning
end

def characters(data)
def _characters(data)
node = stack.last

if node.buffer == NO_BUFFER
Expand All @@ -29,10 +28,8 @@ def characters(data)
node.buffer << data
end
end
alias cdata_block characters

def start_element(name, attrs = [])

def _start_element(name, attrs = [])
name = normalize_name(name)
node = stack.last
object = node.object
Expand Down Expand Up @@ -76,7 +73,7 @@ def start_element(name, attrs = [])
end
end

def end_element(name)
def _end_element(name)
name = normalize_name(name)

start_tag = stack[-2]
Expand Down Expand Up @@ -134,30 +131,29 @@ def end_element(name)
stack.pop
end

private

def mark_as_parsed(object, element_config)
unless element_config.collection?
@parsed_configs[[object.object_id, element_config.object_id]] = true
def _error(string)
if @on_error
@on_error.call(string)
end
end

def parsed_config?(object, element_config)
@parsed_configs[[object.object_id, element_config.object_id]]
end

def warning(string)
def _warning(string)
if @on_warning
@on_warning.call(string)
end
end

def error(string)
if @on_error
@on_error.call(string)
private

def mark_as_parsed(object, element_config)
unless element_config.collection?
@parsed_configs[[object.object_id, element_config.object_id]] = true
end
end

def parsed_config?(object, element_config)
@parsed_configs[[object.object_id, element_config.object_id]]
end

def sax_config_for(object)
if object.class.respond_to?(:sax_config)
Expand Down
15 changes: 15 additions & 0 deletions lib/sax-machine/handlers/sax_nokogiri_handler.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
require "nokogiri"

module SAXMachine
  # Nokogiri-backed SAX handler.
  #
  # Subclasses Nokogiri::XML::SAX::Document and mixes in SAXAbstractHandler.
  # It defines no parsing logic of its own: each callback name below is an
  # alias of an underscore-prefixed method expected from SAXAbstractHandler.
  class SAXNokogiriHandler < Nokogiri::XML::SAX::Document
    include SAXAbstractHandler

    # [callback name, underscore-prefixed implementation it aliases].
    # Character data and CDATA blocks share the same implementation.
    [
      [:initialize,    :_initialize],
      [:characters,    :_characters],
      [:cdata_block,   :_characters],
      [:start_element, :_start_element],
      [:end_element,   :_end_element],
      [:error,         :_error],
      [:warning,       :_warning]
    ].each do |callback, implementation|
      alias_method callback, implementation
    end
  end
end
40 changes: 40 additions & 0 deletions lib/sax-machine/handlers/sax_ox_handler.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
require "ox"

module SAXMachine
  # Ox-backed SAX handler.
  #
  # Subclasses Ox::Sax and mixes in SAXAbstractHandler. The element name and
  # its attributes arrive through separate callbacks (start_element, attr),
  # so they are buffered in @element / @attrs and handed to _start_element
  # in one call from attrs_done.
  # NOTE(review): this relies on Ox invoking start_element, then attr once per
  # attribute, then attrs_done -- confirm against the Ox::Sax documentation.
  class SAXOxHandler < Ox::Sax
    include SAXAbstractHandler

    # Callbacks that need no adaptation are plain aliases of the shared
    # implementation; text and CDATA both go to _characters.
    [:text, :cdata].each { |callback| alias_method callback, :_characters }
    alias_method :end_element, :_end_element

    # Forwards every argument to _initialize, then starts with an empty
    # element buffer.
    def initialize(*args)
      _initialize(*args)
      _reset_element
    end

    # Records the name of the element being opened; nothing is forwarded yet.
    def start_element(name)
      @element = name
    end

    # Buffers a single attribute (name => str) of the pending element.
    def attr(name, str)
      @attrs[name] = str
    end

    # Forwards the buffered element name and attributes to _start_element,
    # then clears the buffer.
    def attrs_done
      _start_element(@element, @attrs)
      _reset_element
    end

    # Combines the message with its line/column position into one string and
    # passes that string to _error.
    def error(message, line, column)
      description = "#{message} on line #{line} column #{column}"
      _error(description)
    end

    private

    # Resets the pending-element buffer to a blank name and no attributes.
    def _reset_element
      @element = ""
      @attrs = {}
    end
  end
end
21 changes: 17 additions & 4 deletions lib/sax-machine/sax_document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,24 @@ def self.included(base)
end

def parse(xml_text, on_error = nil, on_warning = nil)
sax_handler = SAXHandler.new(self, on_error, on_warning)
parser = Nokogiri::XML::SAX::Parser.new(sax_handler)
parser.parse(xml_text) do |ctx|
ctx.replace_entities = true
if SAXMachine.handler == :ox
Ox.sax_parse(
SAXOxHandler.new(self, on_error, on_warning),
StringIO.new(xml_text),
{
symbolize: false,
convert_special: true,
skip: :skip_return,
}
)
else
handler = SAXNokogiriHandler.new(self, on_error, on_warning)
parser = Nokogiri::XML::SAX::Parser.new(handler)
parser.parse(xml_text) do |ctx|
ctx.replace_entities = true
end
end

self
end

Expand Down
2 changes: 1 addition & 1 deletion sax-machine.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Gem::Specification.new do |s|
s.email = %q{paul@pauldix.net}
s.homepage = %q{http://github.com/pauldix/sax-machine}

s.summary = %q{Declarative SAX Parsing with Nokogiri}
s.summary = %q{Declarative SAX Parsing with Nokogiri or Ox}

s.license = %q{MIT}

Expand Down
4 changes: 2 additions & 2 deletions spec/benchmarks/benchmark.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class Atom
elements :entry, :as => :entries, :class => AtomEntry
end
end
feed_text = File.read("spec/sax-machine/atom.xml")
feed_text = File.read("spec/fixtures/atom.xml")

benchmark do |t|
t.report("feedzirra") do
Expand Down Expand Up @@ -65,7 +65,7 @@ class Atom
# element :title, String
# has_many :entry, Entry
# end
# feed_text = File.read("spec/sax-machine/atom.xml")
# feed_text = File.read("spec/fixtures/atom.xml")
#
# benchmark do |t|
# t.report("sax-machine") do
Expand Down
15 changes: 15 additions & 0 deletions spec/fixtures/atom-content.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@

<div xmlns="http://www.w3.org/1999/xhtml"><p>In my previous <a href="http://www.pauldix.net/2008/08/serializing-dat.html">post about the speed of serializing data</a>, I concluded that Marshal was the quickest way to get things done. So I set about using Marshal to store some data in an ActiveRecord object. Things worked great at first, but on some test data I got this error: marshal data too short. Luckily, <a href="http://www.brynary.com/">Bryan Helmkamp</a> had helpfully pointed out that there were sometimes problems with storing marshaled data in the database. He said it was best to base64 encode the marshal dump before storing.</p>

<p>I was curious why it was working on some things and not others. It turns out that some types of data being marshaled were causing the error to pop up. Here's the test data I used in my specs:</p>
<pre>{ :foo =&gt; 3, :bar =&gt; 2 } # hash with symbols for keys and integer values<br />[3, 2.1, 4, 8]&nbsp; &nbsp;&nbsp; &nbsp;&nbsp; &nbsp;&nbsp; # array with integer and float values</pre>
<p>Everything worked when I switched the array values to all integers so it seems that floats were causing the problem. However, in the interest of keeping everything working regardless of data types, I base64 encoded before going into the database and decoded on the way out.</p>

<p>I also ran the benchmarks again to determine what impact this would have on speed. Here are the results for 100 iterations on a 10k element array and a 10k element hash with and without base64 encode/decode:</p>
<pre>&nbsp; &nbsp;&nbsp; &nbsp;&nbsp; &nbsp;&nbsp; &nbsp;&nbsp; &nbsp; user&nbsp; &nbsp;&nbsp; &nbsp; system&nbsp; &nbsp;&nbsp; total&nbsp; &nbsp;&nbsp; &nbsp; real<br />array marshal&nbsp; 0.200000&nbsp; &nbsp;0.010000&nbsp; &nbsp;0.210000 (&nbsp; 0.214018) (without Base64)<br />array marshal&nbsp; 0.220000&nbsp; &nbsp;0.010000&nbsp; &nbsp;0.230000 (&nbsp; 0.250260)<br /><br />hash marshal&nbsp; &nbsp;1.830000&nbsp; &nbsp;0.040000&nbsp; &nbsp;1.870000 (&nbsp; 1.892874) (without Base64)<br />hash marshal&nbsp; &nbsp;2.040000&nbsp; &nbsp;0.100000&nbsp; &nbsp;2.140000 (&nbsp; 2.170405)</pre>
<p>As you can see the difference in speed is pretty negligible. I assume that the error has to do with AR cleaning the stuff that gets inserted into the database, but I'm not really sure. In the end it's just easier to use Base64.encode64 when serializing data into a text field in ActiveRecord using Marshal.</p>

<p>I've also read people posting about this error when using the database session store. I can only assume that it's because they were trying to store either way too much data in their session (too much for a regular text field) or they were storing float values or some other data type that would cause this to pop up. Hopefully this helps.</p></div>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?a=rWfWO"><img src="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?i=rWfWO" border="0"></img></a> <a href="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?a=RaCqo"><img src="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?i=RaCqo" border="0"></img></a> <a href="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?a=1CBLo"><img src="http://feeds.feedburner.com/~f/PaulDixExplainsNothing?i=1CBLo" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/PaulDixExplainsNothing/~4/383536354" height="1" width="1"/>
File renamed without changes.
55 changes: 40 additions & 15 deletions spec/sax-machine/sax_document_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def title=(val)
it "should be available" do
@klass.data_class(:date).should == DateTime
end

it "should handle an integer class" do
@klass = Class.new do
include SAXMachine
Expand All @@ -73,15 +73,15 @@ def title=(val)
document = @klass.parse("<number>5</number>")
document.number.should == 5
end

it "should handle an float class" do
@klass = Class.new do
include SAXMachine
element :number, :class => Float
end
document = @klass.parse("<number>5.5</number>")
document.number.should == 5.5
end
end

it "should handle an string class" do
@klass = Class.new do
Expand All @@ -91,7 +91,7 @@ def title=(val)
document = @klass.parse("<number>5.5</number>")
document.number.should == "5.5"
end

it "should handle a time class" do
@klass = Class.new do
include SAXMachine
Expand All @@ -100,7 +100,7 @@ def title=(val)
document = @klass.parse("<time>1994-02-04T06:20:00Z</time>")
document.time.should == Time.utc(1994, 2, 4, 6, 20, 0, 0)
end

end
describe "the required attribute" do
it "should be available" do
Expand Down Expand Up @@ -466,7 +466,7 @@ class Item
elements :item, :as => :items, :with => {:type => /Foo/}, :class => Foo
end
end

it "should cast into the correct class" do
document = @klass.parse("<items><item type=\"Bar\"><title>Bar title</title></item><item type=\"Foo\"><title>Foo title</title></item></items>")
document.items.size.should == 2
Expand Down Expand Up @@ -536,22 +536,22 @@ class Foo
end
end
end

describe "when dealing with element names containing dashes" do
it 'should automatically convert dashes to underscores' do
it 'should automatically convert dashes to underscores' do
class Dashes
include SAXMachine
element :dashed_element
end

parsed = Dashes.parse('<dashed-element>Text</dashed-element>')
parsed.dashed_element.should eq "Text"
end
end

describe "full example" do
before :each do
@xml = File.read('spec/sax-machine/atom.xml')
@xml = File.read('spec/fixtures/atom.xml')
class AtomEntry
include SAXMachine
element :title
Expand All @@ -570,17 +570,21 @@ class Atom
element :link, :value => :href, :as => :feed_url, :with => {:type => "application/atom+xml"}
elements :entry, :as => :entries, :class => AtomEntry
end

@feed = Atom.parse(@xml)
end # before

it "should parse the url" do
f = Atom.parse(@xml)
f.url.should == "http://www.pauldix.net/"
@feed.url.should == "http://www.pauldix.net/"
end

it "should parse entry url" do
f = Atom.parse(@xml)
f.entries.first.url.should == "http://www.pauldix.net/2008/09/marshal-data-to.html?param1=1&param2=2"
f.entries.first.alternate.should == "http://feeds.feedburner.com/~r/PaulDixExplainsNothing/~3/383536354/marshal-data-to.html?param1=1&param2=2"
@feed.entries.first.url.should == "http://www.pauldix.net/2008/09/marshal-data-to.html?param1=1&param2=2"
@feed.entries.first.alternate.should == "http://feeds.feedburner.com/~r/PaulDixExplainsNothing/~3/383536354/marshal-data-to.html?param1=1&param2=2"
end

it "should parse content" do
@feed.entries.first.content.should == File.read('spec/fixtures/atom-content.html')
end
end

Expand Down Expand Up @@ -835,4 +839,25 @@ def title=(blah)
@item.authors.last.role.should == 'artist'
end
end

# Checks that parsing malformed XML reports an error through the lambda
# passed as the second argument to parse.
describe "with error handling" do
before do
# Malformed on purpose: <item> is opened but never closed.
@xml = %[
<item id="1">
<title>sweet</title>
]

class ItemElement5
include SAXMachine
element :title
end

# Every message handed to the lambda is collected in @errors.
@errors = []
@item = ItemElement5.parse(@xml, ->(x) { @errors << x })
end

it 'should have error' do
# uniq collapses identical messages, so this asserts exactly one distinct
# error message was collected.
@errors.uniq.size.should == 1
end
end
end
Loading