Skip to content

Commit

Permalink
debug handling of Word HTML namespaces: metanorma/metanorma#363
Browse files: browse the repository at this point in the history
  • Loading branch information
opoudjis committed Apr 9, 2024
1 parent eacbb04 commit 40e5db4
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 18 deletions.
4 changes: 2 additions & 2 deletions html2doc.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ Gem::Specification.new do |spec|
spec.add_dependency "mime-types"
spec.add_dependency "nokogiri", "~> 1.15"
spec.add_dependency "plane1converter", "~> 0.0.1"
spec.add_dependency "plurimath", "~> 0.7.0"
spec.add_dependency "plurimath", "~> 0.8.0"
spec.add_dependency "thread_safe"
spec.add_dependency "uuidtools"
spec.add_dependency "unitsml"
Expand All @@ -43,7 +43,7 @@ Gem::Specification.new do |spec|
spec.add_development_dependency "guard-rspec", "~> 4.7"
spec.add_development_dependency "rake", "~> 12.0"
spec.add_development_dependency "rspec", "~> 3.6"
spec.add_development_dependency "rspec-match_fuzzy", "~> 0.1.3"
spec.add_development_dependency "rspec-match_fuzzy", "~> 0.2.0"
spec.add_development_dependency "rubocop", "~> 1.5.2"
spec.add_development_dependency "simplecov", "~> 0.15"
spec.add_development_dependency "timecop", "~> 0.9"
Expand Down
2 changes: 1 addition & 1 deletion lib/html2doc/mime.rb
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def rename_image(img, dir, localdir)

def skip_image_cleanup?(img)
src = img["src"]
(img.element? && %w(img v:imagedata).include?(img.name)) or return true
(img.element? && %w(img imagedata).include?(img.name)) or return true
(src.nil? || src.empty? || /^http/.match?(src) ||
%r{^data:(image|application)/[^;]+;base64}.match?(src)) and return true
false
Expand Down
28 changes: 14 additions & 14 deletions spec/html2doc_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

def html_input(xml)
<<~HTML
<html><head><title>blank</title>
<html xmlns:epub="http://www.idpf.org/2007/ops" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" lang="en" xml:lang="en"><head><title>blank</title>
<meta name="Originator" content="Me"/>
</head>
<body>
Expand All @@ -13,7 +13,7 @@ def html_input(xml)

def html_input_no_title(xml)
<<~HTML
<html><head>
<html xmlns:epub="http://www.idpf.org/2007/ops" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" lang="en" xml:lang="en"><head>
<meta name="Originator" content="Me"/>
</head>
<body>
Expand All @@ -24,7 +24,7 @@ def html_input_no_title(xml)

def html_input_empty_head(xml)
<<~HTML
<html><head></head>
<html xmlns:epub="http://www.idpf.org/2007/ops" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" lang="en" xml:lang="en"><head></head>
<body>
#{xml}
</body></html>
Expand All @@ -47,7 +47,7 @@ def mock_plurimath_error
Content-Type: text/html; charset="utf-8"
<?xml version="1.0"?>
<html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head>
<html xmlns:epub="http://www.idpf.org/2007/ops" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40" lang="en" xml:lang="en"><head>
<xml>
<w:WordDocument>
<w:View>Print</w:View>
Expand Down Expand Up @@ -476,16 +476,16 @@ def image_clean(xml)
</mstyle>
</mstyle>
</math></div>]))
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
.to match_fuzzy(<<~OUTPUT)
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
#{word_body(%{
<div><m:oMathPara>
<m:oMath>
<m:r><m:rPr><m:sty m:val="b"></m:sty></m:rPr><m:r><m:t><span style="font-style:normal;font-weight:bold;">&#x2212;</span></m:t></m:r><m:nary><m:naryPr><m:chr m:val="&#x222B;"></m:chr><m:limLoc m:val="subSup"></m:limLoc><m:subHide m:val="0"></m:subHide><m:supHide m:val="0"></m:supHide></m:naryPr><m:sub><m:r><m:t><span style="font-style:normal;font-weight:bold;">log</span></m:t></m:r></m:sub><m:sup><m:r><m:t><span style="font-style:normal;font-weight:bold;">2</span></m:t></m:r></m:sup><m:e><m:d><m:dPr><m:begChr m:val="("></m:begChr><m:endChr m:val=")"></m:endChr><m:ctrlPr><w:rPr><w:rFonts w:ascii="Cambria Math" w:hAnsi="Cambria Math"></w:rFonts><w:i></w:i></w:rPr></m:ctrlPr></m:dPr><m:e><m:sSub><m:sSubPr><m:ctrlPr><w:rPr><w:rFonts w:ascii="Cambria Math" w:hAnsi="Cambria Math"></w:rFonts><w:i></w:i></w:rPr></m:ctrlPr></m:sSubPr><m:e><m:r><m:t><span style="font-style:normal;font-weight:bold;">p</span></m:t></m:r></m:e><m:sub><m:r><m:t><span style="font-style:normal;font-weight:bold;">u</span></m:t></m:r></m:sub></m:sSub></m:e></m:d></m:e></m:nary></m:r><m:r><m:rPr><m:sty m:val="b"></m:sty></m:rPr><m:t><span style="font-style:normal;font-weight:bold;">BB</span></m:t></m:r><m:r><m:rPr><m:scr m:val="double-struck"></m:scr></m:rPr><m:t>&#x1D539;&#x1D539;&#x1D539;</m:t></m:r><m:r><m:rPr><m:scr m:val="script"></m:scr><m:sty m:val="p"></m:sty></m:rPr><m:t>&#x1D49E;&#x1D49E;</m:t></m:r><m:r><m:rPr><m:scr m:val="script"></m:scr><m:sty m:val="b"></m:sty></m:rPr><m:t>&#x1D4D1;&#x1D4D2;&#x1D4D2;</m:t></m:r><m:r><m:rPr><m:scr m:val="monospace"></m:scr></m:rPr><m:t>&#x1D683;&#x1D683;</m:t></m:r><m:r><m:rPr><m:scr m:val="fraktur"></m:scr><m:sty m:val="p"></m:sty></m:rPr><m:t>&#x1D509;&#x211C;</m:t></m:r><m:r><m:rPr><m:scr m:val="fraktur"></m:scr><m:sty m:val="b"></m:sty></m:rPr><m:t>&#x1D56D;&#x1D571;&#x1D57D;</m:t></m:r><m:r><m:rPr><m:scr m:val="sans-serif"></m:scr><m:sty m:val="p"></m:sty></m:rPr><m:t>&#x1D5B2;&#x1D5A5;</m:t></m:r><m:r><m:rPr><m:scr m:val="sans-serif"></m:scr><m:sty 
m:val="b"></m:sty></m:rPr><m:t>&#x1D5D5;&#x1D5E6;&#x1D5D9;&#x1D770;</m:t></m:r><m:r><m:rPr><m:scr m:val="sans-serif"></m:scr><m:sty m:val="i"></m:sty></m:rPr><m:t>&#x1D61A;&#x1D60D;&#x1D610;</m:t></m:r><m:r><m:rPr><m:scr m:val="sans-serif"></m:scr><m:sty m:val="bi"></m:sty></m:rPr><m:t>&#x1D64E;&#x1D641;&#x1D63D;&#x1D644;&#x1D7AA;</m:t></m:r><m:r><m:rPr><m:sty m:val="bi"></m:sty></m:rPr><m:t><span class="nostem" style="font-weight:bold;"><em></em>BII</span></m:t></m:r><m:r><m:rPr><m:sty m:val="i"></m:sty></m:rPr><m:t><span class="nostem"><em></em>II</span></m:t></m:r>
</m:oMath>
</m:oMathPara></div>}, '<div style="mso-element:footnote-list"/>')}
#{WORD_FTR1}
doc = File.read("test.doc", encoding: "utf-8")
.sub(%r{^.*<m:oMathPara>}m, "<m:oMathPara>")
.sub(%r{</m:oMathPara>.*$}m, "</m:oMathPara>")
expect(doc)
.to be_equivalent_to(<<~OUTPUT)
<m:oMathPara>
<m:oMath>
<m:r><m:rPr><m:sty m:val="b"></m:sty></m:rPr><m:r><m:t><span style="font-style:normal;font-weight:bold;">&#x2212;</span></m:t></m:r><m:nary><m:naryPr><m:chr m:val="&#x222B;"></m:chr><m:limLoc m:val="subSup"></m:limLoc><m:subHide m:val="0"></m:subHide><m:supHide m:val="0"></m:supHide></m:naryPr><m:sub><m:r><m:t><span style="font-style:normal;font-weight:bold;">log</span></m:t></m:r></m:sub><m:sup><m:r><m:t><span style="font-style:normal;font-weight:bold;">2</span></m:t></m:r></m:sup><m:e><m:d><m:dPr><m:begChr m:val="("></m:begChr><m:sepChr m:val=""></m:sepChr><m:endChr m:val=")"></m:endChr></m:dPr><m:e><m:sSub><m:sSubPr><m:ctrlPr><w:rPr><w:rFonts w:ascii="Cambria Math" w:hAnsi="Cambria Math"></w:rFonts><w:i></w:i></w:rPr></m:ctrlPr></m:sSubPr><m:e><m:r><m:t><span style="font-style:normal;font-weight:bold;">p</span></m:t></m:r></m:e><m:sub><m:r><m:t><span style="font-style:normal;font-weight:bold;">u</span></m:t></m:r></m:sub></m:sSub></m:e></m:d></m:e></m:nary></m:r><m:r><m:rPr><m:sty m:val="b"></m:sty></m:rPr><m:t><span style="font-style:normal;font-weight:bold;">BB</span></m:t></m:r><m:r><m:rPr><m:scr m:val="double-struck"></m:scr></m:rPr><m:t>&#x1D539;&#x1D539;&#x1D539;</m:t></m:r><m:r><m:rPr><m:scr m:val="script"></m:scr><m:sty m:val="p"></m:sty></m:rPr><m:t>&#x1D49E;&#x1D49E;</m:t></m:r><m:r><m:rPr><m:scr m:val="script"></m:scr><m:sty m:val="b"></m:sty></m:rPr><m:t>&#x1D4D1;&#x1D4D2;&#x1D4D2;</m:t></m:r><m:r><m:rPr><m:scr m:val="monospace"></m:scr></m:rPr><m:t>&#x1D683;&#x1D683;</m:t></m:r><m:r><m:rPr><m:scr m:val="fraktur"></m:scr><m:sty m:val="p"></m:sty></m:rPr><m:t>&#x1D509;&#x211C;</m:t></m:r><m:r><m:rPr><m:scr m:val="fraktur"></m:scr><m:sty m:val="b"></m:sty></m:rPr><m:t>&#x1D56D;&#x1D571;&#x1D57D;</m:t></m:r><m:r><m:rPr><m:scr m:val="sans-serif"></m:scr><m:sty m:val="p"></m:sty></m:rPr><m:t>&#x1D5B2;&#x1D5A5;</m:t></m:r><m:r><m:rPr><m:scr m:val="sans-serif"></m:scr><m:sty 
m:val="b"></m:sty></m:rPr><m:t>&#x1D5D5;&#x1D5E6;&#x1D5D9;&#x1D770;</m:t></m:r><m:r><m:rPr><m:scr m:val="sans-serif"></m:scr><m:sty m:val="i"></m:sty></m:rPr><m:t>&#x1D61A;&#x1D60D;&#x1D610;</m:t></m:r><m:r><m:rPr><m:scr m:val="sans-serif"></m:scr><m:sty m:val="bi"></m:sty></m:rPr><m:t>&#x1D64E;&#x1D641;&#x1D63D;&#x1D644;&#x1D7AA;</m:t></m:r><m:r><m:rPr><m:sty m:val="bi"></m:sty></m:rPr><m:t><span class="nostem" style="font-weight:bold;"><em></em>BII</span></m:t></m:r><m:r><m:rPr><m:sty m:val="i"></m:sty></m:rPr><m:t><span class="nostem"><em></em>II</span></m:t></m:r>
</m:oMath>
</m:oMathPara>
OUTPUT
end

Expand Down
4 changes: 3 additions & 1 deletion spec/spec_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@
end

require "bundler/setup"
require 'rspec/match_fuzzy'
require "rspec/match_fuzzy"
require "html2doc"
require "rspec/matchers"
require "equivalent-xml"

RSpec.configure do |config|
# Enable flags like --only-failures and --next-failure
Expand Down

0 comments on commit 40e5db4

Please sign in to comment.