Skip to content

Commit

Permalink
Merge pull request #3804 from benwbrum/3720-spreadsheets-in-pdf
Browse files Browse the repository at this point in the history
Added spreadsheet exports to PDFs for #3720
  • Loading branch information
benwbrum authored Sep 28, 2023
2 parents 458b743 + 72f0978 commit 44e8f9e
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 45 deletions.
2 changes: 1 addition & 1 deletion app/controllers/concerns/export_service.rb
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def export_printable(work, edition, format, preserve_lb, include_metadata, inclu

# run pandoc against the temp directory
log_file = File.join(temp_dir, "#{file_stub}.log")
cmd = "pandoc --from markdown+superscript -o #{output_file} #{md_file} --pdf-engine=xelatex --verbose --abbreviations=/dev/null > #{log_file} 2>&1"
cmd = "pandoc --from markdown+superscript+pipe_tables -o #{output_file} #{md_file} --pdf-engine=xelatex --verbose --abbreviations=/dev/null > #{log_file} 2>&1"
puts cmd
logger.info(cmd)
system(cmd)
Expand Down
31 changes: 26 additions & 5 deletions app/helpers/export_helper.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
module ExportHelper

include XmlSourceProcessor

def xml_to_pandoc_md(xml_text, preserve_lb=true, flatten_links=false, collection=nil, div_pad=true)

# do some escaping of the document for markdown
Expand All @@ -25,11 +26,27 @@ def xml_to_pandoc_md(xml_text, preserve_lb=true, flatten_links=false, collection
e.replace_with(sup)
end



postprocessed = ""
doc.write(postprocessed)
html = xml_to_html(postprocessed, preserve_lb, flatten_links, collection)


# use Nokogiri for doc
markdown_tables = []
doc = Nokogiri::XML(postprocessed)
doc.xpath("//table").each_with_index do |n,i|
markdown_tables << xml_table_to_markdown_table(n, true)
n.replace("REPLACEMETABLE#{i}")
end

# now back to REXML
# html = xml_to_html(postprocessed, preserve_lb, flatten_links, collection)
# doc = REXML::Document.new("<html>#{html}</html>")
# doc.elements.each_with_index("//table") do |n,i|
# n.replace_with(REXML::Text.new(markdown_tables[i]))
# end
html = doc.to_s
# html=postprocessed

if div_pad
doc = REXML::Document.new("<div>#{html}</div>")
else
Expand All @@ -46,13 +63,17 @@ def xml_to_pandoc_md(xml_text, preserve_lb=true, flatten_links=false, collection

processed = "never ran"

cmd = "pandoc --from html --to markdown"
cmd = "pandoc --from html --to markdown+pipe_tables"
Open3.popen2(cmd) do |stdin, stdout, t|
stdin.print(html)
stdin.close
processed = stdout.read
end

markdown_tables.each_with_index do |table,i|
processed.gsub!("REPLACEMETABLE#{i}", table)
end

return processed
end

Expand Down
40 changes: 1 addition & 39 deletions app/models/page.rb
Original file line number Diff line number Diff line change
Expand Up @@ -579,45 +579,7 @@ def formatted_plaintext_doc(doc)
end

# Replaces table_element in its document with a plain-text rendering of the table.
#
# NOTE(review): this span appears to be a rendered diff in which the removed inline
# implementation and its one-line replacement are shown together. Read as code, the
# text_table string built below is overwritten by the assignment from
# xml_table_to_markdown_table near the end, so only that final value is used.
def formatted_plaintext_table(table_element)
text_table = ""

# clean up in-cell line-breaks
# NOTE(review): '//lb' is an absolute XPath; presumably it matches <lb> elements
# anywhere in the document, not only inside this table -- confirm.
table_element.xpath('//lb').each { |n| n.replace(' ')}

# calculate the widths of each column based on max(header, cell[0...end])
column_count = ([table_element.xpath("//th").count] + table_element.xpath('//tr').map{|e| e.xpath('td').count }).max
column_widths = {}
1.upto(column_count) do |column_index|
longest_cell = (table_element.xpath("//tr/td[position()=#{column_index}]").map{|e| e.text().length}.max || 0)
corresponding_heading = heading_length = table_element.xpath("//th[position()=#{column_index}]").first
heading_length = corresponding_heading.nil? ? 0 : corresponding_heading.text().length
column_widths[column_index] = [longest_cell, heading_length].max
end

# print the header as markdown
cell_strings = []
table_element.xpath("//th").each_with_index do |e,i|
cell_strings << e.text.rjust(column_widths[i+1], ' ')
end
text_table << cell_strings.join(' | ') << "\n"

# print the separator
text_table << column_count.times.map{|i| ''.rjust(column_widths[i+1], '-')}.join(' | ') << "\n"

# print each row as markdown
table_element.xpath('//tr').each do |row_element|
text_table << row_element.xpath('td').map do |e|
width = 80 #default for hand-coded tables
# the cell's column number is read from the trailing td[N] index of its node path
index = e.path.match(/.*td\[(\d+)\]/)
if index
width = column_widths[index[1].to_i] || 80
else
# no td[N] index in the path: fall back to the first column's width
width = column_widths.values.first
end
e.text.rjust(width, ' ')
end.join(' | ') << "\n"
end

# discard the string built above and use the shared helper's output instead
text_table = xml_table_to_markdown_table(table_element)
# swap the table node for its text rendering
table_element.replace(text_table)
end

Expand Down
55 changes: 55 additions & 0 deletions app/models/xml_source_processor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,61 @@ def rename_link_in_text(text, title_regex, new_title)
text
end


# Wraps every line of +text+ in a leading and a trailing pipe character so that
# each table row is explicitly delimited for Pandoc's pipe_tables extension.
#
# text - String holding one table row per line.
#
# Returns a String with the rows joined by "\n" (no trailing newline).
def pipe_tables_formatting(text)
  rows = text.split("\n")
  bordered_rows = rows.map { |row| '|' + row + '|' }
  bordered_rows.join("\n")
end

# Converts a table node into a plain-text, markdown-style table.
#
# The <th> cells form the first line, followed by a dashed separator line and
# then one line per <tr>. Cells are right-justified to the widest cell of their
# column and joined with ' | '. In-cell <lb> elements are replaced with a space
# in place, so table_element itself is modified.
#
# Every XPath lookup is relative (".//") so that only this table's own cells
# are read and only its own <lb> elements are touched; an absolute "//"
# expression evaluated on a node searches the whole document, which mixes in
# cells from other tables.
#
# table_element - node exposing the xpath/text/path/replace API used below
#                 (a Nokogiri element in the callers visible in this commit).
# pandoc_format - when true, the result is passed through pipe_tables_formatting.
#
# Returns the table as a String.
def xml_table_to_markdown_table(table_element, pandoc_format=false)
  text_table = ""

  # clean up in-cell line-breaks (scoped to this table only)
  table_element.xpath('.//lb').each { |n| n.replace(' ') }

  header_cells = table_element.xpath('.//th')
  rows = table_element.xpath('.//tr')

  # calculate the widths of each column based on max(header, cell[0...end])
  column_count = ([header_cells.count] + rows.map { |e| e.xpath('td').count }).max
  column_widths = {}
  1.upto(column_count) do |column_index|
    longest_cell = (table_element.xpath(".//tr/td[position()=#{column_index}]").map { |e| e.text.length }.max || 0)
    corresponding_heading = table_element.xpath(".//th[position()=#{column_index}]").first
    heading_length = corresponding_heading.nil? ? 0 : corresponding_heading.text.length
    column_widths[column_index] = [longest_cell, heading_length].max
  end

  # print the header as markdown
  cell_strings = []
  header_cells.each_with_index do |e, i|
    cell_strings << e.text.rjust(column_widths[i + 1], ' ')
  end
  text_table << cell_strings.join(' | ') << "\n"

  # print the separator
  text_table << column_count.times.map { |i| ''.rjust(column_widths[i + 1], '-') }.join(' | ') << "\n"

  # print each row as markdown
  rows.each do |row_element|
    text_table << row_element.xpath('td').map do |e|
      # the column number comes from the trailing td[N] index of the node path;
      # a path without an index falls back to the first column's width
      index = e.path.match(/.*td\[(\d+)\]/)
      width = if index
        column_widths[index[1].to_i] || 80 # 80 is the default for hand-coded tables
      else
        column_widths.values.first
      end
      e.text.rjust(width, ' ')
    end.join(' | ') << "\n"
  end

  text_table = pipe_tables_formatting(text_table) if pandoc_format

  text_table
end



# Writes +msg+ to the logger at debug level, prefixed with "DEBUG: ".
#
# msg - object to log; it is interpolated into the log line.
def debug(msg)
  line = "DEBUG: #{msg}"
  logger.debug(line)
end
Expand Down

0 comments on commit 44e8f9e

Please sign in to comment.