From 72f09789cc927b6e9b035826e355f49316a7c939 Mon Sep 17 00:00:00 2001 From: "Ben W. Brumfield" Date: Thu, 28 Sep 2023 08:07:50 -0500 Subject: [PATCH] Added spreadsheet exports to PDFs for #3720 --- app/controllers/concerns/export_service.rb | 2 +- app/helpers/export_helper.rb | 31 ++++++++++-- app/models/page.rb | 40 +--------------- app/models/xml_source_processor.rb | 55 ++++++++++++++++++++++ 4 files changed, 83 insertions(+), 45 deletions(-) diff --git a/app/controllers/concerns/export_service.rb b/app/controllers/concerns/export_service.rb index ccb4b7f897..e86bcf3454 100644 --- a/app/controllers/concerns/export_service.rb +++ b/app/controllers/concerns/export_service.rb @@ -83,7 +83,7 @@ def export_printable(work, edition, format, preserve_lb, include_metadata, inclu # run pandoc against the temp directory log_file = File.join(temp_dir, "#{file_stub}.log") - cmd = "pandoc --from markdown+superscript -o #{output_file} #{md_file} --pdf-engine=xelatex --verbose --abbreviations=/dev/null > #{log_file} 2>&1" + cmd = "pandoc --from markdown+superscript+pipe_tables -o #{output_file} #{md_file} --pdf-engine=xelatex --verbose --abbreviations=/dev/null > #{log_file} 2>&1" puts cmd logger.info(cmd) system(cmd) diff --git a/app/helpers/export_helper.rb b/app/helpers/export_helper.rb index b115b97809..6ac6a04da9 100644 --- a/app/helpers/export_helper.rb +++ b/app/helpers/export_helper.rb @@ -1,5 +1,6 @@ module ExportHelper - + include XmlSourceProcessor + def xml_to_pandoc_md(xml_text, preserve_lb=true, flatten_links=false, collection=nil, div_pad=true) # do some escaping of the document for markdown @@ -25,11 +26,27 @@ def xml_to_pandoc_md(xml_text, preserve_lb=true, flatten_links=false, collection e.replace_with(sup) end - - postprocessed = "" doc.write(postprocessed) - html = xml_to_html(postprocessed, preserve_lb, flatten_links, collection) + + + # use Nokogiri for doc + markdown_tables = [] + doc = Nokogiri::XML(postprocessed) + doc.xpath("//table").each_with_index do |n,i| + markdown_tables << xml_table_to_markdown_table(n, true) + n.replace("REPLACEMETABLE#{i}") + end + + # now back to REXML + # html = xml_to_html(postprocessed, preserve_lb, flatten_links, collection) + # doc = REXML::Document.new("#{html}") + # doc.elements.each_with_index("//table") do |n,i| + # n.replace_with(REXML::Text.new(markdown_tables[i])) + # end + html = doc.to_s + # html=postprocessed + if div_pad doc = REXML::Document.new("
#{html}
") else @@ -46,13 +63,17 @@ def xml_to_pandoc_md(xml_text, preserve_lb=true, flatten_links=false, collection processed = "never ran" - cmd = "pandoc --from html --to markdown" + cmd = "pandoc --from html --to markdown+pipe_tables" Open3.popen2(cmd) do |stdin, stdout, t| stdin.print(html) stdin.close processed = stdout.read end + markdown_tables.each_with_index do |table,i| + processed.gsub!("REPLACEMETABLE#{i}", table) + end + return processed end diff --git a/app/models/page.rb b/app/models/page.rb index 94cc72180c..5f90712a86 100644 --- a/app/models/page.rb +++ b/app/models/page.rb @@ -579,45 +579,7 @@ def formatted_plaintext_doc(doc) end def formatted_plaintext_table(table_element) - text_table = "" - - # clean up in-cell line-breaks - table_element.xpath('//lb').each { |n| n.replace(' ')} - - # calculate the widths of each column based on max(header, cell[0...end]) - column_count = ([table_element.xpath("//th").count] + table_element.xpath('//tr').map{|e| e.xpath('td').count }).max - column_widths = {} - 1.upto(column_count) do |column_index| - longest_cell = (table_element.xpath("//tr/td[position()=#{column_index}]").map{|e| e.text().length}.max || 0) - corresponding_heading = heading_length = table_element.xpath("//th[position()=#{column_index}]").first - heading_length = corresponding_heading.nil? ? 0 : corresponding_heading.text().length - column_widths[column_index] = [longest_cell, heading_length].max - end - - # print the header as markdown - cell_strings = [] - table_element.xpath("//th").each_with_index do |e,i| - cell_strings << e.text.rjust(column_widths[i+1], ' ') - end - text_table << cell_strings.join(' | ') << "\n" - - # print the separator - text_table << column_count.times.map{|i| ''.rjust(column_widths[i+1], '-')}.join(' | ') << "\n" - - # print each row as markdown - table_element.xpath('//tr').each do |row_element| - text_table << row_element.xpath('td').map do |e| - width = 80 #default for hand-coded tables - index = e.path.match(/.*td\[(\d+)\]/) - if index - width = column_widths[index[1].to_i] || 80 - else - width = column_widths.values.first - end - e.text.rjust(width, ' ') - end.join(' | ') << "\n" - end - + text_table = xml_table_to_markdown_table(table_element) table_element.replace(text_table) end diff --git a/app/models/xml_source_processor.rb b/app/models/xml_source_processor.rb index 375a193acb..a46d5eefb6 100644 --- a/app/models/xml_source_processor.rb +++ b/app/models/xml_source_processor.rb @@ -479,6 +479,61 @@ def rename_link_in_text(text, title_regex, new_title) text end + + def pipe_tables_formatting(text) + # since Pandoc Pipe Tables extension requires pipe characters at the beginning and end of each line we must add them + # to the beginning and end of each line + text.split("\n").map{|line| "|#{line}|"}.join("\n") + end + + def xml_table_to_markdown_table(table_element, pandoc_format=false) + text_table = "" + + # clean up in-cell line-breaks + table_element.xpath('//lb').each { |n| n.replace(' ')} + + # calculate the widths of each column based on max(header, cell[0...end]) + column_count = ([table_element.xpath("//th").count] + table_element.xpath('//tr').map{|e| e.xpath('td').count }).max + column_widths = {} + 1.upto(column_count) do |column_index| + longest_cell = (table_element.xpath("//tr/td[position()=#{column_index}]").map{|e| e.text().length}.max || 0) + corresponding_heading = heading_length = table_element.xpath("//th[position()=#{column_index}]").first + heading_length = corresponding_heading.nil? ? 0 : corresponding_heading.text().length + column_widths[column_index] = [longest_cell, heading_length].max + end + + # print the header as markdown + cell_strings = [] + table_element.xpath("//th").each_with_index do |e,i| + cell_strings << e.text.rjust(column_widths[i+1], ' ') + end + text_table << cell_strings.join(' | ') << "\n" + + # print the separator + text_table << column_count.times.map{|i| ''.rjust(column_widths[i+1], '-')}.join(' | ') << "\n" + + # print each row as markdown + table_element.xpath('//tr').each do |row_element| + text_table << row_element.xpath('td').map do |e| + width = 80 #default for hand-coded tables + index = e.path.match(/.*td\[(\d+)\]/) + if index + width = column_widths[index[1].to_i] || 80 + else + width = column_widths.values.first + end + e.text.rjust(width, ' ') + end.join(' | ') << "\n" + end + if pandoc_format + text_table = pipe_tables_formatting(text_table) + end + + text_table + end + + + def debug(msg) logger.debug("DEBUG: #{msg}") end