Merge pull request #3804 from benwbrum/3720-spreadsheets-in-pdf

Added spreadsheet exports to PDFs for #3720
benwbrum · Sep 28, 2023 · 44e8f9e · 44e8f9e
2 parents 458b743 + 72f0978
commit 44e8f9e
Show file tree

Hide file tree

Showing 4 changed files with 83 additions and 45 deletions.
diff --git a/app/controllers/concerns/export_service.rb b/app/controllers/concerns/export_service.rb
@@ -83,7 +83,7 @@ def export_printable(work, edition, format, preserve_lb, include_metadata, inclu
 
     # run pandoc against the temp directory
     log_file = File.join(temp_dir, "#{file_stub}.log")
-    cmd = "pandoc --from markdown+superscript -o #{output_file} #{md_file} --pdf-engine=xelatex --verbose --abbreviations=/dev/null > #{log_file} 2>&1"
+    cmd = "pandoc --from markdown+superscript+pipe_tables -o #{output_file} #{md_file} --pdf-engine=xelatex --verbose --abbreviations=/dev/null > #{log_file} 2>&1"
     puts cmd
     logger.info(cmd)
     system(cmd)

diff --git a/app/helpers/export_helper.rb b/app/helpers/export_helper.rb
@@ -1,5 +1,6 @@
 module ExportHelper
-
+  include XmlSourceProcessor
+
   def xml_to_pandoc_md(xml_text, preserve_lb=true, flatten_links=false, collection=nil, div_pad=true)
 
     # do some escaping of the document for markdown
@@ -25,11 +26,27 @@ def xml_to_pandoc_md(xml_text, preserve_lb=true, flatten_links=false, collection
       e.replace_with(sup)
     end
 
-
-
     postprocessed = ""
     doc.write(postprocessed)
-    html = xml_to_html(postprocessed, preserve_lb, flatten_links, collection)
+
+
+    # re-parse with Nokogiri so each <table> can be rendered to markdown and swapped for a placeholder
+    markdown_tables = []
+    doc = Nokogiri::XML(postprocessed)
+    doc.xpath("//table").each_with_index do |n,i|
+      markdown_tables << xml_table_to_markdown_table(n, true)
+      n.replace("REPLACEMETABLE#{i}")
+    end
+
+    # now back to REXML
+    # html = xml_to_html(postprocessed, preserve_lb, flatten_links, collection)
+    # doc = REXML::Document.new("<html>#{html}</html>")
+    # doc.elements.each_with_index("//table") do |n,i|
+    #   n.replace_with(REXML::Text.new(markdown_tables[i]))
+    # end
+    html = doc.to_s
+    # html=postprocessed
+
     if div_pad
       doc = REXML::Document.new("<div>#{html}</div>")
     else
@@ -46,13 +63,17 @@ def xml_to_pandoc_md(xml_text, preserve_lb=true, flatten_links=false, collection
 
     processed = "never ran"
 
-    cmd = "pandoc --from html --to markdown"
+    cmd = "pandoc --from html --to markdown+pipe_tables"
     Open3.popen2(cmd) do |stdin, stdout, t| 
       stdin.print(html)
       stdin.close
       processed = stdout.read
     end
 
+    markdown_tables.each_with_index do |table,i|
+      processed.gsub!("REPLACEMETABLE#{i}", table)
+    end
+
     return processed
   end
 

diff --git a/app/models/page.rb b/app/models/page.rb
@@ -579,45 +579,7 @@ def formatted_plaintext_doc(doc)
   end
 
   def formatted_plaintext_table(table_element)
-    text_table = ""
-
-    # clean up in-cell line-breaks
-    table_element.xpath('//lb').each { |n| n.replace(' ')}
-
-    # calculate the widths of each column based on max(header, cell[0...end])
-    column_count = ([table_element.xpath("//th").count] + table_element.xpath('//tr').map{|e| e.xpath('td').count }).max
-    column_widths = {}
-    1.upto(column_count) do |column_index|
-      longest_cell = (table_element.xpath("//tr/td[position()=#{column_index}]").map{|e| e.text().length}.max || 0)
-      corresponding_heading = heading_length = table_element.xpath("//th[position()=#{column_index}]").first
-      heading_length = corresponding_heading.nil? ? 0 : corresponding_heading.text().length
-      column_widths[column_index] = [longest_cell, heading_length].max
-    end
-
-    # print the header as markdown
-    cell_strings = []
-    table_element.xpath("//th").each_with_index do |e,i|
-      cell_strings << e.text.rjust(column_widths[i+1], ' ')
-    end
-    text_table << cell_strings.join(' | ') << "\n"
-
-    # print the separator
-    text_table << column_count.times.map{|i| ''.rjust(column_widths[i+1], '-')}.join(' | ') << "\n"
-
-    # print each row as markdown
-    table_element.xpath('//tr').each do |row_element|
-      text_table << row_element.xpath('td').map do |e|
-        width = 80 #default for hand-coded tables
-        index = e.path.match(/.*td\[(\d+)\]/)
-        if index
-          width = column_widths[index[1].to_i] || 80 
-        else
-          width = column_widths.values.first
-        end
-        e.text.rjust(width, ' ') 
-      end.join(' | ') << "\n"
-    end
-
+    text_table = xml_table_to_markdown_table(table_element)
     table_element.replace(text_table)
   end
 

diff --git a/app/models/xml_source_processor.rb b/app/models/xml_source_processor.rb
@@ -479,6 +479,61 @@ def rename_link_in_text(text, title_regex, new_title)
     text
   end
 
+
+  def pipe_tables_formatting(text)
+    # Wrap every line of the table in a leading and a trailing pipe character, the form this code
+    # hands to Pandoc's pipe_tables extension (lines are re-joined with "\n", no trailing newline).
+    text.split("\n").map{|line| "|#{line}|"}.join("\n")
+  end
+
+  def xml_table_to_markdown_table(table_element, pandoc_format=false)
+    text_table = ""
+
+    # clean up in-cell line-breaks
+    table_element.xpath('//lb').each { |n| n.replace(' ')}
+
+    # calculate the widths of each column based on max(header, cell[0...end])
+    column_count = ([table_element.xpath("//th").count] + table_element.xpath('//tr').map{|e| e.xpath('td').count }).max
+    column_widths = {}
+    1.upto(column_count) do |column_index|
+      longest_cell = (table_element.xpath("//tr/td[position()=#{column_index}]").map{|e| e.text().length}.max || 0)
+      corresponding_heading = heading_length = table_element.xpath("//th[position()=#{column_index}]").first
+      heading_length = corresponding_heading.nil? ? 0 : corresponding_heading.text().length
+      column_widths[column_index] = [longest_cell, heading_length].max
+    end
+
+    # print the header as markdown
+    cell_strings = []
+    table_element.xpath("//th").each_with_index do |e,i|
+      cell_strings << e.text.rjust(column_widths[i+1], ' ')
+    end
+    text_table << cell_strings.join(' | ') << "\n"
+
+    # print the separator
+    text_table << column_count.times.map{|i| ''.rjust(column_widths[i+1], '-')}.join(' | ') << "\n"
+
+    # print each row as markdown
+    table_element.xpath('//tr').each do |row_element|
+      text_table << row_element.xpath('td').map do |e|
+        width = 80 #default for hand-coded tables
+        index = e.path.match(/.*td\[(\d+)\]/)
+        if index
+          width = column_widths[index[1].to_i] || 80 
+        else
+          width = column_widths.values.first
+        end
+        e.text.rjust(width, ' ') 
+      end.join(' | ') << "\n"
+    end
+    if pandoc_format
+      text_table = pipe_tables_formatting(text_table)
+    end
+
+    text_table
+  end
+
+
+
   def debug(msg)
     logger.debug("DEBUG: #{msg}")
   end