Add test cases and modify the extractors to return the paths of the files they generate

Cleaning up code. Flatten the return value. Make it not add the path to the return value if an exception-worthy event occurred; instead, merely raise that exception. Make text_extractor also return paths to processed files. Make function extract_images always return an array of image paths. Refine specs. Fix tests. Add nil check. Refactor tests to better isolate functionality. Remove debugger. Remove logger. Add printf debugging. Sanity checking. Printfs. Remove puts. Remove annoying line. Cleanup. Fix unnecessary usage of the ternary operation to 'wrap' an Array, replacing it with Array() as it is more idiomatic. Revert to original.
documentcloud · Apr 5, 2016 · 9789dd5 · 9789dd5
1 parent 3d630b3
commit 9789dd5
Show file tree

Hide file tree

Showing 5 changed files with 66 additions and 20 deletions.
diff --git a/lib/docsplit/command_line.rb b/lib/docsplit/command_line.rb
@@ -116,4 +116,4 @@ def parse_options
 
   end
 
-end
+end
diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
@@ -13,13 +13,15 @@ class ImageExtractor
     def extract(pdfs, options)
       @pdfs = [pdfs].flatten
       extract_options(options)
+      images = []
       @pdfs.each do |pdf|
         previous = nil
         @sizes.each_with_index do |size, i|
-          @formats.each {|format| convert(pdf, size, format, previous) }
+          images += @formats.map {|format| convert(pdf, size, format, previous) }
           previous = size if @rolling
         end
       end
+       return images.reject{|i| i.nil? or i.empty?}.flatten
     end
 
     # Convert a single PDF into page images at the specified size and format.
@@ -32,20 +34,28 @@ def convert(pdf, size, format, previous=nil)
       basename  = File.basename(pdf, File.extname(pdf))
       directory = directory_for(size)
       pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
-      escaped_pdf = ESCAPE[pdf]
+      escaped_pdf =  ESCAPE[pdf]
       FileUtils.mkdir_p(directory) unless File.exists?(directory)
       common    = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
+      image_paths = []
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
         result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
-        raise ExtractionFailed, result if $? != 0
+        if $? != 0
+        raise ExtractionFailed, result 
+        end
       else
         page_list(pages).each do |page|
           out_file  = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
           cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
           result = `#{cmd}`.chomp
-          raise ExtractionFailed, result if $? != 0
+          if $? != 0
+          raise ExtractionFailed, result 
+          else
+            image_paths << out_file
+          end
         end
+        return image_paths
       end
     ensure
       FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)

diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
@@ -29,18 +29,23 @@ def initialize
     def extract(pdfs, opts)
       extract_options opts
       FileUtils.mkdir_p @output unless File.exists?(@output)
+      pdfs = Array(pdfs)
+      paths = []
       [pdfs].flatten.each do |pdf|
         @pdf_name = File.basename(pdf, File.extname(pdf))
         pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
-        if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
-          extract_from_ocr(pdf, pages)
-        else
-          extract_from_pdf(pdf, pages)
-          if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
-            extract_from_ocr(pdf, @pages_to_ocr)
-          end
-        end
+        return_value = if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
+                         extract_from_ocr(pdf, pages)
+                       else
+                         if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
+                           extract_from_ocr(pdf, @pages_to_ocr)
+                         else
+                           extract_from_pdf(pdf, pages)
+                         end
+                       end
+        paths << return_value
       end
+      return paths.flatten.compact
     end
 
     # Does a PDF have any text embedded?
@@ -52,31 +57,37 @@ def contains_text?(pdf)
     # Extract a page range worth of text from a PDF, directly.
     def extract_from_pdf(pdf, pages)
       return extract_full(pdf) unless pages
-      pages.each {|page| extract_page(pdf, page) }
+      pages.map {|page| extract_page(pdf, page) }
     end
 
     # Extract a page range worth of text from a PDF via OCR.
     def extract_from_ocr(pdf, pages)
       tempdir = Dir.mktmpdir
       base_path = File.join(@output, @pdf_name)
       escaped_pdf = ESCAPE[pdf]
+      paths = []
       if pages
         pages.each do |page|
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
           escaped_tiff = ESCAPE[tiff]
           file = "#{base_path}_#{page}"
           run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
           run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
-          clean_text(file + '.txt') if @clean_ocr
+          file_name = file + '.txt'
+          paths << file_name
+          clean_text(file_name) if @clean_ocr
           FileUtils.remove_entry_secure tiff
         end
       else
         tiff = "#{tempdir}/#{@pdf_name}.tif"
         escaped_tiff = ESCAPE[tiff]
         run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
         run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1"
-        clean_text(base_path + '.txt') if @clean_ocr
+        file_name = base_path + '.txt'
+        paths << file_name
+        clean_text(file_name) if @clean_ocr
       end
+      return paths
     ensure
       FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
     end
@@ -104,16 +115,19 @@ def run(command)
     def extract_full(pdf)
       text_path = File.join(@output, "#{@pdf_name}.txt")
       run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+      return text_path
     end
 
     # Extract the contents of a single page of text, directly, adding it to
     # the `@pages_to_ocr` list if the text length is inadequate.
     def extract_page(pdf, page)
       text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
       run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+
       unless @forbid_ocr
         @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
       end
+      return text_path
     end
 
     def extract_options(options)
@@ -127,4 +141,4 @@ def extract_options(options)
 
   end
 
-end
+end
diff --git a/test/unit/test_extract_images.rb b/test/unit/test_extract_images.rb
@@ -13,6 +13,19 @@ def test_image_formatting
     assert Dir["#{OUTPUT}/*.jpg"].length == 2
   end
 
+  def test_return_value
+    return_value = Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "50x", :pages => 2, :output => OUTPUT)
+    assert return_value.length == 1
+    assert return_value.is_a?(Enumerable)
+    assert return_value.all?{|el| el =~ /\.gif/}
+    return_value = Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => [:jpg, :gif], :size => "50x", :pages => 2, :output => OUTPUT)
+    assert return_value.length == 2
+    assert return_value.is_a?(Enumerable)
+    assert return_value.any?{|el| el =~ /\.gif/}
+    assert return_value.any?{|el| el =~ /\.jpg/}
+  end
+
+
   def test_page_ranges
     Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "50x", :pages => 2, :output => OUTPUT)
     assert Dir["#{OUTPUT}/*.gif"] == ["#{OUTPUT}/obama_arts_2.gif"]

diff --git a/test/unit/test_extract_text.rb b/test/unit/test_extract_text.rb
@@ -4,9 +4,12 @@
 class ExtractTextTest < Test::Unit::TestCase
 
   def test_paged_extraction
-    Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 'all', :output => OUTPUT)
+    return_value = Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 'all', :output => OUTPUT)
     assert Dir["#{OUTPUT}/*.txt"].length == 2
     assert File.read("#{OUTPUT}/obama_arts_1.txt").match("Paid for by Obama for America")
+    assert return_value.is_a?(Enumerable)
+    assert return_value.all?{|val| val =~ /\.txt/}
+    assert return_value.length == 2
   end
 
   def test_page_only_extraction
@@ -24,19 +27,25 @@ def test_capitalized_pdf_extraction
   end
 
   def test_unicode_extraction
-    Docsplit.extract_text('test/fixtures/unicode.pdf', :pages => 'all', :output => OUTPUT)
+    return_value = Docsplit.extract_text('test/fixtures/unicode.pdf', :pages => 'all', :output => OUTPUT)
     assert Dir["#{OUTPUT}/*.txt"].length == 3
+    assert return_value.is_a?(Enumerable)
+    assert return_value.all?{|val| val =~ /\.txt/}
+    assert return_value.length == 3
   end
 
   def test_ocr_extraction
-    Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT)
+    return_value =Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT)
     4.times do |i|
       file = "corrosion_#{i + 1}.txt"
       assert_directory_contains(OUTPUT, file)
       assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that file with extracted text should have reasonable size"
     end
+    assert return_value.is_a?(Enumerable)
+    assert return_value.all?(/\.txt/)
   end
 
+
   def test_ocr_extraction_in_mock_language
     exception = assert_raise(Docsplit::ExtractionFailed) {Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :language => "mock")}
     assert exception.message.match("tessdata/mock"), "Expected problem with loading data for language 'mock'"