Skip to content

Commit

Permalink
Add test cases and make modification
Browse files Browse the repository at this point in the history
Cleaning up code

Flatten the return value

Do not add the path to the return value if an exception-worthy
event occurred; instead, simply raise that exception

Make text_extractor also return paths to processed files

Make function extract_images always return array of image paths

Refine specs

Fix tests

Add nil check

Refactor tests to better isolate functionality

remove debugger

remove logger

Add printf debugging

Sanity checking

Printfs

Remove puts

Remove annoying line

Cleanup

Replace the unnecessary ternary operation used to 'wrap' an Array
with Array(), as it is more idiomatic

revert to original
  • Loading branch information
hderms authored and Dermot Haughey committed Apr 5, 2016
1 parent 3d630b3 commit 9789dd5
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 20 deletions.
2 changes: 1 addition & 1 deletion lib/docsplit/command_line.rb
Original file line number Diff line number Diff line change
Expand Up @@ -116,4 +116,4 @@ def parse_options

end

end
end
18 changes: 14 additions & 4 deletions lib/docsplit/image_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@ class ImageExtractor
# Render every given PDF at each configured size and format, and report the
# image files that were produced.
#
# pdfs    - a single PDF path, or a (possibly nested) array of paths.
# options - handed to extract_options; this method afterwards reads @sizes,
#           @formats and @rolling (presumably populated there -- confirm).
#
# Each pdf/size/format combination is converted exactly once. Whatever
# `convert` returns is normalised with Array(), so a nil result (which is what
# the rolling branch of `convert` yields) simply contributes no paths.
#
# Returns a flat Array of image paths; errors raised by `convert` propagate.
def extract(pdfs, options)
  @pdfs = [pdfs].flatten
  extract_options(options)
  @pdfs.flat_map do |pdf|
    previous = nil
    @sizes.flat_map do |size|
      paths = @formats.flat_map {|format| Array(convert(pdf, size, format, previous)) }
      # When rolling, the next size is derived from the images just written.
      previous = size if @rolling
      paths
    end
  end
end

# Convert a single PDF into page images at the specified size and format.
Expand All @@ -32,20 +34,28 @@ def convert(pdf, size, format, previous=nil)
basename = File.basename(pdf, File.extname(pdf))
directory = directory_for(size)
pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
escaped_pdf = ESCAPE[pdf]
escaped_pdf = ESCAPE[pdf]
FileUtils.mkdir_p(directory) unless File.exists?(directory)
common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
image_paths = []
if previous
FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
raise ExtractionFailed, result if $? != 0
if $? != 0
raise ExtractionFailed, result
end
else
page_list(pages).each do |page|
out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
result = `#{cmd}`.chomp
raise ExtractionFailed, result if $? != 0
if $? != 0
raise ExtractionFailed, result
else
image_paths << out_file
end
end
return image_paths
end
ensure
FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
Expand Down
38 changes: 26 additions & 12 deletions lib/docsplit/text_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,23 @@ def initialize
# Extract text from one or more PDFs and report the text files written.
#
# pdfs - a single PDF path, or a (possibly nested) array of paths.
# opts - handed to extract_options; this method afterwards reads @output,
#        @pages, @force_ocr, @forbid_ocr and @pages_to_ocr (presumably
#        populated there -- confirm).
#
# For each PDF, either the whole page range is OCR'd (forced, or the PDF has
# no embedded text), or text is pulled directly and then only the pages that
# extract_page queued in @pages_to_ocr are OCR'd. The direct pass has to run
# before @pages_to_ocr is inspected, because that pass is what fills it.
#
# Returns a flat Array of text file paths without nils or duplicates (an
# OCR'd page is written to the same path as its direct extraction).
def extract(pdfs, opts)
  extract_options opts
  # mkdir_p is a no-op for an existing directory, so no existence check.
  FileUtils.mkdir_p @output
  paths = [pdfs].flatten.flat_map do |pdf|
    @pdf_name = File.basename(pdf, File.extname(pdf))
    pages = (@pages == 'all') ? (1..Docsplit.extract_length(pdf)) : @pages
    if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
      Array(extract_from_ocr(pdf, pages))
    else
      extracted = Array(extract_from_pdf(pdf, pages))
      if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
        extracted.concat(Array(extract_from_ocr(pdf, @pages_to_ocr)))
      end
      extracted
    end
  end
  paths.compact.uniq
end

# Does a PDF have any text embedded?
Expand All @@ -52,31 +57,37 @@ def contains_text?(pdf)
# Extract a page range worth of text from a PDF, directly.
#
# pdf   - path of the PDF to read.
# pages - an enumerable of page numbers, or nil/false for the whole document.
#
# Every page is extracted exactly once. Returns the single path produced by
# extract_full when no pages are given, otherwise an Array holding the result
# of extract_page for each page, in order.
def extract_from_pdf(pdf, pages)
  return extract_full(pdf) unless pages
  pages.map {|page| extract_page(pdf, page) }
end

# Extract a page range worth of text from a PDF via OCR.
#
# pdf   - path of the PDF to read.
# pages - an enumerable of page numbers, or nil/false to OCR the whole
#         document into a single text file.
#
# Each page (or the whole document) is rasterised to a TIFF inside a fresh
# temporary directory with `gm convert`, then handed to `tesseract`, which is
# given the output path without an extension; the resulting file is recorded
# here as that path plus '.txt'. When @clean_ocr is set, clean_text is applied
# once to each produced file.
#
# Every path interpolated into a shell command goes through ESCAPE
# (presumably a shell-escaping callable defined elsewhere -- confirm).
#
# Returns an Array of the text file paths written. The temporary directory is
# removed whether or not a command fails.
def extract_from_ocr(pdf, pages)
  tempdir = Dir.mktmpdir
  base_path = File.join(@output, @pdf_name)
  escaped_pdf = ESCAPE[pdf]
  paths = []
  if pages
    pages.each do |page|
      tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
      escaped_tiff = ESCAPE[tiff]
      file = "#{base_path}_#{page}"
      run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
      run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
      file_name = file + '.txt'
      paths << file_name
      clean_text(file_name) if @clean_ocr
      # Drop each page image as soon as it has been read to keep disk use low.
      FileUtils.remove_entry_secure tiff
    end
  else
    tiff = "#{tempdir}/#{@pdf_name}.tif"
    escaped_tiff = ESCAPE[tiff]
    run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
    # Escape the output base like the per-page branch does, so output
    # directories containing spaces or shell metacharacters work.
    run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} 2>&1"
    file_name = base_path + '.txt'
    paths << file_name
    clean_text(file_name) if @clean_ocr
  end
  paths
ensure
  # tempdir is nil if Dir.mktmpdir itself raised.
  FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
end
Expand Down Expand Up @@ -104,16 +115,19 @@ def run(command)
# Dump the text of the entire PDF into "<@pdf_name>.txt" inside @output by
# running pdftotext with UTF-8 encoding, and hand back that file's path.
def extract_full(pdf)
  destination = File.join(@output, "#{@pdf_name}.txt")
  run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[destination]} 2>&1"
  destination
end

# Pull the text of one page straight out of the PDF with pdftotext, writing
# it to "<@pdf_name>_<page>.txt" inside @output. Unless OCR is forbidden, the
# page number is queued on `@pages_to_ocr` when the written file is shorter
# than MIN_TEXT_PER_PAGE. Returns the path of the text file.
def extract_page(pdf, page)
  destination = File.join(@output, "#{@pdf_name}_#{page}.txt")
  run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[destination]} 2>&1"
  if !@forbid_ocr && File.read(destination).length < MIN_TEXT_PER_PAGE
    @pages_to_ocr.push(page)
  end
  destination
end

def extract_options(options)
Expand All @@ -127,4 +141,4 @@ def extract_options(options)

end

end
end
13 changes: 13 additions & 0 deletions test/unit/test_extract_images.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,19 @@ def test_image_formatting
assert Dir["#{OUTPUT}/*.jpg"].length == 2
end

# Docsplit.extract_images should hand back the paths of the images it wrote:
# one path for a single format, and one path per format when several formats
# are requested for the same page. Equality and type checks use the dedicated
# assertions so a failure reports expected and actual values.
def test_return_value
  gif_only = Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "50x", :pages => 2, :output => OUTPUT)
  assert_kind_of Enumerable, gif_only
  assert_equal 1, gif_only.length
  assert gif_only.all? {|el| el =~ /\.gif/ }, "Expected only gif paths, got #{gif_only.inspect}"

  both = Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => [:jpg, :gif], :size => "50x", :pages => 2, :output => OUTPUT)
  assert_kind_of Enumerable, both
  assert_equal 2, both.length
  assert both.any? {|el| el =~ /\.gif/ }, "Expected a gif path in #{both.inspect}"
  assert both.any? {|el| el =~ /\.jpg/ }, "Expected a jpg path in #{both.inspect}"
end


def test_page_ranges
Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "50x", :pages => 2, :output => OUTPUT)
assert Dir["#{OUTPUT}/*.gif"] == ["#{OUTPUT}/obama_arts_2.gif"]
Expand Down
15 changes: 12 additions & 3 deletions test/unit/test_extract_text.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@
class ExtractTextTest < Test::Unit::TestCase

# Extracting all pages should write one text file per page into OUTPUT and
# return the path of each of those files. The extraction is run once and its
# return value is checked alongside the files on disk.
def test_paged_extraction
  return_value = Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 'all', :output => OUTPUT)
  assert_equal 2, Dir["#{OUTPUT}/*.txt"].length
  assert File.read("#{OUTPUT}/obama_arts_1.txt").match("Paid for by Obama for America")
  assert_kind_of Enumerable, return_value
  assert return_value.all? {|val| val =~ /\.txt/ }, "Expected only txt paths, got #{return_value.inspect}"
  assert_equal 2, return_value.length
end

def test_page_only_extraction
Expand All @@ -24,19 +27,25 @@ def test_capitalized_pdf_extraction
end

# A PDF with unicode content should yield three text files in OUTPUT, and the
# call should return the path of each one. The extraction is run once.
def test_unicode_extraction
  return_value = Docsplit.extract_text('test/fixtures/unicode.pdf', :pages => 'all', :output => OUTPUT)
  assert_equal 3, Dir["#{OUTPUT}/*.txt"].length
  assert_kind_of Enumerable, return_value
  assert return_value.all? {|val| val =~ /\.txt/ }, "Expected only txt paths, got #{return_value.inspect}"
  assert_equal 3, return_value.length
end

# corrosion.pdf is expected to go through OCR: each of its four pages should
# end up as a non-trivial text file in OUTPUT, and the call should return
# text file paths. The (slow) extraction is run once, and the path check uses
# a block so it matches the sibling tests and works on Rubies that lack the
# pattern form of Enumerable#all?.
def test_ocr_extraction
  return_value = Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT)
  4.times do |i|
    file = "corrosion_#{i + 1}.txt"
    assert_directory_contains(OUTPUT, file)
    assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that file with extracted text should have reasonable size"
  end
  assert_kind_of Enumerable, return_value
  assert return_value.all? {|val| val =~ /\.txt/ }, "Expected only txt paths, got #{return_value.inspect}"
end


def test_ocr_extraction_in_mock_language
exception = assert_raise(Docsplit::ExtractionFailed) {Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :language => "mock")}
assert exception.message.match("tessdata/mock"), "Expected problem with loading data for language 'mock'"
Expand Down

0 comments on commit 9789dd5

Please sign in to comment.