-
Notifications
You must be signed in to change notification settings - Fork 1
/
convert_objects.rb
executable file
·142 lines (105 loc) · 4.38 KB
/
convert_objects.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/env ruby
require 'rubygems'
require 'fileutils'
require 'nokogiri'
# This script converts HTML files into PostScript files, which are needed for the
# later conversion steps (JPEG 2000 page images, PDF, and per-page text files).
# This requires: perl
# html2ps perl script and the sample profile ( http://user.it.uu.se/~jan/html2ps.html )
# ps2pdf to convert PostScript files to PDF (http://www.ps2pdf.com/)
# ImageMagick with jasper/jp2000 libraries installed (http://www.imagemagick.org)
# pdftotext to extract the text of each page from the PDF
# Step 3
# ./convert_objects.rb /tmp/testout
# Converts a single HTML file, or every .htm/.html file below a directory, into
# PostScript, JPEG 2000 page images, PDF and per-page text files by shelling out
# to html2ps, ImageMagick's `convert`, ps2pdf and pdftotext.
#
# External commands are started with the multi-argument form of `system`, so
# paths are handed to the programs verbatim and never interpreted by a shell.
class ConvertObjects
  # Temporary stem given to ImageMagick's JPEG 2000 output. Only files carrying
  # this stem are renumbered, so pages renamed by an earlier call are left alone.
  JP2_PLACEHOLDER = 'XXZZYY'

  attr_accessor :source

  # @param source [String, nil] path to an HTML file or a directory of HTML files
  # @raise [RuntimeError] when source is missing or does not exist on disk
  def initialize(source = nil)
    # File.exist? is used because File.exists? was removed in Ruby 3.2.
    unless source && File.exist?(source)
      raise "You must provide either a html file or a directory of html files"
    end

    @source = source
  end

  # Runs the conversion on #source, dispatching on whether it is a directory
  # or a single file.
  def process
    if File.directory?(source)
      processDirectory(source)
    else
      processFile(source)
    end
  end

  # Converts every .htm / .html file found recursively below the directory.
  #
  # @param directory [String] path of the directory to scan
  def processDirectory(directory)
    Dir["#{directory}/**/*.htm", "#{directory}/**/*.html"].each do |f|
      processFile(f)
    end
  end

  # Runs the full conversion pipeline for one HTML file. All derivatives are
  # written next to the HTML file. Any StandardError raised along the way is
  # printed and swallowed so that a directory run continues with the next file.
  #
  # @param file [String] path of the HTML file
  def processFile(file)
    dir = File.dirname(file)
    # Strip the file's real extension rather than searching the whole path for
    # ".html", which picked the wrong suffix when a directory name contained it.
    psFile = File.join(dir, File.basename(file, File.extname(file)) + ".ps")

    removeGif(file) # remove the watermark link
    system("perl", "./html2ps/html2ps", "-f", "./html2ps/sample", "-o", psFile, file) # convert to .ps
    system("convert", psFile, File.join(dir, "#{JP2_PLACEHOLDER}.jp2")) # convert to jpeg2000
    fixJp2PageNumbers(dir) # ImageMagick numbers pages from 0; shift them to start at 1
    convertPsToPdf(dir)    # PostScript -> PDF
    convertPsToText(dir)   # PDF -> one text file per page
  rescue StandardError => e
    # StandardError (not Exception) so Ctrl-C and exit still terminate the run.
    puts e
  end

  # Removes the links to http://www.avantstar.com that the forensic toolkit adds
  # to the HTML for watermarking, prints every <img> element of the document, and
  # rewrites the file in place with the document's XML serialization.
  #
  # @param file [String] path of the HTML file, modified in place
  def removeGif(file)
    # The block form closes the handle; File.open also avoids Kernel#open's
    # special treatment of paths that start with "|".
    doc = File.open(file) { |io| Nokogiri::HTML(io) }
    doc.search("//a[@href='http://www.avantstar.com']").each { |g| g.remove }
    doc.search("//img").each { |j| puts j }
    File.open(file, 'w') { |output| output << doc.to_xml }
  end

  # Renames freshly generated page images from ImageMagick's numbering
  # (XXZZYY.jp2 for a single page, XXZZYY-0.jp2, XXZZYY-1.jp2, ... otherwise)
  # to page-1.jp2, page-2.jp2, ...
  #
  # Only files that still carry the placeholder stem are touched; files already
  # named page-N.jp2 are not renumbered again.
  #
  # @param directory [String] directory searched recursively for page images
  def fixJp2PageNumbers(directory)
    patterns = ["#{directory}/**/#{JP2_PLACEHOLDER}.jp2",
                "#{directory}/**/#{JP2_PLACEHOLDER}-*.jp2"]
    Dir[*patterns].each do |f|
      # A one-page document has no "-N" suffix; nil.to_i is 0, giving page 1.
      index = File.basename(f, '.jp2').split('-', 2)[1]
      newname = "page-#{index.to_i + 1}.jp2"
      FileUtils.mv f, File.join(File.dirname(f), newname), :verbose => true
    end
  end

  # Converts every PostScript file found recursively below the directory into a
  # PDF with the same base name, written next to the PostScript file.
  #
  # @param directory [String] directory searched recursively for .ps files
  def convertPsToPdf(directory)
    Dir["#{directory}/**/*.ps"].each do |f|
      fout = File.join(File.dirname(f), File.basename(f, '.ps') + ".pdf")
      system("ps2pdf", f, fout)
    end
  end

  # For every page image <name>-<N>.jp2 below the directory, extracts page N of
  # each PDF in the image's directory into page-<N>.txt in that same directory.
  # With several PDFs in one directory each one writes the same output file.
  #
  # @param directory [String] directory searched recursively for .jp2 files
  def convertPsToText(directory)
    Dir["#{directory}/**/*.jp2"].each do |f|
      num = File.basename(f, '.jp2').split("-")[1]
      next if num.nil? # no page number in the name, nothing to extract

      page_dir = File.dirname(f)
      text_file = File.join(page_dir, "page-#{num}.txt")
      Dir["#{page_dir}/*.pdf"].each do |pdf|
        system("pdftotext", "-f", num, "-l", num, "-layout", pdf, text_file)
      end
    end
  end
end
# Script entry point: runs only when this file is executed directly, not when it
# is required. The first command-line argument is the HTML file or directory.
if $PROGRAM_NAME == __FILE__
  ConvertObjects.new(ARGV.first).process
end