#!/usr/bin/env ruby
require 'rubygems'
require 'nokogiri'
require 'fastercsv'
#
# This script parses the HTML files from the forensics HTML output
# and creates a CSV file for importing into Fedora.
# Point the script at a directory containing the HTML bookmark files.
#
# Step 1.
# ./html_to_csv.rb /tmp/Chris_08_27/Gould_08_27_html/ Gouldoutput.csv
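#
# Assumed input structure (inferred from the selectors used below; the exact
# bookmark-export markup may differ): each HTML file holds one
# <th class="columnHead"> containing the subseries name, plus
# <span class="bkmkColRight bkmkValue"> elements, 13 per described file,
# in the same order as the CSV columns that follow "subseries".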
class HtmlToCsv
  attr_accessor :csv
  attr_accessor :directory
  attr_accessor :output

  def initialize(directory=nil, output=nil)
    if directory.nil? || output.nil?
      raise "You must pass a directory and output file for processing"
    elsif File.exist?(directory)
      @directory = directory
      @output = output
      @csv = FasterCSV.open(@output, 'w', :headers => true)
      header = ["subseries", "filename", "path", "size", "created", "modified", "accessed", "md5", "sha1",
                "flagged", "labels", "comment", "type", "exportedAs"]
      @csv << header
    else
      raise "#{directory} does not exist."
    end
  end #initialize
  def process
    puts "Searching #{@directory}"
    Dir["#{@directory}/*.html"].each do |f|
      puts "Processing #{f} .... "
      rows = parseHTML(f)
      next if rows.nil?
      rows.each { |row| @csv << row }
    end #dir each
    @csv.close
  end #process
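
  # For reference, each row written by #process has one value per header column;
  # a hypothetical example (values are illustrative only):
  #   ["Correspondence", "letter01.doc", "/evidence/letter01.doc", "24576", ...]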
  # This method takes a filename globbed from the directory and returns an array of rows.
  def parseHTML(f)
    doc = Nokogiri::HTML(File.read(f))
    # First get the subseries name for this file. Each file has one subseries name.
    subseriesXML = doc.search('//th[@class = "columnHead"]').first
    unless subseriesXML.nil?
      subseries = subseriesXML.content
    end
    # Now iterate through the spans, which hold the attributes of each file.
    spans = doc.search('//span[@class = "bkmkColRight bkmkValue"]')
    # All files should have 13 attributes (not including the subseries).
    # If the HTML file has fewer, then no files are described in this HTML file.
    puts "Found #{spans.length} values"
    if spans.length < 13
      puts "No files described in #{f}"
      return nil
    else
      a = spans.to_a
      rows = []
      while a.length > 12
        row = [subseries]
        ss = a.slice!(0..12)
        ss.each { |s| row << s.content }
        rows << row
      end #while
      return rows
    end #if
  end #parseHTML
end #class
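
# A minimal sketch of driving the class from another Ruby script (hypothetical
# paths; assumes this file has already been loaded, e.g. via require):
#
#   converter = HtmlToCsv.new('/path/to/bookmark_html', 'output.csv')
#   converter.process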
#========== This is the equivalent of a Java main method ==========#
if __FILE__ == $0
  html_to_csv = HtmlToCsv.new(ARGV[0], ARGV[1])
  html_to_csv.process
end