-
Notifications
You must be signed in to change notification settings - Fork 0
/
xmlDir.py
191 lines (159 loc) · 6.33 KB
/
xmlDir.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#!/usr/bin/env python
"""
xmlDir.py - set up files for OCR processing
- art rhyno <http://projectconifer.ca/>
Hadoop jobs produce files with the prefix "part*"
and need to be sorted out afterwards. This
program produces the base xml list of images
and sets up the script to run the processing.
(c) Copyright GNU General Public License (GPL)
"""
from array import array
from xml.etree import ElementTree as ET
import glob,json, os, tempfile
import traceback
import Image
import sys,os,re,glob
import urllib
import cStringIO
from xml.dom import minidom
from xml.etree import ElementTree
from xml.etree.ElementTree import Element, SubElement, Comment, tostring
from datetime import datetime
class newsImg:
    """Simple record for one page image: its name plus width and height.

    The three values are stored exactly as supplied by the caller.
    """

    def __init__(self, imgName, width, height):
        self.imgName, self.width, self.height = imgName, width, height
def find_after(s, first):
    """Return the text of s that follows the first occurrence of first.

    An empty string is returned when first does not occur in s.
    """
    pos = s.find(first)
    if pos < 0:
        return ""
    return s[pos + len(first):]
def cleanprint(elem):
    """Return elem serialized as indented ("pretty-printed") XML text.

    The element is dumped as UTF-8 by ElementTree, re-parsed with minidom
    and rendered through toprettyxml.
    See http://renesd.blogspot.com/2007/05/pretty-print-xml-with-python.html
    """
    return minidom.parseString(
        ElementTree.tostring(elem, 'utf-8')
    ).toprettyxml(indent=" ")
def extractDate(dtName):
    """Return the second "_"-separated field of dtName.

    The field is also echoed to stdout, prefixed with "return".
    Raises IndexError when dtName contains no underscore.
    """
    date_field = dtName.split("_")[1]
    # Single-argument form: prints the same text under Python 2 and 3.
    print("return %s" % date_field)
    return date_field
def xmlDirEntry(haPath,webPath):
    """Build the shell commands that post-process one set of "part*" files.

    haPath is interpolated into the echo/cat commands that gather the
    part files; webPath is the directory that is created and receives the
    generated .box, .html, .xml and page.txt files. The commands are
    returned as one string, each terminated by a newline.
    """
    pieces = [
        "echo \"prepping %s\"\n" % haPath,
        "cat %s/part* > results.txt\n" % haPath,
        "cat results.txt | python reducer.py > results2.txt\n",
        "cat results2.txt | sort > page.txt\n",
        "python sortout.py\n",
    ]
    # Every remaining command except the final cleanup targets webPath.
    web_templates = (
        "mkdir %s\n",
        "mv *.box %s\n",
        "mv *.html %s\n",
        "mv *.xml %s\n",
        "mv page.txt %s\n",
    )
    for template in web_templates:
        pieces.append(template % webPath)
    pieces.append("rm results*.txt\n")
    return "".join(pieces)
from optparse import OptionParser

# Command-line interface. The usage text lists the long options by hand, so
# each entry must match the name registered with add_option() below; the
# newspaper title option is "--newspaper" (it was previously advertised as
# "--name", which the parser rejects).
parser = OptionParser(usage="""
%prog [options] image1
Possible choices are:
--base -- specify base for work
--date -- specify date format
--hapath -- specify path for hadoop files
--image -- specify image extension
--webpath -- specify path for apache files
--newspaper -- specify newspaper title
""")
# options
parser.add_option("-b","--base",help="base for file",default=None)
parser.add_option("-d","--date",help="file date info",default=None)
parser.add_option("-a","--hapath",help="path for hadoop files",default=None)
parser.add_option("-i","--image",help="specify image extension",default=None)
parser.add_option("-w","--webpath",help="path for apache files",default=None)
parser.add_option("-n","--newspaper",help="newspaper title",default=None)
# Parses the process command line at import time; results are read by the
# processing code that follows as options.<name>.
(options,args) = parser.parse_args()
# Main driver. For every directory entry matching "*<base>_*" this writes
# "<name>.xml" describing the page images referenced by that entry's "part*"
# files, and appends the matching post-processing commands to "<base>.sh".
# NOTE(review): this copy of the file carries no indentation, so the nesting
# of the statements below cannot be read off the text; restore it from the
# upstream file before running. Spots where the nesting is genuinely
# ambiguous are flagged individually.
if options.base:
# Shell script that collects one xmlDirEntry() fragment per processed entry.
taskFolder = open(options.base + ".sh", 'w')
fileList = glob.glob('*' + options.base + '_*')
# Order the entries by the field extractDate() pulls out of each name.
fileList = sorted(fileList,key=extractDate)
for name in fileList:
print "name", name
xmlfile = open(name + '.xml','wb')
# XML skeleton: newspaper > base > reel (> issue > image, added below).
newspaper = Element('newspaper')
base = SubElement(newspaper,'base',{'name':options.newspaper,'id':options.base,})
reel = SubElement(base,'reel',{'name':name,'id':name,})
# Without --date a single issue, named after the entry, holds every image.
if not options.date:
issue = SubElement(reel,'issue',{'name':name,})
print "->", (name + '/part*')
partList = glob.glob(name + '/part*')
newsImgs = []
img_no = 0
for part in partList:
print "part", part
# filebit is assigned here but not read anywhere in the visible code.
filebit = part.split('/')
last_file = '@@@'
# NOTE(review): no close() for this handle appears in the visible code.
file = open(part, 'r')
for line in file:
try:
# A usable line must split into exactly seven tab-separated fields; any
# other shape raises on unpacking and is handled by the except below.
char_cnt, fileinfo, word, x0, y0, x1, y1 = line.split('\t', 7)
# Entries whose path starts with '/tmp/' get the extra renaming step below.
odw_flag = False
if fileinfo.startswith('/tmp/'):
odw_flag = True
# Drop any directory part, keeping the text after the last '/'.
last_slash = fileinfo.rfind('/')
if last_slash > -1:
last_slash+=1
fileinfo = fileinfo[last_slash:]
# For flagged entries: turn the first '_' into '@', split on '@', and keep
# the second piece only from its first '.' onwards before re-joining it to
# the first piece.
# NOTE(review): whether the re-join (fileinfo = file_tmp[0] + ...) sits
# inside "if first_dot > -1" or after it cannot be told from this copy.
if odw_flag:
fileinfo = fileinfo.replace('_', '@',1)
file_tmp = fileinfo.split('@')
file_tmp_num = file_tmp[1]
first_dot = file_tmp_num.find('.')
if first_dot > -1:
file_tmp_num = file_tmp_num[first_dot:]
fileinfo = file_tmp[0] + file_tmp_num
# NOTE(review): this else could pair with "if odw_flag" or with
# "if first_dot > -1"; confirm against the upstream file.
else:
fileinfo = fileinfo.replace('@', '_',1)
# The first three '_'-separated fields are read as name, width and height.
file_parts = fileinfo.split('_')
filename = file_parts[0]
width = file_parts[1]
height = file_parts[2]
# Bare except: any failure above marks the line as unusable (empty name).
except:
filename = ''
# Record an image when the name differs from the previously seen one.
if len(filename) > 0:
if filename != last_file:
#img_no = img_no + 1
#img = SubElement(issue,'image',{'w':width,'h':height,'no':str(img_no),'align':'0'})
#img.text = filename
#print "file", filename
#print "width", width
#print "height", height
newsImgs.append(newsImg(filename,width,height))
last_file = filename
# From here on last_file tracks the previous issue label, not a file name.
last_file = '@@@'
issue_str = None
newsImgs = sorted(newsImgs,key=lambda seg:seg.imgName)
for newsPage in newsImgs:
img_no = img_no + 1
# With --date: the first three '-'-separated fields of the image name are
# parsed using the --date value as the strptime format and re-rendered as
# "<month name> <day>, <year>".
if options.date:
date_parts = newsPage.imgName.split('-')
issue_str = date_parts[0]+'-'+date_parts[1]+'-'+date_parts[2]
issue_date = datetime.strptime(issue_str, options.date)
issue_str = issue_date.strftime('%B') + ' ' + issue_date.strftime('%d') + ', ' + issue_date.strftime('%Y')
# A changed label starts a new issue element and restarts numbering at 1.
if issue_str != last_file and options.date:
issue = SubElement(reel,'issue',{'name':issue_str,})
img_no = 1
img = SubElement(issue,'image',{'w':newsPage.width,'h':newsPage.height,'no':str(img_no),'align':'0'})
img_name = newsPage.imgName
# With --image: keep the text before the first '.' and append the given
# extension.
if options.image:
img_parts = img_name.split('.')
img_name = img_parts[0] + "." + options.image
img.text = img_name
last_file = issue_str
xmlfile.write(cleanprint(newspaper))
xmlfile.close()
# date_info is the part of the entry name after "<base>_"; it names the
# directory under --webpath used in the generated commands.
date_info = find_after(name,options.base + "_")
taskFolder.write(xmlDirEntry(options.hapath+'/'+name,options.webpath+'/'+date_info))
taskFolder.close()
# Presumably pairs with "if options.base": reported when --base is missing.
else:
print "missing parameter(s)"