forked from agbilotia1998/slydok
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdoc.py
120 lines (96 loc) · 3.09 KB
/
doc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from docx import Document
import pprint
from collections import OrderedDict
import io
import xml.etree.ElementTree as ET
from images import extract_images
import re
pp = pprint.PrettyPrinter(indent=2)
filename = 'c.docx'
document = Document(filename)
images = extract_images(filename)
image_count = 1
def convert(p):
return p.text
def find_all_font_sizes(paragraphs):
sizes = set()
for p in paragraphs:
font_size = p.style.font.size
sizes.add(font_size)
return sorted(list(sizes), reverse=True)
def analyze_by_size(paragraphs, sizes):
cur_size = sizes[0]
lines = []
headings = 0
prev_heading = ''
d= OrderedDict()
d["children"] = {}
d["images"] = []
d["ul"] = []
d["text"] = []
image_count = 1
for p in paragraphs:
if len(p._p.r_lst[0].drawing_lst):
print p._p.r_lst[0].xml, dir(p._p.r_lst[0]), p._p.r_lst[0].drawing_lst
xml_string = str(p._p.xml)
result = re.search('graphic', xml_string)
if result is not None:
image_name = 'image{0}'.format(image_count)
print(image_name)
if 'images' in d:
d['images'].append(image_name)
else:
d['images'] = [image_name]
image_count += 1
if cur_size is not None and p.style.font.size == cur_size:
if headings > 0:
d['children'][prev_heading] = analyze_by_size(lines, sizes[1:])
elif headings == 0 and len(lines) > 0:
d['text'] = map(convert, lines)
prev_heading = p.text
d['children'][prev_heading] = {}
headings += 1
lines = []
else:
lines.append(p)
if headings > 0:
d['children'][prev_heading] = analyze_by_size(lines, sizes[1:])
else:
d['text'] = map(convert, lines)
return d
def analyze_by_heading(paragraphs, level):
heading = 'Heading {0}'.format(level)
lines = []
headings = 0
prev_heading = ''
d = OrderedDict()
for p in paragraphs:
if p.style.name == heading:
if headings > 0:
d[prev_heading] = analyze_by_heading(lines, level + 1)
elif headings == 0 and len(lines) > 0:
d['intro_text'] = map(convert, lines)
prev_heading = p.text
d[prev_heading] = {}
headings += 1
lines = []
else:
lines.append(p)
if headings > 0:
d[prev_heading] = analyze_by_heading(lines, level + 1)
return d
else:
return map(convert, lines)
def remove_extra_images(store):
if 'images' in store:
for image in store['images']:
for k in store.keys():
if isinstance(store[k], dict) and 'images' in store[k] and image in store[k]['images']:
store['images'].remove(image)
# store = analyze(document.paragraphs, 1)
# pp.pprint(store)
title = document.paragraphs[0].text
l = find_all_font_sizes(document.paragraphs[1:])
store = analyze_by_size(document.paragraphs[1:], l)
remove_extra_images(store)
pp.pprint(store)