-
Notifications
You must be signed in to change notification settings - Fork 0
/
converter.py
301 lines (240 loc) · 8.79 KB
/
converter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
import mammoth
from bs4 import BeautifulSoup
import pprint
from yaml import load, dump
# Four-space indentation unit prefixed to the diagnostic messages printed by this module.
SEP = " " * 4
def get_city_image(city):
    """Return the site-relative image path associated with a city.

    The city name is normalised with ``str.capitalize`` before the lookup.
    For a city that is not in the table a diagnostic line is printed and an
    empty string is returned.
    """
    known_images = {
        "Lyon": "/imgs/photos/villes/lyon3.jpg",
        "Lille": "/imgs/photos/villes/lille.jpg",
        "Paris": "/imgs/photos/villes/paris3.jpg",
    }
    city = city.capitalize()
    image = known_images.get(city)
    if image is not None:
        return image
    print(f"{SEP*2}Unknown city: {city}")
    return ""
class Node:
    """A titled entry of the document outline with optional content and children."""

    def __init__(self, title=None):
        self.title = title
        # Either a string, a list of strings, or None until add_content is called.
        self.content = None
        self.children = []

    def add_child(self, title):
        """Append a new, empty child node carrying ``title``."""
        child = Node(title)
        self.children.append(child)

    def get_child(self, title):
        """Return the first child whose title equals ``title``.

        When no child matches, a diagnostic line is printed and a fresh,
        detached ``Node`` is returned so that chained calls keep working.
        """
        found = next(
            (child for child in self.children if child.title == title), None
        )
        if found is not None:
            return found
        print(f"{SEP}unknown child: {title}")
        return Node()

    def add_content(self, element):
        """Replace this node's content with the text carried by ``element``.

        ``ul`` elements are stored as a list with the text of each of their
        children; ``h1``/``h2``/``h3``/``p`` elements are stored as their text.

        Raises:
            TypeError: for any other element name.
        """
        tag = element.name
        if tag == "ul":
            self.content = [item.text for item in element.children]
        elif tag in ("h1", "h2", "h3", "p"):
            self.content = element.text
        else:
            raise TypeError(f"unknown name: {tag}")

    def __hash__(self):
        # Nodes hash by title only.
        return hash(self.title)

    def __iter__(self):
        # Iterating a node walks its direct children in insertion order.
        for child in self.children:
            yield child

    def __repr__(self):
        suffix = f"> {self.children}" if self.children else ""
        return f"\n{self.title}: {self.content} {suffix}"
def parse_file(fname):
    """Convert a .docx file into a ``Node`` tree that mirrors its heading outline.

    The document is turned into HTML with mammoth and its top-level elements
    are walked in order:

    * ``h1`` creates a child of the root;
    * ``h2`` creates a child of the current ``h1`` node;
    * ``h3`` creates a child of the current ``h2`` node;
    * any other element is stored, via ``Node.add_content``, on the deepest
      heading currently open. The exception is a ``p`` met while an ``h3`` is
      open: it is stored on a ``free_text`` child of the current ``h2`` node.

    Only ``h1`` skips the content step, so a new ``h2``/``h3`` node first gets
    its own heading text as content until a later element replaces it.

    Args:
        fname: Path of the .docx file to read.

    Returns:
        The root ``Node``; its children are the ``h1`` sections.

    Raises:
        TypeError: Propagated from ``Node.add_content`` for element types it
            does not accept.
    """
    tree = Node()
    last_h1 = None
    last_h2 = None
    last_h3 = None

    with open(fname, "rb") as docx_file:
        result = mammoth.convert_to_html(docx_file)
    # The generated HTML; result.messages (conversion warnings) is not used here.
    html = result.value

    soup = BeautifulSoup(html, "html.parser")
    for element in soup:
        if element.name == "h1":
            last_h1 = element.text
            last_h2, last_h3 = None, None
            tree.add_child(element.text)
            continue

        if element.name == "h2":
            last_h2 = element.text
            last_h3 = None
            tree.get_child(last_h1).add_child(element.text)

        if element.name == "h3":
            last_h3 = element.text
            tree.get_child(last_h1).get_child(last_h2).add_child(element.text)

        if last_h3:
            section = tree.get_child(last_h1).get_child(last_h2)
            if element.name == "p":
                # Reuse the section's existing free_text node. Creating a new
                # one per paragraph left empty duplicates behind, because
                # get_child() always returns the first match.
                free_text = next(
                    (child for child in section if child.title == "free_text"),
                    None,
                )
                if free_text is None:
                    section.add_child("free_text")
                    free_text = section.children[-1]
                free_text.add_content(element)
            else:
                section.get_child(last_h3).add_content(element)
        elif last_h2:
            tree.get_child(last_h1).get_child(last_h2).add_content(element)
        elif last_h1:
            tree.get_child(last_h1).add_content(element)
        else:
            # Content found before any heading: nowhere to attach it.
            print(f"{SEP}???")

    return tree
def prepare_output_programme(tree):
    """Build the training programme as a list of section dictionaries.

    Each child of the "Plan de formation / programme" node becomes one
    dictionary with:

    * ``title``: the section title;
    * ``elements``: one dictionary per child, holding its ``title`` and, when
      the child's content is a non-empty list, that list as ``subelements``;
    * ``free_text``: only present when the section has a ``free_text`` child
      with non-empty content.

    Args:
        tree: Root node exposing ``get_child``; sections and their children
            must be iterable and expose ``title`` and ``content``.

    Returns:
        A list of section dictionaries, in document order.
    """
    sections = []
    for section in tree.get_child("Plan de formation / programme"):
        free_text = ""
        elements = []
        for element in section:
            if element.title == "free_text":
                # Ignore empty free_text nodes so that they cannot erase text
                # captured from an earlier, filled one.
                if element.content:
                    free_text = element.content
                continue

            entry = {"title": element.title}
            if isinstance(element.content, list) and element.content:
                entry["subelements"] = list(element.content)
            elements.append(entry)

        section_data = {"title": section.title, "elements": elements}
        if free_text:
            section_data["free_text"] = free_text
        sections.append(section_data)
    return sections
def prepare_output_title(tree):
    """Return the content of the top-level "Titre" node."""
    node = tree.get_child("Titre")
    return node.content
def prepare_output_description(tree):
    """Return the content of the top-level "Description (10 lignes max)" node."""
    node = tree.get_child("Description (10 lignes max)")
    return node.content
def prepare_output_identifier(tree):
    """Return the content of the top-level "Identifiant technique" node."""
    node = tree.get_child("Identifiant technique")
    return node.content
def prepare_output_domaines(tree):
    """Return the content of the top-level "Domaine" node."""
    node = tree.get_child("Domaine")
    return node.content
def prepare_output_subdomain(tree):
    """Return the content of the top-level "Sous-domaine" node."""
    node = tree.get_child("Sous-domaine")
    return node.content
def prepare_output_url(tree):
    """Return the content of the top-level "Url" node."""
    node = tree.get_child("Url")
    return node.content
def prepare_output_weight(tree):
    """Return the content of the top-level "Ordre dans la page" node."""
    node = tree.get_child("Ordre dans la page")
    return node.content
def prepare_output_catchphrase(tree):
    """Return the content of the top-level "Catch phrase (2 lignes max)" node."""
    node = tree.get_child("Catch phrase (2 lignes max)")
    return node.content
def prepare_output_duration(tree):
    """Return the content of the top-level "Durée" node."""
    node = tree.get_child("Durée")
    return node.content
def prepare_output_equilibre(tree):
    """Return the content of the top-level "Équilibre théorie / pratique" node."""
    node = tree.get_child("Équilibre théorie / pratique")
    return node.content
def prepare_output_pricing(tree):
    """Return the content of the top-level "Tarifs" node."""
    node = tree.get_child("Tarifs")
    return node.content
def prepare_output_objectifs(tree):
    """Return the content of the top-level "Objectifs pédagogiques" node."""
    node = tree.get_child("Objectifs pédagogiques")
    return node.content
def prepare_output_prerequis(tree):
    """Return the content of the "Prérequis" node, wrapping a lone string in a list.

    Any non-string content (a list, or ``None``) is returned unchanged.
    """
    prerequisites = tree.get_child("Prérequis").content
    return [prerequisites] if isinstance(prerequisites, str) else prerequisites
def prepare_output_publics(tree):
    """Return the content of the top-level "Public visé / participants" node."""
    node = tree.get_child("Public visé / participants")
    return node.content
def prepare_output_sessions(tree):
    """List the upcoming sessions found under the "Prochaines sessions" node.

    Each child of that node yields a dictionary whose ``city`` is the child's
    title, ``date`` its content, and ``image`` the result of
    ``get_city_image`` for that title.
    """
    planned = tree.get_child("Prochaines sessions").children
    return [
        {
            "city": session.title,
            "date": session.content,
            "image": get_city_image(session.title),
        }
        for session in planned
    ]
def preprare_output_draft(tree):
    """Tell whether the document must still be published as a draft.

    The content of the "Contenu validé" node is read; the document is
    considered validated only when that content is the word "oui", compared
    case-insensitively and ignoring surrounding whitespace. Anything else
    (missing content, a list, another word) keeps the document in draft.

    The misspelled function name is kept as-is because ``prepare_output``
    calls it under this name.

    Returns:
        ``False`` when the content is validated, ``True`` otherwise.
    """
    content = tree.get_child("Contenu validé").content
    # Text extracted from a document may carry stray spaces or different
    # capitalisation; normalise before comparing.
    validated = isinstance(content, str) and content.strip().lower() == "oui"
    return not validated
def prepare_output(tree, fname_output):
    """Write the parsed document as a markdown file with YAML front matter.

    The front matter gathers every ``prepare_output_*`` field in a fixed
    order; the description is written as the body below it.

    Args:
        tree: Root node of the parsed document.
        fname_output: Path of the markdown file to create or overwrite.
    """
    frontmatter_data = {
        "title": prepare_output_title(tree),
        "draft": preprare_output_draft(tree),
        "identifier": prepare_output_identifier(tree),
        "domaines": prepare_output_domaines(tree),
        "subdomain": prepare_output_subdomain(tree),
        "url": prepare_output_url(tree),
        "weight": prepare_output_weight(tree),
        "catchphrase": prepare_output_catchphrase(tree),
        "duration": prepare_output_duration(tree),
        "equilibre": prepare_output_equilibre(tree),
        "pricing": prepare_output_pricing(tree),
        "objectifs": prepare_output_objectifs(tree),
        "prerequis": prepare_output_prerequis(tree),
        "publics": prepare_output_publics(tree),
        "programme": prepare_output_programme(tree),
        "sessions": prepare_output_sessions(tree),
    }
    description = prepare_output_description(tree)

    # Serialize before opening the target so that a failure here does not
    # leave a truncated, empty output file behind.
    frontmatter = dump(
        frontmatter_data,
        allow_unicode=True,
        default_flow_style=False,
        sort_keys=False,
    )

    # allow_unicode=True emits non-ASCII characters verbatim, so the file is
    # written explicitly as UTF-8 instead of the platform default encoding.
    # The context manager guarantees the handle is flushed and closed.
    with open(fname_output, "w", encoding="utf-8") as content_file:
        content_file.write(f"---\n{frontmatter}\n---\n\n{description}\n")
def convert_all():
    """Convert every known .docx source into its markdown counterpart.

    For each (source, target) pair the file names are announced, the source
    is parsed, its identifier is recorded and the markdown file is written.
    The list of collected identifiers is printed once all pairs are done.
    """
    names = [
        (
            "./content_google_doc/datascience/01_SQL Pratique.docx",
            "./content/formations/datascience/sql_pratique.md",
        ),
        (
            "./content_google_doc/developpement/01_Initiation à la Programmation avec Python .docx",
            "./content/formations/developpement/python/python_debutant.md",
        ),
        (
            "./content_google_doc/developpement/02_Python objet.docx",
            "./content/formations/developpement/python/python_objet.md",
        ),
        (
            "./content_google_doc/developpement/03_Python avancé.docx",
            "./content/formations/developpement/python/python_avance.md",
        ),
        (
            "./content_google_doc/developpement/04_Python idiomatique.docx",
            "./content/formations/developpement/python/python_idiomatique.md",
        ),
        (
            "./content_google_doc/developpement/05_Programmation concurrente en python.docx",
            "./content/formations/developpement/python/python_concurrence.md",
        ),
        (
            "./content_google_doc/developpement/06_Clean code en Python et gestion de code legacy.docx",
            "./content/formations/developpement/python/python_clean_code.md",
        ),
        (
            "./content_google_doc/developpement/07_Distribuer son code python.docx",
            "./content/formations/developpement/python/python_distribution.md",
        ),
    ]

    identifiers = []
    for source_path, target_path in names:
        # Only the file names (last path component) are shown in the log line.
        source_name = source_path.rsplit("/", 1)[-1]
        target_name = target_path.rsplit("/", 1)[-1]
        print(f"Converting {source_name} -> {target_name}")

        parsed = parse_file(source_path)
        identifiers.append(prepare_output_identifier(parsed))
        prepare_output(parsed, target_path)
        print("done")

    print(identifiers)
# Run the full conversion only when this file is executed as a script.
if __name__ == "__main__":
    convert_all()