-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmd2tex.py
executable file
·98 lines (88 loc) · 3.83 KB
/
md2tex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/python
# coding: utf-8
import hashlib
import mistune
import bs4
from bs4 import BeautifulSoup
import urllib
import os.path
# Read the whole Markdown source up front; the ``with`` block closes the
# input handle as soon as its content is in memory.
# NOTE: the name ``input`` shadows the builtin; it is kept so the module's
# existing global names stay unchanged.
with open("input.md", "r") as input:
    md = input.read()
# Opened here and left open: the end of the script writes the LaTeX to it.
output = open("converted.tex", "w")
# Markdown -> HTML -> parse tree that tex_output() walks.
html = mistune.markdown(md)
soup = BeautifulSoup(html, "html.parser")
# Replacement for every character that LaTeX treats specially in running text.
# The three command-style replacements end with ``{}`` so that LaTeX does not
# merge the command name with letters that follow it or swallow a following
# space.
_LATEX_ESCAPES = {
    "\\": "\\textbackslash{}",
    "&": "\\&",
    "%": "\\%",
    "$": "\\$",
    "#": "\\#",
    "_": "\\_",
    "{": "\\{",
    "}": "\\}",
    "~": "\\textasciitilde{}",
    "^": "\\textasciicircum{}",
}


def latex_escape(text):
    """Return *text* with LaTeX special characters escaped.

    Each character is looked up exactly once, so the backslashes and braces
    introduced by a replacement are never escaped a second time.

    Args:
        text: The string to escape.  Characters without an entry in
            ``_LATEX_ESCAPES`` (including the individual bytes of UTF-8
            encoded text) are copied through unchanged.

    Returns:
        The escaped string.
    """
    return "".join(_LATEX_ESCAPES.get(char, char) for char in text)
# Tags whose LaTeX form is a fixed template around their converted children;
# ``{0}`` receives the recursively converted content.
_SIMPLE_TAG_FORMATS = {
    "p": "\\paragraph{{}}\n{0}",
    "em": "\\textit{{{0}}}",
    "strong": "\\textbf{{{0}}}",
    # Here we assume there's only one level 1 title in the document!
    "h1": "\\part*{{{0}}}\n\\renewcommand{{\\contentsname}}{{Sommaire}}\\tableofcontents\\newpage\n",
    "h2": "\\section{{{0}}}",
    "h3": "\\subsection{{{0}}}",
    "h4": "\\subsubsection{{{0}}}",
    "h5": "\\paragraph{{{0}}}",
    "ul": "\n\\begin{{itemize}}\n{0}\\end{{itemize}}\n",
    "ol": "\n\\begin{{enumerate}}\n{0}\\end{{enumerate}}\n",
    "li": "\\item {0}\n",
    "code": "\\texttt{{{0}}}",
}


def _image_to_tex(img):
    """Return the ``figure`` environment for an ``img`` tag.

    The image is stored as ``img/<md5 of src>.<extension>`` and downloaded
    only when that file does not exist yet.  Returns an empty string when the
    download fails with ``IOError``, so a missing image never leaves a broken
    ``\\includegraphics`` behind.
    """
    # Attribute values are encoded to UTF-8 bytes like the text nodes, so
    # hashing and the byte-string templates below also work for non-ASCII
    # values.
    src = img["src"].encode("utf-8")
    # We use hashes to make sure two images with the same name won't conflict
    src_hash = hashlib.md5(src).hexdigest()
    # The extension is whatever follows the last dot of the source URL.
    extension = src.split(".")[-1]
    img_path = "img/" + src_hash + "." + extension
    try:
        if not os.path.isfile(img_path):
            print("An image has to be downloaded.")
            urllib.urlretrieve(src, img_path)
        else:
            print("Image already exists, not downloading it again.")
    except IOError:
        print("Downloading the image failed")
        return ""
    # A missing or blank ``alt`` simply means "no caption".
    alt = img.get("alt", "").encode("utf-8")
    if alt.strip():
        # Escaped so that e.g. a ``%`` in the alt text cannot comment out the
        # closing brace of the caption.
        caption = "\n\\caption{{{0}}}".format(latex_escape(alt))
    else:
        caption = ""
    image_format = "\n\\begin{{figure}}[h]\n\\centering\n\\includegraphics[width=0.7\\linewidth]{{{0}}}\n{1}\\end{{figure}}\n"
    return image_format.format(img_path, caption)


def tex_output(html_soup):
    """Convert the children of a parsed HTML node to LaTeX source.

    Walks the direct children of *html_soup* and recurses into the tags it
    knows about:

    * text nodes are UTF-8 encoded and LaTeX-escaped;
    * ``p``, ``em``, ``strong``, ``h1``-``h5``, ``ul``, ``ol``, ``li`` and
      ``code`` wrap their converted children in the matching template of
      ``_SIMPLE_TAG_FORMATS``;
    * ``a`` becomes ``\\url{...}`` built from its ``href`` (the link text is
      not emitted);
    * ``pre`` becomes an ``lstlisting`` environment holding the unescaped
      text of its ``code`` child;
    * ``img`` becomes a ``figure`` (see ``_image_to_tex``).

    Any other tag is skipped together with its content.

    This code targets Python 2: it relies on ``unicode`` and
    ``urllib.urlretrieve`` and produces byte strings.

    Args:
        html_soup: A node exposing ``.children``, e.g. a ``BeautifulSoup``
            document or one of its tags.

    Returns:
        The LaTeX source for all children, concatenated in document order.
    """
    parts = []
    for child in html_soup.children:
        tagname = child.name
        if tagname is None:
            # Nodes without a tag name are plain text.
            parts.append(latex_escape(unicode(child).encode("utf-8")))
        elif tagname in _SIMPLE_TAG_FORMATS:
            parts.append(_SIMPLE_TAG_FORMATS[tagname].format(tex_output(child)))
        elif tagname == "a":
            href = child["href"].encode("utf-8")
            parts.append("\\url{{{0}}}".format(latex_escape(href)))
        elif tagname == "pre":
            # A pre tag contains a code tag and no other formatting should
            # occur to it, so we fetch it directly with no recursive call
            lstlisting_format = "\n\\begin{{lstlisting}}\n{0}\n\\end{{lstlisting}}\n"
            parts.append(lstlisting_format.format(child.find("code").string.encode("utf-8")))
        elif tagname == "img":
            parts.append(_image_to_tex(child))
    return "".join(parts)
# Convert the parsed document and write it out; the ``with`` block closes the
# output file even if the conversion raises.
with output:
    output.write(tex_output(soup))