-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwikidot.py
executable file
·73 lines (67 loc) · 4.55 KB
/
wikidot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python
# -*- encoding: UTF8 -*-
# Copyright 2012 Philipp Klaus
# Part of https://github.com/vLj2/wikidot-to-markdown
import re ## The most important module here!
import string ## for string.join()
#import markdown
import uuid ## to generate random UUIDs using uuid.uuid4()
class WikidotToMarkdown(object):
    """Convert Wikidot wiki markup into Markdown.

    The conversion is driven by three public attributes that callers may
    inspect or extend after construction:

    * ``static_replacements`` -- literal substrings mapped to replacements.
    * ``regex_replacements`` -- regex patterns (compiled with
      ``re.MULTILINE``) mapped to ``re.sub`` replacement templates.
    * ``url_regex`` -- the pattern used to recognise URLs in images and links.

    ``regex_split_condition`` is the pattern ``split_text`` uses to decide
    where a new part starts (a level-1 ``+ `` heading line).
    """

    def __init__(self):
        """Set up the URL pattern and the replacement tables."""
        # regex for URL found on http://regexlib.com/REDetails.aspx?regex_id=501
        self.url_regex = r"(http|https|ftp)\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2}))(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*[/]?"
        self.static_replacements = {
            '[[toc]]': '',  # no equivalent for table of contents in Markdown
        }
        self.regex_replacements = {
            # headings: one to five leading '+' become one to five '#'
            r'^\+ ([^\n]*)$': r"# \1\n",
            r'^\+\+ ([^\n]*)$': r"## \1\n",
            r'^\+\+\+ ([^\n]*)$': r"### \1\n",
            r'^\+\+\+\+ ([^\n]*)$': r"#### \1\n",
            r'^\+\+\+\+\+ ([^\n]*)$': r"##### \1\n",
            # The leading ([^:]) keeps the '//' of 'http://' from being
            # treated as an italics marker.
            r'([^:])//([\s\S ]*?)//': r'\1*\2*',  # italics
            r'([^:])__([\s\S ]*?)__': r'\1**\2**',  # underlining -> bold
            r'([^:]){{([\s\S ]*?)}}': r'\1`\2`',  # inline monospaced text
        }
        self.regex_split_condition = r"^\+ ([^\n]*)$"

    def convert(self, text):
        """Return *text* (Wikidot markup) converted to Markdown.

        ``[[code]]`` blocks are swapped for random placeholders before any
        other rule runs, so no replacement touches their content; they are
        put back at the end as indented Markdown code blocks.

        Any remaining closing tag of the form ``[[/name]]`` is reported on
        standard output as an unhandled tag; the text itself is left as is.

        Args:
            text: the Wikidot source as a single string.

        Returns:
            The converted Markdown string.
        """
        # Embed in newlines (makes the line-anchored regex replaces work
        # better and guarantees every URL has a preceding character).
        text = '\n' + text + '\n'

        # First we search for [[code]] statements as we don't want any
        # replacement to happen inside those code blocks!
        code_blocks = {}
        code_regex = re.compile(
            r'(\[\[code( type="([\S]+)")?\]\]([\s\S ]*?)\[\[/code\]\])',
            re.MULTILINE)
        for code_block_found in code_regex.findall(text):
            tmp_hash = str(uuid.uuid4())
            # Replace the code block with a placeholder - filled in later.
            text = text.replace(code_block_found[0], tmp_hash, 1)
            code_lines = code_block_found[-1].strip().split("\n")
            # Markdown only treats lines indented by four spaces as code.
            code_blocks[tmp_hash] = (
                "\n" + "\n".join("    " + line for line in code_lines) + "\n")

        for search, replacement in self.static_replacements.items():
            text = text.replace(search, replacement, 1)

        # The simpler replacements from the dictionary regex_replacements.
        for s_reg, r_reg in self.regex_replacements.items():
            text = re.sub(re.compile(s_reg, re.MULTILINE), r_reg, text)

        # Images of the form [[image https://linyehui.com/test.png]].
        def image_to_markdown(match):
            # The last group is the final path segment of the URL; it is
            # None when the URL has no path, so fall back to empty alt text.
            alt_text = match.groups()[-1] or ""
            return "![%s](%s)" % (alt_text, match.group(1))

        text = re.sub(r"\[\[image (" + self.url_regex + r")\]\]",
                      image_to_markdown, text)

        # Simple http://www.google.com links. Substituting at the match
        # position (instead of replacing the first textual occurrence)
        # keeps repeated URLs and URLs inside [..] / (..) intact.
        def bare_url_to_markdown(match):
            leading = match.group(0)[0]
            if leading in "[(":
                # Part of a [url title] link or an already converted
                # (url) target: leave untouched.
                return match.group(0)
            return "%s<%s> " % (leading, match.group(1))

        text = re.sub(r"[\s\S\n ](" + self.url_regex + r")",
                      bare_url_to_markdown, text)

        # Links of the form [http://www.google.com Google Website].
        def titled_link_to_markdown(match):
            return "[%s](%s)" % (match.groups()[-1], match.group(1))

        text = re.sub(r"\[(" + self.url_regex + r") ([^\]]*)\]",
                      titled_link_to_markdown, text)

        # Search for unhandled tags and state them.
        for unhandled_tag in re.finditer(r"\[\[/([\s\S ]*?)\]\]", text):
            print("Found an unhandled tag: %s" % unhandled_tag.group(1))

        # Now we substitute back our code blocks.
        for tmp_hash, code in code_blocks.items():
            text = text.replace(tmp_hash, code, 1)

        # Drop the two newlines added at the top of this method.
        return text[1:-1]

    def split_text(self, text):
        """Split *text* into parts, each starting at a level-1 heading.

        A new part begins at every line matching ``regex_split_condition``;
        all other lines are appended to the current part. Every line,
        including the last one, is terminated with a newline in the output.

        Args:
            text: the Wikidot source as a single string.

        Returns:
            A list of strings; it always holds at least one element.
        """
        output_parts = []
        split_regex = re.compile(self.regex_split_condition)
        for line in text.split("\n"):
            line += "\n"
            if output_parts and split_regex.match(line) is None:
                output_parts[-1] += line
            else:
                output_parts.append(line)
        return output_parts