-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_mmd.py
46 lines (32 loc) · 1.72 KB
/
parse_mmd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import re
def _find_closing_brace(text, start):
    """Return the index of the ``}`` that closes a group opened just before *start*.

    Scanning begins at *start* with one brace already open. Nested ``{...}``
    groups are tracked by depth, and a backslash escapes the character that
    follows it (so ``\\{`` and ``\\}`` do not affect the depth).

    Returns -1 when the group is never closed.
    """
    depth = 1
    i = start
    n = len(text)
    while i < n:
        ch = text[i]
        if ch == '\\':
            # Skip the escaped character so "\{" / "\}" are not counted.
            i += 2
            continue
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return i
        i += 1
    return -1


def _strip_braced_command(text, name, keep_content=False, flags=0):
    """Remove every ``\\name{...}`` from *text*, matching braces by depth.

    With ``keep_content=True`` the command wrapper is dropped but the text
    between the braces is kept. *flags* are ``re`` flags applied when looking
    for the ``\\name{`` opener. An occurrence whose braces never balance is
    left untouched.
    """
    opener = re.compile(r'\\' + re.escape(name) + r'\{', flags)
    pieces = []
    pos = 0
    while True:
        match = opener.search(text, pos)
        if match is None:
            break
        close = _find_closing_brace(text, match.end())
        if close == -1:
            # Unbalanced: keep this opener verbatim and keep looking after it.
            pieces.append(text[pos:match.end()])
            pos = match.end()
            continue
        pieces.append(text[pos:match.start()])
        if keep_content:
            pieces.append(text[match.end():close])
        pos = close + 1
    pieces.append(text[pos:])
    return ''.join(pieces)


def clean_markdown(text):
    """Flatten a Markdown/LaTeX-flavoured document into one line of plain text.

    The following steps are applied in order:

    1. ``\\title{...}`` (any letter case) is replaced by its content.
    2. ``\\author{...}`` is removed together with its content.
    3. ``\\begin{abstract}...\\end{abstract}`` is replaced by its body,
       prefixed with the label ``ABSTRAK: ``.
    4. ``Keywords:`` / ``Kata kunci:`` sections (any letter case) are removed
       up to the next blank line or the end of the text.
    5. ``\\footnotetext{...}`` is removed together with its content.
    6. Every remaining ``\\command`` is removed along with any ``{...}``
       arguments that directly follow it.
    7. Runs of ``*``, runs of ``#`` followed by whitespace, and fenced
       code blocks are removed.
    8. Lines are stripped, empty lines are dropped, and the rest are joined
       with single spaces.

    Args:
        text: The document contents as a string.

    Returns:
        The cleaned text as a single string without newlines.
    """
    # Unwrap the title and keep its content. Braces are matched by depth so a
    # title such as "\title{A {B} C}" is not cut at the first "}".
    text = _strip_braced_command(text, 'title', keep_content=True, flags=re.IGNORECASE)
    # Drop the author block, including nested commands like \textbf{...}.
    text = _strip_braced_command(text, 'author')
    # Keep the abstract body but replace the environment with an "ABSTRAK:" label.
    text = re.sub(r'\\begin\{abstract\}([\s\S]*?)\\end\{abstract\}', r'ABSTRAK: \1', text)
    # Remove keyword sections: from the label up to the next blank line or end of text.
    text = re.sub(r'Keywords:\s*.*?(?=\n\n|\Z)', '', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'Kata kunci:\s*.*?(?=\n\n|\Z)', '', text, flags=re.DOTALL | re.IGNORECASE)
    # Drop footnotes; their content often holds nested braces (e.g. "${ }^{1}$").
    text = _strip_braced_command(text, 'footnotetext')
    # Remove any remaining LaTeX-style commands and their directly attached arguments.
    text = re.sub(r'\\[a-zA-Z]+(\{.*?\})*', '', text)
    # Remove Markdown emphasis markers and heading markers.
    text = re.sub(r'\*+', '', text)
    text = re.sub(r'#+\s', '', text)
    # Remove fenced code blocks.
    text = re.sub(r'```[\s\S]*?```', '', text)
    # Drop empty lines and join the remaining ones into a single line.
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    return ' '.join(lines)
# Input document and output destination used by this script.
source_path = '/data/users/brandon/ob1-projects/data_processing/md_completed/2024_10_08_d5e2bec671d0129bd68ag.mmd'
target_path = '/data/users/brandon/ob1-projects/data_processing/cleaned_output.txt'

# Load the whole input document as UTF-8 text.
with open(source_path, 'r', encoding='utf-8') as reader:
    raw_document = reader.read()

# Clean first, so the output file is only opened once the text is ready.
flattened = clean_markdown(raw_document)

# Persist the cleaned text as UTF-8.
with open(target_path, 'w', encoding='utf-8') as writer:
    writer.write(flattened)

print("Cleaning complete. Output saved to 'cleaned_output.txt'")