-
Notifications
You must be signed in to change notification settings - Fork 1
/
clean.pl
138 lines (117 loc) · 4.99 KB
/
clean.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/perl
# Program to filter Wikipedia XML dumps to "clean" text
# Written by Milton Huang, 2017. MIT licence
#
# Usage: perl clean.pl dump.xml > clean.txt   (or feed the dump on STDIN)
#
# The dump is read one <page> record at a time. For every page that has a
# <text> element and is not a redirect, the wiki markup, templates, HTML and
# list/heading lines are stripped and the remaining sentences are printed to
# STDOUT. The elapsed processing time is printed as the final line of output.
#
# No "use utf8" on purpose: input is read as raw bytes, so the typographic
# literals below (dashes, curly quotes, superscripts) are matched byte-wise.
use strict;
use warnings;
use Time::HiRes qw( gettimeofday tv_interval );

# clean_text($text) -> $cleaned
# Takes the raw (still XML-escaped) wikitext of one page and returns the
# cleaned text. The substitutions are order dependent: links are unwrapped
# before templates are dropped, entities are decoded before HTML tags are
# removed, and line-level filtering happens last.
sub clean_text {
    my ($text) = @_;

    # clean text
    $text =~ s/\{\|.+?\|\}//sg; # remove tables
    $text =~ s/^\[\[Category:[^\]\n]+?\]\]//smgi; # remove Categories
    $text =~ s/^\[\[Category:[^\]\n]+?\]\]\z//smgi; # some end with </text>
    $text =~ s/^\[\[[^\[\]\n]+?\]\]\n//smg; # remove link lines
    $text =~ s/^\[\[[a-z\-]+?:.+?\]\]\n//smg; # remove translation
    $text =~ s/^\[\[[a-z\-]+?:.+?\]\]\z//smg; # some end with </text>
    $text =~ s/\[http\S+?\]//sgi; # http links[] without anchor
    $text =~ s/\[http\S+?\s+?(.*?)\]/$1/sgi; # http links[]
    # is there a better way to manage embedded links?
    # would still over reach with .+?\]\]
    $text =~ s/\[\[Image:[^\[]+?\]\]//sgi; # remove images without embedded
    $text =~ s/\[\[[^\|\[]+?\|([^\[]+?)\]\]/$1/sg; # links[[a|a]] without embedded
    $text =~ s/\[\[([^\|\[]+?)\]\]/$1/sg; # unnamed links[[]] without embedded
    $text =~ s/\[\[Image:[^\[]+?\]\]//sgi; # remove images after embedded removed

    ## templates
    # take first number and first measure of convert
    $text =~ s/\{\{convert\|([^\|]+?)\|(.+?)(?:\}\}|\|[^\}]+?\}\})/$1 $2/sgi;
    $text =~ s/\{\{IPA\|(.+?)\}\}/$1/sgi; #IPA
    $text =~ s/\{\{nihongo\|([^\|]+?)\|[^\}]+?\}\}/$1/sgi; #nihongo
    $text =~ s/\{\{lang\|[^\|]+?\|([^\}]+?)\}\}/$1/sgi; #lang
    $text =~ s/\{\{Unicode\|(.+?)\}\}/$1/sgi; #Unicode
    $text =~ s/\{\{Audio\|[^\|]+?\|(.+?)\}\}/$1/sgi; #audio title
    $text =~ s/\{\{cquote\|(.+?)\}\}/"$1"/sgi; #cquote ignore embedded
    # get rid of rest
    # includes wikiquote, wikisource, cite, date, infobox, main
    $text =~ s/\{\{([^\{\}]+?)\}\}//sg; # only unnested
    $text =~ s/\{\{([^\{\}]+?)\}\}//sg; # another round, peels one more nesting level

    # remove markdown
    $text =~ s/'''([^']+?)'''/$1/sg; # bold'''
    $text =~ s/''([^']+?)''/$1/sg; # italic''

    ## HTML entity codes
    # creates HTML tags including <!-- comments
    # &amp; is decoded first on purpose: it turns a doubly escaped entity
    # such as &amp;nbsp; into &nbsp;, which the following lines then decode.
    $text =~ s/&amp;/&/gi;
    $text =~ s/&quot;/"/gi;
    $text =~ s/&gt;/>/gi;
    $text =~ s/&lt;/</gi;
    $text =~ s/&nbsp;/ /gi;
    $text =~ s/&ndash;/–/gi;
    $text =~ s/&mdash;/—/gi;
    # NOTE(review): the entity spellings for the bracket/parenthesis rules
    # were lost from this copy of the script; numeric references are assumed
    # here -- confirm against the upstream file.
    $text =~ s/&#91;/[/g;
    $text =~ s/&#93;/]/g;
    $text =~ s/&#40;/(/g;
    $text =~ s/&#41;/)/g;

    #html
    $text =~ s/<!--.+?-->//sg; # remove comments
    $text =~ s/<small>([^\<]*?)<\/small>/$1/gi;
    $text =~ s/<s>([^\<]*?)<\/s>/$1/gi; # unclear if delete or keep
    $text =~ s/<sup>2<\/sup>/²/gi;
    $text =~ s/<sup>3<\/sup>/³/gi;
    $text =~ s/<sup>([^\<]+?)<\/sup>/$1/gi;
    $text =~ s/<sub>([^\<]+?)<\/sub>/$1/gi;
    # just get rid of math for now
    $text =~ s/<math[^<]*?<\/math>//gi; #fail if <tag> inside
    $text =~ s/<br[^<>]*?>//gi;
    $text =~ s/<\/br[^<>]*?>//gi;
    $text =~ s/<blockquote>//gi;
    $text =~ s/<\/blockquote>//gi;
    $text =~ s/<span[^\>]*?\>([^\<]+?)<\/span>/$1/sgi;
    $text =~ s/<i>([^<]*?)<\/i>/$1/gi; #fail if <tag> inside
    $text =~ s/<gallery[^<]*?<\/gallery>//gi; #fail if <tag> inside
    $text =~ s/<gallery\/>//gi;
    $text =~ s/^<table.+?<\/table>//smgi;
    $text =~ s/^<div.+?<\/div>//smgi;
    # remove references
    $text =~ s/<ref[^\/]*?\/>//sgi; # get <ref /> singles
    $text =~ s/<ref.*?>.*?<\/ref>//sgi; # get <ref></ref> pairs
    # delete rest of html tags
    $text =~ s/<nowiki>//gi;
    $text =~ s/<\/nowiki>//gi;
    $text =~ s/<u>//gi;
    $text =~ s/<\/u>//gi;
    $text =~ s/<[^>]*?>//sgi;

    # remove bullet points avoid clipping matched {{ }}
    $text =~ s/^\*[^\n]*?\n//smg; # remove bullet points
    $text =~ s/^#[^\n]*?\n//smg; # remove numbered points
    $text =~ s/^:[^\n]*?\n//smg; # remove : points
    $text =~ s/^;[^\n]*?\n//smg; # remove ; points
    $text =~ s/^\|[^\n]*?\n//smg; # remove infobox lines that didn't get taken out before

    # uniform quotes
    $text =~ s/''/"/g;
    $text =~ s/“/"/g;
    $text =~ s/”/"/g;

    # beginning and endings
    $text =~ s/^\s+?([^\n]*?\n)/$1/smg; # chomp initial whitespace
    $text =~ s/^([^\n]*?\n)\s+?\n/$1/smg; # chomp terminal whitespace
    $text =~ s/^([^\n]*?)\s+?\n/$1\n/smg; # extra spaces before \n
    $text =~ s/^[^\n]*?[^\.\?!]\n//smg; # delete not ending with punct
    #titles (already covered by not ending in punct)
    # $text =~ s/^=+?[^=]+?=+?\s*?\n//sgm;

    #multiple \n
    $text =~ s/\n\s/\n/g;
    $text =~ s/\n+/\n/g;

    # multiple spaces
    $text =~ s/\s+\,/\,/g;
    $text =~ s/\s+\./\./g;
    $text =~ s/ +/ /g;
    $text =~ s/([\.\?,!])\s*[\.\?,!]/$1/g; # collapse doubled punctuation

    return $text;
}

# main()
# Reads the dump from the files named on the command line (or STDIN), prints
# the cleaned text of each non-redirect page followed by a newline, then
# prints the elapsed wall-clock time.
sub main {
    my $t0 = [gettimeofday];

    # Read one <page> record per iteration; localised so the separator is
    # restored when main() returns.
    local $/ = "</page>";

    while (my $page = <>) {
        # process only inside text tag; records without one (site header,
        # trailing </mediawiki>) are skipped so that stale capture data from
        # an earlier page is never printed again
        next unless $page =~ /<text(?<text>.+?)<\/text>/s;
        my $text = $+{text};

        next if $text =~ /#redirect/i;

        # remove rest of initial <text> tag
        $text =~ s/.*?>//;

        print clean_text($text), "\n";
    }

    print("processing time: ");
    printf("%.6f\n", tv_interval ( $t0 )); # doesn't display if don't printf .6
    return;
}

# Run only when executed as a script, so the file can also be loaded by a
# test without consuming STDIN.
main() unless caller;