-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcommon_wikipedia_words.pl
67 lines (47 loc) · 1.3 KB
/
common_wikipedia_words.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/local/bin/perl
use strict;
require HTML::Parser;
package MyParser;
@MyParser::ISA = qw(HTML::Parser); # subclass HTML::Parser so parsed text is delivered to MyParser::text
use warnings;

# Downloads $num_articles random Wikipedia articles and tallies, for every
# purely alphabetic word (case-insensitive, one trailing , . ? or ! allowed):
#   %globalhash  - number of articles in which the word appears at least once
#   %globalfreqs - total number of occurrences across all articles
# The tallies are written to common_wikipedia_words.txt as
# "word<TAB>articles<TAB>occurrences", most widespread words first.
my %globalhash = ();
my %globalfreqs = ();
my $num_articles = 1000;
my $article_url = "http://en.wikipedia.org/wiki/Special:Random";
my $article_file = "wikipedia_random_article.html";
my $output_file = "common_wikipedia_words.txt";

for my $i (1..$num_articles)
{
    print "$i of $num_articles\n";

    # List-form system() runs wget directly, without going through a shell.
    my $status = system("wget", $article_url, "-O", $article_file, "-q");
    if ($status == -1)
    {
        # wget could not be started at all; retrying will not help.
        die "couldn't run wget: $!";
    }
    if ($status != 0)
    {
        # A failed download must not be counted as an (empty or partial) article.
        warn "wget failed for article $i (exit status " . ($status >> 8) . "); skipping\n";
        next;
    }

    # parse_file opens the file itself, so no separate filehandle is needed.
    # It returns undef (with $! set) when the file cannot be opened.
    my $parser = MyParser->new; #new instance of this class
    if (!defined $parser->parse_file($article_file))
    {
        die "couldn't open file $article_file: $!";
    }

    # TEXT is only created once the parser has reported some text.
    my $text = $parser->{TEXT};
    $text = '' unless defined $text;

    # Words seen in this article; used to bump the per-article count only once.
    my %localhash = ();

    # split ' ' breaks on any run of whitespace (spaces, tabs, newlines) and
    # drops leading empty fields, so tokens never carry stray newlines.
    foreach my $token (split(' ', $text))
    {
        $token = lc($token);

        # Letters only, optionally followed by one punctuation mark; \z
        # (unlike $) refuses a trailing newline. The capture is the bare word.
        next unless $token =~ /^([a-z]+)[,.?!]?\z/;
        my $word = $1;

        $localhash{$word} = 1;
        $globalfreqs{$word}++;
    }

    foreach my $word (keys(%localhash))
    {
        $globalhash{$word}++;
    }
}

open(my $out, ">", $output_file) or die "couldn't open file $output_file: $!";

# Most widespread words first; ties are broken alphabetically so that the
# report is reproducible for the same tallies.
foreach my $word (sort { $globalhash{$b} <=> $globalhash{$a} or $a cmp $b } keys(%globalhash))
{
    print $out "$word\t$globalhash{$word}\t$globalfreqs{$word}\n";
}

# Buffered write errors only surface at close, so check it.
close($out) or die "couldn't close file $output_file: $!";
# Text callback for the parser subclass: appends each chunk of text it is
# handed to the parser object's TEXT slot, so the accumulated text can be
# read from $parser->{TEXT} afterwards.
#   $self  - the parser object (a hash reference)
#   $chunk - the piece of text to append
# Returns the accumulated text after the append.
sub text
{
    my $self = shift;
    my $chunk = shift;

    # .= creates TEXT on first use, so no explicit initialisation is needed.
    return $self->{TEXT} .= $chunk;
}