-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathdictionary.pl
83 lines (67 loc) · 2.04 KB
/
dictionary.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/perl -w
use strict;
## Written by Max Bileschi, Spring 2011
## mlbileschi@gmail.com
## Generates a frequency list of words as they appear in random wikipedia articles
## Output format: word freq (space delimited), one entry per line
## Requires that the dictionary file "dictionary20110411.txt" exists, even if it's empty. If it's not empty,
## every line of it must be in the form above.
## The program reads each word and frequency from the dictionary file and adds them to a hash,
## then runs a loop which wgets random wikipedia articles and (tries to) parse them into plaintext.
## The updated frequencies are written back to the same dictionary file.
# Main script: load the existing word/frequency dictionary, download random
# wikipedia articles, count the word tokens found in them, and write the
# merged counts back to the dictionary file.

# Dictionary file; it is both the input (existing counts) and the output
# (updated counts). It must already exist, even if it is empty.
my $dictionary_file = "dictionary20110411.txt";
# Scratch file that wget overwrites with each downloaded article.
my $article_file = "wikipedia_random_article.txt";
# Number of random articles to download in this run.
my $article_count = 1;

my %whash = (); #(key,value) = (word, frequency)

# Read the existing "word freq" lines into %whash.
open(my $index_fh, "<", $dictionary_file)
    or die "couldn't open file $dictionary_file to read from: $!\n";
while (my $entry = <$index_fh>)
{
    chomp $entry;
    # split ' ' ignores leading whitespace, so the word is always field 0.
    my ($word, $freq) = split(' ', $entry);
    # Skip blank or malformed lines instead of storing undef keys/values.
    next unless defined($word) && defined($freq);
    $whash{$word} = $freq;
}
close($index_fh);

# Download and process $article_count random wikipedia articles.
for my $i (1..$article_count)
{
    print $i."\n"; #print progress

    # wget a random wikipedia article (list form: no shell involved).
    my $status = system("wget", "http://en.wikipedia.org/wiki/Special:Random",
                        "-O", $article_file, "-q");
    if ($status != 0)
    {
        # Do not parse the scratch file after a failed download: it may be
        # empty or still hold the article from an earlier run.
        warn "wget failed for article $i (status $status), skipping\n";
        next;
    }

    open(my $article_fh, "<", $article_file)
        or die "couldn't open file $article_file: $!\n";
    while (my $html_line = <$article_fh>)
    {
        chomp $html_line;
        foreach my $token (split(/ /, $html_line))
        {
            # Only tokens of the exact form ">letters<" (text sitting directly
            # between two tags) are treated as words.
            next unless $token =~ /^>([A-Za-z]+)<$/;
            # Store the bare lower-cased word, without the surrounding '>' and
            # '<', so keys match the documented "word freq" format.
            my $word = lc($1);
            # ++ on a missing key starts the count at 1.
            $whash{$word}++;
        }
    }
    close($article_fh);
}

# Print the dictionary sorted alphabetically.
open(my $out_fh, ">", $dictionary_file)
    or die "couldn't open file $dictionary_file to write to: $!\n";
foreach my $key (sort(keys(%whash)))
{
    print $out_fh "$key $whash{$key} \n";
}
# Buffered write errors only surface at close, so check it.
close($out_fh) or die "couldn't finish writing $dictionary_file: $!\n";
##### IDEAS
#for when we want to be concerned about first letter's capitalization
# my $wholefile = "";
# #read file into sentences
# foreach (@filearr)
# {
# chomp;
# $wholefile.=$_;
# }
# my @sentences = split(/\./, $wholefile);