-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathtfidf.pl
77 lines (73 loc) · 1.28 KB
/
tfidf.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# Require declared variables and forbid symbolic references / bareword strings.
use strict;
# Document frequency table: word => number of processed domains whose text
# contained that word.  tf() increments an entry once per distinct word each
# time it is called.
my %idfdat;
# Enable autoflush on the currently selected output handle (STDOUT unless
# select() has been called).
$|=1;
# Domain names to process, one per line of STDIN; each name selects the
# files text/<domain>.txt, text/<domain>.tf and text/<domain>.tfidf.
my @domainList;
# word => inverse-document-frequency weight.  Filled in by the main script
# after every tf() call has finished, then read by tfidf().
my %idf;
# tf($domain) - compute the term frequencies of one document.
#
# Reads text/$domain.txt, lower-cases every line and treats each run of two
# or more non-whitespace characters as a word.  A word's frequency is its
# occurrence count divided by the total number of words in the document.
# The result is written to text/$domain.tf as "<tf> <word>" lines, highest
# frequency first.
#
# Side effect: increments the file-level %idfdat once for every distinct
# word of the document (the word's document frequency).
#
# An unreadable input file is reported on STDERR and treated as an empty
# document, so an (empty) text/$domain.tf is still produced for later
# stages.  Failing to write the output file is fatal.
#
# Returns nothing.
sub tf{
    my ($domain) = @_;
    my %count;
    my $total = 0;

    # Lexical handle and lexical line variable on purpose: this sub is called
    # from "for (@domainList)", where $_ aliases the list element, so reading
    # lines into the global $_ would overwrite the caller's domain list.
    if (open my $in, '<', "text/$domain.txt") {
        while (my $line = <$in>) {
            $line = lc $line;
            while ($line =~ /(\S{2,})/g) {
                $count{$1}++;
                $total++;
            }
        }
        close $in;
    }
    else {
        warn "cannot open text data [text/$domain.txt]: $!\n";
    }

    # %count is empty whenever $total is 0, so the division is never reached
    # with a zero denominator.
    my %freq;
    for my $term (keys %count) {
        $freq{$term} = $count{$term} / $total;
        $idfdat{$term}++;
    }

    # Rank in Perl instead of shelling out to sort(1): "sort -n" mis-orders
    # values that Perl prints in exponent form (e.g. 1e-05), and the shell
    # command line used to interpolate $domain unquoted.  Ties fall back to
    # reverse word order, which is what "sort -n -r" produced.
    my @ranked = sort { $freq{$b} <=> $freq{$a} or $b cmp $a } keys %freq;

    open my $out, '>', "text/$domain.tf"
        or die "cannot write tf data [text/$domain.tf]: $!";
    for my $term (@ranked) {
        print {$out} "$freq{$term} $term\n";
    }
    close $out
        or die "cannot finish tf data [text/$domain.tf]: $!";

    return;
}
# tfidf($domain) - combine one document's term frequencies with the global
# IDF weights.
#
# Reads "<tf> <word>" lines from text/$domain.tf, multiplies each tf by the
# file-level $idf{<word>} and writes "<tfidf> <word>" lines to
# text/$domain.tfidf, highest score first.
#
# Word handling (unchanged from the original behaviour):
#   * only the leading run of \w characters of the stored word is used;
#   * leading and trailing digits are stripped when something non-digit
#     remains, and the stripped form is used both as the %idf lookup key and
#     as the output key, so several stored words may be summed into one;
#   * a word without an %idf entry contributes 0.
#   NOTE(review): looking up the stripped form in %idf may miss the entry
#   that was recorded for the unstripped word - confirm this is intended.
#
# Lines that do not parse are reported on STDOUT as "error:<line>".
# Dies if the tf file cannot be read or the tfidf file cannot be written.
#
# Returns nothing.
sub tfidf{
    my ($domain) = @_;
    my %tfidf;

    open my $in, '<', "text/$domain.tf"
        or die "cannot open tf data [text/$domain.tf]";

    # Lexical line variable: the caller iterates with "for (@domainList)", so
    # reading into the global $_ would overwrite the aliased list element.
    while (my $line = <$in>) {
        chomp $line;
        # Anchored, and accepting an exponent: Perl prints small frequencies
        # as e.g. "1e-05", which the old unanchored /([.\d]+) (\w+)/ silently
        # read as "05".
        if ($line =~ /^([.\d]+(?:[eE][-+]?\d+)?) (\w+)/) {
            my $tf   = $1;
            my $word = $2;
            if ($word =~ /^\d*(\D+)\d*$/) {
                $word = $1;
            }
            my $weight = defined $idf{$word} ? $idf{$word} : 0;
            $tfidf{$word} += $tf * $weight;
        }
        else {
            print "error:$line\n";
        }
    }
    close $in;

    # Rank in Perl instead of shelling out to sort(1): "sort -n" mis-orders
    # exponent-form numbers and the command line interpolated $domain
    # unquoted.  Ties fall back to reverse word order, as "sort -r -n" did.
    my @ranked = sort { $tfidf{$b} <=> $tfidf{$a} or $b cmp $a } keys %tfidf;

    open my $out, '>', "text/$domain.tfidf"
        or die "cannot write tfidf data [text/$domain.tfidf]: $!";
    for my $word (@ranked) {
        print {$out} "$tfidf{$word} $word\n";
    }
    close $out
        or die "cannot finish tfidf data [text/$domain.tfidf]: $!";

    return;
}
# Main script.
#
#   1. Read the domain names to process from STDIN, one per line.
#   2. Run tf() on every domain; this also accumulates the document
#      frequencies in %idfdat.
#   3. Derive the IDF weight of every word and write them to text/idf.txt as
#      "<idf> <word>" lines, highest weight first.
#   4. Run tfidf() on every domain.
#   5. Regenerate the HTML report with makehtml.pl.

while (my $line = <STDIN>) {
    chomp $line;
    next if $line eq '';    # a blank line does not name a domain
    push @domainList, $line;
}

# Named loop variable instead of $_: the subs called here read files, and a
# sub that reads into the global $_ would otherwise overwrite the aliased
# @domainList element.
for my $domain (@domainList) {
    tf($domain);
}

# idf(word) = log(N / df(word)) with N the number of documents.  The count
# is scalar(@domainList); $#domainList is the last index (N - 1), which made
# every weight too small and died with log(0) when exactly one domain was
# given.
my $doc_total = scalar @domainList;
for my $word (keys %idfdat) {
    $idf{$word} = log($doc_total / $idfdat{$word});
}

# Ranked in Perl rather than through sort(1), whose -n mode mis-orders
# numbers printed in exponent form.  Ties fall back to reverse word order,
# which is what "sort -n -r" produced.
open my $idf_out, '>', 'text/idf.txt'
    or die "cannot write idf data [text/idf.txt]: $!";
for my $word (sort { $idf{$b} <=> $idf{$a} or $b cmp $a } keys %idfdat) {
    print {$idf_out} "$idf{$word} $word\n";
}
close $idf_out
    or die "cannot finish idf data [text/idf.txt]: $!";

for my $domain (@domainList) {
    tfidf($domain);
}

# The redirections need a shell, so this stays a single command string; it
# contains no interpolated data.  A failure is now reported instead of being
# silently ignored.
system('perl makehtml.pl <list >/var/www/html/tor/tfidfs.html') == 0
    or warn "makehtml.pl failed (wait status $?)\n";