-
Notifications
You must be signed in to change notification settings - Fork 1
/
normcase
executable file
·106 lines (87 loc) · 3.77 KB
/
normcase
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/local/bin/perl -w
# Author: Jason Eisner, University of Pennsylvania
# Usage: normcase [-i] [files ...]
#
# Filters parses that are in the format that "oneline" outputs. For
# each terminal symbol, ensures that all tokens of it have the same
# case. In particular, we normalize these tokens to the "least
# capitalized" version that actually appears, i.e., we prefer
# all-lowercase to initial caps to all caps.
#
# The -i flag says to modify only sentence-initial words and words that
# are in ALL CAPS. Other words are assumed to have accurate capitalization.
require("stamp.inc"); &stamp;   # modify $0 and @INC, and print timestamp

# -i selects restricted mode (only sentence-initial and ALL-CAPS words are
# normalized).  Check that @ARGV is non-empty first: when the script is run
# with no arguments (input on stdin), $ARGV[0] is undefined and comparing it
# with eq would trigger an "uninitialized value" warning under -w.
if (@ARGV && $ARGV[0] eq "-i") {
    $restricted = 1;
    shift(@ARGV);
}
die "$0: bad command line flags" if @ARGV && $ARGV[0] =~ /^-./;

# Regex source for one token: a run of characters other than space and
# parentheses.
$token = "[^ ()]+";

# Regex source for a token containing no lowercase ASCII letters.  The
# negative lookahead requires the match to extend to the end of the token,
# so a capitalized prefix of a mixed-case token does not count.
$allcapstoken = "[^ ()a-z]+(?![^ ()])";
# Make sure we clean up gracefully when we finish, die, or are killed.
# Perl runs this block when the program finishes or dies.  It closes the
# temporary-file handle, deletes the temporary file (if one was ever named),
# and reports on STDERR whether the deletion succeeded.
END {
    if (defined $tmpfile) {
        # Inside END, $? holds the status the program is about to exit with.
        # Localize it so that nothing done during cleanup can replace a
        # failure status with a success status.
        local $?;

        close(TMP);

        # unlink returns the number of files removed; no shell is needed.
        my $removed = unlink($tmpfile);
        print STDERR "$0: ", ($removed ? "removed" : "couldn't remove"),
                     " temporary file $tmpfile\n";
    }
}
# Signals for which the DEFAULT action would be to exit or dump core.
# See signal(5).  Excludes the real-time signals, RTMIN through RTMAX.
@exit_or_dumpcore_signals = qw(
    HUP INT QUIT ILL TRAP ABRT EMT FPE KILL
    BUS SEGV SYS PIPE ALRM TERM USR1 USR2 POLL
    VTALRM PROF XCPU XFSZ
);

# Ensure that each of these signals gets caught.  The handler receives the
# signal name as its first argument.  We explicitly die in the handler
# (rather than just returning) so that the END cleanup runs even when the
# signal would not otherwise have terminated the program.
for my $signame (@exit_or_dumpcore_signals) {
    $SIG{$signame} = sub {
        die "$0: aborting (signal $_[0])\n";
    };
}
# ----------
# This version just lowercases everything.
#
# while (<>) { # for all sentences
# chop;
# s/^(\S+:[0-9]+:\t)?//, $location = $&;
# $words += s/(\s$token)/\L$&\E/go unless /^\#/; # look for token preceded by whitespace rather than paren, and lowercase it
# print "$location$_\n";
# }
# ----------
# We'll do two passes through the input.
# On the first pass, we save the input verbatim in a temporary file, and
# record in %seen every token spelling that occurs, so that the second
# pass knows which capitalizations of each word are attested.
$tmpfile = "/tmp/normcase.$$";
open(TMP, ">", $tmpfile) || die "$0: can't write temporary file $tmpfile: $!\n";
while (<>) {
    print TMP;              # exact copy of the line, for the second pass
    chomp;                  # chomp, not chop: a final line with no newline must keep its last character
    s/^\S+:[0-9]+:\t//;     # strip off initial location
    next if /^\#/;          # skip comment lines -- the same test the second pass uses,
                            # so every line that gets normalized has had its words recorded
    $seen{$1} = 1 while /\s($token)/go;   # every token preceded by whitespace
}
# Buffered write errors (e.g. a full disk) only surface at close.
close(TMP) || die "$0: error writing temporary file $tmpfile: $!\n";
# Now, on the second pass, reread the saved copy of the input, normalize
# the case of its tokens via normcase, and print the result to STDOUT.
# A summary of the counters that normcase maintains goes to STDERR.
$lowers = $tolowers = $toinitcaps = $leavealones = 0;
open(TMP, "<", $tmpfile) || die "$0: can't reread temporary file $tmpfile: $!\n";
while (<TMP>) {
    # chomp, not chop: if the final line has no newline, chop would delete
    # its last real character before we print the line back out.
    chomp;

    # Peel off the optional location prefix and remember it.  The group is
    # optional, so the substitution always matches and $& is always defined
    # (empty when there is no prefix).
    s/^(\S+:[0-9]+:\t)?//, $location = $&;

    unless (/^\#/) {        # unless a comment
        if ($restricted) {
            # Not a global replacement - first word in sentence only!
            s/(\s)($token)/$1 . normcase($2)/eo;
            # All-caps words anywhere in the sentence.
            # NOTE(review): a first word that still has no lowercase letters
            # after the substitution above is passed to normcase again here,
            # so it appears to be counted twice in the statistics.
            s/(\s)($allcapstoken)/$1 . normcase($2)/geo;
        } else {
            s/(\s)($token)/$1 . normcase($2)/geo;    # all the words
        }
    }
    print "$location$_\n";
}
close(TMP);
print STDERR "$0: $lowers tokens stayed lowercase; $tolowers became lowercase; $toinitcaps stayed/became Capitalized; $leavealones others\n";
#------------------------------
# normcase(WORD)
#
# Returns the "least capitalized" spelling of WORD that is recorded in
# %seen: the all-lowercase form if present, otherwise the
# initial-capital form if present, otherwise WORD unchanged.  A WORD
# containing no ASCII capitals is returned as is without consulting %seen.
#
# Exactly one of the global counters $lowers, $tolowers, $toinitcaps,
# $leavealones is incremented per call, according to which case applied.
sub normcase {
    my ($word) = @_;

    # Simple case: no capital letters, so it is already lowercase.
    unless ($word =~ /[A-Z]/) {
        $lowers++;
        return $word;
    }

    # Prefer an all-lowercase equivalent if one occurs in the input.
    my $lowercased = lc($word);
    if ($seen{$lowercased}) {
        $tolowers++;
        return $lowercased;
    }

    # Next best: the Initial-capital form.
    my $capitalized = ucfirst($lowercased);
    if ($seen{$capitalized}) {
        $toinitcaps++;
        return $capitalized;
    }

    # Neither form occurs; keep the word exactly as written.
    $leavealones++;
    return $word;
}