-
Notifications
You must be signed in to change notification settings - Fork 0
/
StringHelpers.php
339 lines (284 loc) · 11.1 KB
/
StringHelpers.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
<?php
/**
* Class StringHelpers
*
* added to encourage decoupling of classes
*
* @todo incorporate ideas from ruby NameCase class (included in comment at bottom of this file)
* @todo run capitalize_names on entire database via utility script instead of calling it over and over again
*
*/
class StringHelpers
{
    /**
     * Constants object supplied by the caller. This class reads its
     * SPACE, DASH, LEFT_PAR, RIGHT_PAR and SINGLE_Q properties.
     */
    public $C;

    /**
     * @param object $constants object exposing the separator constants read by this class
     */
    public function __construct($constants)
    {
        $this->C = $constants;
    }

    /**
     * function that takes a name and returns a properly capitalized name
     *
     * When $last is a string the result is "<First> <Last>" joined with the
     * SPACE constant; otherwise only the capitalized $first is returned.
     *
     * @param $first string first name, or the whole name in one string
     * @param $last string optional last name (the name may come all in one string)
     * @throws InvalidStringException when $first is not a string
     * @throws EmptyStringException when $first (or a string $last) has zero length
     * @return string
     */
    public function capitalize_name($first, $last = null)
    {
        if (!is_string($first))
        {
            throw new InvalidStringException("Raised from capitalize_name method");
        }
        $first = $this->capitalize_words($first);
        #is_string() is false for null, so this also covers an omitted last name
        if (!is_string($last))
        {
            return $first;
        }
        return $first.$this->C->SPACE.$this->capitalize_words($last);
    }

    /**
     * function that properly capitalizes every word in a sentence
     *
     * treats a single name as a sentence as its possible to have many space separated names
     *
     * The sentence is lower-cased, split on the SPACE constant, and every
     * word is capitalized. Words containing a dash, single quote or
     * parenthesis are additionally split on that character so each fragment
     * is capitalized (eg: Ng-A-Fook, O'Brien). The Mc/Mac rule is applied to
     * every fragment, so hyphenated names such as Smith-McKay are handled.
     *
     * Known limitation: the Mc/Mac rule is purely prefix based, so a name
     * like "mack" becomes "MacK".
     *
     * A sentence in which str_word_count() finds no words is returned unchanged.
     *
     * @todo replace everything with a set of regular expressions
     *
     * @param $sentence
     * @throws EmptyStringException when $sentence has zero length
     * @throws InvalidStringException when $sentence is not a string
     * @return string
     */
    public function capitalize_words($sentence)
    {
        if (!is_string($sentence))
        {
            throw new InvalidStringException("Raised from capitalize_words method");
        }
        if (strlen($sentence) == 0)
        {
            throw new EmptyStringException("Raised from capitalize_words method.");
        }
        if (str_word_count($sentence) == 0)
        {
            return $sentence;
        }
        $space = $this->C->SPACE;
        #TODO determine if more characters exist
        $funny_characters = array(
            $this->C->DASH,
            $this->C->SINGLE_Q,
            $this->C->LEFT_PAR,
            $this->C->RIGHT_PAR,
        );
        $words = explode($space, strtolower($sentence));
        foreach ($words as $index => $a_word)
        {
            foreach ($funny_characters as $separator)
            {
                if (strpos($a_word, $separator) === false)
                {
                    continue;
                }
                #there can be more than 1 hyphen in a name eg: Nicholas Ng-A-Fook
                #capitalize each piece, then glue the pieces back together
                $pieces = explode($separator, $a_word);
                foreach ($pieces as $piece_index => $name_piece)
                {
                    $pieces[$piece_index] = $this->capitalize_fragment($name_piece);
                }
                $a_word = implode($separator, $pieces);
            }
            #covers words without separators and the leading Mc/Mac of the whole word
            $words[$index] = $this->capitalize_fragment($a_word);
        }
        return implode($space, $words);
    }

    /**
     * helper that upper-cases the first letter of a name fragment and, when the
     * fragment starts with Mc or Mac, the first letter after that prefix as well
     *
     * Applying it twice gives the same result as applying it once, which is
     * why capitalize_words can safely call it at every splitting stage.
     *
     * @param $fragment string
     * @return string
     */
    private function capitalize_fragment($fragment)
    {
        $fragment = ucfirst($fragment);
        foreach (array('Mc', 'Mac') as $special_cap)
        {
            $prefix_length = strlen($special_cap);
            if (strncmp($fragment, $special_cap, $prefix_length) === 0)
            {
                #cast because substr() returns false for an empty remainder before PHP 8
                return $special_cap.ucfirst((string) substr($fragment, $prefix_length));
            }
        }
        return $fragment;
    }

    /**
     * function that strips undesirable characters from a string and then
     * normalizes spaces to 1 space between words
     *
     * Every byte outside the range 0x20-0x7F is removed. That drops all
     * multi-byte UTF-8 sequences and also tabs and newlines, so "a\nb"
     * becomes "ab". This is a temporary measure (see @todo).
     *
     * @todo DEAL WITH UNICODE
     * @param $tweet
     * @throws StringContainsURLException when $tweet contains the text "http" anywhere
     * @throws InvalidStringException when $tweet is not a string
     * @return string
     */
    public function strip_undesirables($tweet)
    {
        if (!is_string($tweet)) throw new InvalidStringException("Invalid string from strip_undesirables method");
        #strpos() returns 0 for a match at the very start, so compare against false explicitly
        if (strpos($tweet, 'http') !== false)
        {
            throw new StringContainsURLException("url detected");
        }
        #strips everything that is not printable ASCII to prevent system crash (MongoCrash)
        #TODO must handle unicode
        $tweet = preg_replace('/[^\x20-\x7F]+/', '', $tweet);
        return $this->normalize_spaces($tweet);
    }

    /**
     * function that strips extra whitespace, ensuring only 1 space between words
     * and none at either ends
     *
     * @param $a_string
     * @throws InvalidStringException when $a_string is not a string
     * @return string
     */
    public function normalize_spaces($a_string)
    {
        if (!is_string($a_string)) throw new InvalidStringException("Invalid string from normalize_spaces method");
        $space = $this->C->SPACE;
        $match_multiple_spaces_regex = '/\s+/';
        #collapse first, trim second: leading/trailing tabs and newlines have become
        #a single $space by then, so trimming $space removes them too
        return trim(preg_replace($match_multiple_spaces_regex, $space, $a_string), $space);
    }
}
/**
 * Exception raised when a value that must be a string is not one.
 */
class InvalidStringException extends Exception
{
    /**
     * @param string         $message  description of where the error was raised
     * @param int            $code     optional error code
     * @param Exception|null $previous optional exception that caused this one
     */
    public function __construct($message, $code = 0, Exception $previous = null)
    {
        #forward code and previous so getCode() and getPrevious() report what the caller passed
        parent::__construct($message, (int) $code, $previous);
    }

    /**
     * Builds a two-line, human-readable description of the error: the error
     * type with the line and file it was raised from, then the message.
     *
     * @return string
     */
    public function err()
    {
        $error_type = "Not a string";
        return "$error_type error on line {$this->getLine()} in file {$this->getFile()}".PHP_EOL."ERR: {$this->getMessage()}".PHP_EOL;
    }
}
/**
 * Exception raised when a string that must not be empty has zero length.
 */
class EmptyStringException extends Exception
{
    /**
     * @param string         $message  description of where the error was raised
     * @param int            $code     optional error code
     * @param Exception|null $previous optional exception that caused this one
     */
    public function __construct($message, $code = 0, Exception $previous = null)
    {
        #forward code and previous so getCode() and getPrevious() report what the caller passed
        parent::__construct($message, (int) $code, $previous);
    }

    /**
     * Builds a two-line, human-readable description of the error: the error
     * type with the line and file it was raised from, then the message.
     *
     * @return string
     */
    public function err()
    {
        $error_type = "Zero length string";
        return "$error_type error on line {$this->getLine()} in file {$this->getFile()}".PHP_EOL."ERR: {$this->getMessage()}".PHP_EOL;
    }
}
/**
 * Exception raised when a string that must not contain a URL does contain one.
 */
class StringContainsURLException extends Exception
{
    /**
     * @param string         $message  description of where the error was raised
     * @param int            $code     optional error code
     * @param Exception|null $previous optional exception that caused this one
     */
    public function __construct($message, $code = 0, Exception $previous = null)
    {
        #forward code and previous so getCode() and getPrevious() report what the caller passed
        parent::__construct($message, (int) $code, $previous);
    }

    /**
     * Builds a two-line, human-readable description of the error: the error
     * type with the line and file it was raised from, then the message.
     *
     * @return string
     */
    public function err()
    {
        #label must describe this exception, not the empty-string one
        $error_type = "String contains URL";
        return "$error_type error on line {$this->getLine()} in file {$this->getFile()}".PHP_EOL."ERR: {$this->getMessage()}".PHP_EOL;
    }
}
/*
class NameCase < String
VERSION = '1.1.0'
class << self
def nc string
new(string).nc
end
end
# Returns a new +String+ with the contents properly namecased
def nc
localstring = downcase
localstring.gsub!(/\b\w/) { |first| first.upcase }
localstring.gsub!(/\'\w\b/) { |c| c.downcase } # Lowercase 's
if localstring =~ /\bMac[A-Za-z]{2,}[^aciozj]\b/ or localstring =~ /\bMc/
localstring.gsub!(/\b(Ma?c)([A-Za-z]+)/) { |match| $1 + $2.capitalize }
# Now fix "Mac" exceptions
localstring.gsub!(/\bMacEvicius/, 'Macevicius')
localstring.gsub!(/\bMacHado/, 'Machado')
localstring.gsub!(/\bMacHar/, 'Machar')
localstring.gsub!(/\bMacHin/, 'Machin')
localstring.gsub!(/\bMacHlin/, 'Machlin')
localstring.gsub!(/\bMacIas/, 'Macias')
localstring.gsub!(/\bMacIulis/, 'Maciulis')
localstring.gsub!(/\bMacKie/, 'Mackie')
localstring.gsub!(/\bMacKle/, 'Mackle')
localstring.gsub!(/\bMacKlin/, 'Macklin')
localstring.gsub!(/\bMacQuarie/, 'Macquarie')
end
localstring.gsub!('Macmurdo','MacMurdo')
# Fixes for "son (daughter) of" etc
localstring.gsub!(/\bAl(?=\s+\w)/, 'al') # al Arabic or forename Al.
localstring.gsub!(/\bAp\b/, 'ap') # ap Welsh.
localstring.gsub!(/\bBen(?=\s+\w)/,'ben') # ben Hebrew or forename Ben.
localstring.gsub!(/\bDell([ae])\b/,'dell\1') # della and delle Italian.
localstring.gsub!(/\bD([aeiu])\b/,'d\1') # da, de, di Italian; du French.
localstring.gsub!(/\bDe([lr])\b/,'de\1') # del Italian; der Dutch/Flemish.
localstring.gsub!(/\bEl\b/,'el') # el Greek or El Spanish.
localstring.gsub!(/\bLa\b/,'la') # la French or La Spanish.
localstring.gsub!(/\bL([eo])\b/,'l\1') # lo Italian; le French.
localstring.gsub!(/\bVan(?=\s+\w)/,'van') # van German or forename Van.
localstring.gsub!(/\bVon\b/,'von') # von Dutch/Flemish
# Fix roman numeral names
localstring.gsub!(
/ \b ( (?: [Xx]{1,3} | [Xx][Ll] | [Ll][Xx]{0,3} )?
(?: [Ii]{1,3} | [Ii][VvXx] | [Vv][Ii]{0,3} )? ) \b /x
) { |match| match.upcase }
localstring
end
# Modifies _str_ in place and properly namecases the string.
def nc!
self.gsub!(self, self.nc)
end
end
def NameCase string
NameCase.new(string).nc
end
*/