-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
moved rules from ucto/config to uctodata/config
- Loading branch information
Ko van der Sloot
authored and
Ko van der Sloot
committed
May 17, 2018
1 parent
789e615
commit 04415bc
Showing
7 changed files
with
140 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# E-mail address (captured), optionally followed by one non-letter character. | ||
# The top-level domain may be 2-63 letters, so long TLDs such as .museum or .online are accepted too. | ||
E-MAIL=^([\p{L}\p{N}\._%+\-]+@[\p{L}\p{N}\.\-]+\.\p{L}{2,63})\P{L}?$ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Emoticon rules; both patterns are anchored to the whole token (^ ... $). | ||
# SMILEY: optional leading character (one of o O > } ] )), then one of : ; 8, | ||
# an optional ' or , and an optional - or ^, then a closing part (one listed letter/symbol, | ||
# one or more ')', a '{', a '[', or one or more '('), optionally followed by '='. | ||
# Examples that satisfy the pattern: :-) ;D 8-( | ||
SMILEY=^(?:[oO>}\])]?)(?:[\:;8][',]?[-\^]?(?:[sSdDpPcCoO#@*$|?]|\)\)*|\{|\[|\(\(*)=?)$ | ||
# REVERSE-SMILEY: the mirrored form, with the : ; 8 character near the end of the token. | ||
# Examples that satisfy the pattern: (-: ): | ||
# NOTE(review): every part except the : ; 8 character is optional here, so a bare ":" ";" or "8" | ||
# also satisfies this pattern - confirm that is intended given this rule's position in RULE-ORDER. | ||
REVERSE-SMILEY=^(?:\(|\)|\}*)(?:[sScCoO#@*$|?]?|\{|\[|\(\(?)=?(?:[',]?[-\^]?[\:;8])(?:[oO<}\[)]?)$ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# Character: . | ||
# Name: FULL STOP | ||
# Code: 46 (0x2E) | ||
\u002E | ||
|
||
# Character: ! | ||
# Name: EXCLAMATION MARK | ||
# Code: 33 (0x21) | ||
\u0021 | ||
|
||
# Character: ? | ||
# Name: QUESTION MARK | ||
# Code: 63 (0x3f) | ||
\u003F | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# all quotes must be entered as pairs of open and close quotes | ||
# separated by a space | ||
# When more opening quotes match a single closing quote (or vice versa) | ||
# they must be aggregated in one string! | ||
# the ambiguous quotes " and ' are handled automatically | ||
|
||
‘ ’ | ||
“„‟ ” |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
version=0.2 | ||
[RULE-ORDER] | ||
URL URL-WWW URL-DOMAIN | ||
E-MAIL WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND | ||
ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE | ||
NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN | ||
|
||
|
||
[META-RULES] | ||
|
||
[RULES] | ||
%include url | ||
%include e-mail | ||
%include smiley | ||
|
||
#Ex: (dis)information | ||
WORD-PARPREFIX=(?:\p{Ps}\p{L}+[\p{Pc}\p{Pd}]?\p{Pe}[\p{Pc}\p{Pd}]?)\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)* | ||
|
||
#Ex: understand(s) | ||
WORD-PARSUFFIX=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{Pd}]?\p{L}+\p{Pe}) | ||
|
||
#Keep dash/underscore connected parts (even if they are in parentheses) | ||
WORD-COMPOUND=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)+ | ||
|
||
#Abbreviations with multiple periods | ||
ABBREVIATION=^(\p{L}{1,3}(?:\.\p{L}{1,3})+\.?)(?:\Z|[,:;]) | ||
|
||
#retain initials | ||
INITIAL=^(?:\p{Lt}|\p{Lu})\.$ | ||
|
||
#Homogeneous punctuation (ellipsis etc) | ||
PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,} | ||
|
||
#Date: 1-2 digits, 1-2 digits, 2-4 digits, separated by / or - (e.g. 17-05-2018) | ||
DATE=\p{N}{1,2}[/\-]\p{N}{1,2}[/\-]\p{N}{2,4} | ||
#Date with a leading 4-digit part: 4 digits, 1-2 digits, 1-2 digits (e.g. 2018/05/17) | ||
DATE-REVERSE=\p{N}{4}[/\-]\p{N}{1,2}[/\-]\p{N}{1,2} | ||
|
||
#Two or more digit groups joined by slashes (e.g. 1/2, 3/4/5) | ||
FRACNUMBER=\p{N}+(?:/\p{N}+)+ | ||
|
||
#Abbreviated year: an apostrophe-like mark followed by two digits (e.g. '98). | ||
#Only the mark and the two digits are captured; they must be followed by a non-digit or end of input. | ||
NUMBER-YEAR=(['`’‘´]\p{N}{2})(?:\P{N}|\z) | ||
|
||
#Times: hours:minutes with optional :seconds and optional am/pm suffix (e.g. 9:30, 12:30:45, 5:15p.m.) | ||
#The seconds part takes 1-2 digits, like hours and minutes, so 12:30:45 is matched as a whole | ||
TIME=\p{N}{1,2}:\p{N}{1,2}(?::\p{N}{1,2})?(?i:a\.?m\.?|p\.?m\.?)? | ||
|
||
#retain digits, including those starting with initial period (.22), and negative numbers | ||
NUMBER=-?(?:[\.,]?\p{N}+)+ | ||
|
||
#A single currency symbol (Unicode category Sc) | ||
CURRENCY=\p{Sc} | ||
|
||
#A run of letters and non-spacing combining marks (Unicode categories L and Mn) | ||
WORD=[\p{L}\p{Mn}]+ | ||
|
||
#A single punctuation character (Unicode category P) | ||
PUNCTUATION=\p{P} | ||
|
||
#Catch-all: any single character (listed last in RULE-ORDER) | ||
UNKNOWN=. | ||
|
||
[PREFIXES] | ||
|
||
[SUFFIXES] | ||
|
||
[ORDINALS] | ||
|
||
[TOKENS] | ||
|
||
[UNITS] | ||
km | ||
m | ||
cm | ||
mm | ||
g | ||
kg | ||
C | ||
l | ||
s | ||
sec | ||
min | ||
gb | ||
mb | ||
kb | ||
|
||
|
||
[CURRENCY] | ||
USD | ||
GBP | ||
CAD | ||
NZD | ||
AUD | ||
SGD | ||
HKD | ||
EUR | ||
|
||
[ABBREVIATIONS] | ||
|
||
|
||
[FILTER] | ||
# expand typographic ligatures to plain letters; the left-hand side is the single ligature | ||
# code point (U+FB02, U+FB00, U+FB03, U+FB04), written as an escape so it survives text normalization | ||
\uFB02 fl | ||
\uFB00 ff | ||
\uFB03 ffi | ||
\uFB04 ffl | ||
# also filter soft hyphen | ||
\u00AD | ||
|
||
|
||
[EOSMARKERS] | ||
%include standard-eos | ||
|
||
[QUOTES] | ||
%include standard-quotes |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# URLs with an explicit scheme, URLs starting with "www.", and bare domain names followed by a listed TLD. | ||
# South Korea's country code is kr; the earlier ko entry is kept so existing matches do not change. | ||
URL=(?i:https?|ftps?|nfs|sshfs|gopher|smb)://[\p{L}\p{N}]+(?:[[:punct:]=]+[\p{L}\p{N}]+)+/? | ||
URL-WWW=www\.[\p{L}\p{N}]+(?:[[:punct:]]+[\p{L}\p{N}]+)*/? | ||
URL-DOMAIN=^[\p{L}\p{N}]+(?:\.[\p{L}\p{N}]+)*\.(?:com|org|net|edu|mil|int|nl|be|fr|de|uk|es|it|pt|dk|se|no|fi|ch|at|hr|bg|ro|br|ru|cn|in|id|eu|ly|to|tk|za|ko|kr|jp) |