Skip to content

Commit

Permalink
moved rules from ucto/config to uctodata/config
Browse files Browse the repository at this point in the history
  • Loading branch information
Ko van der Sloot authored and Ko van der Sloot committed May 17, 2018
1 parent 789e615 commit 04415bc
Show file tree
Hide file tree
Showing 7 changed files with 140 additions and 1 deletion.
5 changes: 4 additions & 1 deletion config/Makefile.am
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
config_DATA = tokconfig-eng tokconfig-nld tokconfig-fra tokconfig-ita \
tokconfig-spa tokconfig-por tokconfig-deu tokconfig-swe \
tokconfig-nld-twitter tokconfig-nld-sonarchat tokconfig-tur \
tokconfig-nld-withplaceholder tokconfig-nld-historical tokconfig-fry tokconfig-rus \
tokconfig-nld-withplaceholder tokconfig-nld-historical tokconfig-fry \
tokconfig-rus tokconfig-generic \
ligatures.filter \
e-mail.rule smiley.rule url.rule \
standard-quotes.quote standard-eos.eos \
exotic-quotes.quote exotic-eos.eos \
nld_afk.abr spa.abr por.abr

Expand Down
1 change: 1 addition & 0 deletions config/e-mail.rule
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
E-MAIL=^([\p{L}\p{N}\._%+\-]+@[\p{L}\p{N}\.\-]+\.\p{L}{2,4})\P{L}?$
2 changes: 2 additions & 0 deletions config/smiley.rule
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
SMILEY=^(?:[oO>}\])]?)(?:[\:;8][',]?[-\^]?(?:[sSdDpPcCoO#@*$|?]|\)\)*|\{|\[|\(\(*)=?)$
REVERSE-SMILEY=^(?:\(|\)|\}*)(?:[sScCoO#@*$|?]?|\{|\[|\(\(?)=?(?:[',]?[-\^]?[\:;8])(?:[oO<}\[)]?)$
15 changes: 15 additions & 0 deletions config/standard-eos.eos
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Character: .
# Name: FULL STOP
# Code: 46 (0x2E)
\u002E

# Character: !
# Name: EXCLAMATION MARK
# Code: 33 (0x21)
\u0021

# Character: ?
# Name: QUESTION MARK
# Code: 63 (0x3F)
\u003F

8 changes: 8 additions & 0 deletions config/standard-quotes.quote
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# all quotes must be entered as pairs of open and close quotes
# separated by a space
# When more opening quotes match a single closing quote (or vice versa)
# they must be aggregated in one string!
# the ambiguous quotes " and ' are handled automatically

‘ ’
“„‟ ”
107 changes: 107 additions & 0 deletions config/tokconfig-generic
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
version=0.2
[RULE-ORDER]
URL URL-WWW URL-DOMAIN
E-MAIL WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND
ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE
NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN


[META-RULES]

[RULES]
%include url
%include e-mail
%include smiley

#Ex: (dis)information
WORD-PARPREFIX=(?:\p{Ps}\p{L}+[\p{Pc}\p{Pd}]?\p{Pe}[\p{Pc}\p{Pd}]?)\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*

#Ex: understand(s)
WORD-PARSUFFIX=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{Pd}]?\p{L}+\p{Pe})

#Keep dash/underscore connected parts (even if they are in parentheses)
WORD-COMPOUND=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)+

#Abbreviations with multiple periods
ABBREVIATION=^(\p{L}{1,3}(?:\.\p{L}{1,3})+\.?)(?:\Z|[,:;])

#retain initials
INITIAL=^(?:\p{Lt}|\p{Lu})\.$

#Homogeneous punctuation (ellipsis etc)
PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}

#Date: 1-2 digits, 1-2 digits, 2-4 digits, separated by '/' or '-' (Ex: 17-05-2018, 5/17/18)
#The hyphen is escaped in every separator class so it is always read as a literal.
DATE=\p{N}{1,2}[/\-]\p{N}{1,2}[/\-]\p{N}{2,4}
#Date starting with a four-digit group (Ex: 2018-05-17)
DATE-REVERSE=\p{N}{4}[/\-]\p{N}{1,2}[/\-]\p{N}{1,2}

#Digit groups joined by one or more slashes (Ex: 1/2, 3/4/5)
FRACNUMBER=\p{N}+(?:/\p{N}+)+

#Apostrophe-like mark plus exactly two digits, followed by a non-digit or end of input (Ex: '98)
NUMBER-YEAR=(['`’‘´]\p{N}{2})(?:\P{N}|\z)

#Times: hours:minutes with optional :seconds (one or two digits) and an optional
#case-insensitive am/pm suffix (Ex: 9:30, 12:30:45, 10:15pm, 7:05a.m.)
TIME=\p{N}{1,2}:\p{N}{1,2}(?::\p{N}{1,2})?(?i:a\.?m\.?|p\.?m\.?)?

#retain digits, including those starting with initial period (.22), and negative numbers
NUMBER=-?(?:[\.,]?\p{N}+)+

#A single currency symbol (Unicode category Sc)
CURRENCY=\p{Sc}

#One or more letters and/or non-spacing marks (Unicode categories L and Mn)
WORD=[\p{L}\p{Mn}]+

#A single punctuation character (Unicode category P)
PUNCTUATION=\p{P}

#Any single character; UNKNOWN is listed last in [RULE-ORDER]
UNKNOWN=.

[PREFIXES]

[SUFFIXES]

[ORDINALS]

[TOKENS]

[UNITS]
km
m
cm
mm
g
kg
C
l
s
sec
min
gb
mb
kb


[CURRENCY]
USD
GBP
CAD
NZD
AUD
SGD
HKD
EUR

[ABBREVIATIONS]


[FILTER]
fl fl
ff ff
ffi ffi
ffl ffl
# also filter soft hyphen
\u00AD


[EOSMARKERS]
%include standard-eos

[QUOTES]
%include standard-quotes
3 changes: 3 additions & 0 deletions config/url.rule
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
URL=(?i:https?|ftps?|nfs|sshfs|gopher|smb)://[\p{L}\p{N}]+(?:[[:punct:]=]+[\p{L}\p{N}]+)+/?
URL-WWW=www\.[\p{L}\p{N}]+(?:[[:punct:]]+[\p{L}\p{N}]+)*/?
URL-DOMAIN=^[\p{L}\p{N}]+(?:\.[\p{L}\p{N}]+)*\.(?:com|org|net|edu|mil|int|nl|be|fr|de|uk|es|it|pt|dk|se|no|fi|ch|at|hr|bg|ro|br|ru|cn|in|id|eu|ly|to|tk|za|ko|jp)

0 comments on commit 04415bc

Please sign in to comment.