[RULE-ORDER]
# Whitespace-separated list of rule names, continued over several lines.
# Presumably the tokenizer tries the rules in the order given here -- confirm
# against the parser.
# Names that have no NAME=regex definition in this file (WORD-WITHSUFFIX,
# WORD-TOKEN, ABBREVIATION-KNOWN, NUMBER-ORDINAL, URL, URL-WWW, URL-DOMAIN,
# E-MAIL, SMILEY) presumably come from the included rule files or are derived
# from the list sections further down -- TODO confirm.
WORD-WITHSUFFIX WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW URL-DOMAIN
E-MAIL WORD-PARPREFIX-PARSUFFIX WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND 
ABBREVIATION INITIALS INITIAL SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN 
# TODO: PREFIXES (currently empty) and UNITS (commented out in the C++ code)
# are not part of the rule order yet.

[RULES]
# Each rule in this section is written as NAME=regular-expression.
# The three directives below pull in rule files named url, e-mail and smiley;
# their contents are not part of this file.
%include url
%include e-mail
%include smiley

# Word with both a parenthesised prefix and a parenthesised suffix.
# Example: (oud)-studente(s)
# Shape: open-bracket, letters, optional dash/connector, close-bracket,
# optional dash/connector; then letters, optionally extended with
# dash/connector + letters; then optional dash/connector, open-bracket,
# optional dash/connector, letters, close-bracket.
# Unicode categories used: \p{Ps} opening punctuation, \p{Pe} closing
# punctuation, \p{Pd} dash punctuation, \p{Pc} connector punctuation
# (e.g. underscore), \p{L} letter.
WORD-PARPREFIX-PARSUFFIX=(?:\p{Ps}\p{L}+[\p{Pc}\p{Pd}]?\p{Pe}[\p{Pc}\p{Pd}]?)\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{Pd}]?\p{L}+\p{Pe})

# Word with a parenthesised prefix only.
# Examples: (oud)-studente, (on)zin
# Shape: open-bracket, letters, optional dash/connector, close-bracket,
# optional dash/connector; then letters, optionally extended with
# dash/connector + letters.
WORD-PARPREFIX=(?:\p{Ps}\p{L}+[\p{Pc}\p{Pd}]?\p{Pe}[\p{Pc}\p{Pd}]?)\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*

# Word with a parenthesised suffix only.
# Example: koning(in)
# Shape: letters, optionally extended with dash/connector + letters; then
# optional dash/connector, open-bracket, optional dash/connector, letters,
# close-bracket.
WORD-PARSUFFIX=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{Pd}]?\p{L}+\p{Pe})

# Keep parts that are joined by a dash or connector (underscore) together:
# letters followed by one or more groups of dash/connector + letters.
# (Original note: this should also hold when the parts are in parentheses;
# the pattern itself does not mention brackets.)
WORD-COMPOUND=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)+

# Abbreviations with multiple periods.
# The pattern is anchored at both ends, so the whole input must consist of
# groups of one to three letters separated by periods (at least two groups),
# with an optional final period.
ABBREVIATION=^(\p{L}{1,3}(?:\.\p{L}{1,3})+\.?)\Z

# Initials glued to a longer word: A.F.Zetterij -> A.F. Zetterij
# The parenthesised group holds the initials (single letters each followed by
# a period, at least two of them). It must be followed by an uppercase letter
# and at least three more letters, which stay outside the group.
# The "+" after {3,999} makes that quantifier possessive in engines that
# support it (e.g. ICU, PCRE) -- assumes the tokenizer uses such an engine.
INITIALS=(\p{L}(?:\.\p{L})+\.)\p{Lu}\p{L}{3,999}+

# Retain a single initial: exactly one uppercase (\p{Lu}) or titlecase
# (\p{Lt}) letter followed by a period, with nothing before or after it.
INITIAL=^(?:\p{Lt}|\p{Lu})\.$

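# Earlier inline SMILEY rule, kept commented out for reference; a rule file
# named smiley is included at the top of this section instead.
#SMILEY=^(?:>?[:;]['`^]?[-~]*[)}\](\\/\[{Ss\$PpDd]+)$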

# Runs of punctuation (ellipsis etc.): two or more characters taken from
# the set period, hyphen, exclamation mark, question mark.
# Note: the pattern does not require the characters to be identical, so
# mixed runs such as "?!" or ".-" match as well.
PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}

# Dates written with hyphens.
# DATE: one or two digits, hyphen, one or two digits, hyphen, two to four
# digits (e.g. 31-12-1999 or 1-2-99).
# DATE-REVERSE: four digits first, then two groups of one or two digits
# (e.g. 1999-12-31).
DATE=\p{N}{1,2}-\p{N}{1,2}-\p{N}{2,4}
DATE-REVERSE=\p{N}{4}-\p{N}{1,2}-\p{N}{1,2}

# Abbreviated year: an apostrophe followed by exactly two digits (e.g. '98).
# The group captures the apostrophe and digits; it must be followed by a
# non-digit character or by the end of the input (\z).
NUMBER-YEAR=('\p{N}{2})(?:\P{N}|\z)
# Previous variant, which required a following non-digit and therefore did
# not match at the very end of the input:
#NUMBER-YEAR=('\p{N}{2})\P{N}

# Times: hours and minutes of one or two digits each, optionally followed by
# a seconds field and by a case-insensitive am/pm marker,
# e.g. 9:05, 12:30:45, 11:15pm.
# The seconds field accepts one or two digits, so a time such as 12:30:45 is
# matched as a whole instead of being cut off after the first seconds digit.
TIME=\p{N}{1,2}:\p{N}{1,2}(?::\p{N}{1,2})?(?i:am|pm)?

# Retain numbers, including those that start with a period (.22) and
# negative numbers: an optional leading minus, then one or more groups of
# digits, each group optionally preceded by a period or comma
# (e.g. 42, .22, -3,14, 1.000.000).
NUMBER=-?(?:[\.,]?\p{N}+)+

# A single currency symbol (Unicode category Sc).
CURRENCY=\p{Sc}

# A run of letters and nonspacing marks (Unicode categories L and Mn).
WORD=[\p{L}\p{Mn}]+

# A single punctuation character (any Unicode P category).
PUNCTUATION=\p{P}

# Any single character; listed last in the rule order, so it acts as the
# catch-all for input that no other rule matched.
UNKNOWN=.

[PREFIXES]
# Currently empty (see the to-do note under the rule order).

[ATTACHEDSUFFIXES]
# Apostrophe suffixes, one per line. Presumably these are kept attached to
# the preceding word (cf. WORD-WITHSUFFIX in the rule order) -- TODO confirm.
's
't
'n

[ORDINALS]
# Ordinal suffixes, one per line (as in 1e, 2de, 1ste). Presumably combined
# with a number for NUMBER-ORDINAL in the rule order -- TODO confirm.
e
de
ste
er


[TOKENS]
# Apostrophe clitics, one per line. Presumably each is recognised as a token
# of its own (cf. WORD-TOKEN in the rule order) -- TODO confirm.
's
'k
'm
'n
't

[UNITS]
# Unit abbreviations, one per line. According to the to-do note under the
# rule order, unit handling is commented out in the C++ code, so this list
# is presumably unused at the moment -- TODO confirm.
km
m
cm
mm
g
kg
C
l
s
sec
min
gb
mb
kb


[CURRENCY]
# Textual currency designations, one per line. This list is separate from
# the CURRENCY rule under the rules section, which matches currency symbols.
EUR
hfl
fl
f


[ABBREVIATIONS]
# Known abbreviations, one per line, written without a trailing period.
# Capitalised and lower-case variants are listed as separate entries
# (e.g. Dr / dr). Presumably this list backs ABBREVIATION-KNOWN in the rule
# order -- TODO confirm.
Aardoliemij
Adm
Adriaansz
Afd
Am
Ant
Anthoniszn
Ave
BMCie
Bel
Belastinggr
Bfr
Bijv
Bk
Blvd
Br
Bros
Burg
CHR
Ch
Chr
Cie
Co
Com
Corneliszn
Corp
CvN
Cy
Dep
Dept
Di
Do
Dhr
Dr
Drs
Ed
Em
Eng
Esq
Eur
Exc
Exp
F
Fa
Fam
Fed
Fl
Fr
Fred
Gebr
Gem
Gen
Gld
H
HH
Hd
Herv
Hoogl
Hr
Hub
Hzn
Inc
Ing
Inl
Inst
Int
Ir
Isr
It
J-P
Jac
Jacq
Jan
Jhr
Jkvr
Joh
Jr
Jul
Jzn
KLu
Kcal
Kon
Krj
L
Lat
Ltd
M
Ma
Mad
Mass
Mej
Mevr
Mgr
Mij
Min
Mr
Mrs
Ms
Mus
Mw
N
NH
NL
Nd
Ndl
Ned
Nic
Nov
O
Oct
Olym
Org
Oud-Eng
P
PE
Pct
PepsiCo
Ph
Phs
Pol
Prof
Prov
RED
Red
Rijkscomm
Rom
SEPT
Sept
Sj
Sp
Sr
St
Stbl
Stct
Sted
TH
Tel
Th
Tijdschr
Tj
Uitg
Univ
VS
Ver
Vic
Vl
Vlnr
Vr
Vz
W
Werkn
Wo
Z
Za
Zl
Zn
a
aanv
acad
acc
adj
adm
adv
afb
afd
afk
afl
afz
al
alg
alt
arr
art
asp
ass
atm
aug
beh
beheerscomm
ben
benod
betr
bijv
bijz
bl
blz
br
brab
brandm
btw
bur
bv
c
ca
cal
cand
cao
cap
cat
cc
cf
chr
cm
cod
com
commer
comp
coop
cq
ct
deb
dec
derg
dgl
dgs
dhr
di
dipl
dir
distr
div
do
don
dr
drs
ds
dw
ed
eerste-luit
eerw
eig
em
enk
enz
etc
ev
evt
ex
excl
f
fa
feb
febr
fec
fig
fl
fol
fr
geb
gebr
gedipl
geh
gem
gep
gesch
get
gez
gld
gr
gymn
h
herv
hh
hoogl
hs
ib
ibid
id
ill
imp
impr
inc
incl
indiv
inf
ing
ink
inl
insp
int
intr
inw
inz
ir
it
j
jan
jg
jhr
jl
joh
jr
kHz
kand
kath
kcal
kg
kl
km
l
lb
lib
lic
ll
lt
ltd
m
ma
maj
max
med
medew
mej
mevr
mg
mgr
mil
milj
mld
mln
mm
mnd
mr
mrd
mrs
mrt
ms
mtr
muz
mv
mw
n
ned
nl
nom
nov
nr
o
oa
ob
obl
okt
olv
ong
ongeh
onz
opm
opp
or
org
oud-bevelv
oud-penn
oud-secr
oud-voorz
oud-vrijw
oud-vrz
p
pCt
pag
par
pct
pd
penn
penningm
perf
persc
pl
plm
plv
pnt
pr
praes
pres
prk
proc
prof
prot
prov
ps
pt
r
re
reg
resp
ret
rk
sc
scholengem
schr
scr
sec
sept
seq
ser
sin
sing
soc
spr
sq
sr
st
subs
subst
sup
t
tab
td
tech
temp
terugbez
tg
tgov
theel
tit
tv
tw
v
vac
var
vdt
verb
verg
versch
vert
vgl
vice-voorz
vice-vrz
vid
vlg
vlgg
vlnr
vml
vnl
vnlr
vnw
voc
voorl
voorm
voorw
voorz
vorstverl
vr
vrijw
vrijwil
vrijwill
vrz
vs
wd
weled
weledelgeb
weledelgestr
weleerw
werkg
wo
wsch
z
za
zelfst
zg
zgn
zn
zog
zw
zwemb

[FILTER]
# Character filters. The ligature filters come from a separate file named
# ligatures, whose contents are not part of this file.
%include ligatures
# Also filter the soft hyphen (U+00AD). No replacement is given on the line,
# so the character is presumably removed -- TODO confirm.
\u00AD

[EOSMARKERS]
# End-of-sentence markers, taken from a shared file named standard-eos
# (contents not part of this file).
%include standard-eos

[QUOTES]
# Quote characters, taken from two shared files named standard-quotes and
# exotic-quotes (contents not part of this file).
%include standard-quotes
%include exotic-quotes

