Skip to content

Commit

Permalink
Add rules for Swedish
Browse files Browse the repository at this point in the history
  • Loading branch information
andersjohansson committed Jul 27, 2020
1 parent 9d222e6 commit fd46275
Show file tree
Hide file tree
Showing 2 changed files with 223 additions and 0 deletions.
180 changes: 180 additions & 0 deletions src/rules/disallowed_words/sv.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
bl
fr
IUCN
bg
bh
c:a
cd
d:r
dr
iaf
iofs
ISBN
IVA
jbo
jb/o
k:a
kbfd
KBM
KFM
mc
mm
mnkr
Mkr
msk
mtp
n/a
N:o
nr
N:s
obs
omm
oms
pg
pua
RC
R:dr
Rdr
R:gs
RIP
s:a
SEK
Sk:pund
Sk
sms
sr
ss
ssk
S:t
S:ta
tfn
tgm
tjf
tjl
tjm
tkr
t:r
trpt
trsp
VAB
vd
vpl
C/o
OSA
ZIP code
Ftf
AA
AB
AF
ASEA
ASEAN
CSA
CSA
CSN
DACO
DAFA
DCRI
EG
EMU
FK
FRA
FN
JK
JO
JämO
KDU
KF
KO
Komintern
LO
MUF
NATO
NF
NBV
NO
OD
OK
PO
RFoD
RFSL
RFSU
ROKS
SACO
SAP
SCB
SEB
SIFO
SKF
SKL
SKR
SKTF
SR
SSAB
SSG
SSR
SSRS
SSU
TCO
TRV
Unesco
Unicef
WHO
WWF
sic
AC
AM
ATB
bps
DC
DIN
DPF
DSG
DVD
EAN
EGR
FM
HF
IF
ISO
KV
LV
LW
MF
MV
MW
MMS
NOM
PC
PCM
PWM
RAM
ROM
rpm
SQR
SQRT
SSB
SW
TDI
UHF
UKV
VHF
VLF
DM
DNF
DNS
efl
FM
GM
GP
GS
GS
IEM
IVM
JSM
KO
MMA
MTB
NM
TKO
USM
VC
43 changes: 43 additions & 0 deletions src/rules/sv.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Length limits for candidate sentences.
min_trimmed_length = 3
min_word_count = 2
max_word_count = 14
min_characters = 2

# Requirements on how a sentence may start and end.
may_end_with_colon = false
quote_start_with_letter = true
needs_punctuation_end = true
needs_letter_start = true
needs_uppercase_start = true

# This should cover most common Swedish words
allowed_symbols_regex = "[a-zåäöA-ZÅÄÖé,.?!: ]"

# Spacing mistakes: a doubled space, or a space directly before punctuation.
# The first entry must be TWO spaces. A single space occurs in every
# multi-word sentence (min_word_count = 2), so listing it here would flag
# all of them.
broken_whitespace = ["  ", " ,", " .", " ?", " !", " ;"]

# Reject Roman numerals: they are common in texts about Swedish kings and the
# like, but are hard to read out loud. The optional ":s" suffix covers the
# genitive form (Karl XII:s).
# NOTE(review): the pattern has no word-boundary anchors, so it may also hit a
# lone capital such as the "I" in "I dag" — confirm against how the consumer
# applies it.
other_patterns = ['[MDCLXVI]+(:s)?']

# We don’t allow quote or parenthesis symbols, so there is nothing to match.
# (Swedish uses the same character, "”", for both opening and closing quotes,
# so they would be impossible to match anyway.)
# matching_symbols = [ ]

# Expand some common abbreviations whose spoken form should be reasonable and
# unambiguous. Each entry is an [abbreviation, expansion] pair.
replacements = [
    ["bl.a.", "bland annat"],
    ["ca.", "cirka"],
    ["d.v.s.", "det vill säga"],
    ["fr.o.m.", "från och med"],
    ["i.o.m.", "i och med"],
    ["m.m.", "med mera"],
    ["m.fl.", "med flera"],
    ["o.d.", "och dylikt"],
    ["o.dyl.", "och dylikt"],
    ["o.s.v.", "och så vidare"],
    ["p.g.a.", "på grund av"],
    ["resp.", "respektive"],
    ["t.ex.", "till exempel"],
    ["t.o.m.", "till och med"],
]

# Filter out lots of other abbreviations
# Taken from list: https://sv.wikipedia.org/wiki/Lista_%C3%B6ver_f%C3%B6rkortningar
#
# The single regex alternation is written as a multi-line basic string. Each
# line ends with a line-ending backslash, which removes the newline and the
# indentation of the next line, so the parsed value is one unbroken pattern.
# A raw line break inside a plain "..." string is invalid TOML.
# NOTE(review): `m ö\\.h\\.` has a space where neighbouring entries use `\\.`;
# possibly `m\\.ö\\.h\\.` was intended. Left unchanged; confirm.
abbreviation_patterns = [
    """\
    art\\.|bl\\.a\\.|B\\.V\\.|civ\\.ek\\.|civ\\.ing\\.|doc\\.|d\\.v\\.s\\.|d\\.y\\.|d\\.ä\\.|\
    ekon\\.|farm\\.|f\\.d\\.|fig\\.|fil\\.|fr\\.o\\.m\\.|Ibid\\.|ib\\.|i\\.o\\.m\\.|i\\.s\\.f\\.|\
    lb\\.|lic\\.|lisp\\.|mag\\.|med\\.|m\\.h\\.p\\.p\\.|min\\.|m\\.m\\.|m\\.m\\.d\\.|mom\\.|\
    m\\.v\\.h\\.|möjl\\.|m ö\\.h\\.|n\\.b\\.|näml\\.|nästk\\.|obs\\.|o\\.d\\.|odont\\.|\
    o\\.dyl\\.|o\\.k\\.s\\.|omkr\\.|o\\.m\\.s\\.|op\\.|ordf\\.|org\\.nr|o\\.s\\.v\\.|pers\\.|\
    p\\.g\\.a\\.|pol\\.|prel\\.|prof\\.|prov\\.|rc\\.|ref\\.|resp\\.|R\\.I\\.P\\.|rst\\.|\
    s\\.a\\.s\\.|sek\\.|sekr\\.|s\\.g\\.s\\.|sid\\.|sign\\.|sistl\\.|s\\.k\\.|sk\\.|skepp\\.|\
    skålp\\.|s\\.m\\.|s\\.m\\.s\\.|sp\\.|spec\\.|s\\.st\\.|st\\.|St\\.|stud\\.|särsk\\.|\
    tab\\.|tekn\\.|tel\\.|temp\\.|teol\\.|t\\.ex\\.|tf\\.|t\\.h\\.|tim\\.|t\\.o\\.m\\.|\
    trol\\.|t\\.v\\.|ung\\.|u\\.a\\.|u\\.f\\.a\\.|u\\.p\\.a\\.|urspr\\.|usk\\.|utg\\.|\
    å\\.k\\.|äv\\.|ö\\.a\\.|övers\\.anm\\.|ö\\.h\\.|ö\\.h\\.t\\.|ök\\.|övers\\.|att\\.|Avs\\.|\
    b\\.v\\.|D\\.S\\.|n\\.b\\.|o\\.s\\.a\\.|P\\.P\\.S\\.|P\\.S\\.|tr\\.|ö\\.g\\.|A\\.D\\.|e\\.Kr\\.|\
    e\\.v\\.t\\.|g\\.s\\.|mån\\.|s\\.å\\.|civ\\.ek\\.|civ\\.ing\\.|ekon\\.dr|ekon\\.mag\\.|\
    ekon\\.kand\\.|fil\\.dr|fil\\.lic\\.|fil\\.kand|fil\\.mag|jur\\.kand\\.|jur\\.utr\\.kand\\.|\
    jur\\.lic\\.|jur\\.dr|med\\.dr|med\\.lic\\.|med\\.kand\\.|odont\\.kand\\.|odont\\.lic\\.|\
    odont\\.dr|pol\\.kand\\.|pol\\.mag\\.|pol\\.dr|tekn\\.dr|tekn\\.lic\\.|teol\\.kand\\.|\
    teol\\.lic\\.|teol\\.dr|a\\.a\\.|e\\.g\\.|e\\.o\\.|ibid\\.|id\\.|i\\.e\\.|L\\.s\\.|l\\.s\\.|\
    m\\.p\\.|N\\.N\\.|op\\.cit\\.|alban\\.|arab\\.|aram\\.|armen\\.|assyr\\.|avest\\.|\
    babyl\\.|bret\\.|bulg\\.|dan\\.|egypt\\.|eng\\.|est\\.|fa\\.|fd\\.|fenic\\.|fe\\.|\
    feng\\.|ffrans\\.|ffris\\.|fgutn\\.|fht\\.|fi\\.|fin\\.|find\\.|flfran\\.|fnor\\.|\
    fpers\\.|fpreuss\\.|fr\\.|frans\\.|frank\\.|fris\\.|fsax\\.|fslav\\.|fsv\\.|fvn\\.|\
    fär\\.|gael\\.|gall\\.|georg\\.|got\\.|gr\\.|grek\\.|hebr\\.|hett\\.|hind\\.|\
    hindost\\.|holl\\.|ieur\\.|ir\\.|irl\\.|isl\\.|it\\.|ital\\.|jap\\.|kelt\\.|\
    kines\\.|korn\\.|kymr\\.|kyrkslav\\.|lat\\.|lap\\.|lapp\\.|lett\\.|lit\\.|lt\\.|\
    mag\\.|meng\\.|mfr\\.|mholl\\.|mht\\.|mlat\\.|mlt\\.|mnl\\.|mnt\\.|moes\\.|\
    nfris\\.|nht\\.|nl\\.|no\\.|nor\\.|ns\\.|nt\\.|nyfris\\.|nygr\\.|nyheb\\.|\
    nyisl\\.|nysv\\.|osset\\.|osk\\.|per\\.|pers\\.|pol\\.|port\\.|prt\\.|pt\\.|\
    provenç\\.|prov\\.|rundan\\.|runsv\\.|ry\\.|ryss\\.|sam\\.|sard\\.|skr\\.|slav\\.|\
    semit\\.|sengr\\.|senlat\\.|shet\\.|slovak\\.|sloven\\.|slov\\.|sorb\\.|sp\\.|\
    span\\.|sv\\.|syr\\.|tam\\.|tjeck\\.|tochar\\.|ty\\.|ung\\.|urgerm\\.|urn\\.|\
    wal\\.|vall\\.|wogul\\.|zig\\.|æthiop\\.|disk\\.|s\\.d\\.|w\\.o\\.""",
]

0 comments on commit fd46275

Please sign in to comment.