-
Notifications
You must be signed in to change notification settings - Fork 6
/
config.ini
67 lines (52 loc) · 3.13 KB
/
config.ini
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
[important config]
; Core settings: compute device, targeted font classes and supported languages.
; Device used for OCR model training and loading.
; To run on a GPU, set this to cuda:0, cuda:1, ...
device = cpu
; Targeted font classes, comma-separated (no spaces around the commas in the current value).
font_classes = antiqua,fraktur
; Languages supported by the "enhance" action, comma-separated.
; The language codes are also used as prefixes of the xx_stop_words keys (lb_stop_words, de_stop_words, fr_stop_words).
supported_languages = lb,de,fr
[ocr config]
; Whitelisted characters that can be learned by the OCR model, written as one unbroken string.
; NOTE(review): the value contains literal '%', ';' and '=' characters. This presumably relies on the
; loader applying neither value interpolation nor inline-comment stripping - confirm before changing parsers.
whitelist = !%&()*+,-./0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz|§«°»½ÄÆÈÉÊËÖÜßàáâäæçèéêëìíîïòóôöùúûüěıŒœǎǒǔḻṉ‘’‚“”„‟•‹›✝⸗
; Target block types for the "enhance" action, comma-separated.
; Format:  target_types = a->b,c->d
; Meaning: b is a target type when it lies within a, and d when it lies within c (logical structure).
target_types = PARAGRAPH->TEXT
; Minimum and maximum year of publication of the target OCR data (used in enhancement prediction).
min_year = 1840
max_year = 1960
; All possible vowel characters (used in enhancement prediction), written as one unbroken string.
vowels = aäàáâǎeéèêëěiîïíìıoöôòóǒuüûùúǔ
[language recognition config]
; Language recognition strategy for the "enhance" action:
; - the langid package is used to determine the language of a block
; - optionally, the ratio #stopWords/#blockWords is computed first; if that ratio is greater than or
;   equal to stop_words_thresh for any language, langid is skipped altogether
; Setup checklist:
; 1. define the set of supported_languages (blocks identified to be in other languages are not processed)
; 2. define the value of stop_words_thresh (set to -1 to not use the stop words ratio)
; 3. define the set xx_stop_words for each language xx
; Stop-word ratio that must be reached or exceeded for a block to be assigned to a particular language.
stop_words_thresh = 0.1
; Comma-separated sets of stop words for some or all of the supported_languages.
; Each word should appear only once per set; an empty value defines no stop words for that language.
lb_stop_words = äis,är,ärt,äert,ären,alleguer,anerefalls,ass,awer,dach,datt,däin,där,dat,dee,deel,deem,deen,deene,déi,deng,denger,dësem,di,do,domat,dozou,drop,duerch,duerno,e,ee,em,een,eent,ë,ënner,ëm,ech,eis,eise,eisen,eiser,eises,eisereen,esou,eng,enger,engem,entweder,eréischt,fir,géint,géif,gëtt,gët,geet,gi,ginn,gouf,gouff,goung,hatt,hätt,hei,hu,huet,hun,hunn,hiren,hien,hin,hir,jidderen,jiddereen,jiddwereen,jiddereng,jiddwerengen,jo,iech,iwwer,kee,keen,kënne,kënnt,kéng,kéngen,kéngem,koum,kuckt,mam,mat,ma,mä,mech,méi,mécht,meng,menger,mer,nach,nämmlech,nämmelech,näischt,nawell,nëmme,nëmmen,net,nees,nee,nu,nom,och,ons,onsen,onser,onsereen,onst,om,op,ouni,säi,säin,schonn,schonns,sid,sech,seng,senge,sengem,senger,selwecht,selwer,sinn,souguer,sou,soss,sot,tëscht,u,um,virdrun,vu,vum,vun,wat,wëllt,wéi,wéini,wéinst,wi,wollt,wou,wouhin,zanter,ze,zwar
de_stop_words =
fr_stop_words =
; Segmenter parameters p1-p8, as described in the following article: https://jdmdh.episciences.org/8642
; NOTE(review): units are not stated in this file (presumably pixels for heights and widths) - confirm against the article.
[segmenter config]
; p1: preprocessing element height
p1 = 100
; p2: first (text) dilation width
p2 = 90
; p3: background protection element height
p3 = 25
; p4: background separator width
p4 = 35
; p5: second (background) dilation width
p5 = 330
; p6: minimum text line height
p6 = 14
; p7: relative peak height threshold
p7 = 0.3
; p8: postprocessing height adjustment
p8 = 5