-
Notifications
You must be signed in to change notification settings - Fork 0
/
classical_automata.ml
292 lines (223 loc) · 6.63 KB
/
classical_automata.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
(*
Compilation of regexp into an automaton
*)
module IntMap =
Map.Make(struct type t = int let compare (x:int) (y:int) = compare x y end);;
module IntSet =
Set.Make(struct type t = int let compare (x:int) (y:int) = compare x y end);;
module CharMap =
Map.Make(struct type t = char let compare (x:char) (y:char) = compare x y end);;
open Regular_expr;;
(*
\subsection{The type of automata}
Automata considered here are deterministic.
The states of these automata are always represented as natural
numbers, the initial state always being 0.
An automaton is then made of a transition table, giving for each
state $n$ a map that maps characters to states ; and a table of
accepting states.
*)
type automaton =
{
auto_trans : int CharMap.t array;
auto_accept : bool array;
}
;;
(*
\subsection{Compilation of a regexp}
[compile r] returns an automaton that recognizes the language of [r].
*)
let compile r =
(* we have a hash table to avoid several compilation of the same regexp *)
let hashtable = Hashtbl.create 37
(* [transtable] is the transition table to fill, and [acceptable] is
the table of accepting states. [transtable] maps any state into a
[CharMap.t], which itself maps characters to states. *)
and transtable = ref IntMap.empty
and acceptable = ref IntSet.empty
and next_state = ref 0
in
(* [loop r] fills the tables for the regexp [r], and return the
initial state of the resulting automaton. *)
let rec loop r =
try
Hashtbl.find hashtable r
with
Not_found ->
(* generate a new state *)
let init = !next_state
and next_chars = Regular_expr.firstchars r
in
incr next_state;
(* fill the hash table before recursion *)
Hashtbl.add hashtable r init;
(* fill the set of acceptable states *)
if nullable r then acceptable := IntSet.add init !acceptable;
(* compute the map from chars to states for the new state *)
let t =
CharSet.fold
(fun c accu ->
let s = loop (residual r c) in
CharMap.add c s accu)
next_chars
CharMap.empty
in
(* add it to the transition table *)
transtable := IntMap.add init t !transtable;
(* return the new state *)
init
in
let _ = loop r in
(* we then fill the arrays defining the automaton *)
let trans = Array.create !next_state CharMap.empty
and accept = Array.create !next_state false
in
for i=0 to pred !next_state do
trans.(i) <- IntMap.find i !transtable;
accept.(i) <- IntSet.mem i !acceptable
done;
{ auto_trans = trans ; auto_accept = accept }
;;
(*
\subsection{Execution of automata}
*)
(*
[exec_automaton auto str pos] executes the automaton [auto] on
string [str] starting at position [pos]. Returns the maximal
position [p] such that the substring of [str] from positions [pos]
(included) to [p] (excluded) is acceptable by the automaton, [-1] if
no such position exists.
*)
let exec_automaton auto s pos =
let state = ref 0
and last_accept_pos =
ref (if auto.auto_accept.(0) then pos else -1)
and i = ref pos
and l = String.length s
in
try
while !i < l do
let m = auto.auto_trans.(!state) in
state := CharMap.find (String.get s !i) m;
incr i;
if auto.auto_accept.(!state) then last_accept_pos := !i;
done;
!last_accept_pos;
with
Not_found ->
!last_accept_pos;
;;
(*
\subsection{Searching functions}
*)
(*
[search_forward a str pos] search in the string [str], starting at
position [pos] a word that is in the language of automaton
[a]. Returns a pair [(b,e)] where [b] is position of the first char
matched, and [e] is the position following the position of the last
char matched.
Raises [Not_found] of no matching word is found.
Notice: even if the automaton accepts the empty word, this function
will never return [(b,e)] with [e=b]. In other words, this function
always search for non-empty words in the language of automaton [a].
Unpredictable results may occur if [pos < 0].
*)
let rec search_forward auto s pos =
if pos >= String.length s
then raise Not_found
else
let n = exec_automaton auto s pos in
if n > pos then pos,n else search_forward auto s (succ pos)
;;
(*
[split_strings a s] extract from string [s] the subwords (of maximal
size) that are in the language of [a]
*)
let split_strings auto line =
let rec loop pos =
try
let b,e = search_forward auto line pos in
let id = String.sub line b (e-b) in
id::(loop e)
with Not_found -> []
in
loop 0
;;
let split_delim auto line =
let rec loop pos =
try
let b,e = search_forward auto line pos in
let id = String.sub line pos (b-pos) in
id::(loop e)
with Not_found ->
[String.sub line pos (String.length line - pos)]
in
loop 0
;;
(*
\subsection{Output functions}
*)
(*
[to_dot a f] exports the automaton [a] to the file [f] in DOT format.
*)
open Printf
let complement = CharSet.diff Regexp_lexer.all_chars
let intervals s =
let rec interv = function
| i, [] -> List.rev i
| [], n::l -> interv ([(n,n)], l)
| (mi,ma)::i as is, n::l ->
if Char.code n = succ (Char.code ma) then
interv ((mi,n)::i,l)
else
interv ((n,n)::is,l)
in
interv ([], CharSet.elements s)
let output_label cout s =
let char = function
| '"' -> "\\\""
| '\\' -> "#92"
| c ->
let n = Char.code c in
if n > 32 && n < 127 then String.make 1 c else sprintf "#%d" n
in
let output_interv (mi,ma) =
if mi = ma then
fprintf cout "%s " (char mi)
else if Char.code mi = pred (Char.code ma) then
fprintf cout "%s %s " (char mi) (char ma)
else
fprintf cout "%s-%s " (char mi) (char ma)
in
let is = intervals s in
let ics = intervals (complement s) in
if List.length is < List.length ics then
List.iter output_interv is
else begin
fprintf cout "[^"; List.iter output_interv ics; fprintf cout "]"
end
let output_transitions cout i m =
let rev_m =
CharMap.fold
(fun c j rm ->
let s = try IntMap.find j rm with Not_found -> CharSet.empty in
IntMap.add j (CharSet.add c s) rm)
m IntMap.empty
in
IntMap.iter
(fun j s ->
fprintf cout " %d -> %d [ label = \"" i j;
output_label cout s;
fprintf cout "\" ];\n")
rev_m
let to_dot a f =
let cout = open_out f in
fprintf cout "digraph finite_state_machine {
/* rankdir=LR; */
orientation=land;
node [shape = doublecircle];";
Array.iteri (fun i b -> if b then fprintf cout "%d " i) a.auto_accept;
fprintf cout ";\n node [shape = circle];\n";
Array.iteri (output_transitions cout) a.auto_trans;
fprintf cout "}\n";
close_out cout