forked from ermine/sulci
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfind_url.ml
175 lines (148 loc) · 4.65 KB
/
find_url.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
(*
* (c) 2005-2010 Anastasia Gornostaeva
*
* Searches potentional url in text and replace it into hyperlink
*
* TODO: validate query
*)
let regexp alpha = ['a'-'z' 'A'-'Z']
let regexp digit = ['0'-'9']
let regexp alphadigit = alpha | digit
let regexp safe = "$" | "-" | "_" | "." | "+"
let regexp extra = "!" | "*" | "'" | "(" | ")" | ","
let regexp national = "{" | "}" | "|" | "\\" | "^" | "~" | "[" | "]" | "`"
let regexp punctuation = "<" | ">" | "#" | "%" | "\""
let regexp unreserved = alpha | digit | safe | extra
let regexp reserved = [';' '/' '?' ':' '@' '&' '=']
let regexp hex = ['A' - 'F' 'a'-'f' '0'-'9']
let regexp escape = "%" hex hex
let regexp uchar = unreserved | escape
let regexp xchar = unreserved | reserved | escape
let regexp hostnumber = digit "." digit "." digit "." digit
let regexp toplabel = alpha | alpha (alphadigit | "-")* alphadigit
let regexp domainlabel = alphadigit | alphadigit (alphadigit | "-")* alphadigit
let regexp hostname = (domainlabel ".")+ toplabel
let regexp host = hostname | hostnumber
let regexp port = digit+
let regexp hostport = host (":" port)?
let regexp wwwhost = ['w''W']['w''W']['w''W'] "."? (domainlabel ".")* toplabel
let regexp ftphost = ['f''F']['t''T']['p''P'] "."? (domainlabel ".")* toplabel
let regexp user = (uchar | ":" | "?" | "&" | "=")*
let regexp password = (uchar | ":" | "?" | "&" | "=")*
let regexp urlpath = xchar*
let regexp login = (user (":" password)? "@")? hostport
let regexp ftptype = "A" | "I" | "D" | "a" | "i" | "d"
let regexp fsegment = ( uchar | "?" | ":" | "@" | "&" | "=" )*
let regexp fpath = fsegment ( "/" fsegment )*
let regexp search = (uchar | ";" | ":" | "@" | "&" | "=")*
let regexp hsegment = (uchar | ";" | ":" | "@" | "&" | "=")*
let regexp hpath = hsegment ("/" hsegment)*
let regexp httpurl =
"http" "s"? "://" hostport ( "/" hpath ( "?" search )?)?
| host ":" port ("/" hpath ("?" search)?)?
| hostport "/" hpath ("?" search)?
| wwwhost (":" port)? ( "/" hpath ( "?" search )?)?
let enclosed lexbuf offset_begin offset_end =
let len = Ulexing.lexeme_length lexbuf in
Ulexing.utf8_sub_lexeme lexbuf offset_begin
(len - (offset_end + offset_begin))
let rec do_find_url (callback:string -> string) acc = lexer
| httpurl ->
let url = Ulexing.utf8_lexeme lexbuf in
do_find_url callback (acc ^ callback url) lexbuf
| "(" httpurl ")" ->
let url = enclosed lexbuf 1 1 in
do_find_url callback (acc ^ "(" ^ callback url ^ ")") lexbuf
| "ftp://" login ( "/" fpath ( ";type=" ftptype )?)?
| ftphost ( "/" fpath ( ";type=" ftptype )?)? ->
let url = Ulexing.utf8_lexeme lexbuf in
do_find_url callback (acc ^ callback url) lexbuf
| eof ->
acc
| punctuation
| national
| "(" ->
do_find_url callback (acc ^ Ulexing.utf8_lexeme lexbuf) lexbuf
| _ ->
Ulexing.rollback lexbuf;
let skip = skip lexbuf in
do_find_url callback (acc ^ skip) lexbuf
and skip = lexer
| [^ ' ' '\r' '\n' '\t' '.' ',' '|' ';']+ ->
Ulexing.utf8_lexeme lexbuf
| eof ->
Ulexing.rollback lexbuf;
""
| _ ->
Ulexing.utf8_lexeme lexbuf
(*
let valid_tld =
[ "aero";
"biz";
"cat";
"com";
"coop";
"edu";
"eu";
"gov";
"info";
"int";
"jobs";
"mil";
"mobi";
"museum";
"name";
"net";
"org";
"pro";
"travel";
"asia";
"post";
"tel";
"xxx"
]
*)
let find_url callback text =
let lexbuf = Ulexing.from_utf8_string text in
do_find_url callback "" lexbuf
let compare str1 str2 =
if String.length str1 > String.length str2 &&
String.sub str1 0 (String.length str2) = str2 then
true
else
false
let make_hyperlink url =
if compare url "http://" ||
compare url "https://" ||
compare url "ftp://" then
"<a href='" ^ url ^ "'>" ^ url ^ "</a>"
else
if compare url "ftp" then
"<a href='ftp://" ^ url ^ "'>" ^ url ^ "</a>"
else
"<a href='http://" ^ url ^ "'>" ^ url ^ "</a>"
(*
let _ =
let rec scan list =
match list with
| [] -> ()
| text :: s ->
Printf.printf "Orig: %s\n" text;
Printf.printf "Result: %s\n\n"
(find_url make_hyperlink text);
scan s
in
scan ["www.ytro.ru";
"(www.jabber.ru/index.html)";
"[www.jabber.ru/index.html]";
"{www.jabber.ru/index.htm}";
"http://www.ytro.ru";
"abc http://http://www.ytro.ru. - 20 k";
" def: www.jabber.ru, ftp.jabber.ru i dr.";
"http:// abc";
"http://http://";
"http://";
"http://internet.rumus www.jabber.ru - 20k";
"http://abc.MuSeUm/def.php";
"(ftp.ir.ru)"]
*)