forked from fsprojects/fantomas
-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathregexdna3.fs
81 lines (72 loc) · 2.29 KB
/
regexdna3.fs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
/// The Computer Language Benchmarks Game
/// http://shootout.alioth.debian.org/
///
/// Modified version of the program by Valentin Kraevskiy
/// Contributed by Vassil Keremidchiev
module Regexdna
open System.Text.RegularExpressions
open System.Threading
/// Build a Regex for pattern `s` with the RegexOptions.Compiled option set.
let regex (s : string) = new Regex(s, RegexOptions.Compiled)
/// Everything available on standard input, read in one go.
let input = stdin.ReadToEnd()

/// `input` with every span from a '>' up to and including the next
/// line feed removed.
let withoutComments =
    let commentLine = regex ">.*\n"
    commentLine.Replace(input, "")

/// `withoutComments` with all remaining line feeds removed.
let text =
    let lineFeed = regex "\n"
    lineFeed.Replace(withoutComments, "")

/// Number of characters in `text`.
let textSize = String.length text

/// Half of `textSize` (integer division).
let blockSize = textSize / 2
/// Split a string into consecutive blocks of at most `blockSize` characters,
/// where each block after the first starts `overlapSize` characters before
/// the end of the previous one.
///
/// Returns a function from the string to the list of blocks, in order.
/// An empty string yields an empty list. A string no longer than
/// `blockSize` yields a single block.
///
/// If `blockSize - overlapSize` is not positive the split cannot advance
/// through the string, so the whole remaining text is returned as one block
/// instead of failing or recursing forever.
let onblocks overlapSize blockSize =
    // Distance between the start positions of two successive blocks.
    let step = blockSize - overlapSize
    // Walk the string by index and accumulate blocks in reverse, so each
    // step is O(block) rather than re-copying the tail and the result list.
    let rec split acc (s : string) start =
        let remaining = s.Length - start
        if remaining = 0 then
            List.rev acc
        elif step <= 0 || remaining <= blockSize then
            List.rev (s.Substring(start) :: acc)
        else
            split (s.Substring(start, blockSize) :: acc) s (start + step)
    fun (s : string) -> split [] s 0
/// Splitter producing non-overlapping blocks of
/// `textSize / ProcessorCount + 1` characters each.
let onProcBlocks =
    let perProcessorSize = textSize / System.Environment.ProcessorCount + 1
    onblocks 0 perProcessorSize
/// Regular-expression patterns whose matches are counted in the text,
/// in the order their counts are printed.
let DNAcodes =
    [ "agggtaaa|tttaccct"
      "[cgt]gggtaaa|tttaccc[acg]"
      "a[act]ggtaaa|tttacc[agt]t"
      "ag[act]gtaaa|tttac[agt]ct"
      "agg[act]taaa|ttta[agt]cct"
      "aggg[acg]aaa|ttt[cgt]ccct"
      "agggt[cgt]aa|tt[acg]accct"
      "agggta[cgt]a|t[acg]taccct"
      "agggtaa[cgt]|[acg]ttaccct" ]
/// Count the matches of every pattern in `DNAcodes`, one asynchronous job
/// per pattern, all run in parallel.
///
/// The result is an array of (pattern, match count) pairs.
///
/// Each pattern is matched against the whole of `text` rather than against
/// overlapping sub-blocks: a match that lies entirely inside the overlap
/// between two blocks would be found in both of them and counted twice.
let chunksCounts =
    let countMatches (matchStr : string) =
        async { return matchStr, (regex matchStr).Matches(text).Count }
    DNAcodes
    |> List.map countMatches
    |> Async.Parallel
    |> Async.RunSynchronously
/// For each DNA code, add up every count recorded for it in `chunksCounts`
/// and print one "pattern count" line, in `DNAcodes` order.
for key in DNAcodes do
    let mutable total = 0
    for (code, cnt) in chunksCounts do
        if code = key then total <- total + cnt
    printfn "%s %i" key total
/// Length of `text` after replacing, in the listed order, each single-letter
/// code with its parenthesised alternation.
let lengthAfterReplace text =
    // (code, replacement) pairs, applied one after another.
    let substitutions =
        [ "B", "(c|g|t)"
          "D", "(a|g|t)"
          "H", "(a|c|t)"
          "K", "(g|t)"
          "M", "(a|c)"
          "N", "(a|c|g|t)"
          "R", "(a|g)"
          "S", "(c|g)"
          "V", "(a|c|g)"
          "W", "(a|t)"
          "Y", "(c|t)" ]
    let mutable expanded : string = text
    for (code, alt) in substitutions do
        expanded <- (regex code).Replace(expanded, alt)
    String.length expanded
/// Sum of `lengthAfterReplace` over the blocks produced by `onProcBlocks`,
/// with one asynchronous job per block run in parallel.
let replacedSize =
    let jobs =
        [ for chunk in onProcBlocks text ->
            async { return lengthAfterReplace chunk } ]
    jobs
    |> Async.Parallel
    |> Async.RunSynchronously
    |> Array.sum
// Blank line, then the raw input length, the cleaned text length and the
// length after code replacement, one per line.
printf "\n%i\n%i\n%i\n" (String.length input) textSize replacedSize