-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathTextRazorEntitySummarizer.php
73 lines (66 loc) · 1.79 KB
/
TextRazorEntitySummarizer.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
<?php
/**
 * Summarize an entity CSV: rows whose first column differs only by case are
 * merged into one row, and the result is written to "summary-<name>" in the
 * same directory as the input file.
 *
 * Usage: php <script> <input.csv>
 *
 * Columns, as used by this script (0-based):
 *   0    entity form; its lower-cased value is the grouping key
 *   1    matched text; the unique values of a group are joined with ", "
 *   2-3  values expected to agree within a group; an empty cell is filled
 *        from the previous row and a disagreement prints a warning
 *   4-5  numeric values averaged over the group (accumulated below as
 *        $confidence and $relevance)
 * Two columns are appended to every output row: "Count" and "Variant Forms".
 */

if (!isset($argv[1])) {
    fwrite(STDERR, 'Usage: php ' . basename(__FILE__) . " <input.csv>\n");
    exit(1);
}
$inputPath = $argv[1];

$handle = fopen($inputPath, 'r');
if ($handle === false) {
    fwrite(STDERR, "Error: unable to open $inputPath for reading\n");
    exit(1);
}

$header = fgetcsv($handle);
if ($header === false) {
    // Nothing to summarize; bail out instead of turning `false` into an array.
    fclose($handle);
    fwrite(STDERR, "Error: no header row found in $inputPath\n");
    exit(1);
}
$header[] = 'Count';
$header[] = 'Variant Forms';

// Bucket the rows by the lower-cased first column.
$data = [];
while (($line = fgetcsv($handle)) !== false) {
    // fgetcsv() returns an array holding a single null for a blank line;
    // such a row carries no entity and must not form a group of its own.
    if ($line === [null]) {
        continue;
    }
    $data[strtolower($line[0])][] = $line;
}
fclose($handle);

$outputPath = dirname($inputPath) . '/' . 'summary-' . basename($inputPath);
$handle = fopen($outputPath, 'w');
if ($handle === false) {
    fwrite(STDERR, "Error: unable to open $outputPath for writing\n");
    exit(1);
}
fputcsv($handle, $header);

foreach ($data as $group) {
    $count = $relevance = $confidence = 0;
    $forms = $lastLine = $matchedTexts = [];

    foreach ($group as $line) {
        if ($count > 0) {
            for ($i = 2; $i < 4; $i++) {
                if (empty($line[$i]) && !empty($lastLine[$i])) {
                    // Carry the previous row's value forward into the gap.
                    $line[$i] = $lastLine[$i];
                } elseif (!empty($lastLine[$i]) && $lastLine[$i] !== $line[$i]) {
                    // Both cells are non-empty here; compare them as exact
                    // strings so numeric-looking values are not coerced.
                    echo "Warning: mismatch in position $i for " . $line[0] . "\n";
                }
            }
        }
        $count++;
        $forms[] = $line[0];
        $matchedTexts[] = $line[1];
        $confidence += $line[4];
        $relevance += $line[5];
        $lastLine = $line;
    }

    // The last row of the group (with any carried-forward cells) is the
    // template for the summary row; every group holds at least one row.
    $summary = $lastLine;
    $summary[0] = pickBestForm($forms);
    $summary[1] = implode(', ', array_unique($matchedTexts));
    $summary[4] = $confidence / $count;
    $summary[5] = $relevance / $count;
    $summary[] = $count;
    $summary[] = implode(', ', array_unique($forms));
    fputcsv($handle, $summary);
}
fclose($handle);
/**
 * Choose the preferred spelling from a list of case variants.
 *
 * Preference order: the first mixed-case form encountered, otherwise the
 * last all-lowercase form, otherwise the last all-uppercase form. A form
 * that upper-casing leaves unchanged (digits, punctuation, the empty string)
 * counts as uppercase.
 *
 * Case is tested with the mbstring functions when that extension is loaded
 * and the form is valid UTF-8, so that a form such as "École" is recognised
 * as mixed case; otherwise the byte-wise strtoupper()/strtolower() are used.
 *
 * @param string[] $forms Candidate spellings.
 *
 * @return string|false The chosen form, or false when $forms is empty.
 */
function pickBestForm($forms)
{
    $upper = $lower = false;
    $hasMbstring = function_exists('mb_check_encoding');

    foreach ($forms as $form) {
        if ($hasMbstring && mb_check_encoding($form, 'UTF-8')) {
            $upperCased = mb_strtoupper($form, 'UTF-8');
            $lowerCased = mb_strtolower($form, 'UTF-8');
        } else {
            // Not decodable as UTF-8 (or mbstring missing): byte-wise check.
            $upperCased = strtoupper($form);
            $lowerCased = strtolower($form);
        }

        if ($upperCased === $form) {
            $upper = $form;
        } elseif ($lowerCased === $form) {
            $lower = $form;
        } else {
            // mixed is most preferred
            return $form;
        }
    }

    if ($lower !== false) {
        // lower is second best
        return $lower;
    }

    // upper is last resort
    return $upper;
}