Skip to content

Commit

Permalink
ENH Split sentences by configurable punctuation for summary
Browse files Browse the repository at this point in the history
Co-authored-by: Lukas Erni <[email protected]>
  • Loading branch information
GuySartorelli and lerni committed Dec 20, 2023
1 parent 6c69d32 commit bf629df
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 8 deletions.
21 changes: 17 additions & 4 deletions src/ORM/FieldType/DBText.php
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ class DBText extends DBString
'Summary' => 'Text',
];

/**
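* Punctuation characters that mark the end of a sentence for the Summary() method.
* The entries are joined into a single regex character class, so each character is matched on its own.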
*/
private static array $summary_sentence_separators = ['.', '?', '!'];

/**
* (non-PHPdoc)
* @see DBField::requireField()
Expand Down Expand Up @@ -130,10 +135,18 @@ public function Summary($maxWords = 50, $add = false)
$add = $this->defaultEllipsis();
}

// Split on sentences (don't remove period)
$sentences = array_filter(array_map(function ($str) {
return trim($str ?? '');
}, preg_split('@(?<=\.)@', $value ?? '') ?: []));
// Split on sentences (don't remove punctuation)
$summarySentenceSeparators = preg_quote(implode(static::config()->get('summary_sentence_separators')), '@');
$possibleSentences = preg_split('@(?<=[' . $summarySentenceSeparators . '])@', $value ?? '') ?: [];
$sentences = [];

foreach ($possibleSentences as $sentence) {
$sentence = trim($sentence);
if ($sentence) {
$sentences[] = $sentence;
}
}

$wordCount = count(preg_split('#\s+#u', $sentences[0] ?? '') ?: []);

// if the first sentence is too long, show only the first $maxWords words
Expand Down
43 changes: 39 additions & 4 deletions tests/php/ORM/DBTextTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -282,32 +282,56 @@ public function providerContextSummary()
/**
 * Datasets for the Summary() tests.
 *
 * Each named dataset follows the testSummary() argument order:
 * [original value, maximum number of words, ellipsis argument, expected summary].
 * Datasets are keyed by a description so a failing case is easy to identify.
 */
public function providerSummary()
{
    return [
        'simple test' => [
            'This is some text. It is a test',
            3,
            false,
            'This is some…',
        ],
        'custom ellipses' => [
            // check custom ellipsis
            'This is a test text in a longer sentence and a custom ellipsis.',
            8,
            '...', // regular dots instead of the ellipsis character
            'This is a test text in a longer...',
        ],
        'umlauts' => [
            'both schön and können have umlauts',
            5,
            false,
            'both schön and können have…',
        ],
        'invalid UTF' => [
            // check invalid UTF8 handling — input is an invalid UTF sequence, output should be empty string
            "\xf0\x28\x8c\xbc",
            50,
            false,
            '',
        ],
        'treats period as sentence boundary' => [
            'This is some text. It is a test. There are three sentences.',
            10,
            false,
            'This is some text. It is a test.',
        ],
        'treats exclamation mark as sentence boundary' => [
            'This is some text! It is a test! There are three sentences.',
            10,
            false,
            'This is some text! It is a test!',
        ],
        'treats question mark as sentence boundary' => [
            'This is some text? It is a test? There are three sentences.',
            10,
            false,
            'This is some text? It is a test?',
        ],
        'does not treat colon as sentence boundary' => [
            'This is some text: It is a test: There are three sentences.',
            10,
            false,
            'This is some text: It is a test: There are…',
        ],
    ];
}

Expand Down Expand Up @@ -401,4 +425,15 @@ public function testSummary($originalValue, $words, $add, $expectedValue)
$result = $text->obj('Summary', [$words, $add])->forTemplate();
$this->assertEquals($expectedValue, $result);
}

/**
 * Verifies that the punctuation Summary() treats as a sentence boundary
 * follows the summary_sentence_separators configuration.
 */
public function testSummaryConfiguration()
{
    $field = DBField::create_field(
        DBText::class,
        'This is some text: It is a test: There are three sentences.'
    );

    // With the default separators a colon does not end a sentence,
    // so the summary is cut off at the word limit instead
    $defaultSummary = $field->Summary(10);
    $this->assertSame('This is some text: It is a test: There are…', $defaultSummary);

    // After merging a colon into the configured separators it ends a sentence
    DBText::config()->merge('summary_sentence_separators', [':']);
    $configuredSummary = $field->Summary(10);
    $this->assertSame('This is some text: It is a test:', $configuredSummary);
}
}

0 comments on commit bf629df

Please sign in to comment.