Use a custom regexp to tokenize words.

This respects accented characters and apostrophes in words.
bxjx · Jul 15, 2015 · 1e16c22 · 1e16c22
1 parent f0b262a
commit 1e16c22
Show file tree

Hide file tree

Showing 3 changed files with 33 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -154,6 +154,22 @@ be returned as a keyword as `20 / 22 < 1 - 0.5`.
 
 Wow. I could probably make this more intuitive. Open to suggestions.
 
+#### Option: alternativeTokenizer
+
+Gramophone uses Natural to tokenize and extract ngrams. By default, Natural uses
+its WordTokenizer, which splits text using `/\W+/` as the separator regular
+expression.
+
+This causes words like "London's" to be split into two tokens ("London", "s").
+It also splits accented words at the accented character ("Lörem" → "L", "rem").
+
+Setting `alternativeTokenizer` to `true` uses a more permissive regular
+expression that keeps apostrophes and accented characters inside a word.
+
+```js
+keyword.extract('Lörem Ipsüm Lörem Ipsüm.', {alternativeTokenizer: true})
+```
+
 --------------------------------------------------------
 <a name="stream"></a>
 ### gramophone.stream([options])

diff --git a/index.js b/index.js
@@ -50,6 +50,9 @@ exports.extract = function(text, options){
   _.each(options.ngrams, function(ngram){
     var keywordsForNgram;
     var tf = new Tf();
+    if (options.alternativeTokenizer) {
+      natural.NGrams.setTokenizer(new natural.RegexpTokenizer({pattern: /\b[^\s]+\b/g, gaps: false}));
+    }
     var tokenized = _.map(natural.NGrams.ngrams(text, ngram), function(ngram){
       if (options.stem){
         ngram = _.map(ngram, stem);

diff --git a/test/extract.js b/test/extract.js
@@ -106,3 +106,17 @@ test('with {cutoff: float} as option', function(t){
 
   t.end();
 });
+
+test('extract apostrophe', function (t){
+  var text = "Today is 15 July - St Swithin's Day. Legend has it that if it rains on St Swithin's Day then the wet weather will continue for 40 days.";
+  var options = {alternativeTokenizer: true};
+  t.deepEqual(k.extract(text, options), ['st swithin\'s day']);
+  t.end();
+});
+
+test('accented characters', function (t){
+  var text = 'Hallo Welt! Das ist ein Text über ganz viele Umlaute wie äöüÄÖÜß. Lörem Ipsüm Lörem Ipsüm.';
+  var options = {ngrams: 1, alternativeTokenizer: true};
+  t.deepEqual(k.extract(text, options), ['lörem', 'ipsüm']);
+  t.end();
+});