Skip to content

Commit

Permalink
Update with German stemmer -syst-em change
Browse files Browse the repository at this point in the history
  • Loading branch information
ojwb committed Nov 9, 2023
1 parent e8c202f commit 9752ceb
Show file tree
Hide file tree
Showing 6 changed files with 236 additions and 89 deletions.
9 changes: 5 additions & 4 deletions algorithms/german/stemmer.tt
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,14 @@ Step 1:
<DL><DD>
Search for the longest among the following suffixes,
<DL><DD>
(<I>a</I>) <B><I>em &nbsp; ern &nbsp; er</I></B><BR>
(<I>b</I>) <B><I>e &nbsp; en &nbsp; es</I></B><BR>
(<I>c</I>) <B><I>s</I></B> (preceded by a valid <B><I>s</I></B>-ending)
(<I>a</I>) <B><I>em</I></B> (not preceded by <B><I>syst</I></B> [condition added in Snowball 2.3.0])<BR>
(<I>b</I>) <B><I>ern &nbsp; er</I></B><BR>
(<I>c</I>) <B><I>e &nbsp; en &nbsp; es</I></B><BR>
(<I>d</I>) <B><I>s</I></B> (preceded by a valid <B><I>s</I></B>-ending)
</DL>
<p>
and delete if in <I>R</I>1. (Of course the letter of the valid <B><I>s</I></B>-ending is
not necessarily in <I>R</I>1.) If an ending of group (<I>b</I>) is deleted, and the ending
not necessarily in <I>R</I>1.) If an ending of group (<I>c</I>) is deleted, and the ending
is preceded by <B><I>niss</I></B>, delete the final <B><I>s</I></B>.
</p>

Expand Down
6 changes: 5 additions & 1 deletion code/german.sbl
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,11 @@ backwardmode (
define standard_suffix as (
do (
[substring] R1 among(
'em' 'ern' 'er'
'em'
( not 'syst'
delete
)
'ern' 'er'
( delete
)
'e' 'en' 'es'
Expand Down
104 changes: 103 additions & 1 deletion js/base-stemmer.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,18 @@
// @ts-check

/**@constructor*/
const BaseStemmer = function() {
/** @protected */
this.current = '';
this.cursor = 0;
this.limit = 0;
this.limit_backward = 0;
this.bra = 0;
this.ket = 0;

/**
* @param {string} value
*/
this.setCurrent = function(value) {
this.current = value;
this.cursor = 0;
Expand All @@ -9,11 +22,18 @@ const BaseStemmer = function() {
this.ket = this.limit;
};

/**
* @return {string}
*/
this.getCurrent = function() {
    // Hand back the working buffer exactly as it stands right now.
    const text = this.current;
    return text;
};

/**
* @param {BaseStemmer} other
*/
this.copy_from = function(other) {
/** @protected */
this.current = other.current;
this.cursor = other.cursor;
this.limit = other.limit;
Expand All @@ -22,7 +42,14 @@ const BaseStemmer = function() {
this.ket = other.ket;
};

/**
* @param {number[]} s
* @param {number} min
* @param {number} max
* @return {boolean}
*/
this.in_grouping = function(s, min, max) {
/** @protected */
if (this.cursor >= this.limit) return false;
var ch = this.current.charCodeAt(this.cursor);
if (ch > max || ch < min) return false;
Expand All @@ -32,7 +59,14 @@ const BaseStemmer = function() {
return true;
};

/**
* @param {number[]} s
* @param {number} min
* @param {number} max
* @return {boolean}
*/
this.in_grouping_b = function(s, min, max) {
/** @protected */
if (this.cursor <= this.limit_backward) return false;
var ch = this.current.charCodeAt(this.cursor - 1);
if (ch > max || ch < min) return false;
Expand All @@ -42,7 +76,14 @@ const BaseStemmer = function() {
return true;
};

/**
* @param {number[]} s
* @param {number} min
* @param {number} max
* @return {boolean}
*/
this.out_grouping = function(s, min, max) {
/** @protected */
if (this.cursor >= this.limit) return false;
var ch = this.current.charCodeAt(this.cursor);
if (ch > max || ch < min) {
Expand All @@ -57,7 +98,14 @@ const BaseStemmer = function() {
return false;
};

/**
* @param {number[]} s
* @param {number} min
* @param {number} max
* @return {boolean}
*/
this.out_grouping_b = function(s, min, max) {
/** @protected */
if (this.cursor <= this.limit_backward) return false;
var ch = this.current.charCodeAt(this.cursor - 1);
if (ch > max || ch < min) {
Expand All @@ -72,8 +120,13 @@ const BaseStemmer = function() {
return false;
};

/**
* @param {string} s
* @return {boolean}
*/
this.eq_s = function(s)
{
/** @protected */
if (this.limit - this.cursor < s.length) return false;
if (this.current.slice(this.cursor, this.cursor + s.length) != s)
{
Expand All @@ -83,8 +136,13 @@ const BaseStemmer = function() {
return true;
};

/**
* @param {string} s
* @return {boolean}
*/
this.eq_s_b = function(s)
{
/** @protected */
if (this.cursor - this.limit_backward < s.length) return false;
if (this.current.slice(this.cursor - s.length, this.cursor) != s)
{
Expand All @@ -94,8 +152,13 @@ const BaseStemmer = function() {
return true;
};

/** @return {number} */ this.find_among = function(v)
/**
* @param {Among[]} v
* @return {number}
*/
this.find_among = function(v)
{
/** @protected */
var i = 0;
var j = v.length;

Expand Down Expand Up @@ -165,8 +228,13 @@ const BaseStemmer = function() {
};

// find_among_b is for backwards processing. The same comments as for find_among apply.
/**
* @param {Among[]} v
* @return {number}
*/
this.find_among_b = function(v)
{
/** @protected */
var i = 0;
var j = v.length

Expand Down Expand Up @@ -232,8 +300,15 @@ const BaseStemmer = function() {
/* to replace chars between c_bra and c_ket in this.current by the
* chars in s.
*/
/**
* @param {number} c_bra
* @param {number} c_ket
* @param {string} s
* @return {number}
*/
this.replace_s = function(c_bra, c_ket, s)
{
/** @protected */
var adjustment = s.length - (c_ket - c_bra);
this.current = this.current.slice(0, c_bra) + s + this.current.slice(c_ket);
this.limit += adjustment;
Expand All @@ -242,8 +317,12 @@ const BaseStemmer = function() {
return adjustment;
};

/**
* @return {boolean}
*/
this.slice_check = function()
{
/** @protected */
if (this.bra < 0 ||
this.bra > this.ket ||
this.ket > this.limit ||
Expand All @@ -254,8 +333,13 @@ const BaseStemmer = function() {
return true;
};

/**
* @param {number} c_bra
* @return {boolean}
*/
this.slice_from = function(s)
{
/** @protected */
var result = false;
if (this.slice_check())
{
Expand All @@ -265,20 +349,34 @@ const BaseStemmer = function() {
return result;
};

/**
* @return {boolean}
*/
this.slice_del = function()
{
    /** @protected */
    // Deleting is expressed as replacing with the empty string; the outcome
    // reported by slice_from is passed straight back to the caller.
    const deleted = this.slice_from("");
    return deleted;
};

/**
* @param {number} c_bra
* @param {number} c_ket
* @param {string} s
*/
this.insert = function(c_bra, c_ket, s)
{
    /** @protected */
    // Splice s over [c_bra, c_ket); replace_s returns the change in length.
    const delta = this.replace_s(c_bra, c_ket, s);

    // Slice markers sitting at or after the start of the edit shift by the
    // same amount as the text did.
    if (this.bra >= c_bra) {
        this.bra += delta;
    }
    if (this.ket >= c_bra) {
        this.ket += delta;
    }
};

/**
* @return {string}
*/
this.slice_to = function()
{
/** @protected */
var result = '';
if (this.slice_check())
{
Expand All @@ -287,8 +385,12 @@ const BaseStemmer = function() {
return result;
};

/**
* @return {string}
*/
this.assign_to = function()
{
    /** @protected */
    // Take the buffer contents from the very start up to the current limit.
    const upToLimit = this.current.slice(0, this.limit);
    return upToLimit;
};
};
Expand Down
57 changes: 37 additions & 20 deletions js/english-stemmer.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 9752ceb

Please sign in to comment.