Skip to content

Commit

Permalink
WordSegmentation improved
Browse files Browse the repository at this point in the history
WordSegmentation now with Circular Array: reduces memory consumption from linear O(n) to constant O(1).
SymSpell changed from .NET Standard 1.3 to 2.0
  • Loading branch information
wolfgarbe committed Apr 28, 2018
1 parent 28ceecd commit dff1025
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 68 deletions.
10 changes: 4 additions & 6 deletions SymSpell.CommandLine/SymSpell.CommandLine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -116,13 +116,11 @@ static void Main(string[] args)
break;

case "wordsegment":
var suggestions3 = symSpell.WordSegmentation(inputTerm);
var suggestion3 = symSpell.WordSegmentation(inputTerm);
//display suggestions, edit distance and term frequency
foreach (var suggestion in suggestions3)
{
if (outputStats) Console.WriteLine(suggestion.correctedString + " " + suggestion.distanceSum.ToString("N0") + " " + suggestion.probabilityLogSum.ToString());
else Console.WriteLine(suggestion.correctedString);
}
if (outputStats) Console.WriteLine(suggestion3.correctedString + " " + suggestion3.distanceSum.ToString("N0") + " " + suggestion3.probabilityLogSum.ToString());
else Console.WriteLine(suggestion3.correctedString);

break;

default:
Expand Down
9 changes: 2 additions & 7 deletions SymSpell.SegmentationDemo/SymSpell.SegmentationDemo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -63,16 +63,11 @@ static void Main(string[] args)

private static void Correct(string input, SymSpell symSpell)
{
List<SymSpell.Composition> suggestions = null;

//segment the input into words and spelling-correct them; the result carries the segmented string, the corrected string, the edit distance sum and the log probability sum
suggestions = symSpell.WordSegmentation(input);
var suggestion = symSpell.WordSegmentation(input);

//display term and frequency
foreach (var suggestion in suggestions)
{
Console.WriteLine(suggestion.correctedString + " " + suggestion.distanceSum.ToString("N0") + " " + suggestion.probabilityLogSum.ToString());
}
Console.WriteLine(suggestion.correctedString + " " + suggestion.distanceSum.ToString("N0") + " " + suggestion.probabilityLogSum.ToString());
}
}
}
91 changes: 39 additions & 52 deletions SymSpell/SymSpell.cs
Original file line number Diff line number Diff line change
Expand Up @@ -917,24 +917,13 @@ public List<SuggestItem> LookupCompound(string input, int editDistanceMax)
//N equals the sum of all counts c in the dictionary only if the dictionary is complete, but not if the dictionary is truncated or filtered
public static long N = 1024908267229L;

/// <summary>Composition returned from WordSegmentation.</summary>
public class Composition
{
/// <summary>The suggested correctly spelled word.</summary>
public string segmentedString = "";
/// <summary>The suggested correctly spelled word.</summary>
public string correctedString = "";
/// <summary>Edit distance sum between searched for word and suggestion.</summary>
public int distanceSum = int.MaxValue;
/// <summary>Frequency sum of suggestion in the dictionary (a measure of how common the word is).</summary>
public decimal probabilityLogSum = 0;
}


/// <summary>Find suggested spellings for a multi-word input string (supports word splitting/merging).</summary>
/// <param name="input">The string being spell checked.</param>
/// <returns>A List of Composition object representing the suggested word segmented and spelling corrected text.</returns>
public List<Composition> WordSegmentation(string input)
/// <returns>The word segmented string,
/// the word segmented and spelling corrected string,
/// the Edit distance sum between input string and corrected string,
/// the Sum of word occurrence probabilities in log scale (a measure of how common and probable the corrected segmentation is).</returns>
public (string segmentedString, string correctedString, int distanceSum, decimal probabilityLogSum) WordSegmentation(string input)
{
return WordSegmentation(input, this.MaxDictionaryEditDistance, this.maxDictionaryWordLength);
}
Expand All @@ -943,8 +932,11 @@ public List<Composition> WordSegmentation(string input)
/// <param name="input">The string being spell checked.</param>
/// <param name="maxEditDistance">The maximum edit distance between input and corrected words
/// (0=no correction/segmentation only).</param>
/// <returns>A List of Composition object representing the suggested word segmented and spelling corrected text.</returns>
public List<Composition> WordSegmentation(string input, int maxEditDistance)
/// <returns>The word segmented string,
/// the word segmented and spelling corrected string,
/// the Edit distance sum between input string and corrected string,
/// the Sum of word occurrence probabilities in log scale (a measure of how common and probable the corrected segmentation is).</returns>
public (string segmentedString, string correctedString, int distanceSum, decimal probabilityLogSum) WordSegmentation(string input, int maxEditDistance)
{
return WordSegmentation(input, maxEditDistance, this.maxDictionaryWordLength);
}
Expand All @@ -954,25 +946,23 @@ public List<Composition> WordSegmentation(string input, int maxEditDistance)
/// <param name="maxSegmentationWordLength">The maximum word length that should be considered.</param>
/// <param name="maxEditDistance">The maximum edit distance between input and corrected words
/// (0=no correction/segmentation only).</param>
/// <returns>A List of Composition object representing the suggested word segmented and spelling corrected text.</returns>
public List<Composition> WordSegmentation(string input, int maxEditDistance, int maxSegmentationWordLength)
/// <returns>The word segmented string,
/// the word segmented and spelling corrected string,
/// the Edit distance sum between input string and corrected string,
/// the Sum of word occurrence probabilities in log scale (a measure of how common and probable the corrected segmentation is).</returns>
public (string segmentedString, string correctedString, int distanceSum, decimal probabilityLogSum) WordSegmentation(string input, int maxEditDistance, int maxSegmentationWordLength)
{
Composition[] compositions = new Composition[input.Length];
for (int i = 0; i < input.Length; i++) compositions[i] = new Composition();
int arraySize = Math.Min(maxSegmentationWordLength, input.Length);
(string segmentedString, string correctedString, int distanceSum, decimal probabilityLogSum)[] compositions = new(string segmentedString, string correctedString, int distanceSum, decimal probabilityLogSum)[arraySize];
int circularIndex = -1;

//outer loop: left/right
//outer loop (column): all possible part start positions
for (int j = 0; j < input.Length; j++)
{
int callingIndex = 0; if (j > 0) callingIndex = j - 1;
int remainderLength = input.Length - j;

//inner loop : top/down (loop becomes shorter as remainder becomes shorter)
//generate/test all possible part lengths: part can't be bigger than longest word in dictionary (other than long unknown word)
for (int i = 1; i <= Math.Min(remainderLength, maxSegmentationWordLength); i++)
//inner loop (row): all possible part lengths (from start position): part can't be bigger than longest word in dictionary (other than long unknown word)
int imax = Math.Min(input.Length - j, maxSegmentationWordLength);
for (int i = 1; i <= imax; i++)
{
//destinationIndex = calling length + part1.Length (=i)
int destinationIndex = j + i - 1;

//get top spelling correction/ed for part
string part = input.Substring(j, i);
int separatorLength = 0;
Expand All @@ -995,7 +985,7 @@ public List<Composition> WordSegmentation(string input, int maxEditDistance, int
topEd += part.Length;
//remove space
part = part.Replace(" ", ""); //=System.Text.RegularExpressions.Regex.Replace(part1, @"\s+", "");
//add number of removed spaces to ed
//add number of removed spaces to ed
topEd -= part.Length;

List<SymSpell.SuggestItem> results = this.Lookup(part, SymSpell.Verbosity.Top, maxEditDistance);
Expand All @@ -1021,33 +1011,30 @@ public List<Composition> WordSegmentation(string input, int maxEditDistance, int
topProbabilityLog = (decimal)Math.Log10(10.0 / (N * Math.Pow(10.0, part.Length)));
}

int destinationIndex = ((i + circularIndex) % arraySize);

//set values in first loop
if (j == 0)
{
compositions[destinationIndex].segmentedString = part;
compositions[destinationIndex].correctedString = topResult;
compositions[destinationIndex].distanceSum = topEd;
compositions[destinationIndex].probabilityLogSum = topProbabilityLog;
}
//replace values if better probabilityLogSum, if same edit distance OR one space difference
else if ((i == maxSegmentationWordLength) || (((compositions[callingIndex].distanceSum + topEd == compositions[destinationIndex].distanceSum) || (compositions[callingIndex].distanceSum + separatorLength + topEd == compositions[destinationIndex].distanceSum)) && (compositions[destinationIndex].probabilityLogSum < compositions[callingIndex].probabilityLogSum + topProbabilityLog)))
{
compositions[destinationIndex].segmentedString = compositions[callingIndex].segmentedString + " " + part;
compositions[destinationIndex].correctedString = compositions[callingIndex].correctedString + " " + topResult;
compositions[destinationIndex].distanceSum = compositions[callingIndex].distanceSum + separatorLength + topEd;
compositions[destinationIndex].probabilityLogSum = compositions[callingIndex].probabilityLogSum + topProbabilityLog;
compositions[destinationIndex] = (part, topResult, topEd, topProbabilityLog);
}
//replace values if smaller edit distance
else if (compositions[callingIndex].distanceSum + separatorLength + topEd < compositions[destinationIndex].distanceSum)
else if ((i == maxSegmentationWordLength)
//replace values if better probabilityLogSum, if same edit distance OR one space difference
|| (((compositions[circularIndex].distanceSum + topEd == compositions[destinationIndex].distanceSum) || (compositions[circularIndex].distanceSum + separatorLength + topEd == compositions[destinationIndex].distanceSum)) && (compositions[destinationIndex].probabilityLogSum < compositions[circularIndex].probabilityLogSum + topProbabilityLog))
//replace values if smaller edit distance
|| (compositions[circularIndex].distanceSum + separatorLength + topEd < compositions[destinationIndex].distanceSum))
{
compositions[destinationIndex].segmentedString = compositions[callingIndex].segmentedString + " " + part;
compositions[destinationIndex].correctedString = compositions[callingIndex].correctedString + " " + topResult;
compositions[destinationIndex].distanceSum = compositions[callingIndex].distanceSum + separatorLength + topEd;
compositions[destinationIndex].probabilityLogSum = compositions[callingIndex].probabilityLogSum + topProbabilityLog;
compositions[destinationIndex] = (
compositions[circularIndex].segmentedString + " " + part,
compositions[circularIndex].correctedString + " " + topResult,
compositions[circularIndex].distanceSum + separatorLength + topEd,
compositions[circularIndex].probabilityLogSum + topProbabilityLog);
}
}
circularIndex++; if (circularIndex == arraySize) circularIndex = 0;
}
return new List<Composition> { compositions[input.Length - 1] };
return compositions[circularIndex];
}


}
6 changes: 3 additions & 3 deletions SymSpell/SymSpell.csproj
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>netstandard1.3</TargetFramework>
<TargetFramework>netstandard2.0</TargetFramework>
<GeneratePackageOnBuild>True</GeneratePackageOnBuild>
<PackageRequireLicenseAcceptance>True</PackageRequireLicenseAcceptance>
<Authors>Wolf Garbe &lt;[email protected]&gt;</Authors>
Expand All @@ -18,12 +18,12 @@
</PropertyGroup>

<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|AnyCPU'">
<DefineConstants>RELEASE;NETSTANDARD1_3;</DefineConstants>
<DefineConstants>RELEASE;NETSTANDARD2_0;</DefineConstants>
</PropertyGroup>

<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|AnyCPU'">
<Optimize>True</Optimize>
<DefineConstants>NETSTANDARD1_3</DefineConstants>
<DefineConstants>NETSTANDARD2_0</DefineConstants>
</PropertyGroup>

<ItemGroup>
Expand Down

0 comments on commit dff1025

Please sign in to comment.