Fix the spellchecker for identifiers with apostrophes

carymrobbins · Aug 14, 2020 · c0771e0 · c0771e0
1 parent 4285d3d
commit c0771e0
Show file tree

Hide file tree

Showing 3 changed files with 145 additions and 6 deletions.
diff --git a/src/com/haskforce/spellchecker/HaskellSpellcheckingSplitter.java b/src/com/haskforce/spellchecker/HaskellSpellcheckingSplitter.java
@@ -0,0 +1,109 @@
+package com.haskforce.spellchecker;
+
+import com.intellij.openapi.progress.ProcessCanceledException;
+import com.intellij.openapi.util.TextRange;
+import com.intellij.openapi.util.text.StringUtil;
+import com.intellij.spellchecker.inspections.BaseSplitter;
+import com.intellij.spellchecker.inspections.PlainTextSplitter;
+import com.intellij.spellchecker.inspections.Splitter;
+import com.intellij.spellchecker.inspections.TextSplitter;
+import com.intellij.util.Consumer;
+import org.jdom.Verifier;
+import org.jetbrains.annotations.NonNls;
+import org.jetbrains.annotations.NotNull;
+import org.jetbrains.annotations.Nullable;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import static com.intellij.util.io.URLUtil.URL_PATTERN;
+
+/**
+ * Shameless copy-pasta of {@link PlainTextSplitter} that also splits on
+ * apostrophes since they are ubiquitous in Haskell identifiers.
+ */
+public class HaskellSpellcheckingSplitter extends BaseSplitter {
+
+  private static final PlainTextSplitter INSTANCE = new PlainTextSplitter();
+
+  public static PlainTextSplitter getInstance() {
+    return INSTANCE;
+  }
+
+  @NonNls
+  private static final
+  Pattern SPLIT_PATTERN = Pattern.compile("(\\s|\b|')");
+
+  @NonNls
+  private static final Pattern MAIL =
+    Pattern.compile("([\\p{L}0-9\\.\\-\\_\\+]+@([\\p{L}0-9\\-\\_]+(\\.)?)+(com|net|[a-z]{2})?)");
+
+  @NonNls
+  private static final Pattern UUID_PATTERN = Pattern.compile("[a-fA-F0-9]{8}(-[a-fA-F0-9]{4}){3}-[a-fA-F0-9]{12}");
+
+  @Override
+  public void split(@Nullable String text, @NotNull TextRange range, Consumer<TextRange> consumer) {
+    if (StringUtil.isEmpty(text)) {
+      return;
+    }
+    final Splitter ws = getTextSplitter();
+    int from = range.getStartOffset();
+    int till;
+
+    try {
+      Matcher matcher;
+      final String substring = range.substring(text).replace('\b', '\n').replace('\f', '\n');
+      if (Verifier.checkCharacterData(SPLIT_PATTERN.matcher(newBombedCharSequence(substring)).replaceAll("")) != null) {
+        return;
+      }
+      matcher = SPLIT_PATTERN.matcher(newBombedCharSequence(text, range));
+
+      while (true) {
+        checkCancelled();
+        List<TextRange> toCheck;
+        TextRange wRange;
+        String word;
+        if (matcher.find()) {
+          TextRange found = matcherRange(range, matcher);
+          till = found.getStartOffset();
+          if (badSize(from, till)) {
+            from = found.getEndOffset();
+            continue;
+          }
+          wRange = new TextRange(from, till);
+          word = wRange.substring(text);
+          from = found.getEndOffset();
+        }
+        else { // end hit or zero matches
+          wRange = new TextRange(from, range.getEndOffset());
+          word = wRange.substring(text);
+        }
+        if (word.contains("@")) {
+          toCheck = excludeByPattern(text, wRange, MAIL, 0);
+        }
+        else if (word.contains("://")) {
+          toCheck = excludeByPattern(text, wRange, URL_PATTERN, 0);
+        }
+        else if (word.contains("-")) {
+          toCheck = excludeByPattern(text, wRange, UUID_PATTERN, 0);
+        }
+        else {
+          toCheck = Collections.singletonList(wRange);
+        }
+        for (TextRange r : toCheck) {
+          ws.split(text, r, consumer);
+        }
+        if (matcher.hitEnd()) break;
+      }
+    }
+    catch (ProcessCanceledException ignored) {
+    }
+  }
+
+  @NotNull
+  protected Splitter getTextSplitter() {
+    return TextSplitter.getInstance();
+  }
+}
diff --git a/src/com/haskforce/spellchecker/HaskellSpellcheckingStrategy.scala b/src/com/haskforce/spellchecker/HaskellSpellcheckingStrategy.scala
@@ -6,22 +6,37 @@ import com.haskforce.psi._
 import com.haskforce.utils.CastUtil.Ops
 import com.haskforce.utils.PQ
 import com.intellij.psi.{PsiComment, PsiElement}
-import com.intellij.spellchecker.tokenizer.SpellcheckingStrategy
+import com.intellij.spellchecker.inspections.PlainTextSplitter
+import com.intellij.spellchecker.tokenizer.{SpellcheckingStrategy, Tokenizer, TokenizerBase}
 
 /**
  * Provide spellchecker support for Haskell/Cabal sources.
  */
 class HaskellSpellcheckingStrategy extends SpellcheckingStrategy {
 
+  /**
+   * Chooses the tokenizer for `element`: Haskell identifiers get the
+   * identifier tokenizer (they need to be split on apostrophes), everything
+   * else gets the standard one.
+   */
+  override def getTokenizer(element: PsiElement): Tokenizer[_ <: PsiElement] =
+    if (isHaskellIdent(element)) HaskellSpellcheckingStrategy.HASKELL_IDENT_TOKENIZER
+    else HaskellSpellcheckingStrategy.STANDARD_TOKENIZER
+
   override def isMyContext(e: PsiElement): Boolean = {
-    isHaskell(e) && isDefinitionNode(e)
+    // Applies only to elements whose language is Haskell or Cabal and that
+    // isDefinitionNode accepts.
+    (isHaskell(e) || isCabal(e)) && isDefinitionNode(e)
   }
 
   private def isHaskell(e: PsiElement): Boolean = {
-    Seq(
-      HaskellLanguage.INSTANCE,
-      CabalLanguage.INSTANCE
-    ).exists(_.is(e.getLanguage))
+    // Haskell only: the Cabal language is checked separately by isCabal.
+    HaskellLanguage.INSTANCE.is(e.getLanguage)
+  }
+
+  /** True when `e` is a [[HaskellNamedElement]]. */
+  private def isHaskellIdent(e: PsiElement): Boolean = e match {
+    case _: HaskellNamedElement => true
+    case _ => false
+  }
+
+  /** Checks the language of `e` against `CabalLanguage.INSTANCE`. */
+  private def isCabal(e: PsiElement): Boolean = {
+    CabalLanguage.INSTANCE.is(e.getLanguage)
   }
 
   private def isDefinitionNode(e: PsiElement): Boolean = {
@@ -129,3 +144,14 @@ class HaskellSpellcheckingStrategy extends SpellcheckingStrategy {
       .flatMap(_.getParent.cast[HaskellNewtypedecl])
   }
 }
+
+/** Tokenizer instances shared by every [[HaskellSpellcheckingStrategy]]. */
+object HaskellSpellcheckingStrategy {
+
+  // Used for Haskell identifiers; wraps the splitter returned by
+  // HaskellSpellcheckingSplitter.getInstance().
+  private val HASKELL_IDENT_TOKENIZER =
+    new TokenizerBase[PsiElement](HaskellSpellcheckingSplitter.getInstance())
+
+  // Used for every other element; wraps the plain-text splitter.
+  private val STANDARD_TOKENIZER =
+    new TokenizerBase[PsiElement](PlainTextSplitter.getInstance())
+}
diff --git a/tests/gold/spellchecker/Comments.hs b/tests/gold/spellchecker/Comments.hs
@@ -5,3 +5,7 @@ module Comments where
 -- | And doc <TYPO>coments</TYPO>
 
 -- ^ And and these <TYPO>commens</TYPO> two
+
+-- Also <TYPO>the'se</TYPO>
+
+-- But doesn't catch this.