From d21276d92b2984d47bd5cb0a1f662c288b0743de Mon Sep 17 00:00:00 2001
From: David Mejorado <david.mejorado@gmail.com>
Date: Mon, 20 Nov 2023 19:12:22 -0800
Subject: [PATCH] fix: diagnostics in lines with multi-byte chars
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There's a conflict between the way Lua interprets strings with
multi-byte characters and the way we pass the `col` field through the
patterns.

For example, the length of the string `· example typox` would be `15` in
a language that counts characters, but Lua counts the bytes in the string,
not the number of printable characters. This means that for the same
string, Lua returns `16` as the length.

The report coming from CSpell also counts only printable characters, so
for a file like this:

`test.md`
```markdown
* example typox
· example typox
```

The report will be:

`npx cspell -c cspell.json lint --language-id markdown test.md`
```
1/1 ./test.md 163.45ms X
./test.md:1:11 - Unknown word (typox)
./test.md:2:11 - Unknown word (typox)
```

Both lines have the same column as the start of the unknown word,
because CSpell doesn't count bytes when reporting the position of the
error.

So when we read the column from the report we just forward whatever we
got from the CSpell report.

The `end_col` ends up with the correct position because we calculate it
with the custom `from_quote` adapter, which finds the end column
programmatically.

To counter that discrepancy, I'm using the column reported by CSpell
only as an index to start looking for the word reported as an error in
the `end_col` function, and mutating the entries table to define the
`col` property in the same function.

I have a proof of concept that seems to work as expected; I'll test a
few scenarios before I push anything.

IMO, that feels a bit too hacky to keep as a long-term solution; we
should look into validating the `col` property in none-ls.
---
 lua/cspell/diagnostics/parser.lua | 29 +++++++++++++++++++++++++----
 stylua.toml                       |  1 +
 tests/spec/diagnostics_spec.lua   | 10 ++++++----
 3 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/lua/cspell/diagnostics/parser.lua b/lua/cspell/diagnostics/parser.lua
index e559a69..aab6468 100644
--- a/lua/cspell/diagnostics/parser.lua
+++ b/lua/cspell/diagnostics/parser.lua
@@ -18,15 +18,36 @@ local custom_user_data = {
     end,
 }
 
+--- Adapter that derives both `col` and `end_col` from the quoted word.
+--- CSpell reports the column in characters while Lua indexes strings in
+--- bytes, so on lines with multi-byte characters the reported column (`_col`)
+--- is only trusted as the index where the search for the word starts.
+--- `end_col(entries, line)` returns the index right after the match, or nil
+--- when the line or the word is unavailable (`col` then falls back to `_col`).
+local custom_from_quote = {
+    end_col = function(entries, line)
+        local quote = entries["_quote"]
+        local reported = tonumber(entries["_col"])
+        local col, end_col
+        if quote and line then
+            --- Plain search from the reported column skips earlier repeats of the word.
+            col, end_col = line:find(quote, reported or 1, true)
+        end
+        --- HACK: `col` is defined here, as a side effect, in byte units.
+        entries["col"] = col or reported
+        return end_col and end_col + 1 or nil
+    end,
+}
+
 -- Finds the messages including a suggestions array, which comes from passing
 -- the --show-suggestions flag to cspell.
 -- That flag is only available when the user has registered the code action.
 local matcher_with_suggestions = {
     pattern = ".*:(%d+):(%d+)%s*-%s*(.*%((.*)%))%s*Suggestions:%s*%[(.*)%]",
-    groups = { "row", "col", "message", "_quote", "_suggestions" },
+    groups = { "row", "_col", "message", "_quote", "_suggestions" },
     overrides = {
         adapters = {
-            h.diagnostics.adapters.end_col.from_quote,
+            custom_from_quote,
             custom_user_data,
         },
     },
@@ -38,10 +59,10 @@ local matcher_with_suggestions = {
 -- used by the code actions.
 local matcher_without_suggestions = {
     pattern = [[.*:(%d+):(%d+)%s*-%s*(.*%((.*)%))]],
-    groups = { "row", "col", "message", "_quote" },
+    groups = { "row", "_col", "message", "_quote" },
     overrides = {
         adapters = {
-            h.diagnostics.adapters.end_col.from_quote,
+            custom_from_quote,
         },
     },
 }
diff --git a/stylua.toml b/stylua.toml
index 394e884..a10ab56 100644
--- a/stylua.toml
+++ b/stylua.toml
@@ -1 +1,2 @@
 indent_type = "Spaces"
+indent_width = 4
diff --git a/tests/spec/diagnostics_spec.lua b/tests/spec/diagnostics_spec.lua
index 2d9ad45..378d887 100644
--- a/tests/spec/diagnostics_spec.lua
+++ b/tests/spec/diagnostics_spec.lua
@@ -19,10 +19,11 @@ describe("diagnostics", function()
 
         it("should create a diagnostic", function()
             local output = "/some/path/file.lua:1:18 - Unknown word (variabl)"
-            local diagnostic = parser(output, { content = content })
+            local diagnostic = parser(output, { content = { content } })
 
             assert.same({
-                col = "18",
+                col = 18,
+                end_col = 25,
                 message = "Unknown word (variabl)",
                 row = "1",
             }, diagnostic)
@@ -31,10 +32,11 @@ describe("diagnostics", function()
         it("includes suggestions", function()
             local output =
                 "/some/path/file.lua:1:18 - Unknown word (variabl) Suggestions: [variable, variably, variables, variant, variate]"
-            local diagnostic = parser(output, { content = content })
+            local diagnostic = parser(output, { content = { content } })
 
             assert.same({
-                col = "18",
+                col = 18,
+                end_col = 25,
                 message = "Unknown word (variabl)",
                 row = "1",
                 user_data = {