Merge pull request #54 from bynect/utf8-fix

(Try to) fix Utf8
arthurbacci · Apr 2, 2021 · 7953eec · 7953eec
2 parents 4c24b03 + 53c2e82
commit 7953eec
Show file tree

Hide file tree

Showing 6 changed files with 164 additions and 38 deletions.
diff --git a/src/keypress.c b/src/keypress.c
@@ -248,17 +248,33 @@ void process_keypress(int c) {
                 add_char(cx, cy, config.current_syntax->match[1][match - config.current_syntax->match[0]]);
         }
 
-        uchar32_t ec = c;
+        unsigned char ucs[4] = {c, 0, 0, 0};
+        int len = 1;
 
-        if ((c >= 0xC0 && c <= 0xDF) || (c >= 0xE0 && c <= 0xEF) || (c >= 0xF0 && c <= 0xF7))
-            ec += getch() << 8;
-        if ((c >= 0xE0 && c <= 0xEF) || (c >= 0xF0 && c <= 0xF7))
-            ec += getch() << 16;
-        if (c >= 0xF0 && c <= 0xF7)
-            ec += getch() << 24;
+        if ((c >= 0xC2 && c <= 0xDF) || (c >= 0xE0 && c <= 0xEF) || (c >= 0xF0 && c <= 0xF4)) {
+            ucs[1] = getch(), len++;
+        }
+        if ((c >= 0xE0 && c <= 0xEF) || (c >= 0xF0 && c <= 0xF4)) {
+            ucs[2] = getch(), len++;
+        }
+        if (c >= 0xF0 && c <= 0xF4) {
+            ucs[3] = getch(), len++;
+        }
 
-        if (add_char(cx, cy, ec))
-            process_keypress(KEY_RIGHT);
+        if (validate_utf8(ucs)) {
+            uchar32_t ec = c;
+            for (int i = 1, off = 8; i < len; i++, off += 8)
+                ec += ucs[i] << off;
+
+            if (add_char(cx, cy, ec))
+                process_keypress(KEY_RIGHT);
+        } else {
+            for (int i = 0; i < len; i++) {
+                if (add_char(cx, cy, substitute_char))
+                    process_keypress(KEY_RIGHT);
+                else break;
+            }
+        }
 
         syntaxHighlight();
     }

diff --git a/src/open_and_save.c b/src/open_and_save.c
@@ -25,7 +25,8 @@ void savefile(void) {
     for (unsigned int i = 0; i < num_lines; i++) {
         for (unsigned int j = 0; j < lines[i].length; j++) {
             unsigned char b[4];
-            fwrite(b, sizeof(unsigned char), utf8ToMultibyte(lines[i].data[j], b), fpw);
+            int len = utf8ToMultibyte(lines[i].data[j], b, 0);
+            fwrite(b, sizeof(unsigned char), len, fpw);
         }
         if (num_lines > 1) {
             if (config.line_break_type == 0)
@@ -91,10 +92,8 @@ void read_lines(void) {
             else if (passed_spaces == 0)
                 lines[i].ident++;
 
-            unsigned char uc = *(unsigned char *)&c;
-
-            utf8ReadFile(uc, j, i, fp);
-
+            unsigned char uc = c;
+            utf8ReadFile(uc, &lines[i].data[j], fp);
             lines[i].length++;
         }
 

diff --git a/src/show.c b/src/show.c
@@ -107,7 +107,16 @@ void show_lines(void) {
                 size += config.tablen - 1;
             } else {
                 unsigned char b[4];
-                printw("%.*s", utf8ToMultibyte(el, b), b);
+                int len = utf8ToMultibyte(el, b, 1);
+
+                if (len == -1) {
+                    b[0] = substitute_string[0];
+                    b[1] = substitute_string[1];
+                    b[2] = substitute_string[2];
+                    len = 3;
+                }
+
+                printw("%.*s", len, b);
             }
 
             if (j == cursor.x + text_scroll.x && i == cursor.y)

diff --git a/src/ted.c b/src/ted.c
@@ -24,7 +24,7 @@ void setcolor(int c) {
 unsigned int last_cursor_x = 0;
 
 struct CFG config = {
-    4, 0, 0, 1, 1, 1, 1,
+    1, 4, 0, 0, 1, 1, 1, 1,
     &default_syntax, 0, NULL,
     {0, 0, 1},
 };

diff --git a/src/ted.h b/src/ted.h
@@ -25,6 +25,9 @@
 
 #define NUM_PAIRS 6
 
+#define IN_RANGE(x, min, max)   ((x) >= (min)) && ((x) <= (max))
+#define OUT_RANGE(x, min, max)  ((x) < (min)) || ((x) > (max))
+
 typedef uint32_t uchar32_t;
 
 // message_and_prompt.c
@@ -72,8 +75,9 @@ void change_position(unsigned int x, unsigned int y);
 void processMouseEvent(MEVENT ev);
 
 // utf8.c
-void utf8ReadFile(unsigned char uc, unsigned int lc, unsigned int i, FILE *fp);
-uint16_t utf8ToMultibyte(uchar32_t c, unsigned char *out);
+void utf8ReadFile(unsigned char uc, uchar32_t *out, FILE *fp_);
+int utf8ToMultibyte(uchar32_t c, unsigned char *out, bool validate);
+bool validate_utf8(unsigned char *ucs);
 
 // color.c
 void syntaxHighlight(void);
@@ -131,6 +135,7 @@ struct BUFFER {
 };
 
 struct CFG {
+    bool strict_utf8; // high/low surrogates will be replaced (for now leave it always set)
     unsigned int tablen;
     int lines;
     unsigned char line_break_type; // 0: LF  1: CRLF  2: CR
@@ -186,5 +191,7 @@ extern bool colors_on;
 extern bool needs_to_free_filename;
 extern char *menu_message;
 extern struct SHD default_syntax;
+extern const uchar32_t substitute_char;
+extern const char *substitute_string;
 
 #endif
diff --git a/src/utf8.c b/src/utf8.c
@@ -1,33 +1,128 @@
 #include "ted.h"
 
-void utf8ReadFile(unsigned char uc, unsigned int lc, unsigned int i, FILE *fp_) {
-    if (uc >= 0xC0 && uc <= 0xDF) {
-        lines[i].data[lc] = uc;
-        lines[i].data[lc] += (unsigned int)fgetc(fp_) << 8;
-    } else if (uc >= 0xE0 && uc <= 0xEF) {
-        lines[i].data[lc] = uc;
-        lines[i].data[lc] += (unsigned int)fgetc(fp_) << 8;
-        lines[i].data[lc] += (unsigned int)fgetc(fp_) << 16;
-    } else if (uc >= 0xF0 && uc <= 0xF7) {
-        lines[i].data[lc] = uc;
-        lines[i].data[lc] += (unsigned int)fgetc(fp_) << 8;
-        lines[i].data[lc] += (unsigned int)fgetc(fp_) << 16;
-        lines[i].data[lc] += (unsigned int)fgetc(fp_) << 24;
-    } else
-        lines[i].data[lc] = uc;
+// Displayed instead of invalid utf8 (codepoint U+FFFD)
+const uchar32_t substitute_char = 0xEF + (0xBF << 8) + (0xBD << 16);
+const char *substitute_string = "\xEF\xBF\xBD";
+
+/*
+Utf8 sequences range
+1 byte sequence (1* INCLUSIVE 0x00 - 0x7F)
+2 bytes sequence (1* INCLUSIVE 0xC2 - 0xDF) (2* INCLUSIVE 0x80 - 0xBF)
+3 bytes sequence (1* INCLUSIVE 0xE0 - 0xEF) (2* INCLUSIVE 0x80 - 0xBF; 0xA0 - 0xBF after 0xE0, 0x80 - 0x9F after 0xED) (3* INCLUSIVE 0x80 - 0xBF)
+4 bytes sequence (1* INCLUSIVE 0xF0 - 0xF4) (2* INCLUSIVE 0x80 - 0xBF; 0x90 - 0xBF after 0xF0, 0x80 - 0x8F after 0xF4) (3* INCLUSIVE 0x80 - 0xBF) (4* INCLUSIVE 0x80 - 0xBF)
+*/
+
+void utf8ReadFile(unsigned char uc, uchar32_t *out, FILE *fp_) {
+    if (uc <= 0x7F) {
+        *out = uc;
+
+    } else if (IN_RANGE(uc, 0xC2, 0xDF)) {
+        int uc2 = fgetc(fp_);
+        if (OUT_RANGE(uc2, 0x80, 0xBF)) {
+            ungetc(uc2, fp_);
+            goto invalid;
+        }
+
+        *out = uc;
+        *out += (unsigned int)uc2 << 8;
+
+    } else if (IN_RANGE(uc, 0xE0, 0xEF) && (!config.strict_utf8 || uc != 0xED)) {
+        int uc2 = fgetc(fp_);
+        if (OUT_RANGE(uc2, 0xA0, 0xBF)) {
+            ungetc(uc2, fp_);
+            goto invalid;
+        }
+
+        int uc3 = fgetc(fp_);
+        if (OUT_RANGE(uc3, 0x80, 0xBF)) {
+            ungetc(uc3, fp_);
+            ungetc(uc2, fp_);
+            goto invalid;
+        }
+
+        *out = uc;
+        *out += (unsigned int)uc2 << 8;
+        *out += (unsigned int)uc3 << 16;
+
+    } else if (IN_RANGE(uc, 0xF0, 0xF4)) {
+        int uc2 = fgetc(fp_);
+        if (OUT_RANGE(uc2, 0x90, 0x8F)) {
+            ungetc(uc2, fp_);
+            goto invalid;
+        }
+
+        int uc3 = fgetc(fp_);
+        if (OUT_RANGE(uc3, 0x80, 0xBF)) {
+            ungetc(uc3, fp_);
+            ungetc(uc2, fp_);
+            goto invalid;
+        }
+
+        int uc4 = fgetc(fp_);
+        if (OUT_RANGE(uc4, 0x80, 0xBF)) {
+            ungetc(uc4, fp_);
+            ungetc(uc3, fp_);
+            ungetc(uc2, fp_);
+            goto invalid;
+        }
+
+        *out = uc;
+        *out += (unsigned int)uc2 << 8;
+        *out += (unsigned int)uc3 << 16;
+        *out += (unsigned int)uc4 << 24;
+    } else goto invalid;
+    return;
+
+invalid:
+    *out = uc; // treat as a single char
 }
 
-uint16_t utf8ToMultibyte(uchar32_t c, unsigned char *out) {
+int utf8ToMultibyte(uchar32_t c, unsigned char *out, bool validate) {
     out[0] = (unsigned char)(c % (1 << 8));
     out[1] = (unsigned char)((c >> 8) % (1 << 8));
     out[2] = (unsigned char)((c >> 16) % (1 << 8));
     out[3] = (unsigned char)(c >> 24);
 
-    if (out[0] >= 0xC0 && out[0] <= 0xDF)
+    if (out[0] <= 0x7F)
+        return 1;
+
+    if (IN_RANGE(out[0], 0xC2, 0xDF)) {
+        if (OUT_RANGE(out[1], 0x80, 0xBF))
+            goto invalid;
         return 2;
-    if (out[0] >= 0xE0 && out[0] <= 0xEF)
+    }
+    if (IN_RANGE(out[0], 0xE0, 0xEF)) {
+        if (OUT_RANGE(out[1], 0xA0, 0xBF) || OUT_RANGE(out[2], 0x80, 0xBF) || (config.strict_utf8 && out[0] == 0xED))
+            goto invalid;
         return 3;
-    if (out[0] >= 0xF0 && out[0] <= 0xF7)
+    }
+    if (IN_RANGE(out[0], 0xF0, 0xF4)) {
+        if (OUT_RANGE(out[1], 0x90, 0x8F) || OUT_RANGE(out[2], 0x80, 0xBF) || OUT_RANGE(out[3], 0x80, 0xBF))
+            goto invalid;
         return 4;
-    return 1;
+    }
+
+invalid: // may do other things here
+    return validate ? -1 : 1; // treat as a single char
+}
+
+bool validate_utf8(unsigned char *ucs) {
+    if (ucs[0] <= 0x7F)
+        return 1;
+    if (IN_RANGE(ucs[0], 0xC2, 0xDF)) {
+        if (OUT_RANGE(ucs[1], 0x80, 0xBF))
+            return 0;
+        return 1;
+    }
+    if (IN_RANGE(ucs[0], 0xE0, 0xEF)) {
+        if (OUT_RANGE(ucs[1], 0xA0, 0xBF) || OUT_RANGE(ucs[2], 0x80, 0xBF) || (config.strict_utf8 && ucs[0] == 0xED))
+            return 0;
+        return 1;
+    }
+    if (IN_RANGE(ucs[0], 0xF0, 0xF4)) {
+        if (OUT_RANGE(ucs[1], 0x90, 0x8F) || OUT_RANGE(ucs[2], 0x80, 0xBF) || OUT_RANGE(ucs[3], 0x80, 0xBF))
+            return 0;
+        return 1;
+    }
+    return 0;
 }