Skip to content

Commit

Permalink
Merge pull request #54 from bynect/utf8-fix
Browse files Browse the repository at this point in the history
(Try to) fix Utf8
  • Loading branch information
arthurbacci authored Apr 2, 2021
2 parents 4c24b03 + 53c2e82 commit 7953eec
Show file tree
Hide file tree
Showing 6 changed files with 164 additions and 38 deletions.
34 changes: 25 additions & 9 deletions src/keypress.c
Original file line number Diff line number Diff line change
Expand Up @@ -248,17 +248,33 @@ void process_keypress(int c) {
add_char(cx, cy, config.current_syntax->match[1][match - config.current_syntax->match[0]]);
}

uchar32_t ec = c;
unsigned char ucs[4] = {c, 0, 0, 0};
int len = 1;

if ((c >= 0xC0 && c <= 0xDF) || (c >= 0xE0 && c <= 0xEF) || (c >= 0xF0 && c <= 0xF7))
ec += getch() << 8;
if ((c >= 0xE0 && c <= 0xEF) || (c >= 0xF0 && c <= 0xF7))
ec += getch() << 16;
if (c >= 0xF0 && c <= 0xF7)
ec += getch() << 24;
if ((c >= 0xC2 && c <= 0xDF) || (c >= 0xE0 && c <= 0xEF) || (c >= 0xF0 && c <= 0xF4)) {
ucs[1] = getch(), len++;
}
if ((c >= 0xE0 && c <= 0xEF) || (c >= 0xF0 && c <= 0xF4)) {
ucs[2] = getch(), len++;
}
if (c >= 0xF0 && c <= 0xF4) {
ucs[3] = getch(), len++;
}

if (add_char(cx, cy, ec))
process_keypress(KEY_RIGHT);
if (validate_utf8(ucs)) {
uchar32_t ec = c;
for (int i = 1, off = 8; i < len; i++, off += 8)
ec += ucs[i] << off;

if (add_char(cx, cy, ec))
process_keypress(KEY_RIGHT);
} else {
for (int i = 0; i < len; i++) {
if (add_char(cx, cy, substitute_char))
process_keypress(KEY_RIGHT);
else break;
}
}

syntaxHighlight();
}
Expand Down
9 changes: 4 additions & 5 deletions src/open_and_save.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ void savefile(void) {
for (unsigned int i = 0; i < num_lines; i++) {
for (unsigned int j = 0; j < lines[i].length; j++) {
unsigned char b[4];
fwrite(b, sizeof(unsigned char), utf8ToMultibyte(lines[i].data[j], b), fpw);
int len = utf8ToMultibyte(lines[i].data[j], b, 0);
fwrite(b, sizeof(unsigned char), len, fpw);
}
if (num_lines > 1) {
if (config.line_break_type == 0)
Expand Down Expand Up @@ -91,10 +92,8 @@ void read_lines(void) {
else if (passed_spaces == 0)
lines[i].ident++;

unsigned char uc = *(unsigned char *)&c;

utf8ReadFile(uc, j, i, fp);

unsigned char uc = c;
utf8ReadFile(uc, &lines[i].data[j], fp);
lines[i].length++;
}

Expand Down
11 changes: 10 additions & 1 deletion src/show.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,16 @@ void show_lines(void) {
size += config.tablen - 1;
} else {
unsigned char b[4];
printw("%.*s", utf8ToMultibyte(el, b), b);
int len = utf8ToMultibyte(el, b, 1);

if (len == -1) {
b[0] = substitute_string[0];
b[1] = substitute_string[1];
b[2] = substitute_string[2];
len = 3;
}

printw("%.*s", len, b);
}

if (j == cursor.x + text_scroll.x && i == cursor.y)
Expand Down
2 changes: 1 addition & 1 deletion src/ted.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ void setcolor(int c) {
unsigned int last_cursor_x = 0;

struct CFG config = {
4, 0, 0, 1, 1, 1, 1,
1, 4, 0, 0, 1, 1, 1, 1,
&default_syntax, 0, NULL,
{0, 0, 1},
};
Expand Down
11 changes: 9 additions & 2 deletions src/ted.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@

#define NUM_PAIRS 6

// Inclusive range tests: IN_RANGE is true when min <= x <= max, OUT_RANGE is
// its exact negation. Each expansion is wrapped in its own parentheses so the
// macros stay correct when negated (!IN_RANGE(...)) or combined with other
// operators. Note that `x` is evaluated twice, so do not pass an expression
// with side effects.
#define IN_RANGE(x, min, max) (((x) >= (min)) && ((x) <= (max)))
#define OUT_RANGE(x, min, max) (((x) < (min)) || ((x) > (max)))

typedef uint32_t uchar32_t;

// message_and_prompt.c
Expand Down Expand Up @@ -72,8 +75,9 @@ void change_position(unsigned int x, unsigned int y);
void processMouseEvent(MEVENT ev);

// utf8.c
void utf8ReadFile(unsigned char uc, unsigned int lc, unsigned int i, FILE *fp);
uint16_t utf8ToMultibyte(uchar32_t c, unsigned char *out);
void utf8ReadFile(unsigned char uc, uchar32_t *out, FILE *fp_);
int utf8ToMultibyte(uchar32_t c, unsigned char *out, bool validate);
bool validate_utf8(unsigned char *ucs);

// color.c
void syntaxHighlight(void);
Expand Down Expand Up @@ -131,6 +135,7 @@ struct BUFFER {
};

struct CFG {
bool strict_utf8; // high/low surrogates will be replaced (for now leave it always set)
unsigned int tablen;
int lines;
unsigned char line_break_type; // 0: LF 1: CRLF 2: CR
Expand Down Expand Up @@ -186,5 +191,7 @@ extern bool colors_on;
extern bool needs_to_free_filename;
extern char *menu_message;
extern struct SHD default_syntax;
extern const uchar32_t substitute_char;
extern const char *substitute_string;

#endif
135 changes: 115 additions & 20 deletions src/utf8.c
Original file line number Diff line number Diff line change
@@ -1,33 +1,128 @@
#include "ted.h"

void utf8ReadFile(unsigned char uc, unsigned int lc, unsigned int i, FILE *fp_) {
if (uc >= 0xC0 && uc <= 0xDF) {
lines[i].data[lc] = uc;
lines[i].data[lc] += (unsigned int)fgetc(fp_) << 8;
} else if (uc >= 0xE0 && uc <= 0xEF) {
lines[i].data[lc] = uc;
lines[i].data[lc] += (unsigned int)fgetc(fp_) << 8;
lines[i].data[lc] += (unsigned int)fgetc(fp_) << 16;
} else if (uc >= 0xF0 && uc <= 0xF7) {
lines[i].data[lc] = uc;
lines[i].data[lc] += (unsigned int)fgetc(fp_) << 8;
lines[i].data[lc] += (unsigned int)fgetc(fp_) << 16;
lines[i].data[lc] += (unsigned int)fgetc(fp_) << 24;
} else
lines[i].data[lc] = uc;
// U+FFFD REPLACEMENT CHARACTER, displayed in place of invalid UTF-8.
// substitute_char holds its three encoded bytes (EF BF BD) packed into one
// value with the first byte in the lowest 8 bits; substitute_string holds the
// same three bytes as a NUL-terminated string.
const uchar32_t substitute_char = 0xEF | (0xBF << 8) | (0xBD << 16);
const char *substitute_string = "\xEF\xBF\xBD";

/*
Utf8 sequences range
1 byte sequence (1* INCLUSIVE 0x00 - x7F)
2 bytes sequence (1* INCLUSIVE 0xC2 - 0xDF) (2* INCLUSIVE 0x80 - 0xBF)
3 bytes sequence (1* INCLUSIVE 0xE0 - 0xEF) (2* INCLUSIVE 0xA0 - 0xBF) (3* INCLUSIVE 0x80 - 0xBF)
4 bytes sequence (1* INCLUSIVE 0xF0 - 0xF4) (2* INCLUSIVE 0x90 - 0x8F) (3* INCLUSIVE 0x80 - 0xBF) (4* INCLUSIVE 0x80 - 0xBF)
*/

void utf8ReadFile(unsigned char uc, uchar32_t *out, FILE *fp_) {
if (uc <= 0x7F) {
*out = uc;

} else if (IN_RANGE(uc, 0xC2, 0xDF)) {
int uc2 = fgetc(fp_);
if (OUT_RANGE(uc2, 0x80, 0xBF)) {
ungetc(uc2, fp_);
goto invalid;
}

*out = uc;
*out += (unsigned int)uc2 << 8;

} else if (IN_RANGE(uc, 0xE0, 0xEF) && (!config.strict_utf8 || uc != 0xED)) {
int uc2 = fgetc(fp_);
if (OUT_RANGE(uc2, 0xA0, 0xBF)) {
ungetc(uc2, fp_);
goto invalid;
}

int uc3 = fgetc(fp_);
if (OUT_RANGE(uc3, 0x80, 0xBF)) {
ungetc(uc3, fp_);
ungetc(uc2, fp_);
goto invalid;
}

*out = uc;
*out += (unsigned int)uc2 << 8;
*out += (unsigned int)uc3 << 16;

} else if (IN_RANGE(uc, 0xF0, 0xF4)) {
int uc2 = fgetc(fp_);
if (OUT_RANGE(uc2, 0x90, 0x8F)) {
ungetc(uc2, fp_);
goto invalid;
}

int uc3 = fgetc(fp_);
if (OUT_RANGE(uc3, 0x80, 0xBF)) {
ungetc(uc3, fp_);
ungetc(uc2, fp_);
goto invalid;
}

int uc4 = fgetc(fp_);
if (OUT_RANGE(uc4, 0x80, 0xBF)) {
ungetc(uc4, fp_);
ungetc(uc3, fp_);
ungetc(uc2, fp_);
goto invalid;
}

*out = uc;
*out += (unsigned int)uc2 << 8;
*out += (unsigned int)uc3 << 16;
*out += (unsigned int)uc4 << 24;
} else goto invalid;
return;

invalid:
*out = uc; //threat as a single char
}

uint16_t utf8ToMultibyte(uchar32_t c, unsigned char *out) {
int utf8ToMultibyte(uchar32_t c, unsigned char *out, bool validate) {
out[0] = (unsigned char)(c % (1 << 8));
out[1] = (unsigned char)((c >> 8) % (1 << 8));
out[2] = (unsigned char)((c >> 16) % (1 << 8));
out[3] = (unsigned char)(c >> 24);

if (out[0] >= 0xC0 && out[0] <= 0xDF)
if (out[0] <= 0x7F)
return 1;

if (IN_RANGE(out[0], 0xC2, 0xDF)) {
if (OUT_RANGE(out[1], 0x80, 0xBF))
goto invalid;
return 2;
if (out[0] >= 0xE0 && out[0] <= 0xEF)
}
if (IN_RANGE(out[0], 0xE0, 0xEF)) {
if (OUT_RANGE(out[1], 0xA0, 0xBF) || OUT_RANGE(out[2], 0x80, 0xBF) || (config.strict_utf8 && out[0] == 0xED))
goto invalid;
return 3;
if (out[0] >= 0xF0 && out[0] <= 0xF7)
}
if (IN_RANGE(out[0], 0xF0, 0xF4)) {
if (OUT_RANGE(out[1], 0x90, 0x8F) || OUT_RANGE(out[2], 0x80, 0xBF) || OUT_RANGE(out[3], 0x80, 0xBF))
goto invalid;
return 4;
return 1;
}

invalid: // may do other things here
return validate ? -1 : 1; //threat as a single char
}

bool validate_utf8(unsigned char *ucs) {
if (ucs[0] <= 0x7F)
return 1;
if (IN_RANGE(ucs[0], 0xC2, 0xDF)) {
if (OUT_RANGE(ucs[1], 0x80, 0xBF))
return 0;
return 1;
}
if (IN_RANGE(ucs[0], 0xE0, 0xEF)) {
if (OUT_RANGE(ucs[1], 0xA0, 0xBF) || OUT_RANGE(ucs[2], 0x80, 0xBF) || (config.strict_utf8 && ucs[0] == 0xED))
return 0;
return 1;
}
if (IN_RANGE(ucs[0], 0xF0, 0xF4)) {
if (OUT_RANGE(ucs[1], 0x90, 0x8F) || OUT_RANGE(ucs[2], 0x80, 0xBF) || OUT_RANGE(ucs[3], 0x80, 0xBF))
return 0;
return 1;
}
return 0;
}

0 comments on commit 7953eec

Please sign in to comment.