Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(Try to) fix Utf8 #54

Merged
merged 5 commits into from
Apr 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 25 additions & 9 deletions src/keypress.c
Original file line number Diff line number Diff line change
Expand Up @@ -248,17 +248,33 @@ void process_keypress(int c) {
add_char(cx, cy, config.current_syntax->match[1][match - config.current_syntax->match[0]]);
}

uchar32_t ec = c;
unsigned char ucs[4] = {c, 0, 0, 0};
int len = 1;

if ((c >= 0xC0 && c <= 0xDF) || (c >= 0xE0 && c <= 0xEF) || (c >= 0xF0 && c <= 0xF7))
ec += getch() << 8;
if ((c >= 0xE0 && c <= 0xEF) || (c >= 0xF0 && c <= 0xF7))
ec += getch() << 16;
if (c >= 0xF0 && c <= 0xF7)
ec += getch() << 24;
if ((c >= 0xC2 && c <= 0xDF) || (c >= 0xE0 && c <= 0xEF) || (c >= 0xF0 && c <= 0xF4)) {
ucs[1] = getch(), len++;
}
if ((c >= 0xE0 && c <= 0xEF) || (c >= 0xF0 && c <= 0xF4)) {
ucs[2] = getch(), len++;
}
if (c >= 0xF0 && c <= 0xF4) {
ucs[3] = getch(), len++;
}

if (add_char(cx, cy, ec))
process_keypress(KEY_RIGHT);
if (validate_utf8(ucs)) {
uchar32_t ec = c;
for (int i = 1, off = 8; i < len; i++, off += 8)
ec += ucs[i] << off;

if (add_char(cx, cy, ec))
process_keypress(KEY_RIGHT);
} else {
for (int i = 0; i < len; i++) {
if (add_char(cx, cy, substitute_char))
process_keypress(KEY_RIGHT);
else break;
}
}

syntaxHighlight();
}
Expand Down
9 changes: 4 additions & 5 deletions src/open_and_save.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ void savefile(void) {
for (unsigned int i = 0; i < num_lines; i++) {
for (unsigned int j = 0; j < lines[i].length; j++) {
unsigned char b[4];
fwrite(b, sizeof(unsigned char), utf8ToMultibyte(lines[i].data[j], b), fpw);
int len = utf8ToMultibyte(lines[i].data[j], b, 0);
fwrite(b, sizeof(unsigned char), len, fpw);
}
if (num_lines > 1) {
if (config.line_break_type == 0)
Expand Down Expand Up @@ -91,10 +92,8 @@ void read_lines(void) {
else if (passed_spaces == 0)
lines[i].ident++;

unsigned char uc = *(unsigned char *)&c;

utf8ReadFile(uc, j, i, fp);

unsigned char uc = c;
utf8ReadFile(uc, &lines[i].data[j], fp);
lines[i].length++;
}

Expand Down
11 changes: 10 additions & 1 deletion src/show.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,16 @@ void show_lines(void) {
size += config.tablen - 1;
} else {
unsigned char b[4];
printw("%.*s", utf8ToMultibyte(el, b), b);
int len = utf8ToMultibyte(el, b, 1);

if (len == -1) {
b[0] = substitute_string[0];
b[1] = substitute_string[1];
b[2] = substitute_string[2];
len = 3;
}

printw("%.*s", len, b);
}

if (j == cursor.x + text_scroll.x && i == cursor.y)
Expand Down
2 changes: 1 addition & 1 deletion src/ted.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ void setcolor(int c) {
unsigned int last_cursor_x = 0;

struct CFG config = {
4, 0, 0, 1, 1, 1, 1,
1, 4, 0, 0, 1, 1, 1, 1,
&default_syntax, 0, NULL,
{0, 0, 1},
};
Expand Down
11 changes: 9 additions & 2 deletions src/ted.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@

#define NUM_PAIRS 6

// Inclusive range tests: IN_RANGE is true when min <= x <= max, OUT_RANGE is
// its negation. The whole expansion is parenthesized so the macros compose
// safely with !, && and || at the call site.
// x may be evaluated twice; do not pass an expression with side effects.
#define IN_RANGE(x, min, max) (((x) >= (min)) && ((x) <= (max)))
#define OUT_RANGE(x, min, max) (((x) < (min)) || ((x) > (max)))

typedef uint32_t uchar32_t;

// message_and_prompt.c
Expand Down Expand Up @@ -72,8 +75,9 @@ void change_position(unsigned int x, unsigned int y);
void processMouseEvent(MEVENT ev);

// utf8.c
void utf8ReadFile(unsigned char uc, unsigned int lc, unsigned int i, FILE *fp);
uint16_t utf8ToMultibyte(uchar32_t c, unsigned char *out);
void utf8ReadFile(unsigned char uc, uchar32_t *out, FILE *fp_);
int utf8ToMultibyte(uchar32_t c, unsigned char *out, bool validate);
bool validate_utf8(unsigned char *ucs);

// color.c
void syntaxHighlight(void);
Expand Down Expand Up @@ -131,6 +135,7 @@ struct BUFFER {
};

struct CFG {
bool strict_utf8; // high/low surrogates will be replaced (for now leave it always set)
unsigned int tablen;
int lines;
unsigned char line_break_type; // 0: LF 1: CRLF 2: CR
Expand Down Expand Up @@ -186,5 +191,7 @@ extern bool colors_on;
extern bool needs_to_free_filename;
extern char *menu_message;
extern struct SHD default_syntax;
extern const uchar32_t substitute_char;
extern const char *substitute_string;

#endif
135 changes: 115 additions & 20 deletions src/utf8.c
Original file line number Diff line number Diff line change
@@ -1,33 +1,128 @@
#include "ted.h"

void utf8ReadFile(unsigned char uc, unsigned int lc, unsigned int i, FILE *fp_) {
if (uc >= 0xC0 && uc <= 0xDF) {
lines[i].data[lc] = uc;
lines[i].data[lc] += (unsigned int)fgetc(fp_) << 8;
} else if (uc >= 0xE0 && uc <= 0xEF) {
lines[i].data[lc] = uc;
lines[i].data[lc] += (unsigned int)fgetc(fp_) << 8;
lines[i].data[lc] += (unsigned int)fgetc(fp_) << 16;
} else if (uc >= 0xF0 && uc <= 0xF7) {
lines[i].data[lc] = uc;
lines[i].data[lc] += (unsigned int)fgetc(fp_) << 8;
lines[i].data[lc] += (unsigned int)fgetc(fp_) << 16;
lines[i].data[lc] += (unsigned int)fgetc(fp_) << 24;
} else
lines[i].data[lc] = uc;
// Replacement shown in place of invalid UTF-8: codepoint U+FFFD, whose
// encoded bytes are EF BF BD.
// substitute_char packs those bytes with the first byte in the low 8 bits;
// substitute_string holds the same three bytes as a C string.
const uchar32_t substitute_char = 0xEF | (0xBF << 8) | (0xBD << 16);
const char *substitute_string = "\xEF\xBF\xBD";

/*
Well-formed UTF-8 byte sequences (all ranges inclusive)
1 byte sequence  (1* 0x00 - 0x7F)
2 bytes sequence (1* 0xC2 - 0xDF) (2* 0x80 - 0xBF)
3 bytes sequence (1* 0xE0 - 0xEF) (2* 0x80 - 0xBF, but 0xA0 - 0xBF after 0xE0 and 0x80 - 0x9F after 0xED) (3* 0x80 - 0xBF)
4 bytes sequence (1* 0xF0 - 0xF4) (2* 0x80 - 0xBF, but 0x90 - 0xBF after 0xF0 and 0x80 - 0x8F after 0xF4) (3* 0x80 - 0xBF) (4* 0x80 - 0xBF)
*/

/*
 * Classifies a UTF-8 lead byte.
 *
 * Returns the total length (2, 3 or 4) of the sequence introduced by `lead`,
 * or 0 when `lead` does not start a multi-byte sequence (ASCII, a stray
 * continuation byte, or a lead byte that is never valid). On return *lo and
 * *hi hold the inclusive range allowed for the SECOND byte; every later byte
 * must be in 0x80..0xBF.
 *
 * When `strict` is non-zero the range after 0xED is narrowed so that encoded
 * surrogates (U+D800..U+DFFF) are rejected.
 */
static int utf8_lead_info(unsigned char lead, int strict, int *lo, int *hi) {
    *lo = 0x80;
    *hi = 0xBF;

    if (lead >= 0xC2 && lead <= 0xDF)
        return 2;

    if (lead >= 0xE0 && lead <= 0xEF) {
        if (lead == 0xE0)
            *lo = 0xA0; // below 0xA0 would be an overlong encoding
        else if (lead == 0xED && strict)
            *hi = 0x9F; // 0xA0..0xBF would encode a surrogate
        return 3;
    }

    if (lead >= 0xF0 && lead <= 0xF4) {
        if (lead == 0xF0)
            *lo = 0x90; // below 0x90 would be an overlong encoding
        else if (lead == 0xF4)
            *hi = 0x8F; // above 0x8F would exceed U+10FFFF
        return 4;
    }

    return 0;
}

/*
 * Reads the remainder of one UTF-8 sequence from fp_ and stores it in *out.
 *
 * uc   - first byte of the sequence, already read by the caller.
 * out  - receives the packed character: byte 1 in bits 0..7, byte 2 in bits
 *        8..15, byte 3 in bits 16..23, byte 4 in bits 24..31.
 * fp_  - stream positioned just after uc.
 *
 * If uc is ASCII, or the sequence is malformed or truncated by end of file,
 * *out is set to uc alone (treated as a single char) and every continuation
 * byte that was read is pushed back so the caller reads it again.
 *
 * NOTE(review): up to three bytes are pushed back with ungetc(); the C
 * standard only guarantees one byte of pushback, so this relies on the C
 * library supporting more.
 */
void utf8ReadFile(unsigned char uc, uchar32_t *out, FILE *fp_) {
    int lo, hi;
    int len = utf8_lead_info(uc, config.strict_utf8, &lo, &hi);
    int tail[3];
    int got;

    *out = uc; // final value for ASCII and for every invalid case

    for (got = 0; got < len - 1; got++) {
        tail[got] = fgetc(fp_);

        // EOF is negative, so it fails the range test as well
        if (tail[got] < lo || tail[got] > hi) {
            // Undo the reads, last byte first, so they come back in order
            for (int k = got; k >= 0; k--) {
                if (tail[k] != EOF)
                    ungetc(tail[k], fp_);
            }
            return;
        }

        // Only the second byte has a lead-dependent range
        lo = 0x80;
        hi = 0xBF;
    }

    for (int k = 0; k < got; k++)
        *out += (uchar32_t)tail[k] << (8 * (k + 1));
}

// NOTE(review): the next two lines are two different signatures for the same
// function (a two-parameter form returning uint16_t and a three-parameter
// form returning int). This span appears to show removed and added diff lines
// together, so it does not compile as written.
//
// Splits the packed character c into its bytes in out[0..3] and returns a
// byte count chosen from out[0] (1 to 4). In the three-parameter form, a
// sequence that fails the range checks returns -1 when validate is set and 1
// otherwise.
uint16_t utf8ToMultibyte(uchar32_t c, unsigned char *out) {
int utf8ToMultibyte(uchar32_t c, unsigned char *out, bool validate) {
// Unpack c lowest byte first: out[0] is bits 0..7, out[3] is bits 24..31.
out[0] = (unsigned char)(c % (1 << 8));
out[1] = (unsigned char)((c >> 8) % (1 << 8));
out[2] = (unsigned char)((c >> 16) % (1 << 8));
out[3] = (unsigned char)(c >> 24);

if (out[0] >= 0xC0 && out[0] <= 0xDF)
// Single byte (0x00..0x7F)
if (out[0] <= 0x7F)
return 1;

// Two-byte lead: second byte must be in 0x80..0xBF
if (IN_RANGE(out[0], 0xC2, 0xDF)) {
if (OUT_RANGE(out[1], 0x80, 0xBF))
goto invalid;
return 2;
if (out[0] >= 0xE0 && out[0] <= 0xEF)
}
// Three-byte lead.
// NOTE(review): the 0xA0 lower bound on out[1] is applied to every lead in
// 0xE0..0xEF, and with strict_utf8 set every 0xED lead is rejected.
if (IN_RANGE(out[0], 0xE0, 0xEF)) {
if (OUT_RANGE(out[1], 0xA0, 0xBF) || OUT_RANGE(out[2], 0x80, 0xBF) || (config.strict_utf8 && out[0] == 0xED))
goto invalid;
return 3;
if (out[0] >= 0xF0 && out[0] <= 0xF7)
}
// Four-byte lead.
// NOTE(review): 0x90..0x8F is an empty range (min > max), so the first
// OUT_RANGE test is true for every out[1] and this branch always jumps to
// invalid.
if (IN_RANGE(out[0], 0xF0, 0xF4)) {
if (OUT_RANGE(out[1], 0x90, 0x8F) || OUT_RANGE(out[2], 0x80, 0xBF) || OUT_RANGE(out[3], 0x80, 0xBF))
goto invalid;
return 4;
return 1;
}

invalid: // may do other things here
return validate ? -1 : 1; // without validate, treat as a single char
}

/*
 * Checks whether ucs starts with one well-formed UTF-8 sequence.
 *
 * ucs - the bytes of the sequence, first byte at ucs[0]. It must hold at
 *       least as many readable bytes as the length implied by ucs[0]
 *       (up to 4).
 *
 * Returns true for a valid 1- to 4-byte sequence and false otherwise.
 * Overlong encodings and values above U+10FFFF are rejected. When
 * config.strict_utf8 is set, encoded surrogates (0xED followed by
 * 0xA0..0xBF) are rejected as well.
 */
bool validate_utf8(unsigned char *ucs) {
    unsigned char lead = ucs[0];
    // Inclusive range for the second byte; narrowed for some lead bytes
    unsigned char lo = 0x80;
    unsigned char hi = 0xBF;
    int len;

    if (lead <= 0x7F)
        return true;

    if (lead >= 0xC2 && lead <= 0xDF) {
        len = 2;
    } else if (lead >= 0xE0 && lead <= 0xEF) {
        len = 3;
        if (lead == 0xE0)
            lo = 0xA0; // below 0xA0 would be an overlong encoding
        else if (lead == 0xED && config.strict_utf8)
            hi = 0x9F; // 0xA0..0xBF would encode a surrogate
    } else if (lead >= 0xF0 && lead <= 0xF4) {
        len = 4;
        if (lead == 0xF0)
            lo = 0x90; // below 0x90 would be an overlong encoding
        else if (lead == 0xF4)
            hi = 0x8F; // above 0x8F would exceed U+10FFFF
    } else {
        // Stray continuation byte or a lead byte that is never valid
        return false;
    }

    if (ucs[1] < lo || ucs[1] > hi)
        return false;

    for (int i = 2; i < len; i++) {
        if (ucs[i] < 0x80 || ucs[i] > 0xBF)
            return false;
    }

    return true;
}