From 25316baf4c486035f5683e0a7f1c88e05111e0da Mon Sep 17 00:00:00 2001 From: Ugo Majer Date: Tue, 16 Jan 2024 13:24:46 +0100 Subject: [PATCH 1/3] test: adding one new test --- src/lexer/tests/lexer2_tests.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/lexer/tests/lexer2_tests.c b/src/lexer/tests/lexer2_tests.c index 33ab76f4..f3ac51e0 100644 --- a/src/lexer/tests/lexer2_tests.c +++ b/src/lexer/tests/lexer2_tests.c @@ -483,6 +483,22 @@ Test(lexer2, token_word_assignment_in_echo) lexer_free(lexer); } +Test(lexer2, token_word_assignment_in_echo1) +{ + struct lexer *lexer = lexer_new("echo '$a'"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "echo"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "$a"); + token_free(tok); + + lexer_free(lexer); +} + Test(lexer2, token_word_assignment_in_echo2) { struct lexer *lexer = lexer_new("echo toto=2 tata=3"); From ca13197527105810a41836d5538b6d6214be5048 Mon Sep 17 00:00:00 2001 From: Ugo Majer Date: Tue, 16 Jan 2024 17:04:20 +0100 Subject: [PATCH 2/3] fix: fixing double quote and variable This commit fixes some issues with variables and double quotes. 
--- src/lexer/lexer.c | 87 ++++++++++++++++-- src/lexer/lexer.h | 30 ++++++ src/lexer/lexer_utils.c | 162 +++++++++++++++++++++++---------- src/lexer/tests/lexer2_tests.c | 51 ++++++++--- src/lexer/token.h | 7 +- 5 files changed, 266 insertions(+), 71 deletions(-) diff --git a/src/lexer/lexer.c b/src/lexer/lexer.c index 261aca7d..9bef6b57 100644 --- a/src/lexer/lexer.c +++ b/src/lexer/lexer.c @@ -54,27 +54,31 @@ void token_free(struct token token) char *get_word(struct lexer *lexer, bool *is_diactivated) { char *word = malloc(sizeof(char) * 2); + unsigned word_index = 0; + // Word start with ; or \n and return its token if (lexer->data[lexer->index] == ';' || lexer->data[lexer->index] == '\n') { word[0] = lexer->data[lexer->index]; word_index = 1; ++lexer->index; - if (lexer->data[lexer->index] == ' ') - { - ++lexer->index; - } } + + // Handle comments return the next word else if (lexer->data[lexer->index] == '#') { word = handle_comment(lexer, word, &word_index); } + + // Handle redirections return the token else if (lexer->data[lexer->index] == '>' || lexer->data[lexer->index] == '<') { word = handle_redir(lexer, &word_index); } + + // Handle (||, &&, | and &) return the token else if (lexer->data[lexer->index] == '|' || lexer->data[lexer->index] == '&') { @@ -82,6 +86,7 @@ char *get_word(struct lexer *lexer, bool *is_diactivated) word_index = 1; ++lexer->index; + // Handle ||, && if (lexer->data[lexer->index] == '|' || lexer->data[lexer->index] == '&') { @@ -93,6 +98,7 @@ char *get_word(struct lexer *lexer, bool *is_diactivated) } else { + // Handle the word while (lexer->data[lexer->index] != ' ' && lexer->data[lexer->index] != '\0' && lexer->data[lexer->index] != ';' @@ -103,26 +109,49 @@ char *get_word(struct lexer *lexer, bool *is_diactivated) && lexer->data[lexer->index] != '|' && lexer->data[lexer->index] != '&') { - if (lexer->data[lexer->index] == '=' && word_index > 0 && lexer->curr_tok.type != TOKEN_DOUBLE_QUOTE) + // Handle the variable + if 
(lexer->data[lexer->index] == '$') + { + if (word_index != 0) + { + break; + } + if (handle_dollar(lexer, &word, &word_index)) + { + word = realloc(word, sizeof(char) * (word_index + 1)); + word[word_index] = '\0'; + return word; + } + } + // Handle the word assignement if it's contain '=' and it's not the first character + else if (lexer->data[lexer->index] == '=' && word_index > 0 && lexer->curr_tok.type != TOKEN_DOUBLE_QUOTE && lexer->curr_tok.type != TOKEN_VARIABLE_VALUE) { lexer->curr_tok.type = TOKEN_WORD_ASSIGNMENT; lexer->index += 1; break; } + + // Take next char and put it in the word word = realloc(word, sizeof(char) * (word_index + 1)); word[word_index] = lexer->data[lexer->index]; ++word_index; ++lexer->index; + + // Handle the double quote if (lexer->data[lexer->index - 1] == '\"' - || lexer->curr_tok.type == TOKEN_DOUBLE_QUOTE) + || lexer->curr_tok.type == TOKEN_DOUBLE_QUOTE || lexer->curr_tok.type == TOKEN_VARIABLE_AND_DOUBLE_QUOTE) { + // Handle the end of the double quote if (lexer->data[lexer->index - 1] == '\"') { word_index -= 1; lexer->curr_tok.type = TOKEN_DOUBLE_QUOTE; } + + // Handle the double quote word = handle_double_quote(lexer, is_diactivated, word, &word_index); + // Missing closing double quote if (!word) { return NULL; @@ -130,14 +159,21 @@ char *get_word(struct lexer *lexer, bool *is_diactivated) word[word_index] = '\0'; return word; } + + // Handle backslash else if (lexer->data[lexer->index - 1] == '\\') { + // TODO: check if it's handle backslash in double quote handle_backslash(lexer, is_diactivated, word, word_index); } + + // Handle simple quote else if (lexer->data[lexer->index - 1] == '\'') { word = handle_simple_quote(lexer, is_diactivated, word, &word_index); + + // Missing closing simple quote if (!word) { return NULL; @@ -146,9 +182,12 @@ char *get_word(struct lexer *lexer, bool *is_diactivated) } } } + + // End of the word word = realloc(word, sizeof(char) * (word_index + 1)); word[word_index] = '\0'; + // Skip spaces 
and tabs while (lexer->data[lexer->index] == ' ' || lexer->data[lexer->index] == '\t') { @@ -162,16 +201,19 @@ struct token parse_input_for_tok(struct lexer *lexer) { struct token token; + // Usefull to diactivate the special meaning of the next character when it's a backslash bool is_diactivated = false; + // Skip spaces and tabs at first while (lexer->data[lexer->index] == ' ' || lexer->data[lexer->index] == '\t') { ++lexer->index; } + // Return the next word char *word = get_word(lexer, &is_diactivated); - + // If word is NULL, return an EOF tokens if (!word) { token.type = TOKEN_ERROR; @@ -179,14 +221,33 @@ struct token parse_input_for_tok(struct lexer *lexer) return token; } - if (lexer->curr_tok.type == TOKEN_WORD_ASSIGNMENT) + // Check if the word is a word_assignement (contains a '=') and if it's a variable name is valid + if (lexer->curr_tok.type == TOKEN_WORD_ASSIGNMENT && check_variable_assignement(word)) { token.type = TOKEN_WORD_ASSIGNMENT; token.data = word; - lexer->curr_tok.type = TOKEN_EOL; + // Usefull to have the next word token + lexer->curr_tok.type = TOKEN_VARIABLE_VALUE; return token; } + // Check if the word is a variable name + if (lexer->curr_tok.type == TOKEN_VARIABLE || lexer->curr_tok.type == TOKEN_VARIABLE_AND_DOUBLE_QUOTE) + { + token.type = TOKEN_VARIABLE; + token.data = word; + if (lexer->curr_tok.type == TOKEN_VARIABLE_AND_DOUBLE_QUOTE) + { + lexer->curr_tok.type = TOKEN_DOUBLE_QUOTE; + } + else + { + lexer->curr_tok.type = TOKEN_EOL; + } + return token; + } + + // Check if is in the lex_match table for (unsigned i = 0; i < sizeof(lex_match) / sizeof(*lex_match); ++i) { if (fnmatch(lex_match[i].str, word, 0) == 0 && !is_diactivated) @@ -196,7 +257,13 @@ struct token parse_input_for_tok(struct lexer *lexer) return token; } } - + + // Check if it's a variable value + if (lexer->curr_tok.type == TOKEN_VARIABLE_VALUE) + { + lexer->curr_tok.type = TOKEN_EOL; + } + // Else it's a word token.type = TOKEN_WORD; token.data = word; return 
token; diff --git a/src/lexer/lexer.h b/src/lexer/lexer.h index f5f8c5e2..3a21f978 100644 --- a/src/lexer/lexer.h +++ b/src/lexer/lexer.h @@ -91,6 +91,36 @@ void handle_backslash(struct lexer *lexer, bool *is_diactivated, char *word, char *handle_simple_quote(struct lexer *lexer, bool *is_diactivated, char *word, unsigned *word_index); + +/** + * \brief Check if the word is a variable assignement. + * + * \param lexer The lexer. + * \param word The word to check. + * \param word_index The index of the word. + * + * \return if the word is a variable assignement. + */ +bool check_variable_assignement(char *word); + +/** + * \brief Check if the given word is a variable name. + * \param lexer The lexer. + * \param word The word to check. + * \param word_index The index of the word. + * + * \return if the word is a variable name. + */ +bool check_variable_name(struct lexer *lexer, char **word, unsigned *word_index); + +/** + * \brief Handle the dollar character. + * \param lexer The lexer. + * \return The next word. + */ +bool handle_dollar(struct lexer *lexer, char **word, + unsigned *word_index); + /** * \brief Handle the double quote character. * \param lexer The lexer. diff --git a/src/lexer/lexer_utils.c b/src/lexer/lexer_utils.c index 597f3163..584f57b1 100644 --- a/src/lexer/lexer_utils.c +++ b/src/lexer/lexer_utils.c @@ -41,36 +41,84 @@ char *handle_simple_quote(struct lexer *lexer, bool *is_diactivated, char *word, return word; } -/** - * \brief Check if the given word is a variable name. - * \param lexer The lexer. - * \param word The word to check. 
- */ -char *check_variable_name(struct lexer *lexer, char *word, unsigned *word_index) +bool check_variable_assignement(char *word) { + size_t i = 0; + if (word[i] == '_' || word[i] == '-' || (word[i] >= 'a' && word[i] <= 'z') || (word[i] >= 'A' && word[i] <= 'Z')) + { + i++; + } + else + { + return false; + } + while (word[i] != ' ' && word[i] != '\t' + && word[i] != '\n' + && word[i] != '\0') + { + if (word[i] == '_' || word[i] == '-' + || (word[i] >= 'a' + && word[i] <= 'z') + || (word[i] >= 'A' + && word[i] <= 'Z') + || (word[i] >= '0' + && word[i] <= '9')) + { + i++; + } + else + { + return false; + } + } + + return true; +} + +bool check_variable_name(struct lexer *lexer, char **word, unsigned *word_index) +{ + char *curr_word = *word; + + // Handle variable in double quote + if (lexer->curr_tok.type == TOKEN_DOUBLE_QUOTE) + { + lexer->curr_tok.type = TOKEN_VARIABLE_AND_DOUBLE_QUOTE; + } + else + { + lexer->curr_tok.type = TOKEN_VARIABLE; + } + + // Check if it's a special variable (like $?, $*, $@, $# or $$) if (lexer->data[lexer->index] == '?' 
|| lexer->data[lexer->index] == '*' || lexer->data[lexer->index] == '@' || lexer->data[lexer->index] == '#' || lexer->data[lexer->index] == '$') { - word = realloc(word, sizeof(char) * (*word_index + 1)); - word[*word_index] = lexer->data[lexer->index]; + curr_word = realloc(curr_word, sizeof(char) * (*word_index + 1)); + curr_word[*word_index] = lexer->data[lexer->index]; *word_index += 1; lexer->index += 1; - return word; + *word = curr_word; + return true; } + + // Chech if it's a special variable (like $n) else if (lexer->data[lexer->index] >= '0' && lexer->data[lexer->index] <= '9') { while (lexer->data[lexer->index] >= '0' && lexer->data[lexer->index] <= '9') { - word = realloc(word, sizeof(char) * (*word_index + 1)); - word[*word_index] = lexer->data[lexer->index]; + curr_word = realloc(curr_word, sizeof(char) * (*word_index + 1)); + curr_word[*word_index] = lexer->data[lexer->index]; *word_index += 1; lexer->index += 1; } - return word; + *word = curr_word; + return true; } + + // Classic variable name else if (lexer->data[lexer->index] == '_' || lexer->data[lexer->index] == '-' || (lexer->data[lexer->index] >= 'a' @@ -78,89 +126,109 @@ char *check_variable_name(struct lexer *lexer, char *word, unsigned *word_index) || (lexer->data[lexer->index] >= 'A' && lexer->data[lexer->index] <= 'Z')) { - word = realloc(word, sizeof(char) * (*word_index + 1)); - word[*word_index] = lexer->data[lexer->index]; + curr_word = realloc(curr_word, sizeof(char) * (*word_index + 1)); + curr_word[*word_index] = lexer->data[lexer->index]; *word_index += 1; lexer->index += 1; } + // Not a valid variable name else { - return NULL; + if (lexer->curr_tok.type != TOKEN_DOUBLE_QUOTE) + { + lexer->curr_tok.type = TOKEN_WORD; + } + return false; } - while (lexer->data[lexer->index] != ' ' && lexer->data[lexer->index] != '\t' - && lexer->data[lexer->index] != '\n' - && lexer->data[lexer->index] != '\0') - { - if (lexer->data[lexer->index] == '_' || lexer->data[lexer->index] == '-' + // 
Check the rest of the variable name break + while (lexer->data[lexer->index] == '_' || lexer->data[lexer->index] == '-' || (lexer->data[lexer->index] >= 'a' && lexer->data[lexer->index] <= 'z') || (lexer->data[lexer->index] >= 'A' && lexer->data[lexer->index] <= 'Z') || (lexer->data[lexer->index] >= '0' && lexer->data[lexer->index] <= '9')) - { - word = realloc(word, sizeof(char) * (*word_index + 1)); - word[*word_index] = lexer->data[lexer->index]; - *word_index += 1; - lexer->index += 1; - } - else - { - return word; - } + { + curr_word = realloc(curr_word, sizeof(char) * (*word_index + 1)); + curr_word[*word_index] = lexer->data[lexer->index]; + *word_index += 1; + lexer->index += 1; } + *word = curr_word; + return true; +} - return word; +bool handle_dollar(struct lexer *lexer, char **word, + unsigned *word_index) +{ + char *curr_word = *word; + // Add the dollar to the word + curr_word = realloc(curr_word, sizeof(char) * (*word_index + 1)); + curr_word[*word_index] = lexer->data[lexer->index]; + *word_index += 1; + lexer->index += 1; + *word = curr_word; + + // Check if the name of the variable is correct + return check_variable_name(lexer, word, word_index); } char *handle_double_quote(struct lexer *lexer, bool *is_diactivated, char *word, unsigned *word_index) { *is_diactivated = true; + // Check if a the first word is a variable if (lexer->data[lexer->index] == '$') { - word = realloc(word, sizeof(char) * (*word_index + 1)); - word[*word_index] = lexer->data[lexer->index]; - *word_index += 1; - lexer->index += 1; - char *word_tmp = check_variable_name(lexer, word, word_index); - if (word_tmp != NULL) - { - word = word_tmp; - } - else + if (handle_dollar(lexer, &word, word_index)) { + word = realloc(word, sizeof(char) * (*word_index + 1)); + word[*word_index] = '\0'; return word; } } + + // While it's different from a double quote or a variable while (lexer->data[lexer->index] != '\"' && lexer->data[lexer->index] != '$') { + // Missing closing double quote 
if (lexer->data[lexer->index] == '\0') { free(word); word = NULL; return NULL; } + // Handle the backslash if the back slash is alone we need to add it to the word else if (lexer->data[lexer->index] == '\\') { lexer->index += 1; + word = realloc(word, sizeof(char) * (*word_index + 1)); if (lexer->data[lexer->index] == '\"' || lexer->data[lexer->index] == '$' || lexer->data[lexer->index] == '\\' || lexer->data[lexer->index] == '\n') { - word = realloc(word, sizeof(char) * (*word_index + 1)); word[*word_index] = lexer->data[lexer->index]; - *word_index += 1; lexer->index += 1; } + else + { + word[*word_index] = '\\'; + } + *word_index += 1; + } + else + { + // Add the character to the word + word = realloc(word, sizeof(char) * (*word_index + 1)); + word[*word_index] = lexer->data[lexer->index]; + *word_index += 1; + lexer->index += 1; } - word = realloc(word, sizeof(char) * (*word_index + 1)); - word[*word_index] = lexer->data[lexer->index]; - *word_index += 1; - lexer->index += 1; } + + // If if (lexer->data[lexer->index] == '\"') { lexer->curr_tok.type = TOKEN_EOL; diff --git a/src/lexer/tests/lexer2_tests.c b/src/lexer/tests/lexer2_tests.c index f3ac51e0..cdddf2fb 100644 --- a/src/lexer/tests/lexer2_tests.c +++ b/src/lexer/tests/lexer2_tests.c @@ -291,6 +291,27 @@ Test(lexer2, token_redir_stick_left3) lexer_free(lexer); } +Test(lexer2, token_redir_backslash) +{ + struct lexer *lexer = lexer_new("ls -la 2\\>file"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "ls"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "-la"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "2>file"); + token_free(tok); + + lexer_free(lexer); +} + Test(lexer2, token_while) { struct lexer *lexer = lexer_new("while"); @@ -345,7 +366,7 @@ Test(lexer2, token_variable_parameters) { struct lexer *lexer = 
lexer_new("$@"); struct token tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_eq(tok.type, TOKEN_VARIABLE); cr_assert_str_eq(tok.data, "$@"); token_free(tok); lexer_free(lexer); @@ -355,7 +376,7 @@ Test(lexer2, token_variable_parameters2) { struct lexer *lexer = lexer_new("$*"); struct token tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_eq(tok.type, TOKEN_VARIABLE); cr_assert_str_eq(tok.data, "$*"); token_free(tok); lexer_free(lexer); @@ -423,7 +444,7 @@ Test(lexer2, token_double_quote_variable) token_free(tok); tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_eq(tok.type, TOKEN_VARIABLE); cr_assert_str_eq(tok.data, "$test"); token_free(tok); @@ -450,7 +471,7 @@ Test(lexer2, token_word_assignment) { struct lexer *lexer = lexer_new("toto=2"); struct token tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_WORD_ASSIGNMENT); + cr_assert_eq(tok.type, TOKEN_WORD_ASSIGNMENT, "got %d", tok.type); cr_assert_str_eq(tok.data, "toto"); token_free(tok); @@ -518,7 +539,7 @@ Test(lexer2, token_word_assignment_in_echo2) token_free(tok); tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_WORD_ASSIGNMENT); + cr_assert_eq(tok.type, TOKEN_WORD_ASSIGNMENT, "got %d", tok.type); cr_assert_str_eq(tok.data, "tata"); token_free(tok); @@ -550,7 +571,7 @@ Test(lexer2, token_word_assignment_name1) { struct lexer *lexer = lexer_new("1_t11oto=2"); struct token tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_ERROR, "got %d", tok.type); + cr_assert_eq(tok.type, TOKEN_WORD, "got %d", tok.type); token_free(tok); tok = lexer_pop(lexer); @@ -684,7 +705,7 @@ Test(lexer2, word_assignement7) token_free(tok); tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_eq(tok.type, TOKEN_VARIABLE); cr_assert_str_eq(tok.data, "$auhi"); token_free(tok); @@ -737,7 +758,7 @@ Test(lexer2, variable_find) token_free(tok); tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_WORD); + 
cr_assert_eq(tok.type, TOKEN_VARIABLE); cr_assert_str_eq(tok.data, "$toto"); token_free(tok); @@ -753,15 +774,19 @@ Test(lexer2, variable_find2) token_free(tok); tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_eq(tok.type, TOKEN_VARIABLE); cr_assert_str_eq(tok.data, "$a1"); token_free(tok); tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_eq(tok.type, TOKEN_WORD, "got %d", tok.type); cr_assert_str_eq(tok.data, ":dddfff"); token_free(tok); + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_EOF); + token_free(tok); + lexer_free(lexer); } @@ -774,7 +799,7 @@ Test(lexer2, variable_find3) token_free(tok); tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_eq(tok.type, TOKEN_VARIABLE); cr_assert_str_eq(tok.data, "$a1_dddfff"); token_free(tok); @@ -790,12 +815,12 @@ Test(lexer2, variable_find4) token_free(tok); tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_eq(tok.type, TOKEN_VARIABLE); cr_assert_str_eq(tok.data, "$a1_dddfff"); token_free(tok); tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_eq(tok.type, TOKEN_VARIABLE); cr_assert_str_eq(tok.data, "$toto"); token_free(tok); diff --git a/src/lexer/token.h b/src/lexer/token.h index b117e679..9756336b 100644 --- a/src/lexer/token.h +++ b/src/lexer/token.h @@ -34,7 +34,12 @@ enum token_type TOKEN_NEGATE, // \! 
TOKEN_REDIR, // >, <, >>, >&, <&, >|, <> TOKEN_DOUBLE_QUOTE, // " - TOKEN_WORD_ASSIGNMENT // variable= + TOKEN_WORD_ASSIGNMENT, // variable= + TOKEN_VARIABLE, // $variable + + // Internal values for lexer + TOKEN_VARIABLE_VALUE, // =value + TOKEN_VARIABLE_AND_DOUBLE_QUOTE // $variable" }; /** From aa6d18224bc9182793aa37787dd2ac4cd6e77b00 Mon Sep 17 00:00:00 2001 From: Ugo Majer Date: Tue, 16 Jan 2024 17:06:27 +0100 Subject: [PATCH 3/3] fix: fixing merge request --- src/lexer/token.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lexer/token.h b/src/lexer/token.h index 9756336b..61b17b27 100644 --- a/src/lexer/token.h +++ b/src/lexer/token.h @@ -28,6 +28,7 @@ enum token_type TOKEN_ERROR, // Error // Step 2 + TOKEN_DONE, // done TOKEN_AND, // && TOKEN_OR, // || TOKEN_PIPE, // |