From b189123aa984f37053441cc7a4ddcff2b07b0e34 Mon Sep 17 00:00:00 2001 From: erwann lesech Date: Sun, 14 Jan 2024 02:58:01 +0100 Subject: [PATCH 1/4] feat: add pretty-print with graphviz --- .github/workflows/clang_tidy.sh | 3 + .gitignore | 3 + src/ast/ast.h | 10 ++- src/lexer/lexer.c | 3 +- src/options/options.c | 116 ++++++++++++++++++++++++++++++++ src/options/options.h | 8 +++ src/parser/parser.c | 4 +- 7 files changed, 143 insertions(+), 4 deletions(-) diff --git a/.github/workflows/clang_tidy.sh b/.github/workflows/clang_tidy.sh index cebb7456..cc006a36 100755 --- a/.github/workflows/clang_tidy.sh +++ b/.github/workflows/clang_tidy.sh @@ -24,10 +24,12 @@ for file in $(find "$root_dir/src" -type f -name '*.c'); do if [[ "$parameters" -gt 4 ]]; then echo "Too many parameters in function: $function_name" + echo "Parameters: $parameters" fi if [[ "$lines_in_function" -gt 40 ]]; then echo "Too many lines in function: $function_name" + echo "Lines in function: $lines_in_function" fi # echo "Function: $function_name" @@ -40,5 +42,6 @@ for file in $(find "$root_dir/src" -type f -name '*.c'); do # echo "Total functions: $function_count" if [[ "$function_count" -gt 10 ]]; then echo "Too many functions in file: $file" + echo "Total functions: $function_count" fi done \ No newline at end of file diff --git a/.gitignore b/.gitignore index 94804143..3af408ff 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,9 @@ config.status *.diff +*.gv +*.png + # Doxygen Doxyfile.bak html diff --git a/src/ast/ast.h b/src/ast/ast.h index 8cb86f60..fe03589c 100644 --- a/src/ast/ast.h +++ b/src/ast/ast.h @@ -9,11 +9,10 @@ #ifndef AST_H #define AST_H +#include #include #include -#include "../options/options.h" - /** * \enum ast_type * \brief Enumerate the different types of AST nodes. @@ -67,6 +66,13 @@ void ast_free(struct ast_node *node); */ struct ast_node *ast_node_word(char *value); +/** + * \brief Convert an AST type to a string. + * \param type The type to convert. + * \return The AST type string. + */ +char *ast_type_to_string(enum ast_type type); + /** * \brief Create a new AST node of type AST_SIMPLE_COMMAND. * \param value The value of the node. diff --git a/src/lexer/lexer.c b/src/lexer/lexer.c index 5912a4c4..c7e12a37 100644 --- a/src/lexer/lexer.c +++ b/src/lexer/lexer.c @@ -178,7 +178,8 @@ char *get_word(struct lexer *lexer, bool *is_diactivated) word = realloc(word, sizeof(char) * (word_index + 1)); word[word_index] = '\0'; - while (lexer->data[lexer->index] == ' ') + while (lexer->data[lexer->index] == ' ' + || lexer->data[lexer->index] == '\t') { ++lexer->index; } diff --git a/src/options/options.c b/src/options/options.c index 52e1d9c2..6ff607ce 100644 --- a/src/options/options.c +++ b/src/options/options.c @@ -8,9 +8,13 @@ #include "options.h" +#include #include #include #include +#include + +#include "../ast/ast.h" bool check_logger(int *argc, char **argv) { @@ -71,3 +75,115 @@ void logger(char *str, enum logger_step step, bool logger_enabled) break; } } + +/** + * \brief digit counter + * \param number the number to count digits + * \return the number of digits + */ +int count_digits(int number) +{ + if (number == 0) + return 1; + int count = 0; + while (number != 0) + { + number /= 10; + ++count; + } + return count; +} + +void pp_node(struct ast_node *ast, int fd, int *number) +{ + if (!ast) + return; + + char *buff = malloc(sizeof(char) * 1000); + + write(fd, "node", 4); + sprintf(buff, "%d", *number); + buff[strlen(buff)] = '\0'; + write(fd, buff, count_digits(*number)); + write(fd, " [label=\"", 9); + write(fd, ast_type_to_string(ast->type), + strlen(ast_type_to_string(ast->type))); + if (ast->value) + { + write(fd, " - ", 3); + write(fd, ast->value, strlen(ast->value)); + } + write(fd, "\"];\n", 4); + + int i = 0; + while (i < ast->children_count && ast->children[i]) + { + write(fd, "node", 4); + sprintf(buff, "%d", (*number + i + 11)); + write(fd, buff, count_digits(*number + i + 11)); + write(fd, " [label=\"", 9); + write(fd, ast_type_to_string(ast->children[i]->type), + strlen(ast_type_to_string(ast->children[i]->type))); + if (ast->children[i]->value) + { + write(fd, " - ", 3); + write(fd, ast->children[i]->value, strlen(ast->children[i]->value)); + } + write(fd, "\"];\n", 4); + i++; + } +} + +void pp_link(struct ast_node *ast, int fd, int *number) +{ + if (!ast->children) + return; + + pp_node(ast, fd, number); + + char *buff = malloc(sizeof(char) * 1000); + int i = 0; + + while (i < ast->children_count && ast->children[i]) + { + write(fd, "node", 4); + sprintf(buff, "%d", *number); + write(fd, buff, count_digits(*number)); + write(fd, " -> ", 4); + write(fd, "node", 4); + sprintf(buff, "%d", (*number + i + 11)); + write(fd, buff, count_digits(*number + i + 11)); + write(fd, ";\n", 2); + i++; + } + i = 0; + + while (i < ast->children_count) + { + int new = (*number + i + 11); + pp_link(ast->children[i], fd, &new); + i++; + } + + free(buff); +} + +void pretty_print(struct ast_node *ast, bool pretty_print_enabled, int *number) +{ + if (!ast || !pretty_print_enabled) + return; + + int fd = open("pretty_print.gv", O_WRONLY | O_CREAT | O_TRUNC, 0644); + + write(fd, "digraph AST {\n", 14); + + write(fd, "graph [rankdir=TB, ranksep=0.8, nodesep=0.4];\n", 46); + write(fd, "node [shape=box, color=lightblue, style=filled, fontsize=14];\n", + 62); + write(fd, "edge [color=black, style=solid, arrowhead=vee];\n\n", 48); + + pp_link(ast, fd, number); + + write(fd, "}\n", 3); + close(fd); +} \ No newline at end of file diff --git a/src/options/options.h b/src/options/options.h index b8fa97db..777b9b76 100644 --- a/src/options/options.h +++ b/src/options/options.h @@ -12,6 +12,8 @@ #include #include +#include "../ast/ast.h" + /** * \brief Enum for the different logger steps. */ @@ -46,4 +48,10 @@ bool check_pretty_print(int *argc, char **argv); */ void logger(char *str, enum logger_step step, bool logger_enabled); +/** + * \brief Pretty printf of ast. + */ +void pretty_print(struct ast_node *ast, bool pretty_print_enabled, + int *depths); + #endif /* ! OPTIONS_H */ diff --git a/src/parser/parser.c b/src/parser/parser.c index c593d49d..869d2c48 100644 --- a/src/parser/parser.c +++ b/src/parser/parser.c @@ -78,7 +78,9 @@ int parser_loop(struct lexer *lexer, bool logger_enabled, } if (pretty_print_enabled) { - print_ast(ast, 0, logger_enabled); + // print_ast(ast, 0, logger_enabled); + int depths = 0; + pretty_print(ast, pretty_print_enabled, &depths); } return_value = match_ast(ast, logger_enabled); if (return_value != 0 && return_value != 1) From f29001729eb89ef16fbc8d5698189dd0fbde1fcb Mon Sep 17 00:00:00 2001 From: erwann lesech Date: Sun, 14 Jan 2024 22:14:15 +0100 Subject: [PATCH 2/4] feat: add new tokens --- src/lexer/Makefile.am | 2 +- src/lexer/lexer.c | 112 +++++-------------- src/lexer/lexer.h | 23 ++++ src/lexer/lexer_utils.c | 71 ++++++++++++ src/lexer/tests/lexer2_tests.c | 198 +++++++++++++++++++++++++++++++++ src/lexer/tests/lexer_tests.c | 4 +- src/lexer/token.h | 23 +++- src/options/options.h | 3 +- tests/Makefile.am | 1 + 9 files changed, 347 insertions(+), 90 deletions(-) create mode 100644 src/lexer/lexer_utils.c create mode 100644 src/lexer/tests/lexer2_tests.c diff --git a/src/lexer/Makefile.am b/src/lexer/Makefile.am index 197e6792..526b315d 100644 --- a/src/lexer/Makefile.am +++ b/src/lexer/Makefile.am @@ -1,5 +1,5 @@ lib_LIBRARIES = liblexer.a -liblexer_a_SOURCES = lexer.c lexer.h token.h +liblexer_a_SOURCES = lexer.c lexer_utils.c lexer.h token.h liblexer_a_CFLAGS = -Wall -Wextra -Werror -std=c99 -pedantic liblexer_a_CPPFLAGS = -I$(top_srcdir) diff --git a/src/lexer/lexer.c b/src/lexer/lexer.c index c7e12a37..9cb5c7a8 100644 --- a/src/lexer/lexer.c +++ b/src/lexer/lexer.c @@ -9,16 +9,38 @@ #include "lexer.h" #include +#include #include #include #include #include -struct lex_match lex_match[8] = { - { "if", TOKEN_IF }, { "then", TOKEN_THEN }, { "elif", TOKEN_ELIF }, - { "else", TOKEN_ELSE }, { "fi", TOKEN_FI }, { ";", TOKEN_SEMICOLON }, - { "\n", TOKEN_EOL }, { "\0", TOKEN_EOF } -}; +struct lex_match lex_match[25] = { { "if", TOKEN_IF }, + { "then", TOKEN_THEN }, + { "elif", TOKEN_ELIF }, + { "else", TOKEN_ELSE }, + { "fi", TOKEN_FI }, + { ";", TOKEN_SEMICOLON }, + { "\n", TOKEN_EOL }, + { "\0", TOKEN_EOF }, + + { "&&", TOKEN_AND }, + { "||", TOKEN_OR }, + { "|", TOKEN_PIPE }, + { "!", TOKEN_NEGATE }, + { "[0-9]*<", TOKEN_INPUT_REDIR }, + { "[0-9]*>", TOKEN_OUTPUT_REDIR }, + { "[0-9]*>>", TOKEN_APPEND }, + { "[0-9]*<&", TOKEN_DUP_INPUT }, + { "[0-9]*>&", TOKEN_DUP_INPUT_OUTPUT }, + { "[0-9]*>|", TOKEN_NOCLOBBER }, + { "[0-9]*<>", TOKEN_DUP_INPUT_OUTPUT }, + { "while", TOKEN_WHILE }, + { "until", TOKEN_UNTIL }, + { "for", TOKEN_FOR }, + { "do", TOKEN_DO }, + { "done", TOKEN_DONE }, + { "$*", TOKEN_VARIABLE } }; struct lexer *lexer_new(const char *input) { @@ -45,84 +67,6 @@ void token_free(struct token token) free(token.data); } -/** - * \brief Handle the backslash character. - * - * \return false if it's the end of the string, true otherwise. - */ -bool handle_backslash(struct lexer *lexer, bool *is_diactivated, char *word, - unsigned word_index) -{ - *is_diactivated = true; - if (lexer->data[lexer->index] != '\0') - { - word[word_index - 1] = lexer->data[lexer->index]; - ++lexer->index; - } - else - { - word[word_index - 1] = '\0'; - return false; - } - - return true; -} - -/** - * \brief Handle the simple quote character. - * - * \return false if a closing quote was not found, true otherwise. - */ -char *handle_simple_quote(struct lexer *lexer, bool *is_diactivated, char *word, - unsigned *word_index) -{ - *is_diactivated = true; - *word_index -= 1; - while (lexer->data[lexer->index] != '\'') - { - if (lexer->data[lexer->index] == '\0') - { - free(word); - word = NULL; - return NULL; - } - word = realloc(word, sizeof(char) * (*word_index + 1)); - word[*word_index] = lexer->data[lexer->index]; - *word_index += 1; - lexer->index += 1; - } - - return word; -} - -char *handle_comment(struct lexer *lexer, char *word, unsigned word_index) -{ - // Skip the comment - ++lexer->index; - - // Find the end of the comment - while (lexer->data[lexer->index] != '\n' - && lexer->data[lexer->index] != '\0') - { - ++lexer->index; - } - word[word_index] = lexer->data[lexer->index]; - ++lexer->index; - // If the comment isn't the last thing in the string, we need to add a '\0' - // at the end of the word. - if (word[word_index] != '\0') - { - word[word_index + 1] = '\0'; - } - - // Skip the spaces after the comment - while (lexer->data[lexer->index] == ' ') - { - ++lexer->index; - } - return word; -} - char *get_word(struct lexer *lexer, bool *is_diactivated) { char *word = malloc(sizeof(char) * 2); @@ -210,7 +154,7 @@ struct token parse_input_for_tok(struct lexer *lexer) for (unsigned i = 0; i < sizeof(lex_match) / sizeof(*lex_match); ++i) { - if (!strcmp(word, lex_match[i].str) && !is_diactivated) + if (fnmatch(lex_match[i].str, word, 0) == 0 && !is_diactivated) { token.type = lex_match[i].type; token.data = word; diff --git a/src/lexer/lexer.h b/src/lexer/lexer.h index fe5cb470..63452eb6 100644 --- a/src/lexer/lexer.h +++ b/src/lexer/lexer.h @@ -75,6 +75,29 @@ struct token lexer_peek(struct lexer *lexer); */ struct token lexer_pop(struct lexer *lexer); +/** + * \brief Handle the backslash character. + * + * \return false if it's the end of the string, true otherwise. + */ +bool handle_backslash(struct lexer *lexer, bool *is_diactivated, char *word, + unsigned word_index); + +/** + * \brief Handle the simple quote character. + * + * \return false if a closing quote was not found, true otherwise. + */ +char *handle_simple_quote(struct lexer *lexer, bool *is_diactivated, char *word, + unsigned *word_index); + +/** + * \brief Handle the comment character. + * + * \return The next word. + */ +char *handle_comment(struct lexer *lexer, char *word, unsigned word_index); + /** * \brief Returns the next word in the input string. * \param lexer The lexer. diff --git a/src/lexer/lexer_utils.c b/src/lexer/lexer_utils.c new file mode 100644 index 00000000..908a877c --- /dev/null +++ b/src/lexer/lexer_utils.c @@ -0,0 +1,71 @@ +#include + +#include "lexer.h" + +bool handle_backslash(struct lexer *lexer, bool *is_diactivated, char *word, + unsigned word_index) +{ + *is_diactivated = true; + if (lexer->data[lexer->index] != '\0') + { + word[word_index - 1] = lexer->data[lexer->index]; + ++lexer->index; + } + else + { + word[word_index - 1] = '\0'; + return false; + } + + return true; +} + +char *handle_simple_quote(struct lexer *lexer, bool *is_diactivated, char *word, + unsigned *word_index) +{ + *is_diactivated = true; + *word_index -= 1; + while (lexer->data[lexer->index] != '\'') + { + if (lexer->data[lexer->index] == '\0') + { + free(word); + word = NULL; + return NULL; + } + word = realloc(word, sizeof(char) * (*word_index + 1)); + word[*word_index] = lexer->data[lexer->index]; + *word_index += 1; + lexer->index += 1; + } + + return word; +} + +char *handle_comment(struct lexer *lexer, char *word, unsigned word_index) +{ + // Skip the comment + ++lexer->index; + + // Find the end of the comment + while (lexer->data[lexer->index] != '\n' + && lexer->data[lexer->index] != '\0') + { + ++lexer->index; + } + word[word_index] = lexer->data[lexer->index]; + ++lexer->index; + // If the comment isn't the last thing in the string, we need to add a '\0' + // at the end of the word. + if (word[word_index] != '\0') + { + word[word_index + 1] = '\0'; + } + + // Skip the spaces after the comment + while (lexer->data[lexer->index] == ' ') + { + ++lexer->index; + } + return word; +} \ No newline at end of file diff --git a/src/lexer/tests/lexer2_tests.c b/src/lexer/tests/lexer2_tests.c new file mode 100644 index 00000000..9afb640a --- /dev/null +++ b/src/lexer/tests/lexer2_tests.c @@ -0,0 +1,198 @@ +/** + * \file lexer2_tests.c + * \brief Tests the lexer functions. + * \author Erwann Lesech, Valentin Gibbe, Ugo Majer, Alexandre Privat + * \version 1.0 + * \date 12/01/2024 + */ + +#include +#include +#include +#include + +#include "../lexer.h" + +TestSuite(lexer2, .timeout = 1); + +Test(lexer2, token_and) +{ + struct lexer *lexer = lexer_new("&&"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_AND); + cr_assert_str_eq(tok.data, "&&"); + token_free(tok); + lexer_free(lexer); +} + +Test(lexer2, token_or) +{ + struct lexer *lexer = lexer_new("||"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_OR); + cr_assert_str_eq(tok.data, "||"); + token_free(tok); + lexer_free(lexer); +} + +Test(lexer2, token_pipe) +{ + struct lexer *lexer = lexer_new("|"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_PIPE); + cr_assert_str_eq(tok.data, "|"); + token_free(tok); + lexer_free(lexer); +} + +Test(lexer2, token_negate) +{ + struct lexer *lexer = lexer_new("!"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_NEGATE); + cr_assert_str_eq(tok.data, "!"); + token_free(tok); + lexer_free(lexer); +} + +Test(lexer2, token_input_redir) +{ + struct lexer *lexer = lexer_new("<"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_INPUT_REDIR); + cr_assert_str_eq(tok.data, "<"); + token_free(tok); + lexer_free(lexer); +} + +Test(lexer2, token_output_redir) +{ + struct lexer *lexer = lexer_new(">"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_OUTPUT_REDIR); + cr_assert_str_eq(tok.data, ">"); + token_free(tok); + lexer_free(lexer); +} + +Test(lexer2, token_append) +{ + struct lexer *lexer = lexer_new(">>"); + struct token tok = lexer_pop(lexer); + printf("%s-\n", tok.data); + printf("%d\n", fnmatch("[0-9]*>>", ">>", 0)); + cr_assert_eq(tok.type, TOKEN_APPEND, "got %d", tok.type); + cr_assert_str_eq(tok.data, ">>"); + token_free(tok); + lexer_free(lexer); +} + +Test(lexer2, token_dup_input) +{ + struct lexer *lexer = lexer_new("<&"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_DUP_INPUT); + cr_assert_str_eq(tok.data, "<&"); + token_free(tok); + lexer_free(lexer); +} + +Test(lexer2, token_dup_input_output) +{ + struct lexer *lexer = lexer_new(">&"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_DUP_INPUT_OUTPUT); + cr_assert_str_eq(tok.data, ">&"); + token_free(tok); + lexer_free(lexer); +} + +Test(lexer2, token_noclobber) +{ + struct lexer *lexer = lexer_new(">|"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_NOCLOBBER); + cr_assert_str_eq(tok.data, ">|"); + token_free(tok); + lexer_free(lexer); +} + +Test(lexer2, token_dup_input_output2) +{ + struct lexer *lexer = lexer_new("<>"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_DUP_INPUT_OUTPUT); + cr_assert_str_eq(tok.data, "<>"); + token_free(tok); + lexer_free(lexer); +} + +Test(lexer2, token_while) +{ + struct lexer *lexer = lexer_new("while"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WHILE); + cr_assert_str_eq(tok.data, "while"); + token_free(tok); + lexer_free(lexer); +} + +Test(lexer2, token_until) +{ + struct lexer *lexer = lexer_new("until"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_UNTIL); + cr_assert_str_eq(tok.data, "until"); + token_free(tok); + lexer_free(lexer); +} + +Test(lexer2, token_for) +{ + struct lexer *lexer = lexer_new("for"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_FOR); + cr_assert_str_eq(tok.data, "for"); + token_free(tok); + lexer_free(lexer); +} + +Test(lexer2, token_do) +{ + struct lexer *lexer = lexer_new("do"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_DO); + cr_assert_str_eq(tok.data, "do"); + token_free(tok); + lexer_free(lexer); +} + +Test(lexer2, token_done) +{ + struct lexer *lexer = lexer_new("done"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_DONE); + cr_assert_str_eq(tok.data, "done"); + token_free(tok); + lexer_free(lexer); +} + +Test(lexer2, token_variable_parameters) +{ + struct lexer *lexer = lexer_new("$@"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_VARIABLE); + cr_assert_str_eq(tok.data, "$@"); + token_free(tok); + lexer_free(lexer); +} + +Test(lexer2, token_variable_parameters2) +{ + struct lexer *lexer = lexer_new("$*"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_VARIABLE); + cr_assert_str_eq(tok.data, "$*"); + token_free(tok); + lexer_free(lexer); +} \ No newline at end of file diff --git a/src/lexer/tests/lexer_tests.c b/src/lexer/tests/lexer_tests.c index 26f00bfd..03d1b795 100644 --- a/src/lexer/tests/lexer_tests.c +++ b/src/lexer/tests/lexer_tests.c @@ -304,7 +304,7 @@ Test(Lexer, multiple_spaces) lexer_free(lexer); } -/* + Test(Lexer, lexer_if_then) { struct lexer *lexer = lexer_new("if 1=1 then echo 'ok' fi;"); @@ -373,7 +373,7 @@ Test(Lexer, lexer_if_then) lexer_free(lexer); } -*/ + Test(Lexer, lexer_elif) { struct lexer *lexer = lexer_new("elif 1=1"); diff --git a/src/lexer/token.h b/src/lexer/token.h index 8f7e24db..373474fb 100644 --- a/src/lexer/token.h +++ b/src/lexer/token.h @@ -15,6 +15,7 @@ */ enum token_type { + // Step 1 TOKEN_IF, TOKEN_THEN, TOKEN_ELIF, @@ -25,7 +26,27 @@ enum token_type TOKEN_WORD, // Any word TOKEN_EOL, // End of line ('\n') TOKEN_EOF, // End of file - TOKEN_ERROR // Error + TOKEN_ERROR, // Error + + // Step 2 + TOKEN_AND, // && + TOKEN_OR, // || + TOKEN_PIPE, // | + TOKEN_NEGATE, // \! + TOKEN_INPUT_REDIR, // < + TOKEN_OUTPUT_REDIR, // > + TOKEN_APPEND, // >> + TOKEN_DUP_INPUT, // <& + TOKEN_DUP_OUTPUT, // >& + TOKEN_NOCLOBBER, // >| + TOKEN_DUP_INPUT_OUTPUT, // <> + TOKEN_WHILE, // while + TOKEN_UNTIL, // until + TOKEN_FOR, // for + TOKEN_DO, // do + TOKEN_DONE, // done + TOKEN_DOUBLE_QUOTE, // " + TOKEN_VARIABLE // $ }; /** diff --git a/src/options/options.h b/src/options/options.h index 777b9b76..04b048bc 100644 --- a/src/options/options.h +++ b/src/options/options.h @@ -51,7 +51,6 @@ void logger(char *str, enum logger_step step, bool logger_enabled); /** * \brief Pretty printf of ast. */ -void pretty_print(struct ast_node *ast, bool pretty_print_enabled, - int *depths); +void pretty_print(struct ast_node *ast, bool pretty_print_enabled, int *depths); #endif /* ! OPTIONS_H */ diff --git a/tests/Makefile.am b/tests/Makefile.am index 931618e5..80fd018c 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -1,6 +1,7 @@ check_PROGRAMS = criterion criterion_SOURCES = $(top_builddir)/src/lexer/tests/lexer_tests.c \ + $(top_builddir)/src/lexer/tests/lexer2_tests.c \ $(top_builddir)/src/io_backend/tests/io_backend_tests.c \ $(top_builddir)/src/parser/tests/parser_tests.c \ $(top_builddir)/src/execute/tests/exec_tests.c \ From daae1d40b426a8cb68e842805bbb36e2eb859d0f Mon Sep 17 00:00:00 2001 From: erwann lesech Date: Mon, 15 Jan 2024 13:57:07 +0100 Subject: [PATCH 3/4] feat: add new tokens except double quote and variable --- src/Makefile.am | 2 +- src/lexer/lexer.c | 122 +++++++++++---------- src/lexer/lexer.h | 9 +- src/lexer/lexer_utils.c | 36 +++++-- src/lexer/tests/lexer2_tests.c | 192 ++++++++++++++++++++++++++++++--- src/lexer/tests/lexer_tests.c | 24 ++--- src/lexer/token.h | 16 +-- 7 files changed, 297 insertions(+), 104 deletions(-) diff --git a/src/Makefile.am b/src/Makefile.am index 07212fec..daf06742 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -11,7 +11,7 @@ bin_PROGRAMS = 42sh 42sh_CPPFLAGS = -I%D% -42sh_CFLAGS = -std=c99 -Werror -Wall -Wextra -Wvla -pedantic +42sh_CFLAGS = -std=c99 -Werror -Wall -Wextra -Wvla -pedantic -fsanitize=address -g 42sh_LDADD = lexer/liblexer.a \ ast/libast.a \ diff --git a/src/lexer/lexer.c b/src/lexer/lexer.c index 9cb5c7a8..953419e9 100644 --- a/src/lexer/lexer.c +++ b/src/lexer/lexer.c @@ -15,32 +15,18 @@ #include #include -struct lex_match lex_match[25] = { { "if", TOKEN_IF }, - { "then", TOKEN_THEN }, - { "elif", TOKEN_ELIF }, - { "else", TOKEN_ELSE }, - { "fi", TOKEN_FI }, - { ";", TOKEN_SEMICOLON }, - { "\n", TOKEN_EOL }, - { "\0", TOKEN_EOF }, - - { "&&", TOKEN_AND }, - { "||", TOKEN_OR }, - { "|", TOKEN_PIPE }, - { "!", TOKEN_NEGATE }, - { "[0-9]*<", TOKEN_INPUT_REDIR }, - { "[0-9]*>", TOKEN_OUTPUT_REDIR }, - { "[0-9]*>>", TOKEN_APPEND }, - { "[0-9]*<&", TOKEN_DUP_INPUT }, - { "[0-9]*>&", TOKEN_DUP_INPUT_OUTPUT }, - { "[0-9]*>|", TOKEN_NOCLOBBER }, - { "[0-9]*<>", TOKEN_DUP_INPUT_OUTPUT }, - { "while", TOKEN_WHILE }, - { "until", TOKEN_UNTIL }, - { "for", TOKEN_FOR }, - { "do", TOKEN_DO }, - { "done", TOKEN_DONE }, - { "$*", TOKEN_VARIABLE } }; +struct lex_match lex_match[] = { + { "if", TOKEN_IF }, { "then", TOKEN_THEN }, { "elif", TOKEN_ELIF }, + { "else", TOKEN_ELSE }, { "fi", TOKEN_FI }, { ";", TOKEN_SEMICOLON }, + { "\n", TOKEN_EOL }, { "\0", TOKEN_EOF }, + + { "&&", TOKEN_AND }, { "||", TOKEN_OR }, { "|", TOKEN_PIPE }, + { "!", TOKEN_NEGATE }, { "<", TOKEN_REDIR }, { ">", TOKEN_REDIR }, + { ">>", TOKEN_REDIR }, { "<&", TOKEN_REDIR }, { ">&", TOKEN_REDIR }, + { ">|", TOKEN_REDIR }, { "<>", TOKEN_REDIR }, + + { "$*", TOKEN_VARIABLE } +}; struct lexer *lexer_new(const char *input) { @@ -71,52 +57,75 @@ char *get_word(struct lexer *lexer, bool *is_diactivated) { char *word = malloc(sizeof(char) * 2); unsigned word_index = 0; - if (lexer->data[lexer->index] == '\0') - { - ++lexer->index; - word[0] = '\0'; - return word; - } + if (lexer->data[lexer->index] == ';' || lexer->data[lexer->index] == '\n') { word[0] = lexer->data[lexer->index]; - word[1] = '\0'; + word_index = 1; ++lexer->index; if (lexer->data[lexer->index] == ' ') { ++lexer->index; } - return word; } - if (lexer->data[lexer->index] == '#') + else if (lexer->data[lexer->index] == '#') + { + word = handle_comment(lexer, word, &word_index); + } + else if (lexer->data[lexer->index] == '>' + || lexer->data[lexer->index] == '<') { - return handle_comment(lexer, word, 0); + word = handle_redir(lexer, &word_index); } - while (lexer->data[lexer->index] != ' ' && lexer->data[lexer->index] != '\0' - && lexer->data[lexer->index] != ';' - && lexer->data[lexer->index] != '\n' - && lexer->data[lexer->index] != '\t') + else if (lexer->data[lexer->index] == '|' + || lexer->data[lexer->index] == '&') { - word = realloc(word, sizeof(char) * (word_index + 1)); - word[word_index] = lexer->data[lexer->index]; - ++word_index; + word[0] = lexer->data[lexer->index]; + word_index = 1; ++lexer->index; - if (lexer->data[lexer->index - 1] == '\\') + + if (lexer->data[lexer->index] == '|' + || lexer->data[lexer->index] == '&') { - if (!handle_backslash(lexer, is_diactivated, word, word_index)) - { - return word; - } + word = realloc(word, sizeof(char) * (word_index + 1)); + word[word_index] = lexer->data[lexer->index]; + word_index = 2; + ++lexer->index; } - else if (lexer->data[lexer->index - 1] == '\'') + } + else + { + while (lexer->data[lexer->index] != ' ' + && lexer->data[lexer->index] != '\0' + && lexer->data[lexer->index] != ';' + && lexer->data[lexer->index] != '\n' + && lexer->data[lexer->index] != '\t' + && lexer->data[lexer->index] != '>' + && lexer->data[lexer->index] != '<' + && lexer->data[lexer->index] != '|' + && lexer->data[lexer->index] != '&') { - word = - handle_simple_quote(lexer, is_diactivated, word, &word_index); - if (!word) + word = realloc(word, sizeof(char) * (word_index + 1)); + word[word_index] = lexer->data[lexer->index]; + ++word_index; + ++lexer->index; + if (lexer->data[lexer->index - 1] == '\\') + { + if (!handle_backslash(lexer, is_diactivated, word, word_index)) + { + return word; + } + } + else if (lexer->data[lexer->index - 1] == '\'') { - return NULL; + word = handle_simple_quote(lexer, is_diactivated, word, + &word_index); + if (!word) + { + return NULL; + } + lexer->index += 1; } - lexer->index += 1; } } word = realloc(word, sizeof(char) * (word_index + 1)); @@ -192,6 +201,9 @@ struct token lexer_pop(struct lexer *lexer) return token; } struct token token = parse_input_for_tok(lexer); - lexer->curr_tok = token; + if (token.type != TOKEN_EOF) + { + lexer->curr_tok = token; + } return token; } diff --git a/src/lexer/lexer.h b/src/lexer/lexer.h index 63452eb6..f832e685 100644 --- a/src/lexer/lexer.h +++ b/src/lexer/lexer.h @@ -96,7 +96,14 @@ char *handle_simple_quote(struct lexer *lexer, bool *is_diactivated, char *word, * * \return The next word. */ -char *handle_comment(struct lexer *lexer, char *word, unsigned word_index); +char *handle_comment(struct lexer *lexer, char *word, unsigned *word_index); + +/** + * \brief Handle the redirection character. + * \param lexer The lexer. + * \return The next redirection word. + */ +char *handle_redir(struct lexer *lexer, unsigned *word_index); /** * \brief Returns the next word in the input string. diff --git a/src/lexer/lexer_utils.c b/src/lexer/lexer_utils.c index 908a877c..c14a816e 100644 --- a/src/lexer/lexer_utils.c +++ b/src/lexer/lexer_utils.c @@ -42,7 +42,7 @@ char *handle_simple_quote(struct lexer *lexer, bool *is_diactivated, char *word, return word; } -char *handle_comment(struct lexer *lexer, char *word, unsigned word_index) +char *handle_comment(struct lexer *lexer, char *word, unsigned *word_index) { // Skip the comment ++lexer->index; @@ -53,13 +53,11 @@ char *handle_comment(struct lexer *lexer, char *word, unsigned word_index) { ++lexer->index; } - word[word_index] = lexer->data[lexer->index]; - ++lexer->index; - // If the comment isn't the last thing in the string, we need to add a '\0' - // at the end of the word. - if (word[word_index] != '\0') + word[*word_index] = lexer->data[lexer->index]; + *word_index += 1; + if (lexer->data[lexer->index] != '\0') { - word[word_index + 1] = '\0'; + ++lexer->index; } // Skip the spaces after the comment @@ -68,4 +66,28 @@ char *handle_comment(struct lexer *lexer, char *word, unsigned word_index) ++lexer->index; } return word; +} + +char *handle_redir(struct lexer *lexer, unsigned *word_index) +{ + char *redir = malloc(sizeof(char) * 2); + redir[0] = lexer->data[lexer->index]; + ++lexer->index; + *word_index += 1; + if (lexer->data[lexer->index] == '>' || lexer->data[lexer->index] == '&' + || lexer->data[lexer->index] == '|') + { + *word_index += 1; + redir = realloc(redir, sizeof(char) * 3); + if (lexer->data[lexer->index - 1] == '<' + && lexer->data[lexer->index] == '|') + { + free(redir); + return NULL; + } + + redir[1] = lexer->data[lexer->index]; + ++lexer->index; + } + return redir; } \ No newline at end of file diff --git a/src/lexer/tests/lexer2_tests.c b/src/lexer/tests/lexer2_tests.c index 9afb640a..7e38dc73 100644 --- a/src/lexer/tests/lexer2_tests.c +++ b/src/lexer/tests/lexer2_tests.c @@ -25,6 +25,48 @@ Test(lexer2, token_and) lexer_free(lexer); } +Test(lexer2, token_and2) +{ + struct lexer *lexer = lexer_new("false && true"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "false"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_AND); + cr_assert_str_eq(tok.data, "&&"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "true"); + token_free(tok); + + lexer_free(lexer); +} + +Test(lexer2, token_and_stick) +{ + struct lexer *lexer = lexer_new("false&&true"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "false"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_AND); + cr_assert_str_eq(tok.data, "&&"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "true"); + token_free(tok); + + lexer_free(lexer); +} + Test(lexer2, token_or) { struct lexer *lexer = lexer_new("||"); @@ -45,6 +87,42 @@ Test(lexer2, token_pipe) lexer_free(lexer); } +Test(lexer2, token_pipe2) +{ + struct lexer *lexer = lexer_new("echo papa|tr a e"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "echo"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "papa"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_PIPE); + cr_assert_str_eq(tok.data, "|"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "tr"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "a"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "e"); + token_free(tok); + + lexer_free(lexer); +} + Test(lexer2, token_negate) { struct lexer *lexer = lexer_new("!"); @@ -59,7 +137,7 @@ Test(lexer2, token_input_redir) { struct lexer *lexer = lexer_new("<"); struct token tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_INPUT_REDIR); + cr_assert_eq(tok.type, TOKEN_REDIR); cr_assert_str_eq(tok.data, "<"); token_free(tok); lexer_free(lexer); @@ -69,7 +147,7 @@ Test(lexer2, token_output_redir) { struct lexer *lexer = lexer_new(">"); struct token tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_OUTPUT_REDIR); + cr_assert_eq(tok.type, TOKEN_REDIR); cr_assert_str_eq(tok.data, ">"); token_free(tok); lexer_free(lexer); @@ -79,9 +157,7 @@ Test(lexer2, token_append) { struct lexer *lexer = lexer_new(">>"); struct token tok = lexer_pop(lexer); - printf("%s-\n", tok.data); - printf("%d\n", fnmatch("[0-9]*>>", ">>", 0)); - cr_assert_eq(tok.type, TOKEN_APPEND, "got %d", tok.type); + cr_assert_eq(tok.type, TOKEN_REDIR, "got %d", tok.type); cr_assert_str_eq(tok.data, ">>"); token_free(tok); lexer_free(lexer); @@ -91,7 +167,7 @@ Test(lexer2, token_dup_input) { struct lexer *lexer = lexer_new("<&"); struct token tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_DUP_INPUT); + cr_assert_eq(tok.type, TOKEN_REDIR); cr_assert_str_eq(tok.data, "<&"); token_free(tok); lexer_free(lexer); @@ -101,7 +177,7 @@ Test(lexer2, token_dup_input_output) { struct lexer *lexer = lexer_new(">&"); struct token tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_DUP_INPUT_OUTPUT); + cr_assert_eq(tok.type, TOKEN_REDIR); cr_assert_str_eq(tok.data, ">&"); token_free(tok); lexer_free(lexer); @@ -111,7 +187,7 @@ Test(lexer2, token_noclobber) { struct lexer *lexer = lexer_new(">|"); struct token tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_NOCLOBBER); + cr_assert_eq(tok.type, TOKEN_REDIR); cr_assert_str_eq(tok.data, ">|"); token_free(tok); lexer_free(lexer); @@ -121,17 +197,105 @@ Test(lexer2, token_dup_input_output2) { struct lexer *lexer = lexer_new("<>"); struct token tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_DUP_INPUT_OUTPUT); + cr_assert_eq(tok.type, TOKEN_REDIR); cr_assert_str_eq(tok.data, "<>"); token_free(tok); lexer_free(lexer); } +Test(lexer2, token_redir_stick_left) +{ + struct lexer *lexer = lexer_new("ls -la 2>file"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "ls"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "-la"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "2"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_REDIR); + cr_assert_str_eq(tok.data, ">"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "file"); + token_free(tok); + + lexer_free(lexer); +} + +Test(lexer2, token_redir_stick_left2) +{ + struct lexer *lexer = lexer_new("ls -la 2<>file"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "ls"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "-la"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "2"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_REDIR); + cr_assert_str_eq(tok.data, "<>"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "file"); + token_free(tok); + + lexer_free(lexer); +} + +Test(lexer2, token_redir_stick_left3) +{ + struct lexer *lexer = lexer_new("ls -la >| file"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "ls"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "-la"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_REDIR); + cr_assert_str_eq(tok.data, ">|"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "file"); + token_free(tok); + + lexer_free(lexer); +} + Test(lexer2, token_while) { struct lexer *lexer = lexer_new("while"); struct token tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_WHILE); + cr_assert_eq(tok.type, TOKEN_WORD); cr_assert_str_eq(tok.data, "while"); token_free(tok); lexer_free(lexer); @@ -141,7 +305,7 @@ Test(lexer2, token_until) { struct lexer *lexer = lexer_new("until"); struct token tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_UNTIL); + cr_assert_eq(tok.type, TOKEN_WORD); cr_assert_str_eq(tok.data, "until"); token_free(tok); lexer_free(lexer); @@ -151,7 +315,7 @@ Test(lexer2, token_for) { struct lexer *lexer = lexer_new("for"); struct token tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_FOR); + cr_assert_eq(tok.type, TOKEN_WORD); cr_assert_str_eq(tok.data, "for"); token_free(tok); lexer_free(lexer); @@ -161,7 +325,7 @@ Test(lexer2, token_do) { struct lexer *lexer = lexer_new("do"); struct token tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_DO); + cr_assert_eq(tok.type, TOKEN_WORD); cr_assert_str_eq(tok.data, "do"); token_free(tok); lexer_free(lexer); @@ -171,7 +335,7 @@ Test(lexer2, token_done) { struct lexer *lexer = lexer_new("done"); struct token tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_DONE); + cr_assert_eq(tok.type, TOKEN_WORD); cr_assert_str_eq(tok.data, "done"); token_free(tok); lexer_free(lexer); diff --git a/src/lexer/tests/lexer_tests.c b/src/lexer/tests/lexer_tests.c index 03d1b795..7b5a2d18 100644 --- a/src/lexer/tests/lexer_tests.c +++ b/src/lexer/tests/lexer_tests.c @@ -74,7 +74,7 @@ Test(lexer, lexer_pop_simple) token = lexer_pop(lexer); cr_assert_eq(token.type, TOKEN_EOF); cr_assert_str_eq(token.data, "\0"); - cr_assert_eq(lexer->index, 17, "index = %lu", lexer->index); + cr_assert_eq(lexer->index, 16, "index = %lu", lexer->index); token_free(token); lexer_free(lexer); @@ -104,7 +104,7 @@ Test(lexer, lexer_pop_with_semicolon) token = lexer_pop(lexer); cr_assert_eq(token.type, TOKEN_EOF); cr_assert_str_eq(token.data, "\0"); - cr_assert_eq(lexer->index, 10); + cr_assert_eq(lexer->index, 9); token_free(token); lexer_free(lexer); @@ -134,7 +134,7 @@ Test(lexer, lexer_pop_with_backslash_semicolon) token = lexer_pop(lexer); cr_assert_eq(token.type, TOKEN_EOF); cr_assert_str_eq(token.data, "\0"); - cr_assert_eq(lexer->index, 11); + cr_assert_eq(lexer->index, 10); token_free(token); lexer_free(lexer); @@ -158,7 +158,7 @@ Test(Lexer, lexer_pop_with_backslash_semicolon_in_word) token = lexer_pop(lexer); cr_assert_eq(token.type, TOKEN_EOF); cr_assert_str_eq(token.data, "\0"); - cr_assert_eq(lexer->index, 10); + cr_assert_eq(lexer->index, 9); token_free(token); lexer_free(lexer); @@ -182,7 +182,7 @@ Test(Lexer, lexer_pop_with_single_quote) token = lexer_pop(lexer); cr_assert_eq(token.type, TOKEN_EOF); cr_assert_str_eq(token.data, "\0"); - cr_assert_eq(lexer->index, 9); + cr_assert_eq(lexer->index, 8); token_free(token); lexer_free(lexer); @@ -206,7 +206,7 @@ Test(Lexer, lexer_pop_with_backslash_single_quote) token = lexer_pop(lexer); cr_assert_eq(token.type, TOKEN_EOF); cr_assert_str_eq(token.data, "\0"); - cr_assert_eq(lexer->index, 8); + cr_assert_eq(lexer->index, 7); token_free(token); lexer_free(lexer); @@ -230,7 +230,7 @@ Test(Lexer, lexer_pop_with_backslash_single_quote_2) token = lexer_pop(lexer); cr_assert_eq(token.type, TOKEN_EOF); cr_assert_str_eq(token.data, "\0"); - cr_assert_eq(lexer->index, 17); + cr_assert_eq(lexer->index, 16); token_free(token); lexer_free(lexer); @@ -254,7 +254,7 @@ Test(Lexer, lexer_pop_with_backslash_diactivate_single_quote) token = lexer_pop(lexer); cr_assert_eq(token.type, TOKEN_EOF); cr_assert_str_eq(token.data, "\0"); - cr_assert_eq(lexer->index, 19); + cr_assert_eq(lexer->index, 18); token_free(token); lexer_free(lexer); @@ -458,7 +458,7 @@ Test(Lexer, simple_comment) token = lexer_pop(lexer); cr_assert_eq(token.type, TOKEN_EOF); cr_assert_str_eq(token.data, "\0"); - cr_assert_eq(lexer->index, 31); + cr_assert_eq(lexer->index, 30); token_free(token); lexer_free(lexer); @@ -476,7 +476,7 @@ Test(Lexer, comment_with_semicolon) token = lexer_pop(lexer); cr_assert_eq(token.type, TOKEN_EOF, "token.type = %d", token.type); cr_assert_str_eq(token.data, "\0"); - cr_assert_eq(lexer->index, 37, "lexer->index = %lu", lexer->index); + cr_assert_eq(lexer->index, 36, "lexer->index = %lu", lexer->index); token_free(token); lexer_free(lexer); @@ -506,7 +506,7 @@ Test(Lexer, comment_with_backslash_n) token = lexer_pop(lexer); cr_assert_eq(token.type, TOKEN_EOF, "token.type = %d", token.type); cr_assert_str_eq(token.data, "\0"); - cr_assert_eq(lexer->index, 37, "lexer->index = %lu", lexer->index); + cr_assert_eq(lexer->index, 36, "lexer->index = %lu", lexer->index); token_free(token); lexer_free(lexer); @@ -524,7 +524,7 @@ Test(Lexer, comment_with_back_slash2) token = lexer_pop(lexer); cr_assert_eq(token.type, TOKEN_EOF, "token.type = %d", token.type); cr_assert_str_eq(token.data, "\0"); - cr_assert_eq(lexer->index, 38, "lexer->index = %lu", lexer->index); + cr_assert_eq(lexer->index, 37, "lexer->index = %lu", lexer->index); token_free(token); lexer_free(lexer); diff --git a/src/lexer/token.h b/src/lexer/token.h index 373474fb..3d2a38c5 100644 --- a/src/lexer/token.h +++ b/src/lexer/token.h @@ -22,7 +22,6 @@ enum token_type TOKEN_ELSE, TOKEN_FI, TOKEN_SEMICOLON, - TOKEN_SIMPLE_QUOTE, TOKEN_WORD, // Any word TOKEN_EOL, // End of line ('\n') TOKEN_EOF, // End of file @@ -33,19 +32,8 @@ enum token_type TOKEN_OR, // || TOKEN_PIPE, // | TOKEN_NEGATE, // \! - TOKEN_INPUT_REDIR, // < - TOKEN_OUTPUT_REDIR, // > - TOKEN_APPEND, // >> - TOKEN_DUP_INPUT, // <& - TOKEN_DUP_OUTPUT, // >& - TOKEN_NOCLOBBER, // >| - TOKEN_DUP_INPUT_OUTPUT, // <> - TOKEN_WHILE, // while - TOKEN_UNTIL, // until - TOKEN_FOR, // for - TOKEN_DO, // do - TOKEN_DONE, // done - TOKEN_DOUBLE_QUOTE, // " + TOKEN_REDIR, // >, <, >>, >&, <&, >|, <> + TOKEN_VARIABLE // $ }; From 646c74c9add210287455170e14a3b51b353fa6d8 Mon Sep 17 00:00:00 2001 From: erwann lesech Date: Mon, 15 Jan 2024 16:53:05 +0100 Subject: [PATCH 4/4] fix: fix and add double quote and variable assignment This commit should add the double quote handling and variable assignment handling in lexer --- README.md | 7 +- src/Makefile.am | 2 +- src/lexer/lexer.c | 44 +++++++++-- src/lexer/lexer.h | 10 ++- src/lexer/lexer_utils.c | 137 +++++++++++++++++++++++++++++++-- src/lexer/tests/lexer2_tests.c | 134 +++++++++++++++++++++++++++++++- src/lexer/tests/lexer_tests.c | 16 ++-- src/lexer/token.h | 4 +- 8 files changed, 324 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index d66565f2..6696699e 100644 --- a/README.md +++ b/README.md @@ -13,9 +13,10 @@ ## Installation ```sh -git clone -cd -make +git clone git@github.com:ErwannLesech/42-Sh.git +cd 42-Sh +./42-install.sh +cd src/ ``` ## Usage diff --git a/src/Makefile.am b/src/Makefile.am index daf06742..07212fec 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -11,7 +11,7 @@ bin_PROGRAMS = 42sh 42sh_CPPFLAGS = -I%D% -42sh_CFLAGS = -std=c99 -Werror -Wall -Wextra -Wvla -pedantic -fsanitize=address -g +42sh_CFLAGS = -std=c99 -Werror -Wall -Wextra -Wvla -pedantic 42sh_LDADD = lexer/liblexer.a \ ast/libast.a \ diff --git a/src/lexer/lexer.c b/src/lexer/lexer.c index 953419e9..42c9a955 100644 --- a/src/lexer/lexer.c +++ b/src/lexer/lexer.c @@ -23,9 +23,7 @@ struct lex_match lex_match[] = { { "&&", TOKEN_AND }, { "||", TOKEN_OR }, { "|", TOKEN_PIPE }, { "!", TOKEN_NEGATE }, { "<", TOKEN_REDIR }, { ">", TOKEN_REDIR }, { ">>", TOKEN_REDIR }, { "<&", TOKEN_REDIR }, { ">&", TOKEN_REDIR }, - { ">|", TOKEN_REDIR }, { "<>", TOKEN_REDIR }, - - { "$*", TOKEN_VARIABLE } + { ">|", TOKEN_REDIR }, { "<>", TOKEN_REDIR } }; struct lexer *lexer_new(const char *input) @@ -105,16 +103,37 @@ char *get_word(struct lexer *lexer, bool *is_diactivated) && lexer->data[lexer->index] != '|' && lexer->data[lexer->index] != '&') { + if (lexer->data[lexer->index] == '=' && word_index > 0 && lexer->curr_tok.type != TOKEN_DOUBLE_QUOTE) + { + printf("word1: %s\n", word); + lexer->curr_tok.type = TOKEN_WORD_ASSIGNMENT; + lexer->index += 1; + break; + } word = realloc(word, sizeof(char) * (word_index + 1)); word[word_index] = lexer->data[lexer->index]; ++word_index; ++lexer->index; - if (lexer->data[lexer->index - 1] == '\\') + if (lexer->data[lexer->index - 1] == '\"' + || lexer->curr_tok.type == TOKEN_DOUBLE_QUOTE) { - if (!handle_backslash(lexer, is_diactivated, word, word_index)) + if (lexer->data[lexer->index - 1] == '\"') + { + word_index -= 1; + lexer->curr_tok.type = TOKEN_DOUBLE_QUOTE; + } + word = handle_double_quote(lexer, is_diactivated, word, + &word_index); + if (!word) { - return word; + return NULL; } + word[word_index] = '\0'; + return word; + } + else if (lexer->data[lexer->index - 1] == '\\') + { + handle_backslash(lexer, is_diactivated, word, word_index); } else if (lexer->data[lexer->index - 1] == '\'') { @@ -161,6 +180,15 @@ struct token parse_input_for_tok(struct lexer *lexer) return token; } + if (lexer->curr_tok.type == TOKEN_WORD_ASSIGNMENT) + { + token.type = TOKEN_WORD_ASSIGNMENT; + token.data = word; + lexer->curr_tok.type = TOKEN_EOL; + printf("word: %s\n", word); + return token; + } + for (unsigned i = 0; i < sizeof(lex_match) / sizeof(*lex_match); ++i) { if (fnmatch(lex_match[i].str, word, 0) == 0 && !is_diactivated) @@ -170,7 +198,7 @@ struct token parse_input_for_tok(struct lexer *lexer) return token; } } - + token.type = TOKEN_WORD; token.data = word; return token; @@ -201,7 +229,7 @@ struct token lexer_pop(struct lexer *lexer) return token; } struct token token = parse_input_for_tok(lexer); - if (token.type != TOKEN_EOF) + if (token.type == TOKEN_EOF) { lexer->curr_tok = token; } diff --git a/src/lexer/lexer.h b/src/lexer/lexer.h index f832e685..f5f8c5e2 100644 --- a/src/lexer/lexer.h +++ b/src/lexer/lexer.h @@ -80,7 +80,7 @@ struct token lexer_pop(struct lexer *lexer); * * \return false if it's the end of the string, true otherwise. */ -bool handle_backslash(struct lexer *lexer, bool *is_diactivated, char *word, +void handle_backslash(struct lexer *lexer, bool *is_diactivated, char *word, unsigned word_index); /** @@ -91,6 +91,14 @@ bool handle_backslash(struct lexer *lexer, bool *is_diactivated, char *word, char *handle_simple_quote(struct lexer *lexer, bool *is_diactivated, char *word, unsigned *word_index); +/** + * \brief Handle the double quote character. + * \param lexer The lexer. + * \return The next word. + */ +char *handle_double_quote(struct lexer *lexer, bool *is_diactivated, char *word, + unsigned *word_index); + /** * \brief Handle the comment character. * diff --git a/src/lexer/lexer_utils.c b/src/lexer/lexer_utils.c index c14a816e..597f3163 100644 --- a/src/lexer/lexer_utils.c +++ b/src/lexer/lexer_utils.c @@ -1,8 +1,10 @@ +#include #include +#include #include "lexer.h" -bool handle_backslash(struct lexer *lexer, bool *is_diactivated, char *word, +void handle_backslash(struct lexer *lexer, bool *is_diactivated, char *word, unsigned word_index) { *is_diactivated = true; @@ -14,10 +16,7 @@ bool handle_backslash(struct lexer *lexer, bool *is_diactivated, char *word, else { word[word_index - 1] = '\0'; - return false; } - - return true; } char *handle_simple_quote(struct lexer *lexer, bool *is_diactivated, char *word, @@ -42,6 +41,134 @@ char *handle_simple_quote(struct lexer *lexer, bool *is_diactivated, char *word, return word; } +/** + * \brief Check if the given word is a variable name. + * \param lexer The lexer. + * \param word The word to check. + */ +char *check_variable_name(struct lexer *lexer, char *word, unsigned *word_index) +{ + if (lexer->data[lexer->index] == '?' || lexer->data[lexer->index] == '*' + || lexer->data[lexer->index] == '@' || lexer->data[lexer->index] == '#' + || lexer->data[lexer->index] == '$') + { + word = realloc(word, sizeof(char) * (*word_index + 1)); + word[*word_index] = lexer->data[lexer->index]; + *word_index += 1; + lexer->index += 1; + return word; + } + else if (lexer->data[lexer->index] >= '0' + && lexer->data[lexer->index] <= '9') + { + while (lexer->data[lexer->index] >= '0' + && lexer->data[lexer->index] <= '9') + { + word = realloc(word, sizeof(char) * (*word_index + 1)); + word[*word_index] = lexer->data[lexer->index]; + *word_index += 1; + lexer->index += 1; + } + return word; + } + else if (lexer->data[lexer->index] == '_' + || lexer->data[lexer->index] == '-' + || (lexer->data[lexer->index] >= 'a' + && lexer->data[lexer->index] <= 'z') + || (lexer->data[lexer->index] >= 'A' + && lexer->data[lexer->index] <= 'Z')) + { + word = realloc(word, sizeof(char) * (*word_index + 1)); + word[*word_index] = lexer->data[lexer->index]; + *word_index += 1; + lexer->index += 1; + } + else + { + return NULL; + } + + while (lexer->data[lexer->index] != ' ' && lexer->data[lexer->index] != '\t' + && lexer->data[lexer->index] != '\n' + && lexer->data[lexer->index] != '\0') + { + if (lexer->data[lexer->index] == '_' || lexer->data[lexer->index] == '-' + || (lexer->data[lexer->index] >= 'a' + && lexer->data[lexer->index] <= 'z') + || (lexer->data[lexer->index] >= 'A' + && lexer->data[lexer->index] <= 'Z') + || (lexer->data[lexer->index] >= '0' + && lexer->data[lexer->index] <= '9')) + { + word = realloc(word, sizeof(char) * (*word_index + 1)); + word[*word_index] = lexer->data[lexer->index]; + *word_index += 1; + lexer->index += 1; + } + else + { + return word; + } + } + + return word; +} + +char *handle_double_quote(struct lexer *lexer, bool *is_diactivated, char *word, + unsigned *word_index) +{ + *is_diactivated = true; + if (lexer->data[lexer->index] == '$') + { + word = realloc(word, sizeof(char) * (*word_index + 1)); + word[*word_index] = lexer->data[lexer->index]; + *word_index += 1; + lexer->index += 1; + char *word_tmp = check_variable_name(lexer, word, word_index); + if (word_tmp != NULL) + { + word = word_tmp; + } + else + { + return word; + } + } + while (lexer->data[lexer->index] != '\"' && lexer->data[lexer->index] != '$') + { + if (lexer->data[lexer->index] == '\0') + { + free(word); + word = NULL; + return NULL; + } + else if (lexer->data[lexer->index] == '\\') + { + lexer->index += 1; + if (lexer->data[lexer->index] == '\"' + || lexer->data[lexer->index] == '$' + || lexer->data[lexer->index] == '\\' + || lexer->data[lexer->index] == '\n') + { + word = realloc(word, sizeof(char) * (*word_index + 1)); + word[*word_index] = lexer->data[lexer->index]; + *word_index += 1; + lexer->index += 1; + } + } + word = realloc(word, sizeof(char) * (*word_index + 1)); + word[*word_index] = lexer->data[lexer->index]; + *word_index += 1; + lexer->index += 1; + } + if (lexer->data[lexer->index] == '\"') + { + lexer->curr_tok.type = TOKEN_EOL; + ++lexer->index; + } + return word; +} + char *handle_comment(struct lexer *lexer, char *word, unsigned *word_index) { // Skip the comment @@ -90,4 +217,4 @@ char *handle_redir(struct lexer *lexer, unsigned *word_index) ++lexer->index; } return redir; -} \ No newline at end of file +} diff --git a/src/lexer/tests/lexer2_tests.c b/src/lexer/tests/lexer2_tests.c index 7e38dc73..aaebaf85 100644 --- a/src/lexer/tests/lexer2_tests.c +++ b/src/lexer/tests/lexer2_tests.c @@ -345,7 +345,7 @@ Test(lexer2, token_variable_parameters) { struct lexer *lexer = lexer_new("$@"); struct token tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_VARIABLE); + cr_assert_eq(tok.type, TOKEN_WORD); cr_assert_str_eq(tok.data, "$@"); token_free(tok); lexer_free(lexer); @@ -355,8 +355,138 @@ Test(lexer2, token_variable_parameters2) { struct lexer *lexer = lexer_new("$*"); struct token tok = lexer_pop(lexer); - cr_assert_eq(tok.type, TOKEN_VARIABLE); + cr_assert_eq(tok.type, TOKEN_WORD); cr_assert_str_eq(tok.data, "$*"); token_free(tok); + lexer_free(lexer); +} + +Test(lexer2, token_double_quote) +{ + struct lexer *lexer = lexer_new("echo \"tata toto\""); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "echo"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD, "got %d", tok.type); + cr_assert_str_eq(tok.data, "tata toto", "got %s", tok.data); + token_free(tok); + + lexer_free(lexer); +} + +Test(lexer2, token_double_quote_newline) +{ + struct lexer *lexer = lexer_new("echo \"tata \n toto\""); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "echo"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD, "got %d", tok.type); + cr_assert_str_eq(tok.data, "tata \n toto"); + token_free(tok); + + lexer_free(lexer); +} + +/* +Test(lexer2, token_double_quote_escaped) +{ + struct lexer *lexer = lexer_new("echo \"tata \\\n toto\""); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "echo"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD, "got %d", tok.type); + cr_assert_str_eq(tok.data, "tata ", "got %s", tok.data); + token_free(tok); + + tok = lexer_pop(lexer); + printf("%s\n", tok.data); + cr_assert_eq(tok.type, TOKEN_EOL, "got %d", tok.type); + cr_assert_str_eq(tok.data, "\n"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "toto"); + token_free(tok); + + lexer_free(lexer); +} +*/ +Test(lexer2, token_double_quote_variable) +{ + struct lexer *lexer = lexer_new("echo \"$test\""); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "echo"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "$test"); + token_free(tok); + + lexer_free(lexer); +} + +Test(lexer2, token_double_quote_variable_escaped) +{ + struct lexer *lexer = lexer_new("echo \"\\$test\""); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "echo"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "$test"); + token_free(tok); + + lexer_free(lexer); +} + +Test(lexer2, token_word_assignment) +{ + struct lexer *lexer = lexer_new("toto=2"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD_ASSIGNMENT); + cr_assert_str_eq(tok.data, "toto"); + token_free(tok); + + tok = lexer_pop(lexer); + printf("%s\n", tok.data); + cr_assert_eq(tok.type, TOKEN_WORD, "got %d", tok.type); + cr_assert_str_eq(tok.data, "2"); + token_free(tok); + + lexer_free(lexer); +} + +Test(lexer2, token_word_assignment_in_echo) +{ + struct lexer *lexer = lexer_new("echo toto=2"); + struct token tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD); + cr_assert_str_eq(tok.data, "echo"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD_ASSIGNMENT); + cr_assert_str_eq(tok.data, "toto"); + token_free(tok); + + tok = lexer_pop(lexer); + cr_assert_eq(tok.type, TOKEN_WORD, "got %d", tok.type); + cr_assert_str_eq(tok.data, "2"); + token_free(tok); + lexer_free(lexer); } \ No newline at end of file diff --git a/src/lexer/tests/lexer_tests.c b/src/lexer/tests/lexer_tests.c index 7b5a2d18..f60b07ac 100644 --- a/src/lexer/tests/lexer_tests.c +++ b/src/lexer/tests/lexer_tests.c @@ -307,7 +307,7 @@ Test(Lexer, multiple_spaces) Test(Lexer, lexer_if_then) { - struct lexer *lexer = lexer_new("if 1=1 then echo 'ok' fi;"); + struct lexer *lexer = lexer_new("if true then echo 'ok' fi;"); struct token token = lexer_pop(lexer); cr_assert_eq(token.type, TOKEN_IF, "token.type = %d", token.type, "token.type = %d", token.type); @@ -320,9 +320,9 @@ Test(Lexer, lexer_if_then) token = lexer_pop(lexer); cr_assert_eq(token.type, TOKEN_WORD, "token.type = %d", token.type, "token.type = %d", token.type); - cr_assert_str_eq(token.data, "1=1", "token.data = %s", token.data, + cr_assert_str_eq(token.data, "true", "token.data = %s", token.data, "token.data = %s", token.data); - cr_assert_eq(lexer->index, 7, "lexer->index = %lu", lexer->index, + cr_assert_eq(lexer->index, 8, "lexer->index = %lu", lexer->index, "lexer->index = %lu", lexer->index); token_free(token); @@ -331,7 +331,7 @@ Test(Lexer, lexer_if_then) "token.type = %d", token.type); cr_assert_str_eq(token.data, "then", "token.data = %s", token.data, "token.data = %s", token.data); - cr_assert_eq(lexer->index, 12, "lexer->index = %lu", lexer->index, + cr_assert_eq(lexer->index, 13, "lexer->index = %lu", lexer->index, "lexer->index = %lu", lexer->index); token_free(token); @@ -340,7 +340,7 @@ Test(Lexer, lexer_if_then) "token.type = %d", token.type); cr_assert_str_eq(token.data, "echo", "token.data = %s", token.data, "token.data = %s", token.data); - cr_assert_eq(lexer->index, 17, "lexer->index = %lu", lexer->index, + cr_assert_eq(lexer->index, 18, "lexer->index = %lu", lexer->index, "lexer->index = %lu", lexer->index); token_free(token); @@ -349,7 +349,7 @@ Test(Lexer, lexer_if_then) "token.type = %d", token.type); cr_assert_str_eq(token.data, "ok", "token.data = %s", token.data, "token.data = %s", token.data); - cr_assert_eq(lexer->index, 22, "lexer->index = %lu", lexer->index, + cr_assert_eq(lexer->index, 23, "lexer->index = %lu", lexer->index, "lexer->index = %lu", lexer->index); token_free(token); @@ -358,7 +358,7 @@ Test(Lexer, lexer_if_then) "token.type = %d", token.type); cr_assert_str_eq(token.data, "fi", "token.data = %s", token.data, "token.data = %s", token.data); - cr_assert_eq(lexer->index, 24, "lexer->index = %lu", lexer->index, + cr_assert_eq(lexer->index, 25, "lexer->index = %lu", lexer->index, "lexer->index = %lu", lexer->index); token_free(token); @@ -367,7 +367,7 @@ Test(Lexer, lexer_if_then) "token.type = %d", token.type); cr_assert_str_eq(token.data, ";", "token.data = %s", token.data, "token.data = %s", token.data); - cr_assert_eq(lexer->index, 25, "lexer->index = %lu", lexer->index, + cr_assert_eq(lexer->index, 26, "lexer->index = %lu", lexer->index, "lexer->index = %lu", lexer->index); token_free(token); diff --git a/src/lexer/token.h b/src/lexer/token.h index 3d2a38c5..b117e679 100644 --- a/src/lexer/token.h +++ b/src/lexer/token.h @@ -33,8 +33,8 @@ enum token_type TOKEN_PIPE, // | TOKEN_NEGATE, // \! TOKEN_REDIR, // >, <, >>, >&, <&, >|, <> - - TOKEN_VARIABLE // $ + TOKEN_DOUBLE_QUOTE, // " + TOKEN_WORD_ASSIGNMENT // variable= }; /**